In [1]:
!pip install transformers tqdm spacy



In [2]:
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m91.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [5]:
import pandas as pd
import re
import numpy as np
import concurrent.futures
import time
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch
import spacy
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

# Load SpaCy model
try:
    nlp = spacy.load("en_core_web_sm")
except:
    # If model isn't installed, download it
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

# Function to read the Excel file
def read_jd_data(file_path):
    """Read job descriptions from Excel file."""
    try:
        df = pd.read_excel(file_path)
        print(f"Successfully loaded {len(df)} job descriptions")
        return df
    except Exception as e:
        print(f"Error loading file: {e}")
        return pd.DataFrame()

# Model 1: Rule-based Regex Pattern Matching
def extract_experience_regex(text):
    """
    Extract experience requirements using regex patterns.
    Handles various formats like "X+ years", "X to Y years", "minimum X years", etc.
    """
    if pd.isna(text) or not isinstance(text, str):
        return None
    
    # Clean text - remove newlines, extra spaces
    text = re.sub(r'\s+', ' ', text)
    
    # Pattern for explicit year mentions
    patterns = [
        r'(\d+\+)\s*years?\s+(?:of\s+)?(?:work\s+)?(?:experience|exp)',  # 5+ years experience
        r'(\d+)[-–]\s*(\d+)\s+years?\s+(?:of\s+)?(?:work\s+)?(?:experience|exp)',  # 5-7 years experience
        r'minimum\s+(?:of\s+)?(\d+)\s+years?\s+(?:of\s+)?(?:work\s+)?(?:experience|exp)',  # minimum 5 years experience
        r'at\s+least\s+(\d+)\s+years?\s+(?:of\s+)?(?:work\s+)?(?:experience|exp)',  # at least 5 years experience
        r'(\d+)\s+(?:or\s+)?more\s+years?\s+(?:of\s+)?(?:work\s+)?(?:experience|exp)',  # 5 or more years experience
        r'(?:with\s+)?(\d+)\s+years?\s+(?:of\s+)?(?:work\s+)?(?:experience|exp)',  # with 5 years experience
        r'experience:?\s+(\d+)(?:\+)?\s+years?',  # experience: 5+ years
        r'(?:work\s+)?experience\s+(?:of\s+)?(?:at\s+least\s+)?(\d+)(?:\+)?\s+years?'  # work experience of 5+ years
    ]
    
    # Check for job titles with experience
    title_exp_pattern = r'(?:senior|sr\.?|lead|principal|experienced|staff)\s+([a-zA-Z\s]+)'
    
    # Check for seniority levels
    seniority_mapping = {
        'entry level': 0, 
        'junior': 1,
        'mid-level': 3,
        'intermediate': 3,
        'senior': 5,
        'lead': 7,
        'principal': 8,
        'director': 10,
        'executive': 12,
        'vp': 15,
        'chief': 15
    }
    
    # First try to find explicit year mentions
    for pattern in patterns:
        matches = re.findall(pattern, text.lower())
        if matches:
            if isinstance(matches[0], tuple):  # For ranges like "5-7 years"
                min_years, max_years = matches[0]
                return (int(min_years) + int(max_years)) / 2  # Return average as estimation
            elif matches[0].endswith('+'):  # For "5+" format
                return int(matches[0][:-1])
            else:
                return int(matches[0])
    
    # If no explicit years, check for seniority in job title
    title_match = re.search(title_exp_pattern, text.lower())
    if title_match:
        for level, years in seniority_mapping.items():
            if level in text.lower():
                return years
    
    # Look for seniority mentions in the text
    for level, years in seniority_mapping.items():
        if level in text.lower():
            return years
    
    return None

# Model 2: SpaCy NER with Custom Rules
def extract_experience_spacy(text):
    """Extract experience using SpaCy NER and custom rules."""
    if pd.isna(text) or not isinstance(text, str):
        return None
    
    # Process text with SpaCy
    doc = nlp(text)
    
    # Look for sentences containing experience-related terms
    experience_sentences = []
    for sent in doc.sents:
        sent_text = sent.text.lower()
        if any(term in sent_text for term in ["experience", "experienced", "years", "year"]):
            experience_sentences.append(sent_text)
    
    # If no relevant sentences found, return None
    if not experience_sentences:
        return None
    
    # Apply regex patterns to these specific sentences for more targeted extraction
    combined_text = " ".join(experience_sentences)
    
    # Use the same regex patterns as in the regex function
    return extract_experience_regex(combined_text)

# Model 3: GPT-based Zero-shot Classification
def extract_experience_zero_shot(text, classifier):
    """Extract experience using zero-shot classification."""
    if pd.isna(text) or not isinstance(text, str):
        return None
    
    # First try to find experience requirements in text
    candidate_labels = ["0 years", "1-2 years", "3-5 years", "5-7 years", "7-10 years", "10+ years"]
    
    try:
        # Use smaller chunks of text to avoid token limits
        chunks = [text[i:i+512] for i in range(0, len(text), 512)]
        results = []
        
        for chunk in chunks:
            if any(term in chunk.lower() for term in ["experience", "years", "senior", "junior"]):
                result = classifier(chunk, candidate_labels)
                results.append(result)
        
        if not results:
            return None
        
        # Find the most confident result
        best_result = max(results, key=lambda x: max(x['scores']))
        
        # Map label to numeric value
        label_to_years = {
            "0 years": 0,
            "1-2 years": 1.5,
            "3-5 years": 4,
            "5-7 years": 6,
            "7-10 years": 8.5,
            "10+ years": 12
        }
        
        return label_to_years[best_result['labels'][0]]
    except Exception as e:
        print(f"Error in zero-shot classification: {e}")
        return None

# Model 4: Transformer-based Sequence Classification
class ExperienceClassifier:
    def __init__(self):
        # Initialize tokenizer and model for fine-tuned BERT
        # Note: In a real implementation, you would fine-tune a model on labeled data
        self.tokenizer = None
        self.model = None
        self.initialized = False
        
    def initialize(self):
        """Initialize the model - here using a sentiment model as proxy."""
        # In a real implementation, you would load a custom fine-tuned model
        # For this demonstration, we'll use a sentiment model as a stand-in
        try:
            self.tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
            self.model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
            self.initialized = True
        except Exception as e:
            print(f"Error initializing transformer model: {e}")
    
    def extract_experience(self, text):
        """Extract experience using sequence classification."""
        if pd.isna(text) or not isinstance(text, str):
            return None
        
        if not self.initialized:
            self.initialize()
            
        if not self.initialized:
            return None
            
        try:
            # Extract sentences containing experience-related terms
            sentences = re.split(r'[.!?]', text)
            experience_sentences = [s for s in sentences if any(term in s.lower() for term in ["experience", "years"])]
            
            if not experience_sentences:
                return None
            
            # Process sentences through model
            combined_text = " ".join(experience_sentences)
            
            # In a real implementation, this would use a custom model
            # For this demonstration, we'll fallback to regex after finding relevant sentences
            return extract_experience_regex(combined_text)
            
        except Exception as e:
            print(f"Error in transformer classification: {e}")
            return None

# Model 5: Named Entity Recognition with Custom Rules
def extract_experience_custom_ner(text):
    """Extract experience using custom NER rules."""
    if pd.isna(text) or not isinstance(text, str):
        return None
    
    # Process text
    text = text.lower()
    
    # Dictionary of experience level terms and corresponding years
    experience_levels = {
        'entry level': 0,
        'entry-level': 0,
        'beginner': 0,
        'junior': 1,
        'mid level': 3,
        'mid-level': 3,
        'intermediate': 3,
        'experienced': 5,
        'senior': 5,
        'expert': 7,
        'lead': 7,
        'principal': 8,
        'director': 10,
        'executive': 12
    }
    
    # Check for seniority terms
    for level, years in experience_levels.items():
        pattern = fr'\b{re.escape(level)}\b'
        if re.search(pattern, text):
            return years
    
    # If no seniority terms found, use regex patterns
    return extract_experience_regex(text)

# Model 6: Hybrid Text Classification with Feature Extraction
def extract_experience_hybrid(text):
    """
    A hybrid approach combining multiple strategies:
    1. Look for explicit mentions of years
    2. Identify seniority levels
    3. Detect implicit experience requirements
    """
    if pd.isna(text) or not isinstance(text, str):
        return None
    
    # 1. Try regex patterns first
    regex_result = extract_experience_regex(text)
    if regex_result is not None:
        return regex_result
    
    # 2. Try custom NER
    ner_result = extract_experience_custom_ner(text)
    if ner_result is not None:
        return ner_result
    
    # 3. Look for education requirements as proxy for experience
    edu_patterns = {
        r'phd': 8,
        r'doctorate': 8,
        r'master\'?s': 5,
        r'mba': 5,
        r'bachelor\'?s': 3,
        r'bs': 3,
        r'ba': 3,
        r'associate\'?s': 1
    }
    
    for pattern, years in edu_patterns.items():
        if re.search(fr'\b{pattern}\b', text.lower()):
            return years
    
    # 4. Check for skill proficiency as proxy
    skill_levels = {
        'advanced': 5,
        'proficient': 3,
        'familiar': 1
    }
    
    for level, years in skill_levels.items():
        if re.search(fr'\b{level}\b', text.lower()):
            return years
    
    return None

# Function to extract experience using all models
def process_jd(jd, zero_shot_classifier=None, experience_classifier=None):
    """Process a single JD with all models and return results."""
    regex_result = extract_experience_regex(jd)
    spacy_result = extract_experience_spacy(jd)
    zero_shot_result = extract_experience_zero_shot(jd, zero_shot_classifier) if zero_shot_classifier else None
    transformer_result = experience_classifier.extract_experience(jd) if experience_classifier else None
    custom_ner_result = extract_experience_custom_ner(jd)
    hybrid_result = extract_experience_hybrid(jd)
    
    return {
        'Regex': regex_result,
        'SpaCy': spacy_result,
        'Zero-Shot': zero_shot_result,
        'Transformer': transformer_result,
        'Custom NER': custom_ner_result,
        'Hybrid': hybrid_result
    }

# Function to process all JDs in parallel
def process_all_jds(df, jd_column='JD_Text'):
    """Process all JDs in the dataframe."""
    print("Initializing models...")
    
    # Initialize zero-shot classifier
    try:
        zero_shot_classifier = pipeline("zero-shot-classification", 
                                        model="facebook/bart-large-mnli", 
                                        device=0 if torch.cuda.is_available() else -1)
    except Exception as e:
        print(f"Error initializing zero-shot classifier, will skip this model: {e}")
        zero_shot_classifier = None
    
    # Initialize transformer classifier
    experience_classifier = ExperienceClassifier()
    
    results = []
    
    print("Processing job descriptions...")
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        # Submit tasks
        future_to_index = {executor.submit(process_jd, 
                                          row[jd_column], 
                                          zero_shot_classifier, 
                                          experience_classifier): i 
                          for i, row in df.iterrows()}
        
        # Process results as they complete
        for future in tqdm(concurrent.futures.as_completed(future_to_index), total=len(df)):
            index = future_to_index[future]
            try:
                result = future.result()
                result['Index'] = index
                results.append(result)
            except Exception as e:
                print(f"Error processing JD at index {index}: {e}")
    
    # Convert results to DataFrame
    results_df = pd.DataFrame(results)
    results_df = results_df.sort_values('Index')
    results_df.set_index('Index', inplace=True)
    
    # Join with original dataframe
    output_df = df.join(results_df)
    
    # Calculate agreement metrics and add recommendation
    output_df = analyze_model_performance(output_df)
    
    return output_df

# Function to analyze model performance
def analyze_model_performance(df):
    """Analyze performance of each model and add recommendations."""
    # Get model columns
    model_columns = ['Regex', 'SpaCy', 'Zero-Shot', 'Transformer', 'Custom NER', 'Hybrid']
    available_models = [col for col in model_columns if col in df.columns]
    
    if len(available_models) == 0:
        return df
    
    # Calculate agreement between models (where non-null)
    def get_agreement(row):
        values = [row[col] for col in available_models if pd.notna(row[col])]
        if len(values) <= 1:
            return None
        
        # Calculate standard deviation as measure of agreement
        return np.std(values)
    
    df['Model_Agreement'] = df.apply(get_agreement, axis=1)
    
    # Calculate success rate (non-null extractions)
    model_success = {}
    for model in available_models:
        model_success[model] = df[model].notna().mean() * 100
    
    # Simple voting mechanism for recommended value
    def get_recommended_value(row):
        values = [row[col] for col in available_models if pd.notna(row[col])]
        if not values:
            return None
        
        # If there's only one valid value, use it
        if len(values) == 1:
            return values[0]
        
        # If hybrid model has a value, prioritize it
        if 'Hybrid' in available_models and pd.notna(row['Hybrid']):
            return row['Hybrid']
        
        # Otherwise, get the most common value
        # In case of tie, prefer the more common years of experience
        from collections import Counter
        counts = Counter(values)
        most_common = counts.most_common()
        
        if len(most_common) > 1 and most_common[0][1] == most_common[1][1]:
            # If tie, take the average
            return sum(values) / len(values)
        else:
            return most_common[0][0]
    
    df['Recommended_Value'] = df.apply(get_recommended_value, axis=1)
    
    # Add model performance summary
    model_performance = pd.DataFrame({
        'Success_Rate': model_success
    }).sort_values('Success_Rate', ascending=False)
    
    # Determine best model based on success rate
    best_model = model_performance.index[0] if not model_performance.empty else None
    
    # Print performance summary
    print("\nModel Performance Summary:")
    print(model_performance)
    print(f"\nRecommended Model: {best_model}")
    
    return df

# Main function
def main():
    """Main function to run the experience extractor."""
    print("Experience Extractor for Job Descriptions")
    print("=======================================")
    
    # File path
    file_path = input("Enter the path to the Excel file containing job descriptions: ")
    
    # Read data
    df = read_jd_data(file_path)
    if df.empty:
        print("No data to process. Exiting.")
        return
    
    # Check column names
    print("\nAvailable columns:")
    for col in df.columns:
        print(f"- {col}")
    
    jd_column = input("\nEnter the name of the column containing job descriptions [default: JD_Text]: ")
    if not jd_column:
        jd_column = "JD_Text"
    
    if jd_column not in df.columns:
        print(f"Column '{jd_column}' not found. Exiting.")
        return
    
    # Process JDs
    start_time = time.time()
    result_df = process_all_jds(df, jd_column)
    end_time = time.time()
    
    print(f"\nProcessing completed in {end_time - start_time:.2f} seconds")
    
    # Save results
    output_path = input("Enter the path to save the results Excel file [default: experience_extraction_results.xlsx]: ")
    if not output_path:
        output_path = "experience_extraction_results.xlsx"
    
    result_df.to_excel(output_path)
    print(f"Results saved to {output_path}")
    
    # Print some sample results
    print("\nSample Results (first 5 rows):")
    print(result_df.head())

if __name__ == "__main__":
    main()

Experience Extractor for Job Descriptions


Enter the path to the Excel file containing job descriptions:  /kaggle/input/data-new-assignment/Job Descriptions 2.xlsx


Successfully loaded 2997 job descriptions

Available columns:
- JD_Text



Enter the name of the column containing job descriptions [default: JD_Text]:  


Initializing models...


Device set to use cuda:0


Processing job descriptions...


100%|██████████| 2997/2997 [32:50<00:00,  1.52it/s] 



Model Performance Summary:
             Success_Rate
Zero-Shot       81.481481
Hybrid          78.478478
Custom NER      71.871872
Regex           69.002336
Transformer     43.843844
SpaCy           41.207875

Recommended Model: Zero-Shot

Processing completed in 1972.57 seconds


Enter the path to save the results Excel file [default: experience_extraction_results.xlsx]:  


Results saved to experience_extraction_results.xlsx

Sample Results (first 5 rows):
                                             JD_Text  Regex  SpaCy  Zero-Shot  \
0  \n**Overview  \n  \n** Lazydays RV is looking ...    3.0    3.0       12.0   
1  \n**Ãrea De AtuaÃ§Ã£o  \n  \n** TÃ©cnico em I...    NaN    NaN        NaN   
2  \n\n\nðŸ”µ Capitole is still growing and we wa...    NaN    NaN        8.5   
3  \n\n\nAs a Solutions Engineer, you will work c...   12.0    NaN        8.5   
4  \n\n\n**ABOUT DAYONE**\n\nDayOne is a global l...    5.0    5.0        4.0   

   Transformer  Custom NER  Hybrid  Model_Agreement  Recommended_Value  
0          3.0         3.0     3.0         3.354102                3.0  
1          NaN         NaN     NaN              NaN                NaN  
2          NaN         NaN     NaN              NaN                8.5  
3          NaN         7.0    12.0         2.190177               12.0  
4          5.0         5.0     5.0         0.372678             