<a href="https://colab.research.google.com/github/prasad1482/AI-Resume-Matcher-Smart-Resume-to-Job-Description-Matching-with-BERT/blob/main/preprocessing_AI_resume_matcher.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv("/content/expanded_job_resume_dataset.csv")

# Display first few rows
print(df.head())


            Job Title                                Job Skills Required  \
0      Data Scientist  Python, Machine Learning, Pandas, TensorFlow, SQL   
1   Marketing Manager  SEO, Google Ads, Content Marketing, Brand Stra...   
2  Frontend Developer                JavaScript, React, CSS, HTML, Redux   
3   Marketing Manager  SEO, Google Ads, Content Marketing, Brand Stra...   
4  Frontend Developer                JavaScript, React, CSS, HTML, Redux   

  Candidate Name                                   Candidate Skills  \
0          Frank  Recruitment, Employee Relations, Payroll, HR A...   
1            Bob  Python, Machine Learning, Pandas, TensorFlow, SQL   
2            Eve  SEO, Google Ads, Content Marketing, Brand Stra...   
3            Bob  Python, Machine Learning, Pandas, TensorFlow, SQL   
4            Bob  Python, Machine Learning, Pandas, TensorFlow, SQL   

   Match Label  
0            0  
1            0  
2            0  
3            0  
4            0  


#Install necessary libraries


In [2]:
!pip install nltk
!pip install sentence-transformers


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

#clean dataset


In [3]:
print(df.columns)


Index(['Job Title', 'Job Skills Required', 'Candidate Name',
       'Candidate Skills', 'Match Label'],
      dtype='object')


In [4]:
import re # Import the re module for regular expressions
import pandas as pd
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

# Function to clean text
def clean_text(text):
    if pd.isnull(text):
        return ""  # Handle missing values
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special characters
    words = text.split()
    words = [word for word in words if word not in stopwords.words('english')]  # Remove stopwords
    return ' '.join(words)

# Apply cleaning function
df["cleaned_resume"] = df["Candidate Skills"].apply(clean_text)
df["cleaned_job_desc"] = df["Job Skills Required"].apply(clean_text)

# Display cleaned dataset
display(df[["cleaned_resume", "cleaned_job_desc"]].head())

# Save cleaned dataset
df.to_csv("cleaned_dataset.csv", index=False)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,cleaned_resume,cleaned_job_desc
0,recruitment employee relations payroll hr anal...,python machine learning pandas tensorflow sql
1,python machine learning pandas tensorflow sql,seo google ads content marketing brand strategy
2,seo google ads content marketing brand strategy,javascript react css html redux
3,python machine learning pandas tensorflow sql,seo google ads content marketing brand strategy
4,python machine learning pandas tensorflow sql,javascript react css html redux


#Feature extraction


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Combine the cleaned skills for vectorization
job_desc_vectors = vectorizer.fit_transform(df["cleaned_job_desc"])
resume_vectors = vectorizer.transform(df["cleaned_resume"])

# Compute similarity scores
similarity_scores = []
for i in range(len(df)):
    score = cosine_similarity(resume_vectors[i], job_desc_vectors[i])
    similarity_scores.append(score[0][0])

# Add similarity scores to the DataFrame
df["match_score"] = similarity_scores

# Display the updated dataset
df[["Candidate Name", "Job Title", "match_score"]].head()


Unnamed: 0,Candidate Name,Job Title,match_score
0,Frank,Data Scientist,0.0
1,Bob,Marketing Manager,0.0
2,Eve,Frontend Developer,0.0
3,Bob,Marketing Manager,0.0
4,Bob,Frontend Developer,0.0


#model training

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier

# Create the binary classification label
# You can adjust the threshold for a good match, here I set it to 0.6
df["Match Label"] = (df["match_score"] > 0.6).astype(int)

# Split the data into training and testing sets
X = df[["match_score"]]  # Features (match scores)
y = df["Match Label"]    # Labels (good/bad match)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train a Random Forest Classifier (or you can use Logistic Regression)
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       252
           1       1.00      1.00      1.00        48

    accuracy                           1.00       300
   macro avg       1.00      1.00      1.00       300
weighted avg       1.00      1.00      1.00       300



#save the model

In [7]:
import joblib

# Save the trained model to a file
joblib.dump(model, 'resume_job_match_model.pkl')


['resume_job_match_model.pkl']

#Use Better Text Representation
TF-IDF to BERT Embeddings



In [None]:
from sentence_transformers import SentenceTransformer

# Load Pretrained BERT model
bert_model = SentenceTransformer('all-MiniLM-L6-v2')

# Convert resumes & job descriptions to BERT embeddings
resume_embeddings = bert_model.encode(df["cleaned_resume"].tolist())
jd_embeddings = bert_model.encode(df["cleaned_job_desc"].tolist())



**Measure Similarity Beyond Words**

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute similarity scores
similarity_scores = cosine_similarity(resume_embeddings, jd_embeddings)

# Convert scores into a DataFrame
df["similarity_score"] = [max(scores) for scores in similarity_scores]


Implement NER for Skills Extraction



In [12]:
import spacy

# Load Spacy Model
nlp = spacy.load("en_core_web_sm")

# Extract Skills & Titles
def extract_entities(text):
    doc = nlp(text)
    entities = [ent.text for ent in doc.ents if ent.label_ in ["ORG", "PERSON", "GPE", "WORK_OF_ART"]]
    return " ".join(entities)

df["resume_entities"] = df["cleaned_resume"].apply(extract_entities)
df["jd_entities"] = df["cleaned_job_desc"].apply(extract_entities)

# Assume exp_score and edu_score are based on some criteria,
# here's a placeholder for calculating them:

# Placeholder for calculating exp_score (replace with your logic)
df['exp_score'] = 0  # Initialize with 0 or any default value

# Placeholder for calculating edu_score (replace with your logic)
df['edu_score'] = 0  # Initialize with 0 or any default value


def weighted_score(skill_match, exp_match, edu_match):
    return 0.5 * skill_match + 0.3 * exp_match + 0.2 * edu_match  # Adjust weights

df["final_match_score"] = df.apply(lambda row: weighted_score(row["similarity_score"], row["exp_score"], row["edu_score"]), axis=1)

In [13]:
from sklearn.ensemble import RandomForestClassifier

# Prepare training data (features: skills, experience, education scores)
X_train = df[["similarity_score", "exp_score", "edu_score"]]
y_train = df["Match Label"]

# Train Model
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Save Model
import joblib
joblib.dump(model, "resume_ranking_model.pkl")


['resume_ranking_model.pkl']

#compute evalution matrices

In [14]:
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

# Make predictions on training data
y_pred = model.predict(X_train)

# Print Classification Report
print("Accuracy:", accuracy_score(y_train, y_pred))
print("Classification Report:\n", classification_report(y_train, y_pred))

# Compute ROC-AUC
roc_auc = roc_auc_score(y_train, model.predict_proba(X_train)[:, 1])
print("ROC-AUC Score:", roc_auc)


Accuracy: 0.833
Classification Report:
               precision    recall  f1-score   support

           0       0.83      1.00      0.91       833
           1       0.00      0.00      0.00       167

    accuracy                           0.83      1000
   macro avg       0.42      0.50      0.45      1000
weighted avg       0.69      0.83      0.76      1000

ROC-AUC Score: 0.5409924448821445


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


#Solution to improve
Balance the Dataset
Use SMOTE (Synthetic Minority Over-sampling Technique) to generate more "Match (1)" samples.

In [15]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)


In [16]:
from xgboost import XGBClassifier

model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model.fit(X_resampled, y_resampled)


Parameters: { "use_label_encoder" } are not used.



In [19]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier

# Create the binary classification label
# You can adjust the threshold for a good match, here I set it to 0.6
df["Match Label"] = (df["match_score"] > 0.6).astype(int)

# *** Create 'similarity_score', 'exp_score', 'edu_score' columns BEFORE the split ***
# Assume exp_score and edu_score are based on some criteria,
# here's a placeholder for calculating them:

# Placeholder for calculating exp_score (replace with your logic)
df['exp_score'] = 0  # Initialize with 0 or any default value

# Placeholder for calculating edu_score (replace with your logic)
df['edu_score'] = 0  # Initialize with 0 or any default value

# Placeholder for calculating similarity_score
# I will use match_score as a placeholder
df['similarity_score'] = df['match_score']


# Split the data into training and testing sets
X = df[["similarity_score", "exp_score", "edu_score"]]  # Features (match scores)
y = df["Match Label"]    # Labels (good/bad match)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [21]:
# prompt: Check Accuracy, Precision, Recall, and F1-score

# ... (Your existing code)

# Evaluate the model on the test set
y_pred = model.predict(X_test)

print("Evaluation on Test Set:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Evaluation on Test Set:
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       252
           1       1.00      1.00      1.00        48

    accuracy                           1.00       300
   macro avg       1.00      1.00      1.00       300
weighted avg       1.00      1.00      1.00       300



In [22]:
from sklearn.metrics import roc_auc_score

roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
print(f"ROC-AUC Score: {roc_auc}")


ROC-AUC Score: 1.0


In [23]:
import joblib

# Save the trained model
joblib.dump(model, "resume_matcher.pkl")

print("Model saved successfully!")


Model saved successfully!
