<a href="https://colab.research.google.com/github/snigdha2606/Resume_ranker_system/blob/main/Automatic_resume_ranker_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm


# Load the resume dataset from the Colab working directory.
data = pd.read_csv("/content/UpdatedResumeDataSet.csv")

# Job categories that are treated as technology roles.
list_tech = [
    'Java Developer',
    'Testing',
    'DevOps Engineer',
    'Python Developer',
    'Web Designing',
    'ETL Developer',
    'Data Science',
    'SAP Developer',
    'Network Security Engineer',
    'Automation Testing',
    'DotNet Developer',
    'Hadoop',
    'Database',
    'Blockchain',
]

# Label every row 'Tech' when its category is in list_tech, else 'Non-tech'.
is_tech_category = data['Category'].isin(list_tech)
data['Category Area'] = is_tech_category.map({True: 'Tech', False: 'Non-tech'})

# Keep exactly these three columns, in this order.
column_order = ['Category', 'Category Area', 'Resume']
data = data.reindex(columns=column_order)

# Define a custom PyTorch dataset
class ResumeDataset(Dataset):
    """Map-style dataset over a batch of tokenizer encodings.

    ``encodings`` is a mapping from field name (it must contain
    ``"input_ids"``) to an indexable batch of per-sample values, e.g. the
    object returned by a tokenizer call or a plain ``dict``.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict mapping field name to a tensor copy."""
        item = {}
        for key, val in self.encodings.items():
            row = val[idx]
            if isinstance(row, torch.Tensor):
                # torch.tensor(<tensor>) emits a copy-construct UserWarning on
                # every sample; clone().detach() is the recommended way to copy.
                item[key] = row.clone().detach()
            else:
                item[key] = torch.tensor(row)
        return item

    def __len__(self):
        # Subscript access works for plain dicts as well as tokenizer outputs.
        return len(self.encodings["input_ids"])


class SimilarityModel(nn.Module):
    """DistilBERT encoder that returns one mean-pooled vector per input sequence."""

    def __init__(self):
        super(SimilarityModel, self).__init__()
        self.distilbert = DistilBertModel.from_pretrained('distilbert-base-uncased')

    def forward(self, input_ids, attention_mask):
        """Encode a batch and average the hidden states of the real tokens.

        Args:
            input_ids: token id tensor, one row per sequence.
            attention_mask: tensor of the same shape, 1 for real tokens and
                0 for padding.

        Returns:
            A tensor with one pooled embedding per sequence (the sequence
            dimension of ``last_hidden_state`` is reduced away).
        """
        outputs = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
        token_embeddings = outputs.last_hidden_state

        # Weight every position by its mask so padding positions (the inputs
        # are padded to a fixed length) do not dilute the average.
        mask = attention_mask.unsqueeze(-1).to(token_embeddings.dtype)
        summed = (token_embeddings * mask).sum(dim=1)
        # Guard against a row that is entirely padding (division by zero).
        token_counts = mask.sum(dim=1).clamp(min=1.0)
        pooled_output = summed / token_counts
        return pooled_output


# Run on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Pretrained tokenizer and the embedding model, moved onto the chosen device.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = SimilarityModel()
model.to(device)

# Distinct values of the 'Category' column.
unique_categories = data['Category'].unique()

# Maps a category to its top-ranked resumes; starts out empty.
top_10_resumes_per_category = {}


def rank_resumes_for_category(category, data, top_k=10, batch_size=8, max_length=512):
    """Rank one category's resumes by similarity to the category centroid.

    The rows of ``data`` whose 'Category' equals ``category`` are split
    80/20 with a fixed seed and only the 80% part is ranked. Each of those
    resumes is embedded with the module-level ``tokenizer`` and ``model``,
    the embeddings are averaged into a single category vector, and resumes
    are ordered by cosine similarity to that vector.

    Args:
        category: value to match in the 'Category' column.
        data: DataFrame with 'Category' and 'Resume' columns.
        top_k: maximum number of resumes to return.
        batch_size: number of resumes embedded per forward pass.
        max_length: token length every resume is truncated/padded to.

    Returns:
        A DataFrame with at most ``top_k`` rows of ``data``, most similar
        first. It is empty when no row matches ``category``.
    """
    category_data = data[data['Category'] == category]
    if category_data.empty:
        # Nothing to rank; train_test_split would raise on an empty frame.
        return category_data

    if len(category_data) > 1:
        # Split data into training and testing sets; only the training part is ranked
        train_data, _ = train_test_split(category_data, test_size=0.2, random_state=42)
    else:
        # A single row cannot be split (the training part would be empty).
        train_data = category_data

    encoded_resumes = tokenizer(train_data["Resume"].tolist(),
                                max_length=max_length,
                                truncation=True,
                                padding='max_length',
                                return_tensors='pt')

    resume_loader = DataLoader(ResumeDataset(encoded_resumes),
                               batch_size=batch_size,
                               shuffle=False)  # keep order aligned with train_data rows

    # Extract embeddings for resumes
    model.eval()  # inference only: make sure dropout is disabled
    batch_embeddings = []
    with torch.no_grad():
        for batch in tqdm(resume_loader, desc=f'Embeddings for {category}', leave=False):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask)
            batch_embeddings.append(outputs.cpu().numpy())
    resume_embeddings = np.concatenate(batch_embeddings)

    # Average embedding of the category, kept 2-D for cosine_similarity
    category_embedding = resume_embeddings.mean(axis=0, keepdims=True)

    # Rank resumes based on cosine similarity with category embedding
    similarity_scores = cosine_similarity(resume_embeddings, category_embedding).ravel()
    ranked_indices = np.argsort(similarity_scores)[::-1]
    ranked_resumes = train_data.iloc[ranked_indices[:top_k]]
    return ranked_resumes

# Rank top 10 resumes for each category
# Rank the resumes of every category and store the result per category.
top_10_resumes_per_category.update(
    (category, rank_resumes_for_category(category, data))
    for category in unique_categories
)


# Print each category's ranked resumes, followed by blank lines.
for category, top_10_resumes in top_10_resumes_per_category.items():
    header = "Top 10 resumes for category:"
    print(header, category)
    print(top_10_resumes)
    print("\n")

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  return {key: torch.tensor(val[idx]) for key, val in self.encod

Top 10 resumes for category: Data Science
        Category Category Area  \
38  Data Science          Tech   
8   Data Science          Tech   
18  Data Science          Tech   
28  Data Science          Tech   
2   Data Science          Tech   
22  Data Science          Tech   
32  Data Science          Tech   
31  Data Science          Tech   
11  Data Science          Tech   
1   Data Science          Tech   

                                               Resume  
38  Personal Skills â¢ Ability to quickly grasp t...  
8   Personal Skills â¢ Ability to quickly grasp t...  
18  Personal Skills â¢ Ability to quickly grasp t...  
28  Personal Skills â¢ Ability to quickly grasp t...  
2   Areas of Interest Deep Learning, Control Syste...  
22  Areas of Interest Deep Learning, Control Syste...  
32  Areas of Interest Deep Learning, Control Syste...  
31  Education Details \r\nMay 2013 to May 2017 B.E...  
11  Education Details \r\nMay 2013 to May 2017 B.E...  
1   Education Details \

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

# Initialize SVM classifier
# Initialize SVM classifier
svm_clf = SVC(kernel='linear')

# Tokenize every resume, truncated/padded to a fixed length of 512 tokens.
encoded_resumes_all = tokenizer(data["Resume"].tolist(),
                                max_length=512,
                                truncation=True,
                                padding='max_length',
                                return_tensors='pt')

resume_dataset_all = ResumeDataset(encoded_resumes_all)
# shuffle=False keeps embedding row i aligned with row i of `data`.
resume_loader_all = DataLoader(resume_dataset_all, batch_size=8, shuffle=False)

# Calculate embeddings for all resumes
model.eval()  # inference only: make sure dropout is disabled
resume_embeddings = []
with torch.no_grad():
    for batch in tqdm(resume_loader_all, desc='Embeddings for all resumes', leave=False):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask)
        resume_embeddings.append(outputs.cpu().numpy())
resume_embeddings = np.concatenate(resume_embeddings)


accuracy_per_category = []
precision_per_category = []
recall_per_category = []
cross_val_scores_per_category = []


for category, top_10_resumes in top_10_resumes_per_category.items():

    # True labels: 1 for rows in the current category, 0 otherwise.
    # Built row-by-row from `data` so they stay aligned with resume_embeddings
    # whatever index `data` carries.
    y_true_category = (data['Category'] == category).to_numpy().astype(int)

    # Predicted labels: 1 for rows selected into the top 10, 0 otherwise.
    y_pred_category = data.index.isin(top_10_resumes.index).astype(int)

    accuracy_category = accuracy_score(y_true_category, y_pred_category)
    accuracy_per_category.append(accuracy_category)

    precision_category = precision_score(y_true_category, y_pred_category)
    precision_per_category.append(precision_category)

    recall_category = recall_score(y_true_category, y_pred_category)
    recall_per_category.append(recall_category)

    # Perform cross-validation for the current category.
    # NOTE(review): the SVM target is the top-10 membership (y_pred_category),
    # not the true category label - confirm this is intended.
    cross_val_scores_category = cross_val_score(svm_clf, resume_embeddings, y_pred_category, cv=5)
    cross_val_scores_per_category.append(cross_val_scores_category)

# Report the metrics of each category in the order they were computed.
for category, accuracy, precision, recall, cv_scores in zip(
        top_10_resumes_per_category,
        accuracy_per_category,
        precision_per_category,
        recall_per_category,
        cross_val_scores_per_category):
    print("Category:", category)
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("Cross-Validation Scores:", cv_scores)
    print()


  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Category: Data Science
Accuracy: 0.9688149688149689
Precision: 1.0
Recall: 0.25
Cross-Validation Scores: [0.94818653 1.         1.         1.         1.        ]

Category: HR
Accuracy: 0.9646569646569647
Precision: 1.0
Recall: 0.22727272727272727
Cross-Validation Scores: [0.90673575 1.         0.96354167 1.         1.        ]

Category: Advocate
Accuracy: 0.9896049896049897
Precision: 1.0
Recall: 0.5
Cross-Validation Scores: [0.96891192 0.99481865 0.99479167 1.         1.        ]

Category: Arts
Accuracy: 0.972972972972973
Precision: 1.0
Recall: 0.2777777777777778
Cross-Validation Scores: [0.97927461 1.         1.         1.         1.        ]

Category: Web Designing
Accuracy: 0.9636174636174636
Precision: 1.0
Recall: 0.2222222222222222
Cross-Validation Scores: [0.95854922 0.98963731 0.98958333 0.98958333 0.98958333]

Category: Mechanical Engineer
Accuracy: 0.9688149688149689
Precision: 1.0
Recall: 0.25
Cross-Validation Scores: [0.99481865 0.97409326 1.         0.99479167 1.      