# Testing different similarity scores

## Importing Libraries

In [1]:
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from bert_score import score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import numpy as np
import re

## Useful functions

In [2]:
# Make sure the NLTK data used in this notebook is available locally, and only
# call the downloader for a resource that is actually missing.
#   - stopwords          : read by stopwords.words() just below
#   - punkt / punkt_tab  : tokenizer data needed by word_tokenize(), which the
#                          Jaccard similarity function calls
# NOTE(review): which of punkt / punkt_tab word_tokenize() loads depends on the
# installed NLTK release — confirm against the pinned version and drop the other.
for _package, _resource_path in (
    ("stopwords", "corpora/stopwords"),
    ("punkt", "tokenizers/punkt"),
    ("punkt_tab", "tokenizers/punkt_tab"),
):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package)

# English stop words plus a few extra words that are filtered out as well.
# NOTE(review): the extra words look like resume fit-label vocabulary, presumably
# excluded so label words cannot influence the similarity scores — confirm.
stop_words = set(stopwords.words('english'))
stop_words.update(["overqualified", "underqualified", "mismatch", "good"])

def preprocess_text(text):
    """Normalize ``text`` for similarity scoring.

    The text is lower-cased, stripped of characters outside a small allowed
    set, cleaned of stray hyphens, slashes and periods, and finally filtered
    against the module-level ``stop_words`` set.

    Args:
        text: Raw input string.

    Returns:
        The remaining words joined by single spaces.
    """
    cleaned = text.lower()

    # Applied in this exact order; each entry is (regex pattern, replacement).
    substitutions = (
        (r"[^a-z0-9\s%$/.-]", ""),    # keep only a-z, 0-9, whitespace and % $ / . -
        (r"-(?!\d)", ""),             # drop a hyphen unless a digit follows it
        (r"(?<!\d)/|/(?!\d)", " "),   # slash -> space unless it sits between digits (e.g. 3.8/4.0)
        (r"\b(\w+)\.(?!\d)", r"\1"),  # drop a period after a word unless a digit follows it
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    # Turn line breaks into spaces before removing the two literal phrases.
    for line_break in ("\n", "\r"):
        cleaned = cleaned.replace(line_break, " ")
    for phrase in ("show less", "show more"):
        cleaned = cleaned.replace(phrase, "")

    kept_words = [token for token in cleaned.split() if token not in stop_words]
    return " ".join(kept_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/saideepbunny/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Load the Sentence Transformer model once at module level; get_text_embedding()
# below reuses this single instance for every call.
# NOTE(review): presumably the weights are fetched on first use and cached
# locally afterwards — confirm for the target environment.
model = SentenceTransformer("all-mpnet-base-v2")

def chunk_text(text, max_length=510, overlap=50):
    """Split ``text`` into overlapping chunks of at most ``max_length`` words.

    Lengths are counted in whitespace-separated words, not in model tokens.
    Consecutive chunks start ``max_length - overlap`` words apart, so each
    chunk repeats the last ``overlap`` words of the previous one.

    Args:
        text: Input string.
        max_length: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must be
            smaller than ``max_length``.

    Returns:
        A list of chunk strings, empty when ``text`` contains no words.

    Raises:
        ValueError: If ``max_length`` is not positive or ``overlap`` is not
            smaller than ``max_length`` (the window could never advance).
    """
    if max_length <= 0:
        raise ValueError("max_length must be a positive number of words")
    if overlap >= max_length:
        raise ValueError("overlap must be smaller than max_length")

    words = text.split()
    step = max_length - overlap
    chunks = []
    for start in range(0, len(words), step):
        end = start + max_length
        chunks.append(" ".join(words[start:end]))
        # Once a chunk reaches the last word, any further window would only
        # repeat words already covered by this chunk, so stop here.
        if end >= len(words):
            break
    return chunks

def get_text_embedding(text):
    """Embed ``text`` as a single vector by chunking and mean pooling.

    The text is split with ``chunk_text``, the chunks are encoded with the
    module-level ``model``, and the chunk embeddings are averaged. Every chunk
    carries the same weight in the average, regardless of its length.

    Args:
        text: Input string.

    Returns:
        The mean of the chunk embeddings along axis 0, or a zero vector of
        length ``model.get_sentence_embedding_dimension()`` when ``text``
        contains no words.
    """
    chunks = chunk_text(text)
    if not chunks:
        # Nothing to encode: return the zero vector without calling the model.
        return np.zeros(model.get_sentence_embedding_dimension())

    # assumes model.encode returns one embedding row per chunk — TODO confirm
    chunk_embeddings = model.encode(chunks)

    # Mean pooling collapses the per-chunk embeddings into one vector.
    return np.mean(chunk_embeddings, axis=0)

In [4]:

# 1. Compute BERTScore (semantic similarity)

# Cache of scorer objects keyed by (model_type, lang), so the underlying model
# is loaded once instead of on every compute_bertscore() call.
_BERT_SCORERS = {}


def _get_bert_scorer(model_type="roberta-base", lang="en"):
    """Return the cached ``BERTScorer`` for this model/language, creating it on first use."""
    # Function-scope import: the notebook's import cell only brings in ``score``.
    from bert_score import BERTScorer

    key = (model_type, lang)
    if key not in _BERT_SCORERS:
        _BERT_SCORERS[key] = BERTScorer(model_type=model_type, lang=lang)
    return _BERT_SCORERS[key]


def compute_bertscore(candidate, reference):
    """Compute BERTScore between one candidate and one reference text.

    Uses the ``roberta-base`` model with ``lang="en"``.

    Args:
        candidate: Candidate text.
        reference: Reference text the candidate is scored against.

    Returns:
        A three-element list ``[precision, recall, f1]`` of Python floats.
    """
    precision, recall, f1 = _get_bert_scorer().score([candidate], [reference])
    # All three values are returned; the caller decides which one to use.
    return [precision.item(), recall.item(), f1.item()]


# 2. Compute cosine similarity (lexical similarity using TF-IDF)
def compute_cosine_similarity(text1, text2):
    """Compute the cosine similarity of the TF-IDF vectors of two texts.

    The vectorizer is fitted on just these two texts, so the TF-IDF weights
    are relative to this pair only.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        The cosine similarity of the two TF-IDF vectors, or ``0.0`` when
        either text yields no terms under the vectorizer's analyzer.
    """
    vectorizer = TfidfVectorizer()

    # A text without any terms cannot overlap with anything. Returning early
    # also avoids the "empty vocabulary" error that fitting raises when neither
    # text contains a usable term.
    analyze = vectorizer.build_analyzer()
    if not analyze(text1) or not analyze(text2):
        return 0.0

    tfidf_matrix = vectorizer.fit_transform([text1, text2])
    return cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]


# 3. Compute Jaccard similarity (word-overlap measure)
def compute_jaccard_similarity(text1, text2):
    """Compute the Jaccard similarity of the word sets of two texts.

    Each text is lower-cased, tokenized with ``word_tokenize`` and reduced to
    its set of distinct tokens, minus the module-level ``stop_words``.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        ``|A ∩ B| / |A ∪ B|`` for the two token sets, or ``0`` when both sets
        are empty.
    """
    def distinct_tokens(text):
        # Distinct lower-cased tokens with stop words removed.
        return set(word_tokenize(text.lower())) - stop_words

    tokens_a = distinct_tokens(text1)
    tokens_b = distinct_tokens(text2)

    combined = tokens_a | tokens_b
    if not combined:
        return 0
    shared = tokens_a & tokens_b
    return len(shared) / len(combined)

# 4. Compute cosine similarity (with sentence-transformers embeddings)
def compare_job_resume(resume_text, job_text):
    """Score how similar a resume is to a job description.

    Both texts are embedded with ``get_text_embedding`` and compared with
    cosine similarity.

    Args:
        resume_text: Resume text.
        job_text: Job description text.

    Returns:
        The cosine similarity between the two embeddings.
    """
    # The job description is embedded first, then the resume.
    job_vector, resume_vector = (
        get_text_embedding(document) for document in (job_text, resume_text)
    )

    # cosine_similarity works on 2-D inputs and returns a 1x1 matrix here.
    pairwise = cosine_similarity([job_vector], [resume_vector])
    return pairwise[0][0]

## Defining simple job descriptions and resumes

In [5]:
# Job description and sample resumes.
# `job_description` is the single text that every sample resume is compared
# against. It is stored as raw text; preprocess_text() is applied to it in the
# scoring cell before any metric is computed.
job_description = """We’re looking for a Data Scientist to develop and deploy machine learning models that drive business insights and operational efficiency. You’ll collaborate with cross-functional teams to analyze complex data, optimize AI-driven applications, and extract actionable intelligence.

What You’ll Do:

Develop, train, and deploy machine learning models.
Analyze and interpret large datasets to uncover insights.
Optimize model performance and application efficiency.
Collaborate with subject matter experts to validate data-driven decisions.
Document findings, best practices, and technical workflows.


Preferred Qualifications:

MS/PhD in Computer Science, Statistics, or a related field.
Strong background in machine learning, regression, and classification.
Proficiency in Python, SQL, and data visualization tools.
Experience with time-series analysis and scalable ML solutions.
Excellent analytical, problem-solving, and communication skills."""


# Sample resumes keyed by a fit-category label; the scoring cell prints the key
# as "Resume fit category". Values are raw resume text: the triple-quoted
# strings keep the source indentation and line breaks, which preprocess_text()
# collapses when it splits on whitespace.
resumes = {
    # 1. Complete mismatch: graphic designer profile
    "Complete Mismatch" : """John Doe  
    123 Main St, City, State, 12345  
    Email: johndoe@example.com | Phone: (123) 456-7890  

    Objective: Seeking a role as a Graphic Designer where I can utilize my creativity and design skills.  

    Experience:  
    - Freelance Graphic Designer (2018-Present)  
      - Created marketing materials for small businesses.  
      - Designed logos, brochures, and social media assets.  

    Skills:  
    - Adobe Photoshop, Illustrator, InDesign  
    - Branding and visual storytelling  
    - Social media marketing  

    Education:  
    - B.A. in Fine Arts, University of XYZ (2016)""",

    # 2. Underwhelming candidate: entry-level data analyst profile
    "Underwhelming" : """Jane Smith  
    456 Elm St, City, State, 12345  
    Email: janesmith@example.com | Phone: (234) 567-8901  

    Objective: Entry-level data analyst eager to apply basic data analysis skills in a professional setting.  

    Experience:  
    - Data Entry Clerk, ABC Corp (2022-Present)  
      - Input and processed customer data in Excel.  
      - Assisted in generating basic reports.  

    Skills:  
    - Excel, Basic SQL  
    - Basic Python (Pandas, NumPy)  
    - Data cleaning and entry  

    Education:  
    - B.A. in Business Administration, University of ABC (2021)""",

    # 3. Good fit candidate: data scientist profile
    "Good fit" : """Michael Johnson  
    789 Oak St, City, State, 12345  
    Email: michaeljohnson@example.com | Phone: (345) 678-9012  

    Objective: Data Scientist with a passion for leveraging machine learning to drive business insights and operational efficiency.  

    Experience:  
    - Data Scientist, XYZ Tech (2021-Present)  
      - Developed and deployed ML models for customer segmentation and fraud detection.  
      - Optimized machine learning pipelines for large-scale data processing.  
      - Collaborated with cross-functional teams to integrate AI-driven solutions.  

    Skills:  
    - Python (Scikit-learn, TensorFlow, Pandas)  
    - SQL, Data Visualization (Tableau, Matplotlib)  
    - Time-Series Analysis, Classification, Regression  

    Education:  
    - M.S. in Data Science, University of DEF (2020)""",

    # 4. Overqualified candidate: principal-level AI/ML researcher profile
    "Overqualified" : """Dr. Emily Carter  
    321 Maple St, City, State, 12345  
    Email: emilycarter@example.com | Phone: (456) 789-0123  

    Objective: AI/ML researcher seeking to lead high-impact projects in advanced machine learning and deep learning applications.  

    Experience:  
    - Principal Data Scientist, Global AI Labs (2018-Present)  
      - Designed and implemented cutting-edge deep learning models for autonomous systems.  
      - Led a team of data scientists and engineers in developing scalable AI solutions.  
      - Published multiple papers in top-tier AI conferences.  

    Skills:  
    - Deep Learning (Transformer models, GANs)  
    - Big Data (Spark, Hadoop), Cloud ML Deployment (AWS, GCP)  
    - Advanced Statistical Modeling, Bayesian Inference  

    Education:  
    - Ph.D. in Machine Learning, MIT (2015)"""
}

## Testing different similarity metrics

In [6]:
# Compute similarity scores for every sample resume against the job description.

# The job description is identical for every resume, so preprocess it once.
job_data = preprocess_text(job_description)

# One dict of metric values per resume, in the iteration order of `resumes`.
scores = []
for fit, resume_text in resumes.items():
    resume_data = preprocess_text(resume_text)

    # Resume is the candidate/first text, job description the reference/second.
    bert_scores = compute_bertscore(resume_data, job_data)
    cosine_score = compute_cosine_similarity(resume_data, job_data)
    cosine_score_with_embeddings = compare_job_resume(resume_data, job_data)
    jaccard_score = compute_jaccard_similarity(resume_data, job_data)

    # Keep the numbers so later cells can tabulate or plot them instead of
    # re-parsing the printed report.
    scores.append({
        "fit": fit,
        "bert_scores": bert_scores,
        "cosine_tfidf": cosine_score,
        "cosine_embeddings": cosine_score_with_embeddings,
        "jaccard": jaccard_score,
    })

    print("Resume fit category: ", fit)
    print("- Bert Scores: ", bert_scores)
    print("- Cosine Similarity with tfidf: ", cosine_score)
    print("- Cosine Similarity with sentence embeddings: ", cosine_score_with_embeddings)
    print("- Jaccard Similarity: ", jaccard_score)

    print("\n***************************************************************************\n")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Resume fit category:  Complete Mismatch
- Bert Scores:  [0.7952219247817993, 0.8138419389724731, 0.8044242262840271]
- Cosine Similarity with tfidf:  0.01647419402894092
- Cosine Similarity with sentence embeddings:  0.37291026
- Jaccard Similarity:  0.01652892561983471

***************************************************************************



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Resume fit category:  Underwhelming
- Bert Scores:  [0.7990970611572266, 0.8183722496032715, 0.8086197972297668]
- Cosine Similarity with tfidf:  0.11258922797674731
- Cosine Similarity with sentence embeddings:  0.6030619
- Jaccard Similarity:  0.06086956521739131

***************************************************************************



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Resume fit category:  Good fit
- Bert Scores:  [0.8329084515571594, 0.8659994602203369, 0.84913170337677]
- Cosine Similarity with tfidf:  0.3316573301657758
- Cosine Similarity with sentence embeddings:  0.6797707
- Jaccard Similarity:  0.22807017543859648

***************************************************************************



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Resume fit category:  Overqualified
- Bert Scores:  [0.8102753162384033, 0.8361140489578247, 0.8229919672012329]
- Cosine Similarity with tfidf:  0.20017990502141964
- Cosine Similarity with sentence embeddings:  0.60573125
- Jaccard Similarity:  0.09090909090909091

***************************************************************************

