Load the Data

In [3]:
import pandas as pd

# Load cleaned resumes
resumes_df = pd.read_csv("../data/resumes_cleaned.csv")

# Load JD dataset
jd_df = pd.read_csv("../data/job_descriptions.csv")

# Fail fast, with a readable message, if a column that later cells index by
# name is missing from either file.
assert {"Resume", "Cleaned_Resume"} <= set(resumes_df.columns), \
    "resumes CSV is missing 'Resume' and/or 'Cleaned_Resume'"
assert {"Category", "JD_Text"} <= set(jd_df.columns), \
    "job descriptions CSV is missing 'Category' and/or 'JD_Text'"

# Check data shape
print(f"Resumes: {resumes_df.shape}")
print(f"Job Descriptions: {jd_df.shape}")

# Preview samples. display() renders each frame as its own table; a bare
# `a, b` tuple as the last expression falls back to a hard-to-read text repr.
display(resumes_df.head(2), jd_df.head(2))



Resumes: (962, 3)
Job Descriptions: (75, 2)


(       Category                                             Resume  \
 0  Data Science  Skills * Programming Languages: Python (pandas...   
 1  Data Science  Education Details \r\nMay 2013 to May 2017 B.E...   
 
                                       Cleaned_Resume  
 0  skill programming language python panda numpy ...  
 1  education detail             uit rgpv data sci...  ,
          Category                                            JD_Text
 0  Java Developer  **Job Title**: Java Backend Developer\n**Locat...
 1  Java Developer  **Job Title**: Full Stack Java Developer\n**Lo...)

Import spaCy and Load Model

In [4]:
import spacy
import re

# Load spaCy's small English pipeline. `nlp` is the callable that
# clean_text_spacy() uses to tokenize text and read each token's lemma and
# stop-word flag.
# NOTE(review): presumably the "en_core_web_sm" model package has to be
# installed separately before this call succeeds — confirm in the setup docs.
nlp = spacy.load("en_core_web_sm")


Define the Cleaning Function

In [5]:
# Text cleaner
def clean_text_spacy(text):
    """Normalize raw text into a single-space-separated string of lemmas.

    Steps: lowercase, replace every run of non-letter characters with one
    space, run the module-level spaCy pipeline ``nlp``, then keep the lemma of
    each token that is not a stop word, is not whitespace, and whose text is
    longer than two characters.

    Args:
        text: The raw text. Non-string values (``None``, or the float NaN that
            pandas uses for missing cells) are treated as empty text.

    Returns:
        The cleaned text as a ``str``; ``''`` when there is nothing to clean.
    """
    # Missing cells arrive as None/NaN, which have no .lower(); treat as empty.
    if not isinstance(text, str):
        return ''

    # Collapse each run of non-letters (digits, punctuation, \n, \r, \t, ...)
    # into ONE space. Replacing character-by-character leaves runs of spaces,
    # which spaCy emits as whitespace tokens; runs longer than two characters
    # would pass the length filter below and leak into the output.
    text = re.sub(r'[^a-z]+', ' ', text.lower()).strip()
    if not text:
        return ''

    doc = nlp(text)
    tokens = [
        token.lemma_ for token in doc
        if not (token.is_stop or token.is_space) and len(token.text) > 2
    ]
    return ' '.join(tokens)


Apply Cleaning to JD Text

Clean and Add Cleaned_JD Column

In [6]:
# Run every raw job description through the spaCy cleaner and store the result
# next to the original text.
jd_df['Cleaned_JD'] = [clean_text_spacy(raw_jd) for raw_jd in jd_df['JD_Text']]

# Spot-check the first two cleaned descriptions.
jd_df.loc[:, ['Category', 'Cleaned_JD']].head(2)


Unnamed: 0,Category,Cleaned_JD
0,Java Developer,job title java backend developer location ...
1,Java Developer,job title stack java developer location ...


Make JD Categories Unique


In [7]:
# Several JDs share one category, so suffix each category with its row index
# ("<Category>_<index>") to get a distinct label per JD.
row_suffix = "_" + jd_df.index.astype(str)
jd_df['Unique_Category'] = jd_df['Category'] + row_suffix

# Plain list of those labels; used as the similarity matrix's column names.
jd_labels = list(jd_df['Unique_Category'])



Load Pretrained SentenceTransformer

In [8]:
from sentence_transformers import SentenceTransformer

# Load the pretrained 'all-MiniLM-L6-v2' sentence-embedding model (chosen by
# the author as a fast, lightweight option for similarity work). This single
# `model` instance encodes both the resumes and the job descriptions in the
# cells below.
model = SentenceTransformer('all-MiniLM-L6-v2')


  from .autonotebook import tqdm as notebook_tqdm


Generate BERT Resume Embeddings

In [9]:
# Get cleaned resume text list.
# This column was read back from CSV, where an empty cleaned resume round-trips
# as NaN (a float). Coerce missing values to '' so every item handed to the
# encoder is a string.
resume_texts = resumes_df['Cleaned_Resume'].fillna('').astype(str).tolist()

# Generate embeddings (one vector per resume, in row order)
resume_embeddings = model.encode(resume_texts, show_progress_bar=True)


Batches: 100%|██████████| 31/31 [00:07<00:00,  4.26it/s]


Generate BERT JD Embeddings

In [10]:
# Collect the cleaned job descriptions as a plain Python list, in row order.
jd_texts = list(jd_df['Cleaned_JD'])

# Encode them with the same sentence-embedding model used for the resumes.
jd_embeddings = model.encode(jd_texts, show_progress_bar=True)


Batches: 100%|██████████| 3/3 [00:00<00:00,  9.07it/s]


Compute Cosine Similarity and Create DataFrame

In [11]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Pairwise cosine similarity, one row per resume and one column per JD,
# rounded to four decimals.
similarity_matrix = np.round(cosine_similarity(resume_embeddings, jd_embeddings), 4)

# Short human-readable identifier per resume: first 100 characters plus "...".
resume_previews = resumes_df['Resume'].apply(lambda text: text[:100] + '...')

# Label the score columns with the unique JD names, then place the preview
# in the first column.
similarity_df = pd.DataFrame(similarity_matrix, columns=jd_labels)
similarity_df.insert(0, 'Resume Preview', resume_previews)

# Preview
similarity_df.head()



Unnamed: 0,Resume Preview,Java Developer_0,Java Developer_1,Java Developer_2,DevOps Engineer_3,DevOps Engineer_4,DevOps Engineer_5,Python Developer_6,Python Developer_7,Python Developer_8,...,PMO_65,HR_66,HR_67,HR_68,Arts_69,Arts_70,Arts_71,ETL Developer_72,ETL Developer_73,ETL Developer_74
0,Skills * Programming Languages: Python (pandas...,0.3915,0.3392,0.4121,0.4429,0.4301,0.3702,0.4548,0.5501,0.5071,...,0.4064,0.4656,0.4597,0.4464,0.431,0.4178,0.4258,0.3931,0.3987,0.3941
1,Education Details \r\nMay 2013 to May 2017 B.E...,0.3897,0.3704,0.4749,0.4441,0.4116,0.3821,0.4291,0.572,0.5456,...,0.4325,0.4906,0.4822,0.4554,0.4822,0.4284,0.4509,0.3995,0.4126,0.398
2,"Areas of Interest Deep Learning, Control Syste...",0.3733,0.3143,0.4364,0.3303,0.3537,0.2923,0.4047,0.4598,0.4794,...,0.4328,0.4438,0.4315,0.418,0.3928,0.3909,0.3975,0.3487,0.35,0.3483
3,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,0.3952,0.3145,0.4127,0.3751,0.3747,0.3034,0.4307,0.4509,0.4575,...,0.4625,0.4786,0.4638,0.4858,0.4429,0.4384,0.4661,0.4263,0.4365,0.4352
4,"Education Details \r\n MCA YMCAUST, Faridab...",0.5317,0.4772,0.633,0.4628,0.4643,0.3991,0.4928,0.5586,0.6065,...,0.5399,0.5508,0.5626,0.5347,0.5226,0.5069,0.5059,0.4599,0.4819,0.4695


Save Full Similarity Matrix

In [12]:
# Persist the full resume-by-JD score table: the 'Resume Preview' column plus
# one score column per JD label. index=False keeps the positional row index
# out of the file.
similarity_df.to_csv("../data/resume_jd_similarity_scores.csv", index=False)
print("✅ Saved full similarity matrix")



✅ Saved full similarity matrix


Get Top 3 JD Matches per Resume

In [14]:
def get_top_matches_per_resume(sim_df, top_n=3):
    """Return the best-scoring job descriptions for every resume.

    Args:
        sim_df: DataFrame with a 'Resume Preview' column; every other column is
            treated as one JD's similarity scores (expected to be numeric).
        top_n: Maximum number of JDs to report per resume. When it exceeds the
            number of JD columns, only the available JDs are reported.

    Returns:
        DataFrame with one row per resume and the columns 'Resume Preview',
        'Top1_JD', 'Top1_Score', ..., up to 'Top<k>_JD', 'Top<k>_Score', where
        k = min(top_n, number of JD columns). Ties are resolved in favour of
        the JD column that appears first in ``sim_df``.

    Raises:
        KeyError: if ``sim_df`` has no 'Resume Preview' column.
    """
    # Pick JD columns by name rather than by position, so the result does not
    # depend on where the preview column sits in the frame.
    jd_cols = [col for col in sim_df.columns if col != "Resume Preview"]

    # Cap the rank count: asking for more ranks than there are JD columns used
    # to raise IndexError while filling the Top<j> slots.
    n_ranks = max(0, min(top_n, len(jd_cols)))

    top_matches = []
    score_rows = sim_df[jd_cols].iterrows()
    for preview, (_, scores) in zip(sim_df["Resume Preview"], score_rows):
        result = {"Resume Preview": preview}
        if n_ranks:
            # nlargest returns the scores in descending order and keeps the
            # first occurrence on ties, which makes the ranking deterministic.
            best = scores.nlargest(n_ranks)
            for rank, (jd_label, score) in enumerate(best.items(), start=1):
                result[f"Top{rank}_JD"] = jd_label
                result[f"Top{rank}_Score"] = score
        top_matches.append(result)

    # Spell the columns out so an empty input still yields the full schema.
    columns = ["Resume Preview"]
    for rank in range(1, n_ranks + 1):
        columns += [f"Top{rank}_JD", f"Top{rank}_Score"]
    return pd.DataFrame(top_matches, columns=columns)

# Rank the three best JDs for every resume, write that table to disk (without
# the positional row index), and preview the first rows.
top_resume_matches_df = get_top_matches_per_resume(similarity_df, top_n=3)
top_resume_matches_df.to_csv("../data/top_jd_matches_per_resume.csv", index=False)
top_resume_matches_df.head()



Unnamed: 0,Resume Preview,Top1_JD,Top1_Score,Top2_JD,Top2_Score,Top3_JD,Top3_Score
0,Skills * Programming Languages: Python (pandas...,Data Science_49,0.5502,Python Developer_7,0.5501,Data Science_48,0.5295
1,Education Details \r\nMay 2013 to May 2017 B.E...,Python Developer_7,0.572,Data Science_48,0.5531,Data Science_49,0.5523
2,"Areas of Interest Deep Learning, Control Syste...",Python Developer_8,0.4794,Operations Manager_11,0.4649,Operations Manager_10,0.4616
3,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,Data Science_49,0.5564,Database_47,0.5345,Data Science_48,0.5332
4,"Education Details \r\n MCA YMCAUST, Faridab...",Data Science_50,0.665,Data Science_48,0.6542,Data Science_49,0.6343


Top 3 Resumes per JD

In [15]:
def get_top_matches_per_jd(sim_df, top_n=3):
    """Return the best-scoring resumes for every job description.

    Args:
        sim_df: DataFrame with a 'Resume Preview' column; every other column is
            treated as one JD's similarity scores (expected to be numeric).
        top_n: Maximum number of resumes to report per JD. Values below zero
            are treated as zero.

    Returns:
        Long-format DataFrame with the columns 'JD Category', 'Resume Preview'
        and 'Score': up to ``top_n`` rows per JD, JDs in column order, and
        scores descending within a JD. Ties are resolved in favour of the
        resume that appears first in ``sim_df``.

    Raises:
        KeyError: if ``sim_df`` has no 'Resume Preview' column.
    """
    # Pick JD columns by name rather than by position, so the result does not
    # depend on where the preview column sits in the frame.
    jd_cols = [col for col in sim_df.columns if col != "Resume Preview"]
    n_rows = max(top_n, 0)

    jd_top_matches = []
    for jd in jd_cols:
        # Select only the top rows instead of fully sorting the whole frame
        # once per JD; keep='first' (the default) makes tie order stable.
        best = sim_df[["Resume Preview", jd]].nlargest(n_rows, jd)
        for preview, score in zip(best["Resume Preview"], best[jd]):
            jd_top_matches.append({
                "JD Category": jd,
                "Resume Preview": preview,
                "Score": score
            })

    # Spell the columns out so an empty result still carries the schema.
    return pd.DataFrame(jd_top_matches, columns=["JD Category", "Resume Preview", "Score"])

# Rank the three best resumes for every JD, write that table to disk (without
# the positional row index), and preview the first rows.
# NOTE(review): the saved preview shows the same resume text three times for
# one JD with an identical score, which suggests duplicate resumes in the
# input data — confirm, and consider de-duplicating before ranking.
top_resume_per_jd_df = get_top_matches_per_jd(similarity_df, top_n=3)
top_resume_per_jd_df.to_csv("../data/top_resume_matches_per_jd.csv", index=False)
top_resume_per_jd_df.head()



Unnamed: 0,JD Category,Resume Preview,Score
0,Java Developer_0,"TECHNICAL SKILLS Skills: Java, SQL, PL/SQL, C,...",0.6398
1,Java Developer_0,"TECHNICAL SKILLS Skills: Java, SQL, PL/SQL, C,...",0.6398
2,Java Developer_0,"TECHNICAL SKILLS Skills: Java, SQL, PL/SQL, C,...",0.6398
3,Java Developer_1,Technical Skills Key Skills MS Technology .Net...,0.5816
4,Java Developer_1,CORE COMPETENCIES ~ Ant ~ Maven ~ GIT ~ Bitbuc...,0.5803
