In [4]:
!pip install contractions
!pip install datasets

Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.0-py3-none-any.whl (474 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.3/474.3 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K  

In [1]:
import numpy as np
import pandas as pd

import re
import string # for text cleaning
import contractions # for expanding short form words
from tqdm import tqdm
tqdm.pandas(desc="Progress Bar")

import torch
from datasets import load_dataset
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Load Resume Data

In [7]:
df = pd.read_csv('/content/drive/MyDrive/CV-JD-Matching/pdf_extracted_skills_education.csv')
df.head()

Unnamed: 0,Skills,Education,ID,Category
0,Accounting; General Accounting; Accounts Payab...,Northern Maine Community College 1994 Associat...,10554236,ACCOUNTANT
1,"accounting, accounts payable, Accounts Receiva...","Bachelor of Science : Accounting , May 2010 Un...",10674770,ACCOUNTANT
2,"accounts payables, accounts receivables, Accou...",Computer Applications Specialist Certificate P...,11163645,ACCOUNTANT
3,"accounting, balance sheet, budgets, client, cl...","EMORY UNIVERSITY, Goizueta Business School 5 2...",11759079,ACCOUNTANT
4,Aderant/CMS Financial reporting,Bachelor of Business Administration : Accounti...,12065211,ACCOUNTANT


In [8]:
df.shape

(2484, 4)

## Loading JD Data

In [4]:
jd_data = load_dataset('jacob-hugging-face/job-descriptions', split="train")
jd_data

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

training_data.csv:   0%|          | 0.00/3.77M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/853 [00:00<?, ? examples/s]

Dataset({
    features: ['company_name', 'job_description', 'position_title', 'description_length', 'model_response'],
    num_rows: 853
})

In [5]:
jd_df = pd.DataFrame(jd_data)
jd_df.head()

Unnamed: 0,company_name,job_description,position_title,description_length,model_response
0,Google,minimum qualifications\nbachelors degree or eq...,Sales Specialist,2727,"{\n ""Core Responsibilities"": ""Responsible fo..."
1,Apple,description\nas an asc you will be highly infl...,Apple Solutions Consultant,828,"{\n ""Core Responsibilities"": ""as an asc you ..."
2,Netflix,its an amazing time to be joining netflix as w...,Licensing Coordinator - Consumer Products,3205,"{\n ""Core Responsibilities"": ""Help drive bus..."
3,Robert Half,description\n\nweb designers looking to expand...,Web Designer,2489,"{\n ""Core Responsibilities"": ""Designing webs..."
4,TrackFive,at trackfive weve got big goals were on a miss...,Web Developer,3167,"{\n ""Core Responsibilities"": ""Build and layo..."


In [6]:

print(jd_df['job_description'][0])
print(jd_df['model_response'][0])

minimum qualifications
bachelors degree or equivalent practical experience years of experience in saas or productivity tools businessexperience managing enterprise accounts with sales cycles
preferred qualifications
 years of experience building strategic business partnerships with enterprise customersability to work through and with a reseller ecosystem to scale the businessability to plan pitch and execute a territory business strategyability to build relationships and to deliver results in a crossfunctionalmatrixed environmentability to identify crosspromoting and uppromoting opportunities within the existing account baseexcellent account management writtenverbal communication strategic and analyticalthinking skills
about the job
as a member of the google cloud team you inspire leading companies schools and government agencies to work smarter with google tools like google workspace search and chrome you advocate the innovative power of our products to make organizations more product

## Basic text cleaning

And doing some preprocessing in the data as discussed in the EDA notebook

In [9]:
def text_cleaning(text:str) -> str:
    if pd.isnull(text):
        return
    text = text.lower().strip()
    translator = str.maketrans('', '', string.punctuation)
    text = contractions.fix(text)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text) # Remove URLs
    text = re.sub(r'\S+@\S+', '', text) # Remove emails
    text = re.sub(r'\b\d{1,3}[-./]?\d{1,3}[-./]?\d{1,4}\b', '', text) # Remove phone numbers
    text = text.translate(translator) # Remove puctuations
    text = re.sub(r'[^a-zA-Z]', ' ', text) # Remove other non-alphanumeric characters

    return text.strip()

In [10]:
cv_df = df[~(df['Skills'].isna() & df['Education'].isna())].reset_index(drop=True)
cv_df = cv_df.fillna(value='')
cv_df['CV'] = cv_df['Skills'] + ' ' + cv_df['Education']
cv_df['CV'] = cv_df['CV'].progress_apply(text_cleaning)

Progress Bar: 100%|██████████| 2469/2469 [00:00<00:00, 5591.07it/s]


In [11]:
cv_df.shape

(2469, 5)

In [12]:
job_descriptions = jd_df['job_description'].apply(text_cleaning)[:15].to_list() # jd_df['job_description'][:15]
resumes = cv_df['CV'].to_list()

In [13]:
print(resumes[0])

accounting general accounting accounts payable program management northern maine community college  associate  accounting city  state  usa emphasis in business  associates  accounting city  state  usa gpa gpa  accounting gpa   hours quarter attended husson college major accounting    semester hours toward bachelors degree professional military comptroller school  wk  managerial accounting i  interestedbased bargaining training for management   hrs  auditing methods and concepts  organizational leadership   hrs  management development ii   hrs


In [14]:
print(job_descriptions[0])

minimum qualifications bachelors degree or equivalent practical experience years of experience in saas or productivity tools businessexperience managing enterprise accounts with sales cycles preferred qualifications  years of experience building strategic business partnerships with enterprise customersability to work through and with a reseller ecosystem to scale the businessability to plan pitch and execute a territory business strategyability to build relationships and to deliver results in a crossfunctionalmatrixed environmentability to identify crosspromoting and uppromoting opportunities within the existing account baseexcellent account management writtenverbal communication strategic and analyticalthinking skills about the job as a member of the google cloud team you inspire leading companies schools and government agencies to work smarter with google tools like google workspace search and chrome you advocate the innovative power of our products to make organizations more product

## Creating Embedding using `DistilBertTokenizer`, `DistilBertModel`

In [15]:
%%time

# Initialize the DistilBERT tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')


# Tokenize and embed job descriptions
job_description_embeddings = []
for description in job_descriptions:
    tokens = tokenizer(description, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        output = model(**tokens)
    embeddings = output.last_hidden_state.mean(dim=1).numpy()
    job_description_embeddings.append(embeddings[0])  # Flatten the embeddings to 1D


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

CPU times: user 17.7 s, sys: 3.34 s, total: 21 s
Wall time: 33.3 s


In [16]:
# Tokenize and embed resumes
resume_embeddings = []
for resume in resumes:
    tokens = tokenizer(resume, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        output = model(**tokens)
    embeddings = output.last_hidden_state.mean(dim=1).numpy()
    resume_embeddings.append(embeddings[0])

In [None]:
save_directory = '/content/drive/MyDrive/distilbert_model'

# Save the model and tokenizer
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

('/content/drive/MyDrive/distilbert_model/tokenizer_config.json',
 '/content/drive/MyDrive/distilbert_model/special_tokens_map.json',
 '/content/drive/MyDrive/distilbert_model/vocab.txt',
 '/content/drive/MyDrive/distilbert_model/added_tokens.json')

In [17]:
job_description_embeddings[0].shape, resume_embeddings[0].shape

((768,), (768,))

In [18]:
len(job_description_embeddings), len(resume_embeddings)

(15, 2469)

## Calculating Similarity Score & Getting Top n Candidates

In [19]:
# Calculate cosine similarity between job descriptions and resumes
similarity_scores = cosine_similarity(job_description_embeddings, resume_embeddings)
similarity_scores

array([[0.82088774, 0.77677274, 0.7848917 , ..., 0.8589794 , 0.7705459 ,
        0.60554785],
       [0.7844252 , 0.7171359 , 0.7481307 , ..., 0.8324866 , 0.7884388 ,
        0.6784491 ],
       [0.81398576, 0.7895926 , 0.78363687, ..., 0.84983325, 0.7580299 ,
        0.6202629 ],
       ...,
       [0.81677645, 0.7708843 , 0.7748317 , ..., 0.884326  , 0.7708575 ,
        0.6345933 ],
       [0.8439074 , 0.77827585, 0.7971461 , ..., 0.87831897, 0.82519805,
        0.68138146],
       [0.83300304, 0.7873483 , 0.77523583, ..., 0.88907975, 0.7842255 ,
        0.64485663]], dtype=float32)

In [20]:
# Rank job descriptions for each candidate based on similarity scores
num_top_jobs = 10
top_jobs_for_candidates = []

for candidate_index in range(len(resumes)):
    job_scores = list(enumerate(similarity_scores[:, candidate_index]))
    job_scores.sort(key=lambda x: x[1], reverse=True)
    top_jobs_for_candidate = job_scores[:num_top_jobs]
    top_jobs_for_candidates.append(top_jobs_for_candidate)

# Print the top job descriptions for each candidate
for candidate_index in range(len(resumes)):
    candidate_id = cv_df['ID'][candidate_index]
    candidate_category = cv_df['Category'][candidate_index]

    print(f"\n==============================")
    print(f"Top {num_top_jobs} Job Matches for Candidate {candidate_index + 1}")
    print(f"Candidate ID: {candidate_id}, Category: {candidate_category}")
    print("==============================\n")

    for rank, (job_index, score) in enumerate(top_jobs_for_candidates[candidate_index], start=1):
        job_title = jd_df['position_title'][job_index]
        print(f"{rank}. Job Title: {job_title}")
        print(f"   Job ID: {job_index + 1}")
        print(f"   Similarity Score: {score:.4f}\n")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
3. Job Title: Web Designer
   Job ID: 8
   Similarity Score: 0.9012

4. Job Title: Frontend Web Developer
   Job ID: 6
   Similarity Score: 0.8995

5. Job Title: Web Developer
   Job ID: 12
   Similarity Score: 0.8945

6. Job Title: UI Web Designer
   Job ID: 15
   Similarity Score: 0.8944

7. Job Title: Web Developer
   Job ID: 11
   Similarity Score: 0.8936

8. Job Title: Licensing Coordinator - Consumer Products
   Job ID: 3
   Similarity Score: 0.8885

9. Job Title: SR. Web Designer
   Job ID: 10
   Similarity Score: 0.8859

10. Job Title: Senior UI Designer
   Job ID: 13
   Similarity Score: 0.8815


Top 10 Job Matches for Candidate 2362
Candidate ID: 85918100, Category: SALES

1. Job Title: Web Developer
   Job ID: 11
   Similarity Score: 0.8378

2. Job Title: Web Designer
   Job ID: 4
   Similarity Score: 0.8322

3. Job Title: Wordpress Web Developer
   Job ID: 14
   Similarity Score: 0.8215

4. Job Title: Web Desi

In [21]:
# Rank candidates for each job description based on similarity scores
num_top_candidates = 5
top_candidates = []

for i, job_description in enumerate(job_descriptions):
    candidates_with_scores = list(enumerate(similarity_scores[i]))
    candidates_with_scores.sort(key=lambda x: x[1], reverse=True)
    top_candidates_for_job = candidates_with_scores[:num_top_candidates]
    top_candidates.append(top_candidates_for_job)

# Print the top candidates for each job description
for i, job_description in enumerate(job_descriptions):
    job_title = jd_df['position_title'][i]

    print(f"\n==============================")
    print(f"Top Candidates for Job Description {i + 1}")
    print(f"Position: {job_title}")
    print("==============================\n")

    for rank, (candidate_index, score) in enumerate(top_candidates[i], start=1):
        candidate_id = cv_df['ID'][candidate_index]
        candidate_category = cv_df['Category'][candidate_index]

        print(f"{rank}. Candidate {candidate_index + 1}")
        print(f"   Similarity Score: {score:.4f}")
        print(f"   Candidate ID: {candidate_id}, Category: {candidate_category}")
        print(f"   Resume Link: {candidate_category}/{candidate_id}.pdf\n")



Top Candidates for Job Description 1
Position: Sales Specialist

1. Candidate 1949
   Similarity Score: 0.9415
   Candidate ID: 18827609, Category: HR
   Resume Link: HR/18827609.pdf

2. Candidate 291
   Similarity Score: 0.9388
   Candidate ID: 62994611, Category: AGRICULTURE
   Resume Link: AGRICULTURE/62994611.pdf

3. Candidate 478
   Similarity Score: 0.9387
   Candidate ID: 43622023, Category: ARTS
   Resume Link: ARTS/43622023.pdf

4. Candidate 28
   Similarity Score: 0.9377
   Candidate ID: 16237710, Category: ACCOUNTANT
   Resume Link: ACCOUNTANT/16237710.pdf

5. Candidate 1803
   Similarity Score: 0.9314
   Candidate ID: 10466208, Category: HEALTHCARE
   Resume Link: HEALTHCARE/10466208.pdf


Top Candidates for Job Description 2
Position: Apple Solutions Consultant

1. Candidate 168
   Similarity Score: 0.9236
   Candidate ID: 22391901, Category: ADVOCATE
   Resume Link: ADVOCATE/22391901.pdf

2. Candidate 904
   Similarity Score: 0.9165
   Candidate ID: 95382114, Category: B