#This is a project that analyzes candidate CVs and job descriptions, and returns the top 5 CVs per JD.

# PDF data extractor
Here we are prepping the CVs and extracting relevant data (skills, education, job role).

### Importing required libraries:

In [27]:
!pip install pdfplumber



In [28]:
import pdfplumber as plum
import re, spacy, glob, os

In [29]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


### Parsing through the PDFs and preprocessing the text:

In [30]:
def extracting_text(pdf):
    text = ""

    for i in range(len(pdf.pages)):
        pg = pdf.pages[i]
        text += pg.extract_text()

    return text

In [31]:
def preprocessing_text(text):

    text = text.replace("\n", " ").strip()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = text.lower()

    return text

### Getting the job category of the CV:

In [32]:
def get_category(text):

     return text.split("\n")[0]

### Getting the educational qualifications from the CV:

In [33]:
def get_education(text):

    keywords = [
        "High School", "Certificate", "Associate", "Diploma", "High School Diploma", "GED", "Undergraduate", "UG", "PG"
        "B.A.", "BA", "B.S.", "BS", "B.Sc.", "BSc", "B.Engg", "B.Eng.", "BTech", "B.Tech", "Bachelor", "Graduate",
        "M.A.", "M.S.", "M.Sc.", "M.Eng.", "MBA", "Master", "Postgraduate",
        "Ph.D.", "Doctorate", "Doctor", "Doctor of Medicine", "Doctor of Science"
    ]

    edu = []

    for i in keywords:
        pattern = r"(?i)\b{}\b".format(re.escape(i))
        match = re.search(pattern, text)
        if match: edu.append(match.group())

    return edu

### Getting the skills from the CV:

In [34]:
nlp = spacy.load("en_core_web_sm")


def get_skills(text):
    skills = []

    for word in nlp(text):
        if "NN" in word.tag_: skills.append(word.text)

    return list(set(skills))

### Uploading the CV dataset as a ZIP file and then extracting the CVs:

In [35]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [36]:
# location/path of zip file in colab runtime storage
resume_folder = "/content/drive/MyDrive/resumes.zip"

if zipfile.is_zipfile(resume_folder):
    with zipfile.ZipFile(resume_folder, 'r') as zip_ref:
        zip_ref.extractall()
        path = os.getcwd() + "/" + os.path.splitext(os.path.basename(resume_folder))[0]
else:
    print("# ERROR: Please upload a valid ZIP file with resume PDFs.")

### Storing CV data embeddings:

In [38]:
! pip install tqdm



In [41]:
import tqdm

skill_emb = []
edu_emb = []
categories = []

pdf_directory = "/content/resumes/"
# In the above variable, you have to paste the path of the extracted folder (PDF dataset folder) from Colab runtime storage

all_resumes = glob.glob(os.path.join(pdf_directory, "*.pdf"))

# Use tqdm to add a progress bar
for pdf in tqdm.tqdm(all_resumes):  # Use tqdm.tqdm instead of enumerate
    currPDF = plum.open(pdf)
    text = extracting_text(currPDF)
    text = preprocessing_text(text)

    skill_emb.append(get_embeddings(get_skills(text)))
    edu_emb.append(get_embeddings(get_education(text)))
    categories.append(get_category(text))


100%|██████████| 2484/2484 [1:11:23<00:00,  1.72s/it]


# Job Description analyzer:
Here we are working with 15 job descriptions from the given Hugging Face dataset.

### Loading the dataset:

In [42]:
!pip install datasets



In [43]:
from datasets import load_dataset

In [44]:
jd_dataset = load_dataset("jacob-hugging-face/job-descriptions")

### Saving the features of 15 job roles in a dictionary:

In [47]:
tempDict = {
    "company_name" : [],
    "position" : [],
    "req_skills" : [],
    "req_edu" : []
}


# jobs range
jobs = 15


for i in tqdm.tqdm(range(jobs)):
    # appending company name
    tempDict["company_name"].append(jd_dataset["train"][i]["company_name"])

    # appending offered position
    tempDict["position"].append(jd_dataset["train"][i]["position_title"])


    model_response = eval(jd_dataset["train"][i]["model_response"])

    # appending required skills
    tempDict["req_skills"].append(model_response["Required Skills"])

    # appending required education
    tempDict["req_edu"].append(model_response["Educational Requirements"])



100%|██████████| 15/15 [00:00<00:00, 869.79it/s]


### Tokenizing the skills required for a given job description:

In [62]:
JD_req_skills_embeddings_list = []
JD_req_edu_embeddings_list = []
JD_position_list = []

for i in tqdm.tqdm(range(jobs)):

    JD_req_skills_embeddings_list.append(get_embeddings(tempDict["req_skills"][i]))

    JD_req_edu_embeddings_list.append(get_embeddings(tempDict["req_edu"][i]))

    JD_position_list.append(tempDict["position"][i])


100%|██████████| 15/15 [00:14<00:00,  1.07it/s]


# Creating embeddings using DistilBERT and calculating cosine similarity:
Now we will proceed to use DistilBERT to create the embeddings we require and then use them to calculate cosine similarities that will give us an idea of how relevant the given CV is to the job role applied for.

### Installing the Transformers library and importing libraries:

In [63]:
!pip install transformers



In [64]:
import torch
import numpy as np
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Creating embeddings:

In [65]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')


def get_embeddings(tokenized_array):
    tokenized_array_input = " ".join(tokenized_array)
    tokenized_array_encoding = tokenizer(tokenized_array_input, return_tensors="pt", padding=True, truncation=True)

    with torch.no_grad():
        tokenized_array_embedding = model(**tokenized_array_encoding).last_hidden_state.mean(dim=1).numpy()

    return tokenized_array_embedding

### Cosine similarity calculation for Education and Skills:

In [66]:
def get_cosine_similarity(JD_req_skills_embeddings, JD_req_edu_embeddings, CV_skills_embeddings, CV_edu_embeddings):

    skills_score = cosine_similarity(JD_req_skills_embeddings, CV_skills_embeddings)
    edu_score = cosine_similarity(JD_req_edu_embeddings, CV_edu_embeddings)
    return (skills_score + edu_score) / 2.0

### Cosine similarity calculation for Category (job role) matching:

In [67]:
tfidf_vectorizer = TfidfVectorizer()

def get_cosine_sim_for_category(position, category):

    tfidf_matrix = tfidf_vectorizer.fit_transform([position, category])
    cosine_sim = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])
    return cosine_sim[0][0]

### Importing libraries:

In [68]:
import zipfile, os

# Driver function for the project:
This section of the code brings together all the aforementioned functions to create a functional CV-to-JD matcher.

### main() function:

In [71]:
def main():

    for i in tqdm.tqdm(range(jobs)):
        CV_and_score = []

        for j in range(len(all_resumes)):

            # calculating scores
            skill_edu_score = get_cosine_similarity(JD_req_skills_embeddings_list[i], JD_req_edu_embeddings_list[i], skill_emb[j], edu_emb[j])
            pos_score = get_cosine_sim_for_category(JD_position_list[i], categories[j])
            final_score = (skill_edu_score + pos_score) / 2.0
            CV_and_score.append((final_score[0][0], os.path.basename(all_resumes[j])))

        CV_and_score.sort(key=lambda x: x[0], reverse=True)
        top_scores = [score for score, _ in CV_and_score[:5]]
        top_CVs = [filename for _, filename in CV_and_score[:5]]


        print(f'\nCompany: {tempDict["company_name"][i]} ({JD_position_list[i]}): \nTop 5 CVs: {top_CVs} \nCorresponding Scores: {top_scores}\n')


Scores have been left as is, in the previous cell. They can of course be modified to suit your needs.

### Calling main() function

In [72]:
if __name__ == "__main__" : main()

  7%|▋         | 1/15 [00:16<03:54, 16.78s/it]


Company: Google (Sales Specialist): 
Top 5 CVs: ['10289113.pdf', '34131484.pdf', '24767027.pdf', '12082377.pdf', '30608780.pdf'] 
Corresponding Scores: [0.33589116, 0.33202514, 0.32912737, 0.32709986, 0.32338458]



 13%|█▎        | 2/15 [00:30<03:12, 14.84s/it]


Company: Apple (Apple Solutions Consultant): 
Top 5 CVs: ['15535920.pdf', '38457612.pdf', '15119529.pdf', '64017585.pdf', '26291616.pdf'] 
Corresponding Scores: [0.3421532, 0.33897674, 0.33826032, 0.33602178, 0.3348015]



 20%|██        | 3/15 [00:43<02:51, 14.28s/it]


Company: Netflix (Licensing Coordinator - Consumer Products): 
Top 5 CVs: ['10480456.pdf', '51018476.pdf', '50328713.pdf', '15858254.pdf', '87867370.pdf'] 
Corresponding Scores: [0.33528876, 0.3334325, 0.33145708, 0.33035538, 0.33005083]



 27%|██▋       | 4/15 [00:58<02:38, 14.41s/it]


Company: Robert Half (Web Designer): 
Top 5 CVs: ['13807808.pdf', '93828034.pdf', '29147100.pdf', '62312955.pdf', '32532982.pdf'] 
Corresponding Scores: [0.35622367, 0.3560481, 0.35599315, 0.3549928, 0.35468218]



 33%|███▎      | 5/15 [01:12<02:24, 14.42s/it]


Company: TrackFive (Web Developer): 
Top 5 CVs: ['43311839.pdf', '22351830.pdf', '93828034.pdf', '35990852.pdf', '17823436.pdf'] 
Corresponding Scores: [0.36084867, 0.34385094, 0.341695, 0.34035295, 0.34009996]



 40%|████      | 6/15 [01:27<02:09, 14.38s/it]


Company: DesignUps (Frontend Web Developer): 
Top 5 CVs: ['43311839.pdf', '51018476.pdf', '35990852.pdf', '12415691.pdf', '44115326.pdf'] 
Corresponding Scores: [0.34570694, 0.3386988, 0.338319, 0.3368676, 0.33641312]



 47%|████▋     | 7/15 [01:41<01:54, 14.34s/it]


Company: Equisolve, Inc. (Remote Website Designer): 
Top 5 CVs: ['51018476.pdf', '32532982.pdf', '13807808.pdf', '14413257.pdf', '13014900.pdf'] 
Corresponding Scores: [0.34892586, 0.3485852, 0.3465865, 0.345316, 0.34213883]



 53%|█████▎    | 8/15 [01:56<01:42, 14.59s/it]


Company: Zander Insurance Agency (Web Designer): 
Top 5 CVs: ['93828034.pdf', '32532982.pdf', '14413257.pdf', '13807808.pdf', '13014900.pdf'] 
Corresponding Scores: [0.35623568, 0.3552892, 0.3537671, 0.35243168, 0.35243148]



 60%|██████    | 9/15 [02:11<01:27, 14.63s/it]


Company: Tuff (Web Designer): 
Top 5 CVs: ['93828034.pdf', '32532982.pdf', '14413257.pdf', '13807808.pdf', '13014900.pdf'] 
Corresponding Scores: [0.35794306, 0.35656658, 0.35544786, 0.35465357, 0.35365656]



 67%|██████▋   | 10/15 [02:25<01:11, 14.38s/it]


Company: General Dynamics Information Technology (SR. Web Designer): 
Top 5 CVs: ['29524570.pdf', '51018476.pdf', '78149576.pdf', '93828034.pdf', '69243180.pdf'] 
Corresponding Scores: [0.29126108, 0.2848642, 0.28183237, 0.27538842, 0.2751118]



 73%|███████▎  | 11/15 [02:39<00:57, 14.28s/it]


Company: Sony Music Entertainment (Web Developer): 
Top 5 CVs: ['43311839.pdf', '22351830.pdf', '93828034.pdf', '17823436.pdf', '35990852.pdf'] 
Corresponding Scores: [0.35823363, 0.3417488, 0.3402963, 0.3386415, 0.33855093]



 80%|████████  | 12/15 [02:53<00:42, 14.27s/it]


Company: Snapshot Interactive (Web Developer): 
Top 5 CVs: ['43311839.pdf', '93828034.pdf', '29524570.pdf', '16186411.pdf', '36758947.pdf'] 
Corresponding Scores: [0.28815103, 0.2793807, 0.27754253, 0.27396056, 0.27330464]



 87%|████████▋ | 13/15 [03:08<00:28, 14.40s/it]


Company: Deloitte (Senior UI Designer): 
Top 5 CVs: ['13807808.pdf', '32532982.pdf', '51018476.pdf', '14413257.pdf', '29764492.pdf'] 
Corresponding Scores: [0.3583517, 0.35722688, 0.351238, 0.35001788, 0.34998867]



 93%|█████████▎| 14/15 [03:23<00:14, 14.54s/it]


Company: Themesoft Inc (Wordpress Web Developer): 
Top 5 CVs: ['28109594.pdf', '37001381.pdf', '29524570.pdf', '22232367.pdf', '51018476.pdf'] 
Corresponding Scores: [0.25826812, 0.25681317, 0.25679633, 0.25552374, 0.25547096]



100%|██████████| 15/15 [03:38<00:00, 14.55s/it]


Company: Western Governors University (UI Web Designer): 
Top 5 CVs: ['51018476.pdf', '32532982.pdf', '37664296.pdf', '13807808.pdf', '43311839.pdf'] 
Corresponding Scores: [0.34856406, 0.34833303, 0.34728515, 0.3459283, 0.34507674]






Execution will begin from the above line of code.