# This is a project that analyzes candidate CVs and job descriptions and returns the top 5 CVs per JD.

# PDF data extractor
Here we are prepping the CVs and extracting relevant data (skills, education, job role).

### Importing required libraries:

In [27]:
!pip install pdfplumber



In [28]:
import pdfplumber as plum
import re, spacy, glob, os

In [29]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl (12.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 20.0 MB/s eta 0:00:00
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


### Parsing through the PDFs and preprocessing the text:

In [30]:
def extracting_text(pdf):
    """Concatenate the extracted text of every page of an open PDF.

    Args:
        pdf: An object exposing a ``pages`` sequence whose items provide an
            ``extract_text()`` method (here, a document opened with
            pdfplumber).

    Returns:
        str: The text of all pages joined in page order, with no separator
        inserted between pages. Empty string when there are no pages.
    """
    # extract_text() can hand back None for a page without extractable text
    # (e.g. a scanned image page in some pdfplumber releases); treat that as
    # an empty string so the concatenation never raises TypeError.
    return "".join(page.extract_text() or "" for page in pdf.pages)

In [31]:
def preprocessing_text(text):
    """Normalize raw text into a lowercase, punctuation-free string.

    The steps run in this order: every newline becomes a space, leading and
    trailing whitespace is stripped, every character that is not an ASCII
    letter, a digit or whitespace is removed, and the result is lowercased.

    Args:
        text: The string to normalize.

    Returns:
        str: The normalized text.
    """
    single_line = text.replace("\n", " ").strip()
    alphanumeric_only = re.sub(r'[^a-zA-Z0-9\s]', '', single_line)
    return alphanumeric_only.lower()

### Getting the job category of the CV:

In [32]:
def get_category(text):
    """Return the first line of ``text``.

    The rest of the notebook uses this first line as the CV's job category.

    Args:
        text: The CV text.

    Returns:
        str: Everything before the first newline, or the whole string when it
        contains no newline.
    """
    first_line, _, _ = text.partition("\n")
    return first_line

### Getting the educational qualifications from the CV:

In [33]:
def get_education(text):
    """Find the education-related keywords that occur in ``text``.

    Each keyword is searched case-insensitively as a standalone term: it must
    not be glued to a preceding word character, and keywords that end in a
    letter or digit must not be followed by one either. Keywords ending in
    "." (such as "B.Sc.") carry no trailing check, because a regex ``\\b``
    placed after a dot only matches when a word character follows, which
    would miss the common "B.Sc. in Physics" form.

    Args:
        text: The CV text to scan.

    Returns:
        list[str]: For every keyword found, the first matching substring as
        it is spelled in ``text``, in keyword-list order. Empty when nothing
        matches.
    """
    keywords = [
        "High School", "Certificate", "Associate", "Diploma", "High School Diploma", "GED", "Undergraduate", "UG", "PG",
        "B.A.", "BA", "B.S.", "BS", "B.Sc.", "BSc", "B.Engg", "B.Eng.", "BTech", "B.Tech", "Bachelor", "Graduate",
        "M.A.", "M.S.", "M.Sc.", "M.Eng.", "MBA", "Master", "Postgraduate",
        "Ph.D.", "Doctorate", "Doctor", "Doctor of Medicine", "Doctor of Science"
    ]

    edu = []

    for keyword in keywords:
        # Leading guard: do not match in the middle of a longer word.
        pattern = r"(?<!\w)" + re.escape(keyword)
        # Trailing guard only makes sense after a word character; after a
        # "." it would reject the keyword whenever a space or EOL follows.
        if keyword[-1].isalnum():
            pattern += r"(?!\w)"

        match = re.search(pattern, text, flags=re.IGNORECASE)
        if match:
            edu.append(match.group())

    return edu

### Getting the skills from the CV:

In [34]:
# English spaCy pipeline, loaded once and shared by get_skills().
nlp = spacy.load("en_core_web_sm")


def get_skills(text):
    """Collect the distinct noun tokens of ``text`` as candidate skills.

    A token is kept when its fine-grained tag contains the substring "NN"
    (for this English model that presumably covers the noun tags NN, NNS,
    NNP and NNPS).

    Args:
        text: The CV text to analyze.

    Returns:
        list[str]: The unique token texts, in no particular order.
    """
    noun_tokens = {token.text for token in nlp(text) if "NN" in token.tag_}
    return list(noun_tokens)

### Uploading the CV dataset as a ZIP file and then extracting the CVs:

In [35]:
# Mount Google Drive inside the Colab runtime at /content/drive so the
# resume archive stored under MyDrive can be read from the file system.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [36]:
# Imported here so this cell also works when the notebook is run top to
# bottom (the notebook's other zipfile import sits in a much later cell).
import os
import zipfile

# location/path of zip file in colab runtime storage
resume_folder = "/content/drive/MyDrive/resumes.zip"

# Folder the archive is expected to unpack into; stays None when the archive
# is missing or invalid so later code can test for it instead of hitting a
# NameError.
path = None

if zipfile.is_zipfile(resume_folder):
    with zipfile.ZipFile(resume_folder, 'r') as zip_ref:
        # No target directory is given, so members are extracted into the
        # current working directory.
        zip_ref.extractall()
    # NOTE(review): assumes the archive contains a top-level folder named
    # after the ZIP file itself (e.g. "resumes") - confirm against the data.
    path = os.path.join(os.getcwd(), os.path.splitext(os.path.basename(resume_folder))[0])
else:
    print("# ERROR: Please upload a valid ZIP file with resume PDFs.")

### Storing CV data embeddings:

In [None]:
# Parallel lists: index j in each of them describes all_resumes[j].
skill_emb = []
edu_emb = []
categories = []


pdf_directory = "/content/resumes/"
#in the above variable, you have to paste the path of the extracted folder (PDF dataset folder) from Colab runtime storage

# Sorted so the resume order (and therefore tie-breaking in the ranking) is
# reproducible; glob itself returns files in arbitrary order.
all_resumes = sorted(glob.glob(os.path.join(pdf_directory, "*.pdf")))

for pdf in all_resumes:
    # The context manager closes the PDF's file handle once its text is read.
    with plum.open(pdf) as currPDF:
        raw_text = extracting_text(currPDF)
    text = preprocessing_text(raw_text)

    skill_emb.append(get_embeddings(get_skills(text)))

    # Education keywords such as "B.Sc." contain dots, which preprocessing
    # strips, so they are searched in the raw text (newlines flattened so
    # multi-word keywords can match across a line break).
    edu_emb.append(get_embeddings(get_education(raw_text.replace("\n", " "))))

    # get_category() returns the text before the first newline, but
    # preprocessing replaces every newline with a space; take the first line
    # from the raw text and normalize only that line.
    categories.append(preprocessing_text(get_category(raw_text)))

# Job Description analyzer:
Here we are working with 15 job descriptions from the given Hugging Face dataset.

### Loading the dataset:

In [None]:
!pip install datasets

In [None]:
from datasets import load_dataset

In [None]:
# Load the job-descriptions dataset by its Hugging Face repository id; the
# cells below read rows from its "train" split.
jd_dataset = load_dataset("jacob-hugging-face/job-descriptions")

### Saving the features of 15 job roles in a dictionary:

In [None]:
# Column-wise store of the job features; index i in every list belongs to
# the same job description.
tempDict = {field: [] for field in ("company_name", "position", "req_skills", "req_edu")}


# jobs range: how many rows are read from the start of the "train" split
jobs = 15


for i in range(jobs):
    # Fetch the row once and reuse it for every field.
    job_row = jd_dataset["train"][i]

    tempDict["company_name"].append(job_row["company_name"])
    tempDict["position"].append(job_row["position_title"])

    # NOTE(review): eval() executes whatever expression the dataset string
    # contains. The data comes from an external source, so a safe parser
    # (json.loads / ast.literal_eval) should be considered once the exact
    # format of "model_response" is confirmed.
    model_response = eval(job_row["model_response"])

    tempDict["req_skills"].append(model_response["Required Skills"])
    tempDict["req_edu"].append(model_response["Educational Requirements"])



### Embedding the required skills and education for each job description:

In [None]:
# Per-job lists; index i in each of them refers to the same job in tempDict.
JD_req_skills_embeddings_list = []

JD_req_edu_embeddings_list = []

JD_position_list = []

# Cover every job stored in tempDict instead of a hard-coded count: a smaller
# fixed number leaves these lists shorter than the set of collected jobs and
# makes any later loop over all jobs fail with IndexError.
jobs = len(tempDict["position"])

for i in range(jobs):

    JD_req_skills_embeddings_list.append(get_embeddings(tempDict["req_skills"][i]))

    JD_req_edu_embeddings_list.append(get_embeddings(tempDict["req_edu"][i]))

    JD_position_list.append(tempDict["position"][i])


# Creating embeddings using DistilBERT and calculating cosine similarity:
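Now we will proceed to use DistilBERT to create the embeddings we require and then use them to calculate cosine similarities that will give us an idea of how relevant the given CV is to the job role applied for. Note: `get_embeddings` is defined in this section but is called by the CV and job-description cells above, so run the cells of this section before those.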

### Installing the Transformers library and importing libraries:

In [None]:
!pip install transformers

In [None]:
import torch
import numpy as np
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Creating embeddings:

In [None]:
# Pretrained DistilBERT tokenizer and encoder, loaded once and shared by
# get_embeddings().
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')


def get_embeddings(tokenized_array):
    """Embed a collection of terms (or a single text) with DistilBERT.

    Args:
        tokenized_array: Either an iterable of strings, which are joined with
            single spaces into one text, or a plain string, which is embedded
            as it is.

    Returns:
        numpy.ndarray: The mean of the model's last hidden state over the
        token dimension (``dim=1``); presumably shaped (1, hidden_size) for
        the single text passed in.
    """
    if isinstance(tokenized_array, str):
        # Joining a plain string would insert a space between every
        # character and destroy the words, so use the text unchanged.
        tokenized_array_input = tokenized_array
    else:
        tokenized_array_input = " ".join(tokenized_array)

    # truncation=True lets the tokenizer cut texts that exceed its maximum
    # input length instead of failing inside the model.
    tokenized_array_encoding = tokenizer(tokenized_array_input, return_tensors="pt", padding=True, truncation=True)

    # Inference only: no gradients are needed for the embeddings.
    with torch.no_grad():
        tokenized_array_embedding = model(**tokenized_array_encoding).last_hidden_state.mean(dim=1).numpy()

    return tokenized_array_embedding

### Cosine similarity calculation for Education and Skills:

In [None]:
def get_cosine_similarity(JD_req_skills_embeddings, JD_req_edu_embeddings, CV_skills_embeddings, CV_edu_embeddings):
    """Average the skills and education similarity between a JD and a CV.

    Args:
        JD_req_skills_embeddings: Embedding of the job's required skills.
        JD_req_edu_embeddings: Embedding of the job's required education.
        CV_skills_embeddings: Embedding of the CV's skills.
        CV_edu_embeddings: Embedding of the CV's education.

    Returns:
        The element-wise mean of the two cosine-similarity matrices returned
        by ``cosine_similarity`` for the skills pair and the education pair.
    """
    similarity_for_skills = cosine_similarity(JD_req_skills_embeddings, CV_skills_embeddings)
    similarity_for_education = cosine_similarity(JD_req_edu_embeddings, CV_edu_embeddings)

    combined = similarity_for_skills + similarity_for_education
    return combined / 2.0

### Cosine similarity calculation for Category (job role) matching:

In [None]:
# Shared vectorizer; it is re-fitted on the two input strings at every call.
tfidf_vectorizer = TfidfVectorizer()


def get_cosine_sim_for_category(position, category):
    """Score how similar a job position title is to a CV category.

    Both strings are vectorized together with TF-IDF, and the cosine
    similarity of the two resulting rows is returned.

    Args:
        position: The job's position title.
        category: The category string taken from the CV.

    Returns:
        The single similarity value (element [0][0] of the similarity matrix).
    """
    vectors = tfidf_vectorizer.fit_transform([position, category])
    position_vector = vectors[0]
    category_vector = vectors[1]

    similarity = cosine_similarity(position_vector, category_vector)
    return similarity[0][0]

### Importing libraries:

In [None]:
import zipfile, os

# Driver function for the project:
This section of the code brings together all the aforementioned functions to create a functional CV-to-JD matcher.

### main() function:

In [None]:
def main(top_k=5):
    """Rank all CVs against every embedded job description and print the best.

    For each job, every CV gets a final score that is the mean of
    (a) the averaged skills/education embedding similarity and
    (b) the TF-IDF similarity between the job position and the CV category.
    The CVs are sorted by that score in descending order and the best
    ``top_k`` file names are printed together with their scores.

    Args:
        top_k: How many CVs to report per job. Defaults to 5, which
            reproduces the original output.

    Returns:
        None. Results are written to standard output.
    """
    # Iterate over the jobs that actually have embeddings instead of a
    # hard-coded count, which raises IndexError whenever it exceeds the
    # length of the JD_* lists.
    for i, position in enumerate(JD_position_list):
        CV_and_score = []

        # The four per-CV lists are parallel, so walk them in lockstep.
        for cv_skills, cv_edu, category, resume_path in zip(skill_emb, edu_emb, categories, all_resumes):

            # calculating scores
            skill_edu_score = get_cosine_similarity(JD_req_skills_embeddings_list[i], JD_req_edu_embeddings_list[i], cv_skills, cv_edu)
            pos_score = get_cosine_sim_for_category(position, category)
            final_score = (skill_edu_score + pos_score) / 2.0

            # skill_edu_score is a matrix, so final_score is one too; [0][0]
            # pulls out the single JD-vs-CV value.
            CV_and_score.append((final_score[0][0], os.path.basename(resume_path)))

        CV_and_score.sort(key=lambda x: x[0], reverse=True)
        best = CV_and_score[:top_k]
        top_scores = [score for score, _ in best]
        top_CVs = [filename for _, filename in best]

        print(f'\nCompany: {tempDict["company_name"][i]} ({position}): \nTop {top_k} CVs: {top_CVs} \nCorresponding Scores: {top_scores}\n')


Scores have been left as is, in the previous cell. They can of course be modified to suit your needs.

In [None]:
# Entry point: run the matcher only when this code is executed directly.
if __name__ == "__main__":
    main()

Execution will begin from the above line of code.