In [1]:
import silence_tensorflow.auto
from pymongo import MongoClient
import pandas as pd
import pickle
from umap import UMAP
from sentence_transformers import SentenceTransformer
import spacy
import re
from tqdm.notebook import tqdm
import numpy as np
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
from pyresparser import ResumeParser
from PyPDF4 import PdfFileReader
import tensorflow as tf
import skill_extraction_from_job_description_util

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()
[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ubuntu/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## STAGE - 1: Assign to each resume document the label of the job description cluster it matches

### Retrieve all the resumes from the MongoDB database

In [2]:
# Connect to the MongoDB server running on localhost.
client = MongoClient('mongodb://localhost:27017/') 

# Access the database
db = client['job-resume-db']  

# Access the collection that stores the resume documents
collection = db['resume'] 

# Query every resume. `find()` without a filter returns a cursor, not a
# DataFrame; the DataFrame is built from this cursor in the next cell.
# NOTE(review): a PyMongo cursor is consumed as it is iterated, so re-run this
# cell before re-running the DataFrame cell.
documents = collection.find()

In [3]:
resume_df = pd.DataFrame(documents)

In [4]:
resume_df

Unnamed: 0,_id,index,category,text,parsed_resume
0,649d8da410170921a743bf10,SD_resume_5.pdf,software developer,rsum david baumgold david baumgold fullstack w...,"{'name': 'David Baumgold', 'email': 'david@dav..."
1,649d8da610170921a743bf11,SD_resume_8.pdf,software developer,rakesh neela resume r k e h n e e l h sanfanci...,"{'name': 'STATE UNIVERSITY', 'email': 'rakeshn..."
2,649d8da810170921a743bf12,SD_resume_2.pdf,software developer,cobaohieuresume co bao hieu hochiminh city vn ...,"{'name': 'Bao Hieu', 'email': 'cobaohieu@gmail..."
3,649d8da910170921a743bf13,SD_resume_6.pdf,software developer,andrew dillon resume andrew dillon 402 6317966...,{'name': 'cid:57)ORK E(cid:58)PER(cid:882)ENCE...
4,649d8daa10170921a743bf14,SD_resume_4.pdf,software developer,joel verhagen seattle washington joelverhagen ...,"{'name': 'Joel Verhagen', 'email': 'joel.verha..."
5,649d8dab10170921a743bf15,SD_resume_9.pdf,software developer,shubham singh junior software developers shubh...,"{'name': 'S SHUBHAM', 'email': 'shubh2014shiv@..."
6,649d8dad10170921a743bf16,SD_resume_1.pdf,software developer,resume ayush gupta ayushg3112 919013363330 ski...,"{'name': 'Ayush Gupta', 'email': 'AyushG3112@g..."
7,649d8dae10170921a743bf17,SD_resume_3.pdf,software developer,resume ayush gupta ayushg3112 919013363330 ski...,"{'name': 'Ayush Gupta', 'email': 'AyushG3112@g..."
8,649d8db010170921a743bf18,SD_resume_7.pdf,software developer,dipta das software engineer dipta670 1 2547179...,"{'name': 'DIPTA DAS', 'email': 'dipta670@gmail..."
9,649d8db110170921a743bf19,PM_resume_1.pdf,product manager,kiran kumar parasa 1234 apple street pune maha...,"{'name': 'Kiran Kumar', 'email': 'YourName@gma..."


### Encoding the resumes with a pre-trained Sentence-BERT transformer

In [5]:
# Get the parsed resume documents
resume_text = resume_df['text'] 

In [6]:
# Load the pre-trained Sentence-BERT model (all-mpnet-base-v2).
all_mpnet_base_v2_sentence_transformer_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
# Encode every resume text in one call; the result is a 2-D array with one row
# per resume (its shape is printed in the next cell).
sentence_transformers_resume_embeddings = all_mpnet_base_v2_sentence_transformer_model.encode(resume_text)

In [7]:
print("Shape of Sentence BERT embeddings for Resume: {}".format(sentence_transformers_resume_embeddings.shape))

Shape of Sentence BERT embeddings for Resume: (21, 768)


### Performing UMAP Dimensionality Reduction

Here, the resume embeddings are reduced with the same fitted UMAP object that was used earlier to reduce the dimensionality of the job description embeddings, so that resumes and job descriptions live in the same 2-D space.

In [8]:
class DimensionalityReduction:
    """UMAP-based reducer that remembers the most recently fitted model.

    The attribute names (`n_components`, `random_state`, `umap_model`) are part
    of the contract: other cells read `.umap_model` from a saved instance.
    """

    def __init__(self, n_components=2, random_state=42):
        # No fitted model exists until `apply_umap` has been called.
        self.umap_model = None
        self.random_state = random_state
        self.n_components = n_components

    def apply_umap(self, document_embeddings):
        """Fit a fresh UMAP model on `document_embeddings` and return the projection.

        The fitted model is stored on `self.umap_model` so that it can later be
        reused through its `transform` method.
        """
        print("Performing the reduction.")
        reducer = UMAP(random_state=self.random_state, n_components=self.n_components)
        reduced_embeddings = reducer.fit_transform(document_embeddings)
        # Publish the model only after fitting has succeeded.
        self.umap_model = reducer
        return reduced_embeddings

Retrieving the Sentence-BERT model information that was saved in Stage 1 of the job description document clustering

In [9]:
## Get the dimensionality reduction object used in job description clustering
file_name = '/home/ubuntu/Thesis/Experiments/STAGE I - Document Clustering/resources/models/sentence_bert_model/sentence_bert_HDBSCAN_model_information.pkl'

# Load the model data from the pickle file.
# SECURITY: `pickle.load` can execute arbitrary code; only load files you trust.
# NOTE(review): presumably the pickle contains a `DimensionalityReduction`
# instance, in which case that class must already be defined in this kernel
# (see the class cell above) for unpickling to succeed -- confirm.
with open(file_name, 'rb') as f:
    saved_sentence_bert_model_information_from_job_description = pickle.load(f)

In [10]:
# Checking what information is saved 
saved_sentence_bert_model_information_from_job_description.keys()

dict_keys(['full_size_embedding', 'dim_reduction_class', 'two_d_embeddings', 'final_hdbscan_model'])

In [11]:
# Reuse the saved dimensionality reduction object from job description STAGE 1.
# `transform` (not `fit_transform`) is called on its stored UMAP model, so the
# resume embeddings are projected with the already-fitted model rather than
# fitting a new one.
resume_embeddings_2d = saved_sentence_bert_model_information_from_job_description['dim_reduction_class'].umap_model.transform(sentence_transformers_resume_embeddings)

### Using the trained K-Means Clustering Model to predict the cluster labels of resume documents

In [12]:
# Path of the pickled clustering model that is used below via `.predict`.
trained_kmeans_model_path = '/home/ubuntu/Thesis/Experiments/STAGE I - Document Clustering/resources/models/final model/final_trained_k-means_model.pkl'
# Load the model data from the pickle file.
# SECURITY: `pickle.load` can execute arbitrary code; only load files you trust.
with open(trained_kmeans_model_path, 'rb') as f:
    trained_kmeans_model = pickle.load(f)

In [13]:
predicted_resume_clusters = trained_kmeans_model.predict(resume_embeddings_2d)

In [14]:
predicted_resume_clusters

array([0, 0, 0, 0, 3, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1],
      dtype=int32)

In [15]:
# assign the cluster labels to resume
resume_df['predicted_cluster_label'] = predicted_resume_clusters

In [16]:
resume_df

Unnamed: 0,_id,index,category,text,parsed_resume,predicted_cluster_label
0,649d8da410170921a743bf10,SD_resume_5.pdf,software developer,rsum david baumgold david baumgold fullstack w...,"{'name': 'David Baumgold', 'email': 'david@dav...",0
1,649d8da610170921a743bf11,SD_resume_8.pdf,software developer,rakesh neela resume r k e h n e e l h sanfanci...,"{'name': 'STATE UNIVERSITY', 'email': 'rakeshn...",0
2,649d8da810170921a743bf12,SD_resume_2.pdf,software developer,cobaohieuresume co bao hieu hochiminh city vn ...,"{'name': 'Bao Hieu', 'email': 'cobaohieu@gmail...",0
3,649d8da910170921a743bf13,SD_resume_6.pdf,software developer,andrew dillon resume andrew dillon 402 6317966...,{'name': 'cid:57)ORK E(cid:58)PER(cid:882)ENCE...,0
4,649d8daa10170921a743bf14,SD_resume_4.pdf,software developer,joel verhagen seattle washington joelverhagen ...,"{'name': 'Joel Verhagen', 'email': 'joel.verha...",3
5,649d8dab10170921a743bf15,SD_resume_9.pdf,software developer,shubham singh junior software developers shubh...,"{'name': 'S SHUBHAM', 'email': 'shubh2014shiv@...",0
6,649d8dad10170921a743bf16,SD_resume_1.pdf,software developer,resume ayush gupta ayushg3112 919013363330 ski...,"{'name': 'Ayush Gupta', 'email': 'AyushG3112@g...",0
7,649d8dae10170921a743bf17,SD_resume_3.pdf,software developer,resume ayush gupta ayushg3112 919013363330 ski...,"{'name': 'Ayush Gupta', 'email': 'AyushG3112@g...",0
8,649d8db010170921a743bf18,SD_resume_7.pdf,software developer,dipta das software engineer dipta670 1 2547179...,"{'name': 'DIPTA DAS', 'email': 'dipta670@gmail...",0
9,649d8db110170921a743bf19,PM_resume_1.pdf,product manager,kiran kumar parasa 1234 apple street pune maha...,"{'name': 'Kiran Kumar', 'email': 'YourName@gma...",2


### Function to assign the cluster label to a specific resume using Sentence-BERT and the trained K-Means clustering model

In [17]:
def assign_cluster_label_to_resume(resume_id,
                                   k_means_model,
                                   sentence_bert_model,
                                   dimensionality_reduction_model):
    """Predict the job-description cluster that a stored resume belongs to.

    Args:
        resume_id: Value of the `index` field of the resume document in the
            `resume` collection (e.g. a file name such as 'ML_resume_3.pdf').
        k_means_model: Fitted clustering model exposing `predict`.
        sentence_bert_model: Encoder exposing `encode(list_of_texts)`.
        dimensionality_reduction_model: Fitted reducer exposing `transform`.

    Returns:
        The first (and only) element of the label array returned by
        `k_means_model.predict`.

    Raises:
        ValueError: If no resume with the given `resume_id` exists.
    """
    # Using the client as a context manager closes the connection on exit;
    # previously every call left an open client behind.
    with MongoClient("mongodb://localhost:27017/") as client:  # Replace with your MongoDB connection string
        resume = client["job-resume-db"]["resume"].find_one({"index": resume_id})

    # `find_one` returns None for an unknown ID; fail with a clear message
    # instead of an opaque "'NoneType' object is not subscriptable".
    if resume is None:
        raise ValueError('No resume found with ID "{}"'.format(resume_id))

    # Encode the single resume as a one-row batch.
    resume_embeddings = sentence_bert_model.encode([resume['text']])

    # Project the embedding with the already-fitted reducer.
    resume_embeddings_2d = dimensionality_reduction_model.transform(resume_embeddings)

    assigned_resume_cluster_label = k_means_model.predict(resume_embeddings_2d)

    return assigned_resume_cluster_label[0]
    

In [18]:
# Assigning cluster label to resume ID: 'ML_resume_3.pdf'
# The encoder, the fitted UMAP model and the clustering model passed here are
# the objects loaded in the cells above.
# NOTE(review): the resume ID is written out twice in this cell and is defined
# again as RESUME_ID in a later cell; keep the three in sync.
resume_cluster_label = assign_cluster_label_to_resume(resume_id = 'ML_resume_3.pdf',
                                                        k_means_model = trained_kmeans_model,
                                                        sentence_bert_model = all_mpnet_base_v2_sentence_transformer_model,
                                                        dimensionality_reduction_model = saved_sentence_bert_model_information_from_job_description['dim_reduction_class'].umap_model)

print('Job Descriptions Cluster Label assigned to Resume ID "{}" is {}'.format('ML_resume_3.pdf',resume_cluster_label))

Job Descriptions Cluster Label assigned to Resume ID "ML_resume_3.pdf" is 0


## STAGE - 2: Obtain the Matching Score based on Overall Similarity between Given Resume and Job Description in a cluster

In [19]:
# Fixing one resume that has rich set of information
RESUME_ID = 'ML_resume_3.pdf'

In [20]:
# Function for retrieving the job descriptions from MongoDB
def retrieve_job_descriptions_with_predicted_cluster_labels(cluster_label):
    """Return the job descriptions whose `predicted_cluster` equals `cluster_label`.

    Reads every document of the
    `job-descriptions_with_predicted_cluster_labels` collection, converts the
    `_id` column to strings and filters on `predicted_cluster`.

    Args:
        cluster_label: Cluster label to select.

    Returns:
        pandas.DataFrame: An independent copy of the matching rows. When the
        collection is empty, an empty frame with the columns `_id`,
        `description` and `predicted_cluster` is returned.
    """
    # The context manager closes the connection; the cursor is materialised
    # inside the block because it needs the open client.
    with MongoClient('mongodb://localhost:27017/') as client:
        collection = client['job-resume-db']['job-descriptions_with_predicted_cluster_labels']
        documents = list(collection.find())

    # An empty collection would otherwise produce a column-less frame and a
    # KeyError on '_id' below.
    if not documents:
        return pd.DataFrame(columns=['_id', 'description', 'predicted_cluster'])

    job_descriptions_df = pd.DataFrame(documents)
    job_descriptions_df['_id'] = job_descriptions_df['_id'].apply(str)

    in_cluster = job_descriptions_df['predicted_cluster'] == cluster_label
    # `.copy()` so callers can add or modify columns without triggering
    # pandas' SettingWithCopyWarning on a filtered slice.
    return job_descriptions_df[in_cluster].copy()


# Function to retrieve the resume based on a specific resume ID
def retrieve_specific_resume(resume_id):
    """Fetch one resume document by its `index` field.

    Args:
        resume_id: Value of the `index` field to look up in the `resume`
            collection of the `job-resume-db` database.

    Returns:
        dict | None: The matching document, or None when no document has the
        given `resume_id`.
    """
    # The context manager closes the connection; previously each call left an
    # open client behind.
    with MongoClient("mongodb://localhost:27017/") as client:  # Replace with your MongoDB connection string
        # `find_one` already returns None when nothing matches.
        return client["job-resume-db"]["resume"].find_one({"index": resume_id})



# Function to calculate the similarity between resume and job description
def cosine_similarity_between_vectors(a, b):
    """Cosine similarity between two 1-D vectors.

    Args:
        a: First vector (array-like).
        b: Second vector (array-like) of the same length.

    Returns:
        float: dot(a, b) / (||a|| * ||b||), or 0.0 when either vector has zero
        norm (the ratio is undefined there and would otherwise be NaN).
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

### Preprocess and Cleaning the resume for performing the document similarity with Job Description

In [21]:
# Load Spacy model
nlp = spacy.load('en_core_web_trf')

In [22]:
# Open the text file containing the noisy words and read the lines into a list
with open('/home/ubuntu/Thesis/Experiments/STAGE II - Document Similarity/noisy_words.txt', 'r') as f:
    words = f.readlines()

# Strip surrounding whitespace (including the trailing newline) from each line.
# NOTE: `noisy_words` is a module-level name that `preprocess_text` reads at
# call time, and it is re-bound to a larger list in a later cell
# (`noisy_words = load_noisy_words(filepaths)`), so the filter applied by
# `preprocess_text` depends on which of the two cells ran last.
noisy_words = [word.strip() for word in words]

In [23]:
# Function to preprocess the Resume Documents
def preprocess_text(text):
    """Normalise a document and keep only its informative noun lemmas.

    Steps: remove the word 'title' (case-insensitive), replace non-word
    characters with spaces, collapse whitespace, remove digits, then run the
    module-level spaCy pipeline `nlp` and keep a token only if all of the
    following hold:
      * its part of speech is NOUN or PROPN,
      * it is not a stop word,
      * it is not tagged as a GPE entity,
      * it is longer than one character, or is 'c' or 'r',
      * it is not a two-letter token made of one repeated letter,
      * its lowercased lemma is not in the module-level `noisy_words`.

    Args:
        text: Raw document text.

    Returns:
        str: Lowercased lemmas of the kept tokens, joined by single spaces.

    Note:
        `noisy_words` is read from module scope when the function is called.
    """
    # A set gives constant-time membership tests; the per-token lookup was
    # previously a linear scan over the whole word list.
    words_to_remove = set(noisy_words)
    kept_pos = ('NOUN', 'PROPN')

    # Remove 'title'
    text = re.sub(r'\btitle\b', '', text, flags=re.I)
    # Remove special characters
    text = re.sub(r'\W', ' ', text)
    # Substitute multiple spaces with single space (the former `flags=re.I`
    # had no effect on a pattern without letters)
    text = re.sub(r'\s+', ' ', text)
    # Remove digits
    text = re.sub(r'\d+', '', text)
    # Apply Spacy model
    doc = nlp(text)

    return " ".join([token.lemma_.lower() for token in doc if token.pos_ in kept_pos
                     and not token.is_stop
                     and token.ent_type_ != "GPE"  # not a location
                     and (len(token.text) > 1 or token.text.lower() == 'c' or token.text.lower() == 'r')  # no single letters, except 'c' and 'r'
                     and not (len(token.text) == 2 and token.text[0] == token.text[1])  # no two-letter tokens with a repeated letter
                     and token.lemma_.lower() not in words_to_remove])  # lemma is not a noisy word


In [24]:
# Preprocess all descriptions
#preprocessed_resume = [preprocess_text(resume) for resume in tqdm(resume_text, desc="Preprocessing Resume for document similarity: ")]

In [25]:
# Functions for cleaning the resume by removing the noisy words if exists for document similarity
def load_noisy_words(filepaths):
    """Read several word-list files and concatenate their lines.

    Args:
        filepaths: Iterable of paths to text files with one entry per line.

    Returns:
        list[str]: Every line of every file, in file order, with line endings
        removed. Blank lines are kept as empty strings and duplicates are not
        removed.
    """
    collected = []
    for path in filepaths:
        with open(path, 'r') as handle:
            collected += handle.read().splitlines()
    return collected

def clean_text(texts, noisy_words):
    """Remove every whole-word occurrence of each noisy word from each text.

    Args:
        texts: Iterable of strings to clean.
        noisy_words: Iterable of words or phrases to delete. Entries are
            treated as literal text (not as regular expressions); surrounding
            whitespace is ignored and blank entries are skipped.

    Returns:
        list[str]: One cleaned string per input text, with runs of spaces
        collapsed to a single space and leading/trailing whitespace stripped.
    """
    # Compile each pattern once instead of once per text. `re.escape` keeps
    # entries such as 'c++' or 'a.b' from being interpreted as regex syntax.
    # Lookarounds replace `\b` so that entries which start or end with a
    # non-word character still match as whole words; for ordinary words the
    # two forms are equivalent.
    patterns = [
        re.compile(r'(?<!\w)' + re.escape(word.strip()) + r'(?!\w)')
        for word in noisy_words
        if word.strip()
    ]

    cleaned_texts = []
    for text in texts:
        cleaned_text = text
        # Words are removed one after another, in the order given.
        for pattern in patterns:
            cleaned_text = pattern.sub('', cleaned_text)
        # Remove extra whitespaces
        cleaned_text = re.sub(' +', ' ', cleaned_text)
        cleaned_texts.append(cleaned_text.strip())
    return cleaned_texts

In [26]:
# Load noisy words from text files.
# NOTE: this re-binds the module-level `noisy_words` (previously the stripped
# lines of noisy_words.txt only) to the concatenation of all four files below.
# `preprocess_text` reads that name at call time, so calls made after this
# cell filter against the larger list.
filepaths = ['/home/ubuntu/Thesis/Experiments/STAGE II - Document Similarity/bottom_100_unigram_words.txt',
              '/home/ubuntu/Thesis/Experiments/STAGE II - Document Similarity/bottom_100_bigram_words.txt',
                '/home/ubuntu/Thesis/Experiments/STAGE II - Document Similarity/least_common.txt',
                  '/home/ubuntu/Thesis/Experiments/STAGE II - Document Similarity/noisy_words.txt']
noisy_words = load_noisy_words(filepaths)

# Clean the preprocessed resumes (disabled: `preprocessed_resume` is only
# produced by the commented-out batch preprocessing cell above)
#cleaned_resumes = clean_text(preprocessed_resume, noisy_words)

In [27]:
# Use the same sentence transformer information that was used to encode the job descriptions for document similarity
sentence_transformer_from_job_descriptions__doc_similarity__path = '/home/ubuntu/Thesis/Experiments/STAGE II - Document Similarity/sentence_bert_model_information_for_doc_sim.pkl'
# Load the model data from the pickle file. A later cell reads the key
# 'sentence_bert_transformer_job_descriptions_doc_sim' from the loaded object.
# SECURITY: `pickle.load` can execute arbitrary code; only load files you trust.
with open(sentence_transformer_from_job_descriptions__doc_similarity__path, 'rb') as f:
    sentence_transformer_from_job_descriptions__doc_similarity = pickle.load(f)

### Function to obtain Matching Score between specific Resume and Job Description in same cluster

In [37]:
resume_text = retrieve_specific_resume(RESUME_ID)['text']
resume_text

'abdallah bashir 447493734669  london uk education saarland university saarbrucken de master science computer science university khartoum khartoum sd bachelor science electrical electronics engineering minor software engineering experience machine learning engineer aug 2022 goldman sachs london uk research implement novel machine learning deep learning algorithm model specialize nlp modeling utilizing llm text classification named entity recognition question answering semantic parsing uncertainty modeling lowresource domain deploy machine learning model production adhering strict constraint around model size training runtime latency design maintain deployment pipeline including continuous integration delivery version control monitoring balance model accuracy efficiency business operational requirement possess technical expertise nlp modeling project management skill deep understanding business operational constraint applied scientist intern apr 2022 jul 2022 amazon seattle usa using tr

In [40]:
preprocessed_resume = preprocess_text(resume_text)

In [44]:
cleaned_resume = clean_text([preprocessed_resume], noisy_words)

In [46]:
cleaned_resume[0]

'abdallah bashir london uk education saarland university de master science computer science university khartoum khartoum sd bachelor science electronic engineering minor software engineering machine learning engineer aug goldman sachs london uk research machine learning learning algorithm model specialize nlp modeling llm text classification entity recognition question answering semantic parsing uncertainty modeling lowresource domain deploy machine learning model production constraint model size training runtime latency design deployment pipeline integration delivery version control monitoring balance model accuracy efficiency business expertise nlp modeling project management business constraint applied scientist intern apr jul amazon transformer model endtoend framework topic alexa traffic achievement framework robustness detection model alexa refining quality labeling data research intern jan mar allen institute artificial intelligence conduct research natural language proceeding a

In [49]:
def obtain_matching_score_based_on_document_similarity(resume_id,
                                                        assigned_resume_cluster_label,
                                                        sentence_bert_model,
                                                        noisy_words):
    """Score one resume against every job description of a cluster.

    The resume and each job description go through the same
    `preprocess_text` -> `clean_text` -> `sentence_bert_model.encode`
    pipeline; the score is the cosine similarity of the two embeddings.

    Args:
        resume_id: `index` value of the resume to fetch.
        assigned_resume_cluster_label: Cluster whose job descriptions are scored.
        sentence_bert_model: Encoder exposing `encode`.
        noisy_words: Words passed to `clean_text` for removal.

    Returns:
        dict: Job description ID -> similarity score, ordered from highest to
        lowest score. Empty when the cluster contains no job descriptions.

    Raises:
        ValueError: If no resume with the given `resume_id` exists.
    """
    # Retrieve the resume using the resume id; fail clearly when it is missing
    # instead of with "'NoneType' object is not subscriptable".
    resume = retrieve_specific_resume(resume_id)
    if resume is None:
        raise ValueError('No resume found with ID "{}"'.format(resume_id))

    # Preprocess, clean and embed the resume once, outside the loop.
    preprocessed_resume = preprocess_text(resume['text'])
    cleaned_resume = clean_text([preprocessed_resume], noisy_words)[0]
    resume_embeddings = sentence_bert_model.encode(cleaned_resume)

    # Retrieve all job descriptions belonging to the cluster assigned to the resume
    job_descriptions = retrieve_job_descriptions_with_predicted_cluster_labels(cluster_label = assigned_resume_cluster_label)

    matching_scores_based_on_document_similarity = {}
    # `job_description_id` rather than `id`, which would shadow the builtin.
    for job_description_id, job_description in tqdm(zip(job_descriptions['_id'], job_descriptions['description']), total=len(job_descriptions), desc="Calculating Matching Score between a Given Resume and All job Descriptions in a Cluster: "):
        # Same preprocessing and cleaning as applied to the resume
        pre_processed_description = preprocess_text(job_description)
        cleaned_description = clean_text([pre_processed_description], noisy_words)

        # A one-element list is encoded here, so flatten the one-row result
        # into a vector before comparing.
        job_description_embeddings = np.ravel(sentence_bert_model.encode(cleaned_description))

        matching_scores_based_on_document_similarity[job_description_id] = \
            cosine_similarity_between_vectors(resume_embeddings, job_description_embeddings)

    # Dicts preserve insertion order, so this yields a score-descending mapping.
    return dict(sorted(matching_scores_based_on_document_similarity.items(),
                       key=lambda item: item[1],
                       reverse=True))

In [50]:
matching_scores_based_on_document_similarity = \
    obtain_matching_score_based_on_document_similarity(resume_id = RESUME_ID,
                                                        assigned_resume_cluster_label = resume_cluster_label,
                                                        sentence_bert_model = sentence_transformer_from_job_descriptions__doc_similarity['sentence_bert_transformer_job_descriptions_doc_sim'],
                                                        noisy_words = noisy_words)

Calculating Matching Score between a Given Resume and All job Descriptions in a Cluster:   0%|          | 0/93…

In [51]:
matching_scores_based_on_document_similarity

{'649cd599eed36cf2eea931d0': 0.8455585,
 '649cd599eed36cf2eea9323b': 0.8117122,
 '649cd599eed36cf2eea931e3': 0.8101116,
 '649cd599eed36cf2eea931ee': 0.7979156,
 '649cd599eed36cf2eea93201': 0.79223037,
 '649cd599eed36cf2eea931c3': 0.7915017,
 '649cd599eed36cf2eea9321d': 0.7873909,
 '649cd599eed36cf2eea931b9': 0.7708134,
 '649cd599eed36cf2eea93205': 0.76861435,
 '649cd599eed36cf2eea931c9': 0.76466155,
 '649cd599eed36cf2eea931ce': 0.76216847,
 '649cd599eed36cf2eea931bf': 0.7613545,
 '649cd599eed36cf2eea93229': 0.7548926,
 '649cd599eed36cf2eea931df': 0.7539602,
 '649cd599eed36cf2eea931ed': 0.75144166,
 '649cd599eed36cf2eea931c6': 0.7512038,
 '649cd599eed36cf2eea9320f': 0.74924433,
 '649cd599eed36cf2eea931b7': 0.74920785,
 '649cd599eed36cf2eea93238': 0.74920785,
 '649cd599eed36cf2eea93209': 0.74913865,
 '649cd599eed36cf2eea931dd': 0.74830854,
 '649cd599eed36cf2eea931d5': 0.74495304,
 '649cd599eed36cf2eea931b2': 0.7442357,
 '649cd599eed36cf2eea9323a': 0.74357253,
 '649cd599eed36cf2eea93220':

## STAGE - 3: Obtain the Matching Score based on Skills extracted from Given Resume and Job Description in a cluster

### Load trained LSTM model for skill-set extraction

In [52]:
# Location of the saved Keras model (HDF5 file) used for skill extraction.
trained_lstm_model_path = '/home/ubuntu/Thesis/Experiments/STAGE III - Skill Extraction/trained_lstm_model_for_extracting_skills.hdf5'

# Load the model. `extract_skills_from_job_description` reads this
# module-level name when it is called.
lstm_model_for_skill_extraction = tf.keras.models.load_model(trained_lstm_model_path)

### Load the saved tokenizer for skill extraction from job descriptions

In [53]:
# Use the same tokenizer that was used to tokenize the job descriptions
job_description_tokenizer_path = '/home/ubuntu/Thesis/Experiments/STAGE III - Skill Extraction/tokenizer_saved_information.pkl'
# Load the saved tokenizer information from the pickle file; the next cell
# reads its 'tokenizer' and 'max_length' entries.
# SECURITY: `pickle.load` can execute arbitrary code; only load files you trust.
with open(job_description_tokenizer_path, 'rb') as f:
    job_description_tokenizer_information = pickle.load(f)

In [54]:
job_description_tokenizer = job_description_tokenizer_information['tokenizer']
max_length = job_description_tokenizer_information['max_length']

### Function to extract skills from Job description from trained LSTM model 

In [55]:
def extract_skills_from_job_description(job_description):
    """Clean a job description and run the skill-extraction model on it.

    Thin wrapper around `skill_extraction_from_job_description_util`: the text
    is first passed through the module's `clean_text` and the result is handed
    to `extract_skills_using_lstm` together with the module-level
    `lstm_model_for_skill_extraction`, `job_description_tokenizer` and
    `max_length`.

    Args:
        job_description: Raw job description text.

    Returns:
        Whatever `extract_skills_using_lstm` returns for the cleaned text.
    """
    util = skill_extraction_from_job_description_util
    # NOTE(review): 'noisy_words.txt' is a relative path, so it is presumably
    # resolved against the notebook's working directory -- confirm it is the
    # intended word list.
    cleaned_description = util.clean_text(job_description, noisy_words_path='noisy_words.txt')
    return util.extract_skills_using_lstm(
        trained_lstm_model=lstm_model_for_skill_extraction,
        raw_job_description=cleaned_description,
        tokenizer=job_description_tokenizer,
        max_length=max_length,
    )

### Obtain Matching Scores based on the Skills extracted from both the Resume and the Job Descriptions, using the BERT Embedding of each skill

In [56]:
# Function to get the BERT Embedding of a given skill
def embed_text(text, bert_model, bert_tokenizer, max_length=512):
    """Return a mean-pooled embedding of `text`.

    Args:
        text: Text to embed (here: a single skill).
        bert_model: Model called as `bert_model(input_tensor)`; element 0 of
            its output is used as the per-token hidden states.
        bert_tokenizer: Tokenizer exposing
            `encode(text, add_special_tokens=..., truncation=..., max_length=...)`.
        max_length: Maximum number of token IDs, special tokens included.
            Defaults to 512, the limit that was previously hard-coded.

    Returns:
        numpy.ndarray: Mean of the hidden states over the token dimension,
        with singleton dimensions squeezed out.
    """
    # Let the tokenizer truncate. Slicing the ID list after the special tokens
    # were added (the previous `input_ids[:512]`) cut off the closing special
    # token of inputs longer than the limit.
    input_ids = bert_tokenizer.encode(
        text,
        add_special_tokens=True,
        truncation=True,
        max_length=max_length,
    )
    input_tensor = torch.tensor([input_ids])  # batch containing one sequence

    # Inference only: no gradients needed.
    with torch.no_grad():
        model_output = bert_model(input_tensor)

    # Element 0 of the model output holds the hidden state of every token.
    token_embeddings = model_output[0]
    sentence_embedding = torch.mean(token_embeddings, dim=1).squeeze()  # Mean pooling
    return sentence_embedding.numpy()

In [57]:
def obtain_matching_score_based_on_skills(resume_id, assigned_resume_cluster_label, bert_model, bert_tokenizer):
    """Score one resume against every job description of a cluster by skills.

    Each resume skill (from the stored `parsed_resume['skills']`) and each
    skill extracted from a job description is embedded with `embed_text`; the
    score of a job description is the mean of the pairwise cosine similarities
    between the two sets of skill embeddings.

    Args:
        resume_id: `index` value of the resume to fetch.
        assigned_resume_cluster_label: Cluster whose job descriptions are scored.
        bert_model: Model forwarded to `embed_text`.
        bert_tokenizer: Tokenizer forwarded to `embed_text`.

    Returns:
        dict: Job description ID (str) -> score, ordered from highest to lowest
        score. A job description with no extracted skills scores 0.0; if the
        resume has no skills, every job description scores 0.0.

    Raises:
        ValueError: If no resume with the given `resume_id` exists.
    """
    # Retrieve all the job descriptions belonging to the given cluster
    job_descriptions = retrieve_job_descriptions_with_predicted_cluster_labels(cluster_label=assigned_resume_cluster_label)

    # Stringify the IDs without writing back into the returned frame (the
    # former column assignment on a filtered frame raised SettingWithCopyWarning).
    job_description_ids = [str(job_description_id) for job_description_id in job_descriptions['_id']]

    resume = retrieve_specific_resume(resume_id)
    if resume is None:
        raise ValueError('No resume found with ID "{}"'.format(resume_id))

    # Skills previously extracted from the resume and stored with it; tolerate
    # a missing or empty `skills` entry.
    parsed_skills = (resume.get('parsed_resume') or {}).get('skills') or []
    resume_skills = [skill.lower() for skill in parsed_skills]

    # Without resume skills there is nothing to compare; `cosine_similarity`
    # would reject the resulting empty 1-D array.
    if not resume_skills:
        return {job_description_id: 0.0 for job_description_id in job_description_ids}

    # Get the BERT embedding vector of each skill in the resume
    resume_skills_bert_embeddings = np.array([embed_text(resume_skill, bert_model, bert_tokenizer) for resume_skill in resume_skills])

    matching_score_based_on_skills = {}
    for job_description_id, job_description in tqdm(zip(job_description_ids, job_descriptions['description']), total=len(job_descriptions), desc="Calculating Matching Score between SKILLS in Given Resume and SKILLS in All job Descriptions in a Cluster"):
        # Extract the skills in a job description using the trained LSTM model
        job_description_skills = extract_skills_from_job_description(job_description)

        # Get the BERT embedding vector of each skill in the job description
        job_description_skill_embeddings = [embed_text(jd_skill, bert_model, bert_tokenizer) for jd_skill in job_description_skills]

        # No extracted skills: score 0.0 instead of crashing on an empty array.
        if not job_description_skill_embeddings:
            matching_score_based_on_skills[job_description_id] = 0.0
            continue

        # Pairwise cosine similarity: rows are resume skills, columns are
        # job description skills.
        cosine_similarity_matrix = cosine_similarity(resume_skills_bert_embeddings, np.array(job_description_skill_embeddings))

        # Overall score = average of all pairwise similarities
        matching_score_based_on_skills[job_description_id] = np.mean(cosine_similarity_matrix)

    # Dicts preserve insertion order, so this yields a score-descending mapping.
    return dict(sorted(matching_score_based_on_skills.items(),
                       key=lambda item: item[1],
                       reverse=True))

In [58]:
# Load the pre-trained tokenizer and model ('bert-base-uncased') that are used
# to embed the individual skills.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Score the fixed resume (RESUME_ID) against the job descriptions of the
# cluster that was assigned to it in STAGE 1.
matching_scores_based_on_skills = \
obtain_matching_score_based_on_skills(resume_id = RESUME_ID,
                                        assigned_resume_cluster_label = resume_cluster_label,
                                        bert_model = bert_model,
                                        bert_tokenizer = bert_tokenizer)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Calculating Matching Score between SKILLS in Given Resume and SKILLS in All job Descriptions in a Cluster:   0…



In [59]:
matching_scores_based_on_skills

{'649cd599eed36cf2eea93043': 0.7035018,
 '649cd599eed36cf2eea92ee2': 0.69225806,
 '649cd599eed36cf2eea92dec': 0.6904941,
 '649cd599eed36cf2eea9307b': 0.6882743,
 '649cd598eed36cf2eea92a4e': 0.68808466,
 '649cd599eed36cf2eea92e95': 0.6874175,
 '649cd599eed36cf2eea92d71': 0.6854467,
 '649cd599eed36cf2eea931c2': 0.68356365,
 '649cd599eed36cf2eea93230': 0.68356365,
 '649cd599eed36cf2eea92e69': 0.6823074,
 '649cd598eed36cf2eea92947': 0.6812032,
 '649cd599eed36cf2eea92c85': 0.6811963,
 '649cd599eed36cf2eea92faf': 0.6806816,
 '649cd599eed36cf2eea93034': 0.68011653,
 '649cd599eed36cf2eea92cac': 0.6797886,
 '649cd599eed36cf2eea9312b': 0.6797886,
 '649cd599eed36cf2eea931dd': 0.679555,
 '649cd599eed36cf2eea92eb6': 0.6793219,
 '649cd598eed36cf2eea92a5e': 0.67926484,
 '649cd599eed36cf2eea92f83': 0.6788555,
 '649cd598eed36cf2eea92a30': 0.6786896,
 '649cd598eed36cf2eea929c7': 0.6780124,
 '649cd599eed36cf2eea92db6': 0.6767992,
 '649cd599eed36cf2eea93222': 0.6766749,
 '649cd599eed36cf2eea92e1b': 0.6766

## Final Matching Score by Combining the Matching Scores based on Resume-to-Job-Description Similarity and on Skills

### Get the matching scores from STAGE II and STAGE III as dataframe

In [60]:
matching_scores_based_on_document_similarity_df = pd.DataFrame({
    'Job Description ID': list(matching_scores_based_on_document_similarity.keys()),
    'Matching Score based on Resume to job Descriptions Similarity': list(matching_scores_based_on_document_similarity.values())
})

In [61]:
# Skills-based scores as a two-column frame.
# NOTE(review): the score column carries the same label as the one in the
# document-similarity frame even though it holds the skills-based scores. The
# two are only told apart by the '_document' / '_skills' suffixes added in the
# merge below, and the later average cell refers to those suffixed names, so
# renaming this column requires updating that cell as well.
matching_scores_based_on_skills_similarity_df = pd.DataFrame({
    'Job Description ID': list(matching_scores_based_on_skills.keys()),
    'Matching Score based on Resume to job Descriptions Similarity': list(matching_scores_based_on_skills.values())
})

### Join two Dataframes using primary key as 'Job Description ID'

In [62]:
# Join on 'Job Description ID'. No `how` is given, so pandas performs an inner
# join: IDs present in only one of the two frames are dropped. The identically
# named score columns are disambiguated with the '_document' and '_skills' suffixes.
matching_scores_df = pd.merge(matching_scores_based_on_document_similarity_df, matching_scores_based_on_skills_similarity_df, on='Job Description ID', suffixes=('_document', '_skills'))

In [63]:
matching_scores_df.head()

Unnamed: 0,Job Description ID,Matching Score based on Resume to job Descriptions Similarity_document,Matching Score based on Resume to job Descriptions Similarity_skills
0,649cd599eed36cf2eea931d0,0.845559,0.611372
1,649cd599eed36cf2eea9323b,0.811712,0.635294
2,649cd599eed36cf2eea931e3,0.810112,0.625837
3,649cd599eed36cf2eea931ee,0.797916,0.621333
4,649cd599eed36cf2eea93201,0.79223,0.623492


### Calculate average matching scores 

In [64]:
# Unweighted mean of the two scores: document similarity and skills similarity
# contribute equally to the final matching score.
matching_scores_df['average_matching_score'] = (matching_scores_df['Matching Score based on Resume to job Descriptions Similarity_document'] + 
matching_scores_df['Matching Score based on Resume to job Descriptions Similarity_skills'])/2

### Recommend the top 10 Job Description IDs based on average matching score

In [65]:
# Sort the DataFrame based on 'average_matching_score' column in descending order
sorted_matching_scores_df = matching_scores_df.sort_values(by='average_matching_score', ascending=False)

# Get the top 10 recommendations
top_10_recommendations = sorted_matching_scores_df.head(10)

In [66]:
top_10_recommendations

Unnamed: 0,Job Description ID,Matching Score based on Resume to job Descriptions Similarity_document,Matching Score based on Resume to job Descriptions Similarity_skills,average_matching_score
0,649cd599eed36cf2eea931d0,0.845559,0.611372,0.728465
1,649cd599eed36cf2eea9323b,0.811712,0.635294,0.723503
2,649cd599eed36cf2eea931e3,0.810112,0.625837,0.717974
20,649cd599eed36cf2eea931dd,0.748309,0.679555,0.713932
6,649cd599eed36cf2eea9321d,0.787391,0.638697,0.713044
3,649cd599eed36cf2eea931ee,0.797916,0.621333,0.709624
4,649cd599eed36cf2eea93201,0.79223,0.623492,0.707861
9,649cd599eed36cf2eea931c9,0.764662,0.648402,0.706532
15,649cd599eed36cf2eea931c6,0.751204,0.656969,0.704086
5,649cd599eed36cf2eea931c3,0.791502,0.609848,0.700675


In [85]:
top_10_matching_job_description_ids = list(top_10_recommendations['Job Description ID'].apply(lambda x: str(x)).values)

### Recommend top 10 Best Matching Job Descriptions for a given resume Based on Matching Score

In [88]:
import warnings
warnings.filterwarnings(action='ignore')

In [93]:
def recommend_top_10_job_description_from_database(ids:list):
    """Return the description text of each requested job description.

    Despite the name, the function returns one description per entry of
    `ids`, however many there are.

    Args:
        ids: Job description IDs. Each entry is converted with `str()` and
            compared with the string form of the documents' `_id`.

    Returns:
        list: The `description` field of the matching documents, in the same
        order as `ids` (None for a document without a `description` field).

    Raises:
        KeyError: If any requested ID is not present in the
            `job-descriptions` collection.
    """
    # The context manager closes the connection; the cursor is consumed inside
    # the block because it needs the open client.
    with MongoClient("mongodb://localhost:27017/") as client:
        collection = client["job-resume-db"]["job-descriptions"]
        # Fetch only the field that is needed (`_id` is returned by default)
        # and index the descriptions by the string form of their ID.
        description_by_id = {
            str(document['_id']): document.get('description')
            for document in collection.find({}, {'description': 1})
        }

    requested_ids = [str(job_description_id) for job_description_id in ids]

    # Report unknown IDs explicitly instead of failing with a bare IndexError.
    missing_ids = [job_description_id for job_description_id in requested_ids
                   if job_description_id not in description_by_id]
    if missing_ids:
        raise KeyError('No job description found for ID(s): {}'.format(missing_ids))

    return [description_by_id[job_description_id] for job_description_id in requested_ids]

In [94]:
recommend_top_10_job_description_from_database(top_10_matching_job_description_ids)

['NLP Data Scientist - Machine Learning/Deep Learning BRANE ENTERPRISES PRIVATE LIMITED  Hyderabad, Telangana, India On-site 1 month ago  2 applicants  About the job This job is sourced from a job board. Learn moreJob Description  As a key member of the Digital Mind module, he/she will be responsible for providing Natural Language Processing support to help improve our NLP products and create new NLP applications. NLP Engineer responsibilities include transforming natural language data into useful features using NLP techniques to feed classification algorithms. The individual is expected to possess strong statistical knowledge, good understanding of machine learning methods and text representation techniques.  Responsibilities  2+ years of relevant experience specialized in Machine learning and NLP technology. Understanding business objectives and developing models that help to achieve them, along with metrics to track their progress. Proficiency in data science and applying appropriat