<a href="https://colab.research.google.com/github/rayen03/RecruitFlow/blob/model-improvements/Model_prototype.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
!pip install --upgrade gensim



In [17]:
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler


In [2]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
dataDir='/content/drive/MyDrive/RecruitFlow'


Mounted at /content/drive


In [18]:

cv_data = pd.read_csv('/content/drive/MyDrive/RecruitFlow/parsedCV.csv')
cv_data = cv_data.dropna()

job_data = pd.read_csv('/content/drive/MyDrive/RecruitFlow/job_descriptions.csv')
job_data = job_data.dropna()

# Split
cv_train, cv_test = train_test_split(cv_data, test_size=0.2, random_state=42)

# Preprocess the data and create TaggedDocuments for Doc2Vec
train_corpus = [TaggedDocument(words=text.split(), tags=[str(i)]) for i, text in enumerate(cv_train['Parsed_text'])]
test_corpus = [TaggedDocument(words=text.split(), tags=[str(i)]) for i, text in enumerate(cv_test['Parsed_text'])]

# Train  Doc2Vec model
doc2vec_model = Doc2Vec(vector_size=300, window=5, min_count=1, workers=4, epochs=20)
doc2vec_model.build_vocab(train_corpus)
doc2vec_model.train(train_corpus, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)

# Embed job descriptions and CVs into vectors
job_vectors = [doc2vec_model.infer_vector(text.split()) for text in job_data['job_description']]
cv_vectors = [doc2vec_model.infer_vector(text.split()) for text in cv_test['Parsed_text']]

# Calculate cosine similarity between job descriptions and CVs
similarity_matrix = cosine_similarity(job_vectors, cv_vectors)

# Normalize the similarity scores to be between 0 and 1
scaler = MinMaxScaler()
normalized_similarity = scaler.fit_transform(similarity_matrix.T).T

# Reset the index of cv_test DataFrame
cv_test.reset_index(drop=True, inplace=True)

In [5]:
#debug
import numpy as np

ranking_scores = normalized_similarity.argmax(axis=0) + 1
print("Shape of job_vectors:", np.shape(job_vectors))
print("Shape of cv_vectors:", np.shape(cv_vectors))
print("Shape of similarity_matrix:", np.shape(similarity_matrix))
print("Shape of normalized_similarity:", np.shape(normalized_similarity))
print("Length of cv_test:", len(cv_test))
print("Length of ranking_scores:", len(ranking_scores))






Shape of job_vectors: (109, 300)
Shape of cv_vectors: (497, 300)
Shape of similarity_matrix: (109, 497)
Shape of normalized_similarity: (109, 497)
Length of cv_test: 497
Length of ranking_scores: 497


In [19]:
# Add the 'Ranking_Score' column to cv_test
cv_test['Ranking_Score'] = ranking_scores[:len(cv_test)]


ranked_cvs_df = cv_test[['CV_ID', 'Parsed_text', 'Ranking_Score']].copy()

# Display the first 10 ranked CVs
top_10_cvs = ranked_cvs_df.sort_values(by='Ranking_Score', ascending=False).head(10).copy()
top_10_cvs['CV_ID'] = top_10_cvs['CV_ID'].apply(lambda x: x.replace('/content/drive/MyDrive/RecruitFlow/ResumeDataset/', ''))

top_10_cvs.reset_index(drop=True, inplace=True)

print(top_10_cvs)

                        CV_ID  \
0   CONSTRUCTION/25098739.pdf   
1   CONSTRUCTION/15721849.pdf   
2   CONSTRUCTION/30311725.pdf   
3  DIGITAL-MEDIA/14771530.pdf   
4  DIGITAL-MEDIA/10005171.pdf   
5        FINANCE/28398216.pdf   
6     HEALTHCARE/23918545.pdf   
7             HR/20417897.pdf   
8     ACCOUNTANT/78403342.pdf   
9  DIGITAL-MEDIA/31909493.pdf   

                                         Parsed_text  Ranking_Score  
0  CONSTRUCTION WORKER\nSummary\nA motivated hard...            109  
1  CONSTRUCTION PROJECT REGIONAL MANAGER\nSummary...            109  
2  SENIOR PROJECT MANAGER\nProfessional Summary\n...            109  
3  DIGITAL PRODUCER\nSummary\nPersonable Project ...            106  
4  MEDIA ACTIVITIES SPECIALIST\nSummary\n\nMulti-...            106  
5  FINANCE OFFICER\nProfessional Summary\nTo atta...            104  
6  CLAIMS SERVICE SPECIALIST\nProfessional Summar...            104  
7  EXECUTIVE ASSISTANT HR\nSummary\nSkillful and ...            104  
8  Sel

In [25]:
# Save the  model

doc2vec_model.save("/content/drive/MyDrive/RecruitFlow/modelData/doc2vec_model.model")

#save vectors
vectors = [doc2vec_model.dv[i] for i in range(len(doc2vec_model.dv))]
np.save("/content/drive/MyDrive/RecruitFlow/doc2vec_vectors.npy", vectors)
