<a href="https://colab.research.google.com/github/rayen03/RecruitFlow/blob/model-improvements/Model_prototype.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler


In [2]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
dataDir='/content/drive/MyDrive/RecruitFlow'


Mounted at /content/drive


In [17]:

cv_data = pd.read_csv('/content/drive/MyDrive/RecruitFlow/test_parsedCV.csv')
cv_data = cv_data.dropna()

job_data = pd.read_csv('/content/drive/MyDrive/RecruitFlow/job_descriptions.csv')
job_data = job_data.dropna()

# Split
cv_train, cv_test = train_test_split(cv_data, test_size=0.2, random_state=42)

# Preprocess the data and create TaggedDocuments for Doc2Vec
train_corpus = [TaggedDocument(words=text.split(), tags=[str(i)]) for i, text in enumerate(cv_train['Parsed_text'])]
test_corpus = [TaggedDocument(words=text.split(), tags=[str(i)]) for i, text in enumerate(cv_test['Parsed_text'])]

# Train  Doc2Vec model
doc2vec_model = Doc2Vec(vector_size=300, window=5, min_count=1, workers=4, epochs=20)
doc2vec_model.build_vocab(train_corpus)
doc2vec_model.train(train_corpus, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)

# Embed job descriptions and CVs into vectors
job_vectors = [doc2vec_model.infer_vector(text.split()) for text in job_data['job_description']]
cv_vectors = [doc2vec_model.infer_vector(text.split()) for text in cv_test['Parsed_text']]

# Calculate cosine similarity between job descriptions and CVs
similarity_matrix = cosine_similarity(job_vectors, cv_vectors)

# Normalize the similarity scores to be between 0 and 1
scaler = MinMaxScaler()
normalized_similarity = scaler.fit_transform(similarity_matrix.T).T

# Reset the index of cv_test DataFrame
cv_test.reset_index(drop=True, inplace=True)

ranking_scores = normalized_similarity.argmax(axis=1) + 1

# Add the 'Ranking_Score' column to cv_test
cv_test['Ranking_Score'] = ranking_scores[:len(cv_test)]


ranked_cvs_df = cv_test[['CV_ID', 'Parsed_text', 'Ranking_Score']].copy()

# Display the first 10 ranked CVs
top_10_cvs = ranked_cvs_df.sort_values(by='Ranking_Score', ascending=False).head(10).copy()
top_10_cvs['CV_ID'] = top_10_cvs['CV_ID'].apply(lambda x: x.replace('/content/drive/MyDrive/RecruitFlow/ResumeDataset/', ''))

top_10_cvs.reset_index(drop=True, inplace=True)

print(top_10_cvs)



                                 CV_ID  \
0                    ARTS/28471099.pdf   
1    BUSINESS-DEVELOPMENT/22765255.pdf   
2            CONSTRUCTION/28803888.pdf   
3                    ARTS/18885767.pdf   
4  INFORMATION-TECHNOLOGY/28897981.pdf   
5                 FINANCE/18636651.pdf   
6              CONSULTANT/12374933.pdf   
7              ACCOUNTANT/28614791.pdf   
8           DIGITAL-MEDIA/20628003.pdf   
9                     BPO/31064969.pdf   

                                         Parsed_text  Ranking_Score  
0  OWNER\nExecutive Profile\n\nObjective: Driven,...             53  
1  ASSOCIATE DIRECTOR BUSINESS DEVELOPMENT\nSumma...             48  
2  SHORE SENIOR CONSTRUCTION PIPING ENGINEER\nPro...             48  
3  DIRECTOR OF THEATER\nHighlights\n\nEdline, Goo...             47  
4  INFORMATION TECHNOLOGY SPECIALIST (WEB), GS-11...             47  
5  FINANCE AND OPERATIONS MANAGER\nExecutive Prof...             47  
6  IT CONSULTANT\nProfessional Summary\nSupport