In [None]:
import os
project_name = "reco-tut-sjr"; branch = "main"; account = "sparsh-ai"
project_path = os.path.join('/content', project_name)

if not os.path.exists(project_path):
    !cp /content/drive/MyDrive/mykeys.py /content
    import mykeys
    !rm /content/mykeys.py
    path = "/content/" + project_name; 
    !mkdir "{path}"
    %cd "{path}"
    import sys; sys.path.append(path)
    !git config --global user.email "recotut@recohut.com"
    !git config --global user.name  "reco-tut"
    !git init
    !git remote add origin https://"{mykeys.git_token}":x-oauth-basic@github.com/"{account}"/"{project_name}".git
    !git pull origin "{branch}"
    !git checkout main
else:
    %cd "{project_path}"

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import NearestNeighbors
import spacy

In [None]:
# One-time model download, kept commented out:
# !python -m spacy download en_core_web_lg
# List the contents of the installed model package directory.
# NOTE(review): the path is tied to a Python 3.7 dist-packages layout.
!ls /usr/local/lib/python3.7/dist-packages/en_core_web_lg

In [None]:
# Load the model by package name so the cell does not depend on one specific Python
# version or model release. spacy.load raises OSError when the name cannot be
# resolved; in that case fall back to the original hard-coded install location.
try:
    nlp = spacy.load('en_core_web_lg')
except OSError:
    nlp = spacy.load('/usr/local/lib/python3.7/dist-packages/en_core_web_lg/en_core_web_lg-2.2.5')

In [None]:
# Job postings: read the gzip-compressed pickle and renumber the rows 0..n-1 so that
# row labels and row positions coincide.
# NOTE: unpickling runs code stored in the file - only load trusted artifacts.
df_jobs = (
    pd.read_pickle('./data/silver/jobs.p', compression='gzip')
      .reset_index(drop=True)
)
df_jobs.head()

In [None]:
# Applicants: read the gzip-compressed pickle and renumber the rows 0..n-1 so that
# row labels and row positions coincide.
# NOTE: unpickling runs code stored in the file - only load trusted artifacts.
df_users = (
    pd.read_pickle('./data/silver/applicants.p', compression='gzip')
      .reset_index(drop=True)
)
df_users.head()

## Selecting test user

In [None]:
def get_recommendation(top, df_all, scores, applicant_id=None):
  """Build the recommendation table for one applicant.

  Parameters
  ----------
  top : iterable
      Row labels into ``df_all``, in ranking order (best match first).
  df_all : pandas.DataFrame
      Job table; must provide the ``'Job.ID'`` and ``'Title'`` columns.
  scores : indexable
      ``scores[k]`` is the score of the k-th entry of ``top``.
  applicant_id : optional
      Id written to the ``ApplicantID`` column. When omitted, the notebook-level
      variable ``u`` is used, which keeps existing three-argument calls working.

  Returns
  -------
  pandas.DataFrame
      Columns ``ApplicantID``, ``JobID``, ``title``, ``score``; one row per entry
      of ``top``, in the same order. Empty (columns only) when ``top`` is empty.
  """
  columns = ['ApplicantID', 'JobID', 'title', 'score']
  ranked = list(top)

  # Only fall back to the global when a row will actually be written, so an empty
  # ranking never requires `u` to exist.
  if applicant_id is None and ranked:
    applicant_id = u

  job_ids = df_all['Job.ID']
  titles = df_all['Title']

  # Collect all rows first and build the frame once, instead of enlarging an empty
  # DataFrame one cell at a time.
  records = [
    (applicant_id, job_ids[i], titles[i], scores[count])
    for count, i in enumerate(ranked)
  ]
  return pd.DataFrame(records, columns=columns)

In [None]:
# Applicant whose recommendations are computed in the rest of the notebook.
u = 10001
# Row position of the first applicant record carrying that id.
index = np.where(df_users['Applicant_id'].eq(u))[0][0]
# Double brackets keep the result a one-row DataFrame rather than a Series.
user_q = df_users.iloc[[index]]
user_q

## Model 1 - TFIDF

In [None]:
# TF-IDF assigns every word a weight that reflects how important it is to a
# document relative to the whole corpus; it is a widely used technique in
# information retrieval and text mining.
tfidf_vectorizer = TfidfVectorizer()

# Fit the vectorizer on the job texts and transform them in the same step.
tfidf_jobid = tfidf_vectorizer.fit_transform(df_jobs['text'])
tfidf_jobid

Computing cosine similarity using tfidf

In [None]:
# Project the applicant's text into the TF-IDF space fitted on the job postings.
user_tfidf = tfidf_vectorizer.transform(user_q['text'])

# One vectorised call scores the applicant against every job at once, instead of
# invoking cosine_similarity separately for each row of the sparse matrix.
# user_q holds a single row, so row 0 of the result is the full score vector.
tfidf_scores = cosine_similarity(user_tfidf, tfidf_jobid)[0]

# Stable argsort on the negated scores yields descending order with ties kept in
# original row order - the same ranking a stable sorted(..., reverse=True) gives.
top = np.argsort(-tfidf_scores, kind='stable')[:10]
list_scores = tfidf_scores[top]
get_recommendation(top, df_jobs, list_scores)

## Model 2 - CountVectorizer

In [None]:
# Bag-of-words model: raw term counts per job text.
count_vectorizer = CountVectorizer()

# Fit the vectorizer on the job texts and transform them in the same step.
count_jobid = count_vectorizer.fit_transform(df_jobs['text'])
count_jobid

In [None]:
# Project the applicant's text into the term-count space fitted on the job postings.
user_count = count_vectorizer.transform(user_q['text'])

# One vectorised call scores the applicant against every job at once, instead of
# invoking cosine_similarity separately for each row of the sparse matrix.
# user_q holds a single row, so row 0 of the result is the full score vector.
count_scores = cosine_similarity(user_count, count_jobid)[0]

# Stable argsort on the negated scores yields descending order with ties kept in
# original row order - the same ranking a stable sorted(..., reverse=True) gives.
top = np.argsort(-count_scores, kind='stable')[:10]
list_scores = count_scores[top]
get_recommendation(top, df_jobs, list_scores)

## Model 3 - Spacy

Transform the corpus text into *spaCy documents*.

In [None]:
%%time
list_docs = []
for i in range(len(df_jobs)):
  doc = nlp("u'" + df_jobs['text'][i] + "'")
  list_docs.append((doc,i))
print(len(list_docs))

In [None]:
def calculateSimWithSpaCy(nlp, df, user_text, n=6, docs=None):
    """Score ``user_text`` against pre-parsed documents.

    Parameters
    ----------
    nlp : callable
        Turns a string into a document object exposing ``similarity(other)``.
    df : object with an ``index`` attribute
        Its index values are used to look up entries in ``docs``.
    user_text : str
        Text to compare against every document.
    n : int
        Accepted for backward compatibility; the function does not use it.
    docs : sequence of (document, position) pairs, optional
        Pre-parsed documents. Defaults to the notebook-level ``list_docs``.

    Returns
    -------
    list of tuple
        ``(user_doc, other_doc, position, score)`` for every index value that
        could be scored, in index order.
    """
    if docs is None:
        docs = list_docs

    # Same wrapping as used when the job documents were built, so both sides of the
    # comparison are prepared identically.
    doc1 = nlp("u'" + user_text + "'")

    list_sim = []
    for i in df.index:
        try:
            doc2 = docs[i][0]
            score = doc1.similarity(doc2)
            list_sim.append((doc1, doc2, docs[i][1], score))
        except Exception:
            # Best effort: skip entries that cannot be looked up or scored. Unlike a
            # bare `except:`, this lets KeyboardInterrupt/SystemExit propagate so a
            # long run can still be interrupted.
            continue

    return list_sim

In [None]:
# user_q holds exactly one row; take its text by position so the cell keeps working
# when a different applicant id is selected (a hard-coded row label would not).
user_q['text'].iloc[0]

In [None]:
# Score every job against the selected applicant's text. The text is taken by
# position from the one-row user_q frame rather than via a hard-coded row label,
# so changing the selected applicant does not break this cell.
df3 = calculateSimWithSpaCy(nlp, df_jobs, user_q['text'].iloc[0], n=15)

# Each result tuple is (user_doc, job_doc, job_position, score): column 3 is the
# score to sort by, column 2 the job's row position.
df_recom_spacy = (
    pd.DataFrame(df3)
      .sort_values([3], ascending=False)
      .head(10)
      .reset_index()
)

index_spacy = df_recom_spacy[2]
list_scores = df_recom_spacy[3]

Top recommendations using Spacy

In [None]:
# Render the ten best spaCy matches as a recommendation table.
get_recommendation(top=index_spacy, df_all=df_jobs, scores=list_scores)

## Model 4 - KNN

In [None]:
n_neighbors = 11
# Pass n_neighbors by keyword: recent scikit-learn releases accept the constructor
# arguments of NearestNeighbors as keywords only and reject the positional form.
# p=2 selects the Euclidean case of the default Minkowski metric.
KNN = NearestNeighbors(n_neighbors=n_neighbors, p=2)
KNN.fit(tfidf_jobid)
# kneighbors returns (distances, indices), one row per query vector.
NNs = KNN.kneighbors(user_tfidf, return_distance=True)

In [None]:
# Distances of the ten nearest jobs. The query is an applicant, who is not part of
# the fitted job matrix, so the first neighbour is a genuine job and must be kept
# (skipping it only makes sense when the query itself was in the fitted data).
NNs[0][0][:10]

The top recommendations using KNN

In [None]:
# NNs is (distances, indices). The query is an applicant, who is not part of the
# fitted job matrix, so the closest neighbour is a genuine job and is kept; take
# the ten nearest jobs, matching the top-10 lists of the other models.
top = NNs[1][0][:10]
# These are distances, not similarities: a smaller value means a closer match.
index_score = NNs[0][0][:10]

get_recommendation(top, df_jobs, index_score)