<a href="https://colab.research.google.com/github/ArmandDS/jobs_recommendations/blob/master/job_analysis_content_recommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import nltk
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.width', -1)
import warnings
warnings.filterwarnings('ignore')
import os
import datetime
import pickle

# load data

In [15]:
# Root of the project checkout that contains the ``data/`` folder.
# Set the JOBS_RECOMMENDATIONS_ROOT environment variable to run the notebook
# on another machine; when it is unset the original local path is used.
root_path = os.environ.get(
    "JOBS_RECOMMENDATIONS_ROOT",
    "/Users/snelson/Dropbox/Pearl/job_matching_algorithm/jobs_recommendations-master",
)

# Applicants (looked up later by 'fullName') and the job postings to rank.
applicant_df = pd.read_csv(f'{root_path}/data/df_applicants.csv')
jobs_df = pd.read_csv(f'{root_path}/data/df_jobs.csv')

# Shared, module-level vectorizer: the recommender class fits it on the job
# texts and then reuses the fitted vocabulary to transform the applicant text.
tfidf_vectorizer = TfidfVectorizer()

#### Computing cosine similarity using TF-IDF

In [21]:
class get_recommendations():
    """Content-based job recommender for a single applicant.

    The methods are small pipeline steps meant to be called in sequence:
    filter the jobs, vectorise them with TF-IDF, look up the applicant, score
    every job against the applicant, pick the best matches and assemble them
    into a DataFrame.

    ``create_vector`` and ``compute_cosine_similarity`` both use the
    module-level ``tfidf_vectorizer``. ``create_vector`` (re)fits it, so it has
    to run before ``compute_cosine_similarity``.
    """

    def __init__(self, applicant_name, number_of_results, city_list):
        """Store the settings of one recommendation run.

        Parameters
        ----------
        applicant_name : str
            Value matched against the ``fullName`` column of the applicant data.
        number_of_results : int
            Maximum number of jobs kept by ``get_top_results``.
        city_list : list-like or None
            Cities used by ``filter_jobs_df_by_location``; only read when that
            method is called with ``filter_on=True``.
        """
        self.applicant_name = applicant_name
        self.number_of_results = number_of_results
        self.city_list = city_list

    def filter_jobs_df_by_location(self, jobs_df, filter_on: bool):
        """Optionally restrict the jobs to those located in ``self.city_list``.

        Returns ``jobs_df`` itself when ``filter_on`` is falsy. Otherwise
        returns the rows whose ``City`` is in ``self.city_list``, re-indexed
        from 0 so that row labels and row positions coincide.
        """
        if not filter_on:
            return jobs_df
        in_selected_city = jobs_df['City'].isin(self.city_list)
        return jobs_df[in_selected_city].reset_index(drop=True)

    def create_vector(self, jobs_data):
        """Fit the shared TF-IDF vectorizer on ``jobs_data['text']``.

        Returns the TF-IDF matrix with one row per job, in the row order of
        ``jobs_data``.
        """
        return tfidf_vectorizer.fit_transform(jobs_data['text'])

    def get_applicant_row(self, applicant_data):
        """Return the first row whose ``fullName`` equals ``self.applicant_name``.

        The result is a one-row DataFrame (not a Series).

        Raises
        ------
        IndexError
            If no row matches the applicant name.
        """
        is_applicant = (applicant_data['fullName'] == self.applicant_name).to_numpy()
        matches = np.flatnonzero(is_applicant)
        if matches.size == 0:
            raise IndexError(
                f"no applicant named {self.applicant_name!r} found in applicant_data['fullName']"
            )
        # Double brackets keep the result a DataFrame.
        return applicant_data.iloc[[matches[0]]]

    def compute_cosine_similarity(self, user_q, tfidf_jobid):
        """Score every job against the applicant text.

        ``user_q['description_combined_cleaned']`` is transformed with the
        already fitted shared vectorizer and compared with all rows of
        ``tfidf_jobid`` in a single ``cosine_similarity`` call.

        Returns a list with one entry per job; each entry is a 2-D array with
        one row per row of ``user_q`` and a single column, so the score of job
        ``j`` for the first applicant row is ``result[j][0][0]``.
        """
        user_tfidf = tfidf_vectorizer.transform(user_q['description_combined_cleaned'])
        # One vectorised call instead of one call per job row.
        similarity = cosine_similarity(user_tfidf, tfidf_jobid)
        # Split column-wise so callers keep receiving one small array per job.
        return [similarity[:, job:job + 1] for job in range(similarity.shape[1])]

    def get_top_results(self, cosine_simlilarity_output):
        """Return the positions of the highest-scoring jobs, best first.

        At most ``self.number_of_results`` positions are returned; jobs with
        equal scores keep their original relative order.
        """
        ranked_positions = sorted(
            range(len(cosine_simlilarity_output)),
            key=lambda position: cosine_simlilarity_output[position],
            reverse=True,
        )
        return ranked_positions[:self.number_of_results]

    def get_scores(self, cosine_simlilarity_output, top_results):
        """Return the scalar score of each position in ``top_results``, in order."""
        return [cosine_simlilarity_output[position][0][0] for position in top_results]

    def get_recommendation(self, top_results, jobs_data, scores):
        """Assemble the recommended jobs into a DataFrame.

        Parameters
        ----------
        top_results : sequence of int
            Row positions into ``jobs_data``, best match first.
        jobs_data : pandas.DataFrame
            The jobs that were vectorised; must provide the columns
            ``Job.ID``, ``Title``, ``City``, ``Empl_type`` and ``Position``.
        scores : sequence
            ``scores[k]`` is the score of ``top_results[k]``; surplus entries
            are ignored.

        Returns
        -------
        pandas.DataFrame
            One row per entry of ``top_results`` with the columns
            ``ApplicantID, JobID, title, score, City, Empl_type, Position``.

        Raises
        ------
        IndexError
            If ``scores`` has fewer entries than ``top_results``.
        """
        positions = list(top_results)
        if len(scores) < len(positions):
            raise IndexError("scores has fewer entries than top_results")

        # top_results are row positions of the TF-IDF matrix, so select rows
        # positionally; label-based lookup would pick the wrong rows whenever
        # jobs_data does not carry a default 0..n-1 index.
        picked = jobs_data.iloc[positions]

        # Build the frame in one go instead of growing it cell by cell.
        return pd.DataFrame(
            {
                'ApplicantID': [self.applicant_name] * len(positions),
                'JobID': picked['Job.ID'].to_numpy(),
                'title': picked['Title'].to_numpy(),
                'score': list(scores[:len(positions)]),
                'City': picked['City'].to_numpy(),
                'Empl_type': picked['Empl_type'].to_numpy(),
                'Position': picked['Position'].to_numpy(),
            },
            columns=['ApplicantID', 'JobID', 'title', 'score', 'City', 'Empl_type', 'Position'],
        )

In [27]:
# Parameters of this recommendation run.

# Location filter: when city_filter is True only jobs whose 'City' is in
# city_list are scored, e.g. ['Los Angeles', 'San Francisco', 'Santa Monica'].
city_filter = False
city_list = None

# Applicant to recommend for (matched against 'fullName') and the maximum
# number of ranked jobs to keep.
number_of_results = 2000
applicant_name = 'Tomoki Fukazawa'

In [28]:
# 1. Configure the recommender for the selected applicant.
recommendations_init = get_recommendations(
    applicant_name=applicant_name,
    number_of_results=number_of_results,
    city_list=city_list,
)

# 2. Optionally narrow the jobs down by city, then vectorise the job texts.
jobs_df_filtered_location = recommendations_init.filter_jobs_df_by_location(
    jobs_df=jobs_df,
    filter_on=city_filter,
)
tfidf_jobid = recommendations_init.create_vector(
    jobs_data=jobs_df_filtered_location,
)

# 3. Look up the applicant and score every job against their description.
user_q = recommendations_init.get_applicant_row(
    applicant_data=applicant_df,
)
cosine_simlilarity_output = recommendations_init.compute_cosine_similarity(
    user_q=user_q,
    tfidf_jobid=tfidf_jobid,
)

# 4. Keep the best-scoring job positions together with their scores.
top_results = recommendations_init.get_top_results(
    cosine_simlilarity_output=cosine_simlilarity_output,
)
scores = recommendations_init.get_scores(
    cosine_simlilarity_output=cosine_simlilarity_output,
    top_results=top_results,
)

# 5. Assemble the final recommendation table from the filtered jobs.
recommendation_df = recommendations_init.get_recommendation(
    top_results=top_results,
    jobs_data=jobs_df_filtered_location,
    scores=scores,
)

In [29]:
# Display the recommendation table as the cell output; rows are ordered from
# the highest to the lowest similarity score.
recommendation_df

Unnamed: 0,ApplicantID,JobID,title,score,City,Empl_type,Position
0,Tomoki Fukazawa,293694,Visual Designer @ The BOSS Group,0.247034,Alexandria,Seasonal/Temp,Visual Designer
1,Tomoki Fukazawa,293717,Web Designer/Developer @ The BOSS Group,0.233172,Baltimore,Seasonal/Temp,Web Designer/Developer
2,Tomoki Fukazawa,270306,Graphic Design/Web & Media Design Instructors ...,0.215749,Monroe,Part-Time,Graphic Design/Web & Media Design Instructors ...
3,Tomoki Fukazawa,267494,Web Developer @ ConsultNet,0.208096,Commerce,Full-Time/Part-Time,Web Developer
4,Tomoki Fukazawa,276397,Webmaster I (Contractor Temp) @ Education Mana...,0.202554,Pittsburgh,Full-Time/Part-Time,Webmaster I (Contractor Temp)
...,...,...,...,...,...,...,...
1995,Tomoki Fukazawa,262026,Retail Field Representative @ MarketSource,0.059678,Seattle,Part-Time,Retail Field Representative
1996,Tomoki Fukazawa,290434,Contemporary Designers Full Time: Bloomingdale...,0.0596745,Los Angeles,Seasonal/Temp,Contemporary Designers Full Time: Bloomingdale...
1997,Tomoki Fukazawa,262036,Retail Field Representative @ MarketSource,0.0596648,Everett,Part-Time,Retail Field Representative
1998,Tomoki Fukazawa,304440,Retail Field Representative @ MarketSource,0.059662,Bakersfield,Part-Time,Retail Field Representative


In [30]:
# Persist the recommendations next to the input data. With pandas' default
# settings the DataFrame index is written as the first, unnamed CSV column.
recommendation_df.to_csv(f'{root_path}/data/recommendation_df.csv')

In [35]:
# further development

## ways to speed this up:
# - Filter jobs_df by the job location first and then create the cosine similarity

## make it better by:
# - scoring the job title and the job description separately and then weighting the two scores