fastText
========
- Made by Facebook <https://github.com/facebookresearch/fastText>
- Treats each word as the aggregation of its subwords. 
    - Subwords are character n-grams of the word (e.g. army --> a, r, m, y, ar, rm, my, arm, rmy, army).
- Pros:
    - Much better than Word2Vec on syntactic tasks, especially with a small training corpus
    - fastText can be used to obtain vectors for out-of-vocabulary (OOV) words
- Cons
    - Slightly worse than Word2Vec on semantic tasks
    - Slower training time than Word2Vec
    - Comparison: <https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/Word2Vec_FastText_Comparison.ipynb>


<br><br>

Credit:
- https://radimrehurek.com/gensim/auto_examples/tutorials/run_fasttext.html#sphx-glr-download-auto-examples-tutorials-run-fasttext-py
- Word Mover's Distance (WMD) work:
    - Ofir Pele and Michael Werman “A linear time histogram metric for improved SIFT matching”
    - Ofir Pele and Michael Werman “Fast and robust earth mover’s distances”
    - Matt Kusner et al. “From Word Embeddings To Document Distances”.



In [1]:

import numpy as np
import pandas as pd
import pickle
from scipy import spatial

from pprint import pprint
import operator

from gensim.models.fasttext import FastText as FT_gensim
from gensim.test.utils import datapath

unable to import 'smart_open.gcs', disabling that module


In [None]:
# import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

## Load Resume Data

In [None]:
# Two hard-coded locations for the same pickle: a local checkout and an EC2 box.
local_resume_cleaned_lemmatized_tokenized_path = '/Users/richardkuzma/coding/NLP_projects/job_recommender_project/data/resumes_tokenized_lemmatized.pickle'
ec2_path = '/home/ubuntu/NLP_projects/job_recommender_project/data/resumes_tokenized_lemmatized.pickle'

# Toggle which read_pickle line is active depending on where the notebook runs.
# resumes = pd.read_pickle(local_resume_cleaned_lemmatized_tokenized_path)
resumes = pd.read_pickle(ec2_path)
# One list entry per row, taken from the 'lemmatized_resume' column.
resumes_sentences = resumes['lemmatized_resume'].tolist()
# Display the first rows as a quick sanity check of the loaded data.
resumes.head()

In [None]:
# resumes_sentences is a list where each element is a list of strings;
# show the first 15 entries of the first resume.
resumes_sentences[0][:15]

Training base FT model for resumes
--------------

In [None]:
# Configure an untrained FastText model for the resume corpus. No corpus is
# passed to the constructor (sentences=None); it is supplied to build_vocab()
# and train() below instead.
base_res_model = FT_gensim(
    sentences=None,
    sg=0, #default to CBOW. if sg=1 then skip-gram
    hs=0, #default, if hs=0 & negative =/= 0 then neg. sampling. if hs=1, hierarchical softmax
    negative=5, #5 words selected for negative sampling
    size=100, #size of vector
    alpha=0.025,
    min_count=5, # ignore words with fewer than 5 appearances
    iter=5,
    seed=42,
    cbow_mean=1, #uses mean for CBOW. If it =0 then sums CBOW (provided CBOW not SG)
    min_n=3, # min length of char n-grams
    max_n=6, # max length of char n-grams. If 0 or less than min_n, this turns into W2V
    trim_rule=None, #if you had a rule to trim down vocabulary
    workers=3 # default
)


# build the vocabulary from the tokenized resumes
base_res_model.build_vocab(sentences = resumes_sentences)

# train the model, reusing the epoch count and corpus statistics stored on the
# model object (presumably populated by the constructor and build_vocab above)
base_res_model.train(
    sentences=resumes_sentences,
    epochs=base_res_model.epochs,
    total_examples=base_res_model.corpus_count,
    total_words=base_res_model.corpus_total_words
)

# print the model's summary representation
print(base_res_model)

In [None]:
# Persist the trained resume model to the hard-coded EC2 models directory.
base_res_model.save('/home/ubuntu/NLP_projects/job_recommender_project/models/fasttext/base_res_ft')

<br><br><br><br><br><br><br><br><br><br>

<br><br>
### Need to be able to pick individual jobs to compare to all resumes

In [None]:
# load jobs into df

# Two hard-coded locations for the same pickle: a local checkout and an EC2 box.
# NOTE: this rebinds ec2_path, which the resume-loading cell also assigns.
local_jobs_cleaned_lemmatized_tokenized_path = '/Users/richardkuzma/coding/NLP_projects/job_recommender_project/data/large_files/jobs_tokenized_lemmatized.pickle'
ec2_path = '/home/ubuntu/NLP_projects/job_recommender_project/data/large_files/jobs_tokenized_lemmatized.pickle'

# Toggle which read_pickle line is active depending on where the notebook runs.
jobs = pd.read_pickle(ec2_path)
# jobs = pd.read_pickle(local_jobs_cleaned_lemmatized_tokenized_path)

# One list entry per row, taken from the 'lemmatized_combined' column.
jobs_list = jobs['lemmatized_combined'].tolist()
# Display the first rows as a quick sanity check of the loaded data.
jobs.head()

### end experiment zone

In [None]:
# Number of rows in the jobs table (author's recorded value: 13124 total jobs)
jobs.shape[0]

In [None]:
def vectorize_resumes(model=base_res_model):
    """Build one averaged vector per resume.

    For every token list in the module-level ``resumes_sentences``, the
    token vectors are looked up through ``model.wv`` and averaged along
    axis 0.

    Args:
        model: object exposing ``wv`` indexable by a list of tokens.

    Returns:
        A list with one averaged vector per resume, in corpus order.
    """
    return [np.mean(model.wv[tokens], axis=0) for tokens in resumes_sentences]

In [None]:
def pick_job(selection=-999):
    """Print one job posting and return its lemmatized tokens.

    Every field is read from the same positionally selected row of the
    module-level ``jobs`` table, so the returned tokens always belong to
    the posting that was printed, whatever the table's index looks like.

    Args:
        selection: 1-based position of the job within ``jobs``. The default
            sentinel ``-999`` means "pick a position at random".

    Returns:
        The ``lemmatized_combined`` entry of the selected row.

    Raises:
        IndexError: if ``selection`` is outside ``1..len(jobs)``.
    """
    num_jobs = jobs.shape[0]
    print("There are {} jobs".format(num_jobs))

    # Drawn unconditionally (as before) so NumPy's global random state advances
    # the same way whether or not an explicit selection is supplied.
    rand_int = np.random.randint(1, num_jobs + 1)
    if selection == -999:
        selection = rand_int

    # Reject positions that would wrap around (0 or negative) or fall off the
    # end, instead of silently reporting a different row.
    if not 1 <= selection <= num_jobs:
        raise IndexError(
            'selection must be between 1 and {}, got {}'.format(num_jobs, selection)
        )

    print ('\nselected job is ID #{}'.format(selection))

    # Fetch the row once, by position; selection is 1-based, iloc is 0-based.
    job_row = jobs.iloc[selection - 1]
    job_label = job_row['label']
    job_title = job_row['Title']
    job_company = job_row['Company']
    job_description = job_row['JobDescription']

    print('Job Posting ID is: {}'.format(job_label))
    print('Job Posting Title: {}'.format(job_title))
    print('Job Posting Company: {}'.format(job_company))
    print('Job Posting Description: {}'.format(job_description))

    # Tokens of the same row; callers turn these into a document vector.
    job_text_to_process = job_row['lemmatized_combined']

    return job_text_to_process


In [None]:
def given_job_find_similar_resumes(job_you_pick, model=None, resumes_vectors=None):
    """Rank every resume by cosine distance to one job posting.

    The job is embedded as the mean of its token vectors and compared with
    each precomputed resume vector.

    Args:
        job_you_pick: list of tokens for the chosen job.
        model: object exposing ``wv`` indexable by a list of tokens. When
            omitted, the module-level ``base_res_model`` is used.
        resumes_vectors: sequence of resume document vectors. When omitted,
            the module-level ``rv`` is used.

    Returns:
        A list of ``(distance, resume_position)`` tuples sorted by ascending
        distance; ties keep their original order.
    """
    # Resolve the defaults at call time. Binding them in the signature would
    # require ``rv`` to exist before this function is even defined, and would
    # keep pointing at stale objects after a retrain or re-vectorization.
    if model is None:
        model = base_res_model
    if resumes_vectors is None:
        resumes_vectors = rv

    # turn the chosen job into a single document vector
    job_vec = np.mean(model.wv[job_you_pick], axis=0)

    # Cosine distance is the metric in use; Word Mover's Distance
    # (model.wmdistance) was tried as an alternative and left disabled.
    distances = [
        (spatial.distance.cosine(job_vec, resume_vec), position)
        for position, resume_vec in enumerate(resumes_vectors)
    ]

    # sort list of tuples by distance only (stable sort)
    distances.sort(key=operator.itemgetter(0))
    return distances

    

In [None]:
def print_similar_resumes(list_of_similar_resumes, num_similar=10):
    """Print the closest resumes from a distance-sorted ranking.

    Args:
        list_of_similar_resumes: list of ``(distance, resume_position)``
            tuples sorted by ascending distance; positions index into the
            module-level ``resumes`` table.
        num_similar: how many entries to print (default 10). Clamped to the
            length of the ranking so short rankings do not raise IndexError.
    """
    num_similar = max(0, min(num_similar, len(list_of_similar_resumes)))
    print('\nPrinting {} MOST similar candidates...\n'.format(num_similar))
    for rank, entry in enumerate(list_of_similar_resumes[:num_similar], start=1):
        distance, position = entry[0], entry[1]
        # Look the row up once instead of once per printed field.
        resume_row = resumes.iloc[position]
        # These entries are resumes, so label them as candidates, not jobs.
        print('\n#{} most similar candidate'.format(rank))
        print('Resume ID from list: {}'.format(position))
        print('Cosine Distance: {}'.format(distance))
        print('Resume ID from df: {}'.format(resume_row['ID']))
        print('Resume text (500 chars): {}'.format(resume_row['resume'][0:500]))


In [None]:
def print_dissimilar_resumes(list_of_similar_resumes):
    """Print the farthest resumes from a distance-sorted ranking.

    Walks the ranking from its end and prints up to ten entries, skipping
    any row of the module-level ``resumes`` table whose 'resume' text is
    the literal string 'nan'.

    Args:
        list_of_similar_resumes: list of ``(distance, resume_position)``
            tuples sorted by ascending distance.
    """
    num_dissimilar = 10 #or 10, 20, 25
    print('\nPrinting {} LEAST similar candidate resumes...\n'.format(num_dissimilar))
    shown = 0
    for entry in reversed(list_of_similar_resumes):
        if shown == num_dissimilar:
            break
        position = entry[1]
        resume_row = resumes.iloc[position]
        if resume_row['resume'] == 'nan':
            # placeholder text only: does not count towards the ten printed
            continue
        print('\n#{} least similar candidate'.format(shown + 1))
        print('Candidate ID from list: {}'.format(position))
        print('Cosine Distance: {}'.format(entry[0]))
        print('Resume ID from df: {}'.format(resume_row['ID']))
        print('Resume text (500 chars): {}'.format(resume_row['resume'][:500]))
        shown += 1
        
        
        

# Given a job, find similar candidates

In [None]:
# Precompute one averaged vector per resume, using the function's default model.
rv = vectorize_resumes()

In [None]:
# No argument: pick_job falls back to a randomly chosen job.
chosen_job = pick_job()
# Author's note: 5773 data analyst (presumably a selection value worth trying)

In [None]:
# Rank all resumes against the chosen job; result is sorted by ascending distance.
ordered_resumes_list = given_job_find_similar_resumes(chosen_job)


In [None]:
# base_res_model.save('/home/ubuntu/NLP_projects/job_recommender_project/models/fasttext/base_res_ft')

In [None]:
# Show the resumes at the front of the ranking (smallest cosine distance).
print_similar_resumes(ordered_resumes_list)

In [None]:
# Show the resumes at the back of the ranking (largest cosine distance).
print_dissimilar_resumes(ordered_resumes_list)

# Build model to find jobs for a candidate

In [None]:
# Training corpus for the jobs model: one entry per row of the
# 'lemmatized_combined' column (the same column jobs_list was built from).
jobs_sentences = jobs['lemmatized_combined'].tolist()

In [None]:
# Configure an untrained FastText model for the jobs corpus. No corpus is
# passed to the constructor (sentences=None); it is supplied to build_vocab()
# and train() below instead.
base_jobs_model = FT_gensim(
    sentences=None,
    sg=0, #default to CBOW. if sg=1 then skip-gram
    hs=0, #default, if hs=0 & negative =/= 0 then neg. sampling. if hs=1, hierarchical softmax
    negative=5, #5 words selected for negative sampling
    size=100, #size of vector
    alpha=0.025,
    min_count=5, # ignore words with fewer than 5 appearances
    iter=5,
    seed=42,
    cbow_mean=1, #uses mean for CBOW. If it =0 then sums CBOW (provided CBOW not SG)
    min_n=3, # min length of char n-grams
    max_n=6, # max length of char n-grams. If 0 or less than min_n, this turns into W2V
    trim_rule=None, #if you had a rule to trim down vocabulary
    workers=3 # default
)


# build the vocabulary from the tokenized job postings
base_jobs_model.build_vocab(sentences = jobs_sentences)

# train the model, reusing the epoch count and corpus statistics stored on the
# model object (presumably populated by the constructor and build_vocab above)
base_jobs_model.train(
    sentences=jobs_sentences,
    epochs=base_jobs_model.epochs,
    total_examples=base_jobs_model.corpus_count,
    total_words=base_jobs_model.corpus_total_words
)

# print the model's summary representation
print(base_jobs_model)

In [None]:
# Persist the trained jobs model to the hard-coded EC2 models directory.
base_jobs_model.save('/home/ubuntu/NLP_projects/job_recommender_project/models/fasttext/base_jobs_ft')

In [None]:
def vectorize_jobs(model=base_jobs_model):
    """Build one averaged vector per job posting.

    Each token list in the module-level ``jobs_sentences`` is looked up
    through ``model.wv`` and the resulting token vectors are averaged
    along axis 0.

    Args:
        model: object exposing ``wv`` indexable by a list of tokens.

    Returns:
        A list with one averaged vector per job, in corpus order.
    """
    averaged = []
    for job_tokens in jobs_sentences:
        averaged.append(np.mean(model.wv[job_tokens], axis=0))
    return averaged

In [None]:
def pick_resume(selection=-999):
    """Print one resume and return its lemmatized tokens.

    Every field is read from the same positionally selected row of the
    module-level ``resumes`` table, so the returned tokens always belong to
    the resume that was printed, whatever the table's index looks like.

    Args:
        selection: 1-based position of the resume within ``resumes``. The
            default sentinel ``-999`` means "pick a position at random".

    Returns:
        The ``lemmatized_resume`` entry of the selected row.

    Raises:
        IndexError: if ``selection`` is outside ``1..len(resumes)``.
    """
    num_resumes = resumes.shape[0]
    print("There are {} resumes".format(num_resumes))

    if selection == -999:
        selection = np.random.randint(1, num_resumes + 1)

    # Reject positions that would wrap around (0 or negative) or fall off the
    # end, instead of silently reporting a different row.
    if not 1 <= selection <= num_resumes:
        raise IndexError(
            'selection must be between 1 and {}, got {}'.format(num_resumes, selection)
        )

    print ('\nselected resume is ID #{}'.format(selection))

    # Fetch the row once, by position; selection is 1-based, iloc is 0-based.
    resume_row = resumes.iloc[selection - 1]
    resume_label = resume_row['ID']
    resume_text = resume_row['resume'][:500]

    print('Resume ID is: {}'.format(resume_label))
    print('Resume text is (500 chars): {}'.format(resume_text))

    # Tokens of the same row; callers turn these into a document vector.
    resume_text_to_process = resume_row['lemmatized_resume']

    return resume_text_to_process


In [None]:
def given_resume_find_similar_jobs(resume_you_pick, model=None, jobs_vectors=None):
    """Rank every job posting by cosine distance to one resume.

    The resume is embedded as the mean of its token vectors and compared
    with each precomputed job vector.

    Args:
        resume_you_pick: list of tokens for the chosen resume.
        model: object exposing ``wv`` indexable by a list of tokens. When
            omitted, the module-level ``base_jobs_model`` is used.
        jobs_vectors: sequence of job document vectors. When omitted, the
            module-level ``jv`` is used.

    Returns:
        A list of ``(distance, job_position)`` tuples sorted by ascending
        distance; ties keep their original order.
    """
    # Resolve the defaults at call time. Binding them in the signature would
    # require ``jv`` to exist before this function is even defined, and would
    # keep pointing at stale objects after a retrain or re-vectorization.
    if model is None:
        model = base_jobs_model
    if jobs_vectors is None:
        jobs_vectors = jv

    # turn the chosen resume into a single document vector
    res_vec = np.mean(model.wv[resume_you_pick], axis=0)

    # Distance between the chosen resume and each job. Cosine distance is the
    # metric in use; model.wmdistance was tried as an alternative and left
    # disabled.
    distances = []
    for position, job_vec in enumerate(jobs_vectors):
        distances.append((spatial.distance.cosine(res_vec, job_vec), position))

    # sort list of tuples by distance only (stable sort)
    distances.sort(key=operator.itemgetter(0))
    return distances

    

In [None]:
def print_similar_jobs(list_of_similar_jobs, num_similar=10):
    """Print the closest job postings from a distance-sorted ranking.

    Args:
        list_of_similar_jobs: list of ``(distance, job_position)`` tuples
            sorted by ascending distance; positions index into the
            module-level ``jobs`` table.
        num_similar: how many entries to print (default 10). Clamped to the
            length of the ranking so short rankings do not raise IndexError.
    """
    num_similar = max(0, min(num_similar, len(list_of_similar_jobs)))
    print('\nPrinting {} most similar jobs for this candidate...\n'.format(num_similar))
    for rank, entry in enumerate(list_of_similar_jobs[:num_similar], start=1):
        distance, position = entry[0], entry[1]
        # Look the row up once instead of once per printed field.
        job_row = jobs.iloc[position]
        print('\n#{} most similar job'.format(rank))
        print('Job ID from list: {}'.format(position))
        print('Cosine distance: {}'.format(distance))
        print('Job ID from df: {}'.format(job_row['label']))
        print('Job title: {}'.format(job_row['Title']))
        print('Company: {}'.format(job_row['Company']))
        print('Job Description: {}'.format(job_row['JobDescription']))


In [None]:
def print_dissimilar_jobs(list_of_similar_jobs, num_dissimilar=10):
    """Print the farthest job postings from a distance-sorted ranking.

    Args:
        list_of_similar_jobs: list of ``(distance, job_position)`` tuples
            sorted by ascending distance; positions index into the
            module-level ``jobs`` table.
        num_dissimilar: how many entries to print, starting from the end of
            the ranking (default 10). Clamped to the length of the ranking
            so short rankings do not raise IndexError.
    """
    num_dissimilar = max(0, min(num_dissimilar, len(list_of_similar_jobs)))
    print('\nPrinting {} LEAST similar jobs for this candidate...\n'.format(num_dissimilar))
    for rank in range(1, num_dissimilar + 1):
        # rank 1 is the last entry of the ranking, i.e. the largest distance
        entry = list_of_similar_jobs[-rank]
        distance, position = entry[0], entry[1]
        # Look the row up once instead of once per printed field.
        job_row = jobs.iloc[position]
        print('\n#{} least similar job'.format(rank))
        print('Job ID from list: {}'.format(position))
        print('Cosine Distance: {}'.format(distance))
        print('Job ID from df: {}'.format(job_row['label']))
        print('Job title: {}'.format(job_row['Title']))
        print('Company: {}'.format(job_row['Company']))
        print('Job Description: {}'.format(job_row['JobDescription']))
        
        
        

In [None]:

# Precompute one averaged vector per job posting, using the function's default model.
jv = vectorize_jobs()

In [None]:
# No argument: pick_resume falls back to a randomly chosen resume.
chosen_resume = pick_resume()

In [None]:
# Rank all jobs against the chosen resume; result is sorted by ascending distance.
ordered_job_list = given_resume_find_similar_jobs(chosen_resume)

In [None]:
# Show the jobs at the front of the ranking (smallest cosine distance).
print_similar_jobs(ordered_job_list)

In [None]:
# Show the jobs at the back of the ranking (largest cosine distance).
print_dissimilar_jobs(ordered_job_list)