# 4a. Models - Word2Vec model

Description for this notebook

### 4a.0 Importing libraries

In [1]:
# Importing libraries 

import os
from pathlib import Path
import numpy as np 
import pandas as pd 
from sklearn.decomposition import PCA
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

### 4a.1 Gathering data

In [2]:
# Changing to project directory

os.chdir(Path(os.path.realpath("")).resolve().parents[1])

In [3]:
# Importing modules that fetch or save the data

from src.getter.load_application_and_opportunity import *
from src.getter.save_application_and_opportunity import *

# Gathering the data
w2v_data_dictionary = get_interim_data("w2v_data_dictionary")
ppdata = get_interim_data("preprocesseddata")

In [4]:
# Defining list of column names that contains the names of the columns, if they belong to the job or candidate

job_column = ['ExternalBriefDescription','ExternalDescription', 'Title', 
              'JobCategoryName']
uid_column = ['OpportunityId', 'ApplicationId']
can_column = [
    'IsCandidateInternal',
    'BehaviorCriteria', 
    'MotivationCriteria',
    'EducationCriteria', 
    'LicenseAndCertificationCriteria', 
    'SkillCriteria', 
    'WorkExperiences', 
    'Educations', 
    'LicenseAndCertifications', 
    'Skills', 
    'Motivations', 
    'Behaviors', 
    'StepName', 
    'StepGroup',
    'pass_first_step'
] # Column - StepId has been removed

sel_column = ['IsRejected']

# Defining list of columns based on the type of contents

str_column = [
    'ExternalBriefDescription', 
    'ExternalDescription', 
    'Title', 
    'JobCategoryName', 
    'BehaviorCriteria', 
    'MotivationCriteria', 
    'EducationCriteria', 
    'LicenseAndCertificationCriteria', 
    'SkillCriteria', 
    'WorkExperiences', 
    'Educations', 
    'LicenseAndCertifications', 
    'Skills', 
    'Motivations', 
    'Behaviors', 
    'StepId', 
    'StepName', 
    'StepGroup'
]
bool_column = ['IsCandidateInternal', 'pass_first_step']
float_column = ['Tag']

# Defining list of columns based on the models

model_names = ["w2v", "bert", "dbert"]

In [5]:
# Redefining the data

ppdata = ppdata[uid_column]

In [6]:
# Applying w2v vectors onto the ppdata_uid 

ppdata["opportunity__w2v_hstack"] = ppdata['OpportunityId'].apply(lambda x : w2v_data_dictionary['job_opportunityid_w2v_dict_hstack'][x])
ppdata["opportunity__w2v_vstack"] = ppdata['OpportunityId'].apply(lambda x : w2v_data_dictionary['job_opportunityid_w2v_dict_vstack'][x])
ppdata["candidate__w2v_hstack"] = ppdata['ApplicationId'].apply(lambda x : w2v_data_dictionary['can_applicationid_w2v_dict_hstack'][x])
ppdata["candidate__w2v_vstack"] = ppdata['ApplicationId'].apply(lambda x : w2v_data_dictionary['can_applicationid_w2v_dict_vstack'][x])

### 4a.2.1 Working on word2vec data

Through experimentation, it was realized that the word2vec or as a matter of fact any langugage based vector need not be scalled as scalling would modify the information stored in the embeddings , we do not scale the data before PCA.  

#### 4a.2.1.1 Creating functions that reduce dimensionality

In [7]:
# Generating function that creates arrays for calculating the cosine similarity 
def reduce_dimensionality(data, opp_uid_name, app_uid_name,  opportunity_stack_dict, candidate_stack_dict):
     """
    Reduce the dimensionality of vectors using PCA, if horizontally stacked 
    (ignored if vertically stacked) and create arrays from the data used 
    further for similarity calculations
    
    Parameters:
        candidate_vectors (pandas.DataFrame): Horizontally stacked Word2Vec 
        vectors for candidates
        opportunity_vectors (pandas.DataFrame): Horizontally stacked Word2Vec 
        vectors for opportunities
        
    Returns:
        numpy.ndarray: Transformed array of vectors with reduced dimensionality
        if horizontally stacked else transformed array of vectors with same 
        dimensionality
    """
     data__ = data[[opp_uid_name] + [app_uid_name]]
     data__['opportunity_vectors'] = data__[opp_uid_name].apply(lambda x : opportunity_stack_dict[x])
     data__['candidate_vectors'] = data__[app_uid_name].apply(lambda x : candidate_stack_dict[x])

     opportunity__ = np.array(
          [np.array(x) for x in data__['opportunity_vectors'].tolist()]
        )
    
     candidate__ = np.array(
          [np.array(x) for x in data__['candidate_vectors'].tolist()]
        )

    # Setting the number of dimensions as minimum shape
     no_of_dimensions = min(candidate__.shape[1], opportunity__.shape[1])

     if 'PCA' in dir():
        pca = PCA(n_components = no_of_dimensions)
     else:
         from sklearn.decomposition import PCA 
         pca =  PCA(n_components = no_of_dimensions)

     # Applying PCA 
     if candidate__.shape[1] >= opportunity__.shape[1]: 
          # Exporting pca.fit for app based requirement
         pca_fit = pca.fit(candidate__)
         save_app_data(pca_fit, 'candidate_w2v_pca_model')

         app_array, opp_array = pca.fit_transform(candidate__), opportunity__
     else:
          # Exporting pca.fit for app based requirement
         pca_fit = pca.fit(opportunity__)
         save_app_data(pca_fit, 'opportunity_w2v_pca_model')

         app_array, opp_array = candidate__, pca.fit_transform(opportunity__) 
    
     app_pca_dict, opp_pca_dict = {}, {}
     
     for uid, vector in zip(data__[app_uid_name], app_array):
         app_pca_dict[uid] = vector
     
     for uid, vector in zip(data__[opp_uid_name], opp_array):
         opp_pca_dict[uid] = vector
    
     return opp_pca_dict, app_pca_dict

#### 4a.2.1.2 Deriving dimensionally reduced dictionaries

In [8]:
# Deriving dimensionally reduced dictionaries for opportunity ID
opp__w2v_pca_hstack_dict, app__w2v_pca_hstack_dict = reduce_dimensionality(ppdata, "OpportunityId", "ApplicationId", w2v_data_dictionary['job_opportunityid_w2v_dict_hstack'], w2v_data_dictionary['can_applicationid_w2v_dict_hstack'])


# Beware of using this, this functions overwrites the PCA pickle file - Use with caution
# opp__w2v_pca_vstack_dict, app__w2v_pca_vstack_dict = reduce_dimensionality(ppdata, "OpportunityId", "ApplicationId", w2v_data_dictionary['job_opportunityid_w2v_dict_vstack'], w2v_data_dictionary['can_applicationid_w2v_dict_vstack'])

In [9]:
# Checking the dimensions after PCA

print(len(opp__w2v_pca_hstack_dict), len(app__w2v_pca_hstack_dict))

8473 110267


#### 4a.2.2.1 Creating functions that calculate cosine similarities

In [10]:
# Getting top n-similar cosine vectors
def pairwise_cosine(data, opp_uid_name, app_uid_name, opp_pca_dict, app_pca_dict):
    '''
    Compute pairwise cosine similarity between opportunity and application 
    vectors in DataFrame

    Parameters:
        data(pandas.DataFrame): Input DataFrame containg opportunity and 
        application UIDs. 
        opp_upd_name(str): Columns name with oppotunity IDs
        app_uid_name(str): Columns name with applciation IDs
        opp_pca_dict (dict): Dictionary mapping opportunity uids to the vectors
        app_pca_dict (dict): Dictionary mapping application uids to the vectors
    
    Returns:
        pandas.DataFrame with columns "OpportunityId", "ApplciationId and 
        cosine similarity
    '''
    data__ = data[[opp_uid_name] + [app_uid_name]]
    data__['opportunity_vectors'] = data__[opp_uid_name].apply(lambda x : opp_pca_dict[x])
    data__['candidate_vectors'] = data__[app_uid_name].apply(lambda x : app_pca_dict[x])

    opportunity__ = np.array(
          [np.array(x) for x in data__['opportunity_vectors'].tolist()]
        )
    
    candidate__ = np.array(
          [np.array(x) for x in data__['candidate_vectors'].tolist()]
        )
    
    opportunity__ = (opportunity__/np.linalg.norm(opportunity__, axis = 1)[:, np.newaxis])
    candidate__ = (candidate__/np.linalg.norm(candidate__, axis = 1)[:, np.newaxis])
    
    data__['row_similarity'] = [np.dot(row1, row2) for row1, row2 in zip(opportunity__, candidate__)]
     
    return data__[[opp_uid_name] + [app_uid_name] + ['row_similarity']]

def topn_similar(opp_pca_dict, app_pca_dict, n = 3):
    """
    Calculates top n most similar application IDs for a given opportunity ID

    Parameters:
        opp_pca_dict (dict): Dictionary mapping opportunity IDs to their 
        dimensionally reduced vectors. 
        app_pca_dict (dict): Dictionary mapping application IDs to their 
        dimensionally reduced vectors. 
        n(int, optional - default = 3): Number of top similar application IDs 
        to retrieve
    
    Returns:
    dict: A dictionary with keys as opportunity IDs pointing to value which is 
    a dictionary in itself. This dictionary contaings application IDs as keys 
    and similarity scores as values
    """
    
    similarity_dict = {}

    for key_opp, val_opp in tqdm(opp_pca_dict.items(), desc = "Application IDs: ", total = len(opp_pca_dict)):
        
        temp = {}

        for key_app, val_app in app_pca_dict.items():
            temp_value = np.dot(val_opp/np.linalg.norm(val_opp), val_app/np.linalg.norm(val_app))
            temp[key_app] = temp_value
        
        sorteddict = sorted(temp.items(), key = lambda x: x[1], reverse = True)[:n]

        similarity_dict[key_opp] = sorteddict
    
    return similarity_dict

#### 4a.2.2.2 Applying pariwise - cosine similarity and getting top n(=3 default) similar application IDs and similarity values for each opportunity ID

In [11]:
# Cosine-similarity pairwise
cosine_similarity_w2v_opp_app_hstack = pairwise_cosine(ppdata, "OpportunityId", "ApplicationId", opp__w2v_pca_hstack_dict, app__w2v_pca_hstack_dict)
cosine_similarity_w2v_opp_app_vstack = pairwise_cosine(ppdata, "OpportunityId", "ApplicationId", opp__w2v_pca_vstack_dict, app__w2v_pca_vstack_dict)

NameError: name 'opp__w2v_pca_vstack_dict' is not defined

In [None]:
# Similarity dictionaries Space
similarity_w2v_dict_opp_app_hstack = topn_similar(opp__w2v_pca_hstack_dict, app__w2v_pca_hstack_dict)
similarity_w2v_dict_opp_app_vstack = topn_similar(opp__w2v_pca_vstack_dict, app__w2v_pca_vstack_dict)

Application IDs: 100%|██████████| 8473/8473 [2:34:07<00:00,  1.09s/it]  
Application IDs:  20%|██        | 1715/8473 [22:51<1:32:24,  1.22it/s]

### 4a.2.3 Saving the similarity data - horizontally and vertically stacked data

In [None]:
# Saving the data 
# hstack
save_processed_data(similarity_w2v_dict_opp_app_hstack, "similarity_dict_w2v_hstack")
save_processed_data(cosine_similarity_w2v_opp_app_hstack, "cosine_similarity_w2v_hstack")

#vstack
save_processed_data(similarity_w2v_dict_opp_app_vstack, "similarity_dict_w2v_vstack")
save_processed_data(cosine_similarity_w2v_opp_app_vstack, "cosine_similarity_w2v_vstack")

#Saving the dimensionally reduced vectors for streamlit app output
save_app_data(opp__w2v_pca_hstack_dict, 'opp__w2v_pca_hstack_dict')
save_app_data(app__w2v_pca_hstack_dict, 'app__w2v_pca_hstack_dict')
save_app_data(opp__w2v_pca_vstack_dict, 'opp__w2v_pca_vstack_dict')
save_app_data(app__w2v_pca_vstack_dict, 'app__w2v_pca_vstack_dict')