# 4a. Models - Word 2 Vec model

Description for this notebook

### 4a.0 Importing libraries

In [1]:
# Importing libraries 

import os
from pathlib import Path
import numpy as np 
import pandas as pd 
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from tqdm import tqdm
import torch

### 4a.1 Gathering data

In [2]:
# Changing to project directory

os.chdir(Path(os.path.realpath("")).resolve().parents[2])

In [3]:
# Importing modules that fetch the data

from src.getter.load_application_and_opportunity import get_interim_data

# Gathering the data

featurizeddata_w2v = get_interim_data("featurizeddata_w2v")

In [4]:
# Defining list of column names that contains the names of the columns, if they belong to the job or candidate

job_column = ['ExternalBriefDescription','ExternalDescription', 'Title', 
              'JobCategoryName']
uid_column = ['OpportunityId', 'ApplicationId']
can_column = [
    'IsCandidateInternal',
    'BehaviorCriteria', 
    'MotivationCriteria',
    'EducationCriteria', 
    'LicenseAndCertificationCriteria', 
    'SkillCriteria', 
    'WorkExperiences', 
    'Educations', 
    'LicenseAndCertifications', 
    'Skills', 
    'Motivations', 
    'Behaviors', 
    'StepName', 
    'StepGroup',
    'pass_first_step'
] # Column - 'Tag' Will be added later, StepId has been removed

sel_column = ['IsRejected']

# Defining list of columns based on the type of contents

str_column = [
    'ExternalBriefDescription', 
    'ExternalDescription', 
    'Title', 
    'JobCategoryName', 
    'BehaviorCriteria', 
    'MotivationCriteria', 
    'EducationCriteria', 
    'LicenseAndCertificationCriteria', 
    'SkillCriteria', 
    'WorkExperiences', 
    'Educations', 
    'LicenseAndCertifications', 
    'Skills', 
    'Motivations', 
    'Behaviors', 
    'StepId', 
    'StepName', 
    'StepGroup'
]
bool_column = ['IsCandidateInternal', 'pass_first_step']
float_column = ['Tag']

# Defining list of columns based on the models

model_names = ["w2v", "bert", "dbert"]

In [5]:
# Gathering necessary arrays and applying Standard Scaller

standardscaler = StandardScaler(copy = False)

In [6]:
# Getting top n-similar cosine vectors
def n_pairwise_cosine_similar(matrix_1, matrix_2, n = 3):

    '''
    Returns diagonal values of cosine similarity and list of index containing 
    that top n similar cosine

    Args:
        matrix_1 (numpy array: 2D): Array containing vectors whose similarity 
        is to be checked
        matrix_2 (numpy array: 2D): Array with which the similarity is to be 
        compared to
        
    n (int, default: 3): number of top similar values to be found

    Returns:
    diag_cosine_similarity (numpy array): Direct cosine similarity between 
    
    '''
    
    matrix_1 = (matrix_1/np.linalg.norm(matrix_1, axis = 1)[:, np.newaxis])
    matrix_2 = (matrix_2/np.linalg.norm(matrix_2, axis = 1)[:, np.newaxis])

    # Moving matrices to GPU
    device = torch.device(
        "mps" if torch.backends.mps.is_available() else "cpu"
        )
    
    
    similarity_dict = {}
    cosine_similarity = {}

    for index_1, values_1 in tqdm(
        enumerate(matrix_1), desc = "Processing", total = len(matrix_1)
    ):
        temp = {}
    
        for index_2, values_2 in enumerate(matrix_2):
            temp_value = np.dot(values_1, values_2)

            if index_1 == index_2:
                cosine_similarity[index_1] = temp_value
                
            temp[index_2] = temp_value

        sorteddict = sorted(
            temp.items(), 
            key = lambda x: x[1], 
            reverse = True
        )[:n]
        
        similarity_dict[index_1] = sorteddict #.keys() if keys are wanted
    
    return cosine_similarity, similarity_dict

### 4a.2.1 Working on word2vec data

Through experimentation, it was realized that the word2vec or as a matter of fact any langugage based vector need not be scalled as scalling would modify the information stored in the embeddings , we do not scale the data before PCA.  

In [7]:
# Deriving arrays for w2v vectors - horizontally stacked

opportunity__w2v_hstack = np.array(
    [np.array(x) for x in featurizeddata_w2v['opportunity__w2v_hstack']]
)
"""
standardscaler.fit_transform(opportunity__w2v) - Experiment to see if minmax 
scalling affects the similarity means
"""

candidate__w2v_hstack = np.array(
    [np.array(x) for x in featurizeddata_w2v['candidate__w2v_hstack']]
)
"""
standardscaler.fit_transform(candidate__w2v) - Experiment to see if minmax 
scalling affetst the similarity means
"""

# Deriving array for w2v vectors - Vertically stacked

opportunity__w2v_vstack = np.array(
    [np.array(x) for x in featurizeddata_w2v['opportunity__w2v_vstack']]
)
candidate__w2v_vstack = np.array(
    [np.array(x) for x in featurizeddata_w2v['candidate__w2v_vstack']]
)

In [8]:
# Reducing the dimensionality of horizontally stacked w2v vectors through PCA

no_of_dimensions = min(
    candidate__w2v_hstack.shape[1], opportunity__w2v_hstack.shape[1]
)

pca = PCA(n_components = no_of_dimensions, copy = False)

if candidate__w2v_hstack.shape[1] >= opportunity__w2v_hstack.shape[1]:
    candidate__w2v_hstack = pca.fit_transform(candidate__w2v_hstack)
else:
    opportunity__w2v_hstack = pca.fit_transform(opportunity__w2v_hstack)

In [9]:
# Checking the dimensions after PCA

print(candidate__w2v_hstack.shape, opportunity__w2v_hstack.shape)

(110267, 2048) (110267, 2048)


In [10]:
# Running the n_pairwise_cosine_similar func for horizontally stacked vectors

(
    cosine_similarity__w2v_hstack, 
    similarity_dict__w2v_hstack
) = n_pairwise_cosine_similar(
    opportunity__w2v_hstack, candidate__w2v_hstack, n = 3
)

Processing: 100%|██████████| 110267/110267 [7:41:51<00:00,  3.98it/s] 


In [11]:
# Running the n_pairwise_cosine_similar func for vertically stacked vectors

(
    cosine_similarity__w2v_vstack, similarity_dict__w2v_vstack
) = n_pairwise_cosine_similar(
    opportunity__w2v_vstack, candidate__w2v_vstack, n = 3
)

Processing: 100%|██████████| 110267/110267 [4:03:37<00:00,  7.54it/s] 


### 4a.2.2 Saving the similarity data - horizontally and vertically stacked data

In [12]:
# Adding dictionaries into the variables for pickle
similarity_w2v = {}

# Adding dictionaries
similarity_w2v["cosine_similarity__w2v_hstack"] = cosine_similarity__w2v_hstack
similarity_w2v["similarity_dict__w2v_hstack"] = similarity_dict__w2v_hstack

similarity_w2v["cosine_similarity__w2v_vstack"] = cosine_similarity__w2v_vstack
similarity_w2v["similarity_dict__w2v_vstack"] = similarity_dict__w2v_vstack

In [23]:
# Saving the data - Gathering necessary libraries

from src.getter.save_application_and_opportunity import save_processed_data

# Saving the data 
save_processed_data(similarity_w2v, "similarity_dict_w2v")