# 4c. Models - distill BERT

Description of the notebook

### 4c.0 Importing libraries

In [1]:
# Importing libraries

import os
from pathlib import Path
import numpy as np 
import pandas as pd 
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from tqdm import tqdm
import torch

### 4c.1 Gathering data

In [2]:
# Changing to project directory

os.chdir(Path(os.path.realpath("")).resolve().parents[2])

In [3]:
# Importing modules that fetch the data

from src.getter.load_application_and_opportunity import get_interim_data

# Gathering the data

featurizeddata_dbert = get_interim_data("featurizeddata_dbert")

In [2]:
# Defining list of column names that contains the names of the columns, if they belong to the job or candidate

job_column = ['ExternalBriefDescription','ExternalDescription', 'Title', 
              'JobCategoryName']
uid_column = ['OpportunityId', 'ApplicationId']
can_column = [
    'IsCandidateInternal',
    'BehaviorCriteria', 
    'MotivationCriteria',
    'EducationCriteria', 
    'LicenseAndCertificationCriteria', 
    'SkillCriteria', 
    'WorkExperiences', 
    'Educations', 
    'LicenseAndCertifications', 
    'Skills', 
    'Motivations', 
    'Behaviors', 
    'StepName', 
    'StepGroup',
    'pass_first_step'
] # Column - 'Tag' Will be added later, StepId has been removed
sel_column = ['IsRejected']

# Defining list of columns based on the type of contents

str_column = [
    'ExternalBriefDescription', 
    'ExternalDescription', 
    'Title', 
    'JobCategoryName', 
    'BehaviorCriteria', 
    'MotivationCriteria', 
    'EducationCriteria', 
    'LicenseAndCertificationCriteria', 
    'SkillCriteria', 
    'WorkExperiences', 
    'Educations', 
    'LicenseAndCertifications', 
    'Skills', 
    'Motivations', 
    'Behaviors', 
    'StepId', 
    'StepName', 
    'StepGroup'
]
bool_column = ['IsCandidateInternal', 'pass_first_step']
float_column = ['Tag']

# Defining list of columns based on the models

model_names = ["w2v", "bert", "dbert"]

# Setting the local folder location
dataloc = '/Users/rathish/Documents/Projects/Opportunity_Application_Ranker/inputs/data'

In [3]:
# Gathering necessary arrays and applying Standard Scaller

standardscaler = StandardScaler(copy = False)

In [4]:
# Getting top n-similar cosine vectors
def n_pairwise_cosine_similar(matrix_1, matrix_2, n = 3):

    '''
    Returns diagonal values of cosine similarity and list of index containing 
    that top n similar cosine

    Args:
        matrix_1 (numpy array: 2D): Array containing vectors whose similarity 
        is to be checked
        matrix_2 (numpy array: 2D): Array with which the similarity is to be 
        compared to
        
    n (int, default: 3): number of top similar values to be found

    Returns:
    diag_cosine_similarity (numpy array): Direct cosine similarity between 
    
    '''
    
    matrix_1 = (matrix_1/np.linalg.norm(matrix_1, axis = 1)[:, np.newaxis])
    matrix_2 = (matrix_2/np.linalg.norm(matrix_2, axis = 1)[:, np.newaxis])

    # Moving matrices to GPU
    device = torch.device(
        "mps" if torch.backends.mps.is_available() else "cpu"
        )
    
    similarity_dict = {}
    cosine_similarity = {}

    for index_1, values_1 in tqdm(
        enumerate(matrix_1), desc = "Processing", total = len(matrix_1)
    ):
        temp = {}
    
        for index_2, values_2 in enumerate(matrix_2):
            temp_value = np.dot(values_1, values_2)

            if index_1 == index_2:
                cosine_similarity[index_1] = temp_value
                
            temp[index_2] = temp_value

        sorteddict = sorted(
            temp.items(), 
            key = lambda x: x[1], 
            reverse = True
        )[:n]
        
        similarity_dict[index_1] = sorteddict #.keys() if keys are wanted
    
    return cosine_similarity, similarity_dict

## 4c.2.1 Working on distill-BERT data

In [4]:
# Checking columns

featurizeddata_dbert.columns

Index(['opportunity__dbert_hstack', 'candidate__dbert_hstack',
       'opportunity__dbert_vstack', 'candidate__dbert_vstack'],
      dtype='object')

In [6]:
# Deriving arrays for dbert vectors - horizontally stacked vectors

opportunity__dbert_hstack = np.array(
    [np.array(x) for x in featurizeddata_dbert['opportunity__dbert_hstack']]
)
# standardscaler.fit_transform(opportunity__dbert)

candidate__dbert_hstack = np.array(
    [np.array(x) for x in featurizeddata_dbert['candidate__dbert_hstack']]
)
# standardscaler.fit_transform(candidate__dbert)


# Deriving arrays for dbert vectors - vertically stacked vectors

opportunity__dbert_vstack = np.array(
    [np.array(x) for x in featurizeddata_dbert['opportunity__dbert_vstack']]
)
candidate__dbert_vstack = np.array(
    [np.array(x) for x in featurizeddata_dbert['candidate__dbert_vstack']]
)

In [7]:
# Reducing the dimensionality of the dbert vectors through PCA

no_of_dimensions = min(
    candidate__dbert_hstack.shape[1], opportunity__dbert_hstack.shape[1]
)
pca = PCA(n_components = no_of_dimensions, copy = False)

if candidate__dbert_hstack.shape[1] >= opportunity__dbert_hstack.shape[1]:
    candidate__dbert_hstack = pca.fit_transform(candidate__dbert_hstack)
else:
    opportunity__dbert_hstack = pca.fit_transform(opportunity__dbert_hstack)

In [8]:
# Checking the dimension after PCA
print(candidate__dbert_hstack.shape, opportunity__dbert_hstack.shape)

(110267, 3072) (110267, 3072)


In [9]:
# Running the n_pairwise_cosine_similar for horizontally stacked vectors

(
    cosine_similarity__dbert_hstack, similarity_dict__dbert_hstack
) = n_pairwise_cosine_similar(
    opportunity__dbert_hstack, candidate__dbert_hstack, n= 3
)

Processing: 100%|██████████| 110267/110267 [3:44:16<00:00,  8.19it/s] 


In [10]:
# Running the n_pairwise_cosine_similar for vertically stacked vectors

(
    cosine_similarity__dbert_vstack, similarity_dict__dbert_vstack
) = n_pairwise_cosine_similar(
    opportunity__dbert_vstack, candidate__dbert_vstack, n= 3
)

Processing: 100%|██████████| 110267/110267 [3:11:41<00:00,  9.59it/s] 


### 4c.2.1 Saving the similarity data - distillBERT

In [11]:
# Adding dictionaries into the variables for pickle
similarity_dbert = {}

# Adding dictionaries
similarity_dbert[
    "cosine_similarity__dbert_hstack"
] = cosine_similarity__dbert_hstack
similarity_dbert[
    "similarity_dict__dbert_hstack"
] = similarity_dict__dbert_hstack

similarity_dbert[
    "cosine_similarity__dbert_vstack"
] = cosine_similarity__dbert_vstack
similarity_dbert[
    "similarity_dict__dbert_vstack"
] = similarity_dict__dbert_vstack

In [None]:
# Saving the data - Gathering necessary libraries

from src.getter.save_application_and_opportunity import save_processed_data

# Saving the data 
save_processed_data(similarity_dbert, "similarity_dict_dbert")