# 3a. Featurizing the data - Word 2 Vec model

## 3a.0 Importing libraries

In [1]:
# Importing libraries 

import os
from pathlib import Path
import numpy as np 
import pandas as pd 
import gensim
import nltk
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
import warnings
warnings.filterwarnings("ignore")

## 3a.1.1 Gathering data

In [2]:
# Moving to the directory two levels above the current working directory.
# The imports below are written against a top-level `src` package, so the
# move is skipped when `src` is already reachable from the working directory.
# Without this guard, re-running the cell would climb two more levels each
# time and the imports would then fail.

if not Path("src").is_dir():
    os.chdir(Path(os.path.realpath("")).resolve().parents[1])

# Importing the data gathering modules

from src.getter.load_application_and_opportunity import *
from src.getter.save_application_and_opportunity import *

# Loading the interim dataset stored under the name "preprocesseddata" and
# previewing its first rows
fdata = get_interim_data("preprocesseddata")
fdata.head()

Unnamed: 0,OpportunityId,ApplicationId,ExternalBriefDescription,ExternalDescription,Title,JobCategoryName,IsRejected,IsCandidateInternal,BehaviorCriteria,MotivationCriteria,...,SkillCriteria__trnsfrmrpp,WorkExperiences__trnsfrmrpp,Educations__trnsfrmrpp,LicenseAndCertifications__trnsfrmrpp,Skills__trnsfrmrpp,Motivations__trnsfrmrpp,Behaviors__trnsfrmrpp,StepId__trnsfrmrpp,StepName__trnsfrmrpp,StepGroup__trnsfrmrpp
0,MbzeABKVn06G8irkoHJeIg==,nTzdqGj020CYqTouPocGSg==,"$16.00 Per Hour\n\nAt Orkin, our purpose is to...",<p><strong>$16.00 Per Hour</strong></p>\n<p><s...,Customer Service Specialist,Customer Service,True,False,[{'Description': 'Capable of carrying out a gi...,[{'Description': 'Inspired to perform well by ...,...,MinimumScaleValue 3 MinimumScaleValueName Inte...,EndMonth None EndYear None JobTitle Call Cente...,Degree Some college Description None Graduatio...,,ScaleValue 4 ScaleValueName Advanced Skill Clo...,Description Inspired to perform well by moneta...,Description Devoted to a task or purpose with ...,K8yQlic+/UiXxBMpOnAoLQ==,Decline,declined
1,7SPt0A57/kyzM9hE9vxDRg==,QVk5MFCZ70WvlZE9FzAW9g==,"$15.00 Per Hour\n\nAt Orkin, our purpose is to...",<p><strong>$15.00 Per Hour</strong></p>\n<p><s...,Customer Service Specialist,Customer Service,True,False,[{'Description': 'Capable of carrying out a gi...,[{'Description': 'Inspired to perform well by ...,...,MinimumScaleValue 3 MinimumScaleValueName Inte...,EndMonth None EndYear None JobTitle Coordinato...,Degree Diploma Description None GraduationMont...,,ScaleValue 5 ScaleValueName Expert Skill Sales...,,,K8yQlic+/UiXxBMpOnAoLQ==,Decline,declined
2,7SPt0A57/kyzM9hE9vxDRg==,I1kcPlAw3E+rqceh0qrutQ==,"$15.00 Per Hour\n\nAt Orkin, our purpose is to...",<p><strong>$15.00 Per Hour</strong></p>\n<p><s...,Customer Service Specialist,Customer Service,True,False,[{'Description': 'Capable of carrying out a gi...,[{'Description': 'Inspired to perform well by ...,...,MinimumScaleValue 3 MinimumScaleValueName Inte...,EndMonth None EndYear None JobTitle Direct Car...,Degree HIGH SCHOOL DIPLOMA Description None Gr...,,ScaleValue 4 ScaleValueName Advanced Skill Cas...,,,K8yQlic+/UiXxBMpOnAoLQ==,Decline,declined
3,zolSWBFjWESbfkj8AXLYwA==,VTCXZK6/ZUWJDpxTcm2CRg==,"$15.00 Per Hour\n\nAt Orkin, our purpose is to...",<p><strong>$15.00 Per Hour</strong></p>\n<p><s...,Customer Service Specialist,Customer Service,True,False,[{'Description': 'Capable of carrying out a gi...,[{'Description': 'Inspired to perform well by ...,...,MinimumScaleValue 3 MinimumScaleValueName Inte...,EndMonth None EndYear 2019.0 JobTitle Package ...,Degree Associate in Early Description None Gra...,,ScaleValue 5 ScaleValueName Expert Skill Cashi...,,,K8yQlic+/UiXxBMpOnAoLQ==,Decline,declined
4,zolSWBFjWESbfkj8AXLYwA==,I6KgcL0jdkG8wBnL1+BZ/g==,"$15.00 Per Hour\n\nAt Orkin, our purpose is to...",<p><strong>$15.00 Per Hour</strong></p>\n<p><s...,Customer Service Specialist,Customer Service,True,False,[{'Description': 'Capable of carrying out a gi...,[{'Description': 'Inspired to perform well by ...,...,MinimumScaleValue 3 MinimumScaleValueName Inte...,EndMonth None EndYear None JobTitle Warehouse ...,Degree Bachelor of Business Admin Description ...,,ScaleValue 5 ScaleValueName Expert Skill Forkl...,,,K8yQlic+/UiXxBMpOnAoLQ==,Decline,declined


### 3a.1.2 Defining column names for featurization 

In [3]:
# Column groups by role ------------------------------------------------------

# Columns describing the job opportunity
job_column = ['ExternalBriefDescription', 'ExternalDescription', 'Title', 'JobCategoryName']

# Identifier columns
uid_column = ['OpportunityId', 'ApplicationId']

# Columns describing the candidate / application
can_column = [
    'IsCandidateInternal',
    'BehaviorCriteria',
    'MotivationCriteria',
    'EducationCriteria',
    'LicenseAndCertificationCriteria',
    'SkillCriteria',
    'WorkExperiences',
    'Educations',
    'LicenseAndCertifications',
    'Skills',
    'Motivations',
    'Behaviors',
    'StepName',
    'Tag',
    'StepGroup',
    'pass_first_step',
]

# Selection outcome column
sel_column = ['IsRejected']

# Column groups by content type ----------------------------------------------

# Text columns: every job column plus the textual candidate columns.
# Unpacking builds a new list, so job_column itself is left untouched.
str_column = [
    *job_column,
    'BehaviorCriteria',
    'MotivationCriteria',
    'EducationCriteria',
    'LicenseAndCertificationCriteria',
    'SkillCriteria',
    'WorkExperiences',
    'Educations',
    'LicenseAndCertifications',
    'Skills',
    'Motivations',
    'Behaviors',
    'StepId',
    'StepName',
    'StepGroup',
]

# Boolean columns
bool_column = ['IsCandidateInternal', 'pass_first_step']

# Float columns
float_column = ['Tag']

## 3a.2 TFIDF weighted word2vec vectorization

Let's use TF-IDF weighted Word2Vec to generate the features needed to measure similarity. 
To do this, we define two helper functions. The first, `tfidf_weighted_word2vec`, takes the data and a column name, trains a Word2Vec model on that column, fits a TF-IDF vectorizer on it, and returns the word-to-vector dictionary, the word-to-IDF-weight dictionary, and the TF-IDF vocabulary. 
The second, `tfidfw2v_vectorizer`, takes a single text together with the word-to-vector and word-to-weight dictionaries produced by the first function and returns the TF-IDF weighted Word2Vec vector for that text. 

### 3a.2.1 Creating functions that derive TFIDF weighted word2vec information

In [4]:
def w2vbased_embedder(data, uid_column_name, str_column, bool_column, float_column, vector_dim=768):
    """
    Embeds the string columns with TF-IDF weighted Word2Vec, one-hot encodes
    and pads the boolean columns, pads the float columns, and combines the
    per-column vectors of every row into a concatenated ("hstack") vector and
    an element-wise mean ("vstack") vector.

    For every name in str_column the text is read from the column
    `<name>__w2vpp` of data; boolean and float columns are read under their
    own names. The input DataFrame is not modified.

    Args:
        data (pandas.DataFrame): Input dataset.
        uid_column_name (str): Name of the ID column whose values become the
            keys of the returned dictionaries.
        str_column (list): List of string column names (without the
            "__w2vpp" suffix).
        bool_column (list): List of boolean column names.
        float_column (list): List of float column names.
        vector_dim (int, optional): Dimension of the word vectors and of the
            padded boolean/float vectors. Defaults to 768.

    Returns:
        dict_hstack (dict): Dictionary with ID values as keys and the
        concatenation of all per-column vectors as values.
        dict_vstack (dict): Dictionary with ID values as keys and the
        element-wise mean of all per-column vectors as values.
        When an ID occurs in several rows, the last row wins.

    Raises:
        ValueError: If str_column, bool_column and float_column are all empty.

    Side effects:
        When uid_column_name == 'ApplicationId', the per-column Word2Vec
        dictionaries and TF-IDF weight dictionaries are passed to
        save_app_data.
    """
    def tfidf_weighted_word2vec(data, colname, vector_dim = vector_dim):
        """
        Function generates necessary components to derive TF-IDF weighted
        Word2Vec from an entire data column.

        Args:
            data (pandas.DataFrame): Dataset containing the text column.
            colname (str): Name of the target column on which the operation
            needs to be performed.
            vector_dim (int): Dimension of the trained word vectors.

        Returns:
            w2v (dict): Dictionary with words as keys and their trained
            Word2Vec vectors (rounded to 3 decimals) as values.
            word2weight (dict): Dictionary with words as keys and their IDF
            value from the fitted TfidfVectorizer (rounded to 3 decimals) as
            values.
            vocab (dict items): (word, index) pairs of the TF-IDF vocabulary.
        """
        # Every cell is cast to str, so missing values are seen as the
        # literal text of their string representation
        texts = [str(x) for x in data[colname].tolist()]

        # Tokenizing the texts for the Word2Vec model
        tokenized_texts = [gensim.utils.simple_preprocess(x) for x in texts]

        # Skip-gram model (sg = 1); words seen fewer than 3 times are dropped
        model = Word2Vec(
            window = 2, min_count = 3, sg = 1, vector_size = vector_dim
        )
        model.build_vocab(tokenized_texts)
        model.train(corpus_iterable = tokenized_texts,
                    total_examples = model.corpus_count,
                    epochs = model.epochs)

        w2v = dict(zip(model.wv.index_to_key, model.wv.vectors.round(3)))

        # Only the fitted vocabulary and idf_ values are needed, so the
        # document-term matrix is not materialised
        tfidfvectorizer = TfidfVectorizer()
        tfidfvectorizer.fit(texts)
        vocab = tfidfvectorizer.vocabulary_.items()

        word2weight = {
            w: round(tfidfvectorizer.idf_[i], 3)
            for w, i in tfidfvectorizer.vocabulary_.items()
        }

        return w2v, word2weight, vocab

    def tfidfw2v_vectorizer(text, w2v, word2weight, vector_dim = vector_dim):
        """
        Computes the TF-IDF weighted Word2Vec vector of a single text:

        (tfidf(w1) * w2v(w1) + tfidf(w2) * w2v(w2) + ...)
            / (tfidf(w1) + tfidf(w2) + ...)

        where tfidf(w) is the number of occurrences of w in the text times
        its weight in word2weight. Words missing from either dictionary are
        skipped.

        Args:
            text (str): Input text, split on whitespace.
            w2v (dict): Dictionary with words as keys and vectors as values.
            word2weight (dict): Dictionary with words as keys and weights as
            values.
            vector_dim (int): Dimension of the returned vector.

        Returns:
            np.ndarray: Weighted vector rounded to 3 decimals, or a zero
            vector when text is not a string, is empty, or contains no known
            word.
        """
        # Non-string cells (e.g. NaN) have no words to embed
        if not isinstance(text, str):
            return np.zeros(vector_dim)

        numerator_vector = np.zeros(vector_dim)
        denominator_value = 0.0

        # Each occurrence contributes its weight once, so a word seen k times
        # is weighted k * weight (term frequency times IDF). Multiplying by
        # the count again here would weight it k squared.
        for word in text.split():
            if word in w2v and word in word2weight:
                weight = word2weight[word]
                numerator_vector += weight * w2v[word]
                denominator_value += weight

        if denominator_value == 0.0:
            return np.zeros(vector_dim)

        return np.round(numerator_vector / denominator_value, 3)

    def pad_to_vector_dim(values, vector_dim):
        """
        Right-pads a 1-D array with zeros up to the next multiple of
        vector_dim; an array whose length already is a multiple is returned
        without extra padding.
        """
        return np.pad(values, (0, (-len(values)) % vector_dim), 'constant')

    def encode_and_pad_boolean_columns(fdata, bool_column, vector_dim = vector_dim):
        """
        One-hot encodes each boolean column and pads every encoded row with
        zeros, storing the result in a new column "<colname>__w2v".

        Args:
            fdata (pandas.DataFrame): DataFrame modified in place.
            bool_column (list): Names of the boolean columns to encode.
            vector_dim (int): Dimension the encoded rows are padded to.

        Returns:
            None
        """
        for colname in bool_column:
            # A fresh encoder per column keeps the columns independent
            onehotencoder = OneHotEncoder(sparse_output = False, handle_unknown = 'ignore')
            encoded = onehotencoder.fit_transform(
                np.reshape(np.array(fdata[colname]), (-1, 1))
            )
            fdata[colname + "__w2v"] = [
                pad_to_vector_dim(x, vector_dim) for x in encoded
            ]

    def pad_float_columns(fdata, float_column, vector_dim = vector_dim):
        """
        Turns every float value into a one-element array padded with zeros,
        storing the result in a new column "<colname>__w2v".

        Args:
            fdata (pandas.DataFrame): DataFrame modified in place.
            float_column (list): Names of the float columns to pad.
            vector_dim (int): Dimension the values are padded to.

        Returns:
            None
        """
        for colname in float_column:
            values = np.reshape(np.array(fdata[colname]), (-1, 1))
            fdata[colname + "__w2v"] = [
                pad_to_vector_dim(x, vector_dim) for x in values
            ]

    def hstacker(row_arrays):
        """
        Concatenates the per-column vectors of one row into a single vector.
        """
        return np.concatenate(list(row_arrays))

    def vstacker(row_arrays):
        """
        Gives the element-wise mean of the per-column vectors of one row.
        """
        return np.mean(np.stack(list(row_arrays)), axis = 0)

    feature_columns = list(str_column) + list(bool_column) + list(float_column)
    if not feature_columns:
        raise ValueError(
            "At least one string, boolean or float column is required"
        )

    # Working on an explicit copy of the needed columns so that the new
    # "__w2v" columns never touch the caller's DataFrame
    data__ = data[
        [uid_column_name]
        + [x + "__w2vpp" for x in str_column]
        + list(bool_column)
        + list(float_column)
    ].copy()

    # Applying encode_and_pad_boolean_columns and pad_float_columns
    encode_and_pad_boolean_columns(data__, bool_column, vector_dim)
    pad_float_columns(data__, float_column, vector_dim)

    # Training the TF-IDF weighted Word2Vec components per string column and
    # embedding every row of that column
    w2v_dict, word2weight_dict = {}, {}
    for colname in str_column:
        source_column = colname + "__w2vpp"
        w2v, word2weight, _ = tfidf_weighted_word2vec(data__, source_column, vector_dim)

        w2v_dict[source_column] = w2v
        word2weight_dict[source_column] = word2weight

        data__[colname + "__w2v"] = [
            tfidfw2v_vectorizer(x, w2v, word2weight, vector_dim)
            for x in data__[source_column]
        ]

    # Combining the per-column vectors row by row, keyed by the ID value
    dict_hstack = {}
    dict_vstack = {}
    vector_columns = [m + "__w2v" for m in feature_columns]
    uid_values = data__[uid_column_name].tolist()
    for uid, row_arrays in zip(uid_values, zip(*(data__[c] for c in vector_columns))):
        dict_hstack[uid] = hstacker(row_arrays)
        dict_vstack[uid] = vstacker(row_arrays)

    # Saving w2v model components for applications
    if uid_column_name == 'ApplicationId':
        save_app_data(w2v_dict, uid_column_name + '_w2v_dict')
        save_app_data(word2weight_dict, uid_column_name + '_word2weight_dict')

    return dict_hstack, dict_vstack

### 3a.2.2 Executing the w2vbased_embedder function

In [5]:
# Opportunity side: the job columns are split by content type and handed to
# w2vbased_embedder, keyed by OpportunityId
uid_column_name = 'OpportunityId'
str_col = [name for name in job_column if name in str_column]
bool_col = [name for name in job_column if name in bool_column]
float_col = [name for name in job_column if name in float_column]

(
    job_opportunityid_w2v_dict_hstack,
    job_opportunityid_w2v_dict_vstack,
) = w2vbased_embedder(fdata, uid_column_name, str_col, bool_col, float_col)

In [None]:
# Candidate side: the candidate columns are split by content type and handed
# to w2vbased_embedder, keyed by ApplicationId
uid_column_name = 'ApplicationId'
str_col = [name for name in can_column if name in str_column]
bool_col = [name for name in can_column if name in bool_column]
float_col = [name for name in can_column if name in float_column]

# Running the w2vbased_embedder function
(
    can_applicationid_w2v_dict_hstack,
    can_applicationid_w2v_dict_vstack,
) = w2vbased_embedder(fdata, uid_column_name, str_col, bool_col, float_col)

## 3a.3 Saving data for further analysis

In [None]:

# Importing modules that save the data

from src.getter.save_application_and_opportunity import *

# Bundling the four embedding dictionaries under their own names so they can
# be stored together
w2v_dict = {
    'job_opportunityid_w2v_dict_hstack': job_opportunityid_w2v_dict_hstack,
    'can_applicationid_w2v_dict_hstack': can_applicationid_w2v_dict_hstack,
    'job_opportunityid_w2v_dict_vstack': job_opportunityid_w2v_dict_vstack,
    'can_applicationid_w2v_dict_vstack': can_applicationid_w2v_dict_vstack,
}

# Saving the bundled dictionary

save_interim_data(w2v_dict, "w2v_data_dictionary")