# 3a. Featurizing the data - Word 2 Vec model

## 3a.1.1 Gathering data

In [1]:
import pickle
from collections import Counter

import gensim
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from tqdm import tqdm

In [2]:
# Root folder of the project's input data.
# NOTE(review): this is an absolute, machine-specific path; consider making it
# configurable so the notebook can run on another machine.
dataloc ='/Users/rathish/Documents/Projects/Opportunity_Application_Ranker/inputs/data'
# Load the preprocessed dataset from the interim folder.
# NOTE(review): unpickling can execute arbitrary code; only load files from a
# trusted source.
fdata = pd.read_pickle(dataloc + '/interim/preprocesseddata.pkl')
# Preview the first rows of the loaded frame.
fdata.head()

Unnamed: 0,OpportunityId,ApplicationId,ExternalBriefDescription,ExternalDescription,Title,JobCategoryName,IsRejected,IsCandidateInternal,BehaviorCriteria,MotivationCriteria,...,SkillCriteria__mbertpp,WorkExperiences__mbertpp,Educations__mbertpp,LicenseAndCertifications__mbertpp,Skills__mbertpp,Motivations__mbertpp,Behaviors__mbertpp,StepId__mbertpp,StepName__mbertpp,StepGroup__mbertpp
0,MbzeABKVn06G8irkoHJeIg==,nTzdqGj020CYqTouPocGSg==,"$16.00 Per Hour\n\nAt Orkin, our purpose is to...",<p><strong>$16.00 Per Hour</strong></p>\n<p><s...,Customer Service Specialist,Customer Service,True,False,[{'Description': 'Capable of carrying out a gi...,[{'Description': 'Inspired to perform well by ...,...,MinimumScaleValue 3 MinimumScaleValueName Inte...,EndMonth None EndYear None JobTitle Call Cente...,Degree Some college Description None Graduatio...,,ScaleValue 4 ScaleValueName Advanced Skill Clo...,Description Inspired to perform well by moneta...,Description Devoted to a task or purpose with ...,K8yQlic+/UiXxBMpOnAoLQ==,Decline,declined
1,7SPt0A57/kyzM9hE9vxDRg==,QVk5MFCZ70WvlZE9FzAW9g==,"$15.00 Per Hour\n\nAt Orkin, our purpose is to...",<p><strong>$15.00 Per Hour</strong></p>\n<p><s...,Customer Service Specialist,Customer Service,True,False,[{'Description': 'Capable of carrying out a gi...,[{'Description': 'Inspired to perform well by ...,...,MinimumScaleValue 3 MinimumScaleValueName Inte...,EndMonth None EndYear None JobTitle Coordinato...,Degree Diploma Description None GraduationMont...,,ScaleValue 5 ScaleValueName Expert Skill Sales,,,K8yQlic+/UiXxBMpOnAoLQ==,Decline,declined
2,7SPt0A57/kyzM9hE9vxDRg==,I1kcPlAw3E+rqceh0qrutQ==,"$15.00 Per Hour\n\nAt Orkin, our purpose is to...",<p><strong>$15.00 Per Hour</strong></p>\n<p><s...,Customer Service Specialist,Customer Service,True,False,[{'Description': 'Capable of carrying out a gi...,[{'Description': 'Inspired to perform well by ...,...,MinimumScaleValue 3 MinimumScaleValueName Inte...,EndMonth None EndYear None JobTitle Direct Car...,Degree HIGH SCHOOL DIPLOMA Description None Gr...,,ScaleValue 4 ScaleValueName Advanced Skill Cash,,,K8yQlic+/UiXxBMpOnAoLQ==,Decline,declined
3,zolSWBFjWESbfkj8AXLYwA==,VTCXZK6/ZUWJDpxTcm2CRg==,"$15.00 Per Hour\n\nAt Orkin, our purpose is to...",<p><strong>$15.00 Per Hour</strong></p>\n<p><s...,Customer Service Specialist,Customer Service,True,False,[{'Description': 'Capable of carrying out a gi...,[{'Description': 'Inspired to perform well by ...,...,MinimumScaleValue 3 MinimumScaleValueName Inte...,EndMonth None EndYear 2019.0 JobTitle Package ...,Degree Associate in Early Description None Gra...,,ScaleValue 5 ScaleValueName Expert Skill Cashier,,,K8yQlic+/UiXxBMpOnAoLQ==,Decline,declined
4,zolSWBFjWESbfkj8AXLYwA==,I6KgcL0jdkG8wBnL1+BZ/g==,"$15.00 Per Hour\n\nAt Orkin, our purpose is to...",<p><strong>$15.00 Per Hour</strong></p>\n<p><s...,Customer Service Specialist,Customer Service,True,False,[{'Description': 'Capable of carrying out a gi...,[{'Description': 'Inspired to perform well by ...,...,MinimumScaleValue 3 MinimumScaleValueName Inte...,EndMonth None EndYear None JobTitle Warehouse ...,Degree Bachelor of Business Admin Description ...,,ScaleValue 5 ScaleValueName Expert Skill Forklift,,,K8yQlic+/UiXxBMpOnAoLQ==,Decline,declined


### 3a.1.2 Defining column names for featurization 

In [3]:
# Column groups used throughout the featurization steps.

# Identifier columns of an (opportunity, application) pair
uid_column = ['OpportunityId', 'ApplicationId']

# Target column
sel_column = ['IsRejected']

# Opportunity/job related columns
job_column = [
    'ExternalBriefDescription',
    'ExternalDescription',
    'Title',
    'JobCategoryName',
]

# Candidate related columns (column 'Tag' will be added later)
can_column = [
    'IsCandidateInternal',
    'BehaviorCriteria',
    'MotivationCriteria',
    'EducationCriteria',
    'LicenseAndCertificationCriteria',
    'SkillCriteria',
    'WorkExperiences',
    'Educations',
    'LicenseAndCertifications',
    'Skills',
    'Motivations',
    'Behaviors',
    'StepId',
    'StepName',
    'StepGroup',
    'pass_first_step',
]

# Column groups by content type

bool_column = ['IsCandidateInternal', 'pass_first_step']

float_column = ['Tag']

# Text columns: every job column followed by every candidate column that is
# not boolean, in their declared order.
str_column = job_column + [
    name for name in can_column if name not in bool_column
]

## 3a.2 TFIDF weighted word2vec vectorization

Let's use TF-IDF weighted Word2Vec to generate the features needed to measure similarity. 
To do this, we define two functions. The first, `tfidf_weighted_word2vec`, takes the data and a column name and returns the TF-IDF vocabulary, the TF-IDF idf values, and the word-to-vector mapping obtained by training Word2Vec on that column. 
The second, `tfidfw2v_vectorizer`, takes the outputs of the first function and applies them to a particular text to produce its TF-IDF weighted Word2Vec vector. 

### 3a.2.1 Creating functions that derive TFIDF weighted word2vec information

In [4]:
# Dimension of the TF-IDF weighted Word2Vec vectors, set to match the other
# comparative models - in this case 512
vector_dim = 512

In [5]:
def tfidf_weighted_word2vec(data, colname):
    """
    Build the lookup tables needed for TF-IDF weighted Word2Vec on one column.

    A skip-gram Word2Vec model is trained on the tokenized text of the column,
    and a TfidfVectorizer is fitted on the same column to obtain per-word IDF
    values.

    Args:
        data (pandas.DataFrame): Dataset containing the text column to
        convert into word2vec.

        colname (str): Name of the target column on which the operation
        needs to be performed. Every value is cast to ``str`` before use.

    Returns:
        w2v (dict): Dictionary mapping each word of the Word2Vec vocabulary
        to its trained vector, rounded to 3 decimals.
        word2weight (dict): Dictionary mapping each word of the TF-IDF
        vocabulary to its IDF value, rounded to 3 decimals.
        vocab (dict_items): ``(word, index)`` pairs of the fitted
        TfidfVectorizer vocabulary (an items view, not a dict).
    """

    # Cast every cell to str so non-string values do not break tokenization
    texts = [str(value) for value in data[colname].tolist()]

    # Tokenizing the texts for the Word2Vec model
    tokenized = [gensim.utils.simple_preprocess(text) for text in texts]

    # Creating the model vocabulary from the tokens, training the model and
    # bundling the resulting vectors into a dictionary
    model = Word2Vec(window=2, min_count=3, sg=1, vector_size=vector_dim)
    model.build_vocab(tokenized)
    model.train(
        corpus_iterable=tokenized,
        total_examples=model.corpus_count,
        epochs=model.epochs,
    )

    w2v = dict(zip(model.wv.index_to_key, model.wv.vectors.round(3)))

    # Fitting the TF-IDF vectorizer. Only the learned vocabulary and IDF
    # values are needed, so `fit` is used instead of `fit_transform`, which
    # would also build a document-term matrix that is never read.
    # NOTE(review): the Word2Vec tokens come from gensim's simple_preprocess
    # while TfidfVectorizer uses its own default tokenizer, so the two
    # vocabularies are not guaranteed to contain the same words - confirm
    # this is intended.
    tfidfvectorizer = TfidfVectorizer()
    tfidfvectorizer.fit(texts)
    vocab = tfidfvectorizer.vocabulary_.items()

    # Generating the word2weight dictionary of each word and its IDF value
    word2weight = {
        word: round(tfidfvectorizer.idf_[index], 3) for word, index in vocab
    }

    return w2v, word2weight, vocab


def tfidfw2v_vectorizer(text, w2v, word2weight, dim=None):
    """
    Compute the TF-IDF weighted Word2Vec vector of a single text.

    The text is split on whitespace. Each distinct word ``w`` that is present
    in both ``w2v`` and ``word2weight`` gets the weight
    ``count(w) * word2weight[w]``, and the result is:

        sum(weight(w) * w2v[w]) / sum(weight(w))

    Each distinct word is visited once, so a word that occurs k times
    contributes k (not k * k) times its weight.

    Args:
        text (str): Input text for which to calculate the TF-IDF weighted
        Word2Vec vector. A non-string value (e.g. None/NaN from a missing
        cell) is treated as empty text.
        w2v (dict): Dictionary with keys as words and values as their
        respective vectors.
        word2weight (dict): Dictionary with words and their corresponding
        weights.
        dim (int, optional): Length of the returned vector. Defaults to the
        module-level ``vector_dim``. Must match the length of the vectors
        in ``w2v``.

    Returns:
        np.ndarray: Weighted vector rounded to 3 decimals, or a zero vector
        when the text is empty/missing or contains no known word.
    """
    if dim is None:
        dim = vector_dim

    # NOTE(review): tokens are matched verbatim (whitespace split, case
    # sensitive). Confirm the input text is already normalized the same way
    # as the keys of w2v / word2weight, otherwise words silently drop out.
    words = text.split() if isinstance(text, str) else []

    numerator_vector = np.zeros(dim)
    denominator_value = 0.0

    # Counter yields each distinct word once together with its frequency,
    # which avoids both double counting and repeated list scans.
    for word, count in Counter(words).items():
        if word in w2v and word in word2weight:
            weight = count * word2weight[word]
            numerator_vector += weight * w2v[word]
            denominator_value += weight

    # No known word (or empty text): nothing to average
    if denominator_value == 0.0:
        return np.zeros(dim)

    return np.round(numerator_vector / denominator_value, 3)

### 3a.2.2 Applying the TFIDF weighted word2vec function

In [6]:
# Featurizing every text column: the lookups are fitted on the preprocessed
# "<column>__w2vpp" text and each row is then embedded into "<column>__w2v"

for colname in str_column:
    pp_col = f"{colname}__w2vpp"
    w2v, word2weight, vocab = tfidf_weighted_word2vec(fdata, pp_col)

    fdata[f"{colname}__w2v"] = fdata[pp_col].apply(
        lambda text: tfidfw2v_vectorizer(text, w2v, word2weight)
    )

### 3a.2.3 Handling boolean and float columns with one hot encoding and zero padding

In [7]:
# Applying OneHotEncoder to the boolean columns and zero-padding every encoded
# row so that its length is a multiple of vector_dim, for easy vertical
# stacking with the text vectors.


def _pad_to_vector_dim(vec):
    """Right-pad a 1-D array with zeros up to the next multiple of vector_dim.

    A vector whose length is already a multiple of vector_dim is returned
    with no extra padding.
    """
    return np.pad(vec, (0, (-len(vec)) % vector_dim), 'constant')


onehotencoder = OneHotEncoder(sparse_output = False, handle_unknown = 'ignore')

for colname in bool_column:
    # One column in -> (n_rows, n_categories) one-hot matrix out
    encoded = onehotencoder.fit_transform(
        np.reshape(np.array(fdata[colname]), (-1, 1))
    )
    fdata[colname + "__w2v"] = [_pad_to_vector_dim(row) for row in encoded]

# MinMaxScaler is deliberately not applied to the float columns. Author's
# note: the float data contained None objects that were converted to -1, and
# applying a min-max scaler would modify the data.

for colname in float_column:
    # Read the column named by the loop variable (not a hard-coded 'Tag') so
    # that every entry of float_column is featurized from its own data.
    values = np.reshape(np.array(fdata[colname]), (-1, 1))
    fdata[colname + "__w2v"] = [_pad_to_vector_dim(row) for row in values]

### 3a.2.4 Stacking all the vectors together

The vectors can be stacked in two ways: horizontally or vertically. 
Horizontal stacking is simple concatenation, giving rise to vectors of varied lengths. 
In vertical stacking, the vectors are stacked on top of each other and their element-wise mean is taken, giving a single mean vector. 

In [8]:
# Function for adding relevant arrays column wise to create larger vectors

def hstacker(row_arrays):
    """
    Join the vectors of one row end to end into a single longer vector.
    """
    joined = np.concatenate(row_arrays, axis=0)
    return joined

def vstacker(row_arrays):
    """
    Gives the element-wise mean vector of the vectors in one row.

    Args:
        row_arrays: Sequence (e.g. list or pandas Series) of equal-length
        1-D arrays.

    Returns:
        np.ndarray: 1-D array of the same length as each input vector.
    """
    # Stack explicitly and average over the stacking axis. np.mean without an
    # axis would collapse a list of vectors into one scalar instead of
    # returning the mean vector.
    return np.mean(np.stack(list(row_arrays)), axis=0)

In [9]:
# Names of the per-column vectors on the opportunity/job side and on the
# candidate side
job_vec_cols = [f"{name}__w2v" for name in job_column]
can_vec_cols = [f"{name}__w2v" for name in can_column]

# Horizontal stacking: concatenating the opportunity/job related vectors
# into a new column
fdata['opportunity__w2v_hstack'] = fdata[job_vec_cols].apply(hstacker, axis=1)

# Concatenating the candidate related vectors into a new column
fdata['candidate__w2v_hstack'] = fdata[can_vec_cols].apply(hstacker, axis=1)

# Appending the 'Tag' vector to the candidate vector, as 'Tag' contains
# float values
fdata['candidate__w2v_hstack'] = fdata[
    ['candidate__w2v_hstack', 'Tag__w2v']
].apply(hstacker, axis=1)

# Vertical stacking: mean of the opportunity/job related vectors in a new
# column
fdata['opportunity__w2v_vstack'] = fdata[job_vec_cols].apply(vstacker, axis=1)

# Mean of the candidate related vectors, with the 'Tag' vector included as it
# contains float values
fdata['candidate__w2v_vstack'] = fdata[
    can_vec_cols + ['Tag__w2v']
].apply(vstacker, axis=1)

  fdata['candidate__w2v_hstack'] = fdata[
  fdata['opportunity__w2v_vstack'] = fdata[
  fdata['candidate__w2v_vstack'] = fdata[[m + "__w2v" for m in can_column] +


## 3a.3 Saving data for further analysis

In [10]:
# Exporting the stacked vectors of fdata to a pickle file

# featurizeddata_w2v carries all the applicant and opportunity related
# vectors, in both the horizontally and the vertically stacked form
stacked_vec_cols = [
    "opportunity__w2v_hstack",
    "candidate__w2v_hstack",
    "opportunity__w2v_vstack",
    "candidate__w2v_vstack",
]

fdata[stacked_vec_cols].to_pickle(dataloc + "/interim/featurizeddata_w2v.pkl")