# Imports

In [1]:
from abc import ABC, abstractmethod
from enum import Enum
import os
import re
from typing import Callable, Iterable, List, Tuple

from gensim.models import KeyedVectors, Word2Vec
from gensim.utils import simple_preprocess
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np
from numpy.linalg import norm
import pandas as pd
from pyxtension.streams import stream
from sklearn.base import BaseEstimator
import torch
import torch.optim as optim
import torch.nn as nn
import numpy as np


# Raise pandas' row-display limit to 500 so long ranking tables are shown without truncation.
pd.set_option('display.max_rows', 500)

# Data Exploration

In [2]:
# Candidate list as a CSV file; the relative path is resolved against the working directory.
data_file = "potential-talents - Aspiring human resources - seeking human resources.csv"

# Use the "id" column as the DataFrame index so rows keep their candidate ids.
df = pd.read_csv(data_file, index_col="id")
print(f"Shape: {df.shape}")
# Last expression of the cell: preview of the first rows.
df.head()

Shape: (104, 4)


Unnamed: 0_level_0,job_title,location,connection,fit
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,
2,Native English Teacher at EPIK (English Progra...,Kanada,500+,
3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,
4,People Development Coordinator at Ryan,"Denton, Texas",500+,
5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,


# Data Cleansing

Expand common HR acronyms, tokenize, remove English stopwords, and lemmatize the remaining words.

In [3]:
# Acronym -> full form, expanded in job titles before tokenization.
# Insertion order is preserved: the entries that contain "HR" as a substring
# (HRIS, CHRO, GPHR, SPHR) are listed before the bare "HR" entry.
acronyms = dict(
    HRIS="human resources information system",
    CHRO="chief human resources officer",
    GPHR="Global Professional in Human Resources",
    SPHR="Senior Professional in Human Resources",
    HR="Human Resources",
    CSR="Corporate Social Responsibility",
)


def preprocess_sentence(sentence):
    """Convert a raw sentence into a list of cleaned tokens.

    Processing steps, in order:
      1. Expand every acronym listed in ``acronyms`` to its full form. Matching is
         case-sensitive and restricted to whole words, so an acronym is never
         expanded inside a longer token (e.g. "HR" inside "SHRM").
      2. Tokenize with gensim's ``simple_preprocess``.
      3. Drop tokens found in NLTK's English stopword list.
      4. Lemmatize each remaining token with ``WordNetLemmatizer``.

    Args:
        sentence: The raw text, e.g. a job title or a search query.

    Returns:
        The list of processed tokens (possibly empty).
    """
    for acronym, full_form in acronyms.items():
        # \b anchors restrict the match to whole words; the callable replacement
        # inserts the full form literally (no backslash/group processing).
        sentence = re.sub(
            rf"\b{re.escape(acronym)}\b",
            lambda _match, replacement=full_form: replacement,
            sentence,
        )

    # Build the stopword set once per call instead of once per token.
    english_stopwords = set(stopwords.words('english'))

    return (
        stream(simple_preprocess(sentence))
        .filter(lambda word: word not in english_stopwords)
        .map(WordNetLemmatizer().lemmatize)
        .toList()
    )

# Word embedding

![Candidate ranker diagram](CandidateRankerDiagram.svg)

In [4]:
class WordEmbedder(ABC):
    """Abstract interface for models that map words to vectors.

    Concrete subclasses must implement ``fit``, ``predict`` and
    ``word_sets_similarity``. ``get_word_set_embedding`` is provided here and
    is built on top of ``predict``.
    """

    @abstractmethod
    def fit(self, tokenized_sentences: Iterable[List[str]]):
        """Train the model on an iterable of tokenized sentences."""

    @abstractmethod
    def predict(self, word: str) -> np.ndarray:
        """Return the embedding vector of a single word."""

    @abstractmethod
    def word_sets_similarity(self, word_set_1: List[str], word_set_2: List[str]) -> float:
        """Return a similarity score between two word sets."""

    def get_word_set_embedding(self, word_set: List[str]) -> np.ndarray:
        """Return the element-wise mean of the embeddings of all words in ``word_set``.

        Each word is embedded with ``predict`` and the vectors are averaged
        along axis 0.
        """
        word_vectors = [self.predict(word) for word in word_set]
        return np.mean(word_vectors, axis=0)

        
class MeanVectorCosineSimilarityMixin:
    """Mixin implementing ``word_sets_similarity`` as cosine similarity of mean vectors.

    The host class must provide ``get_word_set_embedding(word_set)``.
    """

    def word_sets_similarity(self, word_set_1: List[str], word_set_2: List[str]) -> float:
        """Return the cosine similarity between the mean embeddings of two word sets.

        An empty word set has no mean embedding, so the similarity is defined
        as 0.0 in that case instead of producing NaN.
        """
        if len(word_set_1) == 0 or len(word_set_2) == 0:
            return 0.0

        return self.cosine_similarity(
            vector_1=self.get_word_set_embedding(word_set_1),
            vector_2=self.get_word_set_embedding(word_set_2)
        )

    @staticmethod
    def cosine_similarity(vector_1, vector_2):
        """Return the cosine of the angle between two vectors.

        If either vector has zero norm the cosine is undefined (0/0); 0.0 is
        returned in that case rather than NaN with a runtime warning.
        """
        denominator = norm(vector_1) * norm(vector_2)
        if denominator == 0:
            return 0.0
        return vector_1.dot(vector_2) / denominator
    
    
class CandidateRanker:
    """Ranks a collection of sentences by their similarity to a search phrase.

    Args:
        tokenize_sentence: Function turning a raw sentence into a list of tokens.
            It is applied both to the fitted sentences and to every search phrase.
        word_embedding_model: Embedder used to score the similarity of two token lists.
        fit_word_embedding: If True, ``fit`` also trains ``word_embedding_model`` on
            the tokenized sentences; if False the model is used as given.
    """

    def __init__(
        self,
        tokenize_sentence: Callable[[str], List[str]],
        word_embedding_model: WordEmbedder,
        fit_word_embedding: bool = False,
    ):
        self.tokenize_sentence = tokenize_sentence
        self.word_embedding_model = word_embedding_model
        self.fit_word_embedding = fit_word_embedding

    def fit(self, sentences: pd.Series):
        """Store the sentences, tokenize them, and optionally train the embedder.

        Returns:
            ``self``, so that construction and fitting can be chained.
        """
        self.sentences = sentences
        self.tokenized_sentences = sentences.map(self.tokenize_sentence)

        if self.fit_word_embedding:
            self.word_embedding_model.fit(self.tokenized_sentences)

        return self

    def predict(self, search: str) -> pd.DataFrame:
        """Score every fitted sentence against ``search``.

        Must be called after ``fit``.

        Returns:
            A DataFrame with columns ``title`` (the original sentence) and ``score``
            (its similarity to the search phrase), sorted by descending score and
            indexed like the fitted sentences.
        """
        search_words = self.tokenize_sentence(search)

        similarities = self.tokenized_sentences.map(
            lambda sentence_words: self.word_embedding_model.word_sets_similarity(sentence_words, search_words)
        ).sort_values(ascending=False)

        # Re-align the raw sentences to the sorted score index before combining them.
        return pd.DataFrame(dict(title=self.sentences.reindex_like(similarities), score=similarities))

## Word2Vec

Encode the job titles with a Word2Vec model trained on the job titles themselves.

In [165]:
class Word2VecType(Enum):
    """Training variants of Word2Vec; each member's value is passed as the ``sg`` hyperparameter."""

    # Continuous bag-of-words variant.
    CBOW = 0
    # Skip-gram variant.
    SKIP_GRAM = 1
    
    
class Word2VecEmbedder(WordEmbedder):
    """Word embedder backed by a gensim ``Word2Vec`` model trained in ``fit``.

    All keyword arguments are stored and forwarded unchanged to the ``Word2Vec``
    constructor when ``fit`` is called.

    Args:
        vector_size: Dimensionality of the word vectors.
        window: Context window size.
        min_count: Minimum word frequency for a word to enter the vocabulary.
        sg: Training variant, as a ``Word2VecType`` value (skip-gram by default).
        epochs: Number of training epochs.
        **kwargs: Any further ``Word2Vec`` constructor arguments.
    """

    def __init__(self, *, vector_size=100, window=5, min_count=1, sg=Word2VecType.SKIP_GRAM.value, epochs=10, **kwargs):
        # The attribute name (including its spelling) is kept unchanged so that
        # existing code reading it keeps working.
        self.word_2_vec_hyperparametrs = dict(
            vector_size=vector_size,
            window=window,
            min_count=min_count,
            sg=sg,
            epochs=epochs,
            **kwargs
        )

    def fit(self, tokenized_sentences):
        """Build the vocabulary from ``tokenized_sentences`` and train a fresh model.

        Returns:
            ``self``, so that construction and fitting can be chained.
        """
        # The corpus is iterated more than once (vocabulary scan + training), so a
        # one-shot iterable such as a generator is materialized first.
        sentences = list(tokenized_sentences)

        self.model = Word2Vec(**self.word_2_vec_hyperparametrs)
        self.model.build_vocab(sentences)
        # corpus_count is set by build_vocab, so no len() of the input is needed.
        self.model.train(sentences, total_examples=self.model.corpus_count, epochs=self.model.epochs)
        return self

    def predict(self, word):
        """Return the vector of ``word``, or a zero vector if the word is out of vocabulary."""
        # key_to_index is a dict: constant-time membership instead of scanning the index_to_key list.
        return self.model.wv[word] if word in self.model.wv.key_to_index else np.zeros(self.model.vector_size)

    def word_sets_similarity(self, word_set_1: List[str], word_set_2: List[str]) -> float:
        """Return gensim's ``n_similarity`` between the in-vocabulary words of both sets.

        Out-of-vocabulary words are skipped. If either set contains no known word
        the similarity is defined as 0.0 instead of letting ``n_similarity`` fail
        on an empty word list.
        """
        vocabulary = self.model.wv.key_to_index
        known_words_1 = [word for word in word_set_1 if word in vocabulary]
        known_words_2 = [word for word in word_set_2 if word in vocabulary]

        if not known_words_1 or not known_words_2:
            return 0.0

        return self.model.wv.n_similarity(known_words_1, known_words_2)
    

In [169]:
# Train the Word2Vec embedder on the tokenized job titles.
word_2_vec_embedder = Word2VecEmbedder(vector_size=100, window=5, min_count=1)
word_2_vec_embedder.fit(df.job_title.map(preprocess_sentence))

# The embedder is already trained, so the ranker keeps fit_word_embedding at its default.
word_2_vec_ranker = CandidateRanker(
    tokenize_sentence=preprocess_sentence,
    word_embedding_model=word_2_vec_embedder,
)
word_2_vec_ranker.fit(df.job_title)

In [170]:
# Rank every job title against the query; rows are returned sorted by descending score.
# No .head() here, so the whole ranking is the cell output.
word_2_vec_ranker.predict("Aspiring human resources")

Unnamed: 0_level_0,title,score
id,Unnamed: 1_level_1,Unnamed: 2_level_1
21,Aspiring Human Resources Professional,0.983534
58,Aspiring Human Resources Professional,0.983534
3,Aspiring Human Resources Professional,0.983534
17,Aspiring Human Resources Professional,0.983534
46,Aspiring Human Resources Professional,0.983534
97,Aspiring Human Resources Professional,0.983534
33,Aspiring Human Resources Professional,0.983534
24,Aspiring Human Resources Specialist,0.983208
49,Aspiring Human Resources Specialist,0.983208
36,Aspiring Human Resources Specialist,0.983208


In [173]:
# Single-word query; show only the five best-scoring titles.
word_2_vec_ranker.predict("Success").head()

Unnamed: 0_level_0,title,score
id,Unnamed: 1_level_1,Unnamed: 2_level_1
103,Always set them up for Success,0.71598
92,Seeking employment opportunities within Custom...,0.278069
39,Student at Humber College and Aspiring Human R...,0.273548
25,Student at Humber College and Aspiring Human R...,0.273548
37,Student at Humber College and Aspiring Human R...,0.273548


In [172]:
# Second target phrase; show only the five best-scoring titles.
word_2_vec_ranker.predict("seeking human resources").head()

Unnamed: 0_level_0,title,score
id,Unnamed: 1_level_1,Unnamed: 2_level_1
28,Seeking Human Resources Opportunities,0.98577
30,Seeking Human Resources Opportunities,0.98577
73,"Aspiring Human Resources Manager, seeking inte...",0.985069
99,Seeking Human Resources Position,0.984148
53,Seeking Human Resources HRIS and Generalist Po...,0.978527


## FastText pretrained embedder

In [9]:
class PretrainedEmbedder(MeanVectorCosineSimilarityMixin, WordEmbedder):
    """Embedder that serves vectors from a pretrained word2vec-format file.

    Word-set similarity comes from ``MeanVectorCosineSimilarityMixin``.
    """

    def __init__(self, model_path: str):
        """Load the vectors stored at ``model_path`` (text format, ``binary=False``)."""
        self.model = KeyedVectors.load_word2vec_format(model_path, binary=False)

    def fit(self, tokenized_sentences):
        """Do nothing: the vectors are pretrained. Returns ``self`` for chaining."""
        return self

    def predict(self, word):
        """Return the stored vector of ``word``, or a zero vector for an unknown word."""
        if word not in self.model:
            return np.zeros(self.model.vector_size)
        return self.model[word]

In [10]:
# Location of the pretrained vectors file (loaded as word2vec text format).
# Set the FASTTEXT_VECTORS_PATH environment variable to point at your own copy;
# when it is unset, the original local path is used as the fallback.
model_path = os.environ.get(
    "FASTTEXT_VECTORS_PATH",
    "C:\\Users\\chana\\Downloads\\wiki-news-300d-1M.vec\\wiki-news-300d-1M.vec",
)
fast_text_embedder = PretrainedEmbedder(model_path)


In [11]:
# The pretrained embedder needs no training, so fit_word_embedding keeps its default.
fast_text_ranker = CandidateRanker(
    tokenize_sentence=preprocess_sentence,
    word_embedding_model=fast_text_embedder,
)
fast_text_ranker.fit(df.job_title)

In [14]:
# Same query as for the Word2Vec ranker, scored with the pretrained vectors; top five rows only.
fast_text_ranker.predict("Aspiring human resources").head()

Unnamed: 0_level_0,title,score
id,Unnamed: 1_level_1,Unnamed: 2_level_1
58,Aspiring Human Resources Professional,0.968588
3,Aspiring Human Resources Professional,0.968588
46,Aspiring Human Resources Professional,0.968588
33,Aspiring Human Resources Professional,0.968588
17,Aspiring Human Resources Professional,0.968588


In [16]:
# Single-word query; show only the five best-scoring titles.
fast_text_ranker.predict("student").head()

Unnamed: 0_level_0,title,score
id,Unnamed: 1_level_1,Unnamed: 2_level_1
98,Student,1.0
11,Student at Chapman University,0.828506
63,Student at Chapman University,0.828506
54,Student at Chapman University,0.828506
41,Student at Chapman University,0.828506


In [13]:
# Second target phrase, scored with the pretrained vectors; top five rows only.
fast_text_ranker.predict("seeking human resources").head()

Unnamed: 0_level_0,title,score
id,Unnamed: 1_level_1,Unnamed: 2_level_1
99,Seeking Human Resources Position,0.961437
28,Seeking Human Resources Opportunities,0.956862
30,Seeking Human Resources Opportunities,0.956862
73,"Aspiring Human Resources Manager, seeking inte...",0.932743
40,Seeking Human Resources HRIS and Generalist Po...,0.929337


# Second phase: supervised learning