# Imports

In [1]:
from abc import ABC, abstractmethod
from enum import Enum
import re
from typing import Callable, Set

from gensim.models import KeyedVectors, Word2Vec
from gensim.utils import simple_preprocess
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np
from numpy.linalg import norm
import pandas as pd
from pyxtension.streams import stream
from sklearn.base import BaseEstimator


# Notebook-wide display setting: raise pandas' `display.max_rows` option to 500.
pd.set_option('display.max_rows', 500)

# Data Exploration

In [5]:
# Relative path: the CSV is looked up in the notebook's working directory.
data_file = "potential-talents - Aspiring human resources - seeking human resources.csv"

# The CSV's "id" column becomes the DataFrame index; later cells look candidates up by it
# (e.g. `df.job_title.loc[98]`).
df = pd.read_csv(data_file, index_col="id")
print(f"Shape: {df.shape}")
df.head()

Shape: (104, 4)


Unnamed: 0_level_0,job_title,location,connection,fit
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,
2,Native English Teacher at EPIK (English Progra...,Kanada,500+,
3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,
4,People Development Coordinator at Ryan,"Denton, Texas",500+,
5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,


# Data Cleansing

Expand HR acronyms, tokenize, remove English stopwords, and lemmatize each job title.

In [6]:
# Domain acronyms that are spelled out before tokenization so that, e.g., "HR" and
# "Human Resources" end up as the same tokens. Matching is case-sensitive and limited
# to whole words (see `preprocess_sentence`), so the order of the entries does not matter.
acronyms = {
    "HRIS": "human resources information system",
    "CHRO": "chief human resources officer",
    "GPHR": "Global Professional in Human Resources",
    "SPHR": "Senior Professional in Human Resources",
    "HR": "Human Resources",
    "CSR": "Corporate Social Responsibility",
}


def preprocess_sentence(sentence):
    """Turn a raw sentence (job title or search query) into a list of normalized tokens.

    Steps:
      1. Expand every whole-word occurrence of a key of `acronyms` to its full form.
      2. Tokenize with gensim's `simple_preprocess`.
      3. Drop English stopwords (NLTK list).
      4. Lemmatize each remaining token with NLTK's `WordNetLemmatizer`.

    Args:
        sentence: The text to preprocess.

    Returns:
        list[str]: The processed tokens, in their original order.
    """
    for acronym, full_form in acronyms.items():
        # Word boundaries keep an acronym from being expanded inside a longer token
        # (a plain substring replace would turn "SHRM" into "SHuman ResourcesM").
        # A function replacement is used so `full_form` is inserted literally.
        sentence = re.sub(rf"\b{re.escape(acronym)}\b", lambda _match: full_form, sentence)

    # Load the stopword list once per call (not once per token) and use a set
    # so each membership test is a hash lookup.
    english_stopwords = set(stopwords.words('english'))
    lemmatize = WordNetLemmatizer().lemmatize

    return [
        lemmatize(word)
        for word in simple_preprocess(sentence)
        if word not in english_stopwords
    ]

# Word embedding

![Candidate ranker diagram](CandidateRankerDiagram.svg)

In [7]:
class WordEmbedder(ABC):
    """Interface for models that map a single word to an embedding vector.

    Subclasses must implement `fit` and `predict`. `word_sets_similarity` has a
    default implementation built on `predict`, so subclasses only need to
    override it when the underlying model offers something better.
    """

    @abstractmethod
    def fit(self, tokenized_sentences):
        """Train (or prepare) the embedder on an iterable of token lists."""

    @abstractmethod
    def predict(self, word) -> np.ndarray:
        """Return the embedding vector of `word`."""

    def word_sets_similarity(self, word_set_1: Set[str], word_set_2: Set[str]) -> float:
        """Cosine similarity between the mean embeddings of two sets of words.

        Args:
            word_set_1: First collection of words.
            word_set_2: Second collection of words.

        Returns:
            The cosine similarity of the two mean vectors, or 0.0 when either
            set is empty or either mean vector has zero norm (the cosine is
            undefined in those cases).
        """
        if not word_set_1 or not word_set_2:
            return 0.0

        # Sets have no stable iteration order; sorting keeps the floating-point
        # summation order (and therefore the result) reproducible.
        mean_1 = np.mean([self.predict(word) for word in sorted(word_set_1)], axis=0)
        mean_2 = np.mean([self.predict(word) for word in sorted(word_set_2)], axis=0)

        denominator = np.linalg.norm(mean_1) * np.linalg.norm(mean_2)
        if denominator == 0:
            return 0.0
        return float(np.dot(mean_1, mean_2) / denominator)

    
class CandidateRanker:
    """Rank stored sentences (job titles) by similarity to a search phrase.

    Each sentence is tokenized, every token is embedded with the supplied word
    embedder, and the sentence is represented by the mean of its token vectors.
    Ranking is by cosine similarity between those mean vectors.
    """

    def __init__(self, tokenize_sentence: Callable, word_embedding_model: "WordEmbedder", fit_word_embedding: bool = False):
        """
        Args:
            tokenize_sentence: Callable mapping a raw sentence to a list of tokens.
            word_embedding_model: Embedder whose `predict(word)` returns a vector.
            fit_word_embedding: When True, `fit` also trains the embedder on the
                tokenized sentences; when False the embedder is used as given.
        """
        self.tokenize_sentence = tokenize_sentence
        self.word_embedding_model = word_embedding_model
        self.fit_word_embedding = fit_word_embedding

    def fit(self, sentences: pd.Series):
        """Store `sentences` and pre-compute one embedding per sentence.

        Args:
            sentences: Series of raw sentences; its index identifies candidates.

        Returns:
            self, to allow call chaining.
        """
        self.sentences = sentences
        tokenized_sentences = sentences.map(self.tokenize_sentence)

        if self.fit_word_embedding:
            self.word_embedding_model.fit(tokenized_sentences)

        self.sentences_embedding = tokenized_sentences.map(self._get_sentence_words_embedding)
        return self

    def predict(self, search):
        """Score every fitted sentence against `search`.

        Args:
            search: Raw search phrase; tokenized with the same tokenizer as the
                fitted sentences.

        Returns:
            DataFrame indexed like the fitted sentences, sorted by descending
            score, with columns `title` (the sentence) and `score` (cosine
            similarity; 0.0 when either side has no usable embedding).
        """
        search_embedding = self._get_sentence_words_embedding(self.tokenize_sentence(search))

        similarities = (
            self.sentences_embedding
            .map(lambda sentence_embedding: self._cosine_similarity(sentence_embedding, search_embedding))
            .sort_values(ascending=False)
        )

        return pd.DataFrame(dict(title=self.sentences.reindex_like(similarities), score=similarities))

    def _get_sentence_words_embedding(self, sentence_words):
        """Mean of the word vectors of `sentence_words`.

        Returns an empty array when there are no words: `np.mean` of an empty
        list would produce NaN (plus a RuntimeWarning) and break the scoring.
        """
        word_vectors = [
            self.word_embedding_model.predict(word)
            for word in sentence_words
        ]
        if not word_vectors:
            return np.zeros(0)
        return np.mean(word_vectors, axis=0)

    @staticmethod
    def _cosine_similarity(vector_1, vector_2):
        """Cosine similarity of two vectors, or 0.0 when it is undefined.

        The similarity is undefined when either vector is empty (sentence with
        no tokens) or has zero norm (all tokens unknown to the embedder); those
        cases score 0.0 instead of producing NaN or a shape error.
        """
        if vector_1.size == 0 or vector_2.size == 0:
            return 0.0

        denominator = norm(vector_1) * norm(vector_2)
        if denominator == 0:
            return 0.0
        return vector_1.dot(vector_2) / denominator

## Word2Vec

Encode job titles with a Word2Vec model trained on the preprocessed job titles themselves.

In [8]:
class Word2VecType(Enum):
    """Training variants of Word2Vec.

    The member values are what gets passed as the `sg` hyperparameter
    (the embedders in this notebook default to `SKIP_GRAM.value`).
    """
    CBOW = 0
    SKIP_GRAM = 1
    
    
class Word2VecEmbedder(WordEmbedder):
    """Word embedder backed by a gensim `Word2Vec` model trained in `fit`."""

    def __init__(self, *, vector_size=100, window=5, min_count=1, sg=Word2VecType.SKIP_GRAM.value, epochs=10, **kwargs):
        """Collect the hyperparameters; the model itself is created in `fit`.

        Args:
            vector_size, window, min_count, sg, epochs: Forwarded to `Word2Vec`.
            **kwargs: Any additional `Word2Vec` keyword arguments.
        """
        self.word_2_vec_hyperparametrs = dict(
            vector_size=vector_size,
            window=window,
            min_count=min_count,
            sg=sg,
            epochs=epochs,
            **kwargs
        )

    def fit(self, tokenized_sentences):
        """Train a new `Word2Vec` model on `tokenized_sentences` and return self."""
        self.model = Word2Vec(sentences=tokenized_sentences, **self.word_2_vec_hyperparametrs)
        return self

    def predict(self, word):
        """Return the vector of `word`, or a zero vector for out-of-vocabulary words."""
        word_vectors = self.model.wv
        # `key_to_index` is a dict, so this is a hash lookup; testing against the
        # `index_to_key` list would scan the whole vocabulary for every word.
        if word in word_vectors.key_to_index:
            return word_vectors[word]
        return np.zeros(self.model.vector_size)
    

In [9]:
# Train Word2Vec on the preprocessed job titles, then build the ranker on top of it.
# `fit_word_embedding` is left at its default (False), so the ranker reuses the
# already-trained embedder instead of fitting it again.
word_2_vec_embedder = Word2VecEmbedder(window=5, min_count=1).fit(df.job_title.map(preprocess_sentence))
word_2_vec_ranker = CandidateRanker(preprocess_sentence, word_2_vec_embedder).fit(df.job_title)

In [39]:
# Best five matches for the first search phrase.
word_2_vec_ranker.predict("Aspiring human resources").head()

Unnamed: 0_level_0,title,score
id,Unnamed: 1_level_1,Unnamed: 2_level_1
21,Aspiring Human Resources Professional,0.983534
58,Aspiring Human Resources Professional,0.983534
3,Aspiring Human Resources Professional,0.983534
17,Aspiring Human Resources Professional,0.983534
46,Aspiring Human Resources Professional,0.983534


In [40]:
# Best five matches for the second search phrase.
word_2_vec_ranker.predict("seeking human resources").head()

Unnamed: 0_level_0,title,score
id,Unnamed: 1_level_1,Unnamed: 2_level_1
28,Seeking Human Resources Opportunities,0.98577
30,Seeking Human Resources Opportunities,0.98577
73,"Aspiring Human Resources Manager, seeking inte...",0.985069
99,Seeking Human Resources Position,0.984149
53,Seeking Human Resources HRIS and Generalist Po...,0.978526


## FastText pretrained embedder

In [11]:
class PretrainedEmbedder(WordEmbedder):
    """Word embedder backed by pretrained vectors stored in word2vec format."""

    def __init__(self, model_path: str, *, binary: bool = False):
        """Load the pretrained vectors from disk.

        Args:
            model_path: Path to the vectors file.
            binary: Whether the file is in the binary word2vec format. Defaults
                to False (text format), which was previously hard-coded.
        """
        self.model = KeyedVectors.load_word2vec_format(model_path, binary=binary)

    def fit(self, tokenized_sentences):
        """No-op: the vectors are pretrained. Returns self for call chaining."""
        return self

    def predict(self, word):
        """Return the vector of `word`, or a zero vector for out-of-vocabulary words."""
        if word in self.model:
            return self.model[word]
        return np.zeros(self.model.vector_size)

In [12]:
# NOTE(review): absolute, machine-specific Windows path — this cell only runs where the
# vectors file exists at exactly this location; point it at a local copy before re-running.
model_path = "C:\\Users\\chana\\Downloads\\wiki-news-300d-1M.vec\\wiki-news-300d-1M.vec"
# The vectors are read from disk inside PretrainedEmbedder.__init__.
fast_text_embedder = PretrainedEmbedder(model_path)


In [13]:
# Same ranker as before, but on the pretrained vectors; `fit` only embeds the job titles
# because `fit_word_embedding` keeps its default (False).
fast_text_ranker = CandidateRanker(preprocess_sentence, fast_text_embedder).fit(df.job_title)

In [14]:
# NOTE(review): `search_1` and `search_2` are not defined in any cell visible in this notebook,
# so this cell fails on a fresh kernel unless they are defined elsewhere — presumably they held
# the two search phrases used with the Word2Vec ranker; confirm and define them explicitly.
# NOTE(review): the recorded output below is a Series of titles, whereas CandidateRanker.predict
# as currently written returns a DataFrame with `title` and `score` — the output looks stale.
print(fast_text_ranker.predict(search_1))
print(fast_text_ranker.predict(search_2))

id
97    Aspiring Human Resources Professional
3     Aspiring Human Resources Professional
46    Aspiring Human Resources Professional
21    Aspiring Human Resources Professional
17    Aspiring Human Resources Professional
58    Aspiring Human Resources Professional
33    Aspiring Human Resources Professional
24      Aspiring Human Resources Specialist
36      Aspiring Human Resources Specialist
49      Aspiring Human Resources Specialist
Name: job_title, dtype: object
id
99                     Seeking Human Resources Position
30                Seeking Human Resources Opportunities
28                Seeking Human Resources Opportunities
73    Aspiring Human Resources Manager, seeking inte...
53    Seeking Human Resources HRIS and Generalist Po...
10    Seeking Human Resources HRIS and Generalist Po...
62    Seeking Human Resources HRIS and Generalist Po...
40    Seeking Human Resources HRIS and Generalist Po...
27    Aspiring Human Resources Management student se...
29    Aspiring Huma

# Word2Vec with explicit `build_vocab` and `train` steps (as suggested by Abhiram)

In [None]:
# NOTE(review): unexecuted sketch of the two-step training API, kept for reference only.
# It cannot run as written: the name `gensim` is never imported in this notebook (only names
# from `gensim.models` / `gensim.utils` are), and `some_sentences` / `other_sentences` are
# placeholders that no visible cell defines.
# NOTE(review): `iter=` is presumably the older spelling of the `epochs=` argument used by the
# embedder classes in this notebook — confirm against the installed gensim version.
model = gensim.models.Word2Vec(iter=1)  # an empty model, no training yet
model.build_vocab(some_sentences)  # can be a non-repeatable, 1-pass generator
model.train(other_sentences) 

In [25]:
class Word2VecEmbedder_2(WordEmbedder):
    """Word2Vec embedder that builds the vocabulary and trains in two explicit steps.

    Takes the same hyperparameters as `Word2VecEmbedder`; only `fit` differs, in
    that it creates an untrained model and then calls `build_vocab` and `train`
    itself instead of passing the sentences to the `Word2Vec` constructor.
    """

    def __init__(self, *, vector_size=100, window=5, min_count=1, sg=Word2VecType.SKIP_GRAM.value, epochs=10, **kwargs):
        """Collect the hyperparameters; the model itself is created in `fit`.

        Args:
            vector_size, window, min_count, sg, epochs: Forwarded to `Word2Vec`.
            **kwargs: Any additional `Word2Vec` keyword arguments.
        """
        self.word_2_vec_hyperparametrs = dict(
            vector_size=vector_size,
            window=window,
            min_count=min_count,
            sg=sg,
            epochs=epochs,
            **kwargs
        )

    def fit(self, tokenized_sentences):
        """Build the vocabulary from `tokenized_sentences`, train on them, return self.

        `tokenized_sentences` is iterated more than once, so it must be a
        re-iterable collection rather than a one-shot generator.
        """
        self.model = Word2Vec(**self.word_2_vec_hyperparametrs)
        self.model.build_vocab(tokenized_sentences)
        # `corpus_count` is recorded by `build_vocab`; using it avoids requiring
        # the corpus to support `len()`.
        self.model.train(
            tokenized_sentences,
            total_examples=self.model.corpus_count,
            epochs=self.model.epochs,
        )
        return self

    def predict(self, word):
        """Return the vector of `word`, or a zero vector for out-of-vocabulary words."""
        word_vectors = self.model.wv
        # Dict lookup instead of scanning the `index_to_key` list for every word.
        if word in word_vectors.key_to_index:
            return word_vectors[word]
        return np.zeros(self.model.vector_size)

In [26]:
# Same pipeline as the first Word2Vec ranker, but using the two-step embedder with its
# default hyperparameters.
word_2_vec_embedder_2 = Word2VecEmbedder_2().fit(df.job_title.map(preprocess_sentence))
word_2_vec_ranker_2 = CandidateRanker(preprocess_sentence, word_2_vec_embedder_2).fit(df.job_title)

In [32]:
# Debugging aid: compute the mean embedding of a query directly through the ranker's
# private helper.
search_embedding = word_2_vec_ranker_2._get_sentence_words_embedding(preprocess_sentence("successful human resources"))

In [None]:
# Leftover inspection cell (never executed): would only display the ranker object.
word_2_vec_ranker_2

In [36]:
# Inspect the tokens produced for the job title of candidate id 98.
preprocess_sentence(df.job_title.loc[98])

['student']

In [38]:
# Cross-check against gensim's own set similarity for the query vs. candidate id 28;
# the recorded value (0.98577005) agrees with the ranker's score for id 28 shown further down.
word_2_vec_embedder_2.model.wv.n_similarity(preprocess_sentence("seeking human resources"), preprocess_sentence(df.job_title.loc[28]))

0.98577005

In [None]:
def predict(self, search):
    """Module-level draft of `CandidateRanker.predict`, taking the ranker as `self`.

    Args:
        self: Object providing `tokenize_sentence`, `_get_sentence_words_embedding`,
            `_cosine_similarity`, `sentences` and `sentences_embedding`.
        search: Raw search phrase.

    Returns:
        DataFrame sorted by descending score with columns `title` and `score`.
    """
    # The draft's dangling `self.to` expression (an AttributeError at call time) and its
    # unused `tokenized_sentence` local were removed; the query is tokenized once here.
    search_embedding = self._get_sentence_words_embedding(self.tokenize_sentence(search))

    similarities = (
        self.sentences_embedding
        .map(lambda sentence_embedding: self._cosine_similarity(sentence_embedding, search_embedding))
        .sort_values(ascending=False)
    )

    return pd.DataFrame(dict(title=self.sentences.reindex_like(similarities), score=similarities))

In [27]:
# Ranking from the two-step embedder for the second search phrase. No `.head()` here,
# so the whole result frame is displayed.
word_2_vec_ranker_2.predict("seeking human resources")

Unnamed: 0_level_0,title,score
id,Unnamed: 1_level_1,Unnamed: 2_level_1
28,Seeking Human Resources Opportunities,0.98577
30,Seeking Human Resources Opportunities,0.98577
73,"Aspiring Human Resources Manager, seeking inte...",0.985069
99,Seeking Human Resources Position,0.984149
53,Seeking Human Resources HRIS and Generalist Po...,0.978526
40,Seeking Human Resources HRIS and Generalist Po...,0.978526
62,Seeking Human Resources HRIS and Generalist Po...,0.978526
10,Seeking Human Resources HRIS and Generalist Po...,0.978526
100,Aspiring Human Resources Manager | Graduating ...,0.973693
27,Aspiring Human Resources Management student se...,0.969067
