In [69]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter
from wordcloud import WordCloud

import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from sent2vec.vectorizer import Vectorizer
import gensim
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from scipy import spatial

In [21]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [2]:
df = pd.read_csv('potential-talents - Aspiring human resources - seeking human resources.csv')

In [3]:
df

Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,
...,...,...,...,...,...
99,100,Aspiring Human Resources Manager | Graduating ...,"Cape Girardeau, Missouri",103,
100,101,Human Resources Generalist at Loparex,"Raleigh-Durham, North Carolina Area",500+,
101,102,Business Intelligence and Analytics at Travelers,Greater New York City Area,49,
102,103,Always set them up for Success,Greater Los Angeles Area,500+,


In [4]:
df.dtypes

id              int64
job_title      object
location       object
connection     object
fit           float64
dtype: object

In [5]:
df.job_title.unique()

array(['2019 C.T. Bauer College of Business Graduate (Magna Cum Laude) and aspiring Human Resources professional',
       'Native English Teacher at EPIK (English Program in Korea)',
       'Aspiring Human Resources Professional',
       'People Development Coordinator at Ryan',
       'Advisory Board Member at Celal Bayar University',
       'Aspiring Human Resources Specialist',
       'Student at Humber College and Aspiring Human Resources Generalist',
       'HR Senior Specialist',
       'Seeking Human Resources HRIS and Generalist Positions',
       'Student at Chapman University',
       'SVP, CHRO, Marketing & Communications, CSR Officer | ENGIE | Houston | The Woodlands | Energy | GPHR | SPHR',
       'Human Resources Coordinator at InterContinental Buckhead Atlanta',
       'Aspiring Human Resources Management student seeking an internship',
       'Seeking Human Resources Opportunities',
       'Experienced Retail Manager and aspiring Human Resources Professional',
       'H

In [6]:
df.connection.unique()

array(['85', '500+ ', '44', '1', '61', '2', '390', '57', '82', '5', '7',
       '16', '212', '409', '52', '455', '174', '268', '50', '4', '40',
       '18', '349', '155', '39', '64', '9', '415', '19', '71', '48',
       '103', '49'], dtype=object)

In [7]:
df.location.unique()

array(['Houston, Texas', 'Kanada', 'Raleigh-Durham, North Carolina Area',
       'Denton, Texas', 'İzmir, Türkiye', 'Greater New York City Area',
       'San Francisco Bay Area', 'Greater Philadelphia Area',
       'Lake Forest, California', 'Houston, Texas Area',
       'Atlanta, Georgia', 'Chicago, Illinois', 'Austin, Texas Area',
       'Jackson, Mississippi Area', 'Greater Grand Rapids, Michigan Area',
       'Virginia Beach, Virginia', 'Monroe, Louisiana Area',
       'Greater Boston Area', 'San Jose, California',
       'New York, New York', 'Dallas/Fort Worth Area',
       'Amerika Birleşik Devletleri', 'Baton Rouge, Louisiana Area',
       'Myrtle Beach, South Carolina Area', 'Chattanooga, Tennessee Area',
       'Los Angeles, California', 'Highland, California',
       'Gaithersburg, Maryland', 'Baltimore, Maryland',
       'Milpitas, California', 'Greater Atlanta Area',
       'Greater Chicago Area', 'Torrance, California',
       'Long Beach, California', 'Bridgewater, Massa

## Cleaning Job Title

To cleaning job title, we do the following:

* Split tokens on white space.
* Remove all punctuation from words.
* Remove all words that are not purely comprised of alphabetical characters.
* Remove all words that are known stop words

In [85]:
# Data cleaning and Preparation


def textpreprocessing(text):
    #remove punctuations and uppercase
    clean_text = re.sub('[^a-zA-Z]',' ', text)
    
    clean_text = clean_text.lower().split()
    
    #remove stopwords
    s_words =set(stopwords.words('english'))
    
    clean_text = [word for word in clean_text  if word not in s_words]
    
    sentence = []
    
    for word in clean_text:
        if word == 'hr':
            word = 'human resources'
        elif word == 'chro':
            word = 'chief human resources officer'
        elif word == 'gphr':
            word = 'general professional in human resources'
        elif word == 'sphr':
            word = 'senior professional in human resources'
        
        #lemmatize the word
        lemmatizer = WordNetLemmatizer()
        sentence.append(lemmatizer.lemmatize(word, 'v'))

    return ' '.join(sentence)

In [86]:
df['cleaned_title'] = df['job_title'].apply(textpreprocessing)

In [10]:
df

Unnamed: 0,id,job_title,location,connection,fit,cleaned_title
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,,c bauer college business graduate magna cum la...
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,,native english teacher epik english program korea
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,,aspire human resources professional
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,,people development coordinator ryan
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,,advisory board member celal bayar university
...,...,...,...,...,...,...
99,100,Aspiring Human Resources Manager | Graduating ...,"Cape Girardeau, Missouri",103,,aspire human resources manager graduate may se...
100,101,Human Resources Generalist at Loparex,"Raleigh-Durham, North Carolina Area",500+,,human resources generalist loparex
101,102,Business Intelligence and Analytics at Travelers,Greater New York City Area,49,,business intelligence analytics travelers
102,103,Always set them up for Success,Greater Los Angeles Area,500+,,always set success


In [11]:
df.cleaned_title.unique()

array(['c bauer college business graduate magna cum laude aspire human resources professional',
       'native english teacher epik english program korea',
       'aspire human resources professional',
       'people development coordinator ryan',
       'advisory board member celal bayar university',
       'aspire human resources specialist',
       'student humber college aspire human resources generalist',
       'Human Resources senior specialist',
       'seek human resources hris generalist position',
       'student chapman university',
       'svp Chief Human Resources Officer market communications csr officer engie houston woodlands energy General Professional in Human Resources Senior Professional in Human Resources',
       'human resources coordinator intercontinental buckhead atlanta',
       'aspire human resources management student seek internship',
       'seek human resources opportunities',
       'experience retail manager aspire human resources professional',
    

# TFIDF

Term Frequency-inverse document frequency (TF-idf): this looks at words that appear in both pieces of text, and scores them based on how often they appear. 

In [12]:
query1 = textpreprocessing('Aspiring human resources')
query1 = [query1]

In [13]:
tfvectorizer = TfidfVectorizer()

title_vector = tfvectorizer.fit_transform(df.cleaned_title)

query1_vector = tfvectorizer.transform(query1)

In [14]:
df["tfidf_sim_1"]= cosine_similarity(title_vector, query1_vector)

In [15]:
df.sort_values(by='tfidf_sim_1', ascending=False).head(30)

Unnamed: 0,id,job_title,location,connection,fit,cleaned_title,tfidf_sim_1
45,46,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,,aspire human resources professional,0.75945
96,97,Aspiring Human Resources Professional,"Kokomo, Indiana Area",71,,aspire human resources professional,0.75945
20,21,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,,aspire human resources professional,0.75945
16,17,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,,aspire human resources professional,0.75945
32,33,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,,aspire human resources professional,0.75945
57,58,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,,aspire human resources professional,0.75945
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,,aspire human resources professional,0.75945
5,6,Aspiring Human Resources Specialist,Greater New York City Area,1,,aspire human resources specialist,0.677153
48,49,Aspiring Human Resources Specialist,Greater New York City Area,1,,aspire human resources specialist,0.677153
59,60,Aspiring Human Resources Specialist,Greater New York City Area,1,,aspire human resources specialist,0.677153


In [46]:
def cleaning_list(df,query):
    clean_list = []
    
    for num in range(len(df)):
        desc = df.iloc[num]['cleaned_title']
        
        desc = nltk.word_tokenize(desc)
        
        #removing words that are not in pretrained model.
        
        for word in range(len(desc)):
            if desc[word] in ['epik', 'celal', 'bayar', 'humber', 'engie', 'buckhead', 'luxottica', 'beneteau', 'scottmadden', 
                                'nortia', 'schwan', 'endemol', 'jti', 'styczynski', 'westfield', 'kokomo', 'delphi', 'loparex']:
                desc[word] = ''
        desc = ' '.join(desc)
        clean_list.append(desc)
    
    ## tokenize query and add to list
    clean_list.append(query)
    
    return clean_list

In [47]:
cleaned_list = cleaning_list(df, 'aspiring human resources')

In [48]:
cleaned_list

['c bauer college business graduate magna cum laude aspire human resources professional',
 'native english teacher  english program korea',
 'aspire human resources professional',
 'people development coordinator ryan',
 'advisory board member   university',
 'aspire human resources specialist',
 'student  college aspire human resources generalist',
 'Human Resources senior specialist',
 'student  college aspire human resources generalist',
 'seek human resources hris generalist position',
 'student chapman university',
 'svp Chief Human Resources Officer market communications csr officer  houston woodlands energy General Professional in Human Resources Senior Professional in Human Resources',
 'human resources coordinator intercontinental  atlanta',
 'c bauer college business graduate magna cum laude aspire human resources professional',
 'c bauer college business graduate magna cum laude aspire human resources professional',
 'native english teacher  english program korea',
 'aspire 

# BERT

In [49]:
vectorizer = Vectorizer()
vectorizer.run(cleaned_list)
bert_vectors = vectorizer.vectors

Initializing Bert distilbert-base-uncased
Vectorization done on cpu


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [61]:
bert_scores = []
for num in range(len(cleaned_list) - 1):
    bert_scores.append(1 - spatial.distance.cosine(bert_vectors[-1], bert_vectors[num]))
    
df['bert_fit'] = bert_scores

In [62]:
df.sort_values(by='bert_fit', ascending=False).head(20)

Unnamed: 0,id,job_title,location,connection,fit,cleaned_title,tfidf_sim_1,word2vec_fit,bert_fit
67,68,Human Resources Specialist at Luxottica,Greater New York City Area,500+,,human resources specialist luxottica,0.216661,0.276118,0.99268
100,101,Human Resources Generalist at Loparex,"Raleigh-Durham, North Carolina Area",500+,,human resources generalist loparex,0.21917,0.218552,0.99206
77,78,Human Resources Generalist at Schwan's,Amerika Birleşik Devletleri,500+,,human resources generalist schwan,0.21917,0.218552,0.99206
73,74,Human Resources Professional,Greater Boston Area,16,,human resources professional,0.428006,0.160971,0.991282
48,49,Aspiring Human Resources Specialist,Greater New York City Area,1,,aspire human resources specialist,0.677153,0.085605,0.991089
35,36,Aspiring Human Resources Specialist,Greater New York City Area,1,,aspire human resources specialist,0.677153,0.085605,0.991089
59,60,Aspiring Human Resources Specialist,Greater New York City Area,1,,aspire human resources specialist,0.677153,0.085605,0.991089
23,24,Aspiring Human Resources Specialist,Greater New York City Area,1,,aspire human resources specialist,0.677153,0.085605,0.991089
5,6,Aspiring Human Resources Specialist,Greater New York City Area,1,,aspire human resources specialist,0.677153,0.085605,0.991089
27,28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,,seek human resources opportunities,0.2502,0.210502,0.990903


# Word2vec

In [56]:
vectorizer = Vectorizer(pretrained_weights= 'GoogleNews-vectors-negative300.bin')
vectorizer.run(cleaned_list)
word2vec_vectors = vectorizer.vectors

Initializing word2vec with vector path GoogleNews-vectors-negative300.bin


In [63]:
## append word2vec scores to dataframe
scores = []
for num in range(len(cleaned_list) - 1):
    scores.append(1 - spatial.distance.cosine(word2vec_vectors[-1], word2vec_vectors[num]))
    
df['word2vec_fit'] = scores

In [64]:
df.sort_values(by='word2vec_fit', ascending=False).head(20)

Unnamed: 0,id,job_title,location,connection,fit,cleaned_title,tfidf_sim_1,word2vec_fit,bert_fit
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,,aspire human resources professional,0.75945,0.947545,0.98984
32,33,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,,aspire human resources professional,0.75945,0.947545,0.98984
20,21,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,,aspire human resources professional,0.75945,0.947545,0.98984
57,58,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,,aspire human resources professional,0.75945,0.947545,0.98984
45,46,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,,aspire human resources professional,0.75945,0.947545,0.98984
96,97,Aspiring Human Resources Professional,"Kokomo, Indiana Area",71,,aspire human resources professional,0.75945,0.947545,0.98984
16,17,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,,aspire human resources professional,0.75945,0.947545,0.98984
35,36,Aspiring Human Resources Specialist,Greater New York City Area,1,,aspire human resources specialist,0.677153,0.914395,0.991089
48,49,Aspiring Human Resources Specialist,Greater New York City Area,1,,aspire human resources specialist,0.677153,0.914395,0.991089
23,24,Aspiring Human Resources Specialist,Greater New York City Area,1,,aspire human resources specialist,0.677153,0.914395,0.991089


# Glove

In [91]:
def glove_list(df,query):
    clean_list = []
    
    for num in range(len(df)):
        desc = df.iloc[num]['cleaned_title']
        
        desc = nltk.word_tokenize(desc)
        
        #removing words that are not in pretrained model.
        
        for word in range(len(desc)):
            if desc[word] in ['hris', 'epik', 'celal', 'bayar', 'humber', 'engie', 'buckhead', 'luxottica', 'beneteau', 'scottmadden', 
                                'nortia', 'schwan', 'endemol', 'jti', 'styczynski', 'westfield', 'kokomo', 'delphi', 'loparex']:
                desc[word] = ' '
                
        desc = ' '.join(desc)
        desc = nltk.word_tokenize(desc)
        clean_list.append(desc)
    
    ## tokenize query and add to list
    clean_list.append(query.split())
    
    return clean_list

In [92]:
gloveembed_list = glove_list(df, 'aspiring human resources')

In [93]:
gloveembed_list

[['c',
  'bauer',
  'college',
  'business',
  'graduate',
  'magna',
  'cum',
  'laude',
  'aspire',
  'human',
  'resources',
  'professional'],
 ['native', 'english', 'teacher', 'english', 'program', 'korea'],
 ['aspire', 'human', 'resources', 'professional'],
 ['people', 'development', 'coordinator', 'ryan'],
 ['advisory', 'board', 'member', 'university'],
 ['aspire', 'human', 'resources', 'specialist'],
 ['student', 'college', 'aspire', 'human', 'resources', 'generalist'],
 ['human', 'resources', 'senior', 'specialist'],
 ['student', 'college', 'aspire', 'human', 'resources', 'generalist'],
 ['seek', 'human', 'resources', 'generalist', 'position'],
 ['student', 'chapman', 'university'],
 ['svp',
  'chief',
  'human',
  'resources',
  'officer',
  'market',
  'communications',
  'csr',
  'officer',
  'houston',
  'woodlands',
  'energy',
  'general',
  'professional',
  'in',
  'human',
  'resources',
  'senior',
  'professional',
  'in',
  'human',
  'resources'],
 ['human', 'reso

In [70]:
model = gensim.models.KeyedVectors.load_word2vec_format('glove.6B.300d.txt', no_header=True)

In [80]:
def get_vector(s):
    return np.sum(np.array([model[i] for i in s]), axis=0)

In [94]:
glove_vectors = []

for num in range(len(gloveembed_list)):
    glove_vectors.append(get_vector(gloveembed_list[num]))

In [95]:
glove_scores = []
for num in range(len(gloveembed_list) - 1):
    glove_scores.append(1 - spatial.distance.cosine(glove_vectors[num], glove_vectors[-1]))
    
df['glove_fit'] = glove_scores

In [97]:
df.sort_values(by='glove_fit', ascending=False).head(20)

Unnamed: 0,id,job_title,location,connection,fit,cleaned_title,tfidf_sim_1,word2vec_fit,bert_fit,glove_fit
73,74,Human Resources Professional,Greater Boston Area,16,,human resources professional,0.428006,0.839029,0.991282,0.877932
72,73,"Aspiring Human Resources Manager, seeking inte...","Houston, Texas Area",7,,aspire human resources manager seek internship...,0.544969,0.879637,0.983642,0.857727
29,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,,seek human resources opportunities,0.2502,0.789498,0.990903,0.8428
27,28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,,seek human resources opportunities,0.2502,0.789498,0.990903,0.8428
96,97,Aspiring Human Resources Professional,"Kokomo, Indiana Area",71,,aspire human resources professional,0.75945,0.947545,0.98984,0.842521
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,,aspire human resources professional,0.75945,0.947545,0.98984,0.842521
20,21,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,,aspire human resources professional,0.75945,0.947545,0.98984,0.842521
45,46,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,,aspire human resources professional,0.75945,0.947545,0.98984,0.842521
57,58,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,,aspire human resources professional,0.75945,0.947545,0.98984,0.842521
32,33,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,,aspire human resources professional,0.75945,0.947545,0.98984,0.842521
