In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.spatial.distance import cosine
from sentence_transformers import SentenceTransformer

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

from IPython.display import display

In [2]:
# Load the candidate table from the comma-delimited CSV next to the notebook.
df = pd.read_csv("dataset.csv", delimiter=",")
# Print (rows, columns), then preview the first records as the cell output.
print(df.shape)
df.head()

(104, 5)


Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,


In [3]:
# Summarize each column's dtype and non-null count for the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          104 non-null    int64  
 1   job_title   104 non-null    object 
 2   location    104 non-null    object 
 3   connection  104 non-null    object 
 4   fit         0 non-null      float64
dtypes: float64(1), int64(1), object(3)
memory usage: 4.2+ KB


In [4]:
# Drop the `fit` column (it holds no non-null values). Rebinding instead of
# mutating in place, together with errors='ignore', keeps this cell safe to
# re-run after the column is already gone.
df = df.drop(columns=['fit'], errors='ignore')
# Normalize the capped "500+" connection label to "500". The anchored regex
# tolerates any surrounding whitespace instead of depending on the one exact
# spelling '500+ ' (with a trailing space); all other values are untouched.
df['connection'] = df['connection'].replace(r'^\s*500\+\s*$', '500', regex=True)
print(df.shape)
df.head()

(104, 4)


Unnamed: 0,id,job_title,location,connection
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44
3,4,People Development Coordinator at Ryan,"Denton, Texas",500
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500


In [5]:
# Rows count as duplicates when every column except `id` matches. Keep the
# first occurrence of each and renumber the index from zero.
df = df.drop_duplicates(subset=df.columns.drop("id").tolist()).reset_index(drop=True)
print(df.shape)
df.head()

(53, 4)


Unnamed: 0,id,job_title,location,connection
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44
3,4,People Development Coordinator at Ryan,"Denton, Texas",500
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500


In [6]:
# Frequency of each distinct job title remaining after de-duplication.
df["job_title"].value_counts()

Aspiring Human Resources Professional                                                                                    2
2019 C.T. Bauer College of Business Graduate (Magna Cum Laude) and aspiring Human Resources professional                 1
Lead Official at Western Illinois University                                                                             1
Senior Human Resources Business Partner at Heil Environmental                                                            1
Aspiring Human Resources Professional | An energetic and Team-Focused Leader                                             1
HR Manager at Endemol Shine North America                                                                                1
Human Resources professional for the world leader in GIS software                                                        1
RRP Brand Portfolio Executive at JTI (Japan Tobacco International)                                                       1
Information Syst

In [8]:
# Pre-processing
# Expand acronyms in the job titles and drop two organisation codes.
# With regex=True every key is a regular expression, so:
#   * each acronym is wrapped in \b word boundaries, so a short key such as
#     HR, EY or MES can no longer rewrite the inside of a longer word;
#   * the dot in "Inc." is escaped; unescaped it matches ANY character.
df = df.replace({'job_title': {
    r'\bCHRO\b': 'chief human resources officer',
    r'\bSVP\b': 'senior vice president',
    r'\bGPHR\b': 'global professional in human resources',
    r'\bHRIS\b': 'human resources management system',
    r'\bCSR\b': 'corporate social responsibility',
    r'\bSPHR\b': 'strategic and policy-making certification',
    r'\bHR\b': 'human resources',
    r'\bJTI\b': "",
    r'\bGIS\b': 'Geographic information system',
    r'\bMES\b': 'Manufacturing execution systems',
    r'\bEY\b': "Ernst & Young",
    r'\bEPIK\b': "",
    r'\bInc\.': 'Incorporated',
}}, regex=True)

def pre_processing(text_to_preprocess):
    """Normalize an iterable of raw job titles into cleaned, lemmatized strings.

    Each title is tokenized and lower-cased, stripped of punctuation and digit
    characters, filtered of English stop words and single-character tokens,
    and finally lemmatized with WordNet.

    Args:
        text_to_preprocess: Iterable of titles (for example a pandas Series).
            ``None`` and float NaN entries are treated as empty strings; any
            other non-string value is converted with ``str()``.

    Returns:
        list[str]: One space-joined cleaned string per input, in input order.
    """
    lemmatizer = WordNetLemmatizer()
    # Load the stop-word list once per call and keep it in a set; the previous
    # code re-read the list and scanned it linearly for every single token.
    stop_words = set(stopwords.words('english'))
    # One translation table for the whole run instead of one per character.
    punctuation_table = str.maketrans('', '', string.punctuation)

    cleaned_text = []
    for text in text_to_preprocess:
        if not isinstance(text, str):
            # NaN is the only float that differs from itself.
            is_missing = text is None or (isinstance(text, float) and text != text)
            text = '' if is_missing else str(text)

        lowered = ' '.join(token.lower() for token in word_tokenize(text))
        # Punctuation and digits are deleted, not replaced by spaces, so
        # "policy-making" becomes "policymaking".
        stripped = lowered.translate(punctuation_table)
        stripped = ''.join(ch for ch in stripped if not ch.isdigit())
        kept = [
            word for word in stripped.split()
            if word not in stop_words and len(word) > 1
        ]
        # Re-tokenize the cleaned text before lemmatizing, as the original did,
        # so tokenizer-specific splits are preserved.
        lemmas = [lemmatizer.lemmatize(token) for token in word_tokenize(' '.join(kept))]
        cleaned_text.append(' '.join(lemmas))
    return cleaned_text

def clean_phrase(phrase):
    """Return the lower-cased, tokenized and lemmatized form of a search phrase.

    Args:
        phrase: Query text to normalize.

    Returns:
        str: The lemmatized tokens joined by single spaces.
    """
    wordnet = WordNetLemmatizer()
    lemmas = []
    for token in word_tokenize(phrase.lower()):
        lemmas.append(wordnet.lemmatize(token))
    return ' '.join(lemmas)

def check_similarity(column_name, embeddings, X_search, data=None, top_n=5):
    """Rank rows by cosine similarity to a query vector and display the best ones.

    The frame being ranked is never modified: scores are written to a copy,
    which is sorted by descending score and shown with ``display``.

    Args:
        column_name: Name of the score column added to the displayed copy.
        embeddings: 2-D array-like holding one embedding per row of the frame,
            in the same order as the frame's rows.
        X_search: Query embedding, either a 1-D vector or a single-row 2-D
            array such as ``vectorizer.transform([text]).toarray()``.
        data: Frame to rank. Defaults to the notebook-level ``df``.
        top_n: Number of highest-scoring rows to display (default 5).

    Raises:
        ValueError: If the number of embedding rows differs from the number of
            frame rows (typically stale embeddings left over from an earlier
            version of the frame).
    """
    source = df if data is None else data
    matrix = np.asarray(embeddings)
    # SciPy's distance functions only accept 1-D vectors (passing length-1
    # extra dimensions was deprecated in SciPy 1.7 and later became an error),
    # so flatten a (1, n_features) query before comparing.
    query = np.asarray(X_search).ravel()

    if len(matrix) != len(source):
        raise ValueError(
            f"embeddings has {len(matrix)} rows but the frame has {len(source)}; "
            "recompute the embeddings for the current frame"
        )

    copy_df = source.copy()
    # scipy's `cosine` is a distance; 1 - distance gives the similarity.
    copy_df[column_name] = [1 - cosine(row, query) for row in matrix]
    display(
        copy_df.sort_values(by=column_name, ascending=False, ignore_index=True).head(top_n)
    )

# Normalize both search phrases with the query-side cleaner.
phrase_one, phrase_two = (
    clean_phrase(raw_phrase)
    for raw_phrase in ("Aspiring human resources", "seeking human resources")
)

# Overwrite the raw titles with their cleaned form, then preview the frame.
df['job_title'] = pre_processing(df['job_title'])
df.head()

Unnamed: 0,id,job_title,location,connection
0,1,ct bauer college business graduate magna cum l...,"Houston, Texas",85
1,2,native english teacher english program korea,Kanada,500
2,3,aspiring human resource professional,"Raleigh-Durham, North Carolina Area",44
3,4,people development coordinator ryan,"Denton, Texas",500
4,5,advisory board member celal bayar university,"İzmir, Türkiye",500


In [9]:
# Cleaned data: these titles are the text the embeddings are built from.
# Iterate over the column itself rather than a hard-coded range(53), so the
# cell keeps working (and prints every row) when the number of rows changes.
for cleaned_title in df['job_title']:
    print(cleaned_title)

ct bauer college business graduate magna cum laude aspiring human resource professional
native english teacher english program korea
aspiring human resource professional
people development coordinator ryan
advisory board member celal bayar university
aspiring human resource specialist
student humber college aspiring human resource generalist
human resource senior specialist
seeking human resource human resource management system generalist position
student chapman university
senior vice president chief human resource officer marketing communication corporate social responsibility officer engie houston woodland energy global professional human resource strategic policymaking certification
human resource coordinator intercontinental buckhead atlanta
aspiring human resource management student seeking internship
seeking human resource opportunity
experienced retail manager aspiring human resource professional
human resource staffing recruiting professional
human resource specialist luxotti

In [10]:
# Tf-Idf

# Learn the vocabulary and IDF weights from the cleaned titles, then embed them.
tf_idf = TfidfVectorizer().fit(df.job_title)
tf_idf_embeddings = tf_idf.transform(df.job_title)

# tf_idf.get_feature_names_out()  # inspect the learned vocabulary
tf_idf_embeddings = tf_idf_embeddings.toarray()
print(tf_idf_embeddings.shape)

# transform([...]) returns a (1, n_features) matrix. Take row 0 so each query
# is a 1-D vector, the only shape SciPy's cosine distance is defined for
# (extra length-1 dimensions were deprecated in SciPy 1.7 and later rejected).
X_search_one = tf_idf.transform([phrase_one]).toarray()[0]
X_search_two = tf_idf.transform([phrase_two]).toarray()[0]

check_similarity("tf_idf_one", tf_idf_embeddings, X_search_one)
check_similarity("tf_idf_two", tf_idf_embeddings, X_search_two)

(53, 179)


Unnamed: 0,id,job_title,location,connection,tf_idf_one
0,97,aspiring human resource professional,"Kokomo, Indiana Area",71,0.775698
1,3,aspiring human resource professional,"Raleigh-Durham, North Carolina Area",44,0.775698
2,6,aspiring human resource specialist,Greater New York City Area,1,0.673329
3,73,aspiring human resource manager seeking intern...,"Houston, Texas Area",7,0.606927
4,27,aspiring human resource management student see...,"Houston, Texas Area",500,0.435955


Unnamed: 0,id,job_title,location,connection,tf_idf_two
0,99,seeking human resource position,"Las Vegas, Nevada Area",48,0.694884
1,28,seeking human resource opportunity,"Chicago, Illinois",390,0.671599
2,73,aspiring human resource manager seeking intern...,"Houston, Texas Area",7,0.622793
3,10,seeking human resource human resource manageme...,Greater Philadelphia Area,500,0.555784
4,27,aspiring human resource management student see...,"Houston, Texas Area",500,0.462562


In [13]:
# BERT Model

s_embedder = SentenceTransformer('all-mpnet-base-v2')
# s_embedder = SentenceTransformer('bert-base-nli-mean-tokens')
# encode() is documented for a string or a list of strings, so pass a plain
# list rather than the pandas Series.
# NOTE(review): passing the Series directly appears to work only while the
# frame has the default 0..n-1 index - confirm against the installed version.
bert_embeddings = s_embedder.encode(df.job_title.tolist())

print(bert_embeddings.shape)

# Encoding a single string yields one 1-D vector per phrase.
X_search_one = s_embedder.encode(phrase_one)
X_search_two = s_embedder.encode(phrase_two)

check_similarity("BERT_one", bert_embeddings, X_search_one)
check_similarity("BERT_two", bert_embeddings, X_search_two)

(53, 768)


Unnamed: 0,id,job_title,location,connection,BERT_one
0,3,aspiring human resource professional,"Raleigh-Durham, North Carolina Area",44,0.880906
1,97,aspiring human resource professional,"Kokomo, Indiana Area",71,0.880906
2,6,aspiring human resource specialist,Greater New York City Area,1,0.87784
3,74,human resource professional,Greater Boston Area,16,0.754004
4,82,aspiring human resource professional energetic...,"Austin, Texas Area",174,0.729662


Unnamed: 0,id,job_title,location,connection,BERT_two
0,28,seeking human resource opportunity,"Chicago, Illinois",390,0.881616
1,99,seeking human resource position,"Las Vegas, Nevada Area",48,0.838536
2,67,human resource staffing recruiting professional,"Jackson, Mississippi Area",500,0.810193
3,6,aspiring human resource specialist,Greater New York City Area,1,0.72842
4,10,seeking human resource human resource manageme...,Greater Philadelphia Area,500,0.716096
