In [1]:
#import basic packages to deal with data and load the data into a data frame
import pandas as pd
import numpy as np
df=pd.read_csv("potential-talents.csv")

In [2]:
df.head()

Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,


In [3]:
df.location.unique()

array(['Houston, Texas', 'Kanada', 'Raleigh-Durham, North Carolina Area',
       'Denton, Texas', 'İzmir, Türkiye', 'Greater New York City Area',
       'San Francisco Bay Area', 'Greater Philadelphia Area',
       'Lake Forest, California', 'Houston, Texas Area',
       'Atlanta, Georgia', 'Chicago, Illinois', 'Austin, Texas Area',
       'Jackson, Mississippi Area', 'Greater Grand Rapids, Michigan Area',
       'Virginia Beach, Virginia', 'Monroe, Louisiana Area',
       'Greater Boston Area', 'San Jose, California',
       'New York, New York', 'Dallas/Fort Worth Area',
       'Amerika Birleşik Devletleri', 'Baton Rouge, Louisiana Area',
       'Myrtle Beach, South Carolina Area', 'Chattanooga, Tennessee Area',
       'Los Angeles, California', 'Highland, California',
       'Gaithersburg, Maryland', 'Baltimore, Maryland',
       'Milpitas, California', 'Greater Atlanta Area',
       'Greater Chicago Area', 'Torrance, California',
       'Long Beach, California', 'Bridgewater, Massa

In [4]:
df.job_title.unique()

array(['2019 C.T. Bauer College of Business Graduate (Magna Cum Laude) and aspiring Human Resources professional',
       'Native English Teacher at EPIK (English Program in Korea)',
       'Aspiring Human Resources Professional',
       'People Development Coordinator at Ryan',
       'Advisory Board Member at Celal Bayar University',
       'Aspiring Human Resources Specialist',
       'Student at Humber College and Aspiring Human Resources Generalist',
       'HR Senior Specialist',
       'Seeking Human Resources HRIS and Generalist Positions',
       'Student at Chapman University',
       'SVP, CHRO, Marketing & Communications, CSR Officer | ENGIE | Houston | The Woodlands | Energy | GPHR | SPHR',
       'Human Resources Coordinator at InterContinental Buckhead Atlanta',
       'Aspiring Human Resources Management student seeking an internship',
       'Seeking Human Resources Opportunities',
       'Experienced Retail Manager and aspiring Human Resources Professional',
       'H

In [5]:
df.connection.value_counts()

500+     44
85        7
61        7
44        6
1         5
2         4
4         2
7         2
57        2
390       2
103       1
48        1
18        1
71        1
19        1
415       1
9         1
64        1
39        1
155       1
349       1
174       1
40        1
50        1
268       1
455       1
52        1
409       1
212       1
16        1
5         1
82        1
49        1
Name: connection, dtype: int64

In [6]:
df.dtypes

id              int64
job_title      object
location       object
connection     object
fit           float64
dtype: object

We want to start preprocessing the field so they can be used. We will not do anything with the location field for now. We want to convert connections into a numeric value in case we want that to be a relevant factor and change the job_title field to a string that we can use for NLP

In [7]:
#turn connections into a numeric field for potential use
df.connection=df.connection.str.replace("500+ ","500",regex=False)
df.connection=df.connection.astype(int)

In [8]:
#Create a dataframe without the missing field
X=df.drop(columns=["fit"])

In [9]:
#Replace "HR" with "Human Resources" since that's generally what it refers to
X.job_title=X.job_title.str.replace("HR "," Human Resources ")

In [10]:
#Standardize by putting everything in lower case
X.job_title=X.job_title.str.lower()

In [11]:
#Get rid of all characters other than letters and white space
X.job_title=X.job_title.str.replace("[^a-z\s]","",regex=True)

In [12]:
#Make all the spacing uniquely single space
X.job_title=X.job_title.apply(lambda x : " ".join(x.split()))

In [13]:
X.job_title.unique()

array(['ct bauer college of business graduate magna cum laude and aspiring human resources professional',
       'native english teacher at epik english program in korea',
       'aspiring human resources professional',
       'people development coordinator at ryan',
       'advisory board member at celal bayar university',
       'aspiring human resources specialist',
       'student at humber college and aspiring human resources generalist',
       'human resources senior specialist',
       'seeking human resources hris and generalist positions',
       'student at chapman university',
       'svp chro marketing communications csr officer engie houston the woodlands energy gp human resources sphr',
       'human resources coordinator at intercontinental buckhead atlanta',
       'aspiring human resources management student seeking an internship',
       'seeking human resources opportunities',
       'experienced retail manager and aspiring human resources professional',
       'hu

In [15]:
#Our search strings
query_1="aspiring human resources"
query_2="seeking human resources"
query_list=[query_1,query_2]

We need to build some metrics to see how well the job title matches our query strings. The general approach will be to vectorize the job titles, vectorize the query strings and use cosine similarity to see how closely they match. 

In [17]:
#One way to vectorize is TfIdf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
#Use TfIdf to vectorize the job titles
tf=TfidfVectorizer()
vects=tf.fit_transform(X.job_title)
#Get the appropriate vectors for the queries we want
tf_query=tf.transform(query_list)

In [18]:
#Look at the similarity to each of our queries
X["tfidf_sim_1"]=cosine_similarity(vects,tf_query)[:,0]
X["tfidf_sim_2"]=cosine_similarity(vects,tf_query)[:,1]

In [19]:
X.head()

Unnamed: 0,id,job_title,location,connection,tfidf_sim_1,tfidf_sim_2
0,1,ct bauer college of business graduate magna cu...,"Houston, Texas",85,0.252975,0.097077
1,2,native english teacher at epik english program...,Kanada,500,0.0,0.0
2,3,aspiring human resources professional,"Raleigh-Durham, North Carolina Area",44,0.736678,0.282693
3,4,people development coordinator at ryan,"Denton, Texas",500,0.0,0.0
4,5,advisory board member at celal bayar university,"İzmir, Türkiye",500,0.0,0.0


In [20]:
X.sort_values("tfidf_sim_1",ascending=False).head(10)

Unnamed: 0,id,job_title,location,connection,tfidf_sim_1,tfidf_sim_2
32,33,aspiring human resources professional,"Raleigh-Durham, North Carolina Area",44,0.736678,0.282693
45,46,aspiring human resources professional,"Raleigh-Durham, North Carolina Area",44,0.736678,0.282693
20,21,aspiring human resources professional,"Raleigh-Durham, North Carolina Area",44,0.736678,0.282693
57,58,aspiring human resources professional,"Raleigh-Durham, North Carolina Area",44,0.736678,0.282693
96,97,aspiring human resources professional,"Kokomo, Indiana Area",71,0.736678,0.282693
16,17,aspiring human resources professional,"Raleigh-Durham, North Carolina Area",44,0.736678,0.282693
2,3,aspiring human resources professional,"Raleigh-Durham, North Carolina Area",44,0.736678,0.282693
23,24,aspiring human resources specialist,Greater New York City Area,1,0.677153,0.259851
5,6,aspiring human resources specialist,Greater New York City Area,1,0.677153,0.259851
35,36,aspiring human resources specialist,Greater New York City Area,1,0.677153,0.259851


In [21]:
X.sort_values("tfidf_sim_2",ascending=False).head(10)

Unnamed: 0,id,job_title,location,connection,tfidf_sim_1,tfidf_sim_2
29,30,seeking human resources opportunities,"Chicago, Illinois",390,0.2502,0.652003
27,28,seeking human resources opportunities,"Chicago, Illinois",390,0.2502,0.652003
98,99,seeking human resources position,"Las Vegas, Nevada Area",48,0.242423,0.631738
72,73,aspiring human resources manager seeking inter...,"Houston, Texas Area",7,0.50159,0.548602
9,10,seeking human resources hris and generalist po...,Greater Philadelphia Area,500,0.173929,0.453247
61,62,seeking human resources hris and generalist po...,Greater Philadelphia Area,500,0.173929,0.453247
39,40,seeking human resources hris and generalist po...,Greater Philadelphia Area,500,0.173929,0.453247
52,53,seeking human resources hris and generalist po...,Greater Philadelphia Area,500,0.173929,0.453247
28,29,aspiring human resources management student se...,"Houston, Texas Area",500,0.336841,0.412123
26,27,aspiring human resources management student se...,"Houston, Texas Area",500,0.336841,0.412123


In [22]:
#We want to see how well a candidate matches either of the desired queries so take the maximum
X["tfidf_score"]=X[["tfidf_sim_1","tfidf_sim_2"]].max(axis=1)
X.head()

Unnamed: 0,id,job_title,location,connection,tfidf_sim_1,tfidf_sim_2,tfidf_score
0,1,ct bauer college of business graduate magna cu...,"Houston, Texas",85,0.252975,0.097077,0.252975
1,2,native english teacher at epik english program...,Kanada,500,0.0,0.0,0.0
2,3,aspiring human resources professional,"Raleigh-Durham, North Carolina Area",44,0.736678,0.282693,0.736678
3,4,people development coordinator at ryan,"Denton, Texas",500,0.0,0.0,0.0
4,5,advisory board member at celal bayar university,"İzmir, Türkiye",500,0.0,0.0,0.0


In [23]:
X.sort_values("tfidf_score",ascending=False).head(10)

Unnamed: 0,id,job_title,location,connection,tfidf_sim_1,tfidf_sim_2,tfidf_score
57,58,aspiring human resources professional,"Raleigh-Durham, North Carolina Area",44,0.736678,0.282693,0.736678
32,33,aspiring human resources professional,"Raleigh-Durham, North Carolina Area",44,0.736678,0.282693,0.736678
20,21,aspiring human resources professional,"Raleigh-Durham, North Carolina Area",44,0.736678,0.282693,0.736678
45,46,aspiring human resources professional,"Raleigh-Durham, North Carolina Area",44,0.736678,0.282693,0.736678
96,97,aspiring human resources professional,"Kokomo, Indiana Area",71,0.736678,0.282693,0.736678
16,17,aspiring human resources professional,"Raleigh-Durham, North Carolina Area",44,0.736678,0.282693,0.736678
2,3,aspiring human resources professional,"Raleigh-Durham, North Carolina Area",44,0.736678,0.282693,0.736678
35,36,aspiring human resources specialist,Greater New York City Area,1,0.677153,0.259851,0.677153
5,6,aspiring human resources specialist,Greater New York City Area,1,0.677153,0.259851,0.677153
59,60,aspiring human resources specialist,Greater New York City Area,1,0.677153,0.259851,0.677153


In [72]:
import tensorflow as tf
from tensorflow.keras import layers, activations, losses, Model, Input
from tensorflow.nn import leaky_relu
import numpy as np
from itertools import combinations
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


In [79]:
#Implement RankNet 
class RankNet(Model):
    def __init__(self):
        super().__init__()
        self.dense = [layers.Dense(16, activation=leaky_relu), layers.Dropout(0.2), layers.Dense(16, activation=leaky_relu),layers.Dropout(0.2),layers.Dense(8, activation=leaky_relu)]
        self.o = layers.Dense(1, activation='linear')
        self.oi_minus_oj = layers.Subtract()
    
    def call(self, inputs):
        xi, xj = inputs
        densei = self.dense[0](xi)
        densej = self.dense[0](xj)
        for dense in self.dense[1:]:
            densei = dense(densei)
            densej = dense(densej)
        oi = self.o(densei)
        oj= self.o(densej)
        oij = self.oi_minus_oj([oi, oj])
        output = layers.Activation('sigmoid')(oij)
        return output
    
    def predict(self,_input):
        x=_input
        _dense=self.dense[0](x)
        for dense in self.dense[1:]:
            _dense_=dense(_dense)
        _o=self.o(_dense)
        output=layers.Activation('sigmoid')(_o)
        return output
    
    


In [80]:
ranknet = RankNet()
ranknet.compile(optimizer='adam', loss='binary_crossentropy',metrics="accuracy")