## Project 3 - Ranking Potential Candidates

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer, util
from transformers import BertTokenizer, BertModel
from scipy.spatial.distance import cosine

In [2]:
# Read the dataset
data = pd.read_csv("potential-talents.csv")
data.head()

Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,


### Data Preparation

- Check for null values and duplicate columns

In [3]:
# Check for null values
data.isnull().sum()

id              0
job_title       0
location        0
connection      0
fit           104
dtype: int64

In [4]:
# Check for duplicate entries
data.duplicated(subset=['job_title', 'connection']).sum()

51

In [5]:
# Remove duplicated entries
data = data.drop_duplicates(subset=['job_title', 'connection'])

- As there is no particular indication as to whether location is an important feature, it has been removed from further analysis. 

In [6]:
# Drop columns
data.drop(columns = ['fit','location'], axis = 1, inplace = True)

- Before performing analysis on the job titles, a few data preprocessing is done here such as removal of punctuation marks, replacing the word HR with Human resources

In [7]:
import re

# Function to preprocess job titles
def preprocess_job_titles(job_title):
    # Replace "HR" with "Human Resource"
    job_title = job_title.replace("HR", "Human Resource")
    
    # Remove punctuations
    job_title = re.sub(r'[^\w\s]', '', job_title)
    
    return job_title

# Apply preprocessing to the job_title column
data['job_title'] = data['job_title'].apply(preprocess_job_titles)

- As the 'connection' feature is a string, it is converted to integer. Moreover, entries which have 500+ connections are kept as 501 connections

In [8]:
def convert_connections_adjusted(value):
    value = value.strip()  # Remove leading and trailing spaces
    if value == '500+':
        return 501
    else:
        return int(value)

# Apply the function
data['connection'] = data['connection'].apply(convert_connections_adjusted)

In [9]:
data.head()

Unnamed: 0,id,job_title,connection
0,1,2019 CT Bauer College of Business Graduate Mag...,85
1,2,Native English Teacher at EPIK English Program...,501
2,3,Aspiring Human Resources Professional,44
3,4,People Development Coordinator at Ryan,501
4,5,Advisory Board Member at Celal Bayar University,501


### Candidate Fit through Cosine Similiarity

The cosine similiarity of word embeddings of job titles against the keywords, 'Aspiring human resources' and 'seeking human resources' is found here through Sentence Transformer and BERT.

#### 1. Sentence Transformer

In [10]:
# Initialize the model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Extract job titles
job_titles = data['job_title'].tolist()

keyword1 = "Aspiring human resources"
keyword2 = "Seeking human resources"

# Generate embeddings for job titles and keyword
job_title_embeddings = model.encode(job_titles, convert_to_tensor=True)
keyword_embedding_1 = model.encode([keyword1], convert_to_tensor=True)
keyword_embedding_2 = model.encode([keyword2], convert_to_tensor=True)

# Calculate cosine similarity
cosine_similarity_key1 = util.pytorch_cos_sim(job_title_embeddings, keyword_embedding_1).numpy()
cosine_similarity_key2 = util.pytorch_cos_sim(job_title_embeddings, keyword_embedding_2).numpy()

In [11]:
# Flatten the cosine similarity scores to a 1D array and add as new column
data['fit_sentence_transformer_key1'] = cosine_similarity_key1.flatten()
data['fit_sentence_transformer_key2'] = cosine_similarity_key2.flatten()

#### 2. BERT

In [12]:
# Load pre-trained model tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to generate embeddings
def get_bert_embedding(text):
    # Tokenize input
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # Get embeddings
    with torch.no_grad():
        outputs = model(**inputs)
    # Pool the outputs into a single mean vector
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

In [13]:
keyword_embedding_1 = get_bert_embedding(keyword1)
keyword_embedding_2 = get_bert_embedding(keyword2)

job_titles = data['job_title']
job_title_embeddings = torch.stack([get_bert_embedding(title) for title in job_titles])

In [14]:
# Compute cosine similarity
cosine_similarity_bert_1 = [1 - cosine(keyword_embedding_1.numpy().flatten(), embedding.numpy().flatten()) for embedding in job_title_embeddings]
cosine_similarity_bert_2 = [1 - cosine(keyword_embedding_2.numpy().flatten(), embedding.numpy().flatten()) for embedding in job_title_embeddings]

# Flatten the cosine similarity scores to a 1D array and add as new column
data['fit_bert_key1'] = cosine_similarity_bert_1
data['fit_bert_key2'] = cosine_similarity_bert_2

In [15]:
# Calculate the Average cosine similiarity
data["avg_fit"] = data[["fit_sentence_transformer_key1", "fit_sentence_transformer_key2", "fit_bert_key1", "fit_bert_key2"]].mean(axis=1)

The initial rank is obtained from the average cosine similiarity of the keywords obtained through BERT and Sentence Transformer

In [16]:
data['initial_rank'] = data['avg_fit'].rank(method='dense', ascending=False).astype(int)

In [17]:
data.head()

Unnamed: 0,id,job_title,connection,fit_sentence_transformer_key1,fit_sentence_transformer_key2,fit_bert_key1,fit_bert_key2,avg_fit,initial_rank
0,1,2019 CT Bauer College of Business Graduate Mag...,85,0.550054,0.425106,0.647348,0.562101,0.546152,33
1,2,Native English Teacher at EPIK English Program...,501,0.211303,0.191841,0.556558,0.542762,0.375616,47
2,3,Aspiring Human Resources Professional,44,0.949807,0.7727,0.902632,0.794786,0.854981,3
3,4,People Development Coordinator at Ryan,501,0.380222,0.377238,0.752288,0.738064,0.561953,32
4,5,Advisory Board Member at Celal Bayar University,501,0.230512,0.231743,0.430944,0.457767,0.337742,51


To simulate starring candidates and re-ranking, 15 candidates are starred as 'Fit'.

In [18]:
data['starred'] = 0

indices_to_star = [2, 6, 9, 11, 27, 65, 68, 72, 73, 78, 81, 88, 96, 99, 100]
data.loc[indices_to_star, 'starred'] = 1

In [19]:
data.head()

Unnamed: 0,id,job_title,connection,fit_sentence_transformer_key1,fit_sentence_transformer_key2,fit_bert_key1,fit_bert_key2,avg_fit,initial_rank,starred
0,1,2019 CT Bauer College of Business Graduate Mag...,85,0.550054,0.425106,0.647348,0.562101,0.546152,33,0
1,2,Native English Teacher at EPIK English Program...,501,0.211303,0.191841,0.556558,0.542762,0.375616,47,0
2,3,Aspiring Human Resources Professional,44,0.949807,0.7727,0.902632,0.794786,0.854981,3,1
3,4,People Development Coordinator at Ryan,501,0.380222,0.377238,0.752288,0.738064,0.561953,32,0
4,5,Advisory Board Member at Celal Bayar University,501,0.230512,0.231743,0.430944,0.457767,0.337742,51,0


### Re-Ranking using LightGBM

For re-ranking, LightGBM Ranker is used here with the cosine similiarity features and connection.

In [20]:
features = ['connection', 'fit_sentence_transformer_key1', 'fit_sentence_transformer_key2', 'fit_bert_key1', 'fit_bert_key2','initial_rank']
X = data[features]
y = data['starred']

In [57]:
from sklearn.model_selection import train_test_split

# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=18)

In [58]:
import lightgbm as lgb

# Group information indicating all instances belong to the same group
train_group = [X_train.shape[0]]
test_group = [X_test.shape[0]]

# Convert data to LightGBM dataset format
train_data = lgb.Dataset(X_train, label=y_train, group=train_group)
test_data = lgb.Dataset(X_test, label=y_test, group=test_group, reference=train_data)

# Parameters for RankNet (using lambdarank in LightGBM)
params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',  # Normalized Discounted Cumulative Gain
    'learning_rate': 0.05,
    'num_leaves': 31,
    'verbose': -1
}

# Train the model
gbm = lgb.train(params,
                train_data,
                num_boost_round=100,
                valid_sets=[test_data])


In [59]:
from sklearn.metrics import ndcg_score

# Perform prediction on test data
y_pred = gbm.predict(X_test)

# The predictions (y_pred) and true scores (y_test) should be reshaped if they are 1D
y_test_reshaped = y_test.values.reshape(1, -1)
y_pred_reshaped = y_pred.reshape(1, -1)

# Calculate NDCG
ndcg = ndcg_score(y_test_reshaped, y_pred_reshaped)

print(f"NDCG Score: {ndcg}")


NDCG Score: 0.8519590445170675


#### New Ranking

The new ranking predicted with the LightGBM model is shown below for each candidate

In [60]:
y_pred_data = gbm.predict(X)

ranking_df = pd.DataFrame({
    'CandidateID': data['id'],  # An identifier for each candidate
    'Job Title' : data['job_title'],
    'Score': y_pred_data
})

# Sort the dataframe by 'Score' in descending order to obtain rankings
ranking_df = ranking_df.sort_values(by='Score', ascending=False).reset_index(drop=True)

# Optionally, add a 'Rank' column to explicitly show the ranking
ranking_df['Rank'] = ranking_df.index + 1

In [62]:
ranking_df.head()

Unnamed: 0,CandidateID,Job Title,Score,Rank
0,28,Seeking Human Resources Opportunities,1.026183,1
1,97,Aspiring Human Resources Professional,0.998986,2
2,10,Seeking Human Resources Human ResourceIS and G...,0.989268,3
3,74,Human Resources Professional,0.951133,4
4,3,Aspiring Human Resources Professional,0.943098,5
