In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/textsearch/potential-talents - Aspiring human resources - seeking human resources.csv


In [2]:
import re

# Load the data
data = pd.read_csv('/kaggle/input/textsearch/potential-talents - Aspiring human resources - seeking human resources.csv')

# Define the keywords
keywords = ['aspiring human resources', 'seeking human resources']

# Fill the "fit" column based on the keywords
for index, row in data.iterrows():
    job_title = row['job_title'].lower()
    for keyword in keywords:
        if re.search(keyword, job_title):
            data.at[index, 'fit'] = 1
            break
    else:
        data.at[index, 'fit'] = 0

# Save the data
data.to_csv('job_titles_with_fit.csv', index=False)
print(data)

# Filter the data based on the column and value
filtered_data = data.loc[data['fit'] == 1]

# Print the filtered data
print(filtered_data.count())

      id                                          job_title  \
0      1  2019 C.T. Bauer College of Business Graduate (...   
1      2  Native English Teacher at EPIK (English Progra...   
2      3              Aspiring Human Resources Professional   
3      4             People Development Coordinator at Ryan   
4      5    Advisory Board Member at Celal Bayar University   
..   ...                                                ...   
99   100  Aspiring Human Resources Manager | Graduating ...   
100  101              Human Resources Generalist at Loparex   
101  102   Business Intelligence and Analytics at Travelers   
102  103                     Always set them up for Success   
103  104   Director Of Administration at Excellence Logging   

                                location connection  fit  
0                         Houston, Texas         85  1.0  
1                                 Kanada      500+   0.0  
2    Raleigh-Durham, North Carolina Area         44  1.0  
3      

In [3]:
import IPython
IPython.core.display._iopub_max_chars = 1000000
import gensim.downloader as api
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.datasets import load_svmlight_file
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score

In [4]:
# Define the search keywords
search_keywords = ['aspiring human resources', 'seeking human resources']

# Load the word2vec model
model = api.load('word2vec-google-news-300')

# Preprocess the data
data = data.dropna() # remove missing values
data['job_title'] = data['job_title'].apply(lambda x: x.lower()) # convert to lowercase

# Calculate the embeddings for each search keyword
search_embeddings = [np.mean([model.get_vector(word) for word in search_keyword.split() if word in model.key_to_index] or [np.zeros(300)], axis=0) for search_keyword in search_keywords]

# Calculate the similarity scores for each job title
similarity_scores = np.zeros((len(data), len(search_keywords)))
for j in range(len(search_keywords)):
    embeddings = np.array([np.mean([model.get_vector(word) for word in job_title.split() if word in model.key_to_index] or [np.zeros(300)], axis=0) for job_title in data['job_title']])
    similarity_scores[:, j] = np.array([cosine_similarity(embeddings[i].reshape(1, -1), search_embeddings[j].reshape(1, -1)) for i in range(len(data))]).flatten()

# Fill the fit column in the range from 0 to 1 based on the similarity scores
max_similarity_scores = np.max(similarity_scores, axis=1).reshape(-1, 1)
min_similarity_scores = np.min(similarity_scores, axis=1).reshape(-1, 1)
data['fit'] = (similarity_scores - min_similarity_scores) / (max_similarity_scores - min_similarity_scores)

# Define the input and output data for the learning to rank model
X = data['fit'].values.reshape(-1, 1)
y = np.arange(len(data))

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a logistic regression model on the training set
model = LogisticRegression()
model.fit(X_train, y_train)

# Get the predicted relevance scores for the candidates
relevance_scores = model.predict_proba(X)[:, 1]

# Add the predicted relevance scores to the dataset
data['relevance_score'] = relevance_scores

# Sort the dataset by the predicted relevance scores in descending order
sorted_data = data.sort_values(by=['relevance_score'], ascending=False)

# Print the top 10 candidates
for i in range(10):
    candidate = sorted_data.iloc[i]
    print(f'Candidate {i+1}:')
    print(f'  ID: {candidate["id"]}')
    print(f'  Job Title: {candidate["job_title"]}')
    print(f'  Location: {candidate["location"]}')
    print(f'   Connections: {candidate["connection"]}')
    print(f'   Fit: {candidate["fit"]}')

Candidate 1:
  ID: 1
  Job Title: 2019 c.t. bauer college of business graduate (magna cum laude) and aspiring human resources professional
  Location: Houston, Texas
   Connections: 85
   Fit: 1.0
Candidate 2:
  ID: 52
  Job Title: student at humber college and aspiring human resources generalist
  Location: Kanada
   Connections: 61
   Fit: 1.0
Candidate 3:
  ID: 90
  Job Title: undergraduate research assistant at styczynski lab
  Location: Greater Atlanta Area
   Connections: 155
   Fit: 1.0
Candidate 4:
  ID: 72
  Job Title: business management major and aspiring human resources manager
  Location: Monroe, Louisiana Area
   Connections: 5
   Fit: 1.0
Candidate 5:
  ID: 37
  Job Title: student at humber college and aspiring human resources generalist
  Location: Kanada
   Connections: 61
   Fit: 1.0
Candidate 6:
  ID: 39
  Job Title: student at humber college and aspiring human resources generalist
  Location: Kanada
   Connections: 61
   Fit: 1.0
Candidate 7:
  ID: 41
  Job Title: s

In [5]:
print(sorted_data)

      id                                          job_title  \
0      1  2019 c.t. bauer college of business graduate (...   
51    52  student at humber college and aspiring human r...   
89    90  undergraduate research assistant at styczynski...   
71    72  business management major and aspiring human r...   
36    37  student at humber college and aspiring human r...   
..   ...                                                ...   
63    64  svp, chro, marketing & communications, csr off...   
64    65  human resources coordinator at intercontinenta...   
67    68            human resources specialist at luxottica   
68    69  director of human resources north america, gro...   
103  104   director of administration at excellence logging   

                                location connection  fit  relevance_score  
0                         Houston, Texas         85  1.0         0.013857  
51                                Kanada         61  1.0         0.013857  
89             