In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/textsearch/potential-talents - Aspiring human resources - seeking human resources.csv


In [2]:
import IPython
IPython.core.display._iopub_max_chars = 1000000
import gensim.downloader as api
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.datasets import load_svmlight_file
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score

In [3]:
from gensim.models import Word2Vec

# load dataset into a pandas DataFrame
df = pd.read_csv('/kaggle/input/textsearch/potential-talents - Aspiring human resources - seeking human resources.csv')

# train a Word2Vec model on the `job_title` column
sentences = [job_title.split() for job_title in df['job_title']]
model = Word2Vec(sentences, min_count=1)

# get the similarity score for each job title
scores = []
for job_title in df['job_title']:
    tokens = job_title.split()
    if len(tokens) == 0:
        scores.append(0.0)
    else:
        scores.append(model.wv.n_similarity(tokens, ['aspiring', 'human', 'resources']))

# add the scores as a new column in the DataFrame
df['word2vec'] = scores

# filter the DataFrame to show only rows with similarity > 0
similar_jobs = df[df['word2vec'] > 0]

# print the filtered DataFrame
print(similar_jobs)
print(similar_jobs.count())

      id                                          job_title  \
0      1  2019 C.T. Bauer College of Business Graduate (...   
1      2  Native English Teacher at EPIK (English Progra...   
2      3              Aspiring Human Resources Professional   
4      5    Advisory Board Member at Celal Bayar University   
5      6                Aspiring Human Resources Specialist   
..   ...                                                ...   
99   100  Aspiring Human Resources Manager | Graduating ...   
100  101              Human Resources Generalist at Loparex   
101  102   Business Intelligence and Analytics at Travelers   
102  103                     Always set them up for Success   
103  104   Director Of Administration at Excellence Logging   

                                location connection  fit  word2vec  
0                         Houston, Texas         85  NaN  0.357872  
1                                 Kanada      500+   NaN  0.216646  
2    Raleigh-Durham, North Carolina 

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# create a TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# fit the vectorizer on the job_title column
vectorizer.fit(df['job_title'])

# transform the job_title column into a TF-IDF matrix
tfidf_matrix = vectorizer.transform(df['job_title'])

# compute the cosine similarity between each row and the target phrase
target_phrase = 'aspiring human resources'
target_tfidf = vectorizer.transform([target_phrase])
similarity_scores = tfidf_matrix.dot(target_tfidf.T).toarray().flatten()

# add the scores as a new column in the DataFrame
df['tfidf'] = similarity_scores

# filter the DataFrame to show only rows with similarity > 0
similar_jobs = df[df['tfidf'] > 0]

# print the filtered DataFrame
print(similar_jobs)

      id                                          job_title  \
0      1  2019 C.T. Bauer College of Business Graduate (...   
2      3              Aspiring Human Resources Professional   
5      6                Aspiring Human Resources Specialist   
6      7  Student at Humber College and Aspiring Human R...   
8      9  Student at Humber College and Aspiring Human R...   
..   ...                                                ...   
93    94  Seeking Human  Resources Opportunities. Open t...   
96    97              Aspiring Human Resources Professional   
98    99                   Seeking Human Resources Position   
99   100  Aspiring Human Resources Manager | Graduating ...   
100  101              Human Resources Generalist at Loparex   

                                location connection  fit  word2vec     tfidf  
0                         Houston, Texas         85  NaN  0.357872  0.265312  
2    Raleigh-Durham, North Carolina Area         44  NaN  0.126447  0.753591  
5     

In [5]:
from transformers import AutoTokenizer, AutoModel
import torch

# load a pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# define a function to compute the similarity score between two sentences using BERT
def bert_similarity(sentences):
    # tokenize the input sentences
    inputs = tokenizer.encode_plus(sentences[0], sentences[1], add_special_tokens=True, return_tensors='pt')
    # get the token embeddings from the BERT model
    outputs = model(**inputs)
    embeddings = outputs.last_hidden_state[0]
    # compute the cosine similarity between the token embeddings
    cosine_similarities = torch.nn.functional.cosine_similarity(embeddings[0], embeddings[1], dim=0)
    return cosine_similarities.item()

# compute the similarity score between each job title and the target phrase using BERT
target_phrase = 'aspiring human resources'
scores = []
for job_title in df['job_title']:
    score = bert_similarity([job_title, target_phrase])
    scores.append(score)
    
# add the scores as a new column in the DataFrame
df['bert'] = scores

# filter the DataFrame to show only rows with similarity > 0
similar_jobs = df[df['bert'] > 0]

# print the filtered DataFrame
print(similar_jobs)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


      id                                          job_title  \
0      1  2019 C.T. Bauer College of Business Graduate (...   
1      2  Native English Teacher at EPIK (English Progra...   
2      3              Aspiring Human Resources Professional   
3      4             People Development Coordinator at Ryan   
4      5    Advisory Board Member at Celal Bayar University   
..   ...                                                ...   
99   100  Aspiring Human Resources Manager | Graduating ...   
100  101              Human Resources Generalist at Loparex   
101  102   Business Intelligence and Analytics at Travelers   
102  103                     Always set them up for Success   
103  104   Director Of Administration at Excellence Logging   

                                location connection  fit  word2vec     tfidf  \
0                         Houston, Texas         85  NaN  0.357872  0.265312   
1                                 Kanada      500+   NaN  0.216646  0.000000   
2  

In [6]:
import gensim.downloader as api

# load the pre-trained GloVe model
model = api.load('glove-wiki-gigaword-100')


# define a function to compute the average embedding of a sentence
def get_average_embedding(sentence):
    # split the sentence into words
    words = sentence.split()
    # get the embeddings for each word and compute the average
    embeddings = [model[word] for word in words if word in model]
    if len(embeddings) == 0:
        return np.zeros(model.vector_size)
    else:
        return np.mean(embeddings, axis=0)

# define the target phrase
target_phrase = 'Aspiring human resources'

# compute the average embedding of the target phrase
target_embedding = get_average_embedding(target_phrase)

# compute the similarity score between the target phrase and each job title in the DataFrame
scores = []
for job_title in df['job_title']:
    job_title_embedding = get_average_embedding(job_title)
    norm_product = np.linalg.norm(job_title_embedding) * np.linalg.norm(target_embedding)
    if norm_product == 0:
        score = 0
    else:
        score = np.dot(job_title_embedding, target_embedding) / norm_product
    scores.append(score)

# add the scores as a new column in the DataFrame
df['glove'] = scores

# filter the DataFrame to show only rows with similarity > 0
similar_jobs = df[df['glove'] > 0]

# print the filtered DataFrame
print(similar_jobs)

      id                                          job_title  \
0      1  2019 C.T. Bauer College of Business Graduate (...   
1      2  Native English Teacher at EPIK (English Progra...   
3      4             People Development Coordinator at Ryan   
4      5    Advisory Board Member at Celal Bayar University   
6      7  Student at Humber College and Aspiring Human R...   
..   ...                                                ...   
99   100  Aspiring Human Resources Manager | Graduating ...   
100  101              Human Resources Generalist at Loparex   
101  102   Business Intelligence and Analytics at Travelers   
102  103                     Always set them up for Success   
103  104   Director Of Administration at Excellence Logging   

                                location connection  fit  word2vec     tfidf  \
0                         Houston, Texas         85  NaN  0.357872  0.265312   
1                                 Kanada      500+   NaN  0.216646  0.000000   
3  

In [7]:
# export the modified DataFrame to a new CSV file
df.to_csv('modified_dataset.csv', index=False)