In [1]:
# Move two directories up so that the `utils.*` imports and the relative
# `data/...` path used below resolve (presumably the repository root — confirm).
# NOTE: the path is relative, so re-running this cell without restarting the
# kernel moves up two more levels.
%cd ../../

/Users/riccardotedoldi/Desktop/projectHMD/Jobify


In [2]:
import utils.config as cfg
import utils.utils as utils
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import rbf_kernel, linear_kernel, cosine_similarity, sigmoid_kernel, euclidean_distances, manhattan_distances, cosine_distances

import torch
from transformers import AutoTokenizer, AutoModel

# Device taken from the project config; the encoder model and its tokenized
# inputs are moved onto it further down.
device = cfg.device

## Load the data

In [3]:
# Load the data: read the gzip-compressed CSV of user profiles into `df`
# and preview its first 10 rows.
PATH_DATA = 'data/KB_user_profiles.csv.gz'
df = pd.read_csv(PATH_DATA, compression='gzip')
df.head(10)

Unnamed: 0,name,surname,gender,mansion,category,category_id,description,salary,skills,city,country,remote_hybrid_office,relocation,current_company,part_full_time,junior_senior,experience_years,level_education
0,Alice,Smith,0,Healthcare Information Technology (IT) Specialist,healthcare,7,"I am Alice Smith, a Healthcare Information Tec...",1968,"Cross-functional Collaboration, Complexity Man...",Nairobi,Kenya,1,1,HealthTechInnovations,0,0,11,1
1,Alice,Johnson,0,Logistics Manager,infrastructure,22,"Presenting myself as Alice Johnson, I hold the...",3608,"Systems Awareness, Social Entrepreneurship, In...",Havana,Cuba,0,1,PinnacleLogix Logistics,1,0,3,0
2,Alice,Williams,0,Content Marketing Manager,media,5,"I go by Alice Williams, occupying the Content ...",3465,"Negotiation Skills, Conflict Resolution, Futur...",Riyadh,Saudi Arabia,2,0,ConnectSphereTelecommunications,1,0,7,1
3,Alice,Jones,0,Drummer,music,17,"A Drummer known by the name Alice Jones, I am ...",4978,"Conflict Management, Attention to Detail, Tole...",Lima,Peru,2,1,ConnectXTelecom Telecommunications,1,1,18,0
4,Alice,Brown,0,Novelist,arts,9,"I am Alice Brown, a Novelist creative UX/UI de...",1603,"Problem Sensitivity, Resilient Leadership, Cog...",Sydney,Australia,2,0,GreenTechInnovations,0,1,3,0
5,Alice,Davis,0,Actress,arts,9,"Alice Davis, residing in the Actress of cybers...",5702,"Social Entrepreneurship, Conflict Transformati...",Manama,Bahrain,1,0,BlueSkyDigital Media,0,1,5,1
6,Alice,Miller,0,Nurse Manager,healthcare,7,"I am Alice Miller, a Nurse Manager dynamic pro...",6567,"Cross-Cultural Competence, Conflict Transforma...",Kyiv,Ukraine,1,0,UnitedManufacture Systems,1,0,20,0
7,Alice,Wilson,0,Medical Records Clerk,healthcare,7,"I am Alice Wilson, a Medical Records Clerk met...",832,"Relationship Building, Conflict Resolution, Co...",Wellington,New Zealand,1,1,DataQuestAnalytics,1,0,19,3
8,Alice,Moore,0,Chef de Cuisine,hospitality,0,"Alice Moore, residing in the Chef de Cuisine o...",3589,"Time Management, Autonomous Decision-Making, I...",Mumbai,India,2,0,Apex Financial Services,1,1,20,2
9,Alice,Taylor,0,IT Help Desk Technician,technology,18,"I am Alice Taylor, a IT Help Desk Technician i...",5904,"Interdisciplinary Collaboration, Analytical Th...",Tokyo,Japan,1,0,InnovateHealth Pharmaceuticals,0,1,8,1


In [7]:
# List the column labels of the loaded frame.
df.keys()

Index(['name', 'surname', 'gender', 'mansion', 'category', 'category_id',
       'description', 'salary', 'skills', 'city', 'country',
       'remote_hybrid_office', 'relocation', 'current_company',
       'part_full_time', 'junior_senior', 'experience_years',
       'level_education'],
      dtype='object')

## Check the sentences category
Use a pre-trained model to predict the category to which a given text belongs.

In [46]:
# Load the tokenizer and the encoder weights named by cfg.NAME_MODEL_TEXT_ENCODER.
tokenizer = AutoTokenizer.from_pretrained(cfg.NAME_MODEL_TEXT_ENCODER)
model = AutoModel.from_pretrained(cfg.NAME_MODEL_TEXT_ENCODER)
# Inference only: put the module in evaluation mode (e.g. disables dropout).
model.eval()
# Move the weights to the configured device; encode_text moves its inputs there too.
model.to(device)

0.0

In [64]:
# Keep the category table in its own frame. Rebinding `df` here would discard
# the user-profile table loaded earlier, and the TF-IDF cells further down
# still read `df['description']` (KeyError on Restart & Run All otherwise).
categories_df = pd.DataFrame(cfg.CATEGORY_MANSIONS, columns=['mansions'])

def encode_text(text, categories):
    """Embed the input texts followed by the category labels.

    Every string is tokenized on its own (truncated to at most 128 tokens),
    moved to `device`, and passed through `model` as a batch of one, with
    gradients disabled.

    Args:
        text: list of input strings.
        categories: list of category strings; must be concatenable with
            `text` via `+`.

    Returns:
        A list of 1-D numpy arrays, one per string of `text + categories`
        in that order. Each array is the last hidden state at the first
        token position (used here as the CLS embedding).
    """
    sentences = list(text + categories)
    input_ids = [
        tokenizer.encode(desc, return_tensors='pt', max_length=128, truncation=True).to(device)
        for desc in sentences
    ]
    with torch.no_grad():
        out = [model(x) for x in input_ids]
    # Batch index 0, token position 0 -> the CLS-position vector, moved back to the CPU.
    return [x.last_hidden_state[0, 0, :].cpu().numpy() for x in out]

out = encode_text(['Fo Software Developer with web development skills.'], cfg.CATEGORY_MANSIONS)

# The first embedding belongs to the query (reshaped to one row); the rest are the categories.
input_vector = out[0][None, :]
enc_categories = out[1:]

# Distance calculation. NOTE: despite the variable and column names, these are
# cosine *distances* (1 - cosine similarity), so smaller means closer.
cosine_similarities = cosine_distances(input_vector, enc_categories).flatten()

# Ranking: ascending order puts the closest category first.
categories_df['similarity'] = cosine_similarities
ranked_profiles = categories_df.sort_values(by='similarity', ascending=True)

print(ranked_profiles[['mansions', 'similarity']])


          mansions  similarity
10      technology    0.142188
21         cousine    0.145065
0            other    0.147258
24      industrial    0.153603
6          finance    0.153920
8   infrastructure    0.156101
18    recreational    0.161155
23      healthcare    0.162522
20         fashion    0.166947
11    construction    0.167272
5            media    0.167846
3          science    0.168618
9      agriculture    0.169594
1           mining    0.170164
12      government    0.171572
13     hospitality    0.171729
25       education    0.173395
17            arts    0.173453
7            legal    0.179185
14  transportation    0.181558
22           music    0.183619
4        religious    0.185791
16     residential    0.187121
2       commercial    0.188450
15   manufacturing    0.189815
19        military    0.190946


## Generate TF-IDF vectors and compute similarity

- TfidfVectorizer ngram_range=(1,2), max_features=1000 # 12.9 sec # 0.48 sim
- TfidfVectorizer ngram_range=(1,2), max_features=1000 # 26.9 sec # 0.47 sim
- TfidfVectorizer ngram_range=(1,4), max_features=256 # 26.9 sec # 0.78 sim
- TfidfVectorizer max_features=1000 # 6.9 sec # 0.49 sim
- TfidfVectorizer max_features=256 # 6.9 sec # 0.61 sim
- TfidfVectorizer max_features=64 # 6.9 sec # 0.28 sim
- TfidfVectorizer ngram_range=(1,4), max_features=128 # 28 sec # 0.88 sim
- TfidfVectorizer ngram_range=(1,2), max_features=128 # 12 sec # 0.87 sim
- TfidfVectorizer ngram_range=(1,4), max_features=64 # 26 sec # 0.29 sim

In [83]:
# Free-text query to match against the profile descriptions
input_text = 'Looking for a software developer with web development skills.'

# Fit TF-IDF on the descriptions (missing values are treated as empty strings)
vectorizer = TfidfVectorizer(stop_words='english')
descriptions = df['description'].fillna('')
tfidf_matrix = vectorizer.fit_transform(descriptions)

# Project the query into the same TF-IDF space
input_vector = vectorizer.transform([input_text])

# Score every profile against the query (one score per row of tfidf_matrix)
cosine_similarities = linear_kernel(input_vector, tfidf_matrix).flatten()

# Attach the scores and order the profiles from best to worst match
df['similarity'] = cosine_similarities
ranked_profiles = df.sort_values('similarity', ascending=False)

preview_columns = ['name', 'description', 'similarity']
print(ranked_profiles[preview_columns].head(10))

           name                                        description  similarity
12347    George  I am George Richards, a Software Architect ded...    0.491600
17291     Grace  I am Grace Cruz, a Web Designer versatile soft...    0.439774
63278    Yvonne  I am Yvonne Price, a Web Developer versatile s...    0.434255
72613     Sylas  I am Sylas Clark, a Environmental Consultant r...    0.422023
9349   Patricia  I am Patricia Greene, a Web Designer meticulou...    0.417270
17194     Grace  I am Grace Robinson, a Web Developer innovativ...    0.413118
55061    Justin  I am Justin Hamilton, a Front-End Developer ad...    0.411762
86980   Leander  I am Leander Greene, a Operations Research Ana...    0.410482
56986      Sean  I am Sean Austin, a Psychiatrist dynamic softw...    0.407616
17141     Felix  I am Felix Peters, a Front of House Manager pr...    0.407371


In [4]:
# Fresh, unfitted TF-IDF vectorizer (English stop words removed) handed to the project helper.
vectorizer = TfidfVectorizer(stop_words='english') 

# Rank the profiles in `df` against the query. Presumably the helper fits the
# vectorizer on the profile text and scores with the given kernel — TODO confirm
# against utils.get_sorted_profiles.
top_k = utils.get_sorted_profiles('Looking for an artist who draws.', df, vectorizer, kernel=linear_kernel)

# Show the 10 best matches (name, surname, mansion, description per the recorded output).
utils.get_topK_name_surname_mansion_description(top_k, topK=10)

['I am Henry Brown, a Artist tenacious graphic designer. With a adaptive fervor for content creation, I bring 8 years of experience to the technology domain. Recognized for my resourceful project coordination, I excel in risk management and consistently deliver innovative solutions.'
 'I am Erin Arnold, a Artist proactive financial analyst. With a dedicated fervor for user interface design, I bring 30 years of experience to the hospitality domain. Recognized for my versatile effective communication, I excel in analyzing market trends and consistently deliver innovative solutions.'
 'I am Cassius Wright, a Artist dedicated marketing specialist. With a meticulous fervor for content creation, I bring 8 years of experience to the technology domain. Recognized for my resourceful innovative thinking, I excel in designing visually appealing graphics and consistently deliver customer satisfaction.'
 'I am Oberon Wallace, a Artist resourceful financial analyst. With a versatile fervor for strat

Unnamed: 0,name,surname,mansion,description
22675,Henry,Brown,Artist,"I am Henry Brown, a Artist tenacious graphic d..."
27190,Erin,Arnold,Artist,"I am Erin Arnold, a Artist proactive financial..."
74226,Cassius,Wright,Artist,"I am Cassius Wright, a Artist dedicated market..."
70865,Oberon,Wallace,Artist,"I am Oberon Wallace, a Artist resourceful fina..."
78036,Calypso,Snyder,Artist,"I am Calypso Snyder, a Artist adaptive UX/UI d..."
94232,Galadriel,Harrison,Artist,"I am Galadriel Harrison, a Artist meticulous c..."
50393,Megan,White,Artist,"I am Megan White, a Artist adaptive cybersecur..."
43934,Emma,Peters,Artist,"I am Emma Peters, a Artist versatile financial..."
43718,Daniel,Oliver,Artist,"I am Daniel Oliver, a Artist dynamic cybersecu..."
37268,Xavier,Willis,Artist,"I am Xavier Willis, a Artist innovative cybers..."
