In [0]:
from pyspark.sql.types import *
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import pandas as pd

pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

spark = SparkSession.builder.getOrCreate()

In [0]:
%pip install --upgrade openai

Python interpreter will be restarted.
Python interpreter will be restarted.


# Load models that we have done

In [0]:
from pyspark.ml.clustering import KMeansModel

model_path = "dbfs:/FileStore/shared_uploads/kinani@campus.technion.ac.il/kmeans_model"

kmeans_model = KMeansModel.load(model_path)

In [0]:
centers = kmeans_model.clusterCenters()
len(centers)

Out[2]: 50

In [0]:
from joblib import dump, load

knn_loaded = load('/dbfs/FileStore/shared_uploads/kinani@campus.technion.ac.il/knn_model.joblib')

# Load Data

In [0]:
df_posts_preds = spark.read.parquet("dbfs:/FileStore/shared_uploads/kinani@campus.technion.ac.il/posts_predictions.parquet")

In [0]:
df_users_knn_preds = spark.read.parquet("dbfs:/FileStore/shared_uploads/kinani@campus.technion.ac.il/user_kmeans_knn_embeds-2.parquet")

In [0]:
df_val = spark.read.parquet("dbfs:/FileStore/shared_uploads/kinani@campus.technion.ac.il/validation_set.parquet")

In [0]:
import ast
from pyspark.sql.types import MapType, LongType, StructType, StructField, StringType, ArrayType, DoubleType
from pyspark.sql.functions import explode, col, size


def extract_list(dict_str):
    try:
        return ast.literal_eval(dict_str)
    except (SyntaxError, ValueError):
        dict_ = []
        return dict_
    
extract_dict_udf = udf(extract_list, ArrayType(DoubleType()))
df_users_knn_preds = df_users_knn_preds.withColumn('embeddings', extract_dict_udf(col('embeddings')))

# Match posts to the validation set

In [0]:
from pyspark.sql.functions import explode, col, size
import numpy as np
from scipy.spatial.distance import euclidean

ids = [id[0] for id in df_val.select('id').collect()]
ori_pos = [pos[0] for pos in df_val.select('original_pos').collect()]

val_df_full = df_users_knn_preds.filter(col('id').isin(ids))
ids = [id[0] for id in val_df_full.select('id').collect()]
ori_pos = [pos[0] for pos in val_df_full.select('position').collect()]

train_df_full = df_users_knn_preds.filter(~col('id').isin(ids))

embeddings = [embedding[0] for embedding in val_df_full.select('embeddings').collect()]
new_ids = {}
new_pos = {}
for i in range(len(embeddings)):
  pred_cluster = knn_loaded.predict([embeddings[i]])
  temp = df_users_knn_preds.filter(col('knn_prediction') == int(pred_cluster))
  embeddings_train = [np.array(embedding[0]) for embedding in temp.select('embeddings').collect()]
  l = np.mean(embeddings_train,axis=0)
  dis_list = []
  for center in centers:
      dis_list.append(euclidean(l, center))
  fit_posts_cluster = np.argmin(dis_list)
  if i == 0:
    fitted_posts = df_posts_preds.filter(col('prediction') == int(fit_posts_cluster)).select('Title', 'post_link', 'position')
    count = fitted_posts.count()
    if count > 5:
      fitted_posts = fitted_posts.limit(5)
    new_ids[ids[i]] = fitted_posts.count()
    new_pos[ids[i]+ '+' +ori_pos[i]] = fitted_posts.count()
  else:
    y = df_posts_preds.filter(col('prediction') == int(fit_posts_cluster)).select('Title', 'post_link', 'position')
    count = y.count()
    if count > 5:   
      y = y.limit(5)   
      fitted_posts = fitted_posts.union(y)
      new_ids[ids[i]] = 5
      new_pos[ids[i]+ '+' +ori_pos[i]] = 5
    else:
      fitted_posts = fitted_posts.union(y)
      new_ids[ids[i]] = count
      new_pos[ids[i]+ '+' +ori_pos[i]] = count

In [0]:
ids = []
pos = []
for id, value in new_ids.items():
    ids += [id]*value

for position, value in new_pos.items():
    pos += [position.split('+')[1]]*value

titles = [title[0] for title in fitted_posts.select('Title').collect()]
post_links = [post_link[0] for post_link in fitted_posts.select('post_link').collect()]
positions = [position[0] for position in fitted_posts.select('position').collect()]

data = list(zip(ids, pos, titles, post_links, positions))
result = spark.createDataFrame(data, schema=['id', 'Original Position', 'Title', 'post_link', 'position'])

In [0]:
result.display()

id,Original Position,Title,post_link,position
jason-perez-lcsw-23a57878,Psychotherapist,General Manager position open,https://www.linkedin.com/pulse/general-manager-position-open-krys-schroeder?trk=public_profile_article_view,General Manager
jason-perez-lcsw-23a57878,Psychotherapist,Hiring for Entry Level Sales Position.,https://www.linkedin.com/pulse/hiring-entry-level-sales-position-matt-jones?trk=public_profile_article_view,Junior Partner
jason-perez-lcsw-23a57878,Psychotherapist,We are hiring--Parts Consultant/Service Manager/Service Technician,https://www.linkedin.com/pulse/we-hiring-parts-consultantservice-managerservice-ken-vance-motors?trk=public_profile_article_view,Service Manager
jason-perez-lcsw-23a57878,Psychotherapist,Job Opening: Engineering Manager,https://www.linkedin.com/pulse/job-opening-engineering-manager-katelin-moore?trk=public_profile_article_view,Engineering Manager
jason-perez-lcsw-23a57878,Psychotherapist,Bath and Body Works now Hiring Leadership Positions,https://www.linkedin.com/pulse/bath-body-works-now-hiring-leadership-positions-kristina-dedivanaj?trk=public_profile_article_view,assistant manager/Co-Manager
bryanna-penaloza-8b8974227,Ultrasound Technologist at Next Generation Prenatal Imaging,Full Time Java Architects/Java Developers Required!!!,https://www.linkedin.com/pulse/full-time-java-architectsjava-developers-required-chris-jones?trk=public_profile_article_view,Full Time Java Developers
bryanna-penaloza-8b8974227,Ultrasound Technologist at Next Generation Prenatal Imaging,JAVA DEVELOPER - Positions all round the year,https://www.linkedin.com/pulse/java-developer-positions-all-round-year-chetan-sindagi?trk=public_profile_article_view,JAVA DEVELOPER
bryanna-penaloza-8b8974227,Ultrasound Technologist at Next Generation Prenatal Imaging,Lyrahealth is Hiring: Java Back End engineers.,https://www.linkedin.com/pulse/lyrahealth-hiring-java-back-end-engineers-albert-baranchuk?trk=public_profile_article_view,Java Back End engineers
bryanna-penaloza-8b8974227,Ultrasound Technologist at Next Generation Prenatal Imaging,Urgent - Required Part time Trainers,https://www.linkedin.com/pulse/urgent-required-part-time-trainers-ashok-kumar-k?trk=public_profile_article_view,Java Weekdays and Weekend Trainer:
bryanna-penaloza-8b8974227,Ultrasound Technologist at Next Generation Prenatal Imaging,Hiring Oracle PL/SQL Developer @ Peoria IL,https://www.linkedin.com/pulse/hiring-oracle-plsql-developer-peoria-il-jani-pasha-shaik-l-i-o-n-?trk=public_profile_article_view,Oracle PL/SQL Developer


# Examine how good our model is:

In [0]:
result = spark.read.parquet("dbfs:/FileStore/shared_uploads/kinani@campus.technion.ac.il/final_res.parquet")
result.display()

id,Original Position,Title,post_link,position
jason-perez-lcsw-23a57878,Psychotherapist,General Manager position open,https://www.linkedin.com/pulse/general-manager-position-open-krys-schroeder?trk=public_profile_article_view,General Manager
jason-perez-lcsw-23a57878,Psychotherapist,Hiring for Entry Level Sales Position.,https://www.linkedin.com/pulse/hiring-entry-level-sales-position-matt-jones?trk=public_profile_article_view,Junior Partner
jason-perez-lcsw-23a57878,Psychotherapist,We are hiring--Parts Consultant/Service Manager/Service Technician,https://www.linkedin.com/pulse/we-hiring-parts-consultantservice-managerservice-ken-vance-motors?trk=public_profile_article_view,Service Manager
jason-perez-lcsw-23a57878,Psychotherapist,Job Opening: Engineering Manager,https://www.linkedin.com/pulse/job-opening-engineering-manager-katelin-moore?trk=public_profile_article_view,Engineering Manager
jason-perez-lcsw-23a57878,Psychotherapist,Bath and Body Works now Hiring Leadership Positions,https://www.linkedin.com/pulse/bath-body-works-now-hiring-leadership-positions-kristina-dedivanaj?trk=public_profile_article_view,assistant manager/Co-Manager
bryanna-penaloza-8b8974227,Ultrasound Technologist at Next Generation Prenatal Imaging,Full Time Java Architects/Java Developers Required!!!,https://www.linkedin.com/pulse/full-time-java-architectsjava-developers-required-chris-jones?trk=public_profile_article_view,Full Time Java Developers
bryanna-penaloza-8b8974227,Ultrasound Technologist at Next Generation Prenatal Imaging,JAVA DEVELOPER - Positions all round the year,https://www.linkedin.com/pulse/java-developer-positions-all-round-year-chetan-sindagi?trk=public_profile_article_view,JAVA DEVELOPER
bryanna-penaloza-8b8974227,Ultrasound Technologist at Next Generation Prenatal Imaging,Lyrahealth is Hiring: Java Back End engineers.,https://www.linkedin.com/pulse/lyrahealth-hiring-java-back-end-engineers-albert-baranchuk?trk=public_profile_article_view,Java Back End engineers
bryanna-penaloza-8b8974227,Ultrasound Technologist at Next Generation Prenatal Imaging,Urgent - Required Part time Trainers,https://www.linkedin.com/pulse/urgent-required-part-time-trainers-ashok-kumar-k?trk=public_profile_article_view,Java Weekdays and Weekend Trainer:
bryanna-penaloza-8b8974227,Ultrasound Technologist at Next Generation Prenatal Imaging,Hiring Oracle PL/SQL Developer @ Peoria IL,https://www.linkedin.com/pulse/hiring-oracle-plsql-developer-peoria-il-jani-pasha-shaik-l-i-o-n-?trk=public_profile_article_view,Oracle PL/SQL Developer


In [0]:
# import the OpenAI Python library for calling the OpenAI API
from openai import OpenAI
import os
key = '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'
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", key))

In [0]:
MODEL = "gpt-3.5-turbo-0125"
"""
we are cleaning the positions column such that positions that are similar like Senior,Junior Software engineer wil be considered the same position so we can ease the job matching process
"""
original_pos = [id[0] for id in result.select("Original Position").collect()]
positions = [position[0] for position in result.select("position").collect()]
answers_list = []
for i in range(len(original_pos)):
    original = original_pos[i]
    position = positions[i]
    try:
        response = client.chat.completions.create(
        model= MODEL,
        response_format={ "type": "text" },
        messages=[
            {"role": "system", "content": "You are a helpful assistant"},
            {"role": "user", "content": "the user have this position: "+ original +"i am suggesting for him this position: " + position +"will he fit the position or not? (provide only yes or no answers)"}
        ]
        )
        answers_list.append(response.choices[0].message.content)
    except:
        answers_list.append('no')

In [0]:
ids = [id[0] for id in result.select("id").collect()]
count = 0
dict_user = {}
for answer, user in zip(answers_list, ids):
    if user not in dict_user.keys():
        dict_user[user] = 0
    if 'yes' in answer.lower():
        dict_user[user] += 1

for user in dict_user.keys():
    if dict_user[user] >= 1:
        count += 1
print(f'Accuracy for the matching model: {count/len(list(dict_user.keys()))}')

Accuracy for the matching model: 0.7739130434782608


We used ChatGPT to calculate the accuracy of out model, we asked him to see if the original position and the suggested position are similar (or the user will fit the suggested position), if the user fit with at least one of the jobs that suggested we count it and then we divided by the count of the number of users.

# Examples

As a test we have this user which have a computer programmer, so we took his embeddings (for his skills) and try our model on it.

In [0]:
sample_list = [[-0.6706933975219727, -0.42842090129852295, -0.510421872138977, -0.4945063292980194, 0.19550849497318268, 1.1977299451828003, 0.7323092818260193, 0.02588106319308281, -0.08655472844839096, 0.11286614090204239, -1.7146906852722168, 0.34664157032966614, -0.21252375841140747, -0.42784157395362854, -0.5012285709381104, 0.13415215909481049, -0.16122323274612427, 0.3724403381347656, -0.18778802454471588, 1.8381671905517578, 0.17887042462825775, 0.44922539591789246, 0.5905948281288147, -0.5668662190437317, 0.9148982763290405, -0.5254251956939697, -0.35749319195747375, -1.063961148262024, 0.5688954591751099, -0.733620285987854, -0.8561392426490784, -0.7014173269271851, -0.1544964760541916, 1.183553695678711, 0.5970229506492615, 0.2638500928878784, 0.49220648407936096, -0.9508345127105713, -0.5862522125244141, 0.4927821457386017, -0.6425691246986389, -0.9544792771339417, 0.8654577136039734, 1.822817325592041, -1.0189638137817383, -0.9332273006439209, 0.6490455865859985, -0.8929405808448792, -0.6282690763473511, -0.05509595572948456, -0.18858931958675385, -0.3116092085838318, 0.20015980303287506, 0.3848048448562622, -0.1600695550441742, -0.4681367576122284, -0.12548016011714935, 0.09133560210466385, -0.4496992230415344, 0.7830198407173157, -0.24506162106990814, -1.680182695388794, 0.2570926547050476, -0.22641684114933014, -0.448573499917984, -0.08252928406000137, -0.1565830558538437, 1.1435072422027588, 0.4230324923992157, 0.005915723275393248, -0.4902113080024719, -0.5577434301376343, 0.5840638279914856, 0.18176552653312683, 0.4270551800727844, 0.7933772802352905, -1.3446415662765503, 0.6914663910865784, -0.5882591605186462, 0.7857420444488525, 0.3810943365097046, -0.7915015816688538, 0.3224917948246002, 0.5636569261550903, -0.6346782445907593, -0.7694112658500671, 1.3532264232635498, 0.4394994080066681, 0.05955122783780098, 1.0237390995025635, 0.06282403320074081, 0.6773216128349304, 0.5415569543838501, 0.8082630634307861, 0.8172611594200134, -1.318649172782898, -1.7530862092971802, 0.7132034301757812, -0.4186774492263794, -0.43020889163017273, -0.6111202836036682, 0.8143481612205505, -0.3681672215461731, -0.22268714010715485, -1.0856276750564575, -1.0090683698654175, 0.007512924261391163, -0.5466165542602539, -0.45647698640823364, -1.1467773914337158, 1.0998704433441162, 0.7022850513458252, 0.07868332415819168, 1.6956406831741333, 0.2950560748577118, -0.53275066614151, -1.0292999744415283, 0.09590861201286316, 0.005544342566281557, -0.006183079909533262, 0.06757470220327377, 0.9113250374794006, -0.2616834342479706, -1.0086617469787598, -0.21128173172473907, -0.6154233813285828, -0.8373727202415466, -0.14452597498893738, 0.0489526130259037, -0.6197733283042908, -0.09811694920063019, -0.6379470229148865, -1.1373000144958496, -0.4057888686656952, -0.4701167047023773, -1.0680999755859375, -0.4024772346019745, -0.34391331672668457, 0.13926783204078674, 0.849109411239624, -0.7231711745262146, 1.3830336332321167, -0.2513340413570404, -0.7594247460365295, 0.029093872755765915, 0.42706501483917236, -0.9618529677391052, -0.24059763550758362, -0.7362045645713806, -0.14806799590587616, 0.7299978733062744, 0.5559238791465759, 1.4572376012802124, -1.0193123817443848, 1.2857905626296997, 0.9910129904747009, -1.0639045238494873, -0.6419322490692139, 0.5261046886444092, -0.19174642860889435, -0.5793922543525696, 0.6166197657585144, -0.6117421388626099, 0.6775218844413757, -0.25443607568740845, -0.4538983106613159, 1.7183769941329956, 1.3199787139892578, -0.15999187529087067, 2.534010410308838, -0.2757120132446289, 1.1027060747146606, 0.7066921591758728, 0.02665880136191845, -1.1340314149856567, -3.243330955505371, 0.47715485095977783, -0.15105067193508148, -1.9128615856170654, -0.6994367837905884, 0.7890516519546509, 0.1348860114812851, 0.5355706214904785, 0.7056477665901184, -0.4776057004928589, 1.9991614818572998, 0.25492021441459656, -0.49879753589630127, -0.020700471475720406, 0.5205522775650024, -0.13151492178440094, 0.24322791397571564, -0.09002704173326492, -0.5339353680610657, -0.6905004382133484, -0.4793427288532257, -0.5753371715545654, 1.0785597562789917, -0.8376231789588928, -0.018610378727316856, 2.3314523696899414, 0.4859733283519745, -0.31899452209472656, 1.0165929794311523, -0.09688716381788254, 0.9779155254364014, -0.4675481617450714, 0.2399086356163025, 0.1151476576924324, 0.7407677173614502, 0.12820936739444733, 0.7481772899627686, 0.4897526502609253, -0.3816620707511902, 0.423444539308548, -0.42953041195869446, -1.0550055503845215, 0.7910961508750916, 0.5393232703208923, -0.15882720053195953, -1.7058193683624268, 0.36783361434936523, -0.653914749622345, -0.016651809215545654, 1.3438118696212769, -0.5271036028862, 1.0151762962341309, -0.15813952684402466, 0.9643442630767822, -1.2352749109268188, 0.5292386412620544, 0.7325783371925354, -0.25680550932884216, 0.24278421700000763, 0.5838282108306885, 0.12534423172473907, -0.07369983196258545, -0.7971658110618591, 0.27440014481544495, 0.1676362305879593, 0.7232828140258789, 0.8130565285682678, -0.6648814678192139, 0.18237881362438202, 0.6223799586296082, -0.2020183503627777, -0.045716773718595505, -0.2484089732170105, 0.39859887957572937, 0.11470148712396622, 0.3555806875228882, -0.8652286529541016, -0.3336186110973358, -0.6998476386070251, -0.5229642391204834, -1.236716866493225]]
sample_id = "david-jager-57a50045"
sample_original_position = "Managing Director - Investments at Wells Fargo Advisor"

In [0]:
from pyspark.sql.functions import explode, col, size
import numpy as np

pred_cluster = knn_loaded.predict(sample_list)
print(pred_cluster)
temp = df_users_knn_preds.filter(col('knn_prediction') == int(pred_cluster))
embeddings = [np.array(embedding[0]) for embedding in temp.select('embeddings').collect()]

[40]


In [0]:
l=np.mean(embeddings,axis=0)

In [0]:
from scipy.spatial.distance import euclidean

dis_list = []
for center in centers:
    dis_list.append(euclidean(l, center))

In [0]:
x = np.argmin(dis_list)

In [0]:
pos_reco = [pos[0] for pos in df_posts_preds.filter(col('prediction') == int(x)).select('position').collect()]

In [0]:
id = [sample_id] * len(pos_reco)
original_pos = [sample_original_position] * len(pos_reco)

data = list(zip(id, original_pos, pos_reco))
result_sample = spark.createDataFrame(data, schema=['Id', 'Original Position', 'Post Position'])
result_sample.display()

Id,Original Position,Post Position
david-jager-57a50045,Managing Director - Investments at Wells Fargo Advisor,Applications Consultant
david-jager-57a50045,Managing Director - Investments at Wells Fargo Advisor,Guidewire Policy Center Consultant
david-jager-57a50045,Managing Director - Investments at Wells Fargo Advisor,Development Manager
david-jager-57a50045,Managing Director - Investments at Wells Fargo Advisor,Technical Product Managers:
david-jager-57a50045,Managing Director - Investments at Wells Fargo Advisor,Product Owner
david-jager-57a50045,Managing Director - Investments at Wells Fargo Advisor,Marketplace Analyst
david-jager-57a50045,Managing Director - Investments at Wells Fargo Advisor,Senior Network Sales Engineer
david-jager-57a50045,Managing Director - Investments at Wells Fargo Advisor,Business Analyst
david-jager-57a50045,Managing Director - Investments at Wells Fargo Advisor,Functional Analyst
david-jager-57a50045,Managing Director - Investments at Wells Fargo Advisor,Solution Delivery Lead


As we can see we got that the posts that matches this user is really relevant for his job (as in skills requirements) like: 
Development Manager, Product Owner...