In [1]:
import openai
from openai import embeddings
from dotenv import load_dotenv, find_dotenv
from unidecode import unidecode
from typing import Union, List


import os
import numpy as np
import pandas as pd

import torch


In [2]:
embeddingModel = 'text-embedding-3-small'

def get_embedding(text: str)->List[float]:
    text = text.replace("\n", " ")
    embedding = embeddings.create(
        input=text,
        model=embeddingModel
    )
    return embedding.data[0].embedding


def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

In [3]:
data = pd.read_csv('data/df_all.csv')
data['embedding'] = data['embedding'].apply(eval).apply(np.array)
data.head()

Unnamed: 0,jobTitle,company,location,date posted,link,description,qualifications,date pulled,min_salary,max_salary,embedding
0,Data Scientist,Zenith,"New York, NY",,https://www.simplyhired.com/job/P_p1hn27u3cTsW...,Company Description\n\nZenith is the ROI Agenc...,Power BI||Microsoft Excel||Management||SAS||Re...,,70000,105000,"[-0.0014290509279817343, 0.055856604129076004,..."
1,Senior Data Scientist,"Charlie Health Engineering, Product & Design","New York, NY",,https://www.simplyhired.com/job/ikx9-0RLxNfIx5...,Why Charlie Health?\nYoung people across the c...,TensorFlow||Growing experience||Management||Py...,,148500,228000,"[-0.0016314274398609996, 0.002888173097744584,..."
2,Machine Learning Engineer,FalconSmartIT,"New York, NY",,https://www.simplyhired.com/job/OvfIVz13IToF5W...,Job Title: Machine Learning Engineer\nLocation...,TensorFlow||Oracle||Kubernetes||RESTful API||P...,,144000,183000,"[-0.02468576468527317, -0.02219345234334469, 0..."
3,Data Scientist,Morgan Stanley,"New York, NY",,https://www.simplyhired.com/job/zdQBTgDHV97jOa...,Data Scientist\nJob Number:\n3245996\nPOSTING ...,Ontology||TensorFlow||Oracle||Statistics||Doct...,,150000,190000,"[-0.015597617253661156, 0.015471013262867928, ..."
4,Data Scientist,City of Jersey City,"Jersey City, NJ",,https://www.simplyhired.com/job/QW6PGA25yHL-mA...,Job Opening:\nData Scientist - Innovation\nDep...,ArcGIS||Microsoft Word||Microsoft Excel||Micro...,,60000,67000,"[-0.015759838744997978, 0.013187067583203316, ..."


In [4]:
with open('Rhys Jervis Resume - Data Scientist (1).txt') as f:
    resume = f.read()

resume = unidecode(resume)

In [5]:

resume

"i>>?Irvington, NJ a(tm)| rhysjervis2@gmail.com a(tm)| 973-286-9351 a(tm)| github.com/sahleone  a(tm)| linkedin.com/in/rhys-jervis/\n________________\n\n\nData Scientist\nAccomplished Data Scientist Specializing in End-to-End Data Project Development\nProven and adaptable data scientist with a robust skill set dedicated to building comprehensive data projects from inception to implementation. Demonstrated success collaborating with cross-functional teams to deliver impactful results.\nSkills\n   Python (NumPy, Pandas, Scikit-learn, Stats models) | SQL/NoSQL | GIT \nAWS (EMR, S3) | Pytorch | Statistical Modeling | Machine Learning[a][b] | Forecasting\n\n\nTechnical Projects \n\n\nJob Posting Scraper and Recommendation Engine | Link                                             Present                                                    \n* Developed a web scraping module using Selenium to extract job postings from various websites.\n* Transformed the extracted data and stored it in a Vecto

In [6]:
resumeEmbedding = get_embedding(resume)

In [7]:
data['cosine_similarity'] = data['embedding'].apply(lambda x: cosine_similarity(x, resumeEmbedding))
data['cosine_distance'] = 1 - data['cosine_similarity']

data.sort_values('cosine_distance', ascending=True, inplace=True)

In [8]:
data.head()

Unnamed: 0,jobTitle,company,location,date posted,link,description,qualifications,date pulled,min_salary,max_salary,embedding,cosine_similarity,cosine_distance
111,Senior Data Analyst,Open Road Media,"New York, NY",,https://www.simplyhired.com/job/M7cjK3441tWBoT...,Essential Functions\nDevise mathematical model...,Economics||Relational databases||R||Tableau||M...,2024-02-07,108306,108306,"[-0.027259958907961845, 0.045442380011081696, ...",0.579271,0.420729
49,Data Scientist - Hybrid - W2,Ateeca Inc,"Franklin Lakes, NJ",,https://www.simplyhired.com/job/liX0Uu-5PKr_S0...,"""#INDEED_A""\nDescription:\nHybrid to remote 3 ...",R||SQL||AWS||Bachelor's degree||Clinical trial...,,60000,67000,"[-0.045383162796497345, 0.015293051488697529, ...",0.573564,0.426436
69,Data Science Program Manager - SFL Scientific,Deloitte,"New York, NY",,https://www.simplyhired.com/job/lpGth_CXLKBGsD...,"Data Science Program Manager, Specialist Maste...",Doctoral degree||Master's degree||Machine lear...,2024-02-01,137000,173000,"[0.008066137321293354, 0.020024465397000313, 0...",0.571437,0.428563
57,"Healthcare Artificial Intelligence Specialist,...",NJII,"Newark, NJ",,https://www.simplyhired.com/job/sKpFFbnC4rM-ce...,"Location\n100 summit street Newark, NJ, 07103,...",TensorFlow||CI/CD||Statistics||PyTorch||Softwa...,,981000,124000,"[-0.04891146346926689, 0.0010793631663545966, ...",0.569102,0.430898
94,"Scientist 3: Data Scientist (behavior change, ...",US Tech Solutions Private Limited,"New Brunswick, NJ",,https://www.simplyhired.com/job/sfahOgUOmhS1nb...,Duration: 12 months contract\nNotes about the ...,Ontology||Doctoral degree||Analysis skills||Do...,2024-02-06,6017,6017,"[-0.02753903903067112, 0.029321731999516487, 0...",0.562508,0.437492


# Database

In [17]:
def euclidean_distance(tensor1, tensor2):

    # np.linalg.norm(a - b)
    return torch.dist(tensor1, tensor2,p=2)

from torch.nn.functional import cosine_similarity 


def k_nearest_neighbors(query, tensor_list, k=None, metric = 'euclidean distance'):
    distances = []

    if k is None:
        k = len(tensor_list)

    if metric == 'euclidean distance':
        for i, tensor in enumerate(tensor_list):
            distances.append([i,euclidean_distance(query, tensor)])
    
    elif metric == 'cosine similarity':
        for i, tensor in enumerate(tensor_list):
            distances.append([i,cosine_similarity(query, tensor)])
        
    distances.sort(key=lambda x: x[1])
    return distances[:k]

# sklearn NearestNeighbors maybe

In [18]:
from lancedb.pydantic import LanceModel, vector

In [22]:
class JobPosting(LanceModel):
    jobTitle: str
    link: str
    description: str
    min_salary: int
    embedding: vector(1536)

In [23]:
import lancedb

db = lancedb.connect('~/.jobs.db')
table_name = "job_posting"
db.drop_table(table_name, ignore_missing=True)
table = db.create_table(table_name,schema=JobPosting)

In [24]:
table.add(data[['jobTitle', 'link', 'description', 'min_salary', 'embedding']])

In [26]:
type(resumeEmbedding)

list

In [33]:
table.search(np.array(resumeEmbedding),vector_column_name="embedding").limit(5).to_pandas()

Unnamed: 0,jobTitle,link,description,min_salary,embedding,_distance
0,Senior Data Analyst,https://www.simplyhired.com/job/M7cjK3441tWBoT...,Essential Functions\nDevise mathematical model...,108306,"[-0.027259959, 0.04544238, 0.07065169, 0.00478...",0.841458
1,Data Scientist - Hybrid - W2,https://www.simplyhired.com/job/liX0Uu-5PKr_S0...,"""#INDEED_A""\nDescription:\nHybrid to remote 3 ...",60000,"[-0.045383163, 0.0152930515, 0.08823126, 0.005...",0.852871
2,Data Science Program Manager - SFL Scientific,https://www.simplyhired.com/job/lpGth_CXLKBGsD...,"Data Science Program Manager, Specialist Maste...",137000,"[0.008066137, 0.020024465, 0.051869657, -0.000...",0.857125
3,"Healthcare Artificial Intelligence Specialist,...",https://www.simplyhired.com/job/sKpFFbnC4rM-ce...,"Location\n100 summit street Newark, NJ, 07103,...",981000,"[-0.048911463, 0.0010793632, 0.055022117, 0.00...",0.861796
4,"Scientist 3: Data Scientist (behavior change, ...",https://www.simplyhired.com/job/sfahOgUOmhS1nb...,Duration: 12 months contract\nNotes about the ...,6017,"[-0.027539039, 0.029321732, 0.01764153, 0.0293...",0.874984
