In [21]:
import os

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from scipy.spatial.distance import cosine
from transformers import AutoTokenizer, AutoModel

In [8]:
# `__file__` is not defined inside a notebook kernel, so anchor the path on the
# kernel's working directory; the project root is assumed to be its parent.
current_directory = os.getcwd()
project_root = os.path.abspath(os.path.join(current_directory, '..'))
# Keep the path under its own name so `job_posting` only ever refers to the dataset.
job_posting_path = os.path.join(project_root, 'src', 'data', 'job_posting_description_cleaned.csv')

# Load the CSV with the Hugging Face `datasets` CSV builder; the rows are
# exposed under the "train" split.
job_posting = load_dataset("csv", data_files=job_posting_path)

Generating train split: 33246 examples [00:07, 4241.67 examples/s]


In [9]:
# Preview the first two records of the "train" split (shown as a dict of column name -> list of values).
job_posting['train'][:2]

{'job_id': [3757940104, 3757940025],
 'company_id': [553718, 2192142],
 'company_name': ['HearingLife', 'Metalcraft of Mayville, Inc.'],
 'title': ['Hearing Care Provider',
  'Shipping & Receiving Associate 2nd shift (Beaver Dam)'],
 'description': ['Overview\n\nHearingLife is a national hearing care company and part of the Demant Group, a global leader in hearing healthcare built on a heritage of care, health, and innovation since 1904. HearingLife operates more than 600 hearing care centers across 42 states. We follow a scientific, results-oriented approach to hearing healthcare that is provided by highly skilled and caring professionals. Our vision is to help more people hear better through life-changing hearing health delivered by the best personalized care. This Team Member must uphold the HearingLife Core Values:\n\n We create trust  We are team players  We apply a can-do attitude  We create innovative solutions \n\nResponsibilities\n\nYou will help more people hear better by pro

In [11]:
# Load the tokenizer and the encoder for the chosen checkpoint; both are used by `get_embeddings`.
model_ckpt = "BAAI/bge-large-en" # other sentence-similarity checkpoints are listed at "https://huggingface.co/models?pipeline_tag=sentence-similarity"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

In [12]:
# Choose the compute device by asking torch whether CUDA is usable, rather than
# attempting the move and catching every exception: a blanket `except` would
# also hide unrelated failures (e.g. out-of-memory) behind a silent CPU fallback.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
model.to(device)
print("Using CUDA" if use_cuda else "Using CPU")

Using CPU


In [13]:
# Get one embedding per sequence by pooling the output of the last layer
def cls_pooling(model_output):
    """Select the first-token vector of every sequence in the batch.

    Reads ``model_output.last_hidden_state`` and keeps index 0 along its
    second axis (the position this function's name refers to as CLS).
    """
    hidden_states = model_output.last_hidden_state
    first_token_vectors = hidden_states[:, 0]
    return first_token_vectors

In [14]:
def get_embeddings(job_listing):
    """Embed the cleaned description text of a job listing (or a search query).

    Args:
        job_listing: Mapping with a ``"description_cleaned"`` entry holding the
            text to embed, e.g. a dataset row or a dict wrapping a query string.

    Returns:
        The tensor produced by ``cls_pooling`` for the tokenized input, located
        on ``device`` and carrying no autograd history.
    """
    encoded_input = tokenizer(
        job_listing["description_cleaned"], padding=True, truncation=True, return_tensors="pt"
    )
    # Move every tokenizer output tensor onto the same device as the model.
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    # Inference only: disabling autograd avoids building and retaining a
    # computation graph on every call, which adds up when mapping over a dataset.
    with torch.no_grad():
        model_output = model(**encoded_input)
    return cls_pooling(model_output)

In [15]:
## Smoke-test the embedding function on a single row
first_row = job_posting["train"][0]
embedding = get_embeddings(first_row)
embedding.shape

torch.Size([1, 1024])

In [16]:
# Hold out 1% of the rows; the seed keeps the split reproducible.
job_posting_splited = job_posting["train"].train_test_split(train_size=0.99, seed=42)
# The held-out part is created under the key "test"; store it as "validation" instead.
held_out_split = job_posting_splited.pop("test")
job_posting_splited["validation"] = held_out_split
# Display the resulting `DatasetDict`
job_posting_splited

DatasetDict({
    train: Dataset({
        features: ['job_id', 'company_id', 'company_name', 'title', 'description', 'max_salary', 'med_salary', 'min_salary', 'pay_period', 'formatted_work_type', 'location', 'applies', 'original_listed_time', 'remote_allowed', 'views', 'job_posting_url', 'application_url', 'application_type', 'expiry', 'closed_time', 'formatted_experience_level', 'skills_desc', 'listed_time', 'posting_domain', 'sponsored', 'work_type', 'currency', 'compensation_type', 'scraped', 'company_speciality', 'company_industry', 'company_description', 'linkedin_url', 'job_skills', 'description_cleaned', 'description_cleaned_st'],
        num_rows: 32913
    })
    validation: Dataset({
        features: ['job_id', 'company_id', 'company_name', 'title', 'description', 'max_salary', 'med_salary', 'min_salary', 'pay_period', 'formatted_work_type', 'location', 'applies', 'original_listed_time', 'remote_allowed', 'views', 'job_posting_url', 'application_url', 'application_type', 'e

In [17]:
def _embed_row(row):
    """Return the row's embedding as a NumPy vector under the "embeddings" key."""
    row_embedding = get_embeddings(row)
    # Drop autograd history, move to host memory, and take the single batch entry.
    return {"embeddings": row_embedding.detach().cpu().numpy()[0]}

# Embed every validation row, one `get_embeddings` call per row.
embeddings_dataset = job_posting_splited["validation"].map(_embed_row)

Map: 100%|██████████| 333/333 [13:34<00:00,  2.44s/ examples]


In [18]:
# Inspect the mapped dataset; its feature list should now include the "embeddings" column.
embeddings_dataset

Dataset({
    features: ['job_id', 'company_id', 'company_name', 'title', 'description', 'max_salary', 'med_salary', 'min_salary', 'pay_period', 'formatted_work_type', 'location', 'applies', 'original_listed_time', 'remote_allowed', 'views', 'job_posting_url', 'application_url', 'application_type', 'expiry', 'closed_time', 'formatted_experience_level', 'skills_desc', 'listed_time', 'posting_domain', 'sponsored', 'work_type', 'currency', 'compensation_type', 'scraped', 'company_speciality', 'company_industry', 'company_description', 'linkedin_url', 'job_skills', 'description_cleaned', 'description_cleaned_st', 'embeddings'],
    num_rows: 333
})

In [19]:
def search_jobs(search_query, embeddings = embeddings_dataset["embeddings"], k=5):
    """Rank stored job embeddings by cosine similarity to a free-text query.

    Args:
        search_query: Query text; it is embedded with ``get_embeddings`` the
            same way the job descriptions were.
        embeddings: Iterable of equal-length embedding vectors. The default is
            evaluated once, when this function is defined, so re-run this cell
            after rebuilding ``embeddings_dataset``.
        k: Maximum number of positions to return.

    Returns:
        NumPy integer array with the positions (into ``embeddings``) of the
        ``k`` most similar vectors, best match first. Empty when ``embeddings``
        is empty.
    """
    # Embed the query as if it were a job description.
    question = {"description_cleaned": search_query}
    question_embedding = get_embeddings(question).cpu().detach().numpy()
    query_vector = np.asarray(question_embedding[0], dtype=np.float64)

    rows = [np.asarray(row, dtype=np.float64) for row in embeddings]
    if not rows:
        return np.array([], dtype=np.intp)
    embedding_matrix = np.stack(rows)

    # Cosine similarity for all rows in one matrix-vector product instead of
    # one SciPy call per row.
    norms = np.linalg.norm(embedding_matrix, axis=1) * np.linalg.norm(query_vector)
    with np.errstate(divide="ignore", invalid="ignore"):
        similarity_scores = (embedding_matrix @ query_vector) / norms
    # A zero-norm vector yields NaN/inf; NaN sorts last in `argsort`, so it would
    # land first after the reversal below. Force such rows to the bottom.
    similarity_scores = np.where(np.isfinite(similarity_scores), similarity_scores, -np.inf)

    ranks = np.argsort(similarity_scores)[::-1]  # descending similarity
    return ranks[:k]

In [25]:
# Sanity check: run one query and look at the returned row positions
sanity_query = "I am looking for Software Engineer jobs."
ranks = search_jobs(sanity_query, k=5)
ranks

array([121, 200,  11,  42, 298], dtype=int64)

In [26]:
# Switch the dataset's output format to pandas so that slicing it yields a DataFrame.
# NOTE(review): this changes `embeddings_dataset` itself, so later reads such as
# `embeddings_dataset["embeddings"]` presumably return pandas objects too — confirm
# before re-running earlier cells against it.
embeddings_dataset.set_format("pandas")

In [27]:
# Materialize the dataset as a DataFrame, then pick the ranked rows by position.
all_jobs_frame = embeddings_dataset[:]
all_jobs_frame.iloc[ranks]

Unnamed: 0,job_id,company_id,company_name,title,description,max_salary,med_salary,min_salary,pay_period,formatted_work_type,location,applies,original_listed_time,remote_allowed,views,job_posting_url,application_url,application_type,expiry,closed_time,formatted_experience_level,skills_desc,listed_time,posting_domain,sponsored,work_type,currency,compensation_type,scraped,company_speciality,company_industry,company_description,linkedin_url,job_skills,description_cleaned,description_cleaned_st,embeddings
121,3756108480,1441,Google,"Software Engineering Manager II, Machine Learn...",Minimum qualifications:\n\nBachelor's degree o...,283000.0,,185000.0,YEARLY,Full-time,"Sunnyvale, CA",5.0,1699050000000.0,,74.0,https://www.linkedin.com/jobs/view/3756108480/...,https://careers.google.com/jobs/results/136997...,OffsiteApply,1701640000000.0,,,,1699050000000.0,careers.google.com,0,FULL_TIME,USD,BASE_SALARY,1699135697,"search, ads, mobile, android, online video, ap...","Information Services, Technology, Information ...","""A problem isnt truly solved until its solved ...",https://www.linkedin.com/company/google,"Information Technology, Engineering",minimum qualificationsbachelors degree equival...,minimum qualificationsbachelors degree or equi...,"[0.015125895, -0.3616033, -0.36311126, -0.1834..."
200,3757493445,71939,Zenex Partners,Software Engineer,A great opportunity to work with one of our le...,75.0,,73.0,HOURLY,Contract,United States,63.0,1699050000000.0,1.0,156.0,https://www.linkedin.com/jobs/view/3757493445/...,,ComplexOnsiteApply,1701640000000.0,,Mid-Senior level,,1699050000000.0,,0,CONTRACT,USD,BASE_SALARY,1699057868,"Administrative & HR, Engineering & Technicians...","Travel Arrangements, Software Development, Com...",Zenex Partners was founded with the mission of...,https://www.linkedin.com/company/zenexpartners,"Engineering, Information Technology, Other",great one leading clientssoftware engineerloca...,a great opportunity to work with one of our le...,"[-0.19375668, -0.327315, -0.9283604, -0.184070..."
11,3755559192,75483,IT Trailblazers LLC,Biztalk Developer with .Net,HI Hope you are doing well!I have an urgent re...,,,,,Contract,United States,51.0,1699040000000.0,1.0,176.0,https://www.linkedin.com/jobs/view/3755559192/...,,ComplexOnsiteApply,1701630000000.0,,,,1699040000000.0,,0,CONTRACT,,,1699138776,Solution | Content Management | iOS Developmen...,IT Services and IT Consulting,Each and everything we do (professional servic...,https://www.linkedin.com/company/it-trailblazers,"Engineering, Information Technology",hi hope welli urgent one client please find de...,hi hope you are doing welli have an urgent req...,"[0.31549743, -0.07778397, -0.83485055, -0.0395..."
42,3699416230,343453,"Acumen, LLC",Front-End Application Developer III (Multiple ...,"Burlingame, CA and various unanticipated locat...",188310.0,,183310.0,YEARLY,Full-time,"Los Angeles, CA",4.0,1692850000000.0,,32.0,https://www.linkedin.com/jobs/view/3699416230/...,https://jobs.lever.co/acumenllc/87b51417-d863-...,OffsiteApply,1695440000000.0,,Mid-Senior level,,1692850000000.0,jobs.lever.co,0,FULL_TIME,USD,BASE_SALARY,1,"Healthcare Policy, Medicare/Medicaid Data, Eco...",Public Policy Offices,"Acumen, LLC works to improve the information p...",https://www.linkedin.com/company/acumen-llc,"Engineering, Information Technology",burlingame ca various unanticipated throughout...,burlingame ca and various unanticipated locati...,"[-0.35318315, -0.37763858, -0.8076201, -0.2217..."
298,3749350300,107854,Dover Corporation,Senior Software Engineer,Job Title: Senior Software EngineerDepartment:...,140000.0,,110000.0,YEARLY,Full-time,"Austin, Texas Metropolitan Area",4.0,1699040000000.0,,24.0,https://www.linkedin.com/jobs/view/3749350300/...,,ComplexOnsiteApply,1701630000000.0,,Associate,,1699040000000.0,,0,FULL_TIME,USD,BASE_SALARY,1699041423,"Fluids, Engineered Systems, Refrigeration & Fo...",Machinery Manufacturing,Dover is a diversified global manufacturer and...,https://www.linkedin.com/company/dovercorp,"Engineering, Information Technology",title senior software engineerdepartment solut...,job title senior software engineerdepartment s...,"[-0.047677077, -0.30189654, -0.44335032, -0.07..."
