# Create Training Dataset for Embedding Fine-tuning

Based on the example in the [Sentence Transformers training overview](https://sbert.net/docs/sentence_transformer/training_overview.html#trainer).

### imports

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import json
from datasets import DatasetDict, Dataset
import re
import numpy as np
from sentence_transformers import SentenceTransformer

from functions import remove_irrelevant_sections, extract_qualifications_from_html, remove_eoe_notes

### load data (batch)

In [2]:
# # extract JDs
# df_jobs = pd.read_csv("data/job_data.csv")
# # df_jobs = df_jobs.drop_duplicates()

# # only keep text relevant to job qualifications
# df_jobs['description_cleaned'] = df_jobs['description'].apply(remove_irrelevant_sections)
# df_jobs['description_cleaned'] = df_jobs['description_cleaned'].apply(extract_qualifications_from_html)
# df_jobs['description_cleaned'] = df_jobs['description_cleaned'].apply(remove_eoe_notes)

# # store job descriptions in a list
# job_description_list = df_jobs['description_cleaned'].to_list()

In [3]:
# # extract synthetic queries and store in list (from batch request)
# file_path = 'data/output.jsonl'
# query_list = []

# with open(file_path, 'r') as file:
#     for line in file:
#         query = json.loads(line)['response']['body']['choices'][0]['message']['content'].replace('"', '')
#         query_list.append(query)

In [4]:
# # create dict with queries and JDs
# df = pd.DataFrame({"query" : query_list, "job_description_pos" : job_description_list})

### load data (from csv)

In [5]:
# read the job postings together with their queries
df_jobs = pd.read_csv("data/job_data_w_query.csv")

# only keep text relevant to job qualifications: run the three cleaning
# helpers one after another on the raw description (same order as imported)
df_jobs['job_description_pos'] = (
    df_jobs['description']
    .apply(remove_irrelevant_sections)
    .apply(extract_qualifications_from_html)
    .apply(remove_eoe_notes)
)

# keep just the (query, positive job description) columns for the dataset
df = df_jobs[['query', 'job_description_pos']]

In [6]:
# de-duplicate in two passes (first on the cleaned JD text, then on the query),
# printing the frame shape before the first pass and after each pass
print("Original shape:", df.shape)
for column, label in (("job_description_pos", "Unique JDs:"), ("query", "Unique queries:")):
    df = df.drop_duplicates(subset=[column])
    print(label, df.shape)

Original shape: (1179, 2)
Unique JDs: (1020, 2)
Unique queries: (1012, 2)


In [7]:
# preview the first five (query, positive JD) pairs
df.head(n=5)

Unnamed: 0,query,job_description_pos
0,"Kafka data injection, Snowflake SQL optimizati...",experience neededVery strong experience in Kaf...
1,"automotive engineering, cloud infrastructure m...",requirements to determine feasibility of desig...
2,"React development, API authentication, AWS Lambda","experienceAccountable for code quality, includ..."
3,"Data Analyst contract Queens NY, data modeling...","QualificationsAnalytical skills, including the..."
4,job search query: mortgage banking systems opt...,requirements and industry practices for mortga...


### create negative pairs

In [8]:
# Instantiate the pretrained sentence-embedding model used to compare job descriptions
model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2")

In [9]:
%%time
# Embed every positive job description (one embedding row per JD)
jd_texts = df['job_description_pos'].tolist()
job_embeddings = model.encode(jd_texts)
print(job_embeddings.shape)

(1012, 768)
CPU times: user 18.5 s, sys: 13.4 s, total: 32 s
Wall time: 1min 2s


In [10]:
# pairwise similarity of every job description against every job description
similarities = model.similarity(job_embeddings, job_embeddings)
print(similarities.size())

torch.Size([1012, 1012])


In [11]:
# Choose one negative job description per row: the JD least similar to that
# row's positive JD which has not already been used as a negative by an
# earlier row. Rows are processed in order, so earlier rows get first pick.

# argsort orders each row's column indices from lowest to highest similarity
similarities_argsorted = np.argsort(similarities.numpy(), axis=1)
negative_pair_index_list = []
used_negative_indices = set()  # mirrors the list; gives O(1) "already used" checks

for i in range(len(similarities)):

    # candidates for row i in ascending similarity, never the row's own JD
    # (a row must not receive its own positive as its negative)
    candidates = [int(c) for c in similarities_argsorted[i] if int(c) != i]

    # first choice: the least similar JD not yet taken by an earlier row
    index = next((c for c in candidates if c not in used_negative_indices), None)

    if index is None:
        # Every other JD is already taken (only possible for the final row).
        # Reuse the least similar JD instead of pairing the row with its own
        # positive; with a single row there is no alternative, so fall back to i.
        index = candidates[0] if candidates else i

    used_negative_indices.add(index)
    negative_pair_index_list.append(index)

In [12]:
# look up each row's negative JD by position and store it as a new column
negative_descriptions = df['job_description_pos'].to_numpy()[negative_pair_index_list]
df['job_description_neg'] = negative_descriptions

In [13]:
# preview the first five (query, positive JD, negative JD) triplets
df.head(n=5)

Unnamed: 0,query,job_description_pos,job_description_neg
0,"Kafka data injection, Snowflake SQL optimizati...",experience neededVery strong experience in Kaf...,"qualifications, skills, competencies, competen..."
1,"automotive engineering, cloud infrastructure m...",requirements to determine feasibility of desig...,SQL (expert)Snowflake - not a roadblock (added...
2,"React development, API authentication, AWS Lambda","experienceAccountable for code quality, includ...",Resource should be able to visualize and expla...
3,"Data Analyst contract Queens NY, data modeling...","QualificationsAnalytical skills, including the...",experiences. We own and operate leading entert...
4,job search query: mortgage banking systems opt...,requirements and industry practices for mortga...,Qualifications:\nFluency in English (native or...


### train-eval-test split

In [14]:
# Shuffle all rows with a fixed seed and renumber them 0..n-1
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Split proportions: 80% train, 10% validation, remaining rows (about 10%) test
train_frac, valid_frac, test_frac = 0.8, 0.1, 0.1

# Row counts for the first two splits (rounded down); the test split takes the rest
n_rows = len(df)
train_size = int(train_frac * n_rows)
valid_size = int(valid_frac * n_rows)
valid_end = train_size + valid_size

# Cut the shuffled frame into three consecutive, non-overlapping slices
df_train = df.iloc[:train_size]
df_valid = df.iloc[train_size:valid_end]
df_test = df.iloc[valid_end:]

### upload to hugging face hub

In [15]:
# Wrap each pandas split in a Hugging Face Dataset
train_ds, valid_ds, test_ds = (
    Dataset.from_pandas(frame) for frame in (df_train, df_valid, df_test)
)

# Bundle the three splits under their conventional names
dataset_dict = DatasetDict({'train': train_ds, 'validation': valid_ds, 'test': test_ds})

In [16]:
# display the DatasetDict summary (split names, features, and row counts)
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['query', 'job_description_pos', 'job_description_neg'],
        num_rows: 809
    })
    validation: Dataset({
        features: ['query', 'job_description_pos', 'job_description_neg'],
        num_rows: 101
    })
    test: Dataset({
        features: ['query', 'job_description_pos', 'job_description_neg'],
        num_rows: 102
    })
})

In [None]:
# Upload all three splits to the Hugging Face Hub dataset repo named below.
# NOTE(review): the saved output of this cell refers to a different repo owner
# ("shawhin") than the repo id used here; re-run the cell to refresh the output.
dataset_dict.push_to_hub(repo_id="pnimeesha/ai-job-embedding-finetuning")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/shawhin/ai-job-embedding-finetuning/commit/c86ac36bb69d9cfa0f85968382b58e1be707f85b', commit_message='Upload dataset', commit_description='', oid='c86ac36bb69d9cfa0f85968382b58e1be707f85b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/shawhin/ai-job-embedding-finetuning', endpoint='https://huggingface.co', repo_type='dataset', repo_id='shawhin/ai-job-embedding-finetuning'), pr_revision=None, pr_num=None)