# Creating Dataset

## Importing libraries

In [1]:
import pandas as pd
import numpy as np
import huggingface_hub

from datasets import DatasetDict, Dataset

import warnings
warnings.filterwarnings("ignore")


In [2]:
path = "/Users/saideepbunny/Projects/Application_Ranking_System"

## Reading the data

In [3]:
df = pd.read_csv(f"{path}/data/synthetic_data/synthetic_data_v2/synthetic_data_v2.csv")
df

Unnamed: 0,job_data,resume_data,label
0,Audit Manager\nMenzies LLP\nWe are looking for...,**Jennifer Oneal**\n*+1 (555) 555-5555* *|* *j...,Complete Mismatch
1,Audit Manager\nMenzies LLP\nWe are looking for...,**Christina Padilla DVM**\n\n*Email: christina...,Underwhelming
2,Audit Manager\nMenzies LLP\nWe are looking for...,**Andrew Kirby**\n*+44 00000 000000* *|* *andr...,Good Fit
3,Audit Manager\nMenzies LLP\nWe are looking for...,**Erin Hicks**\n*+1 (555) 555-5555* *|* *erin....,Overqualified
4,"Audit Manager\nOverview\nBaker Tilly US, LLP (...",**Randy Smith**\n*+1 (123) 456-7890* *|* *rand...,Complete Mismatch
...,...,...,...
3995,Test Engineer\nJob Summary:\nPerforms LAT test...,# Darren Roberts\n\n*Email*: darren.roberts@em...,Overqualified
3996,Test Engineer\nCompany Description\nMUST be au...,**Lisa Wright**\n*+1 (111) 111-1111* *|* *lisa...,Complete Mismatch
3997,Test Engineer\nCompany Description\nMUST be au...,"# Daniel Meza\n\n*Greensboro, NC* *·* *(555) 5...",Underwhelming
3998,Test Engineer\nCompany Description\nMUST be au...,"# William Torres\n\n*Greensboro, NC* *|* *will...",Good Fit


In [4]:
revised_label_map = {"Complete Mismatch": "Bad Fit", "Underwhelming":"Bad Fit", "Good Fit":"Good Fit", "Overqualified":"Good Fit"}
df['label'] = df['label'].map(revised_label_map)
df['label'].value_counts()

label
Bad Fit     2000
Good Fit    2000
Name: count, dtype: int64

In [5]:
print("Number of rows in the dataset:", df.shape[0])
print("Number of columns in the dataset:", df.shape[1])

Number of rows in the dataset: 4000
Number of columns in the dataset: 3


In [6]:
df

Unnamed: 0,job_data,resume_data,label
0,Audit Manager\nMenzies LLP\nWe are looking for...,**Jennifer Oneal**\n*+1 (555) 555-5555* *|* *j...,Bad Fit
1,Audit Manager\nMenzies LLP\nWe are looking for...,**Christina Padilla DVM**\n\n*Email: christina...,Bad Fit
2,Audit Manager\nMenzies LLP\nWe are looking for...,**Andrew Kirby**\n*+44 00000 000000* *|* *andr...,Good Fit
3,Audit Manager\nMenzies LLP\nWe are looking for...,**Erin Hicks**\n*+1 (555) 555-5555* *|* *erin....,Good Fit
4,"Audit Manager\nOverview\nBaker Tilly US, LLP (...",**Randy Smith**\n*+1 (123) 456-7890* *|* *rand...,Bad Fit
...,...,...,...
3995,Test Engineer\nJob Summary:\nPerforms LAT test...,# Darren Roberts\n\n*Email*: darren.roberts@em...,Good Fit
3996,Test Engineer\nCompany Description\nMUST be au...,**Lisa Wright**\n*+1 (111) 111-1111* *|* *lisa...,Bad Fit
3997,Test Engineer\nCompany Description\nMUST be au...,"# Daniel Meza\n\n*Greensboro, NC* *·* *(555) 5...",Bad Fit
3998,Test Engineer\nCompany Description\nMUST be au...,"# William Torres\n\n*Greensboro, NC* *|* *will...",Good Fit


In [7]:
# Example mapping: adjust as necessary for your labels
label_list = sorted(df['label'].unique())
label2id = {label: i for i, label in enumerate(label_list)}
print(label2id)
df['label'] = df['label'].map(label2id)
df['label'].value_counts()

{'Bad Fit': 0, 'Good Fit': 1}


label
0    2000
1    2000
Name: count, dtype: int64

In [8]:
def get_title(text):
    return text.split("\n")[0]

## Analyzing Job titles

In [9]:
df['job_title'] = df['job_data'].apply(lambda x: get_title(x))
job_titles = df['job_title'].unique().tolist()
print(job_titles)

['Audit Manager', 'Automation Engineer', 'Business Development Representative', 'Business Intelligence Analyst', 'CNC Machinist', 'Construction Supervisor', 'Data Analyst', 'Data Engineer', 'Data Scientist', 'DevOps Engineer', 'Electrician', 'Embedded Software Engineer', 'Enterprise Architect', 'Facilities Manager', 'Financial Analyst', 'HR Manager', 'Lead Data Engineer', 'Marketing Coordinator', 'Physical Therapist', 'Physician', 'Plant Controller', 'Production Planner', 'Project Coordinator', 'Quality Assurance Engineer', 'Quality Control Inspector', 'Research Scientist', 'Retail Store Manager', 'Sales Manager', 'Senior Data Analyst', 'Senior Data Engineer', 'Senior Data Scientist', 'Senior DevOps Engineer', 'Senior Embedded Software Engineer', 'Senior Full Stack Developer', 'Senior Product Designer', 'Senior Quality Engineer', 'Senior Site Reliability Engineer', 'Senior Software Developer', 'Senior System Engineer', 'Senior Systems Engineer', 'Senior Tax Accountant', 'Senior Test En

### Selecting job titles for train, val, test split

In [10]:
# Train Data (40 Titles)
train_titles = [
    "Audit Manager", "Automation Engineer", "Business Development Representative", "Business Intelligence Analyst",
    "Construction Supervisor", "Data Analyst", "Data Engineer", "Data Scientist", "DevOps Engineer",
    "Embedded Software Engineer", "Enterprise Architect", "Facilities Manager", "Financial Analyst", "HR Manager",
    "Lead Data Engineer", "Marketing Coordinator", "Plant Controller", "Production Planner", "Project Coordinator",
    "Quality Assurance Engineer", "Quality Control Inspector", "Research Scientist", "Sales Manager",
    "Senior Data Analyst", "Senior Data Engineer", "Senior Data Scientist", "Senior DevOps Engineer",
    "Senior Embedded Software Engineer", "Senior Full Stack Developer", "Senior Product Designer",
    "Senior Quality Engineer", "Senior Site Reliability Engineer", "Senior Software Developer",
    "Senior System Engineer", "Senior Systems Engineer", "Senior Tax Accountant", "Senior Test Engineer",
    "Software Developer", "Software Engineer", "Supply Chain Manager"
]

# Validation Data (5 Titles)
val_titles = [
    "Solution Architect", "Solutions Architect", "System Engineer", "Systems Engineer", "Test Engineer"
]

# Test Data (5 Titles) - Significantly Different
test_titles = [
    "Physician", "Electrician", "CNC Machinist", "Physical Therapist", "Retail Store Manager"
]


### Checking for any common job titles among train, val, test

In [11]:
print("Number of common job titles between train and validation titles selected: ", len(set(train_titles).intersection(val_titles)))
print("Number of common job titles between validation and test titles selected: ", len(set(val_titles).intersection(test_titles)))
print("Number of common job titles between test and train titles selected: ", len(set(test_titles).intersection(train_titles)))

Number of common job titles between train and validation titles selected:  0
Number of common job titles between validation and test titles selected:  0
Number of common job titles between test and train titles selected:  0


## Creating Train, Val, test data

### Train data

In [12]:
train_df = df[df['job_title'].isin(train_titles)].drop(['job_title'], axis=1)
train_df

Unnamed: 0,job_data,resume_data,label
0,Audit Manager\nMenzies LLP\nWe are looking for...,**Jennifer Oneal**\n*+1 (555) 555-5555* *|* *j...,0
1,Audit Manager\nMenzies LLP\nWe are looking for...,**Christina Padilla DVM**\n\n*Email: christina...,0
2,Audit Manager\nMenzies LLP\nWe are looking for...,**Andrew Kirby**\n*+44 00000 000000* *|* *andr...,1
3,Audit Manager\nMenzies LLP\nWe are looking for...,**Erin Hicks**\n*+1 (555) 555-5555* *|* *erin....,1
4,"Audit Manager\nOverview\nBaker Tilly US, LLP (...",**Randy Smith**\n*+1 (123) 456-7890* *|* *rand...,0
...,...,...,...
3875,Supply Chain Manager\nManitoba Harvest - Winni...,**Tina Green**\n*+1 (234) 567-8901* *|* *tina....,1
3876,"Supply Chain Manager\n""Fuel your future with u...","# Nicole Bailey\n\n*Irving, TX 750xxxxxxxx | n...",0
3877,"Supply Chain Manager\n""Fuel your future with u...","**Kimberly Hernandez**\n*Irving, TX | (214) 55...",0
3878,"Supply Chain Manager\n""Fuel your future with u...",**ALYSSA BALDWIN**\n\n*(+1) 555-555-5555* *|* ...,1


#### Train label distribution

In [13]:
train_df['label'].value_counts()

label
0    1600
1    1600
Name: count, dtype: int64

### Validation data

In [14]:
val_df = df[df['job_title'].isin(val_titles)].drop(['job_title'], axis=1)
val_df


Unnamed: 0,job_data,resume_data,label
2760,Solution Architect\nNumber Of Positions Availa...,**Billy Dickson**\n*+1 (123) 456-7890* *|* *bi...,0
2761,Solution Architect\nNumber Of Positions Availa...,**KEVIN FORD**\n\n*+1 (xxx) xxx-xxxx* | *kevin...,0
2762,Solution Architect\nNumber Of Positions Availa...,**KRYSTAL DAVIS**\n\n*+61 4 0000 0000* *|* *kr...,1
2763,Solution Architect\nNumber Of Positions Availa...,**Leslie Perry**\n\n*+1 (123) 456-7890* | *les...,1
2764,Solution Architect\nSecurity Clearance Require...,"# James Bailey\n\n*Chippenham, UK* *+44 7xxx x...",0
...,...,...,...
3995,Test Engineer\nJob Summary:\nPerforms LAT test...,# Darren Roberts\n\n*Email*: darren.roberts@em...,1
3996,Test Engineer\nCompany Description\nMUST be au...,**Lisa Wright**\n*+1 (111) 111-1111* *|* *lisa...,0
3997,Test Engineer\nCompany Description\nMUST be au...,"# Daniel Meza\n\n*Greensboro, NC* *·* *(555) 5...",0
3998,Test Engineer\nCompany Description\nMUST be au...,"# William Torres\n\n*Greensboro, NC* *|* *will...",1


#### Validation label distribution

In [15]:
val_df['label'].value_counts()

label
0    200
1    200
Name: count, dtype: int64

### Test data

In [16]:
test_df = df[df['job_title'].isin(test_titles)].drop(['job_title'], axis=1)
test_df

Unnamed: 0,job_data,resume_data,label
160,"CNC Machinist\nLoc Performance Products, LLC p...",**DONNA JONES**\n\n*donnajones@email.com* · (1...,0
161,"CNC Machinist\nLoc Performance Products, LLC p...",**Ann Nelson**\n(555) 555-5555 | ann.nelson@em...,0
162,"CNC Machinist\nLoc Performance Products, LLC p...",**Adam Johnson**\n\n*+1 (123) 456-7890* *|* *a...,1
163,"CNC Machinist\nLoc Performance Products, LLC p...","# MARK MURRAY\n\n*Plymouth, MI* *|* *555-555-5...",1
164,CNC Machinist\nJC Ford - Tennessee is seeking ...,"# Tim Young\n\n*Nashville, TN* *|* *(615) 555-...",0
...,...,...,...
3075,Retail Store Manager\nAt Nespresso we place pe...,# WILLIAM BROWN\n\n*Email*: william.brown@emai...,1
3076,Retail Store Manager\nThe Retail Store Manager...,**Cory Mullins**\n*555-555-5555* *cory.mullins...,0
3077,Retail Store Manager\nThe Retail Store Manager...,**Catherine Church**\n*+1 (xxx) xxx- xxxx* *|*...,0
3078,Retail Store Manager\nThe Retail Store Manager...,**Franklin Gardner**\n*Phone: (555) 555-5555* ...,1


#### Test label distribution

In [17]:
test_df['label'].value_counts()

label
0    200
1    200
Name: count, dtype: int64

### Checking for matching indices

In [18]:
print("Number of common indices between train and validation data: ", len(set(train_df.index.tolist()).intersection(val_df.index)))
print("Number of common indices between validation and test data: ", len(set(val_df.index.tolist()).intersection(test_df.index)))
print("Number of common indices between test and train titles selected: ", len(set(test_df.index.tolist()).intersection(train_df.index)))

Number of common indices between train and validation data:  0
Number of common indices between validation and test data:  0
Number of common indices between test and train titles selected:  0


Therefore, No data is missed

## Creating dataset

In [19]:
# # -------------------------------
# # Creating Dataset
# # -------------------------------

# Convert Pandas DataFrames to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

# Create a DatasetDict with train, validation, and test splits
dataset = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset
})
dataset

DatasetDict({
    train: Dataset({
        features: ['job_data', 'resume_data', 'label', '__index_level_0__'],
        num_rows: 3200
    })
    validation: Dataset({
        features: ['job_data', 'resume_data', 'label', '__index_level_0__'],
        num_rows: 400
    })
    test: Dataset({
        features: ['job_data', 'resume_data', 'label', '__index_level_0__'],
        num_rows: 400
    })
})

## Saving to disk

In [20]:
dataset.save_to_disk(f"{path}/data/jd_resume_dataset")

Saving the dataset (0/1 shards):   0%|          | 0/3200 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/400 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/400 [00:00<?, ? examples/s]

## Saving to Huggingface Datasets

In [21]:
dataset.push_to_hub("saideep-arikontham/jd_resume_dataset", private=True)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/612 [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/datasets/saideep-arikontham/jd_resume_dataset/commit/52183ed4d12b97f83e3718a0a10d0136c102d775', commit_message='Upload dataset', commit_description='', oid='52183ed4d12b97f83e3718a0a10d0136c102d775', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/saideep-arikontham/jd_resume_dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='saideep-arikontham/jd_resume_dataset'), pr_revision=None, pr_num=None)