## Resume Anonymization

In [1]:
import kagglehub
import os
from pathlib import Path
# Point kagglehub's cache at <cwd>/data/kagglehub so the download lands
# inside the project tree.
kagglehub_cache_dir = Path.cwd().joinpath("data", "kagglehub")
os.environ["KAGGLEHUB_CACHE"] = str(kagglehub_cache_dir)

import pandas as pd

# dataset_download returns the local directory holding the dataset files.
path = kagglehub.dataset_download("snehaanbhawal/resume-dataset")
print("Path to dataset files:", path)

Path to dataset files: /Users/socialaistudio/Code/hiring_agent/data/kagglehub/datasets/snehaanbhawal/resume-dataset/versions/1


In [2]:
# Load the raw resume table and keep only the category label and the
# plain-text resume (renamed from 'Resume_str' to 'resume').
# NOTE(review): the file name is case-sensitive on Linux — confirm the
# on-disk name of the CSV inside the dataset's Resume/ folder.
resume_df = (
    pd.read_csv(path + "/Resume/resume.csv")
    .drop(columns=['ID', 'Resume_html'])
    .rename(columns={'Resume_str': 'resume'})
    .loc[:, ['Category', 'resume']]
)
resume_df.head()

Unnamed: 0,Category,resume
0,HR,HR ADMINISTRATOR/MARKETING ASSOCIATE\...
1,HR,"HR SPECIALIST, US HR OPERATIONS ..."
2,HR,HR DIRECTOR Summary Over 2...
3,HR,HR SPECIALIST Summary Dedica...
4,HR,HR MANAGER Skill Highlights ...


In [3]:
# resume_df.iloc[0].resume
# Distinct category labels and how many of them there are.
categories = resume_df.Category.unique()
categories, len(categories)

(array(['HR', 'DESIGNER', 'INFORMATION-TECHNOLOGY', 'TEACHER', 'ADVOCATE',
        'BUSINESS-DEVELOPMENT', 'HEALTHCARE', 'FITNESS', 'AGRICULTURE',
        'BPO', 'SALES', 'CONSULTANT', 'DIGITAL-MEDIA', 'AUTOMOBILE',
        'CHEF', 'FINANCE', 'APPAREL', 'ENGINEERING', 'ACCOUNTANT',
        'CONSTRUCTION', 'PUBLIC-RELATIONS', 'BANKING', 'ARTS', 'AVIATION'],
       dtype=object),
 24)

In [4]:
# Draw two resumes from every category (fixed seed, so the draw is
# repeatable) and stack the per-category samples into one frame.
per_category_samples = []
for category in resume_df.Category.unique():
    category_rows = resume_df[resume_df.Category == category]
    per_category_samples.append(category_rows.sample(2, random_state=42))
sampled_resumes = pd.concat(per_category_samples).reset_index(drop=True)
sampled_resumes.head(5)

Unnamed: 0,Category,resume
0,HR,ASSISTANT MANAGER - HR www...
1,HR,HR ASSISTANT Summary Highly ...
2,DESIGNER,SENIOR GRAPHIC DESIGNER Summary...
3,DESIGNER,WEBSITE DESIGNER Summary ...
4,INFORMATION-TECHNOLOGY,INFORMATION TECHNOLOGY SPECIALIST (IN...


In [12]:
# Select every resume whose Category equals SECTOR.
# SECTOR is the single source of truth for this selection: it drives the
# filter, the printed count and (further down) the output file name, so
# changing it here switches the whole run to another category.
SECTOR = "CONSTRUCTION"
sector_resumes = resume_df[resume_df.Category == SECTOR].reset_index(drop=True)
print(f"Number of {SECTOR} resumes: {len(sector_resumes)}")
sector_resumes.head(5)

Number of CONSTRUCTION resumes: 112


Unnamed: 0,Category,resume
0,CONSTRUCTION,CONSTRUCTION Executive Summary ...
1,CONSTRUCTION,CONSTRUCTION Summary The pur...
2,CONSTRUCTION,CONSTRUCTION Experience ...
3,CONSTRUCTION,CONCRETE CONSTRUCTION Summary...
4,CONSTRUCTION,CONSTRUCTION WORK Career Over...


In [5]:

# Small random subsample (2 resumes, fixed seed) for quick pipeline tests.
subsampled_resumes = (
    resume_df
    .sample(2, random_state=42)
    .reset_index(drop=True)
)
subsampled_resumes.head(2)

Unnamed: 0,Category,resume
0,TEACHER,Kpandipou Koffi Summary ...
1,DIGITAL-MEDIA,DIRECTOR OF DIGITAL TRANSFORMATION ...


In [15]:
from src.utils.pipeline_utils import batch_process_resumes
from tqdm import tqdm

# Send the resumes through the pipeline in chunks of 20 to avoid timeouts.
# Each call returns a dict that is merged into processed_resumes.
batch_size = 20
processed_resumes = {}
batch_starts = range(0, len(sector_resumes), batch_size)
for start in tqdm(batch_starts):
    chunk = sector_resumes.resume.iloc[start:start + batch_size]
    processed_resumes.update(batch_process_resumes(chunk, country="Singapore"))
# add the processed resumes to sampled_resumes dataframe as new columns
# for key in processed_resumes[0].keys():
#     subsampled_resumes[key] = [dct[key] for dct in processed_resumes]
# subsampled_resumes.head(5)

100%|██████████| 6/6 [13:45<00:00, 137.64s/it]


In [16]:
# Sanity check: how many results came back, plus the result stored under key 0.
len(processed_resumes), processed_resumes[0]

(112,
 {'anonymized': '## Anonymized Resume\n\n**CONSTRUCTION Executive Summary**  \nTo find an internship in the profession where I can gain experience in and exposure to the practice of product design.\n\n**Core Qualifications**  \n- Adobe Photoshop and Illustrator  \n- AutoCAD and Revit  \n- Microsoft Word, Excel, and PowerPoint  \n\n**Professional Experience**  \n\n**Aug 2006 to Current**  \n[COMPANY X]  \n[LOCATION]  \nHigh Value Residential Insurance Appraiser  \n- Appraise high value homes in [STATE] for a replacement cost.  \n\n**Jul 2005 to Jan 2006**  \n[COMPANY Y]  \n[LOCATION]  \n- Extensive remodeling project.  \n\n**Nov 2004**  \n[COMPANY Z]  \n[LOCATION]  \n- Internship supporting interior design/project teams, researching materials, and organizing the materials resource library.  \n\n**Jan 1999 to Jan 2000**  \n[COMPANY A]  \n[LOCATION]  \nAccounts Payable Assistant  \n- Handling petty cash, data entry, payroll distribution, and other administrative duties.  \n\n**Educa

In [17]:
# Attach the pipeline outputs to sector_resumes as new columns.
# processed_resumes maps a row label of sector_resumes to a dict of
# {output name: value}. Building one frame keyed by those labels avoids
# depending on a result existing under key 0 and replaces the per-cell
# boolean-mask scan with a single index-aligned assignment per column.
processed_df = pd.DataFrame.from_dict(processed_resumes, orient="index")
for column in processed_df.columns:
    # Column assignment aligns on the index: rows without a result get NaN,
    # and results whose label is not in sector_resumes are ignored.
    sector_resumes[column] = processed_df[column]
sector_resumes.head(5)

Unnamed: 0,Category,resume,anonymized,reformatted,localized
0,CONSTRUCTION,CONSTRUCTION Executive Summary ...,## Anonymized Resume\n\n**CONSTRUCTION Executi...,```markdown\n[Candidate Name]\n\n## Executive ...,```markdown\n[Candidate Name]\n\n## Executive ...
1,CONSTRUCTION,CONSTRUCTION Summary The pur...,## ANONYMIZED RESUME\n\n**Summary** \nThe pur...,```markdown\n## [Candidate Name]\n\n**Summary*...,```markdown\n## [Candidate Name]\n\n**Summary*...
2,CONSTRUCTION,CONSTRUCTION Experience ...,## Anonymized Resume\n\n**CONSTRUCTION Experie...,```markdown\n## [Candidate Name]\n\n**CONSTRUC...,```markdown\n## [Candidate Name]\n\n**CONSTRUC...
3,CONSTRUCTION,CONCRETE CONSTRUCTION Summary...,## Anonymized Resume\n\n**CONCRETE CONSTRUCTIO...,```markdown\n## [Candidate Name]\n\n**CONCRETE...,```markdown\n## [Candidate Name]\n\n**CONCRETE...
4,CONSTRUCTION,CONSTRUCTION WORK Career Over...,## Anonymized Resume\n\n**CONSTRUCTION WORK**\...,```markdown\n## [Candidate Name]\n\n**CONSTRUC...,```markdown\n## [Candidate Name]\n\n**CONSTRUC...


In [18]:
# Persist the enriched sector frame as data/processed/<sector>_resumes.csv
# (lower-cased sector name, no index column).
output_dir = "data/processed"
os.makedirs(output_dir, exist_ok=True)
sector_resumes.to_csv(f"{output_dir}/{SECTOR.lower()}_resumes.csv", index=False)

In [None]:
# # Save the processed resumes to a new CSV file
# os.makedirs("data/processed", exist_ok=True)
# sampled_resumes.to_csv("data/processed/resumes.csv", index=False)

## Post-processing - Resume

In [None]:
import pandas as pd
from pathlib import Path
import re 


def clean_text(text):
    """Normalise one pipeline-generated resume string.

    Steps, in order:
      1. strip Markdown code-fence markers (``` with an optional attached
         ``markdown`` language tag) — the word "markdown" is left alone
         everywhere else, so genuine resume content is not altered;
      2. collapse runs of newlines into one newline and runs of spaces into
         one space;
      3. trim leading/trailing whitespace (done after fence removal so no
         stray newline or space is left where a fence used to be);
      4. rewrite the placeholder "Candidate Name" as "CANDIDATE NAME".

    Args:
        text: The string to clean. Non-string values (for example the NaN
            that pandas uses for an empty CSV cell) are returned unchanged
            instead of raising inside ``re.sub``.

    Returns:
        The cleaned string, or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text
    # remove code fences together with their optional "markdown" tag
    text = re.sub(r'```(?:markdown\b)?', '', text)
    # remove multiple newlines
    text = re.sub(r'\n+', '\n', text)
    # remove multiple spaces
    text = re.sub(r' +', ' ', text)
    # strip leading and trailing whitespace, including leftover newlines
    text = text.strip()
    # replace Candidate Name with CANDIDATE NAME
    text = text.replace("Candidate Name", "CANDIDATE NAME")
    return text


# Reload the processed resumes, clean the 'localized' text and keep it as the
# only resume column (renamed to 'resume'); the raw 'resume', 'anonymized'
# and 'reformatted' columns are dropped.
# NOTE(review): the only cell in this notebook that writes
# data/processed/resumes.csv is commented out; the active save cell writes
# data/processed/<sector>_resumes.csv — confirm this input file exists.
resume_df = (
    pd.read_csv(Path("data/processed/resumes.csv"))
    .assign(localized=lambda frame: frame['localized'].apply(clean_text))
    .drop(columns=['resume', 'anonymized', 'reformatted'])
    .rename(columns={'localized': 'resume'})
)
resume_df.head()

## Job Processing

In [None]:
import kagglehub
import os
from pathlib import Path

# Point kagglehub's cache at <cwd>/data/kagglehub so the download lands
# inside the project tree.
kagglehub_cache_dir = Path.cwd().joinpath("data", "kagglehub")
os.environ["KAGGLEHUB_CACHE"] = str(kagglehub_cache_dir)

# Download latest version
path = kagglehub.dataset_download("marcocavaco/scraped-job-descriptions")

print("Path to dataset files:", path)

import pandas as pd

# The first CSV column is used as the index.
df = pd.read_csv(path + "/JD_data.csv", index_col=0)
# Drop the first and last three characters of every description.
# NOTE(review): presumably wrapper characters left by the scraper — confirm.
df['description'] = df['description'].apply(lambda raw_text: raw_text[3:-3])
df.head()

In [None]:
# Distinct values (and their count) for the two classification columns,
# followed by the number of postings per job.
for column_name in ('major_job', 'ISCO'):
    distinct_values = df[column_name].unique()
    print(distinct_values, len(distinct_values))
print(df['job'].value_counts())

# for i in range(9):
    # print(" ".join(df.iloc[i].position.split(" ")[:-1]),'\t', df.iloc[i].job)
    # print(df.iloc[i].description[3:-3].replace(". ", ".\n"))
    # print()

In [None]:
# Sample a random row per unique job
sampled_jobs = df.groupby('job').apply(lambda x: x.sample(1, random_state=42)).reset_index(drop=True)
sampled_jobs.rename(columns={'description': 'job_description', 
                             'job': 'job_type', 
                             'major_job': 'job_classification'}, inplace=True)
sampled_jobs.drop(columns=['location', 'ISCO'], inplace=True)
sampled_jobs.head(5)

In [None]:
# First sampled job as a Series; printed to show every field of one row.
d = sampled_jobs.iloc[0]
print(d)

In [None]:
# from src.utils.pipeline_utils import job_pipeline

# job_dct = job_pipeline(job_classification=d['job_classification'], 
#                        job_type=d['job_type'], 
#                        position=d['position'], 
#                        job_description=d['job_description'])
# print(job_dct)

In [None]:
# Two randomly chosen jobs (fixed seed) as a small test subset.
subsampled_jobs = (
    sampled_jobs
    .sample(2, random_state=42)
    .reset_index(drop=True)
)
subsampled_jobs.head(2)

In [None]:
from src.utils.pipeline_utils import batch_job_pipeline

# Run the project's job pipeline over every sampled job.
# NOTE(review): later cells treat the result as a dict keyed by job id, each
# value being a dict of output fields — confirm against batch_job_pipeline.
batch_results = batch_job_pipeline(sampled_jobs)

In [None]:
# Number of pipeline results and the first five result keys.
len(batch_results), list(batch_results)[:5]

In [None]:
# Add the pipeline outputs to sampled_jobs as new columns.
# The first result's keys decide which columns are pre-created (filled with
# None); each result is then written into the row whose index label equals
# int(job_id).
first_result = list(batch_results.values())[0]
for column in first_result:
    if column not in sampled_jobs.columns:
        sampled_jobs[column] = None
for job_id, job_outputs in batch_results.items():
    row_mask = sampled_jobs.index == int(job_id)
    for column, value in job_outputs.items():
        sampled_jobs.loc[row_mask, column] = value
# The id is only needed for the row matching above, so it is dropped again.
sampled_jobs.drop(columns=['job_id'], inplace=True)
sampled_jobs.head(5)

In [None]:
import os
import pandas as pd

# Write the enriched job table to data/processed/sampled_jobs.csv without
# the index column.
output_dir = "data/processed"
os.makedirs(output_dir, exist_ok=True)
sampled_jobs.to_csv(f"{output_dir}/sampled_jobs.csv", index=False)

## Post-processing - Job

In [None]:
import pandas as pd
import re 

def clean_text(text):
    """Normalise one pipeline-generated job text field.

    Steps, in order:
      1. strip Markdown code-fence markers (``` with an optional attached
         ``markdown`` language tag) — the word "markdown" is left alone
         everywhere else, so genuine content is not altered;
      2. collapse runs of newlines into one newline and runs of spaces into
         one space;
      3. trim leading/trailing whitespace (done after fence removal so no
         stray newline or space is left where a fence used to be).

    Unlike the ``clean_text`` defined in the resume post-processing section,
    this variant does not rewrite the "Candidate Name" placeholder.

    Args:
        text: The string to clean. Non-string values (for example the NaN
            that pandas uses for an empty CSV cell) are returned unchanged
            instead of raising inside ``re.sub``.

    Returns:
        The cleaned string, or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text
    # remove code fences together with their optional "markdown" tag
    text = re.sub(r'```(?:markdown\b)?', '', text)
    # remove multiple newlines
    text = re.sub(r'\n+', '\n', text)
    # remove multiple spaces
    text = re.sub(r' +', ' ', text)
    # strip leading and trailing whitespace, including leftover newlines
    text = text.strip()
    return text

# Reload the saved job table and clean the two generated text columns.
# NOTE(review): the file is written with index=False but read back with
# index_col=0, so its first data column becomes the index — confirm that is
# intended.
results_df = pd.read_csv("data/processed/sampled_jobs.csv", index_col=0)
for text_column in ('company_criteria', 'previous_hires'):
    results_df[text_column] = results_df[text_column].apply(clean_text)
results_df.head()