In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

In [2]:
# Seed every RNG the notebook draws from so a fresh "Restart & Run All" is reproducible:
# `random` drives the random.choices sampling, and NumPy's global state is what
# scipy.stats' rvs() uses when no random_state is passed.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [3]:
# Value pools for randomly sampled attributes (these are plain lists, not dictionaries).
# Presumably Rhode Island ZIP codes (per the variable name), stored as strings (each starts with a leading zero).
# NOTE(review): not referenced by any other cell visible in this notebook - confirm whether it is still needed.
rhode_island_zipcodes = [
    "02801", "02802", "02804", "02806", "02807", "02808", "02809", "02812", "02813", "02814",
    "02815", "02816", "02817", "02818", "02822", "02823", "02824", "02825", "02826", "02827",
    "02828", "02829", "02830", "02831", "02832", "02833", "02835", "02836", "02837", "02838",
    "02839", "02840", "02841", "02842", "02852", "02854", "02857", "02858", "02859", "02860",
    "02861", "02862", "02863", "02864", "02865", "02871", "02872", "02873", "02874", "02875",
    "02876", "02877", "02878", "02879", "02880", "02881", "02882", "02883", "02885", "02886",
    "02887", "02888", "02889", "02891", "02892", "02893", "02894", "02895", "02896", "02898",
    "02901", "02902", "02903", "02904", "02905", "02906", "02907", "02908", "02909", "02910",
    "02911", "02912", "02914", "02915", "02916", "02917", "02918", "02919", "02920", "02921",
    "02940"
]

# City names used as the value pool for the "Location" feature.
us_big_cities = [
    "New York City", "Los Angeles", "Chicago", "Houston", "Phoenix",
    "Philadelphia", "San Antonio", "San Diego", "Dallas", "San Jose",
]

# College/university names; this list is the value pool for the "School Name"
# feature in feature_value_range.
us_colleges = [
    "Harvard University",
    "Stanford University",
    "Massachusetts Institute of Technology (MIT)",
    "California Institute of Technology (Caltech)",
    "Princeton University",
    "Yale University",
    "University of Chicago",
    "Columbia University",
    "University of Pennsylvania",
    "Johns Hopkins University",
    "Northwestern University",
    "Duke University",
    "University of Michigan",
    "Carnegie Mellon University",
    "University of Washington",
    "University of Texas at Austin",
    "University of North Carolina at Chapel Hill",
    "University of Virginia",
    "Cornell University",
    "Brown University",
    "Dartmouth College",
    "Rice University",
    "Vanderbilt University",
    "University of Notre Dame",
    "Georgetown University",
    "Emory University",
    "New York University (NYU)",
    "University of Southern California (USC)",
    "Boston University",
    "University of Florida",
    "University of Miami",
    "University of Pittsburgh",
    "University of Minnesota Twin Cities",
    "University of Colorado Boulder",
    "Georgia Institute of Technology (Georgia Tech)",
    "Ohio State University",
    "University of Massachusetts Amherst",
    "University of Arizona",
    "University of Oregon",
    "University of Utah",
    "University of Georgia",
    "University of Iowa",
    "University of Alabama",
    "University of Tennessee Knoxville",
    "Michigan State University",
    "Pennsylvania State University",
    "University of Connecticut",
    "University of Delaware",
    "University of Oklahoma",
    "University of Kansas",
    "University of Missouri",
    "University of Kentucky",
    "University of South Carolina",
    "Louisiana State University (LSU)",
    "Texas A&M University",
    "University of Houston",
    "Arizona State University",
    "Indiana University Bloomington",
    "Purdue University",
    "Virginia Tech",
    "Rutgers University",
    "Florida State University",
    "University of Central Florida",
    "University of Cincinnati",
    "Iowa State University",
    "University of New Mexico",
    "University of Hawaii at Manoa"
]

In [4]:
# Candidate start dates for the "Start N" features. The function itself is
# deterministic; the randomness comes from sampling its result later.
def generate_start_time(start=datetime(2000, 1, 1), end=datetime(2010, 1, 1), step=timedelta(days = 30)):
    """Return evenly spaced candidate start dates covering [start, end).

    Parameters
    ----------
    start : datetime
        First candidate date (inclusive). Defaults to 2000-01-01.
    end : datetime
        Upper bound (exclusive). Defaults to 2010-01-01.
    step : timedelta
        Spacing between consecutive candidates. Defaults to 30 days.

    Returns
    -------
    numpy.ndarray
        Object-dtype array of ``datetime.datetime`` values; empty when
        ``end <= start``.
    """
    # np.arange yields datetime64 values; cast back to plain datetime objects.
    return np.arange(start, end, step).astype(datetime)

In [5]:
# Job titles shared by all three role slots; slots 2 and 3 may additionally be "N/A".
role_titles = [
    "Junior SWE", "Senior SWE", "Research Assistant", "Junior Data Scientist",
    "Senior Data Scientist", "Lawyer", "Teaching Assistant", "ML Engineer",
]

# Maps each sampled column name to the pool of values it is drawn from.
# Key order matters: the generation cell samples the features in this order,
# so reordering keys changes the seeded output.
# TODO: replace the "N/A" placeholders for Gender / Veteran status / Disability
# with np.nan (the original note says this currently hits a type error), and
# give Disability a smaller proportion of 1.
feature_value_range = {
    "School Name": us_colleges,
    "GPA": np.arange(2.0, 4.0, 0.01),
    "Degree": ["Bachelors", "Masters", "PhD"],
    "Location": us_big_cities,
    "Gender": ["M", "F", "N/A"],
    "Veteran status": [1.0, 0.0, "N/A"],
    "Disability": [1, 0, "N/A"],
    "Work authorization": [1, 0],
    "Ethnicity": list(range(5)),
    "Role 1": list(role_titles),
    "Role 2": role_titles + ["N/A"],
    "Role 3": role_titles + ["N/A"],
    # "Start 2" / "Start 3" are sampled too, although the generation cell
    # later overwrites those columns with dates derived from the previous job.
    **{f"Start {slot}": generate_start_time() for slot in (1, 2, 3)},
}


In [18]:
# Build the synthetic applicant table: sample every feature independently,
# then derive a chronologically consistent job history from "Start 1".
n = 2000

# Re-seed so this cell is idempotent. None of the visible earlier cells draws
# from `random` after the initial seed, so this reproduces exactly what a fresh
# "Restart & Run All" gives, while a plain re-run of this cell no longer
# produces a different table each time.
random.seed(42)

# One random.choices call per feature, in dictionary order (the order fixes
# the seeded draw sequence).
random_data = {
    feature: random.choices(pool, k = n)
    for feature, pool in feature_value_range.items()
}
random_df = pd.DataFrame(random_data)


def _random_day_offsets(low, high, index):
    """Draw one whole-day offset in [low, high) per row of `index`, as a timedelta Series."""
    days = random.choices(np.arange(low, high), k=len(index))
    return pd.to_timedelta(pd.Series(days, index=index), unit = 'days')


# Job 1 ends 90-999 days after its sampled start.
random_df['End 1'] = random_df['Start 1'] + _random_day_offsets(90, 1000, random_df.index)

for i in range(2, 4):
    # Each later job starts 0-89 days after the previous one ended ...
    random_df[f'Start {i}'] = random_df[f'End {i-1}'] + _random_day_offsets(0, 90, random_df.index)
    # ... and lasts 90-999 days.
    random_df[f'End {i}'] = random_df[f'Start {i}'] + _random_day_offsets(90, 1000, random_df.index)

# Render all dates as zero-padded month / two-digit year, e.g. "04/09".
date_cols = ["Start 1", "Start 2", "Start 3", "End 1", "End 2", "End 3"]
for col in date_cols:
    random_df[col] = random_df[col].dt.strftime('%m/%y')

# A missing role blanks its own dates plus every later role and its dates.
# ("Role 1" currently has no "N/A" in its pool, so its pass is a no-op guard.)
for slot in range(1, 4):
    blanked = [f'{kind} {j}' for j in range(slot, 4) for kind in ('Start', 'End')]
    blanked += [f'Role {j}' for j in range(slot + 1, 4)]
    random_df.loc[random_df[f'Role {slot}'] == 'N/A', blanked] = "N/A"

# 1-based applicant identifier derived from the row position.
random_df['Applicant ID'] = random_df.index + 1

# np.arange with a float step leaves representation noise; keep two decimals.
random_df['GPA'] = np.round(random_df['GPA'], 2)

# Final column order of the exported table.
new_cols = ['Applicant ID', 'School Name', 'GPA', 'Degree', 'Location', 'Gender', 'Veteran status',
       'Work authorization', 'Disability', 'Ethnicity', 'Role 1','Start 1', 'End 1', 'Role 2',
       'Start 2','End 2','Role 3', 'Start 3','End 3']
random_df = random_df[new_cols]

In [19]:
# Show the generated table as this cell's output.
random_df

Unnamed: 0,Applicant ID,School Name,GPA,Degree,Location,Gender,Veteran status,Work authorization,Disability,Ethnicity,Role 1,Start 1,End 1,Role 2,Start 2,End 2,Role 3,Start 3,End 3
0,1,Rice University,2.37,PhD,San Antonio,F,1.0,0,1,1,Senior Data Scientist,10/06,04/09,,,,,,
1,2,Rutgers University,3.11,Bachelors,Houston,,,1,0,4,Research Assistant,12/07,02/09,Lawyer,03/09,05/10,Research Assistant,08/10,08/12
2,3,Johns Hopkins University,2.20,Bachelors,Dallas,,0.0,0,,1,Junior SWE,02/07,11/09,Senior SWE,12/09,12/10,,,
3,4,University of South Carolina,3.10,Bachelors,Chicago,F,0.0,0,1,4,Junior Data Scientist,10/09,11/10,Teaching Assistant,01/11,12/12,Junior SWE,02/13,08/13
4,5,Pennsylvania State University,3.59,PhD,Dallas,F,1.0,0,0,4,Senior Data Scientist,11/08,02/10,Senior SWE,03/10,10/10,Research Assistant,12/10,07/12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1996,University of Oklahoma,2.90,Masters,Dallas,F,,0,1,2,Junior SWE,05/09,01/12,Senior Data Scientist,03/12,06/13,Senior Data Scientist,06/13,12/14
1996,1997,University of Missouri,3.17,Masters,San Antonio,M,1.0,0,,1,Senior SWE,10/01,09/02,Senior Data Scientist,09/02,02/05,Junior Data Scientist,05/05,04/07
1997,1998,University of Delaware,2.46,PhD,San Jose,M,,1,0,3,Junior Data Scientist,10/08,07/09,ML Engineer,08/09,06/11,,,
1998,1999,University of Oregon,3.44,Masters,Dallas,F,1.0,0,0,1,Junior Data Scientist,11/00,05/02,Junior Data Scientist,06/02,01/05,,,


In [20]:
# Persist the generated applicants without the row index (Applicant ID is already a column).
# NOTE(review): assumes the data/ directory already exists - confirm.
random_df.to_csv('data/random_df.csv', index=False)

In [None]:
# Attach the resume scorer's output to the generated applicants; the result is
# the input file for the candidate evaluator model.

# na_filter=False keeps the literal "N/A" placeholders as strings instead of NaN.
random_df = pd.read_csv('data/random_df.csv', na_filter=False)
results_rs = pd.read_csv('data/result_rs.csv')

# Default (inner) join on the applicant id, then drop the duplicate key column
# and give the score column its display name.
df_combined = (
    pd.merge(random_df, results_rs, left_on = 'Applicant ID', right_on = 'applicant_id')
    .drop(columns=['applicant_id'])
    .rename(columns = {'score': 'Resume score'})
)

df_combined.to_csv('data/random_df_combined.csv', index=False) # the csv to feed into candidate evaluator

In [None]:
# Attach the candidate evaluator's predictions to the applicants + resume
# scores; the result is the data set used for the fairness analysis.

# na_filter=False keeps the literal "N/A" placeholders as strings instead of NaN.
random_df_combined = pd.read_csv('data/random_df_combined.csv', na_filter=False)
# NOTE(review): unlike the other files this one is read from the working
# directory rather than data/ - confirm the path is intentional.
results_ce = pd.read_csv('result_ce.csv')

# Default (inner) join on the applicant id, then drop the duplicate key column
# and give the prediction column its display name.
df_combined2 = (
    pd.merge(random_df_combined, results_ce, left_on = 'Applicant ID', right_on = 'applicant_id')
    .drop(columns=['applicant_id'])
    .rename(columns = {'prediction': 'Candidate evaluation'})
)

df_combined2.to_csv('data/random_df_final.csv', index=False) # final csv: data + both model results

## Other code:

In [10]:
# truncated normal distribution:

import numpy as np
from scipy.stats import truncnorm

def truncated_normal(mean, sd, min_val, max_val, size=1, random_state=None):
    """Draw samples from a normal distribution truncated to [min_val, max_val].

    Parameters
    ----------
    mean, sd : float
        Location and scale of the underlying (untruncated) normal; ``sd`` must
        be positive.
    min_val, max_val : float
        Lower and upper truncation bounds, with ``min_val < max_val``.
    size : int or tuple of int, optional
        Number (or shape) of samples to draw. Defaults to 1.
    random_state : None, int, or numpy random generator, optional
        Passed through to ``truncnorm.rvs``; give a fixed value for
        reproducible draws. ``None`` uses NumPy's global random state.

    Returns
    -------
    numpy.ndarray
        Samples lying within ``[min_val, max_val]``.

    Raises
    ------
    ValueError
        If ``sd`` is not positive or ``min_val`` is not below ``max_val``.
    """
    if np.any(np.asarray(sd) <= 0):
        raise ValueError("sd must be positive")
    if np.any(np.asarray(min_val) >= np.asarray(max_val)):
        raise ValueError("min_val must be smaller than max_val")
    # truncnorm expects its bounds in standard-deviation units relative to the mean.
    a, b = (min_val - mean) / sd, (max_val - mean) / sd
    return truncnorm.rvs(a, b, loc=mean, scale=sd, size=size, random_state=random_state)

# Example: 1000 draws from a normal(mean=10, sd=2) restricted to [5, 15].
mean, sd = 10, 2            # parameters of the untruncated normal
min_val, max_val = 5, 15    # truncation bounds

data = truncated_normal(mean, sd, min_val, max_val, size=1000)

# The sample is left in `data`; nothing is printed here.