In [1]:
import pandas as pd

file_path = 'survey_2023_raw.csv'
df = pd.read_csv(file_path)

# keep the headers exactly as they appear in the CSV before normalising them
cached_columns = df.columns.copy()

# normalise the headers: lower-case, with every space replaced by "_"
df.columns = [
    raw_name.lower().replace(' ', '_')
    for raw_name in cached_columns
]

# map new column names to old column names
column_map = {
    new_name: old_name
    for new_name, old_name in zip(df.columns, cached_columns)
}


### First, define the intended order of the categories for each ordinal column

In [3]:
# Response scales shared by several questions, listed from the first (lowest)
# to the last (highest) position. "Not applicable" occupies the middle slot.
_AGREEMENT_SCALE = (
    "Strongly disagree",
    "Disagree",
    "Not applicable",
    "Agree",
    "Strongly agree",
)
# Same scale, but with the top label spelled "Strongly Agree" (capital "A").
# Labels are matched literally against the column values when the column is
# turned into a categorical, so each spelling has to match the raw answers of
# the question it is used for -- TODO confirm both spellings occur in the CSV.
_AGREEMENT_SCALE_CAPITAL_A = _AGREEMENT_SCALE[:-1] + ("Strongly Agree",)
_SATISFACTION_SCALE = (
    "Very dissatisfied",
    "Dissatisfied",
    "Not applicable",
    "Satisfied",
    "Very satisfied",
)

# Desired category order per (normalised) column name.
# Every value is its own list object, ordered from first to last category.
remapped_categories = {
    # 1. tenure with the current employer, shortest to longest
    "how_long_have_you_worked_for_your_current_employer?": [
        "Less than 1 year",
        "1 to 5 years",
        "6 to 10 years",
        "11 to 20 years",
        "21 years or more",
    ],
    # 2. employment level ("Not listed" is put at the end)
    "which_of_the_following_groups_best_describes_your_employment_level?": [
        "Entry-level",
        "Intermediate",
        "Mid-level",
        "Senior or executive-level",
        "Not listed (please specify)",
    ],
    # 3. size of the employer's workforce, smallest to largest
    "approximately_what_is_the_size_of_your_employer's_workforce?": [
        "1",
        "2 to 10",
        "11 to 100",
        "101 to 500",
        "501 to 2,000",
        "2,001 to 10,000",
        "10,001 to 20,001+",
    ],
    # 4. paid fairly: strongly disagree -> strongly agree
    "i_believe_i_am_being_paid_fairly_for_the_work_that_i_do.": list(_AGREEMENT_SCALE),
    # 5. satisfaction with leadership opportunities
    "leadership_opportunities": list(_SATISFACTION_SCALE),
    # 6. satisfaction with recognition from colleagues/peers
    "recognition_from_colleagues/peers": list(_SATISFACTION_SCALE),
    # 7. pursues job opportunities (top label uses the capital-"A" spelling)
    "i_pursue_job_opportunities_even_if_i_do_not_fulfill_all_of_the_requirements_on_a_job_description.": list(_AGREEMENT_SCALE_CAPITAL_A),
    # 8. avoids discussions about salary and incentives
    "i_avoid_discussions_about_salary_and_incentives.": list(_AGREEMENT_SCALE),
    # 9. advocates for themselves
    "i_advocate_for_myself_when_appropriate_so_that_my_manager_knows_what_i_want_and_expect.": list(_AGREEMENT_SCALE),
    # 10. trusts that compensation is appropriate
    "i_trust_that_i_am_compensated_appropriately_for_my_accomplishments.": list(_AGREEMENT_SCALE),
    # 11. lets the manager know what they want (capital-"A" spelling again)
    "i_let_my_manager_know_what_i_want,_with_the_expectation_that_it_will_generate_growth_opportunities.": list(_AGREEMENT_SCALE_CAPITAL_A),
}

def enforce_ordinal_order(df, col, desired_order):
    """Re-cast ``df[col]`` in place as an ordered categorical.

    The column is replaced by a ``pd.Categorical`` whose categories are
    exactly ``desired_order`` (first entry = lowest). The column stays
    categorical here; no integer codes are produced by this function.

    Parameters:
        df: DataFrame to modify in place.
        col: name of the column to re-cast; if it is not a column of ``df``
            nothing happens.
        desired_order: list of category labels, lowest first. Per the pandas
            ``Categorical`` contract, values not listed here become NaN.

    Returns:
        None.
    """
    if col in df.columns:
        ordered_values = pd.Categorical(
            df[col],
            categories=desired_order,
            ordered=True,
        )
        df[col] = ordered_values

def reorder_selected_columns(df, remapped_categories):
    """Apply ``enforce_ordinal_order`` to every column named in the mapping.

    Parameters:
        df: DataFrame passed through to ``enforce_ordinal_order``.
        remapped_categories: mapping of column name -> ordered list of
            category labels for that column.

    Returns:
        None.
    """
    for col in remapped_categories:
        enforce_ordinal_order(df, col, remapped_categories[col])


In [4]:
# NOTE: task is predicting whether each respondent has worked for their
#       current employer for > 10 years (i.e., target = 1 if they have, 0 otherwise).
#       `target` is the normalised (lower-case, underscored) header of that question.
target = "how_long_have_you_worked_for_your_current_employer?"

# NOTE: the feature columns to keep, chosen from feature importance.
#       - Names must match the normalised headers exactly (including any
#         trailing "?", "." or "_"): names that are not found in the frame are
#         dropped when the frame is subset to these columns.
#       - The order of this list is the column order of the resulting frame.
#       - A ".1" suffix is what pandas.read_csv appends to a repeated header,
#         so those entries presumably refer to the second occurrence of that
#         question in the raw CSV -- TODO confirm against the raw file.
train_set = [
    "which_of_the_following_groups_best_describes_your_employment_level?",
    "which_of_the_following_groups_best_describes_the_organization_that_you_work_for?",
    "flexible_working_arrangements:_including_but_not_limited_to_hybrid_or_remote_work,_short-term_disability_benefits,_flex_time/alternative_schedules,_job_sharing,_temporary_or_permanent_switch_to_part_time,_and_emergency_leave.1",
    "cultural/religious_practices:_including_but_not_limited_to_extended_or_flexible_holidays,_dedicated_prayer_rooms.1",
    "approximately_what_is_the_size_of_your_employer's_workforce?",
    "i_believe_i_am_being_paid_fairly_for_the_work_that_i_do.",
    "family_commitments:_including_but_not_limited_to_parental_leave,_bereavement_leave,_adoption/infertility_support,_child_care_supports,_elder_care_supports.1",
    "yes,_one_or_more_children",
    "leadership_opportunities",
    "black",
    "white",
    "asian",
    "latina/e/o/x",
    "recognition_from_colleagues/peers",
    "what_country_do_you_currently_live_in?",
    "is_your_first_language_the_primary_language_of_your_workplace?_",
    "socialize_with_peers_within_my_company",
    "i_pursue_job_opportunities_even_if_i_do_not_fulfill_all_of_the_requirements_on_a_job_description.",
    "i_avoid_discussions_about_salary_and_incentives.",
    "i_advocate_for_myself_when_appropriate_so_that_my_manager_knows_what_i_want_and_expect.",
    "i_trust_that_i_am_compensated_appropriately_for_my_accomplishments.",
    "i_let_my_manager_know_what_i_want,_with_the_expectation_that_it_will_generate_growth_opportunities."
]

# Keep only the target and the feature columns. Names that are not present
# in the frame are skipped here instead of raising a KeyError.
cols_to_keep = [target] + train_set
cols_to_keep = [c for c in cols_to_keep if c in df.columns]
df = df[cols_to_keep].copy()

# some columns have a specific order to their categories
# so we need to remap them to enforce this order
reorder_selected_columns(df, remapped_categories)

# top 10 countries by respondent count
country_col = 'what_country_do_you_currently_live_in?'
top_10_countries = df[country_col].value_counts().head(10).index

# just keep these top 10 countries.
# .copy() makes the filtered frame an independent object, so the column
# assignments that follow write to it directly and do not raise
# SettingWithCopyWarning.
df = df[df[country_col].isin(top_10_countries)].copy()

# recast the country column as a categorical
df[country_col] = pd.Categorical(df[country_col])

# Encode every column as integer codes and remember what each code means.
# column_mapping: column name (as it is *before* the renaming done in this
# loop) -> {code: original text label}
column_mapping = {}

# characters removed from column names; "_" is kept
_chars_to_strip = str.maketrans("", "", " /,.-?")

for col in list(df.columns):
    df[col] = df[col].astype('category')
    categories = df[col].cat.categories

    # category codes start at 0 and NaN is coded as -1;
    # shifting by +1 makes NaN -> 0 and the first category -> 1
    codes = df[col].cat.codes + 1
    df[col] = codes

    if len(categories) == 1:
        # a column with a single category is stored as a flag:
        # 1 = that category is present, 0 = value was missing
        column_mapping[col] = {0: 'No', 1: 'Yes'}
    else:
        # same +1 shift as the codes above
        column_mapping[col] = dict(enumerate(categories, start=1))
        if len(categories) > 1:
            # two or more categories: code 0 stands for a missing answer
            column_mapping[col][0] = "Missing value"

    # new name: the text before the first ":" (if any), with the
    # characters listed in _chars_to_strip removed
    new_col = col.partition(":")[0].translate(_chars_to_strip)

    # rename the column
    df.rename(columns={col: new_col}, inplace=True)

target_col = "how_long_have_you_worked_for_your_current_employer"

# Drop respondents whose tenure answer is missing (code 0).
# .copy() makes the filtered frame an independent object, so the assignment
# below does not raise SettingWithCopyWarning.
df = df[df[target_col] != 0].copy()

# Turn tenure into a binary target: 1 if > 10 years, 0 otherwise.
# With the category order declared in remapped_categories and the +1 code
# shift, codes 1, 2, 3 are "Less than 1 year", "1 to 5 years" and
# "6 to 10 years"; every other code maps to 1.
df[target_col] = (~df[target_col].isin([1, 2, 3])).astype(int)

# track the distribution of values for each column
# (column name -> {value: share of rows})
column_distributions = {}
for col in df.columns:
    if pd.api.types.is_numeric_dtype(df[col]):
        column_distributions[col] = df[col].value_counts(normalize=True).to_dict()

df.to_csv('survey_2023.csv', index=False)


In [5]:
# restrict the new-name -> original-name lookup to the columns that were kept
column_map = {
    new_name: original_name
    for new_name, original_name in column_map.items()
    if new_name in cols_to_keep
}

In [12]:
# inspect the column names of the processed frame
df.columns

Index(['how_long_have_you_worked_for_your_current_employer',
       'which_of_the_following_groups_best_describes_your_employment_level',
       'which_of_the_following_groups_best_describes_the_organization_that_you_work_for',
       'flexible_working_arrangements', 'culturalreligious_practices',
       'approximately_what_is_the_size_of_your_employer's_workforce',
       'i_believe_i_am_being_paid_fairly_for_the_work_that_i_do',
       'family_commitments', 'yes_one_or_more_children',
       'leadership_opportunities', 'black', 'white', 'asian', 'latinaeox',
       'recognition_from_colleaguespeers',
       'what_country_do_you_currently_live_in',
       'is_your_first_language_the_primary_language_of_your_workplace_',
       'socialize_with_peers_within_my_company',
       'i_pursue_job_opportunities_even_if_i_do_not_fulfill_all_of_the_requirements_on_a_job_description',
       'i_avoid_discussions_about_salary_and_incentives',
       'i_advocate_for_myself_when_appropriate_so_that_

In [8]:
# names present in the new-name -> original-name lookup
list(column_map)

['family_commitments:_including_but_not_limited_to_parental_leave,_bereavement_leave,_adoption/infertility_support,_child_care_supports,_elder_care_supports.1',
 'cultural/religious_practices:_including_but_not_limited_to_extended_or_flexible_holidays,_dedicated_prayer_rooms.1',
 'flexible_working_arrangements:_including_but_not_limited_to_hybrid_or_remote_work,_short-term_disability_benefits,_flex_time/alternative_schedules,_job_sharing,_temporary_or_permanent_switch_to_part_time,_and_emergency_leave.1',
 'recognition_from_colleagues/peers',
 'leadership_opportunities',
 'socialize_with_peers_within_my_company',
 'i_let_my_manager_know_what_i_want,_with_the_expectation_that_it_will_generate_growth_opportunities.',
 'i_pursue_job_opportunities_even_if_i_do_not_fulfill_all_of_the_requirements_on_a_job_description.',
 'i_trust_that_i_am_compensated_appropriately_for_my_accomplishments.',
 'i_advocate_for_myself_when_appropriate_so_that_my_manager_knows_what_i_want_and_expect.',
 'i_belie

In [10]:
# spot-check one encoded column: the values are the shifted category codes
df['i_pursue_job_opportunities_even_if_i_do_not_fulfill_all_of_the_requirements_on_a_job_description']

0       4
1       4
2       4
3       4
4       5
       ..
1750    3
1751    5
1752    3
1753    3
1754    4
Name: i_pursue_job_opportunities_even_if_i_do_not_fulfill_all_of_the_requirements_on_a_job_description, Length: 1400, dtype: int8