### Reducing dimensionality
The following is based on the classification notebook.

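We reduce the dimensionality by keeping only a subset of the features and collapsing several of them into coarser, three-level (ternary) categories.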

In [29]:
# Columns kept for the reduced dataset. The order here becomes the column
# order of the subset frame.
subset_of_features = [
    "how_long_have_you_worked_for_your_current_employer",
    "which_of_the_following_groups_best_describes_your_employment_level",
    "approximately_what_is_the_size_of_your_employer's_workforce",
    "i_pursue_job_opportunities_even_if_i_do_not_fulfill_all_of_the_requirements_on_a_job_description",
    "which_of_the_following_groups_best_describes_the_organization_that_you_work_for",
    "leadership_opportunities",
    "i_let_my_manager_know_what_i_want_with_the_expectation_that_it_will_generate_growth_opportunities",
    "i_avoid_discussions_about_salary_and_incentives",
    "what_country_do_you_currently_live_in",
    "i_trust_that_i_am_compensated_appropriately_for_my_accomplishments",
    "family_commitments",
    "yes_one_or_more_children",
]

In [30]:
import pandas as pd
import numpy as np

# Load the encoded survey responses and list the available columns.
file_path = "survey_2023.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

print(df.columns.tolist())

['how_long_have_you_worked_for_your_current_employer', 'which_of_the_following_groups_best_describes_your_employment_level', 'which_of_the_following_groups_best_describes_the_organization_that_you_work_for', 'flexible_working_arrangements', 'culturalreligious_practices', "approximately_what_is_the_size_of_your_employer's_workforce", 'i_believe_i_am_being_paid_fairly_for_the_work_that_i_do', 'family_commitments', 'yes_one_or_more_children', 'leadership_opportunities', 'black', 'white', 'asian', 'latinaeox', 'recognition_from_colleaguespeers', 'what_country_do_you_currently_live_in', 'is_your_first_language_the_primary_language_of_your_workplace_', 'socialize_with_peers_within_my_company', 'i_pursue_job_opportunities_even_if_i_do_not_fulfill_all_of_the_requirements_on_a_job_description', 'i_avoid_discussions_about_salary_and_incentives', 'i_advocate_for_myself_when_appropriate_so_that_my_manager_knows_what_i_want_and_expect', 'i_trust_that_i_am_compensated_appropriately_for_my_accomplish

In [31]:
# Size of the cartesian product of the per-column value sets, i.e. how many
# distinct rows the current encoding could express.
all_unique_counts = []
for col in df.columns:
    unique_values = df[col].unique()  # computed once, reused for display and count
    print(f"{col}: {unique_values}")
    all_unique_counts.append(len(unique_values))

# dtype=object makes NumPy multiply Python ints (arbitrary precision), so the
# product cannot silently wrap around a fixed-width integer when many columns
# are multiplied together.
print(f"cartesian product of all unique values: {np.prod(all_unique_counts, dtype=object)}")


how_long_have_you_worked_for_your_current_employer: [1 0]
which_of_the_following_groups_best_describes_your_employment_level: [2 4 1 3 5 0]
which_of_the_following_groups_best_describes_the_organization_that_you_work_for: [8 5 7 4 1 9 6 2 3 0]
flexible_working_arrangements: [1 0]
culturalreligious_practices: [0 1]
approximately_what_is_the_size_of_your_employer's_workforce: [5 4 3 0 6 2 7 1]
i_believe_i_am_being_paid_fairly_for_the_work_that_i_do: [1 5 4 2 0 3]
family_commitments: [1 0]
yes_one_or_more_children: [0 1]
leadership_opportunities: [1 4 2 3 0 5]
black: [0 1]
white: [1 0]
asian: [0 1]
latinaeox: [0 1]
recognition_from_colleaguespeers: [5 4 1 2 3 0]
what_country_do_you_currently_live_in: [10  9  4  1  2  3  7  6  5  8]
is_your_first_language_the_primary_language_of_your_workplace_: [4 1 3 2 0]
socialize_with_peers_within_my_company: [1 0]
i_pursue_job_opportunities_even_if_i_do_not_fulfill_all_of_the_requirements_on_a_job_description: [4 5 2 3 1 0]
i_avoid_discussions_about_sa

In [32]:
# Keep only the selected feature columns. `df` is rebound here, so the
# full-width frame loaded from the CSV is no longer reachable under this name.
df = df[subset_of_features]
df.columns

Index(['how_long_have_you_worked_for_your_current_employer',
       'which_of_the_following_groups_best_describes_your_employment_level',
       'approximately_what_is_the_size_of_your_employer's_workforce',
       'i_pursue_job_opportunities_even_if_i_do_not_fulfill_all_of_the_requirements_on_a_job_description',
       'which_of_the_following_groups_best_describes_the_organization_that_you_work_for',
       'leadership_opportunities',
       'i_let_my_manager_know_what_i_want_with_the_expectation_that_it_will_generate_growth_opportunities',
       'i_avoid_discussions_about_salary_and_incentives',
       'what_country_do_you_currently_live_in',
       'i_trust_that_i_am_compensated_appropriately_for_my_accomplishments',
       'family_commitments', 'yes_one_or_more_children'],
      dtype='object')

In [33]:
# Agreement-scale questions (recoded with ternary_agree_scale).
likert_agree_cols = [
    "i_pursue_job_opportunities_even_if_i_do_not_fulfill_all_of_the_requirements_on_a_job_description",
    "i_avoid_discussions_about_salary_and_incentives",
    "i_trust_that_i_am_compensated_appropriately_for_my_accomplishments",
    "i_let_my_manager_know_what_i_want_with_the_expectation_that_it_will_generate_growth_opportunities",
]

# Satisfaction-scale questions (recoded with ternary_satisfaction_scale).
likert_satisfaction_cols = ["leadership_opportunities"]

def ternary_agree_scale(x):
    """
    Collapse a 5-point agreement answer into three levels.

    columns with:
       1 = Strongly disagree
       2 = Disagree
       3 = Not applicable
       4 = Agree
       5 = Strongly agree

    map to:
       0 = Not applicable or Missing (0, 3, or any other value)
       1 = Disagree (1 or 2)
       2 = Agree (4 or 5)

    Parameters
    ----------
    x : scalar
        Original answer code.

    Returns
    -------
    int
        0, 1 or 2 as described above.
    """
    if x in (1, 2):
        return 1
    if x in (4, 5):
        return 2
    # Everything else (0, 3, NaN, unexpected codes) is treated as missing.
    return 0

def ternary_satisfaction_scale(x):
    """
    Collapse a 5-point satisfaction answer into three levels.

    columns with:
       1 = Very dissatisfied
       2 = Dissatisfied
       3 = Not applicable
       4 = Satisfied
       5 = Very satisfied

    map to:
       0 = Not applicable or Missing (0, 3, or any other value)
       1 = Dissatisfied (1 or 2)
       2 = Satisfied (4 or 5)

    Parameters
    ----------
    x : scalar
        Original answer code.

    Returns
    -------
    int
        0, 1 or 2 as described above.
    """
    if x in (1, 2):
        return 1
    if x in (4, 5):
        return 2
    # Everything else (0, 3, NaN, unexpected codes) is treated as missing.
    return 0

# Work on a copy so the subset frame `df` keeps its original codes.
df_transformed = df.copy()

# Pair each recoding function with the columns it applies to.
for recode, columns in (
    (ternary_agree_scale, likert_agree_cols),
    (ternary_satisfaction_scale, likert_satisfaction_cols),
):
    for col in columns:
        df_transformed[col] = df_transformed[col].apply(recode)

def ternary_workforce_size(x):
    """
    Bucket the employer workforce-size code into three levels.

    Result:
      0 -> Missing value (original code 0, or any unrecognised code)
      1 -> Less than 2000 employees (original codes 1,2,3,4,5)
      2 -> More than 2000 employees (original codes 6,7)
    """
    small_employer_codes = (1, 2, 3, 4, 5)
    large_employer_codes = (6, 7)

    if x in small_employer_codes:
        return 1
    if x in large_employer_codes:
        return 2
    # Code 0 and anything unexpected both count as missing.
    return 0

# Recode the workforce-size column in place on the transformed frame.
workforce_col = "approximately_what_is_the_size_of_your_employer's_workforce"
df_transformed[workforce_col] = df_transformed[workforce_col].apply(ternary_workforce_size)


def ternary_employment_level(x):
    """
    Bucket the employment-level code into three levels.

    Result:
        0 -> Missing value or Not listed (codes 0, 5, or any unrecognised code)
        1 -> Entry-level or Intermediate (codes 1, 2)
        2 -> Mid-level or Senior/exec-level (codes 3, 4)
    """
    # (bucket, original codes) pairs; the code groups are disjoint.
    buckets = (
        (1, (1, 2)),
        (2, (3, 4)),
    )
    for bucket, codes in buckets:
        if x in codes:
            return bucket
    # Codes 0 and 5, plus anything unexpected, fall through to "missing".
    return 0

# Recode the employment-level column in place on the transformed frame.
employment_level_col = "which_of_the_following_groups_best_describes_your_employment_level"
df_transformed[employment_level_col] = df_transformed[employment_level_col].apply(ternary_employment_level)


In [34]:
# cartesian product of unique values in data
all_unique_counts = []
for col in df_transformed.columns:
    unique_values = df_transformed[col].unique()  # computed once, reused for display and count
    print(f"{col}: {unique_values}")
    all_unique_counts.append(len(unique_values))

# The size of the cartesian product is the product of the per-column counts.
# dtype=object makes NumPy multiply Python ints (arbitrary precision), so the
# result cannot silently wrap around a fixed-width integer.
print(f"cartesian product of all unique values: {np.prod(all_unique_counts, dtype=object)}")

how_long_have_you_worked_for_your_current_employer: [1 0]
which_of_the_following_groups_best_describes_your_employment_level: [1 2 0]
approximately_what_is_the_size_of_your_employer's_workforce: [1 0 2]
i_pursue_job_opportunities_even_if_i_do_not_fulfill_all_of_the_requirements_on_a_job_description: [2 1 0]
which_of_the_following_groups_best_describes_the_organization_that_you_work_for: [8 5 7 4 1 9 6 2 3 0]
leadership_opportunities: [1 2 0]
i_let_my_manager_know_what_i_want_with_the_expectation_that_it_will_generate_growth_opportunities: [2 1 0]
i_avoid_discussions_about_salary_and_incentives: [1 2 0]
what_country_do_you_currently_live_in: [10  9  4  1  2  3  7  6  5  8]
i_trust_that_i_am_compensated_appropriately_for_my_accomplishments: [1 2 0]
family_commitments: [1 0]
yes_one_or_more_children: [0 1]
cartesian product of all unique values: 1749600


In [35]:
# Persist the recoded frame; index=False keeps the row index out of the CSV.
df_transformed.to_csv("survey_2023_transformed.csv", index=False)