### Import Packages and CSV

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
from scipy.stats import chi2_contingency
from scipy.stats import mannwhitneyu
import re

In [9]:
# Load cleaned data

cleaned_data = pd.read_csv('../predict_re/survey_responses.csv')

# Normalise headers to snake_case: trim, lower-case, spaces -> underscores,
# then strip parentheses and apostrophes. `regex` is passed explicitly because
# its default differs between pandas versions and '(' / ')' are regex
# metacharacters.
cleaned_data.columns = (
    cleaned_data.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace(r"[()']", '', regex=True)
)

### Tidy Data

In [10]:
# Work on a copy so the loaded frame is left untouched and this cell can be
# re-run on its own.
data = cleaned_data.copy()

# Recode class year to an integer and gender to a short label.
# NOTE(review): Series.map turns any value that is not a key of the mapping
# into NaN -- confirm the survey offers no other response options.
data['year'] = data['year'].map({'First-year': 1,
                                 'Second-year': 2,
                                 'Third-year': 3,
                                 'Fourth-year': 4})
data['gender'] = data['gender'].map({'Female': 'F', 'Male': 'M', 'Non-binary': 'NB', 'Prefer not to specify': 'unspecified'})

# 1 if currently in a relationship, else 0.
data['relationship_status'] = (data['relationship_status'] == 'In a relationship').astype(int)

# Long-distance status is only kept for respondents in a relationship; everyone
# else gets NaN. Vectorised mask instead of a row-wise apply.
data['is_long_distance'] = data['is_long_distance'].where(data['relationship_status'] == 1)

# NOTE(review): as above, sexualities outside these three keys become NaN.
data['sexuality'] = data['sexuality'].str.lower().map({'heterosexual/straight': 'heterosexual', 'homosexual/gay': 'homosexual', 'prefer not to specify': 'unspecified'})

# Binary indicators derived from the raw categorical answers.
data['commute'] = (data['residence_type'] == 'Commute').astype(int)
data['job_status'] = (data['job_status'].isin(['Part-Time', 'Employed'])).astype(int)

# minor -> 0 when missing or (case/whitespace-insensitively) 'none', else 1.
# Casting to str first keeps this from failing on non-string entries.
data['minor'] = (
    data['minor']
    .fillna('none')
    .astype(str)
    .str.strip()
    .str.lower()
    .ne('none')
    .astype(int)
)

# Drop the raw residence column and the first 'is_transfer' column, then let
# 'is_transfer.1' take over the 'is_transfer' name.
data = (
    data
    .drop(columns=['residence_type', 'is_transfer'])
    .rename(columns={'is_transfer.1': 'is_transfer'})
)

In [11]:
# Function to map majors to departments

def get_dept(major):
    """Map a free-text major to the department label(s) it falls under.

    The text is lower-cased and split on '&' so that majors written as
    "A & B" produce one lookup per part. Each part is tested against the
    keyword rules below in order; the first rule that matches wins. Parts
    matching no rule fall back to 'Arts/Humanities'.

    Parameters
    ----------
    major : str or missing value
        Raw major text. A missing value (NaN / None) yields ['Undeclared'].

    Returns
    -------
    list of str
        Distinct department labels in order of first appearance. The list can
        be empty when every part is one of the skipped fragments.
    """
    # Must be checked before any string method is called: NaN has no .strip().
    if pd.isna(major):
        return ['Undeclared']

    # (pattern, label) pairs, evaluated top to bottom. Order matters: e.g. a
    # part containing both "bio" and "engineering" is labelled 'NaturalSci'.
    # A label of None marks fragments that are the tail of a single major name
    # containing '&' and therefore must not count as a department of their own.
    rules = [
        (r'bio', 'NaturalSci'),
        (r'engineering', 'Engineering'),
        (r'psych|sociology|political science|anthropology|urban|ethnic|economics', 'SocSci'),
        (r'computer|data', 'CS/DS'),
        (r'chem|environment|geo', 'NaturalSci'),
        (r'cognitive', 'CogSci'),
        (r'planning|interaction|neural|molecular|evolution', None),
    ]

    dept = []
    for part in major.lower().split('&'):
        part = part.strip()
        for pattern, label in rules:
            if re.search(pattern, part):
                break
        else:
            # No rule matched this part.
            label = 'Arts/Humanities'
        if label is not None:
            dept.append(label)

    # De-duplicate while keeping first-seen order, so the result (and hence
    # which label ends up first/second) is the same on every run.
    return list(dict.fromkeys(dept))

In [12]:
# Derive department columns from the major text. 'dept' holds the list returned
# by get_dept; 'dept1' / 'dept2' are its first and second entries (NaN when
# absent), and 'double_major' is 1 only when that list has more than one entry.

dept_lists = data['major'].apply(get_dept)
data['dept'] = dept_lists
data['double_major'] = dept_lists.map(len).gt(1).astype(int)
data['dept1'] = dept_lists.map(lambda depts: depts[0] if depts else np.nan)
data['dept2'] = dept_lists.map(lambda depts: depts[1] if len(depts) > 1 else np.nan)

In [13]:
# Keep only the columns listed below, in this order, for easier viewing.

column_order = [
    # demographics
    'gender', 'race', 'is_hispaniclatino', 'sexuality',
    # enrolment
    'year', 'is_transfer', 'commute', 'college',
    # academics
    'major', 'minor', 'dept1', 'dept2', 'double_major', 'is_stem', 'gpa',
    # clubs and activities
    'club_general', 'club_professional', 'club_recreational', 'club_athletic',
    'club_cultural', 'club_skilldev', 'club_other', 'is_greek',
    'job_status', 'outings_per_week',
    # relationships
    'relationship_status', 'longest_relationship_months',
    'num_romantic_involvements', 'met_partner_ucsd', 'is_long_distance',
]
data = data[column_order]

In [14]:
# Preview the first five rows of the tidied frame.
data.head(n=5)

Unnamed: 0,gender,race,is_hispaniclatino,sexuality,year,is_transfer,commute,college,major,minor,...,club_skilldev,club_other,is_greek,job_status,outings_per_week,relationship_status,longest_relationship_months,num_romantic_involvements,met_partner_ucsd,is_long_distance
0,F,White/Caucasian,0,heterosexual,3,0,0,Eighth,General Biology,0,...,0,0,0,1,2.0,0,0.0,0,0,
1,F,Asian,0,heterosexual,3,0,0,Muir,Data Science,1,...,1,0,0,0,1.0,0,24.0,2,0,
2,F,Multiracial,0,,2,0,0,Seventh,Mathematics–Economics,1,...,0,0,0,1,4.0,0,10.0,2,0,
3,M,White/Caucasian,0,heterosexual,1,0,0,Marshall,Mathematics–Economics,0,...,0,0,0,0,2.0,0,4.0,1,0,
4,F,Asian,0,heterosexual,1,0,0,Revelle,Cognitive Science,0,...,0,0,1,0,2.0,0,8.0,2,0,


### Diagnostics for binary variables

In [15]:
# Balance check for binary columns: report every column in which a single
# value accounts for more than 90% of the non-missing responses.

binary_cols = [
    'is_hispaniclatino', 'commute', 'is_stem', 'is_greek', 'job_status',
    'relationship_status', 'is_long_distance',
    'club_general', 'club_professional', 'club_recreational', 'club_athletic',
    'club_cultural', 'club_skilldev', 'club_other', 'double_major',
]

for col in binary_cols:
    # Relative frequency of each value (value_counts leaves out NaN by default).
    counts = data[col].value_counts(normalize=True)
    if counts.gt(0.9).any():
        print(f"High imbalance detected in column '{col}': {counts.to_dict()}")

High imbalance detected in column 'club_general': {1: 0.9363636363636364, 0: 0.06363636363636363}
High imbalance detected in column 'club_other': {0: 0.9545454545454546, 1: 0.045454545454545456}
High imbalance detected in column 'double_major': {0: 0.9272727272727272, 1: 0.07272727272727272}


### Create a one-hot encoded dataframe

In [16]:
# One-hot encode categorical variables
onehot_cols = ['year', 'dept1', 'dept2'] # columns to one-hot encode (subject to change, but keep dept1 and dept2)

data_onehot = pd.get_dummies(data, columns=onehot_cols, dtype=int)

# Combine the dept1_* / dept2_* indicators into one dept_<label> column per
# department, so a department counts whether it was listed first or second.
# Columns are selected by their exact 'dept1_' / 'dept2_' prefix (not by a
# substring or suffix test) so no unrelated column can be summed or dropped,
# and the label is everything after the first '_' so labels may contain '_'.
dept_prefixes = ('dept1_', 'dept2_')
dept_dummy_cols = [col for col in data_onehot.columns if col.startswith(dept_prefixes)]

# Sorted so the resulting column order is identical on every run.
suffixes = sorted({col.split('_', 1)[1] for col in dept_dummy_cols})
for suffix in suffixes:
    cols_to_sum = [col for col in dept_dummy_cols if col.split('_', 1)[1] == suffix]
    data_onehot[f'dept_{suffix}'] = data_onehot[cols_to_sum].sum(axis=1)
    data_onehot = data_onehot.drop(columns=cols_to_sum)

In [17]:
# Build the predictor frame from the one-hot encoded data.
# Identifying-characteristic columns are not used in prediction.
identity_cols = ['gender', 'race', 'is_hispaniclatino', 'sexuality', 'major', 'is_long_distance', 'college']

# Columns removed to avoid multicollinearity with the one-hot encoded columns
# (all dept_* columns are always retained).
collinear_cols = ['year_1', 'double_major']

data_onehot_x = (
    data_onehot
    .drop(columns=identity_cols)
    .drop(columns=collinear_cols)
)

In [18]:
# Preview the first five rows of the predictor frame.
data_onehot_x.head(n=5)

Unnamed: 0,is_transfer,commute,minor,is_stem,gpa,club_general,club_professional,club_recreational,club_athletic,club_cultural,...,met_partner_ucsd,year_2,year_3,year_4,dept_NaturalSci,dept_SocSci,dept_Engineering,dept_CogSci,dept_CS/DS,dept_Arts/Humanities
0,0,0,0,1,3.69,1,1,0,0,0,...,0,0,1,0,1,0,0,0,0,0
1,0,0,1,1,3.72,1,1,0,0,0,...,0,0,1,0,0,0,0,0,1,0
2,0,0,1,1,3.7,1,1,0,0,0,...,0,1,0,0,0,1,0,0,0,0
3,0,0,0,1,4.0,1,1,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,0,0,1,,1,1,0,1,0,...,0,0,0,0,0,0,0,1,0,0
