## Feature Engineering - Encode Categorical Features

In [46]:
import pandas as pd
import numpy as np

### 1. Load Data

In [47]:
# Load the processed experiment table; the path is relative to the notebook's working directory.
exp = pd.read_csv(filepath_or_buffer='./data/processed/exp_r2r.csv')

In [48]:
# Copy the current row index into an 'index' column and renumber the rows from 0
# (avoids an error in cross validation).
# Not idempotent: running this cell again would add a further 'level_0' column.
exp = exp.reset_index()

### 2. Preprocessing

(1) Generate Labels

In [49]:
# Rating columns grouped by sign: the *_n columns feed the negative scores and the
# *_p columns feed the positive scores.
negative_cols = ['la_n', 'n', 'ha_n']
positive_cols = ['la_p', 'p', 'ha_p']

# Strongest single rating on each side.
exp['max_n'] = exp[negative_cols].max(axis='columns')
exp['max_p'] = exp[positive_cols].max(axis='columns')

# Average rating on each side, and their difference as a continuous target.
exp['valence_p'] = exp[positive_cols].mean(axis='columns')
exp['valence_n'] = exp[negative_cols].mean(axis='columns')
exp['valence_reg'] = exp['valence_p'].sub(exp['valence_n'])

# Binary target: 1 when the strongest negative rating is at least as high as the
# strongest positive one (ties count as 1), otherwise 0. Rows where the comparison
# involves a missing value get 0.
exp['valence'] = exp['max_n'].ge(exp['max_p']).astype('int64')

(2) Encoding Categorical Data

i. Education

In [50]:
# Raw values of the free-text 'Education' column, grouped by education level.
# Matching downstream is exact (Series.isin), so spelling, case and spacing matter.
# The numeric strings ('12.0', '16.0', ...) are matched as text as well.
#
# The four lists are kept pairwise disjoint so that each respondent receives exactly
# one education flag. Previously 'B.S. Business Management' and 'BS Business' were
# listed under both BA and MS, and 'Masters; PhD ongoing' under both MS and PhD,
# which set two flags at once and labelled bachelor's holders as 'master'.
#
# NOTE(review): values such as '13.0', '15.0' or '17.0' are not covered by any list;
# confirm they do not occur in the data.
HS = [
    '12.0', 'high school', 'High School', 'high school + cosmetology school',
    'High School Graduate',
]
BA = [
    'Some college', 'B.A. English/Spanish B.S. Nursing', 'B.S. working on MD and MTS',
    'B.S.  Mechanical Engineering', "Bachelor's Degree", 'B.A.', 'College student 3 years',
    'B.A. from Cornell Univ.', 'Bachelors degree in Social Work', 'BS', 'BA Vanderbilt',
    'BS Communications',
    "college graduate (bachelor's degree in Music from Belmont University)",
    'Bachelor of Arts', 'BSN- bachelors of science in nursing', 'BSE Princeton',
    'Obtained B.S., 2011', 'bachelor of science', 'Bachelor of Social Work', 'B.S.',
    'BBA - Bachelors', 'BSC Communications', 'B.A. 1988',
    'B.S. degree dietetics, internship', 'Bachelor of Arts + Equivalency Degree',
    'Bachelors', 'B.S. Business Management', 'B.S. Psychology Texas A&M',
    'BS Journalism; BS Nutrition', 'B.E.Sc.', 'Bachelor Degree Music Ed', 'NDCDP',
    'BS Math/Computer Science', 'BS Business', 'college graduate', '14.0', '16.0',
]
# NOTE(review): 'College grad' reads like a bachelor's-level answer; it is left here
# unchanged because it is not duplicated elsewhere — confirm the intended level.
MS = [
    'MBA', 'High School Diploma, BA, MBA', 'College grad', 'College, Masters',
    "Bachelor of science, 1 year of Master's", 'M.S. Psychology/ B.A. Sociology',
    'B.S., MPH', '18.0', '19.0',
]
# Ongoing doctoral study is grouped with completed doctorates, consistent with the
# 'PhD Candidate' and 'Pursuing PhD ...' entries.
PhD = [
    'Ph.D.', 'PhD Candidate', 'PhD', 'BS. (PhD Student currently)', 'Masters; PhD ongoing',
    'Pursuing PhD at Vanderbilt. Have B.S. from UTK',
    'BA from Stanford, JD from Univ of Michigan', '21.0', '22.0', '20.0',
]

In [51]:
# Encode education level as four 0/1 indicator columns plus a readable 'edu' label.
#
# 'edu' is created with object dtype so it can hold both the integer 0 placeholder
# (rows whose Education value is in none of the lists) and the string labels.
# Creating it as an int64 column and then writing strings into it relies on implicit
# upcasting, which recent pandas versions deprecate.
exp['edu'] = pd.Series(0, index=exp.index, dtype=object)

# Levels are applied in this order; if a raw value appears in more than one list,
# every matching indicator is set and the later level's label overwrites 'edu'.
for dummy_col, label, raw_values in [
    ('edu_hs', 'high school', HS),
    ('edu_ba', 'undergrad', BA),
    ('edu_ma', 'master', MS),
    ('edu_phd', 'phd', PhD),
]:
    matches = exp['Education'].isin(raw_values)  # exact string match; NaN never matches
    exp[dummy_col] = matches.astype('int64')
    exp.loc[matches, 'edu'] = label

ii. Sex (drop rows with missing values)

In [52]:
# Keep only the rows that have a recorded Sex value.
# The explicit .copy() makes the filtered frame independent of the original, so the
# column assignments in the following cells do not raise SettingWithCopyWarning.
exp = exp[exp['Sex'].notna()].copy()
exp.shape

(3058, 223)

iii. Marital Status

In [53]:
# Indicator columns for marital status: each is 1 where Marital_Status equals the
# given label exactly and 0 otherwise (missing or unlisted values stay 0 everywhere).
marital_flags = {
    'is_married': 'Married',
    'is_divorced': 'Divorced',
    'is_single': 'Single',
    'is_widowed': 'Widowed',
    'is_with_partner': 'Living with partner',
}
for flag_col, status in marital_flags.items():
    exp.loc[:, flag_col] = 0
    exp.loc[exp['Marital_Status'] == status, flag_col] = 1

iv. Income

In [54]:
# Ordinal encoding of household income: bracket label -> integer rank.
# Matching is exact, so known spelling variants are listed explicitly.
income_brackets = {
    '$10,000-$19,999': 1,
    '$20,000-$29,999': 2,
    '$30,000-$39,999': 3,
    '$40,000-$49,999': 4,
    '$50,000-$59,999': 5,
    '$60,000-$69,999': 6,
    '$70,000-$79,999': 7,
    '$80,000-$89,999': 8,
    '$90,000-$99,999': 9,
    '$100,000-$109,999': 10,
    '$110,000-$119,999': 11,
    '$120,000-$129,999': 12,
    # The original label ends in ',000' unlike every other bracket; accept both
    # spellings so the bracket is not silently encoded as 0 if the data uses ',999'.
    '$130,000-$139,000': 13,
    '$130,000-$139,999': 13,
    '$140,000-$149,999': 14,
    # Top bracket occurs with and without a trailing space.
    '$150,000 or more ': 15,
    '$150,000 or more': 15,
}

# Any value not listed above, including missing values, is encoded as 0.
# NOTE(review): 0 presumably stands for the lowest bracket as well — confirm that
# unknown values should share that code.
exp['income'] = exp['Household_income'].map(income_brackets).fillna(0).astype('int64')

### 3. Check Missingness

In [55]:
# Fraction of missing values per column (mean of the boolean null mask).
missing_rate = exp.isna().mean()

In [56]:
# Names of the columns in which at most 30% of the rows are missing.
# NOTE(review): the variable name says "lt_half" but the cutoff applied is 0.3 — confirm which is intended.
missing_lt_half_col = missing_rate.loc[missing_rate.le(.3)].index.to_numpy()

In [57]:
# Restrict to the low-missingness columns, then discard every row that still has a
# missing value in any of them.
exp_clean = exp.loc[:, missing_lt_half_col].dropna(axis=0, how='any')

In [58]:
# Show the names of the columns that survived the missingness filter.
exp_clean.columns.to_numpy()

array(['index', 'Unnamed: 0', 'Unnamed: 0.1', 'subject', 'la_p', 'ha_p',
       'ha_n', 'la_n', 'la', 'p', 'n', 'ha', 'start_survey', 'survey_no',
       'experiment', 'DATE_x', 'Period_of_day', 'VALENCE', 'VALENCE_mean',
       'start_survey_5m_ahead', 'start_survey_10m_ahead',
       'start_survey_30m_ahead', 'start_survey_1h_ahead',
       'start_survey_3h_ahead', 'survey_date', 'subject_id_x', 'step_max',
       'step_min', 'step_median', 'steps_max_3h', 'steps_min_3h',
       'steps_mean_3h', 'steps_var_3h', 'steps_median_3h', 'move_rate_3h',
       'active_rate_3h', 'very_active_rate_3h', 'running_rate_3h',
       'steps_max_1h', 'steps_min_1h', 'steps_mean_1h', 'steps_var_1h',
       'steps_median_1h', 'move_rate_1h', 'active_rate_1h',
       'very_active_rate_1h', 'running_rate_1h', 'steps_max_30m',
       'steps_min_30m', 'steps_mean_30m', 'steps_var_30m',
       'steps_median_30m', 'move_rate_30m', 'active_rate_30m',
       'very_active_rate_30m', 'running_rate_30m', 'steps_m

In [59]:
# Remove the columns listed below from the cleaned table: index artefacts, identifiers,
# timestamps, and the raw categorical columns that were encoded above.
# Every name must be present in exp_clean; a missing one raises KeyError, so this
# cell cannot be re-run without re-running the cell that builds exp_clean.
exp_clean = exp_clean.drop(
    columns=[
        'index',
        'Unnamed: 0',
        'Unnamed: 0.1',
        'start_survey',
        'survey_no',
        'DATE_x',
        'Period_of_day',
        'VALENCE',
        'VALENCE_mean',
        'Subject_y',
        'Experiment_y',
        'Education',
        'Ethnicity',
        'Sex',
        'Marital_Status',
        'edu',
        'Household_income',
        'Subject_x',
        'subject_id_x',
        'subject_id_y',
        'survey_date',
        'start_survey_10m_ahead',
        'start_survey_1h_ahead',
        'start_survey_30m_ahead',
        'start_survey_3h_ahead',
        'start_survey_5m_ahead',
        'experiment',
    ]
)

### 4. Export

In [60]:
# Renumber the rows from 0 after the row drops, discarding the old index values.
exp_clean = exp_clean.reset_index(drop=True)
# The row index is written as the first, unnamed CSV column (pandas default, made
# explicit here); it comes back as 'Unnamed: 0' when the file is read without index_col.
exp_clean.to_csv('./data/processed/exp.csv', index=True)