In [None]:
from imblearn.over_sampling import SMOTE
import itertools 
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
# from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import FeatureHasher
from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import RandomizedSearchCV
# from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelBinarizer, MinMaxScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
import seaborn as sns


sns.set(style='darkgrid')
% matplotlib inline

  import pandas.util.testing as tm


## Preprocessing

### Basic cleaning

In [None]:
# finding null values
def nullFinder(df, capitalizedDatasetName):
    """Render a styled table with the number of missing values per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    capitalizedDatasetName : str
        Text interpolated into the table caption (``'<name> DATA'``).

    Returns
    -------
    None
        The table is shown through ``display``; nothing is returned.
    """
    # one row per column of `df`, holding its count of missing cells
    missing_per_column = df.isna().sum().to_frame(name='# missing values')

    caption_css = {'selector': 'caption',
                   'props': [('color', 'black'),
                             ('font-size', '16px')]}
    styled = missing_per_column.style.set_caption('{} DATA'.format(capitalizedDatasetName))
    display(styled.set_table_styles([caption_css]))

In [None]:
# employment_industry and employment_occupation have several missing values, probably because that information isn't
# relevant to people whose employment_status indicates that they're unemployed - MISSING NOT AT RANDOM
# also drop health insurance, because half of its values are missing
def organize(df):
    """Fill structurally-missing employment fields and drop ``health_insurance``.

    Rows whose ``employment_status`` is ``'Unemployed'`` or
    ``'Not in Labor Force'`` get the literal string ``'NA'`` in
    ``employment_industry`` and ``employment_occupation``.  The
    ``health_insurance`` column is then removed.

    The frame is modified in place and nothing is returned.  The function is
    safe to call more than once on the same frame: a second call finds no
    ``health_insurance`` column and simply skips the drop.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``employment_status``.
    """
    not_working = df['employment_status'].isin(['Unemployed', 'Not in Labor Force'])
    for employment_col in ('employment_industry', 'employment_occupation'):
        df.loc[not_working, employment_col] = 'NA'
    # errors='ignore' keeps the cell re-runnable after the column is already gone
    df.drop(columns=['health_insurance'], errors='ignore', inplace=True)

### Column engineering
| feature | new level | new level | new level |
| ------- | ------ | ------ | ------| 
| h1n1_concern | [(0,1) to become] 0 = less concern <br> | [(2,3) to become] 1 = more concern<br> | NONE | 
| household_adults (will be renamed: lives_alone) | [(1,2,3) to become] 0 = does not live alone | [(0) to become] 1 = lives alone | NONE | 
| household_children (will be renamed: have_children) | [(0) to become] 0 = no children | [(1,2,3) to become] 1 = children | NONE | 
| employment_status | [(['Not in Labor Force', 'Unemployed']) to become] 0 = not working | [(['Employed']) to become] 1 = working | NONE |
| education (will be renamed: college) | [(['< 12 Years', '12 Years']) to become] 0 = no college | [(['Some College', 'College Graduate']) to become] 1 = some college or more | NONE |
| opinion_h1n1_vacc_effective | [(1,2) to become] 0 = not effective | [(3) to become] 1 = don't know | [(4,5) to become] 2 = effective | 
| opinion_seas_vacc_effective | [(1,2) to become] 0 = not effective | [(3) to become] 1 = don't know | [(4,5) to become] 2 = effective | 
|opinion_h1n1_sick_from_vacc | DROP 3 (DON'T KNOW) - too small, creates noise | [(1,2) to become] 0 = not worried | [(4,5) to become] 1 = worried |
| opinion_seas_sick_from_vacc  | DROP 3 (DON'T KNOW) - too small, creates noise | [(1,2) to become] 0 = not worried | [(4,5) to become] 1 = worried |
| opinion_h1n1_risk | DROP 3 (DON'T KNOW) - too small, creates noise | [(1,2) to become] 0 = low | [(4,5) to become] 1 = high |
| opinion_seas_risk | DROP 3 (DON'T KNOW) - too small, creates noise | [(1,2) to become] 0 = low | [(4,5) to become] 1 = high |

In [None]:
## HELPER FUNCTIONS FOR TRANSFORMING COLUMNS (PER THE SPECIFICATIONS IN THE TABLE ABOVE)
# rename columns
def rename_col(df, old_name, new_name):
  """Rename column ``old_name`` of ``df`` to ``new_name`` in place; returns ``None``."""
  column_mapping = {old_name: new_name}
  df.rename(columns=column_mapping, inplace=True)

# opinion_XXX_vacc_effective
def opinion_vaccEffective_transformer(df, h1n1_or_szn):
  """Collapse ``opinion_<h1n1_or_szn>_vacc_effective`` from 5 levels to 3, in place.

  Recoding: 1, 2 -> 0 (not effective); 3 -> 1 (don't know); 4, 5 -> 2 (effective).
  Any other value (including NaN) is left untouched.

  Rows are selected with boolean masks, so the result does not depend on the
  frame's index labels.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame holding the opinion column; modified in place.
  h1n1_or_szn : str
      Middle part of the column name (the notebook calls this with ``'h1n1'``
      and ``'seas'``).

  Returns
  -------
  None
  """
  colname = 'opinion_' + h1n1_or_szn + '_vacc_effective'
  ratings = df[colname]
  # every mask is taken from the ORIGINAL ratings before anything is written,
  # so a freshly written 1.0 can never be re-matched as an old "1"
  recodes = [
      (ratings.isin([1.0, 2.0]).to_numpy(), 0.0),
      (ratings.eq(3.0).to_numpy(), 1.0),
      (ratings.isin([4.0, 5.0]).to_numpy(), 2.0),
  ]
  for mask, new_level in recodes:
    df.loc[mask, colname] = new_level

# opinion_XXX_sick_from_vacc and opinion_XXX_risk
def opinion_XXX_transformer(df, h1n1_or_szn, vacc_or_risk):
  """Binarise a ``sick_from_vacc`` or ``risk`` opinion column, in place.

  Rows whose answer is 3 ("don't know") are dropped from ``df``; the index is
  then renumbered 0..n-1.  In the remaining rows 1, 2 -> 0 and 4, 5 -> 1.
  Any other value (including NaN) is left untouched.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame holding the opinion column; rows are removed and the index is
      reset in place.
  h1n1_or_szn : str
      Middle part of the column name (the notebook calls this with ``'h1n1'``
      and ``'seas'``).
  vacc_or_risk : {'vacc', 'risk'}
      ``'vacc'`` targets ``opinion_<x>_sick_from_vacc``; ``'risk'`` targets
      ``opinion_<x>_risk``.

  Returns
  -------
  None

  Raises
  ------
  ValueError
      If ``vacc_or_risk`` is neither ``'vacc'`` nor ``'risk'``.
  """
  if vacc_or_risk == 'vacc':
    colname = 'opinion_' + h1n1_or_szn + '_sick_from_vacc'
  elif vacc_or_risk == 'risk':
    colname = 'opinion_' + h1n1_or_szn + '_risk'
  else:
    raise ValueError("vacc_or_risk must be 'vacc' or 'risk', got {!r}".format(vacc_or_risk))

  # drop 3 ('don't know')
  dont_know = df[colname].eq(3.0).to_numpy()
  # renumber first so that row labels equal row positions; the drop below then
  # removes exactly the masked rows whatever index the caller passed in
  df.reset_index(drop=True, inplace=True)
  df.drop(index=df.index[dont_know], inplace=True)
  df.reset_index(drop=True, inplace=True)

  # both masks are taken before writing so recoded values are not re-matched
  answers = df[colname]
  to_zero = answers.isin([1.0, 2.0]).to_numpy()
  to_one = answers.isin([4.0, 5.0]).to_numpy()
  df.loc[to_zero, colname] = 0.0
  df.loc[to_one, colname] = 1.0

In [None]:
## AGGREGATING ALL HELPER FUNCTIONS INTO ONE AND IMPLEMENTING THEM
## TRANSFORMING THE REST OF THE COLUMNS 

def _recode_exact(df, colname, mapping):
  """Overwrite, in place, each cell of ``df[colname]`` equal to a key of ``mapping`` with its value."""
  original = df[colname]
  # masks come from the original values before any write, so the recodings
  # cannot cascade (e.g. 2 -> 1 followed by 1 -> 0)
  planned = [(original.eq(old).to_numpy(), new) for old, new in mapping.items()]
  for mask, new in planned:
    df.loc[mask, colname] = new


def columnTransformer(df):
  """Apply all column re-levelling steps to ``df`` in place and return it.

  Steps, in order:

  * ``h1n1_concern``: 1 -> 0; 2, 3 -> 1 (0 stays 0).
  * ``household_adults`` is renamed ``lives_alone``: 1, 2, 3 -> 0; 0 -> 1.
  * ``household_children`` is renamed ``have_children``: 2, 3 -> 1.
  * ``employment_status``: 'Not in Labor Force', 'Unemployed' -> 0.0;
    'Employed' -> 1.0.
  * ``education``: '< 12 Years', '12 Years' -> 0.0;
    'Some College', 'College Graduate' -> 1.0.
  * ``opinion_*_vacc_effective`` via ``opinion_vaccEffective_transformer``.
  * ``opinion_*_sick_from_vacc`` and ``opinion_*_risk`` via
    ``opinion_XXX_transformer``, which also drops rows and resets the index.

  Values not listed above (including NaN) are left untouched.  Rows are
  selected with boolean masks, so the recoding does not depend on the
  frame's index labels.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame with the columns named above; modified in place.

  Returns
  -------
  pandas.DataFrame
      The same ``df`` object.
  """
  # h1n1_concern
  _recode_exact(df, 'h1n1_concern', {1.0: 0.0, 2.0: 1.0, 3.0: 1.0})

  # household_adults
  rename_col(df, 'household_adults', 'lives_alone')
  _recode_exact(df, 'lives_alone', {1.0: 0.0, 2.0: 0.0, 3.0: 0.0, 0.0: 1.0})

  # household_children
  rename_col(df, 'household_children', 'have_children')
  _recode_exact(df, 'have_children', {2.0: 1.0, 3.0: 1.0})

  # employment_status
  _recode_exact(df, 'employment_status',
                {'Not in Labor Force': 0.0, 'Unemployed': 0.0, 'Employed': 1.0})

  # education
  _recode_exact(df, 'education',
                {'< 12 Years': 0.0, '12 Years': 0.0,
                 'Some College': 1.0, 'College Graduate': 1.0})

  opinion_vaccEffective_transformer(df, 'h1n1')
  opinion_vaccEffective_transformer(df, 'seas')
  opinion_XXX_transformer(df, 'h1n1', 'vacc')
  opinion_XXX_transformer(df, 'seas', 'vacc')
  opinion_XXX_transformer(df, 'h1n1', 'risk')
  opinion_XXX_transformer(df, 'seas', 'risk')

  return df

### Encoding features
| Feature type | Feature description | Associated features | Action |
| ------------ | ------------------- | ------------------- | ------ |
| binary | 2 levels [0,1] - <br> described as no/yes, low/high, etc | - 'h1n1_concern' <br> - 'behavioral_avoidance' <br> - 'behavioral_wash_hands' <br> - 'behavioral_large_gatherings' <br>- 'behavioral_outside_home' <br> - 'behavioral_touch_face' <br> - 'doctor_recc_h1n1' <br> - 'doctor_recc_seasonal' <br> - 'chronic_med_condition' <br> - 'health_worker' <br>- 'opinion_h1n1_risk' <br>- 'opinion_h1n1_sick_from_vacc' <br>- 'opinion_seas_risk' <br>- 'opinion_seas_sick_from_vacc' <br> - employment_status <br>- 'lives_alone' <br> - 'have_children' <br>- 'education' <br> - 'behavioral_antiviral_meds' <br> - 'behavioral_face_mask' <br> - 'child_under_6_months' | No action - 2 levels each (0,1) |
| multiple | >2 levels [0,1,2,n...] | - 'opinion_h1n1_vacc_effective' <br>- 'opinion_seas_vacc_effective' <br> - 'h1n1_knowledge' | No action - 3 levels each | 
| text | levels are strings | - 'age_group' <br>- 'sex'<br>- 'income_poverty'<br>- 'rent_or_own'<br>- 'hhs_geo_region'<br>- 'census_msa'<br>- 'race'<br>- 'employment_industry'<br>- 'employment_occupation' <br>- marital_status | - census_msa and income_poverty to be manually ordinally encoded <br> - age_group to be ordinally encoded <br> - hhs_geo_region, race, employment_industry, employment_occupation <br>&nbsp; to be feature hash encoded OR binary encoded <br> - sex, marital_status, and rent_or_own to be one hot encoded (drop_first=True) |


In [None]:
# label encoding low cardinality (non-binary) text columns

def manualEncoder(df):
  """Ordinally encode ``income_poverty``, ``census_msa`` and ``age_group`` in place.

  ``income_poverty`` and ``census_msa`` use hand-written level -> integer
  mappings because their labels do not sort into their natural order.
  ``age_group`` is encoded with ``pd.factorize(..., sort=True)``, i.e. codes
  follow the sorted order of the labels (missing values become -1).  The
  index is reset to 0..n-1 afterwards.

  Values absent from the mappings (including NaN) are left as they are.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame containing the three columns; modified in place.

  Returns
  -------
  None
  """
  income_levels = {'Below Poverty': 0,
                   '<= $75,000, Above Poverty': 1,
                   '> $75,000': 2}
  # the doubled space in 'Not Principle  City' is deliberate: it has to match
  # the label exactly as it is spelled in the data
  msa_levels = {'Non-MSA': 0,
                'MSA, Not Principle  City': 1,
                'MSA, Principle City': 2}

  # assign the result back instead of `df[col].replace(..., inplace=True)`:
  # the in-place form mutates a temporary column object and stops updating
  # `df` under pandas Copy-on-Write
  df['income_poverty'] = df['income_poverty'].replace(income_levels)
  df['census_msa'] = df['census_msa'].replace(msa_levels)

  # ordinal encoding - age_group
  df['age_group'], _ = pd.factorize(df['age_group'], sort=True)
  df.reset_index(drop=True, inplace=True)

In [None]:
# binary encoding high cardinality text columns 
def binaryEncoder(train_df, test_df, binaryEncode_test=True):
  """Label-binarize the high-cardinality text columns of train (and test).

  For each of ``hhs_geo_region``, ``race``, ``employment_industry`` and
  ``employment_occupation`` a ``LabelBinarizer`` is fitted on the training
  column; the original column is dropped and the indicator columns are
  appended.  The test frame is encoded with the binarizer fitted on train,
  so both frames get the same indicator columns in the same order.

  Column names are built from ``lb.classes_`` (the order in which
  ``LabelBinarizer`` emits its columns) rather than from ``unique()``, whose
  order-of-appearance does not match the encoder's output order.

  Parameters
  ----------
  train_df, test_df : pandas.DataFrame
      Frames containing the four columns.  NOTE: the encoded columns are
      dropped from the frames passed in (in place); the returned frames are
      new objects.
  binaryEncode_test : bool, default True
      When false, ``test_df`` is returned as an unencoded copy.

  Returns
  -------
  (pandas.DataFrame, pandas.DataFrame)
      Encoded train frame and (optionally encoded) test frame.
  """
  prefixes = {'hhs_geo_region': 'region_',
              'race': 'race_',
              'employment_industry': 'industry_',
              'employment_occupation': 'occup_'}
  train_parts = []
  test_parts = []

  for feat, prefix in prefixes.items():
    lb = LabelBinarizer()

    # train set
    train_encoded = lb.fit_transform(train_df[feat])
    classes = list(lb.classes_)
    n_cols = train_encoded.shape[1]
    # with exactly two classes LabelBinarizer emits ONE column, the indicator
    # of the second class; otherwise one column per class, in classes_ order
    labelled = classes if n_cols == len(classes) else classes[-n_cols:]
    names = [prefix + str(label) for label in labelled]
    # reuse the source index so the axis=1 concat below aligns row by row
    train_parts.append(pd.DataFrame(train_encoded, columns=names, index=train_df.index))
    train_df.drop(feat, axis=1, inplace=True)

    if binaryEncode_test:
      # test set: same fitted binarizer, same column names as train
      test_encoded = lb.transform(test_df[feat])
      test_parts.append(pd.DataFrame(test_encoded, columns=names, index=test_df.index))
      test_df.drop([feat], axis=1, inplace=True)

  train_df = pd.concat([train_df] + train_parts, axis=1)
  test_df = pd.concat([test_df] + test_parts, axis=1)
  return train_df, test_df

In [None]:
# feature hash encoding high cardinality text columns 

def _as_single_token_samples(column):
  """Wrap each value of ``column`` as a one-element list of ``str`` (one token per sample)."""
  return [[str(value)] for value in column]


def featureHashEncoder(train_df, test_df, featurehashEncode_test):
  """Feature-hash the high-cardinality text columns of train (and test).

  For each of ``hhs_geo_region``, ``race``, ``employment_industry`` and
  ``employment_occupation`` a ``FeatureHasher`` with as many output buckets
  as the training column has distinct values is applied; the original
  column is dropped and the hashed columns are appended.

  Each cell is hashed as ONE token.  ``FeatureHasher(input_type='string')``
  expects every sample to be an iterable of strings, so handing it the bare
  column would make it treat each string as a sequence of characters (or be
  rejected outright, depending on the scikit-learn version).

  NOTE(review): the output column names are derived from the training
  column's distinct values to keep the existing naming, but a hash bucket
  does not correspond to the category in its name - several categories can
  share a bucket.  The test frame reuses the train names so both frames have
  the same schema.

  Parameters
  ----------
  train_df, test_df : pandas.DataFrame
      Frames containing the four columns.  NOTE: the encoded columns are
      dropped from the frames passed in (in place); the returned frames are
      new objects.
  featurehashEncode_test : bool
      When false, ``test_df`` is returned as an unencoded copy.

  Returns
  -------
  (pandas.DataFrame, pandas.DataFrame)
      Encoded train frame and (optionally encoded) test frame.
  """
  prefixes = {'hhs_geo_region': 'region_',
              'race': 'race_',
              'employment_industry': 'industry_',
              'employment_occupation': 'occup_'}
  train_parts = []
  test_parts = []

  for feat, prefix in prefixes.items():
    train_levels = list(train_df[feat].unique())
    names = [prefix + str(level) for level in train_levels]
    fh = FeatureHasher(n_features=len(train_levels), input_type='string')

    # train set
    train_hashed = fh.fit_transform(_as_single_token_samples(train_df[feat]))
    # reuse the source index so the axis=1 concat below aligns row by row
    train_parts.append(pd.DataFrame(train_hashed.toarray(), columns=names, index=train_df.index))
    train_df.drop(feat, axis=1, inplace=True)

    if featurehashEncode_test:
      # test set: same hasher, same bucket count, same column names as train
      test_hashed = fh.transform(_as_single_token_samples(test_df[feat]))
      test_parts.append(pd.DataFrame(test_hashed.toarray(), columns=names, index=test_df.index))
      test_df.drop([feat], axis=1, inplace=True)

  train_df = pd.concat([train_df] + train_parts, axis=1)
  test_df = pd.concat([test_df] + test_parts, axis=1)
  return train_df, test_df

In [None]:
# one hot encoding binary text columns

def onehotEncoder(df):
  """Replace ``sex``, ``rent_or_own`` and ``marital_status`` with a dummy indicator.

  Each column is passed through ``pd.get_dummies(..., drop_first=True)`` and
  the result is written back over the original column, in place.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame containing the three columns; modified in place.

  Returns
  -------
  pandas.DataFrame
      The same ``df`` object.
  """
  # (column, dummy prefix); the trailing notes record which level the original
  # author reported as the surviving indicator - presumably data dependent
  dummy_specs = (
      ('sex', 'sex'),                   # MALE
      ('rent_or_own', 'rentVown'),      # RENT
      ('marital_status', 'married'),    # NOT MARRIED
  )
  for column, prefix in dummy_specs:
    df[column] = pd.get_dummies(df[column], prefix=prefix, prefix_sep='_', drop_first=True)
  return df

Aggregate the above encoding functions into one.

In [None]:
# AGGREGATOR
def encoder(train_df, test_df, binary_or_featurehash, encode_test=True):
  """Run every encoding step on the train and test frames.

  Order: ``manualEncoder`` on both frames, then either ``binaryEncoder``
  (``'binary'``) or ``featureHashEncoder`` (``'featurehash'``), then
  ``onehotEncoder`` on both frames.  Any other ``binary_or_featurehash``
  value skips the high-cardinality step.

  Parameters
  ----------
  train_df, test_df : pandas.DataFrame
      Frames to encode.
  binary_or_featurehash : str
      Selects the high-cardinality encoder (see above).
  encode_test : bool, default True
      Forwarded to the selected high-cardinality encoder.

  Returns
  -------
  (pandas.DataFrame, pandas.DataFrame)
      The encoded train and test frames.
  """
  for frame in (train_df, test_df):
    manualEncoder(frame)

  if binary_or_featurehash == 'binary':
    train_df, test_df = binaryEncoder(train_df, test_df, encode_test)
  elif binary_or_featurehash == 'featurehash':
    train_df, test_df = featureHashEncoder(train_df, test_df, encode_test)

  # applied to whichever frames the step above produced
  for frame in (train_df, test_df):
    onehotEncoder(frame)
  return train_df, test_df

## Feature selection

#### Feature selection using VIF

In [None]:
# remove multicollinear features - not necessary for boosted trees, but good practice to do so 
def checkVIF(df):
  """Return the variance inflation factor of every column of ``df``.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame whose ``.values`` are passed to ``variance_inflation_factor``.

  Returns
  -------
  pandas.DataFrame
      Two columns: ``"VIF Factor"`` and ``"features"`` (the column labels),
      one row per column of ``df``.
  """
  design = df.values
  factors = [variance_inflation_factor(design, position)
             for position in range(df.shape[1])]
  return pd.DataFrame({"VIF Factor": factors, "features": df.columns})

#### Feature selection using correlations

##### Correlations using Cramer's V

In [None]:
def _bias_corrected_cramers_v(contingencyTbl):
    """Bias-corrected Cramer's V of one contingency table; 0.0 when it is undefined."""
    n = contingencyTbl.sum().sum()
    r, k = contingencyTbl.shape
    # a constant variable (single row/column) or a table with fewer than two
    # observations makes the statistic 0/0; report "no association" instead
    if n <= 1 or min(r, k) < 2:
        return 0.0

    chi2 = stats.chi2_contingency(contingencyTbl)[0]
    phi2 = chi2/n
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)
    denominator = min((kcorr-1), (rcorr-1))
    if denominator <= 0:
        # happens when a variable has at least as many levels as there are rows
        return 0.0
    return float(np.sqrt(phi2corr / denominator))


def cramersV(df):
    """Pairwise bias-corrected Cramer's V for every pair of columns of ``df``.

    Uses the bias correction from Wicher Bergsma, "A bias-correction for
    Cramer's V and Tschuprow's T", Journal of the Korean Statistical Society
    42 (2013): 323-328.

    Each pair (including a column with itself) is cross-tabulated with
    ``pd.crosstab`` and the chi-square statistic is taken from
    ``scipy.stats.chi2_contingency``.  Pairs for which the statistic is
    undefined (a constant column, or too few observations) are reported as
    0.0 rather than NaN.

    NOTE(review): ``chi2_contingency`` is called with its defaults; scipy
    documents a default Yates continuity correction for 2x2 tables, which
    would slightly lower the value for pairs of binary columns - confirm
    that is intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Categorical columns to compare.

    Returns
    -------
    pandas.DataFrame
        Symmetric matrix indexed and labelled by ``df.columns``; entries are
        rounded to 2 decimals.
    """
    cols = df.columns
    mat = pd.DataFrame(np.zeros((len(cols), len(cols))), index=cols, columns=cols)

    for first, second in itertools.combinations_with_replacement(cols, 2):
        contingencyTbl = pd.crosstab(df[first], df[second])
        value = round(_bias_corrected_cramers_v(contingencyTbl), 2)
        # the measure is symmetric: fill both triangles
        mat.loc[first, second] = value
        mat.loc[second, first] = value
    return mat

In [None]:
# plotting the correlations
def corrPlot(corrMat, name):
    """Draw an annotated heatmap of a correlation matrix.

    Parameters
    ----------
    corrMat : pandas.DataFrame
        Matrix passed to ``sns.heatmap``.
    name : str
        Upper-cased and used in the figure title (``'<NAME> CORRELATIONS'``).

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.  Note that the seaborn font
        scale is set to 2 as a side effect.
    """
    sns.set(font_scale=2)
    fig, ax = plt.subplots(figsize=(35,39))
    cell_text_style = {"size": 20}
    heatmap = sns.heatmap(
        corrMat,
        annot=True,
        annot_kws=cell_text_style,
        linewidths=0.75,
        cmap='Blues',
        cbar=False,
        ax=ax,
    )
    title_text = '{} CORRELATIONS'.format(name.upper())
    heatmap.set_title(title_text, fontsize=48)
    plt.show()

##### Correlation Analysis
Returns pairs of highly correlated features and, for each pair, chooses the feature to drop based on its mean correlation with the other variables.

In [None]:
## HELPER FUNCTIONS FOR ANALYZING THE CORRELATIONS

# find highly correlated features (indices and features)
def find_corrFeatures(corrMat, corrThresh):
  """Locate every off-diagonal pair whose correlation exceeds ``corrThresh``.

  Each unordered pair is reported once, in the order it is first met when
  the matrix is scanned row by row, with the smaller position first.

  Parameters
  ----------
  corrMat : pandas.DataFrame
      Square correlation matrix; only ``corrMat.values`` is read.
  corrThresh : float
      Strict lower bound (``value > corrThresh``).

  Returns
  -------
  (list of int, list of int)
      ``f1[i]`` and ``f2[i]`` are the integer positions of the two members
      of the i-th pair.
  """
  values = np.asarray(corrMat.values)
  seen_pairs = set()
  f1 = []
  f2 = []
  # np.argwhere walks the matrix row-major, i.e. in nested-loop order
  for row_idx, col_idx in np.argwhere(values > corrThresh):
    if row_idx == col_idx:
      continue
    # (min, max) is a canonical key; list(set([r, c])) is not, because set
    # iteration order can depend on insertion order (e.g. for 0 and 8)
    pair = (int(min(row_idx, col_idx)), int(max(row_idx, col_idx)))
    if pair in seen_pairs:
      continue
    seen_pairs.add(pair)
    f1.append(pair[0])
    f2.append(pair[1])
  return f1, f2

# find correlations for highly correlated features
def find_corrs(corrMat, f1, f2):
  """Look up the names and correlation of each highly correlated pair.

  Parameters
  ----------
  corrMat : pandas.DataFrame
      Correlation matrix; ``f1`` positions index its rows, ``f2`` positions
      index its columns.
  f1, f2 : list of int
      Positions of the first and second member of each pair.

  Returns
  -------
  problems_df : pandas.DataFrame
      Columns ``feature1``, ``feature2``, ``cramerv``; one row per pair.
  bad_corrs : list
      The ``cramerv`` values as a plain list.
  """
  row_labels = list(corrMat.index)
  column_labels = list(corrMat.columns)

  # translate positions into feature names, one column per pair member
  first_names = pd.Series([row_labels[position] for position in f1])
  second_names = pd.Series([column_labels[position] for position in f2])
  corr_feats = pd.concat([first_names, second_names], axis=1)

  # correlation of every (feature1, feature2) pair
  bad_corrs = []
  for left, right in corr_feats.itertuples(index=False, name=None):
    bad_corrs.append(corrMat.loc[left, right])

  problems_df = pd.concat([corr_feats, pd.Series(bad_corrs)], axis=1)
  problems_df.columns = ['feature1', 'feature2', 'cramerv']
  return problems_df, bad_corrs

# Find mean correlation of each of the highly correlated features with other variables (each row)
def find_meanCorrs(corrMat, problems_df, bad_corrs):
  """Mean correlation of each pair member with all other (non-target) features.

  The target columns ``h1n1_vaccine`` and ``seasonal_vaccine`` are removed
  from ``corrMat`` first.  Each label is dropped independently if it is
  present, so a matrix holding only one of the two is handled correctly.

  NOTE: ``corrMat`` is modified in place (the target rows/columns are
  removed from the caller's matrix).

  Parameters
  ----------
  corrMat : pandas.DataFrame
      Square correlation matrix labelled by feature name.
  problems_df : pandas.DataFrame
      Must have ``feature1`` and ``feature2`` columns naming the pairs.
  bad_corrs : list
      Unused; kept so existing callers do not break.

  Returns
  -------
  list of (float, (float, float))
      For each pair: ``(max(mean1, mean2), (mean1, mean2))`` where ``mean1``
      is the mean of feature1's row and ``mean2`` the mean of feature2's
      column, each excluding the feature's own diagonal entry and rounded to
      2 decimals.
  """
  target_labels = ['h1n1_vaccine', 'seasonal_vaccine']
  # errors='ignore' replaces the previous try/except KeyError: the old
  # all-or-nothing drop silently kept a target when only one was present
  corrMat.drop(index=target_labels, columns=target_labels, errors='ignore', inplace=True)

  meancorrs1 = [round(np.mean(corrMat.loc[l,:].drop(l)),2) for l in list(problems_df['feature1'])]
  meancorrs2 = [round(np.mean(corrMat.loc[:,l].drop(l)),2) for l in list(problems_df['feature2'])]

  return [(max(pair), pair) for pair in zip(meancorrs1, meancorrs2)]

# Return the features in each row with the highest mean correlation with other variables
def find_highCorrFeatures(zipped_meancorrs, problems_df):
  """Pick, for each pair, the member(s) whose mean correlation is the largest.

  Parameters
  ----------
  zipped_meancorrs : list of (float, (float, float))
      Per pair: the maximum mean correlation and the two individual means.
  problems_df : pandas.DataFrame
      Pair table; columns 0 and 1 hold the two feature names of each pair.

  Returns
  -------
  pandas.DataFrame
      Columns ``row`` (pair position) and ``feature_name``.  A pair whose two
      means tie contributes two rows.
  """
  flagged = []
  for pair_pos, entry in enumerate(zipped_meancorrs):
    biggest = entry[0]
    for member_pos, member_mean in enumerate(list(entry[1])):
      if member_mean == biggest:
        flagged.append((pair_pos, member_pos))

  # resolve (pair, member) positions into feature names
  records = [[pair_pos, problems_df.iloc[pair_pos, member_pos]]
             for pair_pos, member_pos in flagged]
  return pd.DataFrame(records, columns = ['row', 'feature_name'])

In [None]:
## AGGREGATING INFO GATHERED USING HELPER FUNCTIONS AND RETURN NEATLY
def organize_corrAnalysis(corr_TOREMOVE, problems_df):
  """Summarise, per correlated pair, which feature(s) should be dropped.

  Parameters
  ----------
  corr_TOREMOVE : pandas.DataFrame
      Columns ``row`` (pair position) and ``feature_name``.
  problems_df : pandas.DataFrame
      Pair table: the first two columns are the feature names, the last
      column is the pair's Cramer's V.

  Returns
  -------
  corrAnalysis_df : pandas.DataFrame
      Columns ``compared_feature1``, ``compared_feature2``,
      ``feature_to_drop`` (a feature name, ``'both'`` on a tie, or missing
      when no candidate was flagged for the pair) and
      ``cramersV_for_featuretoDrop``.
  features_toDrop : list
      De-duplicated features to drop, in order of first appearance (so the
      result is the same on every run).
  """
  # creating the dataframe: exactly one verdict per pair, so the verdict
  # column always stays aligned with the pairs
  feature_todrop = []
  for idx in range(len(problems_df)):
    candidates = corr_TOREMOVE.loc[corr_TOREMOVE['row']==idx, 'feature_name'].tolist()
    if len(candidates) == 2:
      feature_todrop.append('both')
    elif len(candidates) == 1:
      feature_todrop.append(candidates[0])
    else:
      # nothing flagged for this pair: keep the slot so later verdicts do
      # not shift up onto the wrong pair
      feature_todrop.append(None)

  verdicts = pd.Series(feature_todrop, index=problems_df.index, dtype=object)
  corrAnalysis_df = pd.concat([problems_df.iloc[:,:2], verdicts, problems_df.iloc[:,-1]], axis=1)
  corrAnalysis_df.columns = ['compared_feature1', 'compared_feature2', 'feature_to_drop', 'cramersV_for_featuretoDrop']

  # aggregate final list of features to be dropped
  collected = []
  for first, second, verdict in zip(corrAnalysis_df['compared_feature1'],
                                    corrAnalysis_df['compared_feature2'],
                                    corrAnalysis_df['feature_to_drop']):
    if pd.isna(verdict):
      continue
    if verdict == 'both':
      collected.extend([first, second])
    else:
      collected.append(verdict)
  # dict.fromkeys de-duplicates while keeping first-seen order; list(set(...))
  # would order the names differently from one kernel to the next
  features_toDrop = list(dict.fromkeys(collected))
  return corrAnalysis_df, features_toDrop

# AGGREGATING EVERYTHING UP THERE INTO ONE FUNCTION 
def corrAnalysis(corrMat, corrThresh):
  """Run the full correlation analysis on a correlation matrix.

  Chains ``find_corrFeatures`` -> ``find_corrs`` -> ``find_meanCorrs`` ->
  ``find_highCorrFeatures`` -> ``organize_corrAnalysis``.

  Parameters
  ----------
  corrMat : pandas.DataFrame
      Square correlation matrix labelled by feature name.
  corrThresh : float
      Pairs with a correlation above this value are analysed.

  Returns
  -------
  (pandas.DataFrame, list)
      The per-pair analysis table and the list of features to drop, exactly
      as returned by ``organize_corrAnalysis``.
  """
  first_positions, second_positions = find_corrFeatures(corrMat, corrThresh)
  pair_table, pair_corrs = find_corrs(corrMat, first_positions, second_positions)
  mean_corrs = find_meanCorrs(corrMat, pair_table, pair_corrs)
  drop_candidates = find_highCorrFeatures(mean_corrs, pair_table)
  return organize_corrAnalysis(drop_candidates, pair_table)

#### Remove unbalanced columns

In [None]:
# style function
def color_lowSamples(val, threshold=15):
    """
    Style function: returns the css property `'color: red'` when the
    scalar `val` is less than or equal to `threshold`, and
    `'color: black'` otherwise.

    `threshold` defaults to 15, the cut-off used throughout this notebook.
    """
    color = 'red' if val <= threshold else 'black'
    return 'color: %s' % color

In [None]:
# value counts for each level of each category 
def colVals(df, plots=1):
  """Percentage of rows at each level of every column, optionally with count plots.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame whose columns are summarised.
  plots : {0, 1}, default 1
      1 draws one count plot per column; 0 draws nothing.  Any other value
      prints ``'invalid plots value'`` once and draws nothing.

  Returns
  -------
  list of pandas.Series
      One Series per column, in column order: ``value_counts()`` divided by
      the column length (missing values count in the denominator), times
      100, rounded to 2 decimals.
  """
  sns.set(font_scale=1)

  if plots not in (0, 1):
    # reported once up front rather than once per column
    print('invalid plots value')

  percents = []
  for col in df.columns:
    level_counts = df[col].value_counts()
    percents.append((level_counts / len(df[col]) * 100).round(2))

    if plots == 1:
      # pass the vector by keyword: seaborn's first positional argument is
      # `data`, not `x`, in current releases
      sns.countplot(x=df[col])
      plt.xticks(rotation=90)
      plt.xlabel(col.replace('_', ' '))
      plt.ylabel('number of participants')

      plt.show()
  return percents

#### Feature importances using random forest

In [None]:
def scale(features, labels):
  """Min-max scale the feature columns; labels are passed through unchanged.

  Parameters
  ----------
  features : pandas.DataFrame
      Numeric feature matrix.
  labels : pandas.Series
      Target values.

  Returns
  -------
  features_transformed : pandas.DataFrame
      Output of ``MinMaxScaler().fit_transform(features)`` with the same
      column labels AND the same index as ``features``.
  labels_transformed : pandas.Series
      ``labels`` re-wrapped as a Series with its original name.
  """
  mm = MinMaxScaler()
  # keep the input index: labels keep theirs, so a fresh 0..n-1 index here
  # would misalign features and labels after e.g. a shuffled train/test split
  features_transformed = pd.DataFrame(mm.fit_transform(features),
                                      columns=features.columns,
                                      index=features.index)
  labels_transformed = pd.Series(labels, name = labels.name)
  return features_transformed, labels_transformed

def balance(features, labels):
  """Oversample with SMOTE (``random_state=0``) and return pandas objects.

  Parameters
  ----------
  features : pandas.DataFrame
      Feature matrix.
  labels : pandas.Series
      Target values.

  Returns
  -------
  (pandas.DataFrame, pandas.Series)
      The resampled features, carrying the column labels of ``features``,
      and the resampled labels, carrying the name of ``labels``.
  """
  feature_names = features.columns
  oversampler = SMOTE(random_state=0)
  resampled_features, resampled_labels = oversampler.fit_resample(features, labels)
  # re-wrap so the result is a labelled DataFrame / named Series
  balanced_features = pd.DataFrame(resampled_features, columns = feature_names)
  balanced_labels = pd.Series(resampled_labels, name = labels.name)
  return balanced_features, balanced_labels