# Data Cleaning

## Loading Packages

In [14]:
#Standard data analytical libraries
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import datetime as dt
import os, warnings, time, dmba
import scikitplot as skplt 

#Data Mining Book Libraries
from dmba import liftChart, gainsChart,regressionSummary, classificationSummary, exhaustive_search
from dmba import backward_elimination, forward_selection, stepwise_selection, adjusted_r2_score, AIC_score, BIC_score
from os.path import exists
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import f1_score, accuracy_score, roc_curve, auc, roc_auc_score, plot_confusion_matrix,confusion_matrix,r2_score
#Classification 
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import Perceptron, LogisticRegression,  LinearRegression, LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier, kneighbors_graph
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

import imblearn
from imblearn.over_sampling import SMOTE

# Used to save keystrokes when wanting to print something. Now we can just use
# p("Hello") instead of print("Hello")
p = print
# import csv
# import re

# Change this value if you are not using o_desktop
computer = 'o_desktop'
#computer = 'other'
if (computer == 'o_desktop'):
    os.environ['NUMEXPR_MAX_THREADS'] = '24'
else:
    # default is 4 or 8
    os.environ['NUMEXPR_MAX_THREADS'] = '8'

# For future use:
# import threading
# import multiprocessing

## Loading Data

In [2]:
# Setting directories and loading training set and training labels
repo_directory = r'C:/ADS_599_Final/'
data_folder_directory = r'C:/ADS_599_Final/Data_Folder/'
df_features_file = 'C:/ADS_599_Final/Data_Folder/training_set_features.csv'
df_labels_file = 'C:/ADS_599_Final/Data_Folder/training_set_labels.csv'
df = pd.read_csv(df_features_file)
df_labels = pd.read_csv(df_labels_file)

# Combining training data with training labels for modeling
df = df.join(df_labels.set_index('respondent_id'), on='respondent_id')

df.head(5)

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation,h1n1_vaccine,seasonal_vaccine
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,,0,0
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe,0,1
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo,0,0
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,,0,1
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb,0,0


## Handling Null Values

Replace NaNs with a new value. A person refusing to answer a question could be significant. Note: for imputations we will have to put back the NaNs.

In [3]:
# Renaming df
df_train = df

# Categories
df_train['h1n1_concern'] = df_train['h1n1_concern'].fillna(-1)
df_train['h1n1_knowledge'] = df_train['h1n1_knowledge'].fillna(-1)
df_train['behavioral_antiviral_meds'] = df_train['behavioral_antiviral_meds'].fillna(-1)
df_train['behavioral_avoidance'] = df_train['behavioral_avoidance'].fillna(-1)
df_train['behavioral_face_mask'] = df_train['behavioral_face_mask'].fillna(-1)
df_train['behavioral_large_gatherings'] = df_train['behavioral_large_gatherings'].fillna(-1)
df_train['behavioral_outside_home'] = df_train['behavioral_outside_home'].fillna(-1)
df_train['behavioral_wash_hands'] = df_train['behavioral_wash_hands'].fillna(-1)          
df_train['behavioral_touch_face'] = df_train['behavioral_touch_face'].fillna(-1)
df_train['doctor_recc_h1n1'] = df_train['doctor_recc_h1n1'].fillna(-1)
df_train['doctor_recc_seasonal'] = df_train['doctor_recc_seasonal'].fillna(-1)
df_train['chronic_med_condition'] = df_train['chronic_med_condition'].fillna(-1)
df_train['child_under_6_months'] = df_train['child_under_6_months'].fillna(-1)
df_train['health_worker'] = df_train['health_worker'].fillna(-1)
df_train['health_insurance'] = df_train['health_insurance'].fillna(-1)
df_train['opinion_h1n1_vacc_effective'] = df_train['opinion_h1n1_vacc_effective'].fillna(-1)
df_train['opinion_h1n1_sick_from_vacc'] = df_train['opinion_h1n1_sick_from_vacc'].fillna(-1)
df_train['opinion_h1n1_risk'] = df_train['opinion_h1n1_risk'].fillna(-1)
df_train['opinion_seas_vacc_effective'] = df_train['opinion_seas_vacc_effective'].fillna(-1)
df_train['opinion_seas_risk'] = df_train['opinion_seas_risk'].fillna(-1)
df_train['opinion_seas_sick_from_vacc'] = df_train['opinion_seas_sick_from_vacc'].fillna(-1)
df_train['household_adults'] = df_train['household_adults'].fillna(-1)
df_train['household_children'] = df_train['household_children'].fillna(-1)

# Numbers
df_train['age_group'] = df_train['age_group'].fillna("no_response")
df_train['education'] = df_train['education'].fillna("no_response")
df_train['race'] = df_train['race'].fillna("no_response")
df_train['income_poverty'] = df_train['income_poverty'].fillna("no_response")
df_train['marital_status'] = df_train['marital_status'].fillna("no_response")
df_train['rent_or_own'] = df_train['rent_or_own'].fillna("no_response")
df_train['employment_status'] = df_train['employment_status'].fillna("no_response")
df_train['employment_occupation'] = df_train['employment_occupation'].fillna("no_response")
df_train['employment_industry'] = df_train['employment_industry'].fillna("no_response")

Converting those categories to numbers. Category encoding known in the R world as as.factor-ing.

In [4]:
# Label encoding
df_train_label = df_train
    # Encode labels the below is equivalent to df_train['hhs_geo_region']= label_encoder.fit_transform(df_train['hhs_geo_region'])
df_train_label["hhs_geo_region"] = df_train["hhs_geo_region"].astype('category')
df_train_label["hhs_geo_region"] = df_train["hhs_geo_region"].cat.codes
df_train_label["census_msa"] = df_train["census_msa"].astype('category')
df_train_label["census_msa"] = df_train["census_msa"].cat.codes
df_train_label["employment_industry"] = df_train["employment_industry"].astype('category')
df_train_label["employment_industry"] = df_train["employment_industry"].cat.codes
df_train_label["employment_occupation"] = df_train["employment_occupation"].astype('category')
df_train_label["employment_occupation"] = df_train["employment_occupation"].cat.codes
df_train_label["employment_status"] = df_train["employment_status"].astype('category')
df_train_label["employment_status"] = df_train["employment_status"].cat.codes
df_train_label["rent_or_own"] = df_train["rent_or_own"].astype('category')
df_train_label["rent_or_own"] = df_train["rent_or_own"].cat.codes
df_train_label["marital_status"] = df_train["marital_status"].astype('category')
df_train_label["marital_status"] = df_train["marital_status"].cat.codes
df_train_label["income_poverty"] = df_train["income_poverty"].astype('category')
df_train_label["income_poverty"] = df_train["income_poverty"].cat.codes
df_train_label["race"] = df_train["race"].astype('category')
df_train_label["race"] = df_train["race"].cat.codes
df_train_label["education"] = df_train["education"].astype('category')
df_train_label["education"] = df_train["education"].cat.codes
df_train_label["age_group"] = df_train["age_group"].astype('category')
df_train_label["age_group"] = df_train["age_group"].cat.codes
df_train_label["sex"] = df_train["sex"].astype('category')
df_train_label["sex"] = df_train["sex"].cat.codes

p("After encoding the null counts per column are: ")
p(df_train.isnull().sum())

After encoding the null counts per column are: 
respondent_id                  0
h1n1_concern                   0
h1n1_knowledge                 0
behavioral_antiviral_meds      0
behavioral_avoidance           0
behavioral_face_mask           0
behavioral_wash_hands          0
behavioral_large_gatherings    0
behavioral_outside_home        0
behavioral_touch_face          0
doctor_recc_h1n1               0
doctor_recc_seasonal           0
chronic_med_condition          0
child_under_6_months           0
health_worker                  0
health_insurance               0
opinion_h1n1_vacc_effective    0
opinion_h1n1_risk              0
opinion_h1n1_sick_from_vacc    0
opinion_seas_vacc_effective    0
opinion_seas_risk              0
opinion_seas_sick_from_vacc    0
age_group                      0
education                      0
race                           0
sex                            0
income_poverty                 0
marital_status                 0
rent_or_own                 

In [5]:
# Handling nulls three ways

handling_nulls = "median" # options "median" "iterative" "dropall"
if handling_nulls == "iterative":
    #Need to add back the NaN for the imputations.
    df_train.replace(-1, np.nan) 
    df_train.replace("no_response", np.nan) 
    
    # SMOTE Sampling
    temp_columns = df_train.columns
    imp = IterativeImputer(max_iter=10, random_state=0)
    imp.fit(df_train)
    df_train = pd.DataFrame(data=imp.transform(df_train))
    df_train.columns = temp_columns
    df_train
elif handling_nulls == "median":
    df_train_median = df_train
    #Need to add back the NaN for the imputations.
    df_train_median.replace(-1, np.nan) 
    df_train_median.replace("no_response", np.nan) 
    df_train_median.fillna(df_train.median())
elif handling_nulls == "dropall":
    df_train_drop = df_train
    #Need to add back the NaN for the imputations.
    df_train_drop.replace(-1, np.nan) 
    df_train_drop.replace("no_response", np.nan) 
    # See how it is if we drop the NaNs
    df_train_drop = df_train.dropna(inplace=False) #This should be replace with imputation.

In [6]:
p("There should be no nulls now: ")
p(df_train_median.isnull().sum())

There should be no nulls now: 
respondent_id                  0
h1n1_concern                   0
h1n1_knowledge                 0
behavioral_antiviral_meds      0
behavioral_avoidance           0
behavioral_face_mask           0
behavioral_wash_hands          0
behavioral_large_gatherings    0
behavioral_outside_home        0
behavioral_touch_face          0
doctor_recc_h1n1               0
doctor_recc_seasonal           0
chronic_med_condition          0
child_under_6_months           0
health_worker                  0
health_insurance               0
opinion_h1n1_vacc_effective    0
opinion_h1n1_risk              0
opinion_h1n1_sick_from_vacc    0
opinion_seas_vacc_effective    0
opinion_seas_risk              0
opinion_seas_sick_from_vacc    0
age_group                      0
education                      0
race                           0
sex                            0
income_poverty                 0
marital_status                 0
rent_or_own                    0
employment_s

In [7]:
handling_nulls = "dropall"
if handling_nulls == "iterative":
    #Need to add back the NaN for the imputations.
    df_train.replace(-1, np.nan) 
    df_train.replace("no_response", np.nan) 
    
    # SMOTE Sampling
    temp_columns = df_train.columns
    imp = IterativeImputer(max_iter=10, random_state=0)
    imp.fit(df_train)
    df_train = pd.DataFrame(data=imp.transform(df_train))
    df_train.columns = temp_columns
    df_train
elif handling_nulls == "median":
    df_train_median = df_train
    #Need to add back the NaN for the imputations.
    df_train_median.replace(-1, np.nan) 
    df_train_median.replace("no_response", np.nan) 
    df_train_median.fillna(df_train.median())
elif handling_nulls == "dropall":
    df_train_drop = df_train
    #Need to add back the NaN for the imputations.
    df_train_drop.replace(-1, np.nan) 
    df_train_drop.replace("no_response", np.nan) 
    # See how it is if we drop the NaNs
    df_train_drop = df_train.dropna(inplace=False) #This should be replace with imputation.
p("There should be no nulls now: ")
p(df_train_drop.isnull().sum())

There should be no nulls now: 
respondent_id                  0
h1n1_concern                   0
h1n1_knowledge                 0
behavioral_antiviral_meds      0
behavioral_avoidance           0
behavioral_face_mask           0
behavioral_wash_hands          0
behavioral_large_gatherings    0
behavioral_outside_home        0
behavioral_touch_face          0
doctor_recc_h1n1               0
doctor_recc_seasonal           0
chronic_med_condition          0
child_under_6_months           0
health_worker                  0
health_insurance               0
opinion_h1n1_vacc_effective    0
opinion_h1n1_risk              0
opinion_h1n1_sick_from_vacc    0
opinion_seas_vacc_effective    0
opinion_seas_risk              0
opinion_seas_sick_from_vacc    0
age_group                      0
education                      0
race                           0
sex                            0
income_poverty                 0
marital_status                 0
rent_or_own                    0
employment_s

## Removing columns we don't need

Dropping ID as we do not need to use it in the modeling.

In [8]:
# Respondent_id are all unique so its irrelevant now that we merged.
df_train = df_train.drop(columns=['respondent_id'], inplace=False)
df_train_label = df_train_label.drop(columns=['respondent_id'], inplace=False)
df_train_median = df_train_median.drop(columns=['respondent_id'], inplace=False)
df_train_drop = df_train_drop.drop(columns=['respondent_id'], inplace=False)

## Class Balancing

Balancing/Oversampling minority classes using SMOTE.

In [9]:
oversample = SMOTE()

# Separating the features and targets
# Original Data
X_h1n1 = df_train.drop(columns=['h1n1_vaccine', 'seasonal_vaccine'])
X_seasonal = X_h1n1
y_h1n1 = df_train['h1n1_vaccine']
y_seasonal = df_train['seasonal_vaccine']
X_h1n1, y_h1n1 = oversample.fit_resample(X_h1n1, y_h1n1)
X_seasonal, y_seasonal = oversample.fit_resample(X_seasonal, y_seasonal)

# Encoded Data
X_label_h1n1 = df_train_label.drop(columns=['h1n1_vaccine', 'seasonal_vaccine'])
X_label_seasonal = X_label_h1n1
y_label_h1n1 = df_train_label['h1n1_vaccine']
y_label_seasonal = df_train_label['seasonal_vaccine']
X_label_h1n1, y_label_h1n1 = oversample.fit_resample(X_label_h1n1, y_label_h1n1)
X_label_seasonal, y_label_seasonal = oversample.fit_resample(X_label_seasonal, y_label_seasonal)

# Nulls replaced with median data
X_median_h1n1 = df_train_median.drop(columns=['h1n1_vaccine', 'seasonal_vaccine'])
X_median_seasonal = X_median_h1n1
y_median_h1n1 = df_train_median['h1n1_vaccine']
y_median_seasonal = df_train_median['seasonal_vaccine']
X_median_h1n1, y_median_h1n1 = oversample.fit_resample(X_median_h1n1, y_median_h1n1)
X_median_seasonal, y_median_seasonal = oversample.fit_resample(X_median_seasonal, y_median_seasonal)

# Nulls dropped data
X_drop_h1n1 = df_train_drop.drop(columns=['h1n1_vaccine', 'seasonal_vaccine'])
X_drop_seasonal = X_drop_h1n1
y_drop_h1n1 = df_train_drop['h1n1_vaccine']
y_drop_seasonal = df_train_drop['seasonal_vaccine']
X_drop_h1n1, y_drop_h1n1 = oversample.fit_resample(X_drop_h1n1, y_drop_h1n1)
X_drop_seasonal, y_drop_seasonal = oversample.fit_resample(X_drop_seasonal, y_drop_seasonal)

In [11]:
# Testing to see if y is balanced
ytest = y_drop_h1n1.to_frame()
ytest.value_counts()

h1n1_vaccine
0               21033
1               21033
dtype: int64

## Preparing train/test/validation sets

In [12]:
# Splitting data into 70-20-10 train-test-validation sets

# Original Data
X_train_h1n1, X_test_h1n1, y_train_h1n1, y_test_h1n1 = train_test_split(X_h1n1, y_h1n1, train_size=.7)
X_test_h1n1, X_val_h1n1, y_test_h1n1, y_val_h1n1 = train_test_split(X_test_h1n1, y_test_h1n1, train_size=.67)

X_train_seasonal, X_test_seasonal, y_train_seasonal, y_test_seasonal = train_test_split(X_seasonal, y_seasonal, train_size=.7)
X_test_seasonal, X_val_seasonal, y_test_seasonal, y_val_seasonal = train_test_split(X_test_seasonal, y_test_seasonal, train_size=.67)

# Encoded Data
X_train_label_h1n1, X_test_label_h1n1, y_train_label_h1n1, y_test_label_h1n1 = train_test_split(X_label_h1n1, y_label_h1n1, train_size=.7)
X_test_label_h1n1, X_val_label_h1n1, y_test_label_h1n1, y_val_label_h1n1 = train_test_split(X_test_label_h1n1, y_test_label_h1n1, train_size=.67)

X_train_label_seasonal, X_test_label_seasonal, y_train_label_seasonal, y_test_label_seasonal = train_test_split(X_label_seasonal, y_label_seasonal, train_size=.7)
X_test_label_seasonal, X_val_label_seasonal, y_test_label_seasonal, y_val_label_seasonal = train_test_split(X_test_label_seasonal, y_test_label_seasonal, train_size=.67)

# Nulls replaced with median data
X_train_median_h1n1, X_test_median_h1n1, y_train_median_h1n1, y_test_median_h1n1 = train_test_split(X_median_h1n1, y_median_h1n1, train_size=.7)
X_test_median_h1n1, X_val_median_h1n1, y_test_median_h1n1, y_val_median_h1n1 = train_test_split(X_test_median_h1n1, y_test_median_h1n1, train_size=.67)

X_train_median_seasonal, X_test_median_seasonal, y_train_median_seasonal, y_test_median_seasonal = train_test_split(X_median_seasonal, y_median_seasonal, train_size=.7)
X_test_median_seasonal, X_val_median_seasonal, y_test_median_seasonal, y_val_median_seasonal = train_test_split(X_test_median_seasonal, y_test_median_seasonal, train_size=.67)

# Nulls dropped data
X_train_drop_h1n1, X_test_drop_h1n1, y_train_drop_h1n1, y_test_drop_h1n1 = train_test_split(X_drop_h1n1, y_drop_h1n1, train_size=.7)
X_test_drop_h1n1, X_val_drop_h1n1, y_test_drop_h1n1, y_val_drop_h1n1 = train_test_split(X_test_drop_h1n1, y_test_drop_h1n1, train_size=.67)

X_train_drop_seasonal, X_test_drop_seasonal, y_train_drop_seasonal, y_test_drop_seasonal = train_test_split(X_drop_seasonal, y_drop_seasonal, train_size=.7)
X_test_drop_seasonal, X_val_drop_seasonal, y_test_drop_seasonal, y_val_drop_seasonal = train_test_split(X_test_drop_seasonal, y_test_drop_seasonal, train_size=.67)

## Normalization and Standardization

In [13]:
# Data Normalization with sklearn

# Fitting a scaler on the training datasets
normh1n1= MinMaxScaler().fit(X_train_h1n1)
normseasonal = MinMaxScaler().fit(X_train_seasonal)

# Transforming the training datasets
X_train_norm_h1n1 = normh1n1.transform(X_train_h1n1)
X_train_norm_seasonal = normseasonal.transform(X_train_seasonal)

# transform the testing dataset
X_test_norm_h1n1 = normh1n1.transform(X_test_h1n1)
X_test_norm_seasonal = normseasonal.transform(X_test_seasonal)

In [15]:
# Data standardization with sklearn

# copy the two datasets
X_train_stand_h1n1 = X_train_h1n1.copy()
X_train_stand_seasonal = X_train_seasonal.copy()
X_test_stand_h1n1 = X_test_h1n1.copy()
X_test_stand_seasonal = X_test_seasonal.copy()

# Group the numerical features and not categorical

num_cols = ['h1n1_concern','h1n1_knowledge','behavioral_antiviral_meds','behavioral_avoidance','behavioral_face_mask','behavioral_large_gatherings',
'behavioral_outside_home','behavioral_wash_hands','behavioral_touch_face','doctor_recc_h1n1','doctor_recc_seasonal','chronic_med_condition',
'child_under_6_months','health_worker','health_insurance','opinion_h1n1_vacc_effective','opinion_h1n1_sick_from_vacc','opinion_h1n1_risk',
'opinion_seas_sick_from_vacc','household_adults','household_children']

# Apply standardization on the numerical features
for i in num_cols:
    
    # Fit the scaler on the training data column
    scale_h1n1 = StandardScaler().fit(X_train_stand_h1n1[[i]])
    scale_seasonal = StandardScaler().fit(X_train_stand_seasonal[[i]])
    
    # Transform the training data column
    X_train_stand_h1n1[i] = scale_h1n1.transform(X_train_stand_h1n1[[i]])
    X_train_stand_seasonal[i] = scale_seasonal.transform(X_train_stand_seasonal[[i]])
    
    # Transform the testing data column
    X_test_stand_h1n1[i] = scale_h1n1.transform(X_test_stand_h1n1[[i]])
    X_test_stand_seasonal[i] = scale_seasonal.transform(X_test_stand_seasonal[[i]])