In [45]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

## Import Features dataset

In [129]:
# Load the combined features-and-labels CSV (path is relative to the notebook's
# working directory) and preview the first five rows.
df = pd.read_csv('./data/features_labels.csv')
df.head()

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation,h1n1_vaccine,seasonal_vaccine,is_vaccinated
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,,0,0,No Vaccine
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe,0,1,Seasonal
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo,0,0,No Vaccine
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,,0,1,Seasonal
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb,0,0,No Vaccine


In [47]:
# (rows, columns) of the loaded frame.
df.shape

(26707, 39)

## Identify the percent of Missing values

In [48]:
# Per-column share of missing values (in percent) next to each column's dtype.
# The dtype column is labelled 'Data type' with no trailing space, so it can be
# selected by exactly the name shown in the rendered table.
missing_value_df = pd.DataFrame(
    {'percent_missing': df.isnull().sum() * 100 / len(df),
     'Data type': df.dtypes})

In [49]:
# Order the summary from the most-missing column to the least-missing one.
# The sorted result is bound back to the same name instead of sorting in place.
missing_value_df = missing_value_df.sort_values(
    by='percent_missing',
    ascending=False,
)

## The 3 columns below have close to 50% of their data missing:
    # employment_occupation, 
    # employment_industry, 
    # health_insurance

In [50]:
# Display the missing-value summary (sorted by percent_missing, descending).
missing_value_df

Unnamed: 0,percent_missing,Data type
employment_occupation,50.436215,object
employment_industry,49.912008,object
health_insurance,45.957989,float64
income_poverty,16.561201,object
doctor_recc_h1n1,8.087767,float64
doctor_recc_seasonal,8.087767,float64
rent_or_own,7.645936,object
employment_status,5.477965,object
marital_status,5.272026,object
education,5.268282,object


### Unique values per column in the dataset

In [51]:
# Print each column name followed by the distinct values found in that column.
for column_name in df.columns:
    distinct_values = df[column_name].unique()
    print(column_name, ": ", distinct_values)

respondent_id :  [    0     1     2 ... 26704 26705 26706]
h1n1_concern :  [ 1.  3.  2.  0. nan]
h1n1_knowledge :  [ 0.  2.  1. nan]
behavioral_antiviral_meds :  [ 0.  1. nan]
behavioral_avoidance :  [ 0.  1. nan]
behavioral_face_mask :  [ 0.  1. nan]
behavioral_wash_hands :  [ 0.  1. nan]
behavioral_large_gatherings :  [ 0.  1. nan]
behavioral_outside_home :  [ 1.  0. nan]
behavioral_touch_face :  [ 1.  0. nan]
doctor_recc_h1n1 :  [ 0. nan  1.]
doctor_recc_seasonal :  [ 0. nan  1.]
chronic_med_condition :  [ 0.  1. nan]
child_under_6_months :  [ 0.  1. nan]
health_worker :  [ 0.  1. nan]
health_insurance :  [ 1. nan  0.]
opinion_h1n1_vacc_effective :  [ 3.  5.  4.  2.  1. nan]
opinion_h1n1_risk :  [ 1.  4.  3.  2.  5. nan]
opinion_h1n1_sick_from_vacc :  [ 2.  4.  1.  5.  3. nan]
opinion_seas_vacc_effective :  [ 2.  4.  5.  3.  1. nan]
opinion_seas_risk :  [ 1.  2.  4.  3.  5. nan]
opinion_seas_sick_from_vacc :  [ 2.  4.  1.  5. nan  3.]
age_group :  ['55 - 64 Years' '35 - 44 Years' '1

### In household_adults, 8056 rows (about 30% of the data) record the number of adults as 0.
#### Should a value of 0 be treated as a null value?

In [52]:
# Percentage of rows whose household_adults value is exactly 0.
# The boolean mask is built once; summing it counts the matching rows.
zero_adults_mask = df['household_adults'] == 0
print(zero_adults_mask.sum() * 100 / len(df))

30.164376380724153


In [53]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,opinion_h1n1_vacc_effective,opinion_h1n1_risk,opinion_h1n1_sick_from_vacc,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc,household_adults,household_children,h1n1_vaccine,seasonal_vaccine
count,26707.0,26615.0,26591.0,26636.0,26499.0,26688.0,26665.0,26620.0,26625.0,26579.0,...,26316.0,26319.0,26312.0,26245.0,26193.0,26170.0,26458.0,26458.0,26707.0,26707.0
mean,13353.0,1.618486,1.262532,0.048844,0.725612,0.068982,0.825614,0.35864,0.337315,0.677264,...,3.850623,2.342566,2.35767,4.025986,2.719162,2.118112,0.886499,0.534583,0.212454,0.465608
std,7709.791156,0.910311,0.618149,0.215545,0.446214,0.253429,0.379448,0.47961,0.472802,0.467531,...,1.007436,1.285539,1.362766,1.086565,1.385055,1.33295,0.753422,0.928173,0.409052,0.498825
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,6676.5,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,3.0,1.0,1.0,4.0,2.0,1.0,0.0,0.0,0.0,0.0
50%,13353.0,2.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,4.0,2.0,2.0,4.0,2.0,2.0,1.0,0.0,0.0,0.0
75%,20029.5,2.0,2.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,...,5.0,4.0,4.0,5.0,4.0,4.0,1.0,1.0,0.0,1.0
max,26706.0,3.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,5.0,5.0,5.0,5.0,5.0,5.0,3.0,3.0,1.0,1.0


In [54]:
# Per-column dtype and non-null count for the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 39 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                26707 non-null  int64  
 1   h1n1_concern                 26615 non-null  float64
 2   h1n1_knowledge               26591 non-null  float64
 3   behavioral_antiviral_meds    26636 non-null  float64
 4   behavioral_avoidance         26499 non-null  float64
 5   behavioral_face_mask         26688 non-null  float64
 6   behavioral_wash_hands        26665 non-null  float64
 7   behavioral_large_gatherings  26620 non-null  float64
 8   behavioral_outside_home      26625 non-null  float64
 9   behavioral_touch_face        26579 non-null  float64
 10  doctor_recc_h1n1             24547 non-null  float64
 11  doctor_recc_seasonal         24547 non-null  float64
 12  chronic_med_condition        25736 non-null  float64
 13  child_under_6_mo

### The features below appear to have values that are Missing Not at Random (MNAR)
### Some respondents did not answer the 'Doctor Recommendation' questions for the H1N1 and seasonal flu vaccines, as shown below.
    This accounts for about 8% of the total data.
### Should this kind of data be deleted or imputed?
    
    doctor_recc_h1n1               2160
    doctor_recc_seasonal           2160

### 'household_adults' and 'household_children' have the same number of missing values:
    household_adults                249
    household_children              249

### Selected Features

In [55]:
# Create a new data frame with selected features.
df_features = df.drop(['respondent_id','h1n1_vaccine','seasonal_vaccine','is_vaccinated'], axis = 1)
df_features.head()

Unnamed: 0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb


### Object Column types from df_features dataframe
    * Get list of columns of type object
    * Perform forward fill to eradicate NaN values

In [57]:
# Boolean mask per column: True where the column's dtype is not "object".
df_features.dtypes != "object"

h1n1_concern                    True
h1n1_knowledge                  True
behavioral_antiviral_meds       True
behavioral_avoidance            True
behavioral_face_mask            True
behavioral_wash_hands           True
behavioral_large_gatherings     True
behavioral_outside_home         True
behavioral_touch_face           True
doctor_recc_h1n1                True
doctor_recc_seasonal            True
chronic_med_condition           True
child_under_6_months            True
health_worker                   True
health_insurance                True
opinion_h1n1_vacc_effective     True
opinion_h1n1_risk               True
opinion_h1n1_sick_from_vacc     True
opinion_seas_vacc_effective     True
opinion_seas_risk               True
opinion_seas_sick_from_vacc     True
age_group                      False
education                      False
race                           False
sex                            False
income_poverty                 False
marital_status                 False
r

In [115]:
# Select every column whose dtype is not "object" and keep them as the
# numeric feature frame.
is_non_object_dtype = df_features.dtypes != "object"
numeric_cols = df_features.columns[is_non_object_dtype].values
print(numeric_cols)

df_num_features = df_features.loc[:, numeric_cols]
display(df_num_features.head())

['h1n1_concern' 'h1n1_knowledge' 'behavioral_antiviral_meds'
 'behavioral_avoidance' 'behavioral_face_mask' 'behavioral_wash_hands'
 'behavioral_large_gatherings' 'behavioral_outside_home'
 'behavioral_touch_face' 'doctor_recc_h1n1' 'doctor_recc_seasonal'
 'chronic_med_condition' 'child_under_6_months' 'health_worker'
 'health_insurance' 'opinion_h1n1_vacc_effective' 'opinion_h1n1_risk'
 'opinion_h1n1_sick_from_vacc' 'opinion_seas_vacc_effective'
 'opinion_seas_risk' 'opinion_seas_sick_from_vacc' 'household_adults'
 'household_children']


Unnamed: 0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,health_worker,health_insurance,opinion_h1n1_vacc_effective,opinion_h1n1_risk,opinion_h1n1_sick_from_vacc,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc,household_adults,household_children
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,3.0,1.0,2.0,2.0,1.0,2.0,0.0,0.0
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,1.0,5.0,4.0,4.0,4.0,2.0,4.0,0.0,0.0
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,...,0.0,,3.0,1.0,1.0,4.0,1.0,2.0,2.0,0.0
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,,3.0,3.0,5.0,5.0,4.0,1.0,0.0,0.0
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,,3.0,3.0,2.0,3.0,1.0,4.0,1.0,0.0


In [61]:
# Names of the columns stored with the "object" dtype (the string-valued ones).
is_object_dtype = df_features.dtypes == "object"
category_cols = df_features.columns[is_object_dtype].values
print(category_cols)

['age_group' 'education' 'race' 'sex' 'income_poverty' 'marital_status'
 'rent_or_own' 'employment_status' 'hhs_geo_region' 'census_msa'
 'employment_industry' 'employment_occupation']


In [96]:
# Independent frame holding only the object-dtype (categorical) columns.
df_cat_features = df_features[category_cols].copy()

In [97]:
# Fill missing categorical values from neighbouring rows.
#   1. Forward fill copies the previous row's value into each NaN.
#   2. Backward fill then populates any NaNs left at the top of a column
#      (the first row has no previous row to copy from) using the next
#      non-missing value.
# `.ffill()` / `.bfill()` are used because `fillna(method=...)` is deprecated in
# recent pandas, and a single vectorised call avoids chained indexing
# (`frame[col][0] = ...`), which does not reliably write to the frame.
# NOTE(review): filling from adjacent rows assumes row order is arbitrary; a
# most-frequent imputer or an explicit "Missing" category may be preferable for
# columns with ~50% missing values — confirm with the modelling approach.
df_cat_features = df_cat_features.ffill(axis=0).bfill(axis=0)

In [98]:
# Column names of the categorical frame, as a plain Python list.
df_cat_cols = df_cat_features.columns.tolist()

In [99]:
# Display the categorical frame after the missing values have been filled.
df_cat_features

Unnamed: 0,age_group,education,race,sex,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,employment_industry,employment_occupation
0,55 - 64 Years,< 12 Years,White,Female,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,pxcmvdjn,xgwztkwe
1,35 - 44 Years,12 Years,White,Male,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",pxcmvdjn,xgwztkwe
2,18 - 34 Years,College Graduate,White,Male,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",rucpziij,xtkaffoo
3,65+ Years,12 Years,White,Female,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",rucpziij,xtkaffoo
4,45 - 54 Years,Some College,White,Female,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",wxleyezf,emcorrxb
...,...,...,...,...,...,...,...,...,...,...,...,...
26702,65+ Years,Some College,White,Female,"<= $75,000, Above Poverty",Not Married,Own,Not in Labor Force,qufhixun,Non-MSA,pxcmvdjn,xgwztkwe
26703,18 - 34 Years,College Graduate,White,Male,"<= $75,000, Above Poverty",Not Married,Rent,Employed,lzgpxyit,"MSA, Principle City",fcxhlnwr,cmhcxjea
26704,55 - 64 Years,Some College,White,Female,"<= $75,000, Above Poverty",Not Married,Own,Employed,lzgpxyit,"MSA, Not Principle City",fcxhlnwr,cmhcxjea
26705,18 - 34 Years,Some College,Hispanic,Female,"<= $75,000, Above Poverty",Married,Rent,Employed,lrircsnp,Non-MSA,fcxhlnwr,haliazsg


In [100]:
# Check dtypes and non-null counts of the categorical frame.
df_cat_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   age_group              26707 non-null  object
 1   education              26707 non-null  object
 2   race                   26707 non-null  object
 3   sex                    26707 non-null  object
 4   income_poverty         26707 non-null  object
 5   marital_status         26707 non-null  object
 6   rent_or_own            26707 non-null  object
 7   employment_status      26707 non-null  object
 8   hhs_geo_region         26707 non-null  object
 9   census_msa             26707 non-null  object
 10  employment_industry    26707 non-null  object
 11  employment_occupation  26707 non-null  object
dtypes: object(12)
memory usage: 2.4+ MB


In [101]:
# Number of categorical columns.
len(df_cat_cols)

12

### Function to convert Categorical (String) data into Ordinal values

In [102]:
# Shared encoder instance; it is refitted on every call to convertOrdEncode,
# so after a call it only holds the fit for the most recently encoded column.
ord_enc = OrdinalEncoder()

# Category -> code lookup per encoded column: ord_categories[col][k] is the
# original value that was replaced by code k in that column.
ord_categories = {}


def convertOrdEncode(col):
    """Replace the values of one categorical column with ordinal codes.

    The module-level ``df_cat_features`` frame is modified: column ``col`` is
    overwritten with the codes produced by fitting ``ord_enc`` on that column
    alone. Because the shared encoder is refitted for every column, the
    fitted categories are also recorded in ``ord_categories[col]`` so the
    mapping from code back to original value is not lost.

    Parameters
    ----------
    col : str
        Name of a column in ``df_cat_features``.

    Notes
    -----
    NOTE(review): OrdinalEncoder orders categories by sorted value when no
    explicit ``categories`` are given, so the codes do not necessarily follow
    any natural order of the categories — confirm this is acceptable.
    Calling this again on an already-encoded column overwrites the recorded
    mapping with the numeric codes.
    """
    df_cat_features[col] = ord_enc.fit_transform(df_cat_features[[col]])
    # categories_ holds one array per fitted feature; exactly one was fitted.
    ord_categories[col] = list(ord_enc.categories_[0])

In [104]:
# Ordinal-encode every categorical column; convertOrdEncode overwrites each
# column of df_cat_features with its codes.
for col in df_cat_cols:
    convertOrdEncode(col)

In [105]:
# Display the categorical frame after ordinal encoding.
df_cat_features

Unnamed: 0,age_group,education,race,sex,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,employment_industry,employment_occupation
0,3.0,1.0,3.0,0.0,2.0,1.0,0.0,1.0,8.0,2.0,12.0,19.0
1,1.0,0.0,3.0,1.0,2.0,1.0,1.0,0.0,1.0,0.0,12.0,19.0
2,0.0,2.0,3.0,1.0,0.0,1.0,0.0,0.0,9.0,0.0,14.0,21.0
3,4.0,0.0,3.0,0.0,2.0,1.0,1.0,1.0,5.0,1.0,14.0,21.0
4,2.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,18.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...
26702,4.0,3.0,3.0,0.0,0.0,1.0,0.0,1.0,9.0,2.0,12.0,19.0
26703,0.0,2.0,3.0,1.0,0.0,1.0,1.0,0.0,6.0,1.0,4.0,2.0
26704,3.0,3.0,3.0,0.0,0.0,1.0,0.0,0.0,6.0,0.0,4.0,2.0
26705,0.0,3.0,1.0,0.0,0.0,0.0,1.0,0.0,5.0,2.0,4.0,6.0


In [118]:
# Combine the numeric columns and the encoded categorical columns side by
# side, aligned on the row index.
df_features_final = df_num_features.join(other=df_cat_features, how="left")
display(df_features_final)

Unnamed: 0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,race,sex,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,employment_industry,employment_occupation
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,3.0,0.0,2.0,1.0,0.0,1.0,8.0,2.0,12.0,19.0
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,3.0,1.0,2.0,1.0,1.0,0.0,1.0,0.0,12.0,19.0
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,...,3.0,1.0,0.0,1.0,0.0,0.0,9.0,0.0,14.0,21.0
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,3.0,0.0,2.0,1.0,1.0,1.0,5.0,1.0,14.0,21.0
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,3.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,18.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26702,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,3.0,0.0,0.0,1.0,0.0,1.0,9.0,2.0,12.0,19.0
26703,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,3.0,1.0,0.0,1.0,1.0,0.0,6.0,1.0,4.0,2.0
26704,2.0,2.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,3.0,0.0,0.0,1.0,0.0,0.0,6.0,0.0,4.0,2.0
26705,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,5.0,2.0,4.0,6.0


In [119]:
# Standardiser that centres each feature on its mean and scales it by its
# standard deviation, working on a copy of the input.
scaler = StandardScaler(
    copy=True,
    with_mean=True,
    with_std=True,
)

In [120]:
# Fit the scaler on the combined feature frame and standardise it in one step.
# NOTE(review): the numeric columns still contain NaNs here (imputation happens
# in a later cell), and the ordinal codes of the categorical columns are
# standardised as if they were continuous — confirm both are intended.
df_scaled = scaler.fit_transform(df_features_final)

In [121]:
# Total number of elements in the scaled result.
df_scaled.size

934745

In [122]:
# Imputer that replaces each NaN with the median of its column.
# `np.nan` is the missing-value marker; the `np.NaN` alias no longer exists in
# NumPy 2.0 and raises AttributeError there.
imputer = SimpleImputer(
    missing_values=np.nan,
    strategy='median',
    fill_value=None,      # only consulted when strategy='constant'
    add_indicator=False,
    copy=True,
)

In [123]:
# Fill the remaining NaNs with per-column medians and wrap the result in a
# DataFrame. No column labels are passed, so the columns get default integer
# labels rather than the original feature names.
imputed_values = imputer.fit_transform(df_scaled)
idf = pd.DataFrame(imputed_values)
idf

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
0,-0.679436,-2.042478,-0.22661,-1.626185,-0.272201,-2.175870,-0.747788,1.401639,0.690311,-0.531568,...,0.465442,-0.827124,2.068808,1.074551,-0.562720,0.794763,1.129841,1.416875,0.361274,1.066170
1,1.517658,1.193048,-0.22661,0.614936,-0.272201,0.459586,-0.747788,1.401639,0.690311,-0.531568,...,0.465442,1.209008,2.068808,1.074551,1.777084,-0.860782,-1.364875,-1.012379,0.361274,1.066170
2,-0.679436,-0.424715,-0.22661,0.614936,-0.272201,-2.175870,-0.747788,-0.713450,-1.448623,-0.531568,...,0.465442,1.209008,-0.786206,1.074551,-0.562720,-0.860782,1.486229,-1.012379,0.660871,1.354578
3,-0.679436,-0.424715,-0.22661,0.614936,-0.272201,0.459586,1.337277,-0.713450,-1.448623,-0.531568,...,0.465442,-0.827124,2.068808,1.074551,1.777084,0.794763,0.060677,0.202248,0.660871,1.354578
4,0.419111,-0.424715,-0.22661,0.614936,-0.272201,0.459586,1.337277,-0.713450,0.690311,-0.531568,...,0.465442,-0.827124,-0.786206,-0.930622,-0.562720,-0.860782,1.486229,-1.012379,1.260065,-0.952684
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26702,0.419111,-2.042478,-0.22661,0.614936,-0.272201,-2.175870,-0.747788,1.401639,-1.448623,-0.531568,...,0.465442,-0.827124,-0.786206,1.074551,-0.562720,0.794763,1.486229,1.416875,0.361274,1.066170
26703,-0.679436,1.193048,-0.22661,0.614936,-0.272201,0.459586,-0.747788,-0.713450,-1.448623,1.881227,...,0.465442,1.209008,-0.786206,1.074551,1.777084,-0.860782,0.417065,0.202248,-0.837116,-1.385296
26704,0.419111,1.193048,-0.22661,0.614936,3.673754,0.459586,1.337277,-0.713450,0.690311,-0.531568,...,0.465442,-0.827124,-0.786206,1.074551,-0.562720,-0.860782,0.417065,-1.012379,-0.837116,-1.385296
26705,-0.679436,-0.424715,-0.22661,-1.626185,-0.272201,-2.175870,-0.747788,-0.713450,0.690311,-0.531568,...,-1.700915,-0.827124,-0.786206,-0.930622,1.777084,-0.860782,0.060677,1.416875,-0.837116,-0.808480


In [124]:
# One-hot encode column 0 only; every other column is dropped from the output.
one_hot_first_column = ('encoder', OneHotEncoder(), [0])
columnTransformer = ColumnTransformer(
    transformers=[one_hot_first_column],
    remainder="drop",
    sparse_threshold=0.3,
    n_jobs=None,
    verbose=False,
)

In [125]:
# Fit and apply the column transformer to idf. The result is only displayed;
# it is not assigned to a variable, so idf itself is unchanged.
columnTransformer.fit_transform(idf)

<26707x4 sparse matrix of type '<class 'numpy.float64'>'
	with 26707 stored elements in Compressed Sparse Row format>

In [128]:
# Persist the scaled and imputed features without the row index.
idf.to_csv("./data/features_numeric.csv", index=False)

In [127]:
# Preview the first rows of the scaled and imputed feature frame.
idf.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
0,-0.679436,-2.042478,-0.22661,-1.626185,-0.272201,-2.17587,-0.747788,1.401639,0.690311,-0.531568,...,0.465442,-0.827124,2.068808,1.074551,-0.56272,0.794763,1.129841,1.416875,0.361274,1.06617
1,1.517658,1.193048,-0.22661,0.614936,-0.272201,0.459586,-0.747788,1.401639,0.690311,-0.531568,...,0.465442,1.209008,2.068808,1.074551,1.777084,-0.860782,-1.364875,-1.012379,0.361274,1.06617
2,-0.679436,-0.424715,-0.22661,0.614936,-0.272201,-2.17587,-0.747788,-0.71345,-1.448623,-0.531568,...,0.465442,1.209008,-0.786206,1.074551,-0.56272,-0.860782,1.486229,-1.012379,0.660871,1.354578
3,-0.679436,-0.424715,-0.22661,0.614936,-0.272201,0.459586,1.337277,-0.71345,-1.448623,-0.531568,...,0.465442,-0.827124,2.068808,1.074551,1.777084,0.794763,0.060677,0.202248,0.660871,1.354578
4,0.419111,-0.424715,-0.22661,0.614936,-0.272201,0.459586,1.337277,-0.71345,0.690311,-0.531568,...,0.465442,-0.827124,-0.786206,-0.930622,-0.56272,-0.860782,1.486229,-1.012379,1.260065,-0.952684


### Convert to categorical data types

In [202]:
# Show the dtype of every column in df_cat.
# NOTE(review): df_cat is not defined in any cell visible in this notebook —
# presumably it came from a deleted cell or an earlier kernel session; this
# cell will raise NameError on Restart & Run All unless df_cat is recreated.
df_cat.dtypes

h1n1_concern                   category
h1n1_knowledge                 category
behavioral_antiviral_meds      category
behavioral_avoidance           category
behavioral_face_mask           category
behavioral_wash_hands          category
behavioral_large_gatherings    category
behavioral_outside_home        category
behavioral_touch_face          category
doctor_recc_h1n1               category
doctor_recc_seasonal           category
chronic_med_condition          category
child_under_6_months           category
health_worker                  category
opinion_h1n1_vacc_effective    category
opinion_h1n1_risk              category
opinion_h1n1_sick_from_vacc    category
opinion_seas_vacc_effective    category
opinion_seas_risk              category
opinion_seas_sick_from_vacc    category
age_group                      category
education                      category
race                           category
sex                            category
income_poverty                 category


### Print cat codes map

### output cleaned csv file

In [252]:
# Write the scaled and imputed features to the cleaned-output CSV.
# NOTE(review): no index=False is passed here, so the row index is written as
# an extra first column — unlike the features_numeric.csv export, which passes
# index=False. Confirm which form downstream readers expect.
idf.to_csv("./data/vaccine_cleanup_new.csv")