In [120]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [98]:
dir = '/content/drive/MyDrive/Colab/ML-Projects/Predict-Flu-Vaccine/'

X_test = pd.read_csv(dir + 'test_set_features.csv', index_col='respondent_id')
X_train = pd.read_csv(dir + 'training_set_features.csv', index_col='respondent_id')
y_train = pd.read_csv(dir + 'training_set_labels.csv', index_col='respondent_id')

In [99]:
print(f"Train data: {X_train.shape}")
print(f"Test data: {X_test.shape}")
print(f"Train labels: {y_train.shape}")

Train data: (26707, 35)
Test data: (26708, 35)
Train labels: (26707, 2)


In [100]:
cols = [
    'age_group',
    'education',
    'race',
    'sex',
    'income_poverty',
    'marital_status',
    'rent_or_own',
    'employment_status',
    'employment_industry',
    'employment_occupation',
    'hhs_geo_region',
    'census_msa'
]
## Remove '<' '>' characters from data
remove_special_chars = lambda row: str(row).replace('<', 'less than ').replace('>', 'greater than ')

for col in cols:
  X_train[col] = X_train[col].map(remove_special_chars)
  X_test[col] = X_test[col].map(remove_special_chars)

In [101]:
## Label encode categorical columns
from sklearn import preprocessing
## Encoding categorical variables - age_group, education, race, sex, income_poverty, maritial_status, rent_or_own, 
# employmant_status, employment_industry, employment_occupation

encoder = preprocessing.LabelEncoder()
combined = pd.concat([X_train, X_test])
for col in cols:
  encoder = encoder.fit(combined[col].values.reshape(-1, 1))
  X_train[col] = encoder.transform(X_train[col].values.reshape(-1, 1))
  X_test[col] = encoder.transform(X_test[col].values.reshape(-1,1))

print(X_train.shape)
print(X_test.shape)

(26707, 35)
(26708, 35)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [102]:
## Replace Nan values

imputed_X_train = X_train.copy()
imputed_X_test = X_test.copy()

cols_with_missing = [col for col in X_train.columns.tolist() if X_train[col].isnull().any()]

for col in cols_with_missing:
  imputed_X_train[col + '_was_missing'] = imputed_X_train[col].isnull()
  imputed_X_train[col].fillna(imputed_X_train[col].mean(), inplace=True)
  imputed_X_test[col + '_was_missing'] = imputed_X_test[col].isnull()
  imputed_X_test[col].fillna(imputed_X_test[col].mean(), inplace=True)

# Imputation
# imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# imputed_X_train = imputer.fit_transform(imputed_X_train)
# imputed_X_test = imputer.transform(imputed_X_test)

In [113]:
combined_df_h1n1 = pd.concat([imputed_X_train, y_train.drop('seasonal_vaccine', axis=1)], axis=1)

combined_df_h1n1['kfold'] = -1

combined_df_h1n1 = combined_df_h1n1.sample(frac=1).reset_index(drop=True)

y = combined_df_h1n1['h1n1_vaccine'].values

kf = model_selection.StratifiedKFold(n_splits=5)

for f, (t_, v_) in enumerate(kf.split(X=combined_df_h1n1, y=y)):
  combined_df_h1n1.loc[v_, 'kfold'] = f

# save to csv
combined_df_h1n1.to_csv(dir + 'train_folds_h1n1.csv', index=False)   

In [118]:
combined_df_seasonal = pd.concat([imputed_X_train, y_train.drop('h1n1_vaccine', axis=1)], axis=1)

combined_df_seasonal['kfold'] = -1

combined_df_seasonal = combined_df_seasonal.sample(frac=1).reset_index(drop=True)

y = combined_df_seasonal['seasonal_vaccine'].values

kf = model_selection.KFold(n_splits=5)

for f, (t_, v_) in enumerate(kf.split(X=combined_df_seasonal, y=y)):
  combined_df_seasonal.loc[v_, 'kfold'] = f

# save to csv
combined_df_seasonal.to_csv(dir + 'train_folds_seasonal.csv', index=False)   

In [119]:
combined_df_seasonal.head()

Unnamed: 0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,doctor_recc_seasonal,chronic_med_condition,child_under_6_months,health_worker,health_insurance,opinion_h1n1_vacc_effective,opinion_h1n1_risk,opinion_h1n1_sick_from_vacc,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc,age_group,education,race,sex,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation,h1n1_concern_was_missing,h1n1_knowledge_was_missing,behavioral_antiviral_meds_was_missing,behavioral_avoidance_was_missing,behavioral_face_mask_was_missing,behavioral_wash_hands_was_missing,behavioral_large_gatherings_was_missing,behavioral_outside_home_was_missing,behavioral_touch_face_was_missing,doctor_recc_h1n1_was_missing,doctor_recc_seasonal_was_missing,chronic_med_condition_was_missing,child_under_6_months_was_missing,health_worker_was_missing,health_insurance_was_missing,opinion_h1n1_vacc_effective_was_missing,opinion_h1n1_risk_was_missing,opinion_h1n1_sick_from_vacc_was_missing,opinion_seas_vacc_effective_was_missing,opinion_seas_risk_was_missing,opinion_seas_sick_from_vacc_was_missing,household_adults_was_missing,household_children_was_missing,seasonal_vaccine,kfold
0,2.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.220312,0.329735,1.0,0.0,0.0,0.87972,5.0,2.0,1.0,5.0,4.0,1.0,2,1,3,1,1,1,0,1,3,1,1.0,0.0,10,11,False,False,False,False,False,False,False,False,False,True,True,False,False,False,True,False,False,False,False,False,False,False,False,1,0
1,2.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,3.0,2.0,1.0,4.0,4.0,1.0,4,0,1,0,3,1,0,1,6,1,0.0,0.0,10,11,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,1,0
2,3.0,2.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.87972,4.0,5.0,2.0,4.0,4.0,5.0,3,2,3,0,1,0,0,0,0,2,0.0,0.0,4,2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,1,0
3,2.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.87972,4.0,2.0,1.0,5.0,2.0,1.0,2,2,3,0,2,0,0,1,6,0,2.0,2.0,10,11,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,1,0
4,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.283261,0.0,0.0,0.87972,4.0,1.0,2.0,4.0,4.0,1.0,0,2,3,0,2,1,0,0,0,0,1.0,0.0,13,20,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,0,0


In [122]:
imputed_X_test.to_csv(dir + 'imputed_test.csv', index=False)