In [1]:
# imports
import numpy as np
import pandas as pd
import statistics
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer


In [2]:
#initial data cleaning (causes pipeline to freeze after one use if pd.drop() is in pipeline.)
features = pd.read_csv("Data/training_set_features.csv")
labels = pd.read_csv("Data/training_set_labels.csv")

features.drop(['h1n1_concern', 'h1n1_knowledge', 'doctor_recc_h1n1',
               'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk', 
               'opinion_h1n1_sick_from_vacc', 'hhs_geo_region', 
               'employment_industry', 'employment_occupation'], axis = 1, inplace = True)

labels.drop(['h1n1_vaccine', 'respondent_id'], axis = 1, inplace= True)
labels = np.ravel(labels, order = 'C')

In [3]:
# train_test_split()
X_train, X_test, y_train, y_test = train_test_split(features, labels, random_state = 42)

array([0, 1, 1, ..., 0, 0, 0], dtype=int64)

In [4]:
#create functions for preprocessing 
def replace_NAN_median(X_df):
    opinions = ['opinion_seas_vacc_effective', 'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'household_adults',
                'household_children']
    for column in opinions:
        X_df[column].replace(np.nan, X_df[column].median(), inplace = True)
    return X_df
        
def replace_NAN_mode(X_df):
    miss_cat_features = ['education', 'income_poverty', 'marital_status', 'rent_or_own', 'employment_status']
    for column in miss_cat_features:
        X_df[column].replace(np.nan, statistics.mode(X_df[column]), inplace = True)
    return X_df

        

In [5]:
# Instantiate transformers
NAN_median = FunctionTransformer(replace_NAN_median)
NAN_mode = FunctionTransformer(replace_NAN_mode)
col_transformer = ColumnTransformer(transformers=
    [("NAN_0", SimpleImputer(missing_values=np.nan, strategy='constant', fill_value = 0), 
    ['behavioral_antiviral_meds', 'behavioral_avoidance','behavioral_face_mask' ,
    'behavioral_wash_hands', 'behavioral_large_gatherings', 'behavioral_outside_home',
    'behavioral_touch_face', 'doctor_recc_seasonal', 'chronic_med_condition', 
    'child_under_6_months', 'health_worker', 'health_insurance']),
    
    ("scaler", MinMaxScaler(), ['opinion_seas_vacc_effective', 'opinion_seas_risk',
                                'opinion_seas_sick_from_vacc', 
                                'household_adults', 'household_children']),
     
    ("ohe", OneHotEncoder(sparse = False), ['age_group','education', 'race', 'sex', 
                                'income_poverty', 'marital_status', 'rent_or_own',
                                'employment_status', 'census_msa'])],
     
    remainder="passthrough")



In [6]:
preprocessing_pipe = Pipeline(steps=[
    ("NAN_median", NAN_median), 
    ("NAN_mode", NAN_mode), 
    ("col_transformer", col_transformer)
    ])

In [7]:
logreg_base_model_pipe = Pipeline(steps=[("preprocessing_pipe", preprocessing_pipe),
                                    ("log_reg", LogisticRegression(solver = 'liblinear', random_state = 42))])
    

In [8]:

logreg_base_model_pipe.fit(X_train, y_train)

Pipeline(steps=[('preprocessing_pipe',
                 Pipeline(steps=[('NAN_median',
                                  FunctionTransformer(func=<function replace_NAN_median at 0x000002B60AEBE4C0>)),
                                 ('NAN_mode',
                                  FunctionTransformer(func=<function replace_NAN_mode at 0x000002B60AEBE3A0>)),
                                 ('col_transformer',
                                  ColumnTransformer(remainder='passthrough',
                                                    transformers=[('NAN_0',
                                                                   SimpleImputer(fill_value=0,
                                                                                 strategy=...
                                                                   ['opinion_seas_vacc_effective',
                                                                    'opinion_seas_risk',
                                                          

In [9]:
logreg_base_model_pipe.score(X_train, y_train)


0.7679480778831752

In [11]:
param_grid = {
    'log_reg__penalty': ['l1', 'l2'], 
    'log_reg__C': [0.001,0.01,0.1,1,10,100,1000]   
}

gs = GridSearchCV(estimator=logreg_base_model_pipe,
                  param_grid=param_grid,
                  cv=5)


In [None]:
#logreg_model_pipe.get_params().keys()


In [12]:
gs.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessing_pipe',
                                        Pipeline(steps=[('NAN_median',
                                                         FunctionTransformer(func=<function replace_NAN_median at 0x000002B60AEBE4C0>)),
                                                        ('NAN_mode',
                                                         FunctionTransformer(func=<function replace_NAN_mode at 0x000002B60AEBE3A0>)),
                                                        ('col_transformer',
                                                         ColumnTransformer(remainder='passthrough',
                                                                           transformers=[('NAN_0',
                                                                                          SimpleIm...
                                                                                           'household_adults',
                   

In [13]:
gs.best_params_

{'log_reg__C': 10, 'log_reg__penalty': 'l1'}

In [14]:
logreg_optimized_pipe =  Pipeline(steps=[("preprocessing_pipe", preprocessing_pipe),
                                    ("log_reg", LogisticRegression(solver = 'liblinear', random_state = 42, C = 10, penalty= 'l1'))])
    

In [15]:
logreg_optimized_pipe.fit(X_train, y_train)

Pipeline(steps=[('preprocessing_pipe',
                 Pipeline(steps=[('NAN_median',
                                  FunctionTransformer(func=<function replace_NAN_median at 0x000002B60AEBE4C0>)),
                                 ('NAN_mode',
                                  FunctionTransformer(func=<function replace_NAN_mode at 0x000002B60AEBE3A0>)),
                                 ('col_transformer',
                                  ColumnTransformer(remainder='passthrough',
                                                    transformers=[('NAN_0',
                                                                   SimpleImputer(fill_value=0,
                                                                                 strategy=...
                                                                   ['opinion_seas_vacc_effective',
                                                                    'opinion_seas_risk',
                                                          

In [16]:
logreg_optimized_pipe.score(X_train, y_train)

0.7741387918122816