In [1]:
# imports
import numpy as np
import pandas as pd
import statistics
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer


In [2]:
#things that will be in the main Jupyter Notebook (load data, pre-pre-processing)
features = pd.read_csv("Data/training_set_features.csv")
labels = pd.read_csv("Data/training_set_labels.csv")

features.drop(['h1n1_concern', 'h1n1_knowledge', 'doctor_recc_h1n1',
               'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk', 
               'opinion_h1n1_sick_from_vacc', 'hhs_geo_region', 
               'employment_industry', 'employment_occupation'], axis = 1, inplace = True)

labels.drop(['h1n1_vaccine', 'respondent_id'], axis = 1, inplace= True)
labels = np.ravel(labels, order = 'C')

In [3]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, random_state = 42)
X_train

Unnamed: 0,respondent_id,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_seasonal,chronic_med_condition,...,education,race,sex,income_poverty,marital_status,rent_or_own,employment_status,census_msa,household_adults,household_children
25194,25194,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,...,12 Years,White,Female,,Not Married,Own,Not in Labor Force,Non-MSA,1.0,1.0
14006,14006,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,...,Some College,White,Female,,Married,,Employed,"MSA, Not Principle City",2.0,1.0
11285,11285,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,College Graduate,White,Female,"<= $75,000, Above Poverty",Not Married,Own,Employed,"MSA, Principle City",0.0,1.0
2900,2900,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,College Graduate,White,Male,Below Poverty,Not Married,Own,Employed,"MSA, Not Principle City",0.0,0.0
19083,19083,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,,...,,White,Female,,,,,"MSA, Not Principle City",,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21575,21575,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,Some College,White,Male,"> $75,000",Not Married,Own,Not in Labor Force,"MSA, Principle City",0.0,0.0
5390,5390,0.0,0.0,0.0,1.0,0.0,0.0,1.0,,0.0,...,Some College,White,Male,"<= $75,000, Above Poverty",Not Married,Own,Unemployed,"MSA, Principle City",0.0,0.0
860,860,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,12 Years,White,Female,"<= $75,000, Above Poverty",Married,Own,Employed,Non-MSA,1.0,0.0
15795,15795,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,...,College Graduate,Black,Female,"> $75,000",Married,Own,Employed,"MSA, Principle City",1.0,0.0


In [4]:
#create functions for preprocessing

#def features_drop(X_df):
#    X_df.drop(["h1n1_concern", "h1n1_knowledge", "doctor_recc_h1n1", "opinion_h1n1_vacc_effective", "opinion_h1n1_risk", "opinion_h1n1_sick_from_vacc", "hhs_geo_region", "employment_industry", "employment_occupation"], 
#              axis = 1, inplace = True)
#    return X_df

def replace_NAN_median(X_df):
    opinions = ['opinion_seas_vacc_effective', 'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'household_adults',
                'household_children']
    for column in opinions:
        X_df[column].replace(np.nan, X_df[column].median(), inplace = True)
    return X_df
        
def replace_NAN_mode(X_df):
    miss_cat_features = ['education', 'income_poverty', 'marital_status', 'rent_or_own', 'employment_status']
    for column in miss_cat_features:
        X_df[column].replace(np.nan, statistics.mode(X_df[column]), inplace = True)
    return X_df

        
    


In [9]:
# Instantiate transformers
#feat_drop = FunctionTransformer(features_drop)
NAN_median = FunctionTransformer(replace_NAN_median)
NAN_mode = FunctionTransformer(replace_NAN_mode)
col_transformer = ColumnTransformer(transformers=
    [("NAN_0", SimpleImputer(missing_values=np.nan, strategy='constant', fill_value = 0), 
    ['behavioral_antiviral_meds', 'behavioral_avoidance','behavioral_face_mask' ,
    'behavioral_wash_hands', 'behavioral_large_gatherings', 'behavioral_outside_home',
    'behavioral_touch_face', 'doctor_recc_seasonal', 'chronic_med_condition', 
    'child_under_6_months', 'health_worker', 'health_insurance']),
    
    ("scaler", MinMaxScaler(), ['opinion_seas_vacc_effective', 'opinion_seas_risk',
                                'opinion_seas_sick_from_vacc', 
                                'household_adults', 'household_children']),
     
    ("ohe", OneHotEncoder(), ['age_group','education', 'race', 'sex', 
                                'income_poverty', 'marital_status', 'rent_or_own',
                                'employment_status', 'census_msa'])],
     
    remainder="passthrough")



In [10]:
preprocessing_pipe = Pipeline(steps=[
    #("feat_drop", feat_drop),
    ("NAN_median", NAN_median), 
    ("NAN_mode", NAN_mode), 
    ("col_transformer", col_transformer),
    ("model", LogisticRegression(solver = 'liblinear'))
    ], verbose = True)

In [11]:
base_log_model = preprocessing_pipe.fit(X_train, y_train)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(


[Pipeline] ........ (step 1 of 4) Processing NAN_median, total=   0.0s
[Pipeline] .......... (step 2 of 4) Processing NAN_mode, total=   0.0s
[Pipeline] ... (step 3 of 4) Processing col_transformer, total=   0.1s
[Pipeline] ............. (step 4 of 4) Processing model, total=   0.2s


In [12]:
base_log_model.score(X_test, y_test)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(


0.7763965852927962

In [15]:
base_log_model.predict(X)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(


array([0, 0, 1, ..., 1, 0, 1], dtype=int64)