In [None]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd
import math
from matplotlib import pyplot as plt
from scipy import stats as stats

# visualization
import seaborn as sns
sns.set_style('darkgrid')
import matplotlib.pyplot as plt
%matplotlib inline

# scaling and train test split
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.preprocessing import MinMaxScaler

# pipeline setup

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Import the evaluation metrics
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

from sklearn.metrics import precision_score, recall_score, plot_confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV,\
cross_val_score, RandomizedSearchCV


# evaluation on test data
from sklearn import metrics
from sklearn.metrics import mean_squared_error,mean_absolute_error,explained_variance_score
from sklearn.metrics import classification_report,confusion_matrix

In [None]:
# Location of the survey CSV.
# NOTE(review): this is a hard-coded absolute path on one machine; consider a
# path relative to the repository so the notebook runs elsewhere.
file_path = "\\Users\\eggfr\\Flatiron\\Flatiron_phase3_project\\data\\H1N1_Flu_Vaccines.csv"

# Read the whole file into the raw, untouched DataFrame.
project3_raw_df = pd.read_csv(filepath_or_buffer=file_path)


# Identifying Features and Target

Once the data is loaded into a pandas dataframe, the next step is identifying which columns represent features and which column represents the target.
The next cell first inspects the column types and non-null counts with `info()`; further below, X is assigned to be the features and y to be the target. Remember that X should not contain the target.

In [None]:
# Summary of the raw frame: column names, non-null counts and dtypes.
project3_raw_df.info()

# Identifying Features and Target

Once the data is loaded into a pandas DataFrame, the next step is identifying which columns represent features and which column represents the target. In this project, we focus on predicting whether a respondent received the H1N1 vaccine, using data collected in the 2009 National H1N1 Flu Survey, which is available at https://www.kaggle.com/datasets/arashnic/flu-data.
In the cells below, we check the class balance and then assign X to be the features and y to be the target, which is project3_raw_df['h1n1_vaccine']. The dataset is imbalanced, though not extremely so: around 78% of the respondents were not vaccinated. For binary variables in the dataset, 0 -> No and 1 -> Yes.

In [None]:
# Class balance of the target: absolute counts first, then relative shares.
target_counts = project3_raw_df["h1n1_vaccine"].value_counts()
target_shares = project3_raw_df["h1n1_vaccine"].value_counts(normalize=True)
print(target_counts, "", "Percentages", target_shares, sep="\n")

In [None]:
# Features are every column except the target; the target is `h1n1_vaccine`.
# NOTE(review): if the CSV also carries other label columns (e.g. a seasonal
# vaccine label), they would remain in X as features — confirm.
X = project3_raw_df.drop(columns='h1n1_vaccine')
y = project3_raw_df.h1n1_vaccine

Separating data into training and testing sets is an important part of evaluating the models. Most of the data is used for training, and a smaller portion is used for testing. For this analysis, we only split the data into a training set and a test set: 75% of the data is used for training and 25% for testing. The split is done before any EDA to prevent data leakage. There are 20030 rows in the training set and 6677 rows in the test set before any data cleaning or analysis is done.

In [None]:
# Hold out 25% of the rows for testing (75% for training); the fixed
# random_state keeps the random split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)
# Shapes of the four resulting pieces.
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

In [None]:
# Missing values per training column, most incomplete columns first.
x_train.isna().sum().sort_values(ascending=False)

In [None]:
#x_train['doctor_recc_h1n1'].value_counts()

In [None]:
# Column names, non-null counts and dtypes of the training features.
x_train.info()

You are provided a dataset with 36 columns. The first column respondent_id is a unique and random identifier. The remaining 35 features are described below.

For all binary variables: 0 = No; 1 = Yes.

h1n1_concern - Level of concern about the H1N1 flu.
0 = Not at all concerned; 1 = Not very concerned; 2 = Somewhat concerned; 3 = Very concerned.
h1n1_knowledge - Level of knowledge about H1N1 flu.
0 = No knowledge; 1 = A little knowledge; 2 = A lot of knowledge.
behavioral_antiviral_meds - Has taken antiviral medications. (binary)
behavioral_avoidance - Has avoided close contact with others with flu-like symptoms. (binary)
behavioral_face_mask - Has bought a face mask. (binary)
behavioral_wash_hands - Has frequently washed hands or used hand sanitizer. (binary)
behavioral_large_gatherings - Has reduced time at large gatherings. (binary)
behavioral_outside_home - Has reduced contact with people outside of own household. (binary)
behavioral_touch_face - Has avoided touching eyes, nose, or mouth. (binary)
doctor_recc_h1n1 - H1N1 flu vaccine was recommended by doctor. (binary)
doctor_recc_seasonal - Seasonal flu vaccine was recommended by doctor. (binary)
chronic_med_condition - Has any of the following chronic medical conditions: asthma or an other lung condition, diabetes, a heart condition, a kidney condition, sickle cell anemia or other anemia, a neurological or neuromuscular condition, a liver condition, or a weakened immune system caused by a chronic illness or by medicines taken for a chronic illness. (binary)
child_under_6_months - Has regular close contact with a child under the age of six months. (binary)
health_worker - Is a healthcare worker. (binary)
health_insurance - Has health insurance. (binary)
opinion_h1n1_vacc_effective - Respondent's opinion about H1N1 vaccine effectiveness.
1 = Not at all effective; 2 = Not very effective; 3 = Don't know; 4 = Somewhat effective; 5 = Very effective.
opinion_h1n1_risk - Respondent's opinion about risk of getting sick with H1N1 flu without vaccine.
1 = Very Low; 2 = Somewhat low; 3 = Don't know; 4 = Somewhat high; 5 = Very high.
opinion_h1n1_sick_from_vacc - Respondent's worry of getting sick from taking H1N1 vaccine.
1 = Not at all worried; 2 = Not very worried; 3 = Don't know; 4 = Somewhat worried; 5 = Very worried.
opinion_seas_vacc_effective - Respondent's opinion about seasonal flu vaccine effectiveness.
1 = Not at all effective; 2 = Not very effective; 3 = Don't know; 4 = Somewhat effective; 5 = Very effective.
opinion_seas_risk - Respondent's opinion about risk of getting sick with seasonal flu without vaccine.
1 = Very Low; 2 = Somewhat low; 3 = Don't know; 4 = Somewhat high; 5 = Very high.
opinion_seas_sick_from_vacc - Respondent's worry of getting sick from taking seasonal flu vaccine.
1 = Not at all worried; 2 = Not very worried; 3 = Don't know; 4 = Somewhat worried; 5 = Very worried.
age_group - Age group of respondent.
education - Self-reported education level.
race - Race of respondent.
sex - Sex of respondent.
income_poverty - Household annual income of respondent with respect to 2008 Census poverty thresholds.
marital_status - Marital status of respondent.
rent_or_own - Housing situation of respondent.
employment_status - Employment status of respondent.
hhs_geo_region - Respondent's residence using a 10-region geographic classification defined by the U.S. Dept. of Health and Human Services. Values are represented as short random character strings.
census_msa - Respondent's residence within metropolitan statistical areas (MSA) as defined by the U.S. Census.
household_adults - Number of other adults in household, top-coded to 3.
household_children - Number of children in household, top-coded to 3.
employment_industry - Type of industry respondent is employed in. Values are represented as short random character strings.
employment_occupation - Type of occupation of respondent. Values are represented as short random character strings.

There are no duplicated rows in either the train set or the test set.

In [None]:
# Number of training rows that exactly repeat an earlier row.
x_train.duplicated().sum()

In [None]:
# Number of test rows that exactly repeat an earlier row.
x_test.duplicated().sum()

In [None]:
# Share of each marital status among the non-missing training responses.
x_train.marital_status.value_counts(normalize=True)

In [None]:
# Frequency of each income bracket in the training set.
x_train.income_poverty.value_counts()

In [None]:
# Frequency of each education level in the training set.
x_train.education.value_counts()

In [None]:
# Frequency of each housing situation in the training set.
x_train.rent_or_own.value_counts()

In [None]:
# Frequency of each employment status in the training set.
x_train.employment_status.value_counts()

In [None]:
# Education level frequencies, displayed a second time.
x_train.education.value_counts()

In [None]:
# Frequency of each household-children count in the training set.
x_train.household_children.value_counts()

In [None]:
# Frequency of each (anonymised) HHS region code in the training set.
x_train.hhs_geo_region.value_counts()

In [None]:
# Frequency of each MSA category in the training set.
x_train.census_msa.value_counts()

In [None]:
# Share of each industry code among the non-missing training responses.
x_train.employment_industry.value_counts(normalize=True)

In [None]:
# Frequency of each health-insurance answer in the training set.
x_train.health_insurance.value_counts()

In [None]:
# Frequency of each H1N1 concern level in the training set.
x_train.h1n1_concern.value_counts()

In [None]:
# Print each categorical column's name followed by its distinct values
# (NaN included) so the category spellings can be checked.
categorical_cols = [
    "age_group",
    "education",
    "race",
    "income_poverty",
    "marital_status",
    "rent_or_own",
    "employment_status",
    "hhs_geo_region",
    "census_msa",
    "employment_industry",
    "employment_occupation",
]
for col_name in categorical_cols:
    print(col_name)
    print(x_train[col_name].unique())

In [None]:
# (rows, columns) of the training features before dropping unused columns.
x_train.shape

Dropping unused columns.
respondent_id is dropped since it is not going to be used in the analysis.
'opinion_seas_vacc_effective', 'opinion_seas_risk' and 'opinion_seas_sick_from_vacc' are also dropped since the analysis is focused on H1N1 vaccine prediction.
'employment_occupation' and 'employment_industry' are dropped in the same cell as well.

In [None]:
# Columns that are not used by the model: the respondent identifier, the two
# employment detail columns and the seasonal-flu opinion columns.
unused_cols = [
    'respondent_id',
    'employment_occupation',
    'employment_industry',
    'opinion_seas_vacc_effective',
    'opinion_seas_risk',
    'opinion_seas_sick_from_vacc',
]

# This cell rebinds x_train, so a plain drop would raise KeyError when the
# cell is run a second time. errors='ignore' skips columns that are already
# gone, which makes the cell safe to re-run.
x_train = x_train.drop(columns=unused_cols, errors='ignore')
x_train.shape

# Pipeline

Now we need to set up a pipeline for our data with the imputing strategy from the discussion above.
We will set up a numeric pipeline for the numerical variables.

In [None]:
# Numeric columns: fill missing values with the column mean, then
# standardize to zero mean and unit variance.
numeric_pipeline = Pipeline(
    steps=[
        ('numimputer', SimpleImputer(strategy='mean')),
        ('numnorm', StandardScaler()),
    ]
)

We set up different ordinal pipelines for the different ordinal categorical variables.

In [None]:
# Category orderings handed to OrdinalEncoder: position in the list becomes
# the encoded integer. The strings must match the raw data exactly
# (including the double space in 'MSA, Not Principle  City').
age_list = [
    '18 - 34 Years',
    '35 - 44 Years',
    '45 - 54 Years',
    '55 - 64 Years',
    '65+ Years',
]
income_list = [
    'Below Poverty',
    '<= $75,000, Above Poverty',
    '> $75,000',
]
emp_stat_list = [
    'Not in Labor Force',
    'Unemployed',
    'Employed',
]
edu_list = [
    '< 12 Years',
    '12 Years',
    'Some College',
    'College Graduate',
]
census_list = [
    'Non-MSA',
    'MSA, Not Principle  City',
    'MSA, Principle City',
]
# Region codes are anonymised strings, so this ordering carries no natural rank.
hhs_list = [
    'oxchjgsf',
    'lzgpxyit',
    'kbazzjca',
    'mlyzmhmf',
    'bhuqouqj',
    'lrircsnp',
    'atmpeygn',
    'fpwskwrf',
    'dqpwygqj',
    'qufhixun',
]

In [None]:
# Age group: fill gaps with the most frequent bracket, encode brackets by
# their position in age_list, then standardize the resulting codes.
ordinal_age_pipeline = Pipeline(
    steps=[
        ('ordimputer', SimpleImputer(strategy='most_frequent')),
        ('ordenc', OrdinalEncoder(categories=[age_list])),
        ('ordnorm', StandardScaler()),
    ]
)

In [None]:
# Income bracket: fill gaps with the most frequent bracket, encode by
# position in income_list, then standardize the resulting codes.
ordinal_income_pipeline = Pipeline(
    steps=[
        ('ordimputer', SimpleImputer(strategy='most_frequent')),
        ('ordenc', OrdinalEncoder(categories=[income_list])),
        ('ordnorm', StandardScaler()),
    ]
)

In [None]:
# Employment status: fill gaps with the most frequent status, encode by
# position in emp_stat_list, then standardize the resulting codes.
ordinal_emp_status_pipeline = Pipeline(
    steps=[
        ('ordimputer', SimpleImputer(strategy='most_frequent')),
        ('ordenc', OrdinalEncoder(categories=[emp_stat_list])),
        ('ordnorm', StandardScaler()),
    ]
)

In [None]:
# Education: fill gaps with the most frequent level, encode by position in
# edu_list, then standardize the resulting codes.
ordinal_edu_pipeline = Pipeline(
    steps=[
        ('ordimputer', SimpleImputer(strategy='most_frequent')),
        ('ordenc', OrdinalEncoder(categories=[edu_list])),
        ('ordnorm', StandardScaler()),
    ]
)

In [None]:
# Census MSA category: fill gaps with the most frequent category, encode by
# position in census_list, then standardize the resulting codes.
ordinal_census_pipeline = Pipeline(
    steps=[
        ('ordimputer', SimpleImputer(strategy='most_frequent')),
        ('ordenc', OrdinalEncoder(categories=[census_list])),
        ('ordnorm', StandardScaler()),
    ]
)

In [None]:
# HHS region: fill gaps with the most frequent region, encode by position in
# hhs_list, then standardize the resulting codes.
# NOTE(review): the region codes are anonymised strings, so the ordinal rank
# imposed here looks arbitrary — confirm whether one-hot encoding was intended.
ordinal_hhs_pipeline = Pipeline(
    steps=[
        ('ordimputer', SimpleImputer(strategy='most_frequent')),
        ('ordenc', OrdinalEncoder(categories=[hhs_list])),
        ('ordnorm', StandardScaler()),
    ]
)

Lastly, we set up a nominal pipeline using OneHotEncoder for the nominal categorical variables.

In [None]:
# Nominal columns: fill gaps with the most frequent category, one-hot encode
# into a dense array (dropping the first level of each column), then scale
# by the maximum absolute value.
nominal_pipeline = Pipeline(
    steps=[
        ('onehotimputer', SimpleImputer(strategy='most_frequent')),
        ('onehotenc', OneHotEncoder(sparse=False, drop='first')),
        ('onehotnorm', MaxAbsScaler()),
    ]
)

In [None]:
# KNN-imputing variant of the nominal pipeline.
# It is bound to its own name so that it no longer overwrites
# `nominal_pipeline`: KNNImputer only accepts numeric input, so sending
# non-numeric nominal columns through it makes the column transformer fail
# when it is fitted.
# NOTE(review): only route numerically coded columns through this pipeline,
# and check that the KNN-imputed values are still valid category codes before
# they reach the one-hot encoder — confirm.
nominal_knn_pipeline = Pipeline([
    ('onehotimputer', KNNImputer(n_neighbors=5)),
    ('onehotenc', OneHotEncoder(sparse = False, drop = 'first')),
    ('onehotnorm', MaxAbsScaler())])


Now, we combine the different pipelines with the column transformer so we can specify which columns each pipeline acts on.

In [None]:

 
# Columns that are one-hot encoded by the nominal pipeline.
nom_resp_cols = ['behavioral_antiviral_meds','behavioral_avoidance','behavioral_face_mask','behavioral_wash_hands','behavioral_large_gatherings','behavioral_outside_home','behavioral_touch_face','doctor_recc_h1n1','chronic_med_condition','child_under_6_months','health_worker','race','sex','marital_status','rent_or_own']
insurance_cols = ['health_insurance']

# Numeric columns for the scaling pipeline. Columns already routed to a
# nominal pipeline are excluded; otherwise every numerically coded column in
# those lists would be emitted twice in the transformed matrix (once one-hot
# encoded and once standardized). sort=False keeps the original column order.
num_cols = x_train.select_dtypes(['int', 'float']).columns.difference(
    nom_resp_cols + insurance_cols, sort=False
)

# Each entry is (name, pipeline, columns). Columns not listed in any entry are
# dropped, which is ColumnTransformer's default `remainder` behaviour.
ct = ColumnTransformer(
    [("ordinalpipe", ordinal_age_pipeline, ['age_group']),
     ("ordinalpipe2", ordinal_income_pipeline, ['income_poverty']),
     ("ordinalpipe3", ordinal_emp_status_pipeline, ['employment_status']),
     ("ordinalpipe4", ordinal_edu_pipeline, ['education']),
     ("ordinalpipe5", ordinal_census_pipeline, ['census_msa']),
     ("ordinalpipe6", ordinal_hhs_pipeline, ['hhs_geo_region']),
     ("nominalpipe", nominal_pipeline, nom_resp_cols),
     ("nominalpipe2", nominal_pipeline, insurance_cols),
     ("numpipe", numeric_pipeline, num_cols)])

In [None]:
# Fit the column transformer on the training features and wrap the
# transformed array in a DataFrame so it can be inspected.
x_train_clean = pd.DataFrame(data=ct.fit_transform(x_train))
# (rows, columns) after preprocessing.
x_train_clean.shape

In [None]:
# Summary statistics of every transformed column.
x_train_clean.describe()

In [None]:
# Display the column transformer.
ct

In [None]:
# Fitted transformers, keyed by the names given in the ColumnTransformer.
ct.named_transformers_

In [None]:
# Fitted pipeline registered as 'ordinalpipe2' (applied to income_poverty).
ct.named_transformers_['ordinalpipe2']

In [None]:
# First model: the column-transformer preprocessing followed by a logistic
# regression with scikit-learn's default settings.
model1_pipe = Pipeline(
    steps=[
        ('preprocess', ct),
        ('model', LogisticRegression()),
    ]
)
model1_pipe


In [None]:
# (rows, columns) of the training features passed to the model pipeline.
x_train.shape

In [None]:
# Fit preprocessing and classifier together on the training split.
model1_pipe.fit(X=x_train, y=y_train)

In [None]:
# Predicted labels for the held-out test features.
y_pred = model1_pipe.predict(X=x_test)

In [None]:
# Confusion matrix of the fitted model pipeline on the held-out test set.
# The estimator must be `model1_pipe`, the pipeline fitted above; the
# previously referenced `baseline_pipe` is never defined in this notebook and
# raised a NameError.
plot_confusion_matrix(model1_pipe, x_test, y_test)

In [None]:
# Mean accuracy on the training split, then on the test split.
for features, labels in ((x_train, y_train), (x_test, y_test)):
    print(model1_pipe.score(features, labels))

In [None]:
#def print_metrics(labels, preds):
   # print("Precision Score: {}".format(precision_score(labels, preds)))
    #   print("Recall Score: {}".format(recall_score(labels, preds)))
   # print("Accuracy Score: {}".format(accuracy_score(labels, preds)))
   # print("F1 Score: {}".format(f1_score(labels, preds)))

In [None]:
#print_metrics(y_test, y_pred)

# Build and Evaluate a Baseline Model

Using scikit-learn's LogisticRegression model, instantiate a classifier with random_state=42. Then use cross_val_score with scoring="neg_log_loss" to find the average cross-validated log loss for this model on x_train and y_train.

In [None]:

# Import relevant class and function
#from sklearn.linear_model import LogisticRegression
#from sklearn.model_selection import cross_val_score

# Instantiate a LogisticRegression with random_state=42
#baseline_model = LogisticRegression(random_state=42)

# Use cross_val_score with scoring="neg_log_loss" to evaluate the model
# on X_train and y_train
#baseline_neg_log_loss_cv = cross_val_score(baseline_model, X_train, y_train, scoring="neg_log_loss")

#baseline_log_loss = -(baseline_neg_log_loss_cv.mean())
#baseline_log_loss

In [None]:
#for feature in numerical_nan:
    ## We will replace by using median since there are outliers
 #   median_value=df[feature].median()
    
  #  df[feature].fillna(median_value,inplace=True)
    
#df[numerical_nan].isnull().sum()

In [None]:
# Instantiate the model
#logreg = LogisticRegression(fit_intercept=False, C=1e12, solver='liblinear')

# Fit the model
#logreg.fit(X_train, y_train)
#LogisticRegression(C=1000000000000.0, class_weight=None, dual=False,
                   #fit_intercept=False, intercept_scaling=1, l1_ratio=None,
                   #max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   #random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   #warm_start=False)