* [1 Initialization](#1)
    + [1.1 Data Description](#1.1)
    + [1.2 Data Loading](#1.2)
* [2 Feature Engineering And Data Exploration](#2)
    + [2.1 Check for null and missing values](#2.1)
    + [2.2 Deal with missing values](#2.2)
    + [2.3 Data Exploration](#2.3)
        - [2.3.1 Respondent's Knowledge And Opinion About H1N1](#2.3.1)
        - [2.3.2 Respondent's Opinion About Seasonal Flu](#2.3.2)
        - [2.3.3 Behavior Difference Between Respondents Received H1N1 Flu Vaccine Or Not](#2.3.3)
        - [2.3.4 Behavior Difference Between Respondents Received Seasonal Flu Vaccine Or Not](#2.3.4)
        - [2.3.5 Background Difference Between Respondents Received H1N1 Vaccine Or Not](#2.3.5)
        - [2.3.6 Background Difference Between Respondents Received Seasonal Flu Vaccine Or Not](#2.3.6)
    + [2.4 Category Feature Transformation](#2.4)
    + [2.5 Skewed Data Process](#2.5)
    + [2.6 Feature Importance](#2.6)
* [3 Dataset Split](#3)
* [4 Modeling and Evaluation-H1N1](#4)
    + [4.1 Hypothesis](#4.1)
    + [4.2 Gaussian Naive Bayes](#4.2)
    + [4.3 Logistic Regression](#4.3)
    + [4.4 ROC](#4.4)
    + [4.5 Significance Testing](#4.5)

This notebook predicts whether people received the H1N1 vaccine, using information they shared about their backgrounds, opinions, and health behaviors.   
Detailed information about the dataset is available at: https://www.drivendata.org/competitions/66/flu-shot-learning/page/211/.

In [None]:
!pip install pyecharts

In [None]:
from pyecharts import options as opts
from pyecharts.globals import SymbolType
from pyecharts.charts import Liquid
# Liquid-fill gauge showing the hard-coded 0.21 value under the H1N1 title.
h1n1_title = opts.TitleOpts(
    title="Proportion Of People Received H1N1 Vaccine In 2009 Flu Survey",
    pos_left="20%",
)
h1n1_gauge = Liquid()
h1n1_gauge.add("lq", [0.21])
h1n1_gauge.set_global_opts(title_opts=h1n1_title)
# Keep the rendered object as the last expression so the notebook displays it.
liquid_h1n1 = h1n1_gauge.render_notebook()
liquid_h1n1

<a id='1'></a>
# 1.Initialization

In [None]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go
from plotly.subplots import make_subplots
%matplotlib inline

<a id='1.1'></a>
## 1.1 Data Description

In [None]:
# Load the feature dictionary; the next cell renders its columns as a table.
description= pd.read_csv("../input/flu-description/description.csv")

In [None]:
# Render the feature dictionary as a three-column plotly table.
table_header = dict(
    values=list(description.columns),
    line_color='darkslategray',
    fill_color='royalblue',
    align=['center'],
    font=dict(color='white', family="Franklin Gothic", size=15),
    height=25,
)
table_cells = dict(
    values=[
        description["Feature name"],
        description["Type"],
        description["Description and values"],
    ],
    fill_color='white',
    line_color='darkslategray',
    align=['left'],
    font=dict(size=12),
    height=30,
)
description_table = go.Table(
    columnorder=[1, 2, 3],
    columnwidth=[60, 20, 400],  # relative widths; the description column is the widest
    header=table_header,
    cells=table_cells,
)
fig = go.Figure(data=[description_table])

fig.show()

<a id='1.2'></a>
## 1.2 Data Loading

In [None]:
# Load the survey features (X) and the label table (Y) from the competition CSVs.
X = pd.read_csv("../input/flu-shot-prediction/training_set_features.csv")
Y = pd.read_csv("../input/flu-shot-prediction/training_set_labels.csv")
# Preview the first feature rows.
X.head()

In [None]:
# Preview the first label rows.
Y.head()

In [None]:
# (rows, columns) of the feature table and of the label table.
print(X.shape,Y.shape)

<a id='2'></a>
# 2 Feature Engineering And Data Exploration

<a id='2.1'></a>
## 2.1 Check for null and missing values

In [None]:
def missing_value(all_data):
    """Count missing values per column, keeping only columns that have any.

    Parameters
    ----------
    all_data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column with at least one null, sorted ascending by the
        number of nulls. The single column is labelled ``'Missing Ratio'``
        even though it holds absolute counts.
    """
    null_counts = all_data.isnull().sum()
    # Labels of the columns that are fully populated.
    complete_columns = null_counts.loc[null_counts == 0].index
    with_gaps = null_counts.drop(complete_columns)
    ordered = with_gaps.sort_values(ascending=True)
    return pd.DataFrame({'Missing Ratio': ordered})

def missing_ratio(all_data):
    """Return the percentage of missing values per column, for columns that have any.

    Parameters
    ----------
    all_data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column with at least one null, sorted ascending, with a
        single ``'Missing Ratio'`` column holding the percentage (0-100).
    """
    row_total = len(all_data)
    null_percent = (all_data.isnull().sum() / row_total) * 100
    # Labels of the columns that are fully populated.
    complete_columns = null_percent.loc[null_percent == 0].index
    with_gaps = null_percent.drop(complete_columns)
    ordered = with_gaps.sort_values(ascending=True)
    return pd.DataFrame({'Missing Ratio': ordered})

In [None]:
# NOTE(review): these assignments rebind the names `missing_value` and
# `missing_ratio` from the helper functions to their DataFrame results, so this
# cell cannot be re-run without first re-running the cell that defines them.
missing_value=missing_value(X)
missing_ratio=missing_ratio(X)
# (number of features with missing values, 1)
missing_value.shape

We can see that 30 features contain missing values.

In [None]:
# Two side-by-side panels: missing-value counts (left) and missing ratio in percent (right).
fig = make_subplots(rows=1, cols=2, specs=[[{}, {}]], shared_xaxes=True,
                    shared_yaxes=False, vertical_spacing=0.001)

# Left panel: horizontal bars, one per feature that has missing values.
fig.append_trace(go.Bar(
            x=missing_value.values.flatten(),
            y=missing_value.index,
            orientation='h',
            name='Missing Values In Flu Dataset',
), 1, 1)

# Right panel: the same features plotted by their missing percentage.
fig.append_trace(go.Scatter(
    x=missing_ratio.values.flatten(),
    y=missing_ratio.index,
    mode='lines+markers',
    line_color='rgb(128, 0, 128)',
    name='Missing Ratio In Flu Dataset',
), 1, 2)

fig.update_layout(
    title='Missing Features In Flu Dataset',
    title_x=0.5,
    # Left y axis carries the feature names.
    yaxis=dict(
        showgrid=False,
        showline=False,
        showticklabels=True,
        domain=[0, 0.85],
    ),
    # Right y axis hides its tick labels; only the axis line is drawn.
    yaxis2=dict(
        showgrid=False,
        showline=True,
        showticklabels=False,
        linecolor='rgba(102, 102, 102, 0.8)',
        linewidth=2,
        domain=[0, 0.85],
    ),
    # The two x domains split the figure width, leaving a gap between 0.42 and 0.47.
    xaxis=dict(
        zeroline=False,
        showline=False,
        showticklabels=True,
        showgrid=True,
        domain=[0, 0.42],
    ),
    xaxis2=dict(
        zeroline=False,
        showline=True,
        showticklabels=True,
        showgrid=True,
        domain=[0.47, 1],
        side='bottom',

    ),
    legend=dict(x=0.5, y=1.01, font_size=12),
    margin=dict(l=100, r=20, t=70, b=70),
    paper_bgcolor='rgb(248, 248, 255)',
    plot_bgcolor='rgb(248, 248, 255)',
    width=1000,
    height=700,
    font=dict(family="Franklin Gothic", size=12)
)

# Update xaxis properties
fig.update_xaxes(title_text="number", row=1, col=1)
fig.update_xaxes(title_text="ratio(%)",  row=1, col=2)

fig.show()

<a id='2.2'></a>
## 2.2 Deal with missing values

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
def deal_missing(all_data):
    """Impute every missing value in the survey features and return a new frame.

    Steps:
      1. Missing ``employment_status`` becomes ``'Unknown'``; respondents whose
         status is not ``"Employed"`` get the literal string ``"None"`` for
         ``employment_industry`` and ``employment_occupation``.
      2. Numeric columns are filled with the column median.
      3. Non-numeric columns are filled with the most frequent value.

    Parameters
    ----------
    all_data : pandas.DataFrame
        Must contain the three employment columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Imputed numeric columns first, then the imputed non-numeric columns,
        on the same row index as ``all_data``.
    """
    # Work on a copy so the caller's frame is not silently edited.
    all_data = all_data.copy()

    # manually process
    all_data["employment_status"] = all_data["employment_status"].fillna('Unknown')
    not_employed = all_data['employment_status'] != "Employed"
    all_data.loc[not_employed, ["employment_industry", "employment_occupation"]] = "None"

    # Split with the public select_dtypes API using complementary filters, so
    # every column lands in exactly one of the two groups.
    numeric_part = all_data.select_dtypes(include=[np.number])
    categorical_part = all_data.select_dtypes(exclude=[np.number])

    # use median to fill in numeric data
    sim_impute_num = SimpleImputer(strategy="median")
    imputed_num = pd.DataFrame(
        sim_impute_num.fit_transform(numeric_part),
        columns=numeric_part.columns,
        index=all_data.index,
    )

    # use mode to fill in categorical data
    sim_impute_cat = SimpleImputer(strategy="most_frequent")
    imputed_cat = pd.DataFrame(
        sim_impute_cat.fit_transform(categorical_part),
        columns=categorical_part.columns,
        index=all_data.index,
    )

    # merge
    all_data_merge = pd.concat([imputed_num, imputed_cat], axis=1)

    return all_data_merge

In [None]:
# Replace X with its imputed version; deal_missing returns the numeric columns first.
X=deal_missing(X)

<a id='2.3'></a>
## 2.3 Data Exploration

In [None]:
# Features and labels side by side, plus a constant "count" column (1 per respondent).
Data_visual = pd.concat([X, Y], axis=1).assign(count=1)

In [None]:
# Split the respondents by each vaccine label (1 = received, 0 = not received).
Data_h1n1_1 = Data_visual.loc[Data_visual["h1n1_vaccine"].eq(1)]
Data_h1n1_0 = Data_visual.loc[Data_visual["h1n1_vaccine"].eq(0)]
Data_seas_1 = Data_visual.loc[Data_visual["seasonal_vaccine"].eq(1)]
Data_seas_0 = Data_visual.loc[Data_visual["seasonal_vaccine"].eq(0)]

In [None]:
# Respondents per label value for each vaccine.
# The counts are renamed explicitly because pandas >= 2.0 names the result of
# value_counts() "count" instead of the source column, which would break
# lookups such as vacc_count['h1n1_vaccine'].
# sort_index() fixes the row order to 0, 1 regardless of which class is larger.
df1=Data_visual["h1n1_vaccine"].value_counts().rename("h1n1_vaccine").sort_index().to_frame()
df2=Data_visual["seasonal_vaccine"].value_counts().rename("seasonal_vaccine").sort_index().to_frame()

vacc_count=pd.concat([df1,df2], axis=1)
vacc_count

In [None]:
# One bar chart per vaccine: number of respondents who did not / did receive it.
fig, axis = plt.subplots(1,2,figsize=(15,3))

panel_config = (
    ('h1n1_vaccine', 'H1N1'),
    ('seasonal_vaccine', 'Seasonal Flu'),
)
for panel, (column, heading) in zip(axis, panel_config):
    panel.bar(["0","1"], vacc_count[column], color=["blue","green"])
    panel.set(xlabel='Received Vaccine', ylabel='Number of People', alpha=0.6)
    panel.set_title(heading)

plt.show()

Compared with the H1N1 vaccine (which only about 1/5 of people received), around half of the respondents received the seasonal flu vaccine.

<a id='2.3.1'></a>
### 2.3.1 Respondent's Knowledge And Opinion About H1N1

In [None]:
# Overlaid histograms of the H1N1 knowledge/opinion answers, split by H1N1 vaccination.
fig, ax = plt.subplots(figsize=(16,8))
fig.subplots_adjust(hspace=0.3, wspace=0.3)
numlist=range(1,6)
featurelist=['h1n1_concern','h1n1_knowledge','opinion_h1n1_vacc_effective','opinion_h1n1_risk','opinion_h1n1_sick_from_vacc']

for position, feature in enumerate(featurelist, start=1):
    plt.subplot(2, 3, position)
    # Non-vaccinated respondents are drawn first, vaccinated on top; the
    # legend labels below rely on this order.
    for subset in (Data_h1n1_0, Data_h1n1_1):
        plt.hist(data=subset, x=feature, histtype='bar', fill=True, alpha=0.8)
    plt.xlabel(feature, fontsize=13)
    plt.ylabel('value', fontsize=13)

labels=["0","1"]
fig.legend(labels, loc='center right',title="H1N1 Vaccine")
fig.suptitle("Respondent's Knowledge And Opinion About H1N1", fontsize=16,y=0.93)
plt.show()

Compared with those who did not receive the H1N1 vaccine, people who received it consider the H1N1 risk more serious and place more trust in the effectiveness of the vaccine.

<a id='2.3.2'></a>
### 2.3.2 Respondent's Opinion About Seasonal Flu

In [None]:
# Overlaid histograms of the seasonal-flu opinion answers, split by seasonal vaccination.
fig, ax = plt.subplots(figsize=(17,4))
fig.subplots_adjust(hspace=0.3, wspace=0.3)
numlist=range(1,4)
featurelist=['opinion_seas_vacc_effective','opinion_seas_risk','opinion_seas_sick_from_vacc']

for position, feature in enumerate(featurelist, start=1):
    plt.subplot(1, 3, position)
    # Non-vaccinated respondents are drawn first, vaccinated on top; the
    # legend labels below rely on this order.
    for subset in (Data_seas_0, Data_seas_1):
        plt.hist(data=subset, x=feature, histtype='bar', fill=True, alpha=0.8)
    plt.xlabel(feature, fontsize=13)
    plt.ylabel('value', fontsize=13)

labels=["0","1"]
fig.legend(labels, loc='center right',title="Seasonal Flu Vaccine")
fig.suptitle("Respondent's Opinion About Seasonal Flu", fontsize=16,y=0.95)
plt.show()

Similar to H1N1, people who trust the vaccine's effectiveness and worry about the risk of seasonal flu are more likely to receive the seasonal flu vaccine.

<a id='2.3.3'></a>
### 2.3.3 Behavior Difference Between Respondents Received H1N1 Flu Vaccine Or Not

In [None]:
def to_matrix(l, n):
    """Split sequence ``l`` into consecutive rows of ``n`` items.

    The last row is shorter when ``len(l)`` is not a multiple of ``n``.
    """
    rows = []
    for start in range(0, len(l), n):
        rows.append(l[start:start + n])
    return rows

We can see from the following pie charts that people who have chronic medical conditions and are recommended the H1N1 flu vaccine by a doctor are more likely to receive the H1N1 flu vaccine.

In [None]:
from plotly.subplots import make_subplots
import plotly.express as px
from itertools import repeat
import math

fig_num=12
numlist=range(1,fig_num+1)
col_num=2
# Each feature gets two pies (not vaccinated / vaccinated), so the grid is
# col_num*2 columns wide and needs fig_num*2 pie specs in total.
specs_list=[]
specs_list.extend(repeat({"type": "pie"},fig_num*2))
specs_list=to_matrix(specs_list,col_num*2)
featurelist=['behavioral_antiviral_meds','behavioral_avoidance','behavioral_face_mask','behavioral_wash_hands','behavioral_large_gatherings','behavioral_outside_home','behavioral_touch_face','doctor_recc_h1n1','chronic_med_condition','child_under_6_months','health_worker','health_insurance']
# Titles are listed row by row. A grid row shows col_num features for the
# non-vaccinated group followed by the same features for the vaccinated group,
# so each row's titles are that row's features repeated twice.
subplot_titlelist=[]
for row_features in to_matrix(featurelist,col_num):
    subplot_titlelist.extend(row_features*2)
fig = make_subplots(
    rows=math.ceil(fig_num/col_num), cols=col_num*2,specs=specs_list,subplot_titles=subplot_titlelist
)

for i,feature in zip(numlist,featurelist):
    grid_row=math.ceil(i/col_num)
    grid_col=(i-1)%col_num+1
    # Left half: respondents without the H1N1 vaccine; right half: with it.
    # Labels and values must come from the same subset so they pair row by row.
    fig.add_trace(go.Pie(labels=Data_h1n1_0[feature], values=Data_h1n1_0['count']), row=grid_row, col=grid_col)
    fig.add_trace(go.Pie(labels=Data_h1n1_1[feature], values=Data_h1n1_1['count']), row=grid_row, col=grid_col+col_num)


fig.update_layout(height=1600, width=1400, showlegend=False)
fig.update_traces(textposition='inside', textinfo='percent+label',hole=.2, hoverinfo="label+percent+name")

# Headings above the left and right halves of the grid.
fig.add_annotation(x=0.08, y=1.05,
            text="Respondent Didn't Receive H1N1 flu vaccine",
                   showarrow=False,
                   font=dict(
                   size=18 ))

fig.add_annotation(x=0.9, y=1.05,
            text="Respondent Received H1N1 flu vaccine",
                   showarrow=False,
                   font=dict(
                   size=18 )
                  )
fig.show()

Comparing the left part (people who didn't receive the H1N1 flu vaccine) with the right part (people who received it), we can see that a doctor's recommendation strongly influences who receives the vaccine. Meanwhile, chronic medical conditions may be an important aspect that doctors consider.

<a id='2.3.4'></a>
### 2.3.4 Behavior Difference Between Respondents Received Seasonal Flu Vaccine Or Not

Similar to H1N1, we can see that people who have chronic medical conditions and are recommended the seasonal flu vaccine by a doctor are more likely to receive the seasonal flu vaccine.

In [None]:
from plotly.subplots import make_subplots
import plotly.express as px
from itertools import repeat
import math

fig_num=12
numlist=range(1,fig_num+1)
col_num=2
# Each feature gets two pies (not vaccinated / vaccinated), so the grid is
# col_num*2 columns wide and needs fig_num*2 pie specs in total.
specs_list=[]
specs_list.extend(repeat({"type": "pie"},fig_num*2))
specs_list=to_matrix(specs_list,col_num*2)
featurelist=['behavioral_antiviral_meds','behavioral_avoidance','behavioral_face_mask','behavioral_wash_hands','behavioral_large_gatherings','behavioral_outside_home','behavioral_touch_face','doctor_recc_seasonal','chronic_med_condition','child_under_6_months','health_worker','health_insurance']
# Titles are listed row by row. A grid row shows col_num features for the
# non-vaccinated group followed by the same features for the vaccinated group,
# so each row's titles are that row's features repeated twice.
subplot_titlelist=[]
for row_features in to_matrix(featurelist,col_num):
    subplot_titlelist.extend(row_features*2)
# True division before ceil: floor division would drop a partially filled last row.
fig = make_subplots(
    rows=math.ceil(fig_num/col_num), cols=col_num*2,specs=specs_list,subplot_titles=subplot_titlelist
)

for i,feature in zip(numlist,featurelist):
    grid_row=math.ceil(i/col_num)
    grid_col=(i-1)%col_num+1
    # Left half: respondents without the seasonal vaccine; right half: with it.
    fig.add_trace(go.Pie(labels=Data_seas_0[feature], values=Data_seas_0['count']), row=grid_row, col=grid_col)
    fig.add_trace(go.Pie(labels=Data_seas_1[feature], values=Data_seas_1['count']), row=grid_row, col=grid_col+col_num)


fig.update_layout(height=1600, width=1400, showlegend=False)
fig.update_traces(textposition='inside', textinfo='percent+label',hole=.2, hoverinfo="label+percent+name")

# Headings above the left and right halves of the grid.
fig.add_annotation(x=0.08, y=1.05,
            text="Respondent Didn't Received Seasonal Flu Vaccine",
                   showarrow=False,
                   font=dict(
                   size=18 ))

fig.add_annotation(x=0.9, y=1.05,
            text="Respondent Received Seasonal Flu Vaccine",
                   showarrow=False,
                   font=dict(
                   size=18 )
                  )
fig.show()

A similar conclusion can be drawn from the seasonal flu behavior analysis: chronic medical conditions and doctors' recommendations are the most important influences on whether someone receives a vaccine.

<a id='2.3.5'></a>
### 2.3.5 Background Difference Between Respondents Received H1N1 Vaccine Or Not

In [None]:
# Grid of grouped bar charts: respondents per background category, split by H1N1 vaccination.
col_num=3
fig_num=12
fig, axes = plt.subplots(math.ceil(fig_num/col_num), col_num, figsize=(25, 15), sharey=True)
fig.subplots_adjust(hspace=0.3, wspace=0.3)

numlist=range(1,fig_num+1)
featurelist_ver=['age_group','education','race','sex','income_poverty','marital_status','rent_or_own','employment_status','census_msa','household_adults','household_children']
sns.set(color_codes=True)
# axes.flat walks the grid row by row. The list has 11 features for 12 panels,
# so the last panel stays empty.
for panel, feature in zip(axes.flat, featurelist_ver):
    per_category = Data_visual.groupby(feature)['h1n1_vaccine'].value_counts()
    per_category = per_category.rename('number').reset_index()
    sns.barplot(data=per_category, x=feature, y='number', hue='h1n1_vaccine', ax=panel)

fig.suptitle("Respondent's Background Info By H1N1 Vaccine", fontsize=16,y=0.93)
plt.show()

In [None]:
# Respondents per HHS region, split by H1N1 vaccination, as horizontal bars.
plt.figure()
region_counts = (
    Data_visual.groupby('hhs_geo_region')['h1n1_vaccine']
    .value_counts()
    .rename('number')
    .reset_index()
)
sns.barplot(data=region_counts, y='hhs_geo_region', x='number', hue='h1n1_vaccine')

plt.title("Respondent's Residence Info By H1N1 Vaccine")
plt.show()

In [None]:
# Respondents per employment industry, split by H1N1 vaccination, as horizontal bars.
plt.figure()
industry_counts = (
    Data_visual.groupby('employment_industry')['h1n1_vaccine']
    .value_counts()
    .rename('number')
    .reset_index()
)
sns.barplot(data=industry_counts, y='employment_industry', x='number', hue='h1n1_vaccine')

plt.title("Respondent's Employment Industry By H1N1 Vaccine")
plt.show()

In [None]:
# Respondents per employment occupation, split by H1N1 vaccination, as horizontal bars.
plt.figure()
occupation_counts = (
    Data_visual.groupby('employment_occupation')['h1n1_vaccine']
    .value_counts()
    .rename('number')
    .reset_index()
)
sns.barplot(data=occupation_counts, y='employment_occupation', x='number', hue='h1n1_vaccine')

plt.title("Respondent's Employment Occupation By H1N1 Vaccine")
plt.show()

We can see that people in certain occupations have a higher proportion of H1N1 vaccine recipients.

<a id='2.3.6'></a>
### 2.3.6 Background Difference Between Respondents Received Seasonal Flu Vaccine Or Not

We can see that older people are more likely to receive seasonal flu vaccine compared with young people.

In [None]:
# Grid of grouped bar charts: respondents per background category, split by seasonal vaccination.
col_num=3
fig_num=12
fig, axes = plt.subplots(math.ceil(fig_num/col_num), col_num, figsize=(25, 15), sharey=True)
fig.subplots_adjust(hspace=0.3, wspace=0.3)

numlist=range(1,fig_num+1)
featurelist_ver=['age_group','education','race','sex','income_poverty','marital_status','rent_or_own','employment_status','census_msa','household_adults','household_children']
sns.set(color_codes=True)
# axes.flat walks the grid row by row. The list has 11 features for 12 panels,
# so the last panel stays empty.
for panel, feature in zip(axes.flat, featurelist_ver):
    per_category = Data_visual.groupby(feature)['seasonal_vaccine'].value_counts()
    per_category = per_category.rename('number').reset_index()
    sns.barplot(data=per_category, x=feature, y='number', hue='seasonal_vaccine', ax=panel)

fig.suptitle("Respondent's Background Info By Seasonal Flu Vaccine", fontsize=16,y=0.93)
plt.show()

In [None]:
# Respondents per HHS region, split by seasonal vaccination, as horizontal bars.
plt.figure()
region_counts = (
    Data_visual.groupby('hhs_geo_region')['seasonal_vaccine']
    .value_counts()
    .rename('number')
    .reset_index()
)
sns.barplot(data=region_counts, y='hhs_geo_region', x='number', hue='seasonal_vaccine')

plt.title("Respondent's Residence By Seasonal Flu Vaccine")
plt.show()

In [None]:
# Respondents per employment industry, split by seasonal vaccination, as horizontal bars.
plt.figure()
industry_counts = (
    Data_visual.groupby('employment_industry')['seasonal_vaccine']
    .value_counts()
    .rename('number')
    .reset_index()
)
sns.barplot(data=industry_counts, y='employment_industry', x='number', hue='seasonal_vaccine')

plt.title("Respondent's Employment Industry By Seasonal Flu Vaccine")
plt.show()

In [None]:
# Respondents per employment occupation, split by seasonal vaccination, as horizontal bars.
plt.figure()
occupation_counts = (
    Data_visual.groupby('employment_occupation')['seasonal_vaccine']
    .value_counts()
    .rename('number')
    .reset_index()
)
sns.barplot(data=occupation_counts, y='employment_occupation', x='number', hue='seasonal_vaccine')

plt.title("Respondent's Employment Occupation By Seasonal Flu Vaccine")
plt.show()

<a id='2.4'></a>
## 2.4 Category Feature Transformation

In [None]:
# List the columns that are still non-numeric and therefore need encoding.
X.select_dtypes(exclude=[np.number]).columns

Here we will apply different feature engineering according to vaccine type:

In [None]:
def feature_transform(all_data):
    """Encode the categorical background columns as small integers, in place.

    Ten columns become 0/1 flags (1 when the value is one of the listed
    categories). ``income_poverty`` becomes an ordinal 0/1/2 code. The columns
    are overwritten on ``all_data`` itself, which is also returned.

    Parameters
    ----------
    all_data : pandas.DataFrame
        Must contain all eleven columns named below.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    def member_flag(*accepted):
        # Build an encoder returning 1 for any of the accepted categories, else 0.
        return lambda value: 1 if value in accepted else 0

    def income_level(value):
        # 2 = above $75k, 1 = at most $75k but above poverty, 0 = anything else.
        if value == '> $75,000':
            return 2
        if value == '<= $75,000, Above Poverty':
            return 1
        return 0

    encoders = [
        ('age_group', member_flag('55 - 64 Years', '65+ Years')),
        ('education', member_flag('< 12 Years', '12 Years')),
        ('race', member_flag('White')),
        ('sex', member_flag('Female')),
        ('income_poverty', income_level),
        ('marital_status', member_flag('Married')),
        ('rent_or_own', member_flag('Own')),
        ('employment_status', member_flag('Employed')),
        ('hhs_geo_region', member_flag('atmpeygn', 'bhuqouqj', 'kbazzjca')),
        ('employment_industry', member_flag('fcxhlnwr')),
        ('employment_occupation', member_flag('cmhcxjea', 'haliazsg')),
    ]
    for column, encode in encoders:
        all_data[column] = all_data[column].apply(encode)
    return all_data

In [None]:
# Encode the categorical background columns; feature_transform edits X in place and returns it.
X=feature_transform(X)

We will create dummy (one-hot) variables for `census_msa`.

In [None]:
# One-hot encode whatever non-numeric columns remain in X.
X = pd.get_dummies(X)

Drop `respondent_id` column

In [None]:
# The respondent identifier is removed from the feature matrix.
X = X.drop(columns='respondent_id')

<a id='2.5'></a>
## 2.5 Skewed Data Process

In [None]:
from scipy.stats import norm, skew 

# Skewness above which a feature is log-transformed. 0.75 is a common rule of
# thumb, not a value derived from this dataset.
SKEW_THRESHOLD = 0.75

numeric_feats = X.dtypes[X.dtypes != "object"].index
# Check the skew of all numerical features
skewed_feats = X[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
skewness = pd.DataFrame({'Skew' :skewed_feats})
print(skewness.head(10))

# Transform only the right-skewed features. The original cell computed the
# skewness but then applied log1p to every numeric column regardless of it.
skewed_cols = skewness.index[skewness['Skew'] > SKEW_THRESHOLD]
if len(skewed_cols) > 0:
    # Cast to float first so small integer / boolean dummy columns are not
    # computed in reduced precision by np.log1p.
    X[skewed_cols] = np.log1p(X[skewed_cols].astype(float))

In [None]:
X.shape

<a id='2.6'></a>
## 2.6 Feature Importance

In [None]:
# One target Series per vaccine.
Y_h1n1=Y['h1n1_vaccine']
Y_seas=Y['seasonal_vaccine']

In [None]:
# The H1N1 model uses every feature (X_h1n1 is the same object as X, not a copy).
X_h1n1=X
#X_h1n1=X.drop(['opinion_seas_vacc_effective', 'doctor_recc_seasonal','opinion_seas_risk','opinion_seas_sick_from_vacc'], axis=1)
# The seasonal model drops the H1N1-specific columns. `axis` is passed by
# keyword because pandas 2.0 removed the positional form X.drop(labels, 1).
X_seas=X.drop(['opinion_h1n1_vacc_effective', 'h1n1_concern','opinion_h1n1_risk','opinion_h1n1_sick_from_vacc','h1n1_knowledge','doctor_recc_h1n1'], axis=1)

**feature importance about h1n1**

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Fixed seed: the importance ranking decides the column order of X_h1n1 and
# therefore which features every later "first n columns" slice contains.
# Without it each run can select different features.
# NOTE(review): a regressor is fitted on a binary label and on the full data
# (including rows later used for validation) - confirm this is intended.
rf = RandomForestRegressor(n_estimators=200, random_state=25)
rf = rf.fit(X_h1n1,Y_h1n1)

# show the importance of each feature (in percent, most important first)
rf_importance=pd.DataFrame({'Features':np.array(X_h1n1.columns),'Importance':rf.feature_importances_*100 }).sort_values(by='Importance', ascending=False)
rf_importance

In [None]:
# Sort ascending so the most important feature ends up at the top of the horizontal bar chart.
ascending_importance = rf_importance.sort_values(by='Importance')
ascending_importance.plot(
    kind='barh',
    x='Features',
    y='Importance',
    figsize=(4,10),
    legend=False,
    title="Feature Importance About H1N1 Vaccine",
)

Reorder the feature columns by descending importance

In [None]:
# Reorder the columns so the most important features come first; later cells select the first n columns.
X_h1n1=X_h1n1[rf_importance['Features']]

**feature importance about seasonal flu**

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Fixed seed so the importance ranking, which decides the column order of
# X_seas, is the same on every run.
# NOTE(review): a regressor is fitted on a binary label and on the full data
# - confirm this is intended.
rf = RandomForestRegressor(n_estimators=200, random_state=25)
rf = rf.fit(X_seas,Y_seas)

# show the importance of each feature (in percent, most important first)
rf_importance=pd.DataFrame({'Features':np.array(X_seas.columns),'Importance':rf.feature_importances_*100 }).sort_values(by='Importance', ascending=False)
rf_importance

Reorder the feature columns by descending importance

In [None]:
# Reorder the columns so the most important seasonal-flu features come first.
X_seas=X_seas[rf_importance['Features']]

<a id='3'></a>
# 3 Dataset Split

In [None]:
from sklearn.model_selection import train_test_split

# 75/25 train/validation split for each target, with the same fixed seed.
X_h1n1_tra, X_h1n1_val, y_h1n1_tra, y_h1n1_val = train_test_split(
    X_h1n1, Y_h1n1, test_size=0.25, random_state=25
)
X_seas_tra, X_seas_val, y_seas_tra, y_seas_val = train_test_split(
    X_seas, Y_seas, test_size=0.25, random_state=25
)

# Report the shape of every split.
print("h1n1:")
for part in (X_h1n1_tra, X_h1n1_val, y_h1n1_tra, y_h1n1_val):
    print(part.shape)
print("seasonal flu:")
for part in (X_seas_tra, X_seas_val, y_seas_tra, y_seas_val):
    print(part.shape)

<a id='4'></a>
# 4 Modeling and Evaluation-H1N1

In [None]:
from sklearn import linear_model
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn import metrics

<a id='4.1'></a>
## 4.1 Hypothesis

The null hypothesis for this study is simply that we will see no significant difference between predictions using the Naive Bayes classifier and using Logistic Regression.   
The alternative hypothesis is that there is a significant difference between the results of the two algorithms. 


<a id='4.2'></a>
## 4.2 Gaussian Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
# Sweep the number of top-ranked features (2, 4, ..., 32) for Gaussian Naive
# Bayes, recording the mean 5-fold F1 score and the wall-clock time of each setting.
NB_col={}
time_cost={}
for n in range(2,33,2):
    start_time = time.time()
    NB = GaussianNB()
    # F1 of each of the 5 folds, using only the first n columns of X_h1n1.
    # np.ravel replaces Series.ravel(), which is deprecated since pandas 2.2.
    train_acc = cross_val_score(NB, X_h1n1.iloc[:,0:n], np.ravel(Y_h1n1), cv=5, scoring='f1')
    NB_col[n]=np.mean(train_acc)   # mean F1 over the 5 folds
    spend_time=time.time() - start_time
    time_cost[n]=spend_time
    # The scores are F1, so they are labelled as such (previously "Accuracy").
    print("features number ={}, F1 Score: {}".format(n,train_acc))
    print("--- %s seconds ---" % (spend_time))

In [None]:
k = list(NB_col.keys())
acc_nb = list(NB_col.values())

# Plot the mean cross-validated F1 score of NB against the number of features.
plt.plot(k,acc_nb)
plt.title('F1 Score on NB Classifier By Number of Features')
plt.xlabel('number of features')
# NB_col holds F1 scores, so the axis is labelled accordingly (was 'accuracy').
plt.ylabel('F1 score')
plt.show()

In [None]:
# Feature count with the highest mean F1 for Naive Bayes (the first one wins on ties).
best_feature_num=max(NB_col, key=NB_col.get)
best_feature_num

In [None]:
# 10-fold F1 scores for Naive Bayes on the best-performing number of features.
# NB_scores is reused by the paired t-test at the end of the notebook.
start_time = time.time()
NB = GaussianNB()
nb_top_features = X_h1n1.iloc[:, :best_feature_num]
NB_scores = cross_val_score(NB, nb_top_features, Y_h1n1.ravel(), cv=10, scoring='f1')
print("feature numbers={}, \n10-fold F1 Score : \n{}".format(best_feature_num,NB_scores))
elapsed = time.time() - start_time
print("--- %s seconds ---" % (elapsed))

In [None]:
from sklearn.metrics import accuracy_score,cohen_kappa_score,f1_score
# Fit Naive Bayes on the training split and report F1 on the validation split.
start_time = time.time()
NB = GaussianNB()
nb_train_features = X_h1n1_tra.iloc[:, :best_feature_num]
nb_valid_features = X_h1n1_val.iloc[:, :best_feature_num]
NB.fit(nb_train_features, y_h1n1_tra.ravel())
y_h1n1_prd = NB.predict(nb_valid_features)
print(f1_score(y_h1n1_val,y_h1n1_prd))
elapsed = time.time() - start_time
print("--- %s seconds ---" % (elapsed))

In [None]:
from sklearn.metrics import classification_report
# Classification report for the current validation predictions, shown as a table.
pd.DataFrame(classification_report(y_h1n1_val,y_h1n1_prd,output_dict=True))

In [None]:
# y_h1n1_val keeps the shuffled row labels from train_test_split, while the
# predictions are a plain array that would get a fresh 0..n-1 index.
# pd.crosstab aligns Series on their index, so pairing them directly compares
# unrelated rows. Both are converted to positional arrays first.
actual = pd.Series(np.asarray(y_h1n1_val), name='Actual')
predicted = pd.Series(np.asarray(y_h1n1_prd), name='Predicted')
con_matrix = pd.crosstab(actual, predicted)

# visualize it
plt.figure(figsize = (4,3))
plt.title("Test set Confusion Matrix on Gaussian Naive Bayes")
sns.heatmap(con_matrix, cmap="Reds",  fmt='g',annot=True)
plt.show()

ROC calculation

In [None]:
import sklearn.metrics as metrics
# ROC curve and AUC for Naive Bayes, from the predicted probability of class 1
# on the validation split.
probs = NB.predict_proba(X_h1n1_val.iloc[:, :best_feature_num])
preds = probs[:, 1]
nb_curve = metrics.roc_curve(y_h1n1_val, preds)
fpr_nb, tpr_nb, threshold_nb = nb_curve
roc_auc_nb = metrics.auc(fpr_nb, tpr_nb)

<a id='4.3'></a>
## 4.3 Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Sweep the number of top-ranked features (2, 4, ..., 32) for Logistic
# Regression, recording the mean 5-fold F1 score and the wall-clock time.
# Note that time_cost is reset here, discarding the Naive Bayes timings.
lg_col={}
time_cost={}
for n in range(2,33,2):
    start_time = time.time()
    lg = LogisticRegression(solver='saga',max_iter = 4000,random_state=25)
    top_n_features = X_h1n1.iloc[:, :n]
    # F1 of each of the 5 folds
    train_acc = cross_val_score(lg, top_n_features, Y_h1n1.ravel(), cv=5, scoring='f1')
    lg_col[n] = np.mean(train_acc)   # mean F1 over the 5 folds
    spend_time = time.time() - start_time
    time_cost[n] = spend_time
    print("features number ={}, F1 Score: {}".format(n,train_acc))
    print("--- %s seconds ---" % (spend_time))

In [None]:
k = list(lg_col.keys())
f1_lg = list(lg_col.values())

# Plot the mean cross-validated F1 score of Logistic Regression against the number of features.
plt.plot(k,f1_lg)
plt.title('F1 Score on Logistic Regression By Number of Features')
plt.xlabel('number of features')
# lg_col holds F1 scores, so the axis is labelled accordingly (was 'accuracy').
plt.ylabel('F1 score')
plt.show()

In [None]:
# Feature count with the highest mean F1 for Logistic Regression.
# NOTE(review): this overwrites the value chosen for Naive Bayes, so the Naive Bayes
# cells must not be re-run after this point without recomputing their own best_feature_num.
best_feature_num=max(lg_col, key=lg_col.get)
best_feature_num

In [None]:
from sklearn.linear_model import LogisticRegression
# 10-fold F1 scores for Logistic Regression on the best-performing number of
# features. lg_scores is reused by the paired t-test at the end of the notebook.
start_time = time.time()
lg = LogisticRegression(solver='saga',max_iter = 4000,random_state=25)
# np.ravel replaces Series.ravel(), which is deprecated since pandas 2.2.
lg_scores  = cross_val_score(lg, X_h1n1.iloc[:,0:best_feature_num], np.ravel(Y_h1n1), cv=10, scoring='f1')  # F1 of each of the 10 folds
lg_acc_mean=np.mean(lg_scores )   # mean F1 over the 10 folds
spend_time=time.time() - start_time
# Report the feature count actually used. The original printed `n`, the
# leftover loop variable from the sweep cell, which is always its last value.
print("features number ={}, F1 Score avg = {},\n10-fold F1 Score: {},".format(best_feature_num,lg_acc_mean,lg_scores ))
print("--- %s seconds ---" % (spend_time))

In [None]:
from sklearn.metrics import accuracy_score
# Fit Logistic Regression on the training split and report F1 on the validation split.
start_time = time.time()
lg = LogisticRegression(solver='saga',max_iter = 4000,random_state=25)
lg_train_features = X_h1n1_tra.iloc[:, :best_feature_num]
lg_valid_features = X_h1n1_val.iloc[:, :best_feature_num]
lg.fit(lg_train_features, y_h1n1_tra.ravel())
y_h1n1_prd = lg.predict(lg_valid_features)
print(f1_score(y_h1n1_val,y_h1n1_prd))
elapsed = time.time() - start_time
print("--- %s seconds ---" % (elapsed))

In [None]:
from sklearn.metrics import classification_report
# Classification report for the current validation predictions, shown as a table.
pd.DataFrame(classification_report(y_h1n1_val,y_h1n1_prd,output_dict=True))

In [None]:
# y_h1n1_val keeps the shuffled row labels from train_test_split, while the
# predictions are a plain array that would get a fresh 0..n-1 index.
# pd.crosstab aligns Series on their index, so pairing them directly compares
# unrelated rows. Both are converted to positional arrays first.
actual = pd.Series(np.asarray(y_h1n1_val), name='Actual')
predicted = pd.Series(np.asarray(y_h1n1_prd), name='Predicted')
con_matrix = pd.crosstab(actual, predicted)

# visualize it
plt.figure(figsize = (4,3))
plt.title("Test set Confusion Matrix on Logistic Regression")
sns.heatmap(con_matrix, cmap="Reds",  fmt='g',annot=True)
plt.show()

ROC calculation

In [None]:
# ROC curve and AUC for Logistic Regression, from the predicted probability of
# class 1 on the validation split.
probs = lg.predict_proba(X_h1n1_val.iloc[:, :best_feature_num])
preds = probs[:, 1]
lg_curve = metrics.roc_curve(y_h1n1_val, preds)
fpr_lg, tpr_lg, threshold_lg = lg_curve
roc_auc_lg = metrics.auc(fpr_lg, tpr_lg)

<a id='4.4'></a>
## 4.4 ROC

In [None]:

# method I: plt
import matplotlib.pyplot as plt
# Both ROC curves on one set of axes, with the chance diagonal for reference.
plt.title('ROC On H1N1 Vaccine Prediction')
roc_curves = (
    (fpr_nb, tpr_nb, 'b', 'NB AUC = %0.2f' % roc_auc_nb),
    (fpr_lg, tpr_lg, 'g', 'LG AUC = %0.2f' % roc_auc_lg),
)
for false_pos, true_pos, colour, caption in roc_curves:
    plt.plot(false_pos, true_pos, colour, label=caption)
plt.legend(loc = 'lower right')
# The diagonal is drawn after the legend call, so it gets no legend entry.
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

<a id='4.5'></a>
## 4.5 Significance Testing

In [None]:
from scipy import stats
# Paired t-test on the two sets of 10-fold F1 scores, judged at the 5% level.
tStat, pValue = stats.ttest_rel(NB_scores, lg_scores)
print("p value is {:.9f}".format(pValue))
if bool(pValue < 0.05):
    verdict = "Yes, there is a significance difference in f1-score between NB and LG"
else:
    verdict = "No, there is no significance difference in f1-score between NB and LG"
print(verdict)