In [None]:
# Importing packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import RFECV,RFE
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from statsmodels.stats.outliers_influence import variance_inflation_factor
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the H1N1 vaccine survey CSV from the relative input directory.
data = pd.read_csv(r'../input/h1n1-vaccination/h1n1_vaccine_prediction.csv')
# Bare last expression: the frame is rendered as the cell output.
data

In [None]:
# Column names, non-null counts and dtypes of the raw frame.
data.info()

In [None]:
# Allow up to 100 columns to be displayed so the summary is not truncated.
pd.set_option('display.max_columns',100)
# Summary statistics of the raw frame.
data.describe()

In [None]:
# Now let's look for missing values

In [None]:
# Number of missing values per column, largest first.
data.isna().sum().sort_values(ascending=False)

In [None]:
# Percentage of missing values per column, largest first, shown as a bar chart
null_counts = data.isna().sum()
miss_percent = ((null_counts / len(data)) * 100).sort_values(ascending=False)
miss_percent.plot.bar()
plt.show()

In [None]:
# Drop the columns whose share of missing values is greater than 10 percent.
miss_cols = list(miss_percent[miss_percent > 10].index)
# Pass the column names directly instead of a DataFrame slice: same result,
# without materialising a throw-away copy of the columns being removed.
data1 = data.drop(columns=miss_cols)

In [None]:
# Missing values per column that remain after dropping the sparse columns.
data1.isna().sum().sort_values(ascending=False)

In [None]:
# A large proportion of the values in these columns is zero (see value counts),
# so columns with fewer than 1000 missing values are imputed with their mode.
# Collect the names of the columns that have at least one, but fewer than 1000, nulls.
remaining_nulls = data1.isna().sum().sort_values(ascending=False)
has_few_nulls = remaining_nulls < 1000
has_any_nulls = remaining_nulls != 0
miss_row = remaining_nulls[has_few_nulls & has_any_nulls].index
miss_row

In [None]:
def _fill_with_mode(column):
    """Return `column` with its nulls replaced by its first (most frequent) mode."""
    return column.fillna(column.mode()[0])


# Work on a copy so data1 keeps its missing values untouched.
data2 = data1.copy()
data2[miss_row] = data2[miss_row].apply(_fill_with_mode)

In [None]:
# Drop every row that still contains a missing value after the mode imputation.
data3 = data2.dropna()

In [None]:
# Encode the ordered categorical columns as integers so they can be analysed.
# The codes must follow the natural order of the categories: 'Some College'
# ranks below 'College Graduate', so it gets the smaller code.
clean = {
    'age_bracket': {'18 - 34 Years': 1, '35 - 44 Years': 2, '45 - 54 Years': 3,
                    '55 - 64 Years': 4, '65+ Years': 5},
    'qualification': {'< 12 Years': 1, '12 Years': 2, 'Some College': 3,
                      'College Graduate': 4},
}
data3 = data3.replace(clean)

In [None]:
# Analyse every low-cardinality feature against the target variable
def analysis(df, graph_per_row, max_graphs, target='h1n1_vaccine'):
    """Plot a grid with one chart per low-cardinality column of `df`.

    Only columns with more than 1 and fewer than 50 distinct values are
    plotted. A column whose first value is not a string gets a count plot
    split by `target`; a column whose first value is a string gets a
    histogram.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot; must contain `target` among the plotted columns.
    graph_per_row : int
        Number of charts per grid row (must be positive).
    max_graphs : int
        Upper bound on the number of charts drawn.
    target : str, default 'h1n1_vaccine'
        Column used as the hue of the count plots.

    Returns
    -------
    None
        The figure is shown with `plt.show()`; nothing is drawn when no
        column qualifies or `max_graphs` is not positive.
    """
    nunique = df.nunique()
    df = df[[col for col in df if 1 < nunique[col] < 50]]
    colname = list(df)
    n_graphs = min(len(colname), max_graphs)
    if n_graphs <= 0:
        # Nothing to draw; a zero-row grid would make matplotlib fail.
        return
    # Ceiling division kept as an int: plt.subplot rejects float row counts.
    # Rows are sized from the charts actually drawn, so no blank rows appear.
    graph_row = (n_graphs + graph_per_row - 1) // graph_per_row
    plt.figure(figsize=(12 * graph_per_row, 8 * graph_row))
    for i in range(n_graphs):
        plt.subplot(graph_row, graph_per_row, i + 1)
        coltype = df.iloc[:, i]
        # isinstance replaces np.issubdtype(..., np.str): the np.str alias
        # was removed from NumPy.
        if not isinstance(coltype.iloc[0], str):
            # x is passed by keyword; newer seaborn no longer accepts it positionally.
            sns.countplot(x=colname[i], hue=target, data=df)
        else:
            coltype.hist()
        plt.title(f'{colname[i]}')
        plt.xticks(rotation=60)
    plt.show()

In [None]:
# Plot at most 25 features of data3, five charts per row.
analysis(data3,5,25)

In [None]:
# Missing values per column of data3, largest first.
data3.isna().sum().sort_values(ascending=False)

In [None]:
# Before checking for multicollinearity, list the columns that are still text
# (object dtype) together with their distinct values.
obj = data3.select_dtypes(include='object').columns
# A plain loop instead of a list comprehension: the comprehension was used only
# for its side effect and echoed a list of None as the cell output.
for i in obj:
    print(i, '-->', data3[i].unique())

In [None]:
# Integer codes for the remaining text columns listed in the mapping below.
clean = {
    'sex': {'Female': 0, 'Male': 1},
    'marital_status': {'Not Married': 0, 'Married': 1},
    'housing_status': {'Rent': 0, 'Own': 1},
    'employment': {'Not in Labor Force': 1, 'Employed': 2, 'Unemployed': 3},
    # The double space in 'Not Principle  City' is intentional: it is kept
    # exactly as written in the original mapping.
    'census_msa': {'Non-MSA': 1, 'MSA, Not Principle  City': 2, 'MSA, Principle City': 3},
}
data4 = data3.replace(clean)

In [None]:
# Correlation heat map of the numeric features.
plt.figure(figsize=(30,24))
# data4 still holds the text column 'race'; recent pandas versions raise when
# corr() meets non-numeric columns, so restrict the frame explicitly.
corr = data4.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(corr,annot=True)
plt.show()

In [None]:
# The heat map hints at multicollinearity,
# so we check it with the Variance Inflation Factor (VIF)
def vif_scores(df):
    """Return a frame with one row per column of `df` and its VIF score.

    The result has two columns: "Independent Features" (the column names of
    `df`) and "VIF Scores" (the value of `variance_inflation_factor` computed
    on `df.values` for each column position).
    """
    n_features = df.shape[1]
    return pd.DataFrame({
        "Independent Features": df.columns,
        "VIF Scores": [variance_inflation_factor(df.values, idx) for idx in range(n_features)],
    })

In [None]:
# Remove the identifier, the text column and the features excluded from the
# VIF check. Column names are passed directly instead of a DataFrame slice.
df1 = data4.drop(columns=['unique_id', 'race', 'is_h1n1_vacc_effective',
                          'is_seas_vacc_effective', 'qualification'])
# Leave out the last column before scoring - presumably the target
# 'h1n1_vaccine'; verify against the column order of the CSV.
df2 = df1.iloc[:, :-1]
vif_scores(df2)

In [None]:
# So, apart from 'race', 'is_h1n1_vacc_effective', 'is_seas_vacc_effective' and 'qualification', no other columns show multicollinearity

In [None]:
# One-hot encode the 'race' column and show the generated column names
one_hot = pd.get_dummies(data4[['race']])
one_hot.columns

In [None]:
# Drop the identifier and the raw 'race' text column by name (no DataFrame
# slice needed as the label argument).
data5 = data4.drop(columns=['unique_id', 'race'])

In [None]:
# Append the one-hot encoded 'race' columns side by side (aligned on the index).
data5 = pd.concat([data5,one_hot],axis = 1)

In [None]:
# Check the columns and dtypes of the final modelling frame.
data5.info()

# Creating Models

In [None]:
# First, create a model with all the features
y = data5['h1n1_vaccine']
x = data5.drop(columns=['h1n1_vaccine'])
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.20, random_state=1
)
# Fit the scaler on the training split only, then apply it to both splits.
sc = StandardScaler().fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)
model = LogisticRegression().fit(x_train, y_train)
y_pred = model.predict(x_test)

In [None]:
# Helper for viewing the results of a set of predictions
def res(y_valid):
    """Report how the predictions `y_valid` compare with the global `y_test`.

    Plots the confusion matrix, prints the accuracy and the classification
    report, then shows the figure.
    """
    matrix = confusion_matrix(y_test, y_valid)
    cm_display = ConfusionMatrixDisplay(matrix).plot()
    cm_display.ax_.set(ylabel = 'Actual value', xlabel ='Predicted value')
    print('Accuracy', accuracy_score(y_test, y_valid))
    print(classification_report(y_test, y_valid))
    plt.show()

In [None]:
# Metrics of the logistic regression trained on all features.
res(y_pred)

In [None]:
# Draw the ROC curve on the held-out test split, like every other metric in
# this notebook; a curve computed on the training split overstates the model.
plot_roc_curve(model, x_test, y_test, response_method='predict_proba')

In [None]:
# Keep only the second probability column (index 1) of the test-set predictions
y_prob = model.predict_proba(x_test)[:, 1]

In [None]:
# Predict using a custom threshold: label 1 when the probability exceeds it
Thersold = 0.2
y_pred1 = (y_prob > Thersold).astype(int)
res(y_pred1)

In [None]:
# We can choose the threshold based on the requirements of the model

In [None]:
# Decision tree model
# Seeded (same seed as the train/test split) so a re-run gives the same tree.
dec = DecisionTreeClassifier(random_state=1)
dec.fit(x_train,y_train)
y_pred_dec = dec.predict(x_test)
res(y_pred_dec)

In [None]:
# Random forest model
# Seeded (same seed as the train/test split) so a re-run gives the same forest.
rand = RandomForestClassifier(random_state=1)
rand.fit(x_train,y_train)
y_pred_rand = rand.predict(x_test)
# Report the forest's own predictions; y_pred holds the logistic regression's.
res(y_pred_rand)

In [None]:
# KNN model: misclassification rate on the test split for odd k from 1 to 19
k_values = range(1, 20, 2)
error = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train, y_train)
    y_pred_knn = knn.predict(x_test)
    # Fraction of test rows whose predicted label differs from the true one.
    error.append(np.mean(y_test != y_pred_knn))
plt.plot(k_values, error, marker='o')

In [None]:
# Refit KNN with 5 neighbours (presumably picked from the error plot - confirm)
# and report its test-set metrics.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(x_train,y_train)
y_pred_knn = knn.predict(x_test)
res(y_pred_knn)

In [None]:
# Select the best features with Recursive Feature Elimination (RFE),
# using the random forest `rand` as the estimator.
rfe = RFE(rand)
rfe.fit(x_train,y_train)

In [None]:
# Names of the columns kept by RFE: pair each column of x with its support flag
features = rfe.support_
cols = x.columns
select = [name for name, keep in zip(cols, features) if keep]
select

In [None]:
# Now create a logistic model from the best features only.
# Use the list computed from the RFE support mask (`select`) rather than a
# hand-copied snapshot of it, so this cell always matches what RFE chose.
x = data5[select]
y = data5['h1n1_vaccine']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=1)
# Fit the scaler on the training split only, then apply it to both splits.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)
model = LogisticRegression()
model.fit(x_train, y_train)
y_pred_features = model.predict(x_test)
res(y_pred_features)

In [None]:
# We can see from the Results that 16 features give the same accuracy as 32 features

In [None]:
# Hyperparameter tuning
# First search for the best random forest parameters over the grid below;
# the fitted search object is then used for prediction.
parameters = {'n_estimators':[10,20,30,40,50],'max_depth':[3,4,5,6,7], 'criterion':('entropy', 'gini'),'max_leaf_nodes':[5,10,15,20]}
clf = GridSearchCV(rand, parameters)
clf.fit(x_train,y_train)
# Best parameter combination found by the search.
clf.best_params_

In [None]:
# Predict on the test split with the fitted grid search and report the metrics.
y_pred_GS = clf.predict(x_test)
res(y_pred_GS)

# Ensemble Models

In [None]:
# !pip install xgboost
# %pip install lightgbm
# %pip install catboost

In [None]:
# XGBoost classifier with default parameters, evaluated on the test split
xgb_model = xgb.XGBClassifier()
xgb_model.fit(x_train,y_train)
y_pred_xgb = xgb_model.predict(x_test)
res(y_pred_xgb)

In [None]:
# CatBoost classifier with default parameters, evaluated on the test split
model_cat = CatBoostClassifier()
# verbose=False is passed to fit to turn off its logging output.
model_cat.fit(x_train,y_train,verbose=False)
y_pred_cat = model_cat.predict(x_test)
res(y_pred_cat)

In [None]:
# LightGBM
train_data = lgb.Dataset(x_train, y_train)
# Ask explicitly for a binary classifier; without an objective lgb.train
# falls back to its default, which is not a classification objective.
# NOTE(review): learning_rate=0.001 with the default number of boosting rounds
# is very small - confirm the model learns enough to predict both classes.
params = {'objective': 'binary', 'learning_rate': 0.001}
model_lgb = lgb.train(params, train_data)
# Predict with the LightGBM booster itself (not the logistic `model`). The
# booster returns probabilities, so cut them at 0.5 to get class labels.
y_prob_lgb = model_lgb.predict(x_test)
y_pred_lgb = (y_prob_lgb > 0.5).astype(int)
res(y_pred_lgb)

In [None]:
# From the models above we can conclude that XGBoost gives better recall and precision than the others.
# Moreover, we cannot rely on accuracy alone for a classification problem.
# We can trade off the true positive rate against the false positive rate depending on the problem statement.