In [None]:
# import packages
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [None]:
# Load the three competition CSV files: training features, test features
# and the training labels.
training_features_data = pd.read_csv(
    "../input/flu-shot-learning-h1n1-seasonal-flu-vaccines/training_set_features.csv",
    sep=',',
)

test_features_data = pd.read_csv(
    "../input/flu-shot-learning-h1n1-seasonal-flu-vaccines/test_set_features.csv",
    sep=',',
)

training_set_labels = pd.read_csv(
    "../input/flu-shot-learning-h1n1-seasonal-flu-vaccines/training_set_labels.csv",
    sep=',',
)



In [None]:
# Cast age_group to str.
# NOTE(review): presumably so the column is always treated as a categorical
# label in the later encoding step - confirm.
age_group_text = training_features_data["age_group"].astype(str)
training_features_data["age_group"] = age_group_text

In [None]:
# Count the missing values in each column (this runs before the fillna cell below).
training_features_data.isna().sum()

In [None]:
# Step 1: fill NaNs in numeric columns with that column's mean.
# numeric_only=True is required on pandas >= 2.0, where DataFrame.mean() raises
# a TypeError as soon as the frame contains string/object columns. On older
# pandas the non-numeric columns were dropped from the mean anyway, so the
# result is the same.
numeric_column_means = training_features_data.mean(numeric_only=True)
training_features_data = training_features_data.fillna(numeric_column_means)

# Step 2: whatever is still missing sits in non-numeric columns; give those
# cells an explicit category of their own.
training_features_data = training_features_data.fillna('out-of-category')

In [None]:
# Sanity check after imputation: every column is expected to report 0 missing values.
training_features_data.isna().sum()

In [None]:
# Ordinal-encode the whole feature frame. The encoder is fitted on every
# column (string categories and numeric columns alike), so each distinct value
# in a column is mapped to a numeric code.
from sklearn.preprocessing import OrdinalEncoder

enc = OrdinalEncoder()
training_features_data_arr = enc.fit(training_features_data).transform(training_features_data)

In [None]:
# The encoder returns a bare array; wrap it in a DataFrame again and reuse the
# original column labels (col_names_list is reused for the scaled frame too).
col_names_list = training_features_data.columns
encoded_categorical_df = pd.DataFrame(
    data=training_features_data_arr,
    columns=col_names_list,
)

In [None]:
# Preview only the first rows of the encoded frame instead of dumping the
# whole DataFrame into the cell output.
encoded_categorical_df.head()

In [None]:
# Min-max scaling: rescale every encoded column to the 0-1 range.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
normalized_arr = scaler.fit(encoded_categorical_df).transform(encoded_categorical_df)

In [None]:
# Wrap the scaled array in a DataFrame, restoring the original column labels.
normalized_df = pd.DataFrame(data=normalized_arr, columns=col_names_list)

In [None]:
# Summary statistics of the scaled features (a quick check of the value ranges).
normalized_df.describe()

# **CLASSIFICATION**

In [None]:
#import sklearn methods 
from sklearn.metrics import roc_curve, classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression


In [None]:
# Peek at the first rows of the label table.
training_set_labels.head()

In [None]:
# Target vector: the seasonal_vaccine label column as a NumPy array.
y = training_set_labels['seasonal_vaccine'].values

# Feature matrix: the scaled frame as-is.
# NOTE(review): X keeps every column of normalized_df, including any
# identifier column carried over from the raw CSV - confirm that is intended.
X = normalized_df

y

In [None]:
# Hold out 20% of the rows as a test set, stratified on the label.
# random_state pins the shuffle: without it the split - and every test score
# computed from it - changes on each run. The seed matches the one used for
# the cross-validation splitter below.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Cross-validation splitter with 5 stratified shuffle splits.
cv = StratifiedShuffleSplit(n_splits=5, random_state=42)

# **Regressor-1: Decision Tree Regressor**

In [None]:
# Decision tree regressor with a fixed seed.
regressor = DecisionTreeRegressor(random_state=0)

# Hyper-parameter grid.
# "squared_error" / "absolute_error" are the current names of the criteria that
# used to be spelled "mse" / "mae". The old spellings were deprecated in
# scikit-learn 1.0 and removed in 1.2, where candidates using them are rejected.
parameters = {
    "criterion": ["squared_error", "friedman_mse", "absolute_error"],
    "splitter": ["best", "random"],
}

# Exhaustive grid search over the parameters, cross-validated with `cv`.
grid = GridSearchCV(estimator=regressor, param_grid=parameters, cv=cv, n_jobs=-1)
grid.fit(X_train, y_train)

# Report the best parameter combination and its mean cross-validated score.
print("The best parameters are %s with a score of %0.4f"
      % (grid.best_params_, grid.best_score_))

# Full per-candidate results of the grid search as a DataFrame.
detailed_grid_results = pd.DataFrame(grid.cv_results_)
detailed_grid_results

In [None]:
# display test scores and return result string and indexes of false samples
def display_test_scores(test, pred, threshold=0.5):
    """Build a text report of the test scores and locate the wrong predictions.

    Parameters
    ----------
    test : array of 0/1 ground-truth labels.
    pred : array of predictions with the same length as ``test``; either
        continuous scores (e.g. regressor outputs) or hard 0/1 labels.
    threshold : float, default 0.5
        Predictions greater than or equal to this value count as class 1 when
        deciding which samples were predicted wrongly.

    Returns
    -------
    str_out : str
        Report text containing the ROC AUC computed from the raw ``pred`` values.
    false_indexes : tuple of ndarray
        ``np.where`` result holding the positions where the thresholded
        prediction differs from ``test``.
    """
    str_out = ""
    str_out += ("TEST SCORES\n")
    str_out += ("\n")

    # AUC is computed on the raw (un-thresholded) scores.
    auc = roc_auc_score(test, pred)
    str_out += ("AUC: {:.4f}\n".format(auc))
    str_out += ("\n")

    # Turn the scores into 0/1 labels before comparing. Comparing the labels
    # with raw continuous scores would flag almost every sample as wrong,
    # because a score is rarely exactly 0 or 1. Hard 0/1 predictions pass
    # through the default threshold unchanged.
    predicted_labels = np.where(pred >= threshold, 1, 0)
    false_indexes = np.where(test != predicted_labels)
    return str_out, false_indexes


# Predict on the held-out split with the fitted grid search.
y_pred = grid.predict(X_test)

# Build the score report; `false` receives the indexes returned alongside it.
results, false = display_test_scores(test=y_test, pred=y_pred)
print(results)

# **Regressor-2: Bayesian-Ridge**

In [None]:
# Bayesian ridge regression, tuned over its initial alpha / lambda values.
clf_ridge = linear_model.BayesianRidge()

# Candidate hyper-parameters.
parameters = dict(
    alpha_init=[None, 1],
    lambda_init=[1, 1e-3],
)

# Grid search over the candidates, cross-validated with `cv`.
grid = GridSearchCV(clf_ridge, parameters, cv=cv, n_jobs=-1)
grid.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated score.
print(
    "The best parameters are %s with a score of %0.4f\n"
    % (grid.best_params_, grid.best_score_)
)

# Predict on the held-out split and print the score report.
y_pred = grid.predict(X_test)
results, false = display_test_scores(test=y_test, pred=y_pred)
print(results)

# **Regressor-3: SVR**

In [None]:
# Support vector regressor; kernel, C and max_iter are tuned by the grid below.
regr = SVR(C=1.0, epsilon=0.2)

# Candidate hyper-parameters.
parameters = dict(
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
    C=[0.01, 0.1, 1, 10, 100],
    max_iter=[100, 1000],
)

# Grid search over the candidates, cross-validated with `cv`.
grid = GridSearchCV(regr, parameters, cv=cv, n_jobs=-1)
grid.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated score.
print(
    "The best parameters are %s with a score of %0.4f\n"
    % (grid.best_params_, grid.best_score_)
)

# Predict on the held-out split and print the score report.
y_pred = grid.predict(X_test)
results, false = display_test_scores(test=y_test, pred=y_pred)
print(results)

# **Regressor-4: SGDRegressor**

In [None]:
from sklearn.linear_model import SGDRegressor

# Linear model fitted by stochastic gradient descent.
# random_state is fixed so repeated runs give the same fit, in line with the
# seeded tree and forest regressors in this notebook.
reg = SGDRegressor(tol=1e-3, random_state=0)

# Candidate hyper-parameters.
parameters = {
    'alpha': [0.0001, 0.001, 0.01, 1],
    'max_iter': [10, 100, 1000],
    'learning_rate': ['invscaling', 'optimal', 'adaptive'],
}

# Grid search over the candidates, cross-validated with `cv`.
grid = GridSearchCV(estimator=reg, param_grid=parameters, cv=cv, n_jobs=-1)
grid.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated score.
print("The best parameters are %s with a score of %0.4f\n"
      % (grid.best_params_, grid.best_score_))

# Predict on the held-out split.
y_pred = grid.predict(X_test)

# Print the score report.
results, false = display_test_scores(y_test, y_pred)
print(results)

# **Regressor-5: RandomForestRegressor**

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest regressor with a fixed seed; only the number of trees is tuned.
rfr = RandomForestRegressor(random_state=0)

# Candidate hyper-parameters.
parameters = dict(n_estimators=[20, 50, 100])

# Grid search over the candidates, cross-validated with `cv`.
grid = GridSearchCV(rfr, parameters, cv=cv, n_jobs=-1)
grid.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated score.
print(
    "The best parameters are %s with a score of %0.4f\n"
    % (grid.best_params_, grid.best_score_)
)

# Predict on the held-out split and print the score report.
y_pred = grid.predict(X_test)
results, false = display_test_scores(test=y_test, pred=y_pred)
print(results)
