In [None]:
import numpy as np 
import pandas as pd 
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Load the training features, the test features and the training labels.
# All three frames use the respondent_id column as their index.
X = pd.read_csv(
    '../input/flu-shot-learning-h1n1-seasonal-flu-vaccines/training_set_features.csv',
    index_col='respondent_id',
)
X_test = pd.read_csv(
    '../input/flu-shot-learning-h1n1-seasonal-flu-vaccines/test_set_features.csv',
    index_col='respondent_id',
)
y = pd.read_csv(
    '../input/flu-shot-learning-h1n1-seasonal-flu-vaccines/training_set_labels.csv',
    index_col='respondent_id',
)

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.multioutput import MultiOutputClassifier
from xgboost import XGBClassifier

# Single random seed reused by the train/validation split, the CV folds and the randomized search.
seed = 0

In [None]:
# Partition the feature columns by dtype: float64 columns are treated as
# numeric, object columns as categorical.
num_cols = [name for name, kind in X.dtypes.items() if kind == 'float64']
cat_cols = [name for name, kind in X.dtypes.items() if kind == 'object']

# Keep only the categorical columns with fewer than 10 distinct values;
# these are the ones that get one-hot encoded.
category_counts = X[cat_cols].nunique()
low_cardinal_cols = [name for name in cat_cols if category_counts[name] < 10]

# Numeric branch: standardise, then fill missing values with the most
# frequent value of each column (step order kept as-is).
numeric_steps = [
    ('standard_scaler', StandardScaler()),
    ('imputer', SimpleImputer(strategy='most_frequent')),
]
num_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode; categories unseen during fit are ignored at transform.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
cat_transformer = Pipeline(steps=categorical_steps)

# Route each group of columns through its own branch.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_cols),
        ('cat', cat_transformer, low_cardinal_cols),
    ]
)

#preprocessor = ColumnTransformer(transformers = [('num', num_transformer, num_columns)], 
#                                                  remainder = 'drop')

In [None]:
#X_train = X[num_cols+low_cardinal_cols]
#y_train = y['seasonal_vaccine']
# Hold out 20% of the rows for validation. Only the selected numeric and
# low-cardinality categorical columns are kept, and the target of this split
# is the seasonal_vaccine label.
model_features = X[num_cols + low_cardinal_cols]
seasonal_target = y['seasonal_vaccine']
X_train, X_val, y_train, y_val = train_test_split(
    model_features,
    seasonal_target,
    test_size=0.2,
    random_state=seed,
)

In [None]:
# Base classifier for the hyperparameter search.
# `verbosity=0` and `n_jobs=1` replace the deprecated `silent=True` and
# `nthread=1` spellings, which newer XGBoost releases warn about or ignore.
xgb = XGBClassifier(objective='binary:logistic',
                    tree_method='gpu_hist',
                    verbosity=0,
                    n_jobs=1)
full_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                ('xgb', xgb)])

# Candidate values sampled by the randomized search; the `xgb__` prefix routes
# each parameter to the 'xgb' step of the pipeline.
params = {'xgb__n_estimators' : [500, 600, 700, 800, 900, 1000], 
        'xgb__learning_rate' : [0.02, 0.03, 0.04, 0.05, 0.1],
        'xgb__min_child_weight': [4, 5, 6],
        'xgb__gamma': [1, 2, 3, 4, 5],
        'xgb__subsample': [0.5, 0.6, 0.7],
        'xgb__colsample_bytree': [0.7, 0.8, 0.9, 1],
        'xgb__max_depth': [7, 8, 9, 10]
        }
folds = 5
param_comb = 30
skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = seed)

# Pass the splitter object itself rather than a one-shot `skf.split(...)`
# generator: it yields the same seeded stratified folds and can be reused.
# NOTE(review): n_jobs=-1 runs candidate fits in parallel while every fit uses
# the GPU tree method - confirm the GPU copes with concurrent fits.
random_search = RandomizedSearchCV(estimator=full_pipeline, 
                                   param_distributions=params, 
                                   n_iter=param_comb, 
                                   scoring='roc_auc', 
                                   n_jobs=-1,
                                   cv=skf, 
                                   verbose=5, 
                                   random_state=seed )

# y_train holds the seasonal_vaccine labels, so the search optimises ROC AUC
# for that target.
random_search.fit(X_train, y_train)
print('\n Best hyperparameters:')
print(random_search.best_params_)
print('\n Best estimator:')
print(random_search.best_estimator_)
print('\n Best score:')
print(random_search.best_score_)

In [None]:
my_pipeline1 = Pipeline(steps=[('preprocessor', preprocessor),
                                ('xgb', XGBClassifier(
                                subsample=0.6, n_estimators=800,
                                min_child_weight=6, max_depth=8,
                                learning_rate=0.04, gamma=5,
                                colsample_bytree=0.7,
                                importance_type='gain',
                                interaction_constraints='',
                                max_delta_step=0, missing=None,
                                monotone_constraints='()', 
                                n_jobs=1, nthread=1, 
                                num_parallel_tree=1, random_state=0, 
                                reg_alpha=0, reg_lambda=1,
                                scale_pos_weight=1, silent=True, 
                                tree_method='gpu_hist', validate_parameters=1,
                                verbosity=None))])

my_pipeline1.fit(X_train,y_train)
preds1 = my_pipeline1.predict_proba(X_train)[:, 1]
print('roc_auc_score of h1n1 flu:', roc_auc_score(y_train, preds1))

#'xgb__subsample': 0.6, 'xgb__n_estimators': 900, 'xgb__min_child_weight': 5, 'xgb__max_depth': 9, 'xgb__learning_rate': 0.05, 'xgb__gamma': 3, 'xgb__colsample_bytree': 0.7}
#roc_auc_score of h1n1 flu: 0.9942369964609553

In [None]:
preds_val = my_pipeline1.predict_proba(X_val)[:, 1]
print('roc_auc score of validation:', roc_auc_score(y_val, preds_val))

In [None]:
# Fixed XGBoost settings for the seasonal-vaccine model, gathered in one place
# and unpacked into the classifier below.
xgb2_settings = dict(
    subsample=0.6,
    n_estimators=900,
    min_child_weight=5,
    max_depth=9,
    gamma=0.3,
    learning_rate=0.05,
    colsample_bytree=0.7,
    importance_type='gain',
    interaction_constraints='',
    max_delta_step=0,
    missing=None,
    monotone_constraints='()',
    n_jobs=1,
    nthread=1,
    num_parallel_tree=1,
    random_state=0,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    silent=True,
    tree_method='gpu_hist',
    validate_parameters=1,
    verbosity=None,
)

# Same preprocessing as the other pipelines, followed by the classifier.
my_pipeline2 = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('xgb', XGBClassifier(**xgb2_settings)),
    ]
)

# Fit on the training split (y_train holds the seasonal_vaccine labels) and
# report ROC AUC on those same training rows.
my_pipeline2.fit(X_train, y_train)
preds2 = my_pipeline2.predict_proba(X_train)[:, 1]
train_auc2 = roc_auc_score(y_train, preds2)
print('roc_auc_score of seasonal flu:', train_auc2)

#'xgb__subsample': 0.5, 'xgb__n_estimators': 600, 'xgb__min_child_weight': 4, 'xgb__max_depth': 7, 'xgb__learning_rate': 0.02, 'xgb__gamma': 3, 'xgb__colsample_bytree': 0.7}
#roc_auc_score of seasonal flu: 0.9010093912109788

In [None]:
# Select the same feature columns that the pipelines were fitted on.
test_features = X_test[num_cols + low_cardinal_cols]

# Positive-class probabilities from each fitted pipeline.
y1_test = my_pipeline1.predict_proba(test_features)[:, 1]
y2_test = my_pipeline2.predict_proba(test_features)[:, 1]

# One row per test respondent: the id followed by the two predicted probabilities.
submission_columns = {
    'respondent_id': X_test.index,
    'h1n1_vaccine': y1_test,
    'seasonal_vaccine': y2_test,
}
outputfull = pd.DataFrame(submission_columns)
outputfull.to_csv('Submission.csv', index=False)