In [None]:
import numpy as np 
import pandas as pd
import os

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error, accuracy_score, log_loss
from sklearn.model_selection import KFold, StratifiedKFold

import optuna

# Loading training and testing data

In [None]:
# Read the competition files: training set, test set and the submission template.
train_data = pd.read_csv("../input/tabular-playground-series-nov-2021/train.csv")
test_data = pd.read_csv("../input/tabular-playground-series-nov-2021/test.csv")
sample_solution = pd.read_csv("../input/tabular-playground-series-nov-2021/sample_submission.csv")

# The labels are the "target" column of the training frame.
train_labels = train_data['target']

# Test features are every column except "id". drop() returns a new frame,
# so test_data itself is left unmodified.
X_test = test_data.drop(columns=['id'])

## Viewing training data

In [None]:
# Preview the first five rows of the training frame.
train_data.head(n=5)

## Creating 5-fold data

In [None]:
# Assign every training row to one of 5 stratified folds so each model can be
# validated on rows it was not trained on (helps detect overfitting).
# A good resource for understanding k-fold and stratified k-fold is written here:
# https://machinelearningmastery.com/k-fold-cross-validation/
train_data['kfold'] = -1  # -1 marks "not yet assigned to a fold"

# Fixed seed: with shuffle=True and random_state=None the fold membership would
# change on every run, making tuning results and CV scores irreproducible.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for fold, (train_index, valid_index) in enumerate(kf.split(train_data, train_labels)):
    print("TRAIN:", train_index, "TEST:", valid_index)
    # split() yields positional indices; .loc is correct here as long as
    # train_data keeps the default 0..n-1 index it got from read_csv.
    train_data.loc[valid_index, 'kfold'] = fold

In [None]:
# Columns that must never be fed to the model: row id, fold marker and label.
exclude_cols = ["id", "kfold", "target"]

# Every remaining column of the training frame.
useful_cols = [name for name in train_data.columns if name not in exclude_cols]

# Columns whose name begins with the letter "f".
feature_cols = [name for name in train_data.columns if name.startswith('f')]

In [None]:
# Count the null entries in each column, then count how many columns have at least one.
missing_per_column = train_data.isna().sum()
num_cols_with_missing = sum(missing_per_column > 0)
num_cols_with_missing

## Using Optuna to tune hyperparameters for XGBoost

Optuna is a hyperparameter optimization framework. It is framework agnostic and can be used with any ML or DL Framework.

It takes in an objective function and tries to optimize it based on the metric defined.

More on Optuna:
1. https://optuna.org/
2. https://www.analyticsvidhya.com/blog/2020/11/hyperparameter-tuning-using-optuna/

In [None]:
# Placeholder only: the fold-training cell further down re-creates this list
# before any predictions are appended to it.
final_predictions = []


def run(trial, fold=0):
    """Optuna objective: fit XGBoost on all folds but one and score the held-out fold.

    Args:
        trial: Optuna trial object used to sample the hyperparameters.
        fold: Value of the ``kfold`` column held out for validation.
            Defaults to 0, so ``run(trial)`` validates on fold 0.

    Returns:
        ROC AUC of the predicted positive-class probabilities on the held-out fold.
    """
    # suggest_float replaces suggest_uniform / suggest_discrete_uniform, which
    # Optuna has deprecated; passing step= keeps the same discrete grid.
    param_grid = {
        'objective': 'binary:logistic',
        'use_label_encoder': False,
        'n_estimators': trial.suggest_int('n_estimators', 500, 5000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        'subsample': trial.suggest_float('subsample', 0.3, 1.0, step=0.1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        'max_depth': trial.suggest_int('max_depth', 2, 20),
        'booster': 'gbtree',
        'gamma': trial.suggest_float('gamma', 1.0, 10.0),
        'reg_alpha': trial.suggest_int('reg_alpha', 50, 100),
        'reg_lambda': trial.suggest_int('reg_lambda', 50, 100),
        'random_state': 42,
    }

    # Rows outside the chosen fold are used for training; rows inside it for validation.
    X_train = train_data[train_data.kfold != fold].reset_index(drop=True)
    X_valid = train_data[train_data.kfold == fold].reset_index(drop=True)

    # Pull the labels out before dropping the non-feature columns.
    y_train = X_train['target']
    X_train = X_train.drop(exclude_cols, axis=1)
    y_valid = X_valid['target']
    X_valid = X_valid.drop(exclude_cols, axis=1)

    model = XGBClassifier(**param_grid, tree_method='gpu_hist', predictor='gpu_predictor', eval_metric=['logloss'])

    # The validation set is listed last in eval_set, so early stopping
    # monitors the validation log-loss rather than the training one.
    model.fit(X_train, y_train,
              verbose=False,
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              early_stopping_rounds=200)

    # Column 1 of predict_proba is the probability of the positive class.
    preds_valid_prob = model.predict_proba(X_valid)[:, 1]
    roc_auc = roc_auc_score(y_valid, preds_valid_prob)
    return roc_auc

In [None]:
# Seed the sampler so the sequence of suggested hyperparameters is repeatable
# across runs. TPESampler is the sampler Optuna uses by default for a
# single-objective study, so only the seed changes here.
# NOTE(review): GPU training itself may still introduce small run-to-run differences.
study = optuna.create_study(
    direction="maximize",  # the objective returns ROC AUC, where higher is better
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(run, n_trials=10)

In [None]:
# Keep the hyperparameters of the best trial for the final training, and display them.
param_best = study.best_params
param_best

## Training XGBoost using best parameters

Train one XGBoost model per fold of the k-fold split, using the best hyperparameters found by Optuna.

In [None]:
# Settings that run() fixed rather than tuned. study.best_params only contains
# values sampled through trial.suggest_*, so these have to be supplied again
# for the final models to match the configuration that was actually tuned
# (in particular random_state=42).
fixed_params = {
    'objective': 'binary:logistic',
    'use_label_encoder': False,
    'booster': 'gbtree',
    'random_state': 42,
}
# Merge into one dict so a tuned value would take precedence over a fixed one.
model_params = {**fixed_params, **param_best}

# One array of test-set probabilities is appended per fold.
final_predictions = []
for fold in range(5):

    # Train on every fold except the current one; validate on the current one.
    X_train = train_data[train_data.kfold != fold].reset_index(drop=True)
    X_valid = train_data[train_data.kfold == fold].reset_index(drop=True)

    # Pull the labels out before dropping the non-feature columns.
    y_train = X_train['target']
    X_train = X_train.drop(exclude_cols, axis=1)
    y_valid = X_valid['target']
    X_valid = X_valid.drop(exclude_cols, axis=1)

    model = XGBClassifier(**model_params, tree_method='gpu_hist', predictor='gpu_predictor', eval_metric=['logloss'])

    # The validation set is listed last in eval_set, so early stopping
    # monitors the validation log-loss.
    model.fit(X_train, y_train,
              verbose=False,
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              early_stopping_rounds=200)

    # Score the held-out fold, then predict the test set with this fold's model.
    preds_valid_prob = model.predict_proba(X_valid)[:, 1]
    roc_auc = roc_auc_score(y_valid, preds_valid_prob)
    test_pred_prob = model.predict_proba(X_test)[:, 1]
    final_predictions.append(test_pred_prob)
    print(fold, roc_auc)

## Viewing final predictions

In [None]:
# Display the collected test-set probability arrays (one was appended per fold by the training loop).
final_predictions

## Using Final Predictions for Submission

In [None]:
# Average the test-set probabilities over all fold models. Taking only
# final_predictions[0] would discard every model except the first fold's.
# sum() adds the per-fold arrays element-wise; dividing by the count gives the mean.
sample_solution['target'] = sum(final_predictions) / len(final_predictions)

Writing the final submission file

In [None]:
# Write the submission frame to disk without the index column.
sample_solution.to_csv(
    path_or_buf="submission_default_xgboost_1.csv",
    index=False,
)

In [None]:
# Preview the first five rows of the submission frame.
sample_solution.head(n=5)