In [None]:
# imports
from pathlib import Path
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

In [None]:
# Competition input directory; the last expression lists what it contains.
input_path = Path('/kaggle/input/tabular-playground-series-nov-2021/')
[entry for entry in input_path.iterdir()]

In [None]:
# Read the three competition CSVs in one pass: train, test, sample submission.
train_df, test_df, submission_df = (
    pd.read_csv(input_path / csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

(train_df.shape, test_df.shape)

In [None]:
# Peek at the expected submission layout.
submission_df.head(n=3)

In [None]:
# Peek at the first training rows.
train_df.head(n=3)

## Data sanity check

In [None]:
# Total number of missing cells in (train, test).
tuple(frame.isnull().sum().sum() for frame in (train_df, test_df))

In [None]:
# Number of training rows whose feature values repeat an earlier row
# (the id and target columns are ignored for the comparison).
feature_only = train_df.drop(columns=['id', 'target'])
int(feature_only.duplicated().sum())

## EDA

1. All the variables are reported to be continuous.
2. The target variable is binary.

In [None]:
# Separate identifiers and labels from the feature columns.
test_id = test_df['id']

# Guarded so the cell can be re-run: once 'target' has been removed from
# train_df, the previously extracted train_target is kept instead of raising.
if 'target' in train_df.columns:
    train_target = train_df['target']

# Rebind instead of mutating in place; errors='ignore' makes a second run a
# no-op rather than a KeyError. From here on train_df holds features only.
train_df = train_df.drop(columns=['id', 'target'], errors='ignore')

In [None]:
# Class balance of the target: one bar per distinct target value.
train_target_counts = train_target.value_counts()
labels = train_target_counts.index
counts = train_target_counts.values

# Explicit figure/axes with a title and axis labels so the chart stands alone.
fig, ax = plt.subplots()
ax.bar(labels, counts)
ax.set_xticks(labels)  # tick only at the observed target values
ax.set_xlabel('target')
ax.set_ylabel('number of rows')
ax.set_title('Training target distribution')
plt.show()

## Model building

In [None]:
from xgboost import XGBClassifier
import optuna
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, auc, roc_auc_score

In [None]:
# Stratified 75/25 split of the training data; `objective` reads these four
# names as globals.
X_train, X_test, y_train, y_test = train_test_split(
    train_df,
    train_target,
    test_size=0.25,
    shuffle=True,
    stratify=train_target,
    random_state=13,
)
(X_train.shape, X_test.shape)

In [None]:
def objective(trial):
    """
    Optuna objective: score one XGBoost hyper-parameter draw.

    One classifier is fitted per stratified fold of the global
    ``X_train``/``y_train``; the fold's left-out part is passed as the
    ``eval_set`` for early stopping. Every fold model then predicts the global
    hold-out ``X_test``. The positive-class probabilities are averaged across
    the fold models and the ROC AUC of that average against ``y_test`` is
    returned.
    """
    # Settings that are the same for every trial.
    fixed_settings = {
        'tree_method': 'gpu_hist',
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'use_label_encoder': False,
    }
    # Values drawn by Optuna for this trial.
    sampled_settings = {
        'eta': trial.suggest_float('eta', 1e-8, 1., log=True),
        'gamma': trial.suggest_float('gamma', 1e-8, 1., log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 9, step=2),
        'min_child_weight': trial.suggest_int('min_child_weight', 2, 10),
        'alpha': trial.suggest_float('alpha', 1e-8, 1., log=True),
        'subsample': trial.suggest_float('subsample', 0.2, 1.),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 1.),
    }

    folds = StratifiedKFold(n_splits=5, random_state=13, shuffle=True)
    holdout_probas = []

    for fit_idx, stop_idx in folds.split(X_train, y_train):
        model = XGBClassifier(**fixed_settings, **sampled_settings)
        model.fit(
            X_train.iloc[fit_idx],
            y_train.iloc[fit_idx],
            eval_metric='auc',
            eval_set=[(X_train.iloc[stop_idx], y_train.iloc[stop_idx])],
            verbose=0,
            early_stopping_rounds=100,
        )
        # Column 1 of predict_proba is the positive class.
        holdout_probas.append(model.predict_proba(X_test)[:, 1])

    # Blend the fold models by averaging their hold-out probabilities.
    blended = np.mean(holdout_probas, axis=0)
    return roc_auc_score(y_test, blended)

In [None]:
# Seed the sampler so the sequence of suggested hyper-parameters is repeatable
# across runs. TPESampler is Optuna's default sampler, so only the seed changes.
# The run stops after 50 trials or 600 seconds, whichever comes first.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=13),
)
study.optimize(objective, n_trials=50, timeout=600)

In [None]:
# Best trial found by the study, together with its hyper-parameters.
(study.best_trial, study.best_params)

In [None]:
# Refit the tuned configuration on each stratified fold of the full training
# data and collect every fold model's positive-class probabilities for the
# competition test set.
test_features = test_df.drop('id', axis=1)  # loop-invariant, so built once
test_df_preds = []
skf = StratifiedKFold(n_splits=5, random_state=13, shuffle=True)
for train_ix, valid_ix in skf.split(train_df, train_target):
    # Fold-local names on purpose: X_train/X_test/y_train/y_test are read as
    # globals by `objective`, so they must not be overwritten here.
    X_fold_train, X_fold_valid = train_df.iloc[train_ix], train_df.iloc[valid_ix]
    y_fold_train, y_fold_valid = train_target.iloc[train_ix], train_target.iloc[valid_ix]
    # xgb model
    xgb_model = XGBClassifier(**study.best_params, use_label_encoder=False, tree_method='gpu_hist',
                              objective='binary:logistic', eval_metric='auc')
    # The fold's left-out part is used as the early-stopping evaluation set.
    xgb_model.fit(X_fold_train, y_fold_train, verbose=0,
                  eval_set=[(X_fold_valid, y_fold_valid)], early_stopping_rounds=100)
    print(xgb_model.best_score)
    test_preds = xgb_model.predict_proba(test_features)[:, 1]
    test_df_preds.append(test_preds)

In [None]:
# Average the per-fold test probabilities into the submission column.
# Bracket assignment always writes a column; attribute-style assignment would
# set a plain attribute instead if 'target' were not already a column.
submission_df['target'] = np.mean(test_df_preds, axis=0)

In [None]:
# Write the submission file without the index column.
submission_df.to_csv(path_or_buf='submission.csv', index=False)