In [None]:
from numbers import Real
import pandas as pd
import numpy as np

from sklearn import ensemble
from sklearn import metrics
from sklearn import model_selection
from sklearn import decomposition
from sklearn.preprocessing import StandardScaler
from sklearn import pipeline
from functools import partial
from skopt import space
from skopt import gp_minimize 
from xgboost import XGBClassifier


import optuna
import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
# Read the training set, the test set and the submission template, in that order.
df_train, X_test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-nov-2021/train.csv',
        '../input/tabular-playground-series-nov-2021/test.csv',
        '../input/tabular-playground-series-nov-2021/sample_submission.csv',
    )
)

In [None]:
# Preview the first five training rows (rendered as the cell output).
df_train.head(n=5)

In [None]:
# (rows, columns) of the raw training and test frames, one per line.
print(df_train.shape, X_test.shape, sep='\n')

### Dividing dependent and independent variables and adding new features
Columns like "id" are unique for every row, so they won't contribute to our predictions. Therefore, they must be removed from both the training and test datasets.

In [None]:
# DataFrame.pop removes the column in place and returns it, so the target
# can be captured and dropped in a single step.
y = df_train.pop('target')

# Drop the row identifier from both frames.
del df_train['id']
del X_test['id']

# What remains of the training frame is the feature matrix; retire the old name.
X = df_train
del df_train

In [None]:
# (rows, columns) of the feature matrices after dropping 'target' and 'id'.
print(X.shape, X_test.shape, sep='\n')

In [None]:
# Standardise every feature to zero mean and unit variance.
# The scaler is fitted on the training data only. The test set is then
# transformed with those same training statistics, so both sets share one
# feature scale. Re-fitting on the test set would shift and rescale it with
# different means/variances than the ones the model is trained on.
st_scaler = StandardScaler()
X = st_scaler.fit_transform(X)
X_test = st_scaler.transform(X_test)

### Optuna Search using XGBClassifier
#### Important note
The following two cells are disabled by wrapping their code in triple-quoted strings. Remove the surrounding quotes to run the hyperparameter tuning process, which uses Optuna.

In [None]:
"""def optimize(trial, x, y):

   
    n_estimators = trial.suggest_int('n_estimators', 100, 1500)
    max_depth = trial.suggest_int('max_depth', 3, 25)
    learning_rate = trial.suggest_uniform('learning_rate', 1e-3, 0.25)
    subsample = trial.suggest_uniform('subsample', 0.6, 1.0)
    colsample_bytree = trial.suggest_uniform('colsample_bytree', 0.6, 1)
    

    
    model = XGBClassifier(n_estimators = n_estimators, max_depth = max_depth, 
                          learning_rate = learning_rate, 
                          subsample = subsample,
                         colsample_bytree = colsample_bytree,
                            random_state  = 42, 
                          use_label_encoder=False, 
                         tree_method = 'gpu_hist',
                         gpu_id = 0,
                         predictor = 'gpu_predictor')
    

    kf = model_selection.StratifiedKFold(n_splits=5)
    accuracies = []
    for idx in kf.split(X=x, y = y):
        train_idx, test_idx = idx[0], idx[1]
        xtrain = x[train_idx]
        ytrain = y[train_idx]

        xtest = x[test_idx]
        ytest = y[test_idx]

        model.fit(xtrain, ytrain, 
                  eval_set = [(xtest, ytest)],
              early_stopping_rounds = 100,
              eval_metric = 'auc',
             verbose = False)
        preds = model.predict_proba(xtest)[:,1]
        fold_acc = metrics.roc_auc_score(ytest, preds)
        accuracies.append(fold_acc)

    return -1.0*np.mean(accuracies)"""

In [None]:
"""optimization_function = partial(optimize, x=X, y=y)
    
study = optuna.create_study(direction = "minimize")
study.optimize(optimization_function, n_trials=15)"""

## Best Parameters
{'n_estimators': 1243, 'max_depth': 3, 'learning_rate': 0.11716080504654952, 'subsample': 0.8098472503300551, 'colsample_bytree': 0.7047898383605972}

In [None]:
# Hyperparameters forwarded to XGBClassifier as keyword arguments.
best_params = {
    'n_estimators': 1243,
    'max_depth': 3,
    'learning_rate': 0.11716080504654952,
    'subsample': 0.8098472503300551,
    'colsample_bytree': 0.7047898383605972,
}

In [None]:
# 5-fold stratified cross-validation with the chosen hyperparameters.
# Each fold trains a fresh XGBClassifier, scores it on its held-out fold with
# ROC AUC, and adds 1/n_splits of its test-set probabilities to `y_pred`, so
# `y_pred` ends up as the average prediction over all fold models.
folds = model_selection.StratifiedKFold(n_splits = 5, random_state = 42, shuffle = True)
y_pred = np.zeros(len(X_test))
scores = []

# The splitter yields integer POSITIONS. Indexing a pandas Series with them via
# y[...] is label-based and only works with a default RangeIndex, so take a
# plain array view of the target and index that positionally instead.
y_values = np.asarray(y)

for fold, (trn_idx, val_idx) in enumerate(folds.split(X, y_values)):

    X_train, X_val = X[trn_idx], X[val_idx]
    y_train, y_val = y_values[trn_idx], y_values[val_idx]

    # Early stopping monitors 'auc' so that the stopping criterion matches the
    # ROC AUC used to score each fold below (it previously monitored 'error').
    model = XGBClassifier(**best_params, random_state = 42,
                          use_label_encoder = False,
                          tree_method = 'gpu_hist',
                          gpu_id = 0,
                          eval_metric = 'auc',
                          predictor = 'gpu_predictor')

    # Only the validation fold is passed as eval_set: it is the set that drives
    # early stopping, and per-round training-set metrics were never read.
    model.fit(X_train, y_train, eval_set = [(X_val, y_val)],
              verbose = False, early_stopping_rounds = 100)

    # Probability of the positive class on the held-out fold.
    final_preds = model.predict_proba(X_val)[:, 1]
    fold_score = metrics.roc_auc_score(y_val, final_preds)
    scores.append(fold_score)
    print(f"Fold {fold}: AUC = {fold_score:.5f}")

    # Accumulate this fold's share of the averaged test prediction.
    y_pred += model.predict_proba(X_test)[:, 1] / folds.n_splits

print(scores)

In [None]:
# Preview the first five rows of the submission template (rendered as the cell output).
sample_submission.head(n=5)

In [None]:
# Overwrite the template's 'target' column with the fold-averaged probabilities,
# then save the frame without the index column.
sample_submission['target'] = y_pred
sample_submission.to_csv(path_or_buf='Submission.csv', index=False)