In [None]:
from numbers import Real
import pandas as pd
import numpy as np

from sklearn import ensemble
from sklearn import metrics
from sklearn import model_selection
from sklearn import decomposition
from sklearn.preprocessing import StandardScaler
from sklearn import pipeline
from functools import partial
from skopt import space
from skopt import gp_minimize 
from xgboost import XGBClassifier


import optuna
import os
# Walk the Kaggle input directory and echo the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    # Skip directories without files so no blank line is printed for them.
    if full_paths:
        print(*full_paths, sep='\n')

In [None]:
# All competition files live in one input directory; keeping that path in a
# single constant avoids repeating (and possibly mistyping) it for every file.
DATA_DIR = '../input/tabular-playground-series-nov-2021'

# Training data, test data, and the sample submission file.
df_train = pd.read_csv(f'{DATA_DIR}/train.csv')
X_test = pd.read_csv(f'{DATA_DIR}/test.csv')
sample_submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [None]:
# Preview the first five rows of the training frame.
df_train.head(n=5)

In [None]:
# Report (rows, columns) for the training and test frames, one per line.
print(df_train.shape, X_test.shape, sep='\n')

In [None]:
# Plot the distribution of the target column.
df_train['target'].hist()

From the above histogram of the target, it is clear that this is a classification problem.

In [None]:
# Total number of missing cells in each frame, counted over every row and column.
print(df_train.isna().to_numpy().sum())
print(X_test.isna().to_numpy().sum())

The training and test data contain no null values, and there appear to be no categorical features in the data.

### Dividing dependent and independent variables and adding new features
Columns like "id" are unique for every row and therefore won't contribute to our predictions, so they must be removed from both the training and test datasets.

In [None]:
# pop() removes the label column from the frame and hands it back in one step.
y = df_train.pop('target')

# Remove the row identifier from both feature sets (in place).
del df_train['id']
del X_test['id']

# What is left of the training frame is the feature matrix; from here on it is
# only reachable as X.
X = df_train
del df_train

In [None]:
# Standardize the features with scikit-learn's StandardScaler.
# The scaler is fitted on the training features ONLY; the test set is then
# transformed with those same training statistics. Re-fitting on the test set
# (fit_transform) would standardize the two sets with different means and
# variances, so the model would be asked to predict on features that are on a
# different scale than the ones it was trained on.
st_scaler = StandardScaler()
X = st_scaler.fit_transform(X)
X_test = st_scaler.transform(X_test)

In [None]:
# Confirm the shapes of the feature matrices after the id/target columns were
# removed and scaling was applied.
print(X.shape, X_test.shape, sep='\n')

### Bayesian optimization with gaussian process
#### Important note
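The following two cells are disabled by wrapping their code in triple-quoted strings. Remove the surrounding quotes to run the hyperparameter tuning process, which uses scikit-optimize's `gp_minimize` (Bayesian optimization with a Gaussian process).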

In [None]:

# DISABLED CODE: the whole objective function below is wrapped in a
# triple-quoted string, so nothing in it is executed. Remove the surrounding
# quotes to enable it.
# What the quoted code does: `optimize` zips the parameter names with a list of
# values, builds an XGBClassifier from them (configured with
# tree_method = 'gpu_hist'), evaluates it with 5-fold StratifiedKFold using
# ROC AUC on each held-out fold, and returns the NEGATED mean AUC so that a
# minimizer ends up maximizing AUC.
"""def optimize(params, param_names, x, y):

    # convert params to dictionary
    params = dict(zip(param_names, params))
    
    model = XGBClassifier(**params, random_state  = 42, 
                          use_label_encoder=False, 
                         tree_method = 'gpu_hist',
                         gpu_id = 0,
                         predictor = 'gpu_predictor')
    
    kf = model_selection.StratifiedKFold(n_splits=5)
    accuracies = []
    count=0
    for idx in kf.split(X=x, y = y):
        train_idx, test_idx = idx[0], idx[1]
        
        #xtrain, xtest = x.iloc[train_idx], x.iloc[test_idx]
        #ytrain, ytest = y.iloc[train_idx], y.iloc[test_idx]
        xtrain, xtest = x[train_idx], x[test_idx]
        ytrain, ytest = y[train_idx], y[test_idx]

        model.fit(xtrain, ytrain, 
                  eval_set = [(xtrain, ytrain), (xtest, ytest)],
              early_stopping_rounds = 100,
              eval_metric = 'auc',
             verbose = False)
        preds = model.predict_proba(xtest)[:,1]
        fold_acc = metrics.roc_auc_score(ytest, preds)
        accuracies.append(fold_acc)

    return -1.0*np.mean(accuracies)"""

In [None]:
# DISABLED CODE: the search driver below is wrapped in a triple-quoted string,
# so it does not run. Remove the surrounding quotes to enable it; it also needs
# an `optimize` function to be defined.
# What the quoted code does: declares a skopt search space over six XGBoost
# parameters (eta, max_depth, min_child_weight, subsample, colsample_bytree,
# alpha), binds the data into `optimize` with functools.partial, runs
# gp_minimize for 15 calls (the first 10 as random starts), and prints the best
# parameter values as a dict.
"""param_space = [space.Real(0.01, 0.1, name = "eta"),
               space.Integer(3,25, name = "max_depth"),
               space.Integer(1, 7, name = "min_child_weight"),
               space.Real(0.6, 1.0, name = "subsample"),
               space.Real(0.6, 1.0, name= "colsample_bytree"),
               space.Real(0.01, 1.0, name = "alpha")]

param_names = ["eta", "max_depth", "min_child_weight", "subsample", 
                "colsample_bytree", "alpha"]

optimization_function = partial(optimize, param_names = param_names, x = X, y = y)
result = gp_minimize(optimization_function, dimensions = param_space, n_calls = 15, 
                    n_random_starts = 10, verbose = 10)

best_params = dict(zip(param_names, result.x))
print(best_params)"""


## Best Parameters
{'eta': 0.04515703133340612, 'max_depth': 10, 'min_child_weight': 1, 'subsample': 0.8166775634490886, 'colsample_bytree': 0.7418578862489605, 'alpha': 0.6287561392899984}

In [None]:
# XGBoost hyperparameters used for the final models. They are hard-coded so the
# notebook runs without repeating the tuning step; the line below is the
# alternative to use when the tuning cells have been run in this session.
#optimized_params = best_params
optimized_params = {
    'eta': 0.04515703133340612,
    'max_depth': 10,
    'min_child_weight': 1,
    'subsample': 0.8166775634490886,
    'colsample_bytree': 0.7418578862489605,
    'alpha': 0.6287561392899984,
}

In [None]:
# 5-fold stratified cross-validation: train one XGBoost model per fold, score
# it on the held-out fold with ROC AUC, and accumulate the average of the
# test-set probabilities of all fold models as the final prediction.
folds = model_selection.StratifiedKFold(n_splits = 5, random_state = 42, shuffle = True)
y_pred = np.zeros(len(X_test))
scores = []
for fold, (trn_idx, val_idx) in enumerate(folds.split(X, y)):

    # split() yields positional row numbers. X is indexed positionally here;
    # the pandas target must be indexed with .iloc for the same reason --
    # plain y[...] is a label lookup and only matches positions while the
    # index is the default 0..n-1 range.
    X_train, X_val = X[trn_idx], X[val_idx]
    y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx]

    # A new model is created for every fold.
    model =  XGBClassifier(**optimized_params, random_state  = 42,
                          use_label_encoder=False,
                         tree_method = 'gpu_hist',
                         gpu_id = 0,
                         eval_metric = 'auc',
                         predictor = 'gpu_predictor')

    # NOTE(review): n_estimators is not set, so the library default applies;
    # confirm that early_stopping_rounds = 100 can actually trigger within
    # that number of boosting rounds.
    model.fit(X_train, y_train, eval_set = [(X_train, y_train), (X_val, y_val)],
              verbose = False, early_stopping_rounds = 100)

    # Probability of the positive class on the held-out fold -> fold AUC.
    final_preds = model.predict_proba(X_val)[:,1]
    fold_score = metrics.roc_auc_score(y_val, final_preds)
    scores.append(fold_score)

    # Each fold contributes 1/n_splits of the final test prediction.
    y_pred += model.predict_proba(X_test)[:,1] / folds.n_splits

print(scores)

In [None]:
# Preview the first five rows of the submission frame.
sample_submission.head(n=5)

In [None]:
# Overwrite the target column with the fold-averaged predictions.
sample_submission['target'] = y_pred

# Write the submission file without the DataFrame index column.
submission_path = 'Submission.csv'
sample_submission.to_csv(submission_path, index=False)