In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found in it
for folder, _, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# to read CSV files faster (uses GPU)
import cudf

# to split dataset
from sklearn.model_selection import train_test_split
# model
from xgboost import XGBClassifier
# to split dataset in folds for cross-validation preserving the percentage of samples for each class
from sklearn.model_selection import StratifiedKFold
# to perform a randomized search for cross-validation
from sklearn.model_selection import RandomizedSearchCV

# to perform a hyperparameter scan using Bayesian Optimization
from skopt import BayesSearchCV
# parameter ranges are specified by one of below
from skopt.space import Real, Categorical, Integer

# to calculate the score
from sklearn.metrics import roc_auc_score

# garbage collector: to free-up memory when needed
import gc

# to keep track of time
import time

# Standard Scaling
from sklearn.preprocessing import StandardScaler
# Preprocessing
from sklearn.compose import ColumnTransformer
# Pipeline
from sklearn.pipeline import Pipeline

In [None]:
%%time
# Load the training set with cudf (faster CSV reader) and convert it to a pandas DataFrame,
# using the 'id' column as the index.
train = cudf.read_csv('../input/tabular-playground-series-oct-2021/train.csv', index_col = 'id').to_pandas()

# Cheap sanity check of what was loaded (rows, columns). The full summary statistics are
# deliberately not computed here so that the cell timing reflects the loading step only.
train.shape

In [None]:
# Display pandas' summary statistics for the training frame
train.describe()

In [None]:
# Check that every column is normalised, i.e. that all values lie within [0, 1].
# The column-wise extrema are computed once for the whole frame instead of once per column.
col_max = train.max()
col_min = train.min()

# columns with at least one value above 1
outlmax = col_max[col_max > 1].index.tolist()
# columns with at least one value below 0 (the lower bound must be tested on the minimum,
# not on the maximum, otherwise negative values are missed whenever the max is non-negative)
outlmin = col_min[col_min < 0].index.tolist()

print(outlmax, outlmin)

In [None]:
# Per-column diagnostics, displayed as a tuple of arrays:
# number of missing entries, dtype, and number of unique entries
(
    train.isna().sum().to_numpy(),
    train.dtypes.to_numpy(),
    train.nunique().to_numpy(),
)

****From the results above, we see that there are no missing values and no categorical entries.
There are features with only 2 unique entries.****

****Let us preprocess the non-binary features (those with more than two unique values) with StandardScaler****

In [None]:
# Separate the label column (y) from the features (X), then carve the data into two parts:
#   * X_train / y_train : the "search" set (train_size = 0.4), used for the hyperparameter scan
#   * X_valid / y_valid : the "opt" set (the remainder), later fitted by the model built with
#                         the best hyperparameter set found in the scan
y = train['target']
X = train.drop('target', axis=1)
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.4,
    random_state=7,
)

# The full frame and the unsplit X / y are not used from here on: release their memory
del train, X, y
gc.collect()

In [None]:
# Preprocessing
# Collect the names of the non-binary features: columns with more than 2 unique values
nb_cols = X_train.columns[(X_train.nunique() > 2).to_numpy()].tolist()

# Scaling of the non-binary columns (nb_cols) with StandardScaler - currently disabled
#sc = StandardScaler()
#prepr = ColumnTransformer(transformers=[('stdscaler', sc, nb_cols)], remainder='passthrough')

# Model used during the hyperparameter scan; tree_method='gpu_hist' makes it run faster on the GPU
model_search = XGBClassifier(
    tree_method='gpu_hist',
    use_label_encoder=False,
    eval_metric='auc',
    random_state=7,
)

# Pipeline chaining prepr and the model - currently disabled
#pipe = Pipeline(steps=[('preprocessing', prepr), ('model', model)])

****Now, we proceed to perform a Bayesian-optimization scan within "interesting" hyperparameter regions****

In [None]:
# Scan over the XGBoost hyperparameters to find the setup that gives the best score.

# Hyperparameters to scan, each with the range it is sampled from
search_spaces = {
    'n_estimators': Integer(100, 1000),
    'learning_rate': Real(0.01, 0.31, prior='log-uniform'),
    'max_depth': Integer(3, 14),
    'subsample': Real(0.1, 1, prior='log-uniform'),
    'colsample_bytree': Real(0.1, 1, prior='log-uniform'),
    'colsample_bylevel': Real(0.1, 1, prior='log-uniform'),
    'min_child_weight': Integer(0, 10),
    'reg_alpha': Integer(0, 15),
    'reg_lambda': Integer(0, 30),
}

# Reduced alternative search space (disabled):
# search_spaces = {'n_estimators': Integer(100, 1000),'learning_rate': Real(0.01, 0.31, 'log-uniform'), 'max_depth': Integer(3, 14),
#                  'subsample': Real(0.1, 1, 'log-uniform')}

# 5-fold cross-validation; StratifiedKFold keeps the same percentage of samples of each class in every fold
skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)

# Bayesian scan: n_iter=7 (picks 7 scenarios), with 'roc_auc' as the scoring method
b_search = BayesSearchCV(
    estimator=model_search,
    search_spaces=search_spaces,
    n_iter=7,
    scoring='roc_auc',
    cv=skfold,
    random_state=7,
)

print(b_search.total_iterations)

In [None]:
# Run the scan on the search set and measure how long it takes
start = time.time()
b_result = b_search.fit(X_train, y_train)

# Report the best score reached in the scan together with the parameters that produced it
print("Best: %f using %s" % (b_result.best_score_, b_result.best_params_))

# Report the total time spent
elapsed = time.time() - start
print("Seconds to run the scan: %f" % elapsed)

# The search set is not used after this point, so it is removed from memory
del X_train, y_train
gc.collect()

****Having found the best parameter set in the scan, we use these hyperparameters to fit the remaining dataset, i.e. X_valid and y_valid****

In [None]:
# Build the final model from the tuned hyperparameters; it is fitted and tested on the "opt" set
model_opt = XGBClassifier(
    tree_method='gpu_hist',
    use_label_encoder=False,
    eval_metric='auc',
    random_state=7,
    **b_result.best_params_,
)

# Record of a previously found optimal hyperparameter set (disabled):
#opt_params = {'learning_rate': 0.0335660337575312, 'max_depth': 9, 'n_estimators': 963, 'subsample': 0.9883632916174595}
#model_opt = XGBClassifier( n_estimators = 963, learning_rate = 0.0335660337575312, max_depth = 9, subsample= 0.9883632916174595, tree_method='gpu_hist', use_label_encoder=False, eval_metric='auc', random_state=7)

# Pipeline variant reusing the same preprocessing as before, prepr (disabled):
#pipe_opt = Pipeline(steps=[('preprocessing', prepr), ('model', model_opt)])

# Split X_valid / y_valid once more into training and validation parts, so that the
# early_stopping_rounds feature has an evaluation set to monitor
X_train_2, X_valid_2, y_train_2, y_valid_2 = train_test_split(
    X_valid,
    y_valid,
    test_size=0.2,
    random_state=7,
)

# Pipeline variant (disabled): to use eval_set / early stopping inside the pipeline, the eval_set
# has to be preprocessed beforehand, otherwise we get an error. prepr is fitted on the training
# part and then used to transform the validation part.
# X_train_2_prepr = prepr.fit_transform(X_train_2)
# X_valid_2_prepr = prepr.transform(X_valid_2)
# eval_set_prepr = [(X_valid_2_prepr, y_valid_2)]

# The unsplit "opt" set is no longer needed
del X_valid, y_valid
# del X_valid_2
# del X_train_2_prepr
gc.collect()

In [None]:
# Pipeline variant (disabled):
#pipe_opt.fit(X_train_2, y_train_2, model__eval_set=eval_set_prepr, model__early_stopping_rounds=15)
# pipe_opt.fit(X_valid, y_valid)

# Fit the tuned model, passing (X_valid_2, y_valid_2) as the evaluation set for early stopping
model_opt.fit(
    X_train_2,
    y_train_2,
    eval_set=[(X_valid_2, y_valid_2)],
    early_stopping_rounds=10,
)

# The training part is not needed after fitting: free its memory
del X_train_2, y_train_2
# del X_valid_2_prepr
# del y_valid
gc.collect()

****With the model determined, we use it to make predictions based on the X_test set and save them in a csv file to submit for the competition****

In [None]:
# Read the test set with cudf and convert it to a pandas DataFrame indexed by 'id'
X_test = cudf.read_csv('../input/tabular-playground-series-oct-2021/test.csv', index_col = 'id').to_pandas()

# Keep only the second column of predict_proba: the probability of the positive prediction
pred_test = model_opt.predict_proba(X_test)[:, 1]

# Assemble the submission table (id, target) and write it to disk without the DataFrame index
output = pd.DataFrame(dict(id=X_test.index, target=pred_test))
output.to_csv('submission_TPSOct21.csv', index=False)