In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# to read csv faster (uses GPU)
import cudf

# to split dataset
from sklearn.model_selection import train_test_split
# model
from xgboost import XGBClassifier
# to split dataset in folds for cross-validation preserving the percentage of samples for each class
from sklearn.model_selection import StratifiedKFold
# to perform a randomized search for cross-validation
from sklearn.model_selection import RandomizedSearchCV
# to calculate the score
from sklearn.metrics import roc_auc_score

# garbage collector: to free-up memory when needed
import gc

# to keep track of time
import time

In [None]:
%%time
# Loading data sets using cudf (faster) and coverts to pandas (DataFrame)
train = cudf.read_csv('../input/tabular-playground-series-oct-2021/train.csv', index_col = 'id').to_pandas()
X_test = cudf.read_csv('../input/tabular-playground-series-oct-2021/test.csv', index_col = 'id').to_pandas()


train

In [None]:
# Per-column diagnostics, displayed as a tuple of arrays:
# (number of missing entries, dtype, number of unique entries)
(
    train.isna().sum().to_numpy(),
    train.dtypes.to_numpy(),
    train.nunique().to_numpy(),
)

**From the results above, we see that there are no missing values and no categorical entries.
There are features with only 2 unique entries.**

**For now, let us not do any preprocessing on the data, to see how the model performs.**

In [None]:
# Separate the label from the features, then divide the rows into two parts:
#   * "search" set (train_size=0.4) - used by the randomized hyperparameter search
#   * "opt" set (the remainder)     - fitted later with the best hyperparameters found
y = train['target']
X = train.drop(columns=['target'])
X_search, X_opt, y_search, y_opt = train_test_split(
    X, y, train_size=0.4, random_state=7
)

# The full frames are not used past this point, so drop them to free memory
del train, X, y
gc.collect()

# Base estimator for the search; tree_method='gpu_hist' makes it run faster on the GPU
model = XGBClassifier(
    tree_method='gpu_hist',
    use_label_encoder=False,
    eval_metric='auc',
    random_state=7,
)

**Now, we proceed to perform a randomized scan within "interesting" hyperparameter regions.**

In [None]:
# Let us vary the XGBoost parameters to see which setup gives the best result (score)

# perf_counter() is a monotonic clock, so the elapsed time cannot be distorted by
# system clock adjustments (unlike time.time())
start = time.perf_counter()

# define the hyperparameters and the candidate values to perform the scan over
params_rnd = {
    'n_estimators': np.arange(100, 1000, 100),
    'learning_rate': np.arange(0.01, 0.31, 0.01),
    'max_depth': np.arange(3, 12, 1),
    'subsample': np.arange(0.1, 1, 0.1),
    'colsample_bytree': np.arange(0.1, 1.1, 0.1),
    'colsample_bylevel': np.arange(0.1, 1.1, 0.1),
    'min_child_weight': np.arange(0, 10, 1),
    'reg_alpha': np.arange(0, 15, 1),
    'reg_lambda': np.arange(0, 30, 1),
}

# for cross validation with 5 splits, using StratifiedKFold to keep the same percentage of samples per class
skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)

# defining the random scan: n_iter=7 picks 7 random scenarios from params_rnd, with 'roc_auc' as the
# scoring method.
# refit=False: only best_score_ and best_params_ are read from the search (a new model is built from
# best_params_ afterwards), so the extra final fit on the whole search set is skipped.
rnd_search = RandomizedSearchCV(model, params_rnd, n_iter=7, scoring='roc_auc', cv=skfold,
                                refit=False, random_state=7)

# run the search on the search set
rnd_result = rnd_search.fit(X_search, y_search)

# print the best score in the search and the corresponding best parameters
print("Best: %f using %s" % (rnd_result.best_score_, rnd_result.best_params_))

# print the total time...
elapsed = time.perf_counter() - start
print("Seconds to run the scan: %f" % (elapsed))

# Search datasets won't be used anymore, so we remove them from memory
del X_search
del y_search
gc.collect()

**Having found the best parameter set in the scan, we use these hyperparameters to fit the remaining dataset, i.e. X_opt and y_opt.**

In [None]:
# now, using the tuned hyperparameters, we fit and evaluate the model on the "opt" set
# Defining the model
model_opt = XGBClassifier(**rnd_result.best_params_, tree_method='gpu_hist', use_label_encoder=False, 
                          eval_metric='auc', random_state=7)

# Split X_opt and y_opt into train and validation sets (80% / 20%) to make use of the
# early_stopping_rounds feature, which needs an evaluation set to monitor
X_train, X_valid, y_train, y_valid = train_test_split(X_opt, y_opt, test_size=0.2, random_state=7)

model_opt.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], eval_metric='auc', early_stopping_rounds=15)

# Report the AUC on the validation rows.
# NOTE: these rows were also used for early stopping, so this score is likely optimistic.
valid_auc = roc_auc_score(y_valid, model_opt.predict_proba(X_valid)[:, 1])
print("Validation AUC: %f" % valid_auc)

# The unsplit opt set is not used anymore, so we remove it from memory
del X_opt
del y_opt
gc.collect()

**With the model determined, we use it to make predictions on the X_test set and save them in a csv file to submit to the competition.**

In [None]:
# predict_proba returns one column per class; column 1 holds the probability
# of the positive class, which is what gets submitted
pred_test = model_opt.predict_proba(X_test)[:, 1]

# Build the submission table (one row per test id) and write it without the row index
output = pd.DataFrame(
    {
        'id': X_test.index,
        'target': pred_test,
    }
)
output.to_csv('submission_TPSOct21.csv', index=False)