In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# to read csv faster (uses GPU)
import cudf

# to split dataset
from sklearn.model_selection import train_test_split
# model
from xgboost import XGBClassifier
# to split dataset in folds for cross-validation preserving the percentage of samples for each class
from sklearn.model_selection import StratifiedKFold
# to perform a randomized search for cross-validation
from sklearn.model_selection import RandomizedSearchCV
# to calculate the score
from sklearn.metrics import roc_auc_score

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.feature_selection import mutual_info_classif

# to perform a hyperparameter scan using Bayesian Optimization
from skopt import BayesSearchCV
# parameter ranges are specified by one of below
from skopt.space import Real, Categorical, Integer

# garbage collector: to free-up memory when needed
import gc

# to keep track of time
import time


from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA

In [None]:
%%time
# Loading data sets using cudf (faster) and coverts to pandas (DataFrame)
train = cudf.read_csv('../input/tabular-playground-series-nov-2021/train.csv', index_col = 'id').to_pandas()
train

In [None]:
# Per-column overview: number of missing entries, dtype, and number of unique entries
(
    train.isna().sum().values,
    train.dtypes.values,
    train.nunique().values,
)

In [None]:
# Per-column flags: is the minimum at or below -5, and is the maximum at or above 5
(
    train.min().values <= -5,
    train.max().values >= 5,
)

In [None]:
# All features are continuous, and some seem to need standardisation.
# Separate the features (X) from the label (y), then split 50/50 into:
#   - a "search" set, used for the randomized hyperparameter search, and
#   - an "opt" set, meant to be fitted later with the best hyperparameters found.
X = train.drop(columns='target')
y = train['target']
X_search, X_opt, y_search, y_opt = train_test_split(X, y, train_size=0.5, random_state=7)

# Only the search split is used in the next cells, so drop everything else to free memory
del train, X, y
del X_opt, y_opt
gc.collect()

In [None]:
# Feature selection by mutual information (MI): keep only the features whose MI score
# with the target is above a small threshold.

# boolean mask telling mutual_info_classif which columns hold integer (discrete) values
discrete_features = X_search.dtypes == int

# random_state is fixed so that the MI scores - and therefore the selected columns -
# are reproducible between runs, like every other seeded step in this notebook
mi_scores = pd.Series(
    mutual_info_classif(
        X_search,
        y_search,
        discrete_features=discrete_features,
        random_state=7,
    ),
    index=X_search.columns,
).sort_values(ascending=False)

# names of the features that pass the MI threshold, highest score first
mi_cols = list(mi_scores[mi_scores.values > 0.0001].index)

# restrict the search set to the selected features
X_search = X_search[mi_cols]
X_search

In [None]:
# Preprocessing: standardise every column currently in X_search; any other column
# would be passed through unchanged
sc = StandardScaler()
prep = ColumnTransformer(
    transformers=[('sc', sc, X_search.columns)],
    remainder='passthrough',
)

# Defining the model: 'gpu_hist' is important to run it faster with GPU
model = XGBClassifier(
    tree_method='gpu_hist',
    use_label_encoder=False,
    eval_metric='auc',
    random_state=7,
)

# Full pipeline: preprocessing followed by the classifier
pipe = Pipeline(
    steps=[
        ('preprocessing', prep),
        ('model', model),
    ]
)

In [None]:
# Benchmark, adapted from TPC Oct 21 (randomized search, no scaling)
# Best: 0.740070 using {'subsample': 0.30000000000000004, 'n_estimators': 800, 'max_depth': 3, 'learning_rate': 0.07600000000000001}
# Seconds to run the scan: 1208.465334


# Randomized scan over a few XGBoost hyperparameters to see which setup gives the best score

# reference point for the elapsed time printed at the end of the cell
start = time.time()

# Wider grid tried previously (kept for reference):
# params_rnd = {'n_estimators':np.arange(100, 1000, 100),'learning_rate':np.arange(0.01, 0.31, 0.01),
#           'max_depth':np.arange(3, 12, 1), 'subsample':np.arange(0.1, 1, 0.1), 'colsample_bytree':np.arange(0.1, 1.1, 0.1),
#          'colsample_bylevel':np.arange(0.1, 1.1, 0.1), 'min_child_weight':np.arange(0,10,1), 'reg_alpha':np.arange(0,15,1), 
#               'reg_lambda':np.arange(0,30,1)}


# Candidate values for each hyperparameter; the "model__" prefix addresses the
# 'model' step of the pipeline
params_rnd = {
    'model__n_estimators': np.arange(300, 1000, 50),
    'model__learning_rate': np.arange(0.006, 0.21, 0.01),
    'model__max_depth': np.arange(3, 12, 1),
    'model__subsample': np.arange(0.1, 1, 0.1),
}

# cross validation with 3 shuffled splits, using StratifiedKFold to keep the same
# percentage of samples of each class in every fold
skfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=7)

# the random scan: n_iter=20 picks 20 random scenarios from params_rnd, scored with 'roc_auc'
rnd_search = RandomizedSearchCV(
    pipe,
    params_rnd,
    n_iter=20,
    scoring='roc_auc',
    cv=skfold,
    random_state=7,
)

# make sure only the MI-selected columns are fed to the search
X_search = X_search[mi_cols]
# run the scan on the search set
rnd_result = rnd_search.fit(X_search, y_search)

# report the best cross-validated score and the hyperparameters that produced it
print("Best: %f using %s" % (rnd_result.best_score_, rnd_result.best_params_))

# report how long the whole scan took
elapsed = time.time() - start
print("Seconds to run the scan: %f" % elapsed)

# The search split is not used after this point, so remove it from memory
del X_search, y_search
gc.collect()

In [None]:
# rnd_result.best_params_ uses pipeline-style keys ("model__<param>"), but XGBClassifier
# expects the bare parameter names, so the "model__" step prefix is stripped from every key.
#
# A new dict is built instead of renaming keys in place: popping and inserting keys while
# iterating over the same dict can make the iteration skip entries or raise RuntimeError,
# which is why an in-place loop leaves some "model__" keys untouched.
step_prefix = "model__"
best_parameters_rnd = {
    (name[len(step_prefix):] if name.startswith(step_prefix) else name): value
    for name, value in rnd_result.best_params_.items()
}

best_parameters_rnd

In [None]:
# Using the tuned hyperparameters, fit the final pipeline on the "opt" split.

# Model with the best hyperparameters found by the randomized search
model_opt = XGBClassifier(**best_parameters_rnd, tree_method='gpu_hist', use_label_encoder=False, 
                          eval_metric='auc', random_state=7)

# Same preprocessing step as in the search pipeline, followed by the tuned model
pipe_opt = Pipeline(steps=[('preprocessing', prep), ('model', model_opt)])

# Reload the training data (it was deleted earlier to free memory) and split it again
train = cudf.read_csv('../input/tabular-playground-series-nov-2021/train.csv', index_col = 'id').to_pandas()
X = train.drop(columns='target')
y = train.target
# the raw frame is not needed once X and y have been extracted
del train

# NOTE(review): this split uses train_size=0.3 while the search cell used 0.5, so X_opt
# here is not the same hold-out as the one created before the search - confirm intended
X_search, X_opt, y_search, y_opt = train_test_split(X, y, train_size = 0.3, random_state=7)

# Only the opt split is fitted below, so remove everything else from memory
del X
del y
del X_search
del y_search
gc.collect()

# fit on the MI-selected columns only, matching the columns the preprocessing step was built with
pipe_opt.fit(X_opt[mi_cols], y_opt)

del X_opt
del y_opt

In [None]:
### Compute the predicted probability of the positive class on the test set and store it

X_test = (
    cudf.read_csv('../input/tabular-playground-series-nov-2021/test.csv', index_col = 'id')
    .to_pandas()
)

# column 1 of predict_proba is the probability of the positive class;
# only the MI-selected columns are passed, as in training
pred_test = pipe_opt.predict_proba(X_test[mi_cols])[:, 1]

# submission file: one row per test id with its predicted probability
output = pd.DataFrame(
    {
        'id': X_test.index,
        'target': pred_test,
    }
)
output.to_csv('submission_TPSNov21.csv', index=False)

In [None]:
# # Let us vary the XGBoost parameters to see which setup gives the best result (score)


# # define the hyperparameters and the ranges to perform the scan
# # search_spaces = {'model__n_estimators': Integer(60, 500),'model__learning_rate': Real(0.001, 0.2, 'log-uniform'), 'model__max_depth': Integer(2, 12), 
# #                  'model__subsample': Real(0.1, 1, 'log-uniform'), 'model__colsample_bytree': Real(0.1, 1, 'log-uniform'),
# #                  'model__colsample_bylevel':Real(0.1, 1, 'log-uniform'), 'model__min_child_weight': Integer(0, 10), 'model__reg_alpha': Integer(0, 15), 
# #                  'model__reg_lambda': Integer(0, 50)}

# search_spaces = {'model__n_estimators': Integer(400, 1200),'model__learning_rate': Real(0.006, 0.21, 'log-uniform'), 'model__max_depth': Integer(3, 12), 
#                  'model__subsample': Real(0.1, 1, 'log-uniform')}

# # for cross validation with 3 splits, using StratifiedKFold to keep the same percentage of samples for each class
# skfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=7)

# # defining a Bayes scan, n_iter=30 (evaluates 30 scenarios), using 'accuracy' as the scoring method
# search_bay = BayesSearchCV(pipe, search_spaces, n_iter=30, scoring='accuracy', cv=skfold, random_state=7)

In [None]:
# # start counting how long the fitting takes
# start = time.time()
# # fit with the whole dataset
# result_bay = search_bay.fit(X_search, y_search)

# # print the best score and parameters found during the scan
# print("(Bayes) Best: %f using %s" % (result_bay.best_score_, result_bay.best_params_))
# elapsed = time.time() - start
# print("Time to run the scan: %f" % (elapsed))

In [None]:
# # In order to use result_bay.best_params_ in the next model, we need to remove "model__" from the keys
# best_parameters_bay = dict(result_bay.best_params_.copy())

# for k in best_parameters_bay.keys():
#     best_parameters_bay[k.replace("model__","")] = best_parameters_bay.pop(k)

# # for some reason, the loop is not replacing two model__ instances, so we do it one by one
# best_parameters_bay['n_estimators'] = best_parameters_bay.pop('model__n_estimators')
# best_parameters_bay['subsample'] = best_parameters_bay.pop('model__subsample')

# best_parameters_bay

In [None]:
# # now, we fit using the role train data and the best parameters in the scan

# model_opt_B = XGBClassifier(**best_parameters_bay, eval_metric='error', use_label_encoder=False, tree_method='gpu_hist')

# # Defining the pipeline with the same preprocessing as before, but with the tuned model
# pipe_opt_B = Pipeline(steps=[('preprocessing', prep), ('model', model_opt_B)])

# train = cudf.read_csv('../input/tabular-playground-series-nov-2021/train.csv', index_col = 'id').to_pandas()
# X = train.drop(columns='target')
# y = train.target
# X_search, X_opt, y_search, y_opt = train_test_split(X, y, train_size = 0.5, random_state=7)

# # Search datasets won't be used anymore, so we remove them from memory
# del X
# del y
# del X_search
# del y_search
# gc.collect()


# # Fitting the whole dataset
# pipe_opt_B.fit(X_opt, y_opt)

# ### We calculate and store the probability of the positive prediction
# X_test = cudf.read_csv('../input/tabular-playground-series-nov-2021/test.csv', index_col = 'id').to_pandas()

# pred_test_B = pipe_opt_B.predict_proba(X_test)[:,1]

# outputB = pd.DataFrame({'id': X_test.index,
#                        'target': pred_test_B})
# outputB.to_csv('submission_TPSNov21_B.csv', index=False)


# outputB