In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation and failed-fit warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

In [None]:
!pip install scikit-learn  -U

In [None]:
import random
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import sklearn
import gc

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline

from xgboost import XGBClassifier

In [None]:
# Notebook-wide random seed, passed to the sampler, the splits and the models below.
seed = 47

# Tabular Playground Series - Oct 2021

The Tabular Playground Series on Kaggle is meant to help novices in the data science field, like me, get acquainted with Kaggle competitions.

The dataset created for this competition is synthetic, but based on a real dataset and generated using a CTGAN. The original dataset deals with predicting the biological response of molecules given various chemical properties.

The first step in almost every data science project is to perform some exploratory data analysis, which was already done in a previous notebook [1]. Here we will use some assumptions based on that analysis to
verify whether we can obtain better performance. To begin with, we will use only 100k samples of the original data. Here we try some baseline models and also run a grid search over several data transformation methods.

[1] https://www.kaggle.com/peressim/tabular-playground-series-oct-2021-eda

In [None]:
def evaluate_model(model, x, y):
    """Score a fitted classifier with ROC AUC.

    Parameters
    ----------
    model : fitted estimator exposing ``predict_proba``.
    x : feature matrix handed to ``model.predict_proba``.
    y : true labels aligned with the rows of ``x``.

    Returns
    -------
    dict
        ``{'auc_roc_curve': roc_auc_score(y, scores)}`` where ``scores`` is
        column 1 of ``model.predict_proba(x)``.
    """
    # Only the probabilities are needed for ROC AUC; no hard-label prediction
    # pass is run, since its result would never be used.
    y_pred_prob = model.predict_proba(x)[:, 1]
    return {'auc_roc_curve': roc_auc_score(y, y_pred_prob)}

In [None]:
# Read a reproducible random subset of train.csv by telling pandas which file lines to skip.
random.seed(seed)
n = 1000000  # data rows in train.csv, header excluded (assumed from the 10% / 100k figures in the text)
s = 100000   # data rows to keep

# skiprows uses 0-indexed file lines and line 0 is the header, so the data rows
# occupy lines 1..n. Sampling from range(1, n + 1) guarantees the header is never
# skipped and that the last data row is as likely to be skipped as any other.
skip = sorted(random.sample(range(1, n + 1), n - s))

train_df = pd.read_csv('/kaggle/input/tabular-playground-series-oct-2021/train.csv', sep=',', skiprows=skip)

In [None]:
# Preview the first rows of the sampled training frame.
train_df.head()

In [None]:
# Report (rows, columns) of the sampled training frame.
print('Dataset shape: ', train_df.shape )

In [None]:
# Column dtypes and memory usage of the sampled training frame.
train_df.info()

In [None]:
# Sanity check for balanced number of classes in the target variable

# The column is passed by keyword: recent seaborn releases make x/y keyword-only
# and would interpret a positional Series as `data` instead of `x`.
sns.countplot(x=train_df['target'])
plt.title('Distribution of classes in target variable (target) \n')
plt.xlabel('Target')
plt.ylabel('Count')
plt.show()

# Data splitting

Here we split the data into train and test sets

In [None]:
def get_train_test_split(test_size=0.2):
    """Split the global ``train_df`` into train and test arrays.

    The 'id' and 'target' columns are removed from the features, 'target' is
    the label vector, and the split is seeded with the global ``seed``.

    Parameters
    ----------
    test_size : forwarded to ``train_test_split`` (default 0.2).

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``; the shapes of all four are printed.
    """
    features = train_df.drop(['id', 'target'], axis=1).values
    labels = train_df['target'].values
    x_train, x_test, y_train, y_test = train_test_split(
        features, labels, test_size=test_size, random_state=seed
    )

    for part, xs, ys in (('train', x_train, y_train), ('test', x_test, y_test)):
        print('x_' + part, xs.shape, 'y_' + part, ys.shape)
    return x_train, x_test, y_train, y_test

In [None]:
# Hold-out split used by the baseline experiments (default test_size of 0.2).
x_train, x_test, y_train, y_test = get_train_test_split()

# Baseline

Baseline models to choose the methods that will allow the model to achieve a good performance.

1 - Logistic Regression

2 - XGBoost

All the tests will be based on a fraction of 10% of all available data

# Experiment - 1

Here we just test the performance of logistic regression and XGBoost on the dataset as it is. No preprocessing of the data is performed.

In [None]:
# Baseline 1: logistic regression on the raw features, scored by ROC AUC on the hold-out split.
print("Fitting a simple Logistic Regression model")
model = LogisticRegression(random_state=seed, solver='liblinear')
model.fit(x_train, y_train)
# Accuracy is not computed here: its value was never displayed or stored.
results = evaluate_model(model, x_test, y_test)
print(results)

In [None]:
# Baseline 2: XGBoost on the raw features, scored by ROC AUC on the hold-out split.
print("Fitting XGBoost Classifier")
# Seeded with the notebook-wide `seed`, like the other XGBoost models in this notebook.
model = XGBClassifier(random_state=seed, tree_method='gpu_hist', predictor='gpu_predictor', use_label_encoder=False, verbosity=0)
model.fit(x_train, y_train)
results = evaluate_model(model, x_test, y_test)
print(results)

# GridCV on Data Preparation

Before doing anything else with the baseline models, we will perform a grid search on data preprocessing techniques.

References:

    [1]  https://machinelearningmastery.com/grid-search-data-preparation-techniques/

In [None]:
# get modeling pipelines to evaluate
def get_pipelines(model):
    """Wrap ``model`` behind each candidate preprocessing recipe.

    Parameters
    ----------
    model : estimator used as the final step (named 'm') of every pipeline.
        The same object is shared by all returned pipelines.

    Returns
    -------
    list of (str, Pipeline)
        One ``(name, pipeline)`` pair per recipe, in this order: 'norm', 'std',
        'quan', 'pca', 'svd', 'std-power', 'min-max-power', 'power'.
    """
    # Each recipe is (name, preprocessing steps); the model step is appended below.
    recipes = [
        # normalize
        ('norm', [('s', MinMaxScaler())]),
        # standardize
        ('std', [('s', StandardScaler())]),
        # quantile
        ('quan', [('s', QuantileTransformer(n_quantiles=100, output_distribution='normal'))]),
        # pca
        ('pca', [('s', PCA())]),
        # svd
        ('svd', [('s', TruncatedSVD())]),
        # standardize and power
        ('std-power', [('s', StandardScaler()), ('p', PowerTransformer())]),
        # scale and power
        ('min-max-power', [('s', MinMaxScaler()), ('p', PowerTransformer())]),
        # power only
        ('power', [('p', PowerTransformer())]),
    ]
    return [(label, Pipeline(steps + [('m', model)])) for label, steps in recipes]

In [None]:
def score_model(x, y, model):
    """Cross-validate ``model`` on ``(x, y)`` and return the per-fold ROC AUC scores.

    Uses repeated stratified k-fold (10 splits, 3 repeats, random_state=1),
    i.e. 30 scores, with the folds evaluated through ``n_jobs=-1``.
    """
    folds = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    return cross_val_score(model, x, y, scoring='roc_auc', cv=folds, n_jobs=-1)

# Logistic Regression - Pipelines

In [None]:
# Cross-validate logistic regression behind every preprocessing pipeline,
# then once more on the untransformed features.
model = LogisticRegression(random_state=seed, solver='liblinear')
pipelines = get_pipelines(model)
y_train = train_df['target'].values
x_train = train_df.drop(['id', 'target'], axis=1).values

results, names = [], []
# The bare model goes last so it is reported as 'No-transform' after all pipelines.
for name, estimator in pipelines + [('No-transform', model)]:
    scores = score_model(x_train, y_train, estimator)
    # mean (std) of the fold scores
    print('>%s: %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))
    names.append(name)
    results.append(scores)

In [None]:
# Distribution of the cross-validation scores for each logistic-regression pipeline.
fig, ax = plt.subplots(figsize=(15, 8))
ax.boxplot(results, labels=names, showmeans=True)
# Title and axis labels let the figure stand on its own next to the XGBoost one.
ax.set_title('Logistic Regression: cross-validated ROC AUC per preprocessing pipeline')
ax.set_xlabel('Preprocessing pipeline')
ax.set_ylabel('ROC AUC')
plt.show()

# XGBoost - Pipelines

In [None]:
# Cross-validate XGBoost behind every preprocessing pipeline, then once more on
# the untransformed features.
# Variant without the GPU settings: XGBClassifier(random_state=seed, verbosity=0)
model = XGBClassifier(
    random_state=seed,
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    use_label_encoder=False,
    verbosity=0,
)
pipelines = get_pipelines(model)
y_train = train_df['target'].values
x_train = train_df.drop(['id', 'target'], axis=1).values

results, names = [], []
# The bare model goes last so it is reported as 'No-transform' after all pipelines.
for name, estimator in pipelines + [('No-transform', model)]:
    scores = score_model(x_train, y_train, estimator)
    # mean (std) of the fold scores
    print('>%s: %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))
    names.append(name)
    results.append(scores)

In [None]:
# Mean cross-validated ROC AUC per pipeline, computed once and reused below.
mean_auc_by_pipeline = dict(zip(names, np.mean(results, axis=1).tolist()))
# A descriptive name is used for the winner so the global `n` (the row count
# defined by the sampling cell) is not overwritten with a string.
best_pipeline = max(mean_auc_by_pipeline, key=mean_auc_by_pipeline.get)
print(best_pipeline, mean_auc_by_pipeline[best_pipeline])
mean_auc_by_pipeline

In [None]:
# Distribution of the cross-validation scores for each XGBoost pipeline.
fig, ax = plt.subplots(figsize=(15, 8))
ax.boxplot(results, labels=names, showmeans=True)
# Title and axis labels let the figure stand on its own next to the logistic-regression one.
ax.set_title('XGBoost: cross-validated ROC AUC per preprocessing pipeline')
ax.set_xlabel('Preprocessing pipeline')
ax.set_ylabel('ROC AUC')
plt.show()

# Results

<h3>Best results</h3>

Logistic Regression combined with PowerTransformer outperforms every XGBoost variant.
XGBoost - No transform (every other transformation gives the same result, except when it is used with svd)


# Feature engineering - Creating some synthetic features

# Logistic regression

We create synthetic features and use power transform as well

In [None]:
def geomean(x, axis):
    """Geometric mean of ``x`` along ``axis``, computed as exp(mean(log(x))).

    A zero entry makes log(x) equal -inf, so the result for that slice is 0.
    """
    return np.exp(np.mean(np.log(x), axis=axis))


def harmonic_mean(x, axis):
    """Harmonic mean of ``x`` along ``axis``: count / sum(1 / x).

    ``count`` is the number of elements reduced along ``axis`` (all elements
    when ``axis`` is None), not the length of the first dimension.
    A zero entry makes 1 / x infinite, so the result for that slice is 0.
    """
    count = np.size(x) if axis is None else np.shape(x)[axis]
    return count / np.sum(1.0 / x, axis=axis)


# Row-wise aggregations tried as a single extra synthetic feature; every entry
# is called as func(x, axis=...).
funcs = {'mean' : np.mean,
         'std' : np.std,
         'var' : np.var,
         'geo_mean' : geomean,
         'harmonic_mean' : harmonic_mean,
         'median' : np.median}

In [None]:
# For each aggregation in `funcs`, add it as one extra column, power-transform,
# fit logistic regression and record the hold-out ROC AUC.
results, names = list(), list()
p = PowerTransformer()

for key in funcs.keys():
    # A fresh frame per iteration so only one synthetic column is present at a time.
    x = train_df.drop(['id', 'target'], axis=1)
    x[key] = funcs[key](x, axis=1)
    y = train_df['target']
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = seed)
    # The transformer is refit on the training part only, then applied to the test part.
    x_train = p.fit_transform(x_train)
    x_test = p.transform(x_test)
    model = LogisticRegression(random_state=seed, solver='liblinear')

    model.fit(x_train, y_train)
    # Accuracy is not computed here: its value was never displayed or stored.
    result = evaluate_model(model, x_test, y_test)
    names.append(key)
    results.append(result['auc_roc_curve'])

for name, score in zip(names, results):
    print('>%s: %f' % (name, score))

# XGBoost Classifier

In [None]:
# For each aggregation in `funcs`, add it as one extra column, fit XGBoost on the
# untransformed features and record the hold-out ROC AUC.
results, names = list(), list()

for key in funcs.keys():
    # A fresh frame per iteration so only one synthetic column is present at a time.
    x = train_df.drop(['id', 'target'], axis=1)
    x[key] = funcs[key](x, axis=1)
    y = train_df['target']
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = seed)
    # Seeded with the notebook-wide `seed`, like the other XGBoost models in this notebook.
    model = XGBClassifier(random_state=seed, tree_method='gpu_hist', predictor='gpu_predictor', use_label_encoder=False, verbosity=0)

    model.fit(x_train, y_train)
    result = evaluate_model(model, x_test, y_test)
    names.append(key)
    results.append(result['auc_roc_curve'])

for name, score in zip(names, results):
    print('>%s: %f' % (name, score))

Logistic regression showed the best results. It works well with either the mean or the std as the synthetic feature; the two were almost identical, with std slightly better than the mean. XGBoost gave its best result with the var as the synthetic feature. 

Logistic - std

XGboost - var

In [None]:
# Drop the feature/target objects left over from the loops above and trigger a garbage collection.
del x, y
gc.collect()

# Grid Search

References:

    [1] https://machinelearningmastery.com/hyperparameter-optimization-with-random-search-and-grid-search/

# Logistic Regression

In [None]:
# Build the logistic-regression search matrix: all feature columns plus a row-wise 'std' column.
p = PowerTransformer()
x_train = train_df.drop(['id', 'target'], axis=1)
x_train['std'] = np.std(x_train, axis=1)
y_train = train_df['target']
# NOTE(review): the transformer is fit on every row before the cross-validated search,
# so each validation fold has already influenced the transform; consider a Pipeline instead.
x_train = p.fit_transform(x_train)

In [None]:
# Hyperparameter grid for logistic regression, restricted to solver/penalty pairs
# that scikit-learn accepts. Invalid pairs only produce failed fits (scored NaN):
#   - liblinear handles 'l1' and 'l2' but not 'none';
#   - newton-cg and lbfgs handle 'l2' and 'none' only;
#   - 'elasticnet' needs the saga solver, which is not searched here.
# NOTE(review): newer scikit-learn releases spell the unpenalised option None instead
# of 'none' — confirm against the installed version.
c_values = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]
space = [
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2'], 'C': c_values},
    {'solver': ['newton-cg', 'lbfgs'], 'penalty': ['none', 'l2'], 'C': c_values},
]
model = LogisticRegression(random_state=seed, verbose=0)
# Single repeat of stratified 5-fold, scored by ROC AUC.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
search = GridSearchCV(model, space, scoring='roc_auc', n_jobs=-1, cv=cv)

In [None]:
# Run the logistic-regression grid search; `result` is whatever search.fit returns.
result = search.fit(x_train, y_train)
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

In [None]:
# Display the best estimator found by the search.
result.best_estimator_

# XGBoost Classifier

In [None]:
# Build the XGBoost search matrix: all feature columns plus a row-wise 'var' column.
p = PowerTransformer()
x_train = train_df.drop(['id', 'target'], axis=1)
x_train['var'] = np.var(x_train, axis=1)
y_train = train_df['target']
# NOTE(review): the transformer is fit on every row before the cross-validated search,
# so each validation fold has already influenced the transform.
x_train = p.fit_transform(x_train)

In [None]:
# XGBoost grid: n_estimators, max_depth, subsample and colsample_bytree vary;
# eta, min_child_weight and gamma are each fixed to a single value.
params = {'n_estimators' : [1000, 1290, 1295, 1300, 1305, 1310, 1315, 1325],
          'max_depth' : [3, 4],
          'subsample' : [0.8, 0.9, 1.0],
          'eta' : [0.12],
          'colsample_bytree' : [0.3, 0.4],
          'min_child_weight': [5],
          'gamma': [5],
         }

model = XGBClassifier(random_state=seed, tree_method='gpu_hist', predictor='gpu_predictor', use_label_encoder=False, verbosity=0)
# Single repeat of stratified 10-fold, scored by ROC AUC.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=1)
# NOTE(review): n_jobs=-1 is combined with a GPU tree method — confirm parallel
# fits sharing the GPU is intended.
search = GridSearchCV(model, param_grid=params, scoring='roc_auc', refit='roc_auc', n_jobs=-1, cv=cv)

In [None]:
# Run the XGBoost grid search; `result` is whatever search.fit returns.
# Note that later cells rebind the name `result` to a metrics dict.
result = search.fit(x_train, y_train)
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

# Individual Search on Full data set

Now with best params, we will do an individual search on the full dataset since we were using only 10% in the grid search.

In [None]:
# Reload train.csv without skiprows, replacing the sampled frame with the full file.
train_df = pd.read_csv('/kaggle/input/tabular-playground-series-oct-2021/train.csv', sep=',')

In [None]:
# Full-data features: all feature columns plus a row-wise 'var' column, split 80/20.
p = PowerTransformer()
x_train = train_df.drop(['id', 'target'], axis=1)
x_train['var'] = np.var(x_train, axis=1)
y_train = train_df['target']
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size = 0.2, random_state = seed)
# The transformer is fit on the training part only and then applied to the test part.
# `p` stays fitted this way and is reused later to transform the competition test set.
x_train = p.fit_transform(x_train)
x_test = p.transform(x_test)

In [None]:
# Start the one-parameter-at-a-time sweeps from the grid-search optimum.
# The values are read from `search` rather than `result`: the sweep cells rebind
# `result` to a metrics dict, which would break this cell when it is re-run.
params = dict(search.best_params_)
print(params)

<h3>1 - Testing different number of estimators</h3>

In [None]:
# Sweep n_estimators with every other entry of `params` held fixed; each
# candidate is scored by ROC AUC on the hold-out split.
trees = [100, 150, 200, 250, 300, 350, 400, 450, 500, 600, 700, 800, 900, 950, 975, 1000, 1025, 1050, 1100, 1150, 1290, 1295, 1300, 1305, 1310, 1315, 1325, 2000]
results_trees = {}
for n in trees:
    params['n_estimators'] = n
    model = XGBClassifier(
        random_state=seed,
        tree_method='gpu_hist',
        predictor='gpu_predictor',
        use_label_encoder=False,
        verbosity=0,
        **params,
    )
    model.fit(x_train, y_train)
    result = evaluate_model(model, x_test, y_test)
    results_trees[n] = result['auc_roc_curve']
    print('n_estimators:', n, 'auc_roc_curve:', results_trees[n])

# Highest-scoring candidate; the earliest one wins on ties.
best_nestimator = max(results_trees, key=results_trees.get)
print('\nBest n_estimators:', best_nestimator, 'AUCROC score:', results_trees[best_nestimator])

<h3>2 - Testing different max_depth</h3>

In [None]:
# Lock in the best tree count, then sweep max_depth over 1..4.
params['n_estimators'] = best_nestimator
max_depths = list(range(1, 5))
results_max_depths = {}

for max_depth in max_depths:
    params['max_depth'] = max_depth
    model = XGBClassifier(
        random_state=seed,
        tree_method='gpu_hist',
        predictor='gpu_predictor',
        use_label_encoder=False,
        verbosity=0,
        **params,
    )
    model.fit(x_train, y_train)
    result = evaluate_model(model, x_test, y_test)
    results_max_depths[max_depth] = result['auc_roc_curve']
    print('max_depth:', max_depth, 'auc_roc_curve:', results_max_depths[max_depth])

# Highest-scoring candidate; the earliest one wins on ties.
best_max_depth = max(results_max_depths, key=results_max_depths.get)
print('\nBest max_depth:', best_max_depth, 'AUCROC score:', results_max_depths[best_max_depth])

<h3>3 - Testing different subsamples</h3>

In [None]:
# Lock in the best depth, then sweep subsample over 0.1, 0.2, ..., 1.0.
results_subsamples = {}
params['max_depth'] = best_max_depth
# Built from integers so every candidate is the exact float literal (0.3, not
# 0.30000000000000004) and a plain Python float; a float-step arange accumulates
# rounding error into the values used as parameters and dict keys.
subsamples = [k / 10 for k in range(1, 11)]

for subsample in subsamples:
    params['subsample'] = subsample
    model = XGBClassifier(**params, random_state=seed, tree_method='gpu_hist', predictor='gpu_predictor', use_label_encoder=False, verbosity=0)
    model.fit(x_train, y_train)
    result = evaluate_model(model, x_test, y_test)
    results_subsamples[subsample] = result['auc_roc_curve']
    print('subsample:', subsample, 'auc_roc_curve:', results_subsamples[subsample])

# Highest-scoring candidate; the earliest one wins on ties.
best_subsample = max(results_subsamples, key=results_subsamples.get)
print('\nBest subsample:', best_subsample, 'AUCROC score:', results_subsamples[best_subsample])

<h3>4 - Testing different learning rates</h3>

In [None]:
# Lock in the best subsample ratio, then sweep the learning rate (eta).
params['subsample'] = best_subsample
etas = [0.0001, 0.001, 0.003, 0.005, 0.01, 0.03, 0.05, 0.1, 0.12, 0.13, 0.3, 0.5, 1.0]
results_etas = {}

for eta in etas:
    params['eta'] = eta
    model = XGBClassifier(
        random_state=seed,
        tree_method='gpu_hist',
        predictor='gpu_predictor',
        use_label_encoder=False,
        verbosity=0,
        **params,
    )
    model.fit(x_train, y_train)
    result = evaluate_model(model, x_test, y_test)
    results_etas[eta] = result['auc_roc_curve']
    print('eta:', eta, 'auc_roc_curve:', results_etas[eta])

# Highest-scoring candidate; the earliest one wins on ties.
best_eta = max(results_etas, key=results_etas.get)
print('\nBest eta:', best_eta, 'AUCROC score:', results_etas[best_eta])

<h3>5 - Testing different number of features</h3>

In [None]:
# Lock in the best learning rate, then sweep colsample_bytree over 0.1, 0.2, ..., 1.0.
results_colsample_bytrees = {}
params['eta'] = best_eta
# Built from integers so every candidate is the exact float literal (0.3, not
# 0.30000000000000004) and a plain Python float; a float-step arange accumulates
# rounding error into the values used as parameters and dict keys.
colsample_bytrees = [k / 10 for k in range(1, 11)]

for colsample_bytree in colsample_bytrees:
    params['colsample_bytree'] = colsample_bytree
    model = XGBClassifier(**params, random_state=seed, tree_method='gpu_hist', predictor='gpu_predictor', use_label_encoder=False, verbosity=0)
    model.fit(x_train, y_train)
    result = evaluate_model(model, x_test, y_test)
    results_colsample_bytrees[colsample_bytree] = result['auc_roc_curve']
    print('colsample_bytree:', colsample_bytree, 'auc_roc_curve:', results_colsample_bytrees[colsample_bytree])

# Highest-scoring candidate; the earliest one wins on ties.
best_colsample_bytree = max(results_colsample_bytrees, key=results_colsample_bytrees.get)
print('\nBest colsample_bytree:', best_colsample_bytree, 'AUCROC score:', results_colsample_bytrees[best_colsample_bytree])
    


<h3> 6 - Testing different values for min_child_weight</h3>

In [None]:
# Lock in the best column-sampling ratio, then sweep min_child_weight over 1..9.
params['colsample_bytree'] = best_colsample_bytree
min_child_weights = list(range(1, 10))
results_min_child_weight = {}

for min_child_weight in min_child_weights:
    params['min_child_weight'] = min_child_weight
    model = XGBClassifier(
        random_state=seed,
        tree_method='gpu_hist',
        predictor='gpu_predictor',
        use_label_encoder=False,
        verbosity=0,
        **params,
    )
    model.fit(x_train, y_train)
    result = evaluate_model(model, x_test, y_test)
    results_min_child_weight[min_child_weight] = result['auc_roc_curve']
    print('min_child_weight:', min_child_weight, 'auc_roc_curve:', results_min_child_weight[min_child_weight])

# Highest-scoring candidate; the earliest one wins on ties.
best_min_child_weight = max(results_min_child_weight, key=results_min_child_weight.get)
print('\nBest min_child_weight:', best_min_child_weight, 'AUCROC score:', results_min_child_weight[best_min_child_weight])

<h3> 7 - Testing different values for gamma</h3>

In [None]:
# Lock in the best min_child_weight, then sweep gamma.
params['min_child_weight'] = best_min_child_weight
gammas = [0.01, 0.02, 0.03, 0.1, 0.3, 0.5, 1, 1.1, 1.5, 2, 5, 7, 9, 10]
results_gamma = {}

for gamma in gammas:
    params['gamma'] = gamma
    model = XGBClassifier(
        random_state=seed,
        tree_method='gpu_hist',
        predictor='gpu_predictor',
        use_label_encoder=False,
        verbosity=0,
        **params,
    )
    model.fit(x_train, y_train)
    result = evaluate_model(model, x_test, y_test)
    results_gamma[gamma] = result['auc_roc_curve']
    print('gamma:', gamma, 'auc_roc_curve:', results_gamma[gamma])

# Highest-scoring candidate; the earliest one wins on ties.
best_gamma = max(results_gamma, key=results_gamma.get)
print('\nBest gamma:', best_gamma, 'AUCROC score:', results_gamma[best_gamma])

In [None]:
# Lock in the best gamma; `params` now holds the winner of every sweep.
params['gamma'] = best_gamma
# The score printed is the hold-out ROC AUC recorded for best_gamma in the gamma sweep.
print('Best Score', results_gamma[best_gamma])
print('Best Hyperparameters:', params)

# Submission

In [None]:
# Refit XGBoost with the tuned parameters and report ROC AUC on the hold-out split.
# NOTE(review): the same x_test/y_test were used to choose every hyperparameter in
# the sweeps above, so this score is likely optimistic.
model = XGBClassifier(**params, random_state=seed, tree_method='gpu_hist', predictor='gpu_predictor', use_label_encoder=False, verbosity=0)
model.fit(x_train, y_train)
results = evaluate_model(model, x_test, y_test)
print(results)

In [None]:
# Release the training objects before the competition test file is loaded.
del x_train, x_test, y_train, y_test, train_df
gc.collect()

In [None]:
# Load the competition test set.
test_df = pd.read_csv('/kaggle/input/tabular-playground-series-oct-2021/test.csv', sep=',')

In [None]:
# Rebuild the training-time features for the test set: drop 'id', add the row-wise 'var' column.
x_test = test_df.drop(['id'], axis=1)
x_test['var'] = np.var(x_test, axis=1)
# `p` is the PowerTransformer left fitted by the full-data training cell; it is only applied here, not refit.
x_test = p.transform(x_test)

In [None]:
# Assemble the submission: one row per test id with the column-1 probability from predict_proba.
ids = test_df['id'].values
target = model.predict_proba(x_test)[:, 1]
submission = pd.DataFrame(dict(id=ids, target=target))

In [None]:
# Preview the first rows of the submission frame.
submission.head()

In [None]:
# Write the submission to the working directory without the index column.
submission.to_csv('submission.csv', index=False)

# Next Steps

The next step is to try to improve the results by training both Logistic Regression and XGBoost and then averaging their results. We are going to try it in a new notebook [1] using the best parameters found here so far.

[1] https://www.kaggle.com/peressim/tabular-playground-series-oct-2021-final-models