# Table of Contents
* [Import and First Checks](#1)
* [Target](#2)
* [Features](#3)
* [Model](#4)
* [Prediction on Test Set and Submission](#5)

In [None]:
# packages

# standard
import numpy as np
import pandas as pd
import time
import gc

# plot
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.graphics.mosaicplot import mosaic

# machine learning tools
import h2o
from h2o.estimators import H2OGeneralizedLinearEstimator, H2ORandomForestEstimator, H2OGradientBoostingEstimator

In [None]:
# pandas display option: never truncate the column list in rendered frames
pd.options.display.max_columns = None

<a id='1'></a>
# Import and First Checks

In [None]:
# read the three competition CSV files (slow, hence the timing)
csv_paths = (
    '../input/tabular-playground-series-oct-2021/train.csv',
    '../input/tabular-playground-series-oct-2021/test.csv',
    '../input/tabular-playground-series-oct-2021/sample_submission.csv',
)
t1 = time.time()
# files are read in the order listed above and unpacked into the three frames
df_train, df_test, df_sub = map(pd.read_csv, csv_paths)
t2 = time.time()
print('Elapsed time [s]:', np.round(t2-t1,4))

In [None]:
# (rows, columns) of both data sets
for set_label, frame in (('Train Set:', df_train), ('Test Set :', df_test)):
    print(set_label, frame.shape)

In [None]:
# structure of data frame: list every column (verbose=True) and request the
# non-null count per column (show_counts=True) to spot missing values
df_train.info(verbose=True, show_counts=True)

### => no missing values!

In [None]:
# same structural overview (all columns, with non-null counts) for the test set
df_test.info(verbose=True, show_counts=True)

<a id='2'></a>
# Target

In [None]:
# class frequencies of the target: printed as a table, then drawn as a bar chart
target_counts = df_train.target.value_counts()
print(target_counts)
target_counts.plot(kind='bar')
plt.grid()
plt.show()

### => Nicely balanced!

### Impact of Features on Target (examples)

In [None]:
# plot target vs binary features using mosaic plot
# rc_context applies the mosaic figure size only inside the `with` block and
# restores the previous plot settings afterwards, even if a plot raises
with plt.rc_context({'figure.figsize': (6,4)}):
    for f in ['f22', 'f43']:
        mosaic(df_train, [f, 'target'], title='Target vs ' + f)
        plt.show()

In [None]:
# plot target vs BINNED numerical features using mosaic plot
# The binned column is built in a small two-column helper frame, so the large
# training frame is never modified (no temporary columns to add and drop again).
# rc_context restores the previous figure size when the block is left.
with plt.rc_context({'figure.figsize': (16,6)}):
    for f in ['f179','f69']:
        # binned version of the numerical feature (8 quantile-based bins)
        new_var = f + '_bin'
        df_binned = pd.DataFrame({new_var: pd.qcut(df_train[f], 8),
                                  'target': df_train['target']})
        # then create mosaic plot
        mosaic(df_binned, [new_var, 'target'], title='Target vs ' + f + ' [binned]')
        plt.show()

# release the helper frame
del df_binned

<a id='3'></a>
# Features

In [None]:
# feature names = every column of the training frame except 'id' and 'target'
features_num = df_train.columns.drop(['id', 'target']).tolist()

In [None]:
# summary statistics for all feature columns (rendered as the cell output)
feature_stats = df_train.loc[:, features_num].describe()
feature_stats

In [None]:
# boxplot of all features, n_plot_cols features per figure
# The chunks are taken from features_num by name, so the figures cover exactly
# the feature columns regardless of how many features there are or where the
# 'id' / 'target' columns sit in the frame.
n_plot_cols = 19
n_plot_rows = -(-len(features_num) // n_plot_cols) # ceiling division
for i in range(n_plot_rows):
    first = n_plot_cols*i
    feature_chunk = features_num[first:first+n_plot_cols]
    # 1-based positions of the first and last feature shown in this figure
    print('Columns', first+1, 'to', first+len(feature_chunk))
    df_train[feature_chunk].plot(kind='box', figsize=(15,5))
    plt.xticks(rotation=90)
    plt.grid()
    plt.show()

### Features f22, f43 and f242..f284 are binary, so let's plot them separately:

In [None]:
# binary features: f22, f43 and the contiguous run f242 .. f284 (inclusive)
binary_ids = [22, 43, *range(242, 285)]
features_bin = [f'f{idx}' for idx in binary_ids]
print(features_bin)

In [None]:
# float features = all features that are not binary
# A list comprehension (instead of a set difference) keeps the original column
# order, so the list is identical on every run rather than ordered by hash.
bin_lookup = set(features_bin)
features_float = [f for f in features_num if f not in bin_lookup]

In [None]:
# one small bar chart per binary feature showing the frequency of each value
for f in features_bin:
    fig, ax = plt.subplots(figsize=(4,3))
    value_freq = df_train[f].value_counts().sort_index()
    value_freq.plot(kind='bar', ax=ax)
    ax.set_title(f)
    ax.grid()
    plt.show()

<a id='4'></a>
# Model

In [None]:
# select predictors
# NOTE: `predictors` is bound to the same list object as `features_num`
# (no copy is made), so an in-place change to one is visible through the other
predictors = features_num
print('Number of predictors: ', len(predictors))

In [None]:
# remove test set to reduce RAM footprint
# (it is read from disk again before the test set prediction)
del df_test

In [None]:
# shrink the training frame: binary features -> uint8, float features -> float32
for cols, small_dtype in ((features_bin, np.uint8), (features_float, np.float32)):
    df_train[cols] = df_train[cols].astype(small_dtype)
# the target is converted to uint8 as well
df_train['target'] = df_train['target'].astype(np.uint8)

In [None]:
# garbage collection after deleting the test frame and converting the dtypes
# (the trailing semicolon suppresses the notebook echo of the return value)
gc.collect();

In [None]:
# start H2O; the H2OFrame uploads and the model training below use this instance
h2o.init(max_mem_size='12G', nthreads=4) # define maximum memory usage and number of cores

In [None]:
# upload data in H2O environment
# Only a random SUBSET of the training rows is uploaded to keep the RAM
# footprint low; to train on all rows, pass df_train to h2o.H2OFrame instead.
n_sub = 500000

t1 = time.time()
df_train_sub = df_train.sample(n=n_sub, random_state=999)
train_hex = h2o.H2OFrame(df_train_sub)
t2 = time.time()
print('Elapsed time [s]: ', np.round(t2-t1,4))

# convert the target column to a factor (categorical target)
train_hex['target'] = train_hex['target'].asfactor()

In [None]:
# remove original training data frame
# (df_train_sub, the sampled subset, is kept: the calibration check further
# down reads its target column)
del df_train
gc.collect();

In [None]:
# fit Gradient Boosting model
n_cv = 5 # number of cross-validation folds

# hyper-parameters collected in one place and passed to the estimator unchanged
gbm_params = dict(
    ntrees=250,
    max_depth=6,
    min_rows=10,
    learn_rate=0.1, # default: 0.1
    sample_rate=1,
    col_sample_rate=0.5,
    nfolds=n_cv,
    score_each_iteration=True,
    stopping_metric='auc',
    stopping_rounds=5,
    stopping_tolerance=0.0001*0.5,
    seed=999,
)
fit_GBM = H2OGradientBoostingEstimator(**gbm_params)

# train model on the uploaded frame and report the wall-clock time
t1 = time.time()
fit_GBM.train(x=predictors, y='target', training_frame=train_hex)
t2 = time.time()
print('Elapsed time [s]: ', np.round(t2-t1,2))

In [None]:
# show cross validation metrics
# (as the last expression of the cell, the summary is rendered as its output)
fit_GBM.cross_validation_metrics_summary()

In [None]:
# show scoring history - training vs cross validations
# One figure per cross-validation model: AUC on the training part (blue) and
# on the hold-out part (orange) against the number of trees.
cv_models = fit_GBM.cross_validation_models() # fetch the list once, not per iteration
for i, cv_model_temp in enumerate(cv_models):
    # scoring_history() replaces the deprecated alias score_history()
    df_cv_score_history = cv_model_temp.scoring_history()
    my_title = 'CV ' + str(1+i) + ' - Scoring History [AUC]'
    plt.scatter(df_cv_score_history.number_of_trees,
                y=df_cv_score_history.training_auc,
                c='blue', label='training')
    plt.scatter(df_cv_score_history.number_of_trees,
                y=df_cv_score_history.validation_auc,
                c='darkorange', label='validation')
    plt.title(my_title)
    plt.xlabel('Number of Trees')
    plt.ylabel('AUC')
    plt.ylim(0.8,0.9) # fixed range so the folds can be compared visually
    plt.legend()
    plt.grid()
    plt.show()

In [None]:
# variable importance plot of the fitted GBM model
fit_GBM.varimp_plot()

In [None]:
# alternative variable importance using SHAP => see direction as well as severity of feature impact
# the call is timed; the trailing semicolon suppresses the echo of its return value
t1 = time.time()
fit_GBM.shap_summary_plot(train_hex);
t2 = time.time()
print('Elapsed time [s]: ', np.round(t2-t1,2))

In [None]:
# training performance
# train=True requests the metrics that were computed on the training frame
perf_train = fit_GBM.model_performance(train=True)
perf_train.plot()

In [None]:
# cross validation performance
# xval=True requests the cross-validation metrics instead of the training ones
perf_cv = fit_GBM.model_performance(xval=True)
perf_cv.plot()

In [None]:
# score the training frame and keep only the 'p1' probability column,
# converted to a pandas Series
pred_train_GBM = fit_GBM.predict(train_hex)['p1'].as_data_frame().p1

# histogram of the predicted probabilities on the train set
fig, ax = plt.subplots(figsize=(8,4))
ax.hist(pred_train_GBM, bins=100)
ax.set_title('Predictions on Train Set - GBM')
ax.grid()
plt.show()

In [None]:
# calibration: compare the number of actual positives with the sum of the
# predicted probabilities (a ratio close to 1 means well calibrated on average)
# Series.sum() is vectorised; the built-in sum() would loop over every element
# in Python.
n_actual = int(df_train_sub.target.sum())
n_pred_GBM = float(pred_train_GBM.sum())

print('Actual Frequency    :', n_actual)
print('Predicted Frequency :', n_pred_GBM)
print('Calibration Ratio   :', n_pred_GBM / n_actual)

<a id='5'></a>
# Prediction on Test Set and Submission

In [None]:
# memory management: remove the training frame from H2O before the test set
# is loaded, then run Python's garbage collector
h2o.remove(train_hex)
gc.collect();

In [None]:
# reload test set into memory (it was deleted before training to save RAM)
t1 = time.time()
df_test = pd.read_csv('../input/tabular-playground-series-oct-2021/test.csv')
t2 = time.time()
print('Elapsed time [s]:', np.round(t2-t1,4))

In [None]:
# upload data in H2O environment: convert the pandas test frame to an
# H2OFrame so the H2O model can score it; the upload is timed
t1 = time.time()
test_hex = h2o.H2OFrame(df_test)
t2 = time.time()
print('Elapsed time [s]: ', np.round(t2-t1,4))

In [None]:
# score the test frame and keep only the 'p1' probability column,
# converted to a pandas Series
pred_test_GBM = fit_GBM.predict(test_hex)['p1'].as_data_frame().p1

# histogram of the predicted probabilities on the test set
fig, ax = plt.subplots(figsize=(8,4))
ax.hist(pred_test_GBM, bins=100)
ax.set_title('Predictions on Test Set - GBM')
ax.grid()
plt.show()

In [None]:
# GBM submission: copy the sample submission and overwrite its target column
# with the test set predictions
df_sub_GBM = df_sub.copy()
# Assign by position rather than by index label: a Series assignment aligns on
# the index and would silently insert NaN for any label that does not match.
if len(pred_test_GBM) != len(df_sub_GBM):
    raise ValueError('Number of predictions (%d) does not match submission rows (%d)'
                     % (len(pred_test_GBM), len(df_sub_GBM)))
df_sub_GBM['target'] = pred_test_GBM.to_numpy()
display(df_sub_GBM.head())
# save to file
df_sub_GBM.to_csv('submission_GBM.csv', index=False)