# Table of Contents
* [Target Exploration](#1)
* [Features](#2)
* [Target vs Features](#3)
* [Fit Model](#4)

In [None]:
# packages

# standard
import numpy as np
import pandas as pd
import time

# plots
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.graphics.mosaicplot import mosaic

# machine learning tools
import h2o
from h2o.estimators import H2OGradientBoostingEstimator

In [None]:
# load data + first glance
# All competition files live in the same input folder; keep that folder in one
# place so the notebook can be pointed elsewhere with a single edit.
DATA_DIR = '../input/tabular-playground-series-may-2021/'

df_train = pd.read_csv(DATA_DIR + 'train.csv')
df_test = pd.read_csv(DATA_DIR + 'test.csv')
df_sub = pd.read_csv(DATA_DIR + 'sample_submission.csv')

# first glance (training data)
df_train.head()

In [None]:
# dimensions: (rows, columns) of the training and the test frame
for label, frame in (('Train Set:', df_train), ('Test Set :', df_test)):
    print(label, frame.shape)

In [None]:
# structure: per-column summary of the training frame as printed by DataFrame.info()
df_train.info()

### => No missing values this time :-)

<a id='1'></a>
# Target Exploration

In [None]:
# basic stats: class counts as text (ordered by frequency) ...
target_counts = df_train['target'].value_counts()
print(target_counts)

# ... and as a bar chart (ordered by class label)
target_counts.sort_index().plot(kind='bar')
plt.grid()
plt.show()

<a id='2'></a>
# Features

In [None]:
# extract features from column names:
# every column except the row identifier and the label
features = df_train.columns.drop(['id', 'target']).tolist()

In [None]:
# basic summary stats
pd.set_option('display.max_columns', None) # show all columns
df_train[features].describe()

In [None]:
# correlation
corr_pearson = df_train[features].corr(method='pearson')
plt.figure(figsize=(12,12))
sns.heatmap(corr_pearson, annot=False, cmap='RdYlGn', vmin=-1, vmax=+1)
plt.title('Pearson Correlation')
plt.show()

### We observe almost no correlation between the features...

In [None]:
# Extreme pairwise correlations between *different* features.
# The diagonal (each feature with itself) is masked by position instead of by
# comparing values to 1: a value comparison would also hide any off-diagonal
# pair that is perfectly correlated and relies on exact float equality.
off_diagonal = ~np.eye(len(corr_pearson), dtype=bool)
corr_off_diag = corr_pearson.where(off_diagonal)  # diagonal becomes NaN and is skipped by max/min
print('Maximum correlation:', np.round(corr_off_diag.max().max(),5))
print('Minimum correlation:', np.round(corr_off_diag.min().min(),5))

<a id='3'></a>
# Target vs Features

### We have too many features here to plot them all, so let's just show one example:

In [None]:
# select a feature
f = 'feature_2'

In [None]:
# show distribution
df_train[f].value_counts().plot(kind='bar')
plt.title('Distribution of ' + f)
plt.grid()
plt.show()

In [None]:
# violinplots by class
plt.figure(figsize=(10,5))
sns.violinplot(x=f, y='target', data=df_train)
my_title = 'Distribution by class for ' + f
plt.title(my_title)
plt.grid()

In [None]:
# cross table - absolute counts...
ctab = pd.crosstab(df_train.target, df_train[f])
ctab

In [None]:
# ...and normalized by column
ctab_norm = ctab / ctab.sum()
ctab_norm

In [None]:
# visualize
plt.figure(figsize=(12,6))
p1 = plt.bar(ctab_norm.columns, ctab_norm.iloc[0])
bot = ctab_norm.iloc[0]
p2 = plt.bar(ctab_norm.columns, ctab_norm.iloc[1], bottom=bot)
bot = bot + ctab_norm.iloc[1]
p3 = plt.bar(ctab_norm.columns, ctab_norm.iloc[2], bottom=bot)
bot = bot + ctab_norm.iloc[2]
p4 = plt.bar(ctab_norm.columns, ctab_norm.iloc[3], bottom=bot)
plt.xlabel('Feature Value')
plt.ylabel('Relative Frequency of Target Classes')
plt.title('Target vs ' + f)
plt.legend((p1[0],p2[0],p3[0],p4[0]), ('Class_1', 'Class_2', 'Class_3', 'Class_4'))
plt.grid()
plt.show()

<a id='4'></a>
# Fit Model

In [None]:
# select predictors
predictors = features
print('Number of predictors: ', len(predictors))

In [None]:
# start H2O
h2o.init(max_mem_size='12G', nthreads=4) # Use maximum of 12 GB RAM and 4 cores

In [None]:
# upload data frames in H2O environment
t1 = time.time()
train_hex = h2o.H2OFrame(df_train)
test_hex = h2o.H2OFrame(df_test)
t2 = time.time()
print('Elapsed time [s]: ', np.round(t2-t1,2))

# force categorical target
train_hex['target'] = train_hex['target'].asfactor()

In [None]:
# factors for class sampling => trying to mitigate unbalanced target distribution
csf = [6.77, 1.00, 2.68, 4.57]

In [None]:
# fit Gradient Boosting model
n_cv = 5

fit_GBM = H2OGradientBoostingEstimator(ntrees=100,
                                       max_depth=6,
                                       min_rows=50,
                                       learn_rate=0.1, # default: 0.1
                                       sample_rate=1,
                                       col_sample_rate=0.5,
                                       nfolds=n_cv,
                                       score_each_iteration=True,
                                       stopping_metric='logloss',
                                       stopping_rounds=5,
                                       stopping_tolerance=0.0001,
                                       balance_classes=True,
                                       class_sampling_factors=csf,
                                       seed=999)
# train model
t1 = time.time()
fit_GBM.train(x=predictors,
              y='target',
              training_frame=train_hex)
t2 = time.time()
print('Elapsed time [s]: ', np.round(t2-t1,2))

In [None]:
# show cross validation metrics
fit_GBM.cross_validation_metrics_summary()

In [None]:
# show scoring history - training vs cross validations
for i in range(n_cv):
    cv_model_temp = fit_GBM.cross_validation_models()[i]
    df_cv_score_history = cv_model_temp.score_history()
    my_title = 'CV ' + str(1+i) + ' - Scoring History'
    plt.scatter(df_cv_score_history.number_of_trees,
                y=df_cv_score_history.training_logloss, 
                c='blue', label='training')
    plt.scatter(df_cv_score_history.number_of_trees,
                y=df_cv_score_history.validation_logloss, 
                c='darkorange', label='validation')
    plt.title(my_title)
    plt.xlabel('Number of Trees')
    plt.ylabel('logloss')
    plt.ylim(0,2)
    plt.legend()
    plt.grid()
    plt.show()

In [None]:
# variable importance
fit_GBM.varimp_plot(-1)

### Evaluate Predictions on Training Data

In [None]:
# predict on train set
pred_train_GBM = fit_GBM.predict(train_hex).as_data_frame()
# add ground truth
pred_train_GBM['target'] = train_hex['target'].as_data_frame()
pred_train_GBM.head()

In [None]:
# predicted frequencies
pred_train_GBM[['Class_1','Class_2','Class_3','Class_4']].sum()

In [None]:
# actual frequencies
df_train.target.value_counts().sort_index()

In [None]:
# confusion matrix - training data
conf_train = pd.crosstab(pred_train_GBM.target, pred_train_GBM.predict)
sns.heatmap(conf_train, cmap='Blues',
            annot=True, fmt='d',
            vmin=0, vmax=60000,
            linecolor='black',
            linewidths=0.1)
plt.title('Confusion Matrix - Training')
plt.show()

### Predict on Test Set

In [None]:
# predict on test set
pred_test_GBM = fit_GBM.predict(test_hex).as_data_frame()
pred_test_GBM

In [None]:
# submission
df_sub_GBM = df_sub.copy()
df_sub_GBM.Class_1 = pred_test_GBM.Class_1
df_sub_GBM.Class_2 = pred_test_GBM.Class_2
df_sub_GBM.Class_3 = pred_test_GBM.Class_3
df_sub_GBM.Class_4 = pred_test_GBM.Class_4
df_sub_GBM

In [None]:
# export submission
df_sub_GBM.to_csv('submission_GBM.csv', index=False)

In [None]:
# multi-dimensional visualization of submission
sns.pairplot(df_sub_GBM[['Class_1','Class_2','Class_3','Class_4']],
             diag_kws = {'alpha': 1.0},
             plot_kws = {'alpha': 0.1})
plt.show()