# Table of Contents
* [Target Exploration](#1)
* [Numerical Features](#2)
* [Categorical Features](#3)
* [Target vs Features](#4)
* [Build Model](#5)
* [Predict on Test Set and prepare Submission](#6)

In [None]:
# packages

# standard
import numpy as np
import pandas as pd
import time

# plots
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.graphics.mosaicplot import mosaic

# machine learning tools
import h2o
from h2o.estimators import H2ORandomForestEstimator
from h2o.estimators import H2OGradientBoostingEstimator

In [None]:
# load data: training set, test set and the sample submission file
# (all three are read from the same ../input competition folder)
df_train = pd.read_csv('../input/tabular-playground-series-mar-2021/train.csv')
df_test = pd.read_csv('../input/tabular-playground-series-mar-2021/test.csv')
df_sub = pd.read_csv('../input/tabular-playground-series-mar-2021/sample_submission.csv')

# first glance (training data): display the first rows as the cell output
df_train.head()

In [None]:
# dimensions of the training data as (number of rows, number of columns)
df_train.shape

In [None]:
# per-column overview of the training data (dtypes and non-null counts);
# used here to check for missing values
df_train.info()

#### We are lucky, no missing values!

<a id='1'></a>
# Target Exploration

### This time we have a categorical (binary) target!

In [None]:
# class balance of the target: print the counts, then show them as a bar chart
target_counts = df_train['target'].value_counts()
print(target_counts)

target_counts.plot(kind='bar')
plt.grid()
plt.show()

<a id='2'></a>
# Numerical Features

In [None]:
# names of the numerical feature columns: cont0 ... cont10
features_num = ['cont' + str(i) for i in range(11)]

In [None]:
# one histogram (100 bins) per numerical feature, each in its own figure
for f in features_num:
    fig, ax = plt.subplots(figsize=(8, 4))
    df_train[f].plot(kind='hist', bins=100, ax=ax)
    ax.set_title(f)
    ax.grid()
    plt.show()

## Feature Correlations

In [None]:
# correlation matrices of the numerical features (linear and rank based)
corr_pearson = df_train[features_num].corr(method='pearson')
corr_spearman = df_train[features_num].corr(method='spearman')

# draw both matrices with the same annotated heatmap layout and a fixed
# colour scale of [-1, +1] so the two plots are directly comparable
for corr_matrix, corr_title in ((corr_pearson, 'Pearson Correlation'),
                                (corr_spearman, 'Spearman Correlation')):
    fig = plt.figure(figsize=(10, 8))
    sns.heatmap(corr_matrix, annot=True, cmap='RdYlGn', vmin=-1, vmax=+1)
    plt.title(corr_title)
    plt.show()

In [None]:
# example of scatter plot - we pick pair having highest (Pearson) correlation
# NOTE(review): the pair cont1/cont2 is hard-coded here, not derived from
# corr_pearson - re-check against the heatmap if the data changes.
# kind='hex' draws a hexbin density plot instead of individual points.
sns.jointplot(data=df_train, x='cont1', y='cont2', kind='hex')
plt.show()

<a id='3'></a>
# Categorical Features

In [None]:
# names of the categorical feature columns: cat0 ... cat18
features_cat = ['cat' + str(i) for i in range(19)]

In [None]:
# one bar chart of level frequencies per categorical feature
for f in features_cat:
    fig, ax = plt.subplots(figsize=(14, 4))
    level_counts = df_train[f].value_counts()
    level_counts.plot(kind='bar', ax=ax)
    ax.set_title(f)
    ax.grid()
    plt.show()

#### Well, "cat10" has lots of different values, this might require a closer look...

In [None]:
# frequency of every cat10 level (value_counts sorts by descending count)
cat10_freq = df_train['cat10'].value_counts()
print(cat10_freq)

# frequency distribution on a log10 scale, levels ordered as in cat10_freq
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(np.log10(cat10_freq))
ax.xaxis.set_major_locator(plt.MaxNLocator(20))  # at most 20 x-axis ticks
ax.set_title('cat10 - Frequencies')
ax.set_ylabel('log10(Frequency)')
ax.grid()
plt.show()

In [None]:
# evaluate mean of target and number of rows for every cat10 level;
# the result has two-level column labels ('target', 'mean') / ('target', 'count')
cat10_target = df_train.groupby(['cat10']).agg({
    'target' : ['mean','count']})
# ... and sort by frequency of level (most frequent level first)
cat10_target = cat10_target.sort_values([('target','count')], ascending=False)

# plot mean of target by level.
# Matplotlib's `s` is the marker AREA (points^2), so with s = 2*sqrt(count)
# the bubble area grows with the square root of the frequency - the title
# states exactly that instead of the previous, inaccurate "area ~ frequency".
fig, ax = plt.subplots(figsize=(12,6))
ax.scatter(cat10_target.index, cat10_target[('target','mean')],
           s=2*np.sqrt(cat10_target[('target','count')]),
           alpha = 0.5)
ax.xaxis.set_major_locator(plt.MaxNLocator(20)) # reduce number of x-axis labels
plt.title('cat10 - Average target by level (bubble area ~ sqrt(frequency))')
plt.grid()
plt.show()

#### => It could be beneficial to group the less frequent levels (e.g. those to the right of "CP") into a single group "other".

In [None]:
# let's give it a try: keep the n_keep most frequent levels
# (cat10_freq is sorted by descending frequency, so a positional slice suffices)
n_keep = 201
cat10_keep = cat10_freq.iloc[:n_keep].index.tolist()
print(cat10_keep)

In [None]:
# add new column with reduced number of levels:
# levels in cat10_keep stay as they are, everything else becomes '_OTHER_'
is_kept_level = df_train['cat10'].isin(cat10_keep)
df_train['cat10_reduced'] = df_train['cat10'].where(is_kept_level, '_OTHER_')
df_train['cat10_reduced'].value_counts()

In [None]:
# check frequency of _OTHER_ category (number of training rows mapped to it)
df_train.loc[df_train['cat10_reduced'] == '_OTHER_', 'cat10_reduced'].value_counts()

In [None]:
# same for test set! The keep-list comes from the training data, so any
# level not in it is mapped to '_OTHER_' here as well
is_kept_level_test = df_test['cat10'].isin(cat10_keep)
df_test['cat10_reduced'] = df_test['cat10'].where(is_kept_level_test, '_OTHER_')
df_test['cat10_reduced'].value_counts()

In [None]:
# update feature list accordingly: cat0 ... cat18 with cat10 swapped
# for its reduced-level version
features_cat = ['cat10_reduced' if i == 10 else 'cat' + str(i)
                for i in range(19)]

<a id='4'></a>
# Target vs Features

## Numerical Features

In [None]:
# plot target vs binned numerical features using mosaic plot
plt_para_save = plt.rcParams['figure.figsize']  # remember plot settings
plt.rcParams['figure.figsize'] = (16, 7)        # increase plot size for mosaics

for f in features_num:
    # add a decile-binned version of the feature as column '<feature>_bin'
    bin_col = f + '_bin'
    df_train[bin_col] = pd.qcut(df_train[f], 10)

    # then create mosaic plot of bin vs target
    mosaic(df_train, [bin_col, 'target'], title='Target vs ' + f + ' [binned]')
    plt.show()

# reset plot size again
plt.rcParams['figure.figsize'] = plt_para_save

## Categorical Features

In [None]:
# plot target vs categorical features using mosaic plot
plt_para_save = plt.rcParams['figure.figsize']  # remember plot settings
plt.rcParams['figure.figsize'] = (16, 7)        # increase plot size for mosaics

for f in features_cat:
    # categorical features are plotted as they are (no binning), so the
    # title must not carry the ' [binned]' suffix used for numerical features
    mosaic(df_train, [f, 'target'], title='Target vs ' + f)
    plt.show()

# reset plot size again
plt.rcParams['figure.figsize'] = plt_para_save

<a id='5'></a>
# Build Model

In [None]:
# select predictors: all numerical features followed by all categorical ones
predictors = [*features_num, *features_cat]
print('Number of predictors: ', len(predictors))
print(predictors)

In [None]:
# start H2O (connects to or launches the H2O backend used for all modelling below)
h2o.init(max_mem_size='12G', nthreads=4) # Use maximum of 12 GB RAM and 4 cores

In [None]:
# upload data frames in H2O environment and report how long it took
upload_start = time.time()
train_hex = h2o.H2OFrame(df_train)
test_hex = h2o.H2OFrame(df_test)
upload_end = time.time()
print('Elapsed time [s]: ', np.round(upload_end - upload_start, 2))

# force categorical target so that H2O treats the task as classification
train_hex['target'] = train_hex['target'].asfactor()

In [None]:
# define Gradient Boosting model
n_cv = 10  # number of cross-validation folds (also used when plotting per-fold results)

gbm_params = dict(
    ntrees=200,
    max_depth=6,
    min_rows=100,
    learn_rate=0.05,      # default: 0.1
    sample_rate=1,
    col_sample_rate=0.5,
    nfolds=n_cv,
    seed=999,
)
fit_1 = H2OGradientBoostingEstimator(**gbm_params)

In [None]:
# train model on the selected predictors and report the elapsed wall-clock time
train_start = time.time()
fit_1.train(x=predictors, y='target', training_frame=train_hex)
train_end = time.time()
print('Elapsed time [s]: ', np.round(train_end - train_start, 2))

In [None]:
# show cross validation metrics (summary table displayed as the cell output)
fit_1.cross_validation_metrics_summary()

In [None]:
# show scoring history - training vs validation AUC for every cross validation model
cv_models = fit_1.cross_validation_models()  # fetch the fold models once, not once per iteration

# iterate over the fold models themselves so the loop cannot get out of sync
# with the number of folds the model was actually trained with
for fold_no, cv_model_temp in enumerate(cv_models, start=1):
    # scoring_history() is the current name of the deprecated score_history() alias
    df_cv_score_history = cv_model_temp.scoring_history()
    my_title = 'CV ' + str(fold_no) + ' - Scoring History [AUC]'
    plt.scatter(df_cv_score_history.number_of_trees,
                y=df_cv_score_history.training_auc,
                c='blue', label='training')
    plt.scatter(df_cv_score_history.number_of_trees,
                y=df_cv_score_history.validation_auc,
                c='darkorange', label='validation')
    plt.title(my_title)
    plt.xlabel('Number of Trees')
    plt.ylabel('AUC')
    plt.ylim(0.8,1)  # fixed y-range so all fold plots share the same scale
    plt.legend()
    plt.grid()
    plt.show()

### Variable Importance

In [None]:
# basic version: H2O's built-in variable importance bar plot
# NOTE(review): -1 is passed as the number of features to show - presumably
# meant as "all features"; confirm against the H2O varimp_plot documentation.
fit_1.varimp_plot(-1)

In [None]:
# variable importance using shap values => see direction as well as severity of feature impact
# (timed, and the trailing ';' suppresses the returned object's repr)
shap_start = time.time()
fit_1.shap_summary_plot(train_hex);
shap_end = time.time()
print('Elapsed time [s]: ', np.round(shap_end - shap_start, 2))

## Check performance on training data / cross validations

In [None]:
# training performance: metrics computed on the training data, then plotted
perf_train = fit_1.model_performance(train=True)
perf_train.plot()

In [None]:
# cross validation performance: metrics from the cross validation runs, then plotted
perf_cv = fit_1.model_performance(xval=True)
perf_cv.plot()

<a id='6'></a>
# Predict on Test Set and prepare Submission

In [None]:
# predict on test set and keep only the 'p1' probability column,
# converted from an H2O frame to a pandas Series
pred_frame = fit_1.predict(test_hex)
pred_test = pred_frame['p1'].as_data_frame()['p1']

#### => The "cat10" feature not only challenges us with its many levels, but also has a few levels that occur only in the test set. Because we now use cat10_reduced instead of cat10, those unseen levels are mapped to "_OTHER_" and this problem is no longer relevant...

In [None]:
# let's quickly check the frequency of those "exotic" levels
# (hard-coded list of cat10 levels reported as occurring only in the test set)
exotic_levels = ['BU','BW','CA','DG','EJ','JM','KE','KM']
df_test['cat10'].value_counts().loc[exotic_levels]

In [None]:
# histogram (100 bins) of the predicted probabilities on the test set
fig, ax = plt.subplots(figsize=(7, 5))
ax.hist(pred_test, bins=100)
ax.set_title('Predictions on Test Set')
ax.grid()
plt.show()

In [None]:
# prepare submission: write the predicted probabilities into the 'target' column.
# Bracket assignment always addresses a column; attribute assignment
# (df_sub.target = ...) would silently create a plain Python attribute
# instead of a column if 'target' were not already present in the frame.
df_sub['target'] = pred_test
df_sub.head(10)

In [None]:
# save to file for submission (written to the working directory, without the row index)
df_sub.to_csv('submission.csv', index=False)