# Table of Contents
* [Import and Data Preparation](#import)
* [Target](#target)
* [Features](#features)
* [Fit Model](#model)

In [None]:
# packages

# standard
import numpy as np
import pandas as pd
import time
import gc

# plots
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

# machine learning tools
import h2o
from h2o.estimators import H2OGradientBoostingEstimator

In [None]:
# show files
# Long-format listing of the competition input directory.
# NOTE: the leading `!` is an IPython/Jupyter shell escape, so this line only works in a notebook.
!ls -l '../input/tabular-playground-series-feb-2022/'

<a id='import'></a>
# Import and Data Preparation

In [None]:
# read the train, test and sample-submission CSV files and report the time the reads took
csv_paths = (
    '../input/tabular-playground-series-feb-2022/train.csv',
    '../input/tabular-playground-series-feb-2022/test.csv',
    '../input/tabular-playground-series-feb-2022/sample_submission.csv',
)
t1 = time.time()
df_train, df_test, df_sub = map(pd.read_csv, csv_paths)
t2 = time.time()
print('Elapsed time [s]:', np.round(t2 - t1, 4))

In [None]:
# preview the first rows of the training set
df_train.head()

In [None]:
# dimensions: print (rows, columns) of the training and the test frame
for label, frame in (('Train Set:', df_train), ('Test Set :', df_test)):
    print(label, frame.shape)

In [None]:
# structure / missing values
# verbose=True lists every column; show_counts=True adds the non-null count per column,
# which is the figure to compare against the row count when looking for missing values.
df_train.info(verbose=True, show_counts=True)

### => No missing values, nice!

In [None]:
# summary statistics of the training set columns
df_train.describe()

<a id='target'></a>
# Target

In [None]:
# target - basic stats: print the frequency of each target value and draw it as a bar chart
target_counts = df_train['target'].value_counts()
print(target_counts)
target_counts.plot.bar()
plt.grid()
plt.show()

### => Target is nicely balanced.

In [None]:
# add numeric version of target: integer category code of each target label
df_train['target_num'] = pd.Categorical(df_train['target']).codes

In [None]:
# list target levels (for later), in value_counts() order, i.e. most frequent first
target_levels = list(df_train['target'].value_counts().index)

<a id='features'></a>
# Features

In [None]:
# extract features: every column except the row id and the two target columns
non_feature_cols = ['row_id', 'target', 'target_num']
features = df_train.columns.drop(non_feature_cols).tolist()

### Correlation:

In [None]:
# pairwise Pearson correlation between all feature columns
corr_pearson = df_train[features].corr(method='pearson')

In [None]:
# heatmap of the correlation matrix on a fixed [-1, +1] colour scale, without cell annotations
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(corr_pearson, annot=False, cmap='RdYlGn', vmin=-1, vmax=1, ax=ax)
ax.set_title('Pearson Correlation')
plt.show()

In [None]:
# export to file
# writes the correlation matrix (including its row labels) to the working directory
corr_pearson.to_csv('corr_pearson.csv')

In [None]:
# memory management
# drop the name bound to the correlation matrix, then run a garbage collection pass
del corr_pearson
gc.collect()

### Feature distributions:

In [None]:
# one 100-bin histogram per feature, drawn on a grid of subplots
# The grid is sized from the number of features instead of a hard-coded 72 x 4,
# so it neither overflows (IndexError) nor leaves visible empty axes.
n_cols = 4
n_rows = max(1, -(-len(features) // n_cols))  # ceiling division, at least one row
fig, axs = plt.subplots(
    n_rows,
    n_cols,
    figsize=(16, n_rows * 25 / 6),  # 25/6 in per row reproduces the original 300 in at 72 rows
    squeeze=False,  # always get a 2-D array of axes, even for a single row
)
flat_axes = axs.ravel()
for current_ax, f in zip(flat_axes, features):
    current_ax.hist(df_train[f], bins=100)
    current_ax.set_title(f)
    current_ax.grid()

# hide the grid cells left over when the feature count is not a multiple of n_cols
for unused_ax in flat_axes[len(features):]:
    unused_ax.set_visible(False)

plt.show()

### Visualize data points as lines colored by target:

In [None]:
# line colours for the row plots below; indexed by the integer code in `target_num`
# NOTE(review): 10 entries, so this assumes at most 10 target classes - confirm
colors = ['red','blue','green','orange','grey',
          'cyan','magenta','brown','darkgreen','lightblue']

In [None]:
# pick a few data points for first plot
plt.figure(figsize=(16,5))
for i in range(0,20):
    plt.plot(df_train.loc[i,features], color=colors[df_train.target_num[i]])

plt.xticks(rotation=90)
plt.show()

In [None]:
# plot more rows
plt.figure(figsize=(16,5))
for i in range(0,2000):
    plt.plot(df_train.loc[i,features], color=colors[df_train.target_num[i]])

plt.xticks(rotation=90)
plt.show()

<a id='model'></a>
# Fit Model

In [None]:
# start H2O
# connects to (or starts) an H2O instance with the resource limits given below
h2o.init(max_mem_size='13G', nthreads=4) # Use maximum of 13 GB RAM and 4 cores

In [None]:
# convert the pandas train/test frames into H2O frames and report how long the upload took
t1 = time.time()
train_hex, test_hex = h2o.H2OFrame(df_train), h2o.H2OFrame(df_test)
t2 = time.time()
print('Elapsed time [s]:', np.round(t2 - t1, 4))

In [None]:
# memory management: remove data frame version of train/test set
# (the H2O copies train_hex / test_hex are what the remaining cells use)
del df_train
del df_test
gc.collect()

In [None]:
# fit Gradient Boosting model with n_cv-fold cross validation
n_cv = 5

# hyper-parameters collected in one place; early stopping monitors logloss
gbm_params = {
    'ntrees': 100,
    'max_depth': 6,
    'min_rows': 10,
    'learn_rate': 0.25,  # default: 0.1
    'sample_rate': 0.5,
    'col_sample_rate': 0.2,
    'nfolds': n_cv,
    'score_each_iteration': True,
    'stopping_metric': 'logloss',
    'stopping_rounds': 5,
    'stopping_tolerance': 0.0001,
    'seed': 999,
}
fit_GBM = H2OGradientBoostingEstimator(**gbm_params)

# train model on the feature columns against the 'target' column, timing the call
t1 = time.time()
fit_GBM.train(x=features, y='target', training_frame=train_hex)
t2 = time.time()
print('Elapsed time [s]: ', np.round(t2 - t1, 2))

In [None]:
# show cross validation metrics
# (summary table produced by H2O for the cross-validated model)
fit_GBM.cross_validation_metrics_summary()

In [None]:
# show scoring history - training vs cross validation - Classification Error
# One scatter plot per cross-validation model: error against number of trees.
cv_models = fit_GBM.cross_validation_models()  # fetched once, not once per fold
for fold_no, cv_model_temp in enumerate(cv_models, start=1):
    # scoring_history() replaces the deprecated score_history() alias
    df_cv_score_history = cv_model_temp.scoring_history()
    my_title = 'CV ' + str(fold_no) + ' - Scoring History [Classification Error]'
    plt.scatter(df_cv_score_history.number_of_trees,
                y=df_cv_score_history.training_classification_error,
                c='blue', label='training')
    plt.scatter(df_cv_score_history.number_of_trees,
                y=df_cv_score_history.validation_classification_error,
                c='darkorange', label='validation')
    plt.title(my_title)
    plt.xlabel('Number of Trees')
    plt.ylabel('Classification Error')
    plt.ylim(0.0,0.4)  # fixed y-range so the folds are directly comparable
    plt.legend()
    plt.grid()
    plt.show()

In [None]:
# show scoring history - training vs cross validation - LogLoss
# One scatter plot per cross-validation model: logloss against number of trees.
cv_models = fit_GBM.cross_validation_models()  # fetched once, not once per fold
for fold_no, cv_model_temp in enumerate(cv_models, start=1):
    # scoring_history() replaces the deprecated score_history() alias
    df_cv_score_history = cv_model_temp.scoring_history()
    my_title = 'CV ' + str(fold_no) + ' - Scoring History [log_loss]'
    plt.scatter(df_cv_score_history.number_of_trees,
                y=df_cv_score_history.training_logloss,
                c='blue', label='training')
    plt.scatter(df_cv_score_history.number_of_trees,
                y=df_cv_score_history.validation_logloss,
                c='darkorange', label='validation')
    plt.title(my_title)
    plt.xlabel('Number of Trees')
    plt.ylabel('log_loss')
    plt.ylim(0.0,3.0)  # fixed y-range so the folds are directly comparable
    plt.legend()
    plt.grid()
    plt.show()

In [None]:
# further details
# a bare expression as the last line of a cell makes the notebook display the model summary
fit_GBM

In [None]:
# variable importance
# the argument 40 limits the plot to 40 features
fit_GBM.varimp_plot(40)
plt.show()

In [None]:
# predict on training data
# the H2O prediction frame is converted to a pandas DataFrame; later cells read its
# 'predict' column and one column per entry of target_levels
pred_train = fit_GBM.predict(train_hex).as_data_frame()
pred_train

In [None]:
# pairwise scatter matrix of the per-class prediction columns (training set), with tiny markers
sns.pairplot(data=pred_train[target_levels], plot_kws={'s': 1})
plt.show()

In [None]:
# correlation of target predictions
# pairwise correlation between the per-class prediction columns on the training set
pred_train[target_levels].corr()

In [None]:
# save result: training-set predictions as CSV, without the row index
# `index` is documented as a bool; pass False explicitly instead of relying on None being falsy
pred_train.to_csv('pred_train.csv', index=False)

In [None]:
# predicted class labels on the training set and how often each label was predicted
y_train_pred = pred_train['predict']
y_train_pred.value_counts()

In [None]:
# memory management
# drop the pandas predictions and remove the training frame on the H2O side;
# train_hex must not be used after this point
del pred_train
h2o.remove(train_hex)

In [None]:
# run a garbage collection pass after the deletions above
gc.collect()

### Predict on Test Set:

In [None]:
# predict on test data
# the H2O prediction frame is converted to a pandas DataFrame; later cells read its
# 'predict' column and one column per entry of target_levels
pred_test = fit_GBM.predict(test_hex).as_data_frame()
pred_test

In [None]:
# predicted class labels on the test set and how often each label was predicted
y_test_pred = pred_test['predict']
y_test_pred.value_counts()

In [None]:
# correlation of target predictions
# pairwise correlation between the per-class prediction columns on the test set
pred_test[target_levels].corr()

In [None]:
# pairwise scatter matrix of the per-class prediction columns (test set), with tiny markers
sns.pairplot(data=pred_test[target_levels], plot_kws={'s': 1})
plt.show()

### Prepare Submission:

In [None]:
# write the predicted labels into the submission frame
# Bracket assignment of the raw values is positional: it does not depend on the two
# indexes lining up, and a row-count mismatch raises instead of silently yielding NaN.
# NOTE(review): assumes predictions are in the same row order as the sample submission - confirm
df_sub['target'] = y_test_pred.to_numpy()
df_sub

In [None]:
# save submission data as CSV, without the row index
# `index` is documented as a bool; pass False explicitly instead of relying on None being falsy
df_sub.to_csv('submission_GBM.csv', index=False)

In [None]:
# save also probabilities: the full test prediction frame as CSV, without the row index
# `index` is documented as a bool; pass False explicitly instead of relying on None being falsy
pred_test.to_csv('pred_test.csv', index=False)