# Tabular Playground August - Regression using Tweedie loss function
## Table of Contents
* [Import Data / First Glance](#1)
* [EDA](#2)
* [Fit Linear Model](#3)
* [Evaluate GLM Model on Training Data](#4)
* [Build GLM Submission](#5)
* [Fit GBM Model](#6)
* [Evaluate GBM Model on Training Data](#7)
* [Build GBM Submission](#8)

In [None]:
# packages

# standard
import numpy as np
import pandas as pd
import time

# plots
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

# statistics tools
import scipy.stats as stats
from sklearn.metrics import mean_absolute_error, mean_squared_error

# machine learning tools
import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators import H2OGradientBoostingEstimator

<a id='1'></a>
# Import Data / First Glance

In [None]:
# Read the competition's train, test and sample-submission CSV files
# into pandas DataFrames (in that order).
df_train, df_test, df_sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-aug-2021/train.csv',
        '../input/tabular-playground-series-aug-2021/test.csv',
        '../input/tabular-playground-series-aug-2021/sample_submission.csv',
    )
)

In [None]:
# display descriptive statistics for the training DataFrame
df_train.describe()

In [None]:
# display descriptive statistics for the test DataFrame
df_test.describe()

In [None]:
# keep the number of training rows for later use, then display (rows, columns)
n_train = len(df_train)
df_train.shape

In [None]:
# keep the number of test rows, then display (rows, columns)
n_test = len(df_test)
df_test.shape

<a id='2'></a>
# EDA

In [None]:
# histogram of the target column `loss`, using 25 bins
plt.figure(figsize=(12, 4))
df_train['loss'].plot.hist(bins=25)
plt.title('Target - Histogram')
plt.grid()
plt.show()

In [None]:
# bar chart: how often each distinct target value occurs, ordered by value
target_counts = df_train['loss'].value_counts().sort_index()
plt.figure(figsize=(12, 4))
target_counts.plot.bar()
plt.title('Target - Discrete Distribution')
plt.grid()
plt.show()

In [None]:
# feature names: every column of the test set except 'id', as a plain list
features = df_test.columns.drop('id').to_list()

In [None]:
# Evaluate correlations with target.
# Builds one row per feature holding its Pearson and Spearman correlation
# with `loss`, each rounded to 4 decimals. The columns are constructed
# directly instead of pre-filling zeros and writing cell by cell through
# a hand-maintained row counter.
corr_stats = pd.DataFrame({
    'feature': features,
    'corr_pearson': [
        np.round(df_train[f].corr(df_train.loss, method='pearson'), 4)
        for f in features
    ],
    'corr_spearman': [
        np.round(df_train[f].corr(df_train.loss, method='spearman'), 4)
        for f in features
    ],
})

In [None]:
# show top correlations (positive):
# the 10 rows of corr_stats with the largest 'corr_pearson' values
corr_stats.nlargest(10, columns='corr_pearson')

In [None]:
# joint plot of feature f13 against the target, with a fitted regression
# line (magenta) drawn over nearly transparent scatter points
reg_style = {'line_kws': {'color': 'magenta'},
             'scatter_kws': {'alpha': 0.05}}
df_temp = pd.DataFrame({'Feature 13': df_train['f13'],
                        'Target': df_train['loss']})
sns.jointplot(x='Feature 13', y='Target', data=df_temp,
              kind='reg', joint_kws=reg_style)
plt.show()

In [None]:
# show top correlations (negative):
# the 10 rows of corr_stats with the smallest 'corr_pearson' values
corr_stats.nsmallest(10, columns='corr_pearson')

In [None]:
# joint plot of feature f25 against the target, with a fitted regression
# line (magenta) drawn over nearly transparent scatter points
reg_style = {'line_kws': {'color': 'magenta'},
             'scatter_kws': {'alpha': 0.05}}
df_temp = pd.DataFrame({'Feature 25': df_train['f25'],
                        'Target': df_train['loss']})
sns.jointplot(x='Feature 25', y='Target', data=df_temp,
              kind='reg', joint_kws=reg_style)
plt.show()

#### The correlations between individual features and the target are quite weak, so we should not expect a highly accurate model here.

<a id='3'></a>
# Fit Linear Model

In [None]:
# define target: name of the column to predict; passed to the model
# training calls and used to read the actual values back
target = 'loss'

In [None]:
# start H2O with explicit resource limits
h2o.init(
    max_mem_size='12G',  # use a maximum of 12 GB RAM
    nthreads=4,          # use 4 cores
)

In [None]:
# Convert both pandas DataFrames into H2OFrames and report how long
# the conversion took (wall-clock seconds).
t1 = time.time()
train_hex, test_hex = h2o.H2OFrame(df_train), h2o.H2OFrame(df_test)
t2 = time.time()
elapsed = t2 - t1
print('Elapsed time [s]:', np.round(elapsed, 4))

In [None]:
# GLM configuration: Tweedie family with 5-fold cross validation
n_cv = 5  # number of cross-validation folds

glm_params = {
    'family': 'tweedie',
    'tweedie_variance_power': 1.5,
    'nfolds': n_cv,
    'alpha': 0,  # 0: Ridge (L2), 1: LASSO (L1)
    'lambda_search': True,
    'score_each_iteration': True,
    'seed': 12345,
}
glm_model = H2OGeneralizedLinearEstimator(**glm_params)

In [None]:
# fit the GLM on the training frame and report the wall-clock training time
t1 = time.time()
glm_model.train(x=features, y=target, training_frame=train_hex)
t2 = time.time()
elapsed = t2 - t1
print('Elapsed time [s]:', np.round(elapsed, 4))

In [None]:
# show model details (the bare expression is rendered as the cell output)
glm_model

<a id='4'></a>
# Evaluate GLM Model on Training Data

In [None]:
# variable importance plot of the GLM, called with 25 as its argument
# (presumably the number of features to show - confirm against the H2O docs)
glm_model.varimp_plot(25)

In [None]:
# predict on training data
pred_train_glm = glm_model.predict(train_hex)
# actuals: select the target column inside H2O first, so that only this one
# column is converted to pandas instead of the whole training frame
y_train_act = train_hex[target].as_data_frame()[target].values
# predictions: the 'predict' column of the scoring result
y_train_pred_glm = pred_train_glm.as_data_frame()['predict'].values

In [None]:
# histogram (100 bins) of the GLM predictions on the training data
fig, ax = plt.subplots()
ax.hist(y_train_pred_glm, bins=100)
ax.set_title('Predictions on Training Data - GLM')
ax.grid()
plt.show()

In [None]:
# scatter of GLM prediction against actual value on the training data
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(y_train_act, y_train_pred_glm, alpha=0.1)
ax.plot([0, 45], [0, 45], color='green')  # reference line: prediction == actual
ax.set_aspect(1)  # same scale on both axes
ax.grid()
ax.set_title('Prediction GLM vs Actual - Training Data')
ax.set_xlabel('Actual')
ax.set_ylabel('Prediction')
plt.show()

#### The predictions barely follow the actual values, which suggests that the features carry very little signal.

In [None]:
# joint plot of actual vs. GLM prediction with a fitted regression line
reg_style = {'line_kws': {'color': 'magenta'},
             'scatter_kws': {'alpha': 0.1}}
df_temp = pd.DataFrame({'Actual': y_train_act,
                        'Prediction GLM': y_train_pred_glm})
sns.jointplot(x='Actual', y='Prediction GLM', data=df_temp,
              kind='reg', joint_kws=reg_style)
plt.show()

In [None]:
# yet another viz: joint KDE plot of actual vs. GLM prediction
# (df_temp must already hold the 'Actual' and 'Prediction GLM' columns)
sns.jointplot(data=df_temp, x='Actual', y='Prediction GLM',
              kind='kde')
plt.show()

In [None]:
# Pearson and Spearman correlation between actuals and GLM predictions
print('Correlations - Training Data')
for label, corr_fn in (('Correlation Pearson:', stats.pearsonr),
                       ('Correlation Spearman:', stats.spearmanr)):
    print(label, corr_fn(y_train_act, y_train_pred_glm))

In [None]:
# MAE and RMSE of the GLM on the training data, printed with 4 decimals
mae_train_glm = mean_absolute_error(y_train_act, y_train_pred_glm)
rmse_train_glm = np.sqrt(mean_squared_error(y_train_act, y_train_pred_glm))
print('MAE (train):', np.round(mae_train_glm, 4))
print('RMSE(train):', np.round(rmse_train_glm, 4))

In [None]:
# trivial benchmark: RMSE obtained when always predicting the target mean
m = y_train_act.mean()
residuals = y_train_act - m
RMSE_train_trivial = np.sqrt(np.dot(residuals, residuals) / n_train)
print('RMSE(train,trivial model):', np.round(RMSE_train_trivial, 4))

<a id='5'></a>
# Build GLM Submission

In [None]:
# score the test frame with the GLM and pull the result into pandas
pred_test_hex_glm = glm_model.predict(test_hex)
pred_test_glm = pred_test_hex_glm.as_data_frame()
y_test_pred_glm = pred_test_glm['predict'].values  # predictions as an array

In [None]:
# descriptive statistics of the GLM test predictions ('predict' column)
pred_test_glm.predict.describe()

In [None]:
# fill submission: work on a copy so the sample submission stays untouched
df_sub_glm = df_sub.copy()
# bracket assignment always writes the column, whereas attribute assignment
# only does so when the column already exists
df_sub_glm['loss'] = y_test_pred_glm
df_sub_glm

In [None]:
# and save result; `index` is documented as a bool, so pass False explicitly
# to leave the row index out of the file
df_sub_glm.to_csv('submission_GLM.csv', index=False)

### GLM/Tweedie - Public LB: 7.93925

<a id='6'></a>
# Fit GBM Model

In [None]:
# GBM configuration: Tweedie distribution, 5-fold cross validation and
# RMSE-based early stopping
n_cv = 5  # number of cross-validation folds

gbm_params = {
    'distribution': 'tweedie',
    'tweedie_power': 1.5,
    'ntrees': 50,
    'nfolds': n_cv,
    'max_depth': 9,
    'min_rows': 5,
    'learn_rate': 0.1,  # default: 0.1
    'sample_rate': 1,
    'col_sample_rate': 0.7,
    'score_each_iteration': True,
    'stopping_metric': 'RMSE',
    'stopping_rounds': 5,
    'stopping_tolerance': 0.0001,  # default 0.001
    'seed': 999,
}
gbm_model = H2OGradientBoostingEstimator(**gbm_params)

In [None]:
# fit the GBM on the training frame and report the wall-clock training time
t1 = time.time()
gbm_model.train(x=features, y=target, training_frame=train_hex)
t2 = time.time()
elapsed = t2 - t1
print('Elapsed time [s]:', np.round(elapsed, 4))

In [None]:
# show cross validation metrics: display the GBM's cross-validation
# metrics summary as the cell output
gbm_model.cross_validation_metrics_summary()

In [None]:
# show scoring history - training vs cross validations
# One figure per cross-validation model: training and validation RMSE
# plotted against the number of trees.
cv_models = gbm_model.cross_validation_models()  # fetched once, not per fold
for fold, cv_model in enumerate(cv_models, start=1):
    # scoring_history() replaces the deprecated score_history() alias
    history = cv_model.scoring_history()
    plt.scatter(history.number_of_trees,
                y=history.training_rmse,
                c='blue', label='training')
    plt.scatter(history.number_of_trees,
                y=history.validation_rmse,
                c='darkorange', label='validation')
    plt.title(f'CV {fold} - Scoring History [RMSE]')
    plt.xlabel('Number of Trees')
    plt.legend()
    plt.grid()
    plt.show()

<a id='7'></a>
# Evaluate GBM Model on Training Data

In [None]:
# variable importance plot of the GBM, called with 25 as its argument
# (presumably the number of features to show - confirm against the H2O docs)
gbm_model.varimp_plot(25)

In [None]:
# predict on training data
pred_train_gbm = gbm_model.predict(train_hex)
# actuals: select the target column inside H2O first, so that only this one
# column is converted to pandas instead of the whole training frame
y_train_act = train_hex[target].as_data_frame()[target].values
# predictions: the 'predict' column of the scoring result
y_train_pred_gbm = pred_train_gbm.as_data_frame()['predict'].values

In [None]:
# histogram (100 bins) of the GBM predictions on the training data
fig, ax = plt.subplots()
ax.hist(y_train_pred_gbm, bins=100)
ax.set_title('Predictions on Training Data - GBM')
ax.grid()
plt.show()

In [None]:
# scatter of GBM prediction against actual value on the training data
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(y_train_act, y_train_pred_gbm, alpha=0.1)
ax.plot([0, 45], [0, 45], color='green')  # reference line: prediction == actual
ax.set_aspect(1)  # same scale on both axes
ax.grid()
ax.set_title('Prediction GBM vs Actual - Training Data')
ax.set_xlabel('Actual')
ax.set_ylabel('Prediction')
plt.show()

In [None]:
# joint plot of actual vs. GBM prediction with a fitted regression line
reg_style = {'line_kws': {'color': 'magenta'},
             'scatter_kws': {'alpha': 0.1}}
df_temp = pd.DataFrame({'Actual': y_train_act,
                        'Prediction GBM': y_train_pred_gbm})
sns.jointplot(x='Actual', y='Prediction GBM', data=df_temp,
              kind='reg', joint_kws=reg_style)
plt.show()

In [None]:
# yet another viz: joint KDE plot of actual vs. GBM prediction
# (df_temp must already hold the 'Actual' and 'Prediction GBM' columns)
sns.jointplot(data=df_temp, x='Actual', y='Prediction GBM',
              kind='kde')
plt.show()

In [None]:
# Pearson and Spearman correlation between actuals and GBM predictions
print('Correlations - Training Data')
for label, corr_fn in (('Correlation Pearson:', stats.pearsonr),
                       ('Correlation Spearman:', stats.spearmanr)):
    print(label, corr_fn(y_train_act, y_train_pred_gbm))

In [None]:
# MAE and RMSE of the GBM on the training data, printed with 4 decimals
mae_train_gbm = mean_absolute_error(y_train_act, y_train_pred_gbm)
rmse_train_gbm = np.sqrt(mean_squared_error(y_train_act, y_train_pred_gbm))
print('MAE (train):', np.round(mae_train_gbm, 4))
print('RMSE(train):', np.round(rmse_train_gbm, 4))

<a id='8'></a>
# Build GBM Submission

In [None]:
# score the test frame with the GBM and pull the result into pandas
pred_test_hex_gbm = gbm_model.predict(test_hex)
pred_test_gbm = pred_test_hex_gbm.as_data_frame()
y_test_pred_gbm = pred_test_gbm['predict'].values  # predictions as an array

In [None]:
# descriptive statistics of the GBM test predictions ('predict' column)
pred_test_gbm.predict.describe()

In [None]:
# fill submission: work on a copy so the sample submission stays untouched
df_sub_gbm = df_sub.copy()
# bracket assignment always writes the column, whereas attribute assignment
# only does so when the column already exists
df_sub_gbm['loss'] = y_test_pred_gbm
df_sub_gbm

In [None]:
# and save result; `index` is documented as a bool, so pass False explicitly
# to leave the row index out of the file
df_sub_gbm.to_csv('submission_GBM.csv', index=False)