# Table of Contents
* [Import and first checks](#Import)
* [Target](#Target)
* [Features](#Features)
* [Model - GLM](#Model_GLM)
* [Model - Gradient Boosting](#Model_GBM)
* [Predict on Test Set + Submissions](#PredTest)

In [None]:
# packages

# standard
import numpy as np
import pandas as pd
import time
import gc

# plot
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.graphics.mosaicplot import mosaic

# machine learning tools
import h2o
from h2o.estimators import H2OGeneralizedLinearEstimator, H2ORandomForestEstimator, H2OGradientBoostingEstimator

In [None]:
# never truncate the column list when a DataFrame is displayed
pd.options.display.max_columns = None

<a id='Import'></a>
# Import and first checks

In [None]:
# read the train, test and sample-submission CSVs (in that order) and
# report the wall-clock time the three reads took (this takes some time)
t1 = time.time()
df_train, df_test, df_sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-nov-2021/train.csv',
        '../input/tabular-playground-series-nov-2021/test.csv',
        '../input/tabular-playground-series-nov-2021/sample_submission.csv',
    )
)
t2 = time.time()
print('Elapsed time [s]:', np.round(t2 - t1, 4))

In [None]:
# (rows, columns) of the train and test frames
for set_label, frame in (('Train Set:', df_train), ('Test Set :', df_test)):
    print(set_label, frame.shape)

In [None]:
# structure of the training frame: full per-column summary (verbose=True)
# including the non-null count of every column (show_counts=True)
df_train.info(verbose=True, show_counts=True)

In [None]:
# structure of the test frame: full per-column summary with non-null counts
df_test.info(verbose=True, show_counts=True)

<a id='Target'></a>
# Target

In [None]:
# basic stats: frequency of each target class.
# value_counts() indexes the counts by class label; sort_index() orders them
# by label. The labels are kept (no reset_index(drop=True)) so the printout
# and the bar chart show which class each count belongs to.
target_counts = df_train['target'].value_counts().sort_index()
print(target_counts)
target_counts.plot(kind='bar')
plt.grid()
plt.show()

#### => Nicely balanced target!

### Impact of Features on Target (examples):

In [None]:
# plot target vs BINNED numerical features using mosaic plots
for f in ['f34','f55']:
    new_var = f + '_bin'
    # small two-column frame holding the feature cut into 8 quantile bins and
    # the target; df_train itself is left untouched, so there are no temporary
    # columns to remove afterwards (and no copy of the full frame is made)
    df_mosaic = pd.DataFrame({new_var: pd.qcut(df_train[f], 8),
                              'target': df_train['target']})
    # enlarge the figure for the mosaic only; rc_context restores the previous
    # plot settings when the block exits, even if plotting raises
    with plt.rc_context({'figure.figsize': (16,6)}):
        mosaic(df_mosaic, [new_var, 'target'], title='Target vs ' + f + ' [binned]')
        plt.show()

<a id='Features'></a>
# Features

In [None]:
# every column except the 'id' and 'target' columns is treated as a feature
features_num = df_train.columns.drop(['id', 'target']).tolist()

In [None]:
# summary statistics of all feature columns in the training set
# (rendered as the cell output)
df_train[features_num].describe()

In [None]:
# boxplots of the training columns, one figure per group of n_plot_cols columns
n_plot_rows = 10
n_plot_cols = 10
for i in range(n_plot_rows):
    # positional bounds of this group; the +1 skips column 0
    # (presumably the 'id' column - verify against the frame layout)
    first_col = n_plot_cols * i + 1
    last_col = first_col + n_plot_cols - 1
    print('Columns', first_col, 'to', last_col)
    column_group = df_train.iloc[:, first_col:last_col + 1]
    column_group.plot(kind='box', figsize=(15,5))
    plt.xticks(rotation=90)
    plt.grid()
    plt.show()

### Correlations:

In [None]:
# Pearson correlation matrix of the features, once for train and once for test
corr_pearson_train, corr_pearson_test = (
    frame[features_num].corr(method='pearson')
    for frame in (df_train, df_test)
)

In [None]:
# heatmap of the training-set correlation matrix, colour scale fixed to [-1, 1]
fig = plt.figure(figsize=(12, 10))
ax = sns.heatmap(corr_pearson_train, annot=False, cmap='RdYlGn', vmin=-1, vmax=1)
ax.set_title('Pearson Correlation - Train')
plt.show()

In [None]:
# heatmap of the test-set correlation matrix, colour scale fixed to [-1, 1]
fig = plt.figure(figsize=(12, 10))
ax = sns.heatmap(corr_pearson_test, annot=False, cmap='RdYlGn', vmin=-1, vmax=1)
ax.set_title('Pearson Correlation - Test')
plt.show()

<a id='Model_GLM'></a>
# Model - GLM

In [None]:
# start H2O; the frames and models in the following cells live in this H2O instance
h2o.init(max_mem_size='12G', nthreads=4) # define maximum memory usage and number of cores

In [None]:
# upload training data in H2O environment and report how long the
# pandas -> H2OFrame conversion took
t1 = time.time()
train_hex = h2o.H2OFrame(df_train) # use all data (no row sampling)
t2 = time.time()
print('Elapsed time [s]: ', np.round(t2-t1,4))

In [None]:
# force categorical target: replace the 'target' column of the H2O frame
# by its factor (categorical) version
train_hex['target'] = train_hex['target'].asfactor()

In [None]:
# upload test data in H2O environment and report how long the
# pandas -> H2OFrame conversion took
t1 = time.time()
test_hex = h2o.H2OFrame(df_test) # use all data (no row sampling)
t2 = time.time()
print('Elapsed time [s]: ', np.round(t2-t1,4))

In [None]:
# model inputs (all extracted features) and the name of the response column
predictors, target = features_num, 'target'

In [None]:
# define GLM (binomial family) with n_cv-fold cross validation
n_cv = 5

glm_settings = {
    'family': 'binomial',
    'nfolds': n_cv,
    'alpha': 0,  # 0: Ridge (L2), 1: LASSO (L1)
    'lambda_search': True,
    'score_each_iteration': True,
    'seed': 12345,
}
fit_GLM = H2OGeneralizedLinearEstimator(**glm_settings)

In [None]:
# fit the GLM on the full training frame and report the elapsed time
t1 = time.time()
fit_GLM.train(x=predictors, y=target, training_frame=train_hex)
t2 = time.time()
print('Elapsed time [s]:', np.round(t2 - t1, 4))

In [None]:
# show model details (the bare expression is rendered as the cell output)
fit_GLM

In [None]:
# show cross validation metrics of the GLM (summary table as cell output)
fit_GLM.cross_validation_metrics_summary()

In [None]:
# variable importance plot of the GLM
fit_GLM.varimp_plot()

In [None]:
# training performance of the GLM: fetch the metrics computed on the
# training data and plot them with plot()'s default settings
perf_train = fit_GLM.model_performance(train=True)
perf_train.plot()

In [None]:
# cross validation performance of the GLM: fetch the cross-validated
# metrics and plot them with plot()'s default settings
perf_cv = fit_GLM.model_performance(xval=True)
perf_cv.plot()

In [None]:
# predict on train set with the GLM and keep only the probability column
# 'p1', converted to a pandas Series
pred_train_GLM = fit_GLM.predict(train_hex)['p1'].as_data_frame()['p1']

# histogram of the GLM train set predictions (probabilities);
# the title names the GLM (it previously said "GBM" by copy/paste mistake)
plt.figure(figsize=(8,4))
plt.hist(pred_train_GLM, bins=100)
plt.title('Predictions on Train Set - GLM')
plt.grid()
plt.show()

In [None]:
# calibration: compare the sum of the actual target values with the sum of
# the predicted probabilities (assumes a 0/1 target, so the first sum is the
# number of positives - TODO confirm).
# Series.sum() is vectorised; the builtin sum() would iterate the large
# Series element by element in the interpreter.
n_actual = df_train['target'].sum()
n_pred_GLM = pred_train_GLM.sum()

print('Actual Frequency    :', n_actual)
print('Predicted Frequency :', n_pred_GLM)
print('Calibration Ratio   :', n_pred_GLM / n_actual)

<a id='Model_GBM'></a>
# Model - Gradient Boosting

In [None]:
# fit Gradient Boosting model with n_cv-fold cross validation and
# AUC-based early stopping
n_cv = 5

gbm_settings = {
    'ntrees': 250,
    'max_depth': 6,
    'min_rows': 10,
    'learn_rate': 0.1,  # default: 0.1
    'sample_rate': 1,
    'col_sample_rate': 0.5,
    'nfolds': n_cv,
    'score_each_iteration': True,
    'stopping_metric': 'auc',
    'stopping_rounds': 5,
    'stopping_tolerance': 0.0001,
    'seed': 999,
}
fit_GBM = H2OGradientBoostingEstimator(**gbm_settings)

# train model on the full training frame and report the elapsed time
t1 = time.time()
fit_GBM.train(x=predictors, y='target', training_frame=train_hex)
t2 = time.time()
print('Elapsed time [s]: ', np.round(t2 - t1, 2))

In [None]:
# show cross validation metrics of the GBM (summary table as cell output)
fit_GBM.cross_validation_metrics_summary()

In [None]:
# show scoring history - training vs validation AUC per number of trees,
# one figure for each cross validation model
cv_models = fit_GBM.cross_validation_models()  # fetch the list once, not per fold
for fold_no, cv_model in enumerate(cv_models, start=1):
    # scoring_history() is the current accessor; score_history() is only a
    # deprecated alias for it in the H2O Python API
    history = cv_model.scoring_history()
    plt.scatter(history.number_of_trees,
                y=history.training_auc,
                c='blue', label='training')
    plt.scatter(history.number_of_trees,
                y=history.validation_auc,
                c='darkorange', label='validation')
    plt.title('CV ' + str(fold_no) + ' - Scoring History [AUC]')
    plt.xlabel('Number of Trees')
    plt.ylabel('AUC')
    plt.ylim(0.4,0.8)  # fixed y-range so the folds can be compared visually
    plt.legend()
    plt.grid()
    plt.show()

In [None]:
# variable importance plot of the GBM
fit_GBM.varimp_plot()

In [None]:
# alternative variable importance using SHAP => see direction as well as severity of feature impact
# (timed; the trailing semicolon suppresses the notebook's echo of the return value)
t1 = time.time()
fit_GBM.shap_summary_plot(train_hex);
t2 = time.time()
print('Elapsed time [s]: ', np.round(t2-t1,2))

In [None]:
# training performance of the GBM: fetch the metrics computed on the
# training data and plot them with plot()'s default settings
perf_train = fit_GBM.model_performance(train=True)
perf_train.plot()

In [None]:
# cross validation performance of the GBM: fetch the cross-validated
# metrics and plot them with plot()'s default settings
perf_cv = fit_GBM.model_performance(xval=True)
perf_cv.plot()

In [None]:
# score the training frame with the GBM and keep only the probability
# column 'p1' as a pandas Series
pred_train_GBM = fit_GBM.predict(train_hex)['p1'].as_data_frame()['p1']

# histogram of the predicted probabilities on the train set
plt.figure(figsize=(8, 4))
plt.hist(pred_train_GBM, bins=100)
plt.title('Predictions on Train Set - GBM')
plt.grid()
plt.show()

In [None]:
# calibration: compare the sum of the actual target values with the sum of
# the predicted probabilities (assumes a 0/1 target, so the first sum is the
# number of positives - TODO confirm).
# Series.sum() is vectorised; the builtin sum() would iterate the large
# Series element by element in the interpreter.
n_actual = df_train['target'].sum()
n_pred_GBM = pred_train_GBM.sum()

print('Actual Frequency    :', n_actual)
print('Predicted Frequency :', n_pred_GBM)
print('Calibration Ratio   :', n_pred_GBM / n_actual)

<a id='PredTest'></a>
# Predict on Test Set + Submissions

In [None]:
# score the test frame with the GLM and keep only the probability
# column 'p1' as a pandas Series
pred_test_GLM = fit_GLM.predict(test_hex)['p1'].as_data_frame()['p1']

# histogram of the predicted probabilities on the test set
plt.figure(figsize=(8, 4))
plt.hist(pred_test_GLM, bins=100)
plt.title('Predictions on Test Set - GLM')
plt.grid()
plt.show()

In [None]:
# GLM submission: copy of the sample submission with 'target' replaced by
# the GLM test predictions; preview it, then write it to disk
df_sub_GLM = df_sub.assign(target=pred_test_GLM)
display(df_sub_GLM.head())
df_sub_GLM.to_csv('submission_GLM.csv', index=False)

In [None]:
# score the test frame with the GBM and keep only the probability
# column 'p1' as a pandas Series
pred_test_GBM = fit_GBM.predict(test_hex)['p1'].as_data_frame()['p1']

# histogram of the predicted probabilities on the test set
plt.figure(figsize=(8, 4))
plt.hist(pred_test_GBM, bins=100)
plt.title('Predictions on Test Set - GBM')
plt.grid()
plt.show()

In [None]:
# GBM submission: copy of the sample submission with 'target' replaced by
# the GBM test predictions; preview it, then write it to disk
df_sub_GBM = df_sub.assign(target=pred_test_GBM)
display(df_sub_GBM.head())
df_sub_GBM.to_csv('submission_GBM.csv', index=False)

### Blend

In [None]:
# joint plot (semi-transparent scatter plus marginal distributions) of the
# GLM vs GBM test predictions
df_temp = pd.concat(
    [df_sub_GLM.target.rename('Pred_GLM'),
     df_sub_GBM.target.rename('Pred_GBM')],
    axis=1,
)
sns.jointplot(data=df_temp, x='Pred_GLM', y='Pred_GBM',
              joint_kws=dict(alpha=0.1))
plt.show()

In [None]:
# blend submission: weighted average of the two model predictions,
# with weight w_GLM on the GLM and (1 - w_GLM) on the GBM
w_GLM = 0.65
blended_target = w_GLM * df_sub_GLM.target + (1-w_GLM) * df_sub_GBM.target
df_sub_blend = df_sub.assign(target=blended_target)
display(df_sub_blend.head())
# write the blended predictions to disk
df_sub_blend.to_csv('submission_blend.csv', index=False)