# Table of Contents
* [Target Exploration](#1)
* [Feature Engineering](#2)
* [Features EDA](#3)
* [Build GBM Model](#4)
* [Evaluate Model](#5)

In [None]:
# packages

# standard
import numpy as np
import pandas as pd
import time

# plots
import matplotlib.pyplot as plt
import seaborn as sns

# machine learning tools
import h2o
from h2o.estimators import H2OGeneralizedLinearEstimator
from h2o.estimators import H2ORandomForestEstimator
from h2o.estimators import H2OGradientBoostingEstimator

In [None]:
# Render every column when a data frame is displayed (no column truncation).
pd.options.display.max_columns = None

In [None]:
# Read the three competition CSV files (train, test, sample submission)
# and report how long the load took.
load_started = time.time()
df_train, df_test, df_sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-sep-2021/train.csv',
        '../input/tabular-playground-series-sep-2021/test.csv',
        '../input/tabular-playground-series-sep-2021/sample_solution.csv',
    )
)
load_elapsed = time.time() - load_started
print('Elapsed Time[s]:', np.round(load_elapsed, 4))

In [None]:
# first glance (training data): display the leading rows returned by head()
df_train.head()

In [None]:
# Print (rows, columns) for both data sets.
for set_label, frame in (('Train Set:', df_train), ('Test Set :', df_test)):
    print(set_label, frame.shape)

In [None]:
# per-column summary of the training frame: full column listing (verbose)
# including the non-null count of each column (show_counts)
df_train.info(verbose=True, show_counts=True)

In [None]:
# same per-column summary (full listing + non-null counts) for the test frame
df_test.info(verbose=True, show_counts=True)

<a id='1'></a>
# Target Exploration

In [None]:
# Bar chart of how often each value of the target `claim` occurs in the training data.
ax = df_train['claim'].value_counts().plot.bar()
ax.set_title('Target: claim')
ax.grid()
plt.show()

<a id='2'></a>
# Feature Engineering
### Credits to https://www.kaggle.com/c/tabular-playground-series-sep-2021/discussion/270206

In [None]:
# Per-row number of missing values, used as an engineered feature.
# Only columns present in BOTH frames are counted, so the feature is defined
# identically for train and test and can never pick up missing values from a
# train-only column such as the target. 'nan_count' itself is excluded so that
# re-running this cell does not count the previously computed feature column.
shared_cols = [col for col in df_train.columns
               if col in df_test.columns and col != 'nan_count']
df_train['nan_count'] = df_train[shared_cols].isnull().sum(axis=1)
df_test['nan_count'] = df_test[shared_cols].isnull().sum(axis=1)

In [None]:
# Frequency of each nan_count value in the training data: printed, then plotted as bars.
nan_counts_train = df_train['nan_count'].value_counts()
print(nan_counts_train)
ax = nan_counts_train.plot.bar()
ax.set_title('NaN count - Training')
ax.grid()
plt.show()

In [None]:
# Frequency of each nan_count value in the test data: printed, then plotted as bars.
nan_counts_test = df_test['nan_count'].value_counts()
print(nan_counts_test)
ax = nan_counts_test.plot.bar()
ax.set_title('NaN count - Test')
ax.grid()
plt.show()

### Show impact of nan_count on target:

In [None]:
# Contingency table of absolute counts: target class (rows) by nan_count (columns).
ctab = pd.crosstab(index=df_train['claim'], columns=df_train['nan_count'])
ctab

In [None]:
# Divide every column by its column total, turning counts into
# within-column relative frequencies.
ctab_norm = ctab.div(ctab.sum(axis=0), axis='columns')
ctab_norm

In [None]:
# Stacked bars: for each nan_count value, the first row of ctab_norm at the
# bottom and the second row stacked on top of it.
fig, ax = plt.subplots(figsize=(14,5))
share_row0 = ctab_norm.iloc[0]
share_row1 = ctab_norm.iloc[1]
bars_row0 = ax.bar(ctab_norm.columns, share_row0)
bars_row1 = ax.bar(ctab_norm.columns, share_row1, bottom=share_row0)
ax.set_xlabel('nan_count')
ax.set_ylabel('Relative Frequency of Target Classes')
ax.set_title('Target vs nan_count')
ax.legend((bars_row0[0], bars_row1[0]), ('0', '1'))
ax.grid()
plt.show()

<a id='3'></a>
# Features EDA

In [None]:
# Numerical model features: the raw columns f1 ... f118 followed by the
# engineered nan_count column.
features_num = ['f' + str(idx) for idx in range(1, 119)] + ['nan_count']

In [None]:
# basic stats: describe() summary for every column listed in features_num
df_train[features_num].describe()

### Target vs Feature

In [None]:
# cross table - target vs example feature
# (the feature name is kept in a variable so it can be swapped in one place)
example_feature = 'f48'
# discretize feature first (quantile based to achieve balanced bucket sizes)
# NOTE: the 10 quantile buckets are written to a helper column 'temp' on
# df_train itself, i.e. this cell modifies the training frame in place.
df_train['temp'] = pd.qcut(df_train[example_feature],10)

In [None]:
# Count target classes per bucket of the discretized example feature ...
ctab = pd.crosstab(index=df_train['claim'], columns=df_train['temp'])
# ... then rescale each bucket (column) by its total to get relative frequencies.
ctab_norm = ctab.div(ctab.sum(axis=0), axis='columns')
ctab_norm

In [None]:
# Stacked bars per feature bucket: first row of ctab_norm at the bottom,
# second row stacked on top.
bucket_labels = [str(interval) for interval in ctab_norm.columns]  # intervals -> strings for the x axis
share_row0 = ctab_norm.iloc[0].values
share_row1 = ctab_norm.iloc[1].values

fig, ax = plt.subplots(figsize=(14,5))
bars_row0 = ax.bar(bucket_labels, share_row0)
bars_row1 = ax.bar(bucket_labels, share_row1, bottom=share_row0)
ax.set_xlabel('')
ax.set_ylabel('Relative Frequency of Target Classes')
ax.set_title('Target vs ' + example_feature)
ax.legend((bars_row0[0], bars_row1[0]), ('0', '1'))
plt.xticks(rotation=90)
ax.grid()
plt.show()

<a id='4'></a>
# Build GBM Model

In [None]:
# start H2O
# The resource limits are hard-coded here; adjust them to the machine running the notebook.
h2o.init(max_mem_size='12G', nthreads=4) # Use maximum of 12 GB RAM and 4 cores

In [None]:
# upload data frames in H2O environment
# The EDA helper column 'temp' (quantile buckets of one example feature) is not
# a model input, so it is left out of the upload. errors='ignore' keeps this
# cell working when that column was never created.
t1 = time.time()
train_hex = h2o.H2OFrame(df_train.drop(columns=['temp'], errors='ignore'))
test_hex = h2o.H2OFrame(df_test)
t2 = time.time()
print('Elapsed time[s]: ', np.round(t2-t1,4))

In [None]:
# force categorical target: convert the claim column of the H2O frame to a factor
# (presumably so that H2O fits a classifier rather than a regression - confirm)
train_hex['claim'] = train_hex['claim'].asfactor()

In [None]:
# define predictors and target
# NOTE: predictors refers to the same list object as features_num (no copy is made)
predictors = features_num
target = 'claim'

In [None]:
# GBM set-up: number of cross-validation folds and the estimator hyper-parameters
n_cv = 5
gbm_params = dict(
    ntrees=300,
    learn_rate=0.05,
    max_depth=6,
    min_rows=5,
    sample_rate=0.5,      # sample rows
    col_sample_rate=0.5,  # sample columns
    nfolds=n_cv,
    score_each_iteration=True,
    # early stopping is currently switched off:
    #   stopping_metric='auc', stopping_rounds=5,
    #   stopping_tolerance=0.001 (default 0.001)
    seed=999,
)
fit_1 = H2OGradientBoostingEstimator(**gbm_params)

# fit on the uploaded training frame and measure the wall-clock time
train_started = time.time()
fit_1.train(x=predictors, y=target, training_frame=train_hex)
train_elapsed = time.time() - train_started

print('Elapsed time [s]: ', np.round(train_elapsed, 2))

In [None]:
# scoring history (training): plot produced by the fitted model's own plot() method
fit_1.plot()

In [None]:
# show scoring history - training vs validation AUC, one figure per cross-validation model
# The list of CV models is fetched once (not once per iteration) and the loop
# runs over the models actually returned instead of assuming a fixed count.
cv_models = fit_1.cross_validation_models()
for fold_no, cv_model_temp in enumerate(cv_models, start=1):
    # scoring_history() is the current name; score_history() is a deprecated alias of it
    df_cv_score_history = cv_model_temp.scoring_history()
    my_title = 'CV ' + str(fold_no) + ' - Scoring History'
    plt.scatter(df_cv_score_history.number_of_trees,
                y=df_cv_score_history.training_auc,
                c='blue', label='training')
    # fixed y-range so that the per-fold figures are directly comparable
    plt.ylim(0.75,0.85)
    plt.scatter(df_cv_score_history.number_of_trees,
                y=df_cv_score_history.validation_auc,
                c='darkorange', label='validation')
    plt.title(my_title)
    plt.xlabel('Number of Trees')
    plt.legend()
    plt.grid()
    plt.show()

<a id='5'></a>
# Evaluate Model

In [None]:
# cross validation metrics: summary table reported by the fitted model
fit_1.cross_validation_metrics_summary()

In [None]:
# training performance: metrics requested for the training data (train=True),
# followed by the plot of that performance object
perf_train = fit_1.model_performance(train=True)
perf_train.plot()

In [None]:
# CV performance: metrics requested for the cross-validation results (xval=True),
# followed by the plot of that performance object
perf_cv = fit_1.model_performance(xval=True)
perf_cv.plot()

In [None]:
# variable importance plot; the argument 25 presumably limits it to the top 25 features - confirm
fit_1.varimp_plot(25)

In [None]:
# predict on training data (in-sample predictions on the frame the model was fitted on)
pred_train = fit_1.predict(train_hex)
# add actual target
pred_train['target'] = train_hex[target]
# bring the H2O frame back into pandas for the checks below
pred_train = pred_train.as_data_frame()
pred_train.head()

In [None]:
# check calibration: compare the number of actual positives with the sum of
# predicted probabilities on the training data.
# The vectorized Series.sum() is used instead of the builtin sum(), which would
# iterate over every element in Python and is needlessly slow on long columns.
n_actual = df_train['claim'].sum()
n_pred = pred_train['p1'].sum()
n_train = df_train.shape[0]

print('Actual Frequency    :', n_actual)
print('Predicted Frequency :', n_pred)
print('Calibration Ratio   :', n_pred / n_actual)
print('Train Set Size      :', n_train)
print('Predicted Ratio     :', n_pred/n_train)

In [None]:
# Histogram (50 bins) of the p1 predictions on the training data.
fig, ax = plt.subplots(figsize=(8,4))
ax.hist(pred_train['p1'], bins=50)
ax.set_title('Predictions on Train Set')
ax.grid()
plt.show()

In [None]:
# predict on test set and convert the resulting H2O frame to pandas
pred_test = fit_1.predict(test_hex)
pred_test = pred_test.as_data_frame()
pred_test.head()

In [None]:
# Histogram (50 bins) of the p1 predictions on the test data.
fig, ax = plt.subplots(figsize=(8,4))
ax.hist(pred_test['p1'], bins=50)
ax.set_title('Predictions on Test Set')
ax.grid()
plt.show()

In [None]:
# Frequency check on the test set: sum of predicted probabilities (rounded to
# two decimals) compared with the number of test rows.
n_test = len(df_test)
n_test_pred = np.round(pred_test['p1'].sum(), 2)
print('Predicted count:', n_test_pred)
print('Test Set Size:',n_test)
print('Predicted Ratio:', n_test_pred/n_test)

In [None]:
# submission
# Start from the sample submission and overwrite its claim column with the
# predicted probabilities.
df_sub_1 = df_sub.copy()
# Guard against a silent mismatch between predictions and submission rows.
assert len(pred_test) == len(df_sub_1), 'number of predictions does not match submission rows'
# Bracket assignment always writes a column (attribute-style assignment would
# only set a Python attribute if 'claim' were missing), and to_numpy() assigns
# by position so no index alignment can introduce NaNs.
df_sub_1['claim'] = pred_test['p1'].to_numpy()
display(df_sub_1.head())
# save to file
df_sub_1.to_csv('submission_1.csv', index=False)