In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# packages

# standard
import numpy as np
import pandas as pd
import time

# plots
import matplotlib.pyplot as plt
import seaborn as sns

# machine learning tools
import h2o
from h2o.estimators import H2ORandomForestEstimator
from h2o.estimators import H2OGradientBoostingEstimator

In [None]:
# Read the three competition CSV files: training set, test set and the
# sample submission (in that order).
df_train, df_test, df_sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-feb-2021/train.csv',
        '../input/tabular-playground-series-feb-2021/test.csv',
        '../input/tabular-playground-series-feb-2021/sample_submission.csv',
    )
)

# Preview the first rows of the training data.
df_train.head()

In [None]:


# dimensions of the training frame as a (rows, columns) tuple
df_train.shape



In [None]:
# concise per-column summary of the training frame (DataFrame.info)
df_train.info()

## Target Value

In [None]:
# summary statistics of the target, reporting the 10th, 25th, 50th, 75th
# and 90th percentiles
df_train.target.describe(percentiles=[0.1,0.25,0.5,0.75,0.9])


In [None]:
# Distribution of the target, drawn as a 100-bin histogram.
ax = df_train['target'].plot.hist(bins=100)
ax.set_title('Target - Histogram')
ax.grid()
plt.show()

In [None]:
# Box plot of the target - used to spot outliers.
ax = df_train['target'].plot.box()
ax.set_title('Target - Boxplot')
ax.grid()
plt.show()


In [None]:
# Rows whose target is exactly zero, displayed for inspection.
is_zero_target = df_train['target'] == 0
df_zero = df_train.loc[is_zero_target]
df_zero


In [None]:
# Keep only the rows with a strictly positive target (this drops the
# zero-target observation), then re-check the target's summary statistics.
has_positive_target = df_train['target'] > 0
df_train = df_train.loc[has_positive_target]
df_train['target'].describe()


In [None]:
# Names of the numerical (continuous) feature columns.
features_num = [
    'cont0', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6',
    'cont7', 'cont8', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13',
]

# One 100-bin histogram per numerical feature, each in its own figure.
for num_col in features_num:
    fig, ax = plt.subplots(figsize=(8, 4))
    df_train[num_col].plot.hist(bins=100, ax=ax)
    ax.set_title(num_col)
    ax.grid()
    plt.show()



# Correlation

In [None]:
# Pairwise correlation matrices of the numerical features. Both are computed
# here; only the Pearson matrix is drawn by this block.
numeric_part = df_train[features_num]
corr_pearson = numeric_part.corr(method='pearson')
corr_spearman = numeric_part.corr(method='spearman')

# Annotated heatmap on a fixed [-1, +1] colour scale.
fig, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corr_pearson, annot=True, cmap='RdYlGn', vmin=-1, vmax=+1, ax=ax)
ax.set_title('Pearson Correlation')
plt.show()



In [None]:
# Annotated heatmap of the Spearman correlation matrix on a fixed [-1, +1]
# colour scale.
fig, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corr_spearman, annot=True, cmap='RdYlGn', vmin=-1, vmax=+1, ax=ax)
ax.set_title('Spearman Correlation')
plt.show()


In [None]:
# Joint plot for one feature pair; the low alpha keeps dense regions readable.
# NOTE(review): the pair is hard-coded as the one with the highest Pearson
# correlation - confirm against the correlation heatmap.
scatter_opts = {'alpha': 0.1}
sns.jointplot(x='cont5', y='cont12', data=df_train, joint_kws=scatter_opts)
plt.show()

In [None]:
# Same feature pair as a joint plot of kind 'hex' instead of individual points.
hex_plot_args = dict(x='cont5', y='cont12', kind='hex')
sns.jointplot(data=df_train, **hex_plot_args)
plt.show()


# Categorical Features

In [None]:
# Names of the categorical feature columns.
features_cat = [
    'cat0', 'cat1', 'cat2', 'cat3', 'cat4',
    'cat5', 'cat6', 'cat7', 'cat8', 'cat9',
]

# One bar chart of level frequencies per categorical feature.
for cat_col in features_cat:
    fig, ax = plt.subplots(figsize=(8, 4))
    level_counts = df_train[cat_col].value_counts()
    level_counts.plot.bar(ax=ax)
    ax.set_title(cat_col)
    ax.grid()
    plt.show()



# Target vs. Features

In [None]:
# One scatter plot of the target against each numerical feature; the title
# carries the Pearson correlation rounded to 4 decimals.
for num_col in features_num:
    pearson_r = np.round(df_train[num_col].corr(df_train['target'], method='pearson'), 4)
    fig, ax = plt.subplots(figsize=(7, 7))
    ax.scatter(df_train[num_col], df_train['target'], alpha=0.01)
    ax.set_title('Target vs ' + num_col + ' / corr = ' + str(pearson_r))
    ax.set_xlabel(num_col)
    ax.set_ylabel('Target')
    ax.grid()
    plt.show()

In [None]:
# Box plot of the target within 10 equal-width bins of each numerical feature.
#
# The binned values are kept in a local Series instead of being written back
# into df_train: adding 14 interval-valued '*_bin' columns to the training
# frame would carry exploratory helper columns into every later step that
# consumes df_train as a whole (e.g. the upload to H2O).
for f in features_num:
    # The Series is named '<feature>_bin' so the x-axis label is unchanged.
    binned = pd.cut(df_train[f], bins=10, include_lowest=True).rename(f + '_bin')
    plt.figure(figsize=(7,7))
    sns.boxplot(x=binned, y=df_train['target'])
    plt.xticks(rotation=90)  # interval labels are long; rotate to avoid overlap
    plt.grid()
    plt.show()

In [None]:


# Box plot of the target for every level of each categorical feature.
for cat_col in features_cat:
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.boxplot(x=cat_col, y='target', data=df_train, ax=ax)
    plt.xticks(rotation=90)
    ax.grid()
    plt.show()



# Model Building

In [None]:
# Mean of the target: the constant value used as a trivial prediction.
m0 = df_train['target'].mean()
print('Mean of target:', np.round(m0, 6))

In [None]:


# Training RMSE of the trivial benchmark that always predicts the mean m0.
residuals = df_train['target'] - m0        # target minus constant prediction
mse_trivial = np.square(residuals).mean()  # mean squared error
print('RMSE(train) - Trivial Benchmark: ', np.round(np.sqrt(mse_trivial), 6))



In [None]:
# Model inputs: all numerical features followed by all categorical features.
predictors = [*features_num, *features_cat]
print('Number of predictors: ', len(predictors))
print(predictors)

In [None]:


# start H2O - the frames and the model below are created through it;
# the arguments cap its resources at 12 GB of memory and 4 threads
h2o.init(max_mem_size='12G', nthreads=4) # Use maximum of 12 GB RAM and 4 cores



In [None]:
# upload training and test data in H2O environment
#
# Only the columns the model needs are transferred: the predictors, plus the
# target for the training frame. Uploading the whole data frames would also
# ship any non-predictor columns that earlier or re-run cells attached to
# df_train (e.g. interval-valued '*_bin' helper columns or a 'prediction'
# column), which wastes transfer time and H2O memory and makes the uploaded
# frame depend on cell execution order.
t1 = time.time()
train_hex = h2o.H2OFrame(df_train[predictors + ['target']])
test_hex = h2o.H2OFrame(df_test[predictors])
t2 = time.time()
print('Elapsed time [s]: ', np.round(t2-t1,2))


In [None]:
# Hyper-parameters of the Gradient Boosting model.
gbm_params = dict(
    ntrees=1000,
    max_depth=9,
    min_rows=1,
    learn_rate=0.01,  # default: 0.1
    sample_rate=1,
    col_sample_rate=0.7,
    nfolds=5,
    seed=999,
)
fit_1 = H2OGradientBoostingEstimator(**gbm_params)

# Fit on the uploaded training frame and report the wall-clock time;
# this takes some time.
t_start = time.time()
fit_1.train(x=predictors, y='target', training_frame=train_hex)
t_end = time.time()
print('Elapsed time [s]: ', np.round(t_end - t_start, 2))



In [None]:
# show the cross validation metrics summary of the fitted model
# (the model was configured with nfolds=5)
fit_1.cross_validation_metrics_summary()

In [None]:
# show scoring history - training vs cross validations
#
# Iterate over the cross-validation models actually stored on the estimator
# rather than a hard-coded range(5), so the loop stays correct if nfolds
# changes. The list is fetched once, and scoring_history() is used in place
# of its deprecated alias score_history().
for fold_no, cv_model_temp in enumerate(fit_1.cross_validation_models(), start=1):
    df_cv_score_history = cv_model_temp.scoring_history()
    my_title = 'CV ' + str(fold_no) + ' - Scoring History [RMSE]'
    # training vs. validation RMSE against the number of trees built so far
    plt.scatter(df_cv_score_history.number_of_trees,
                y=df_cv_score_history.training_rmse,
                c='blue', label='training')
    plt.scatter(df_cv_score_history.number_of_trees,
                y=df_cv_score_history.validation_rmse,
                c='darkorange', label='validation')
    plt.title(my_title)
    plt.xlabel('Number of Trees')
    plt.legend()
    plt.grid()
    plt.show()



In [None]:
# SHAP summary plot on the training frame: shows the direction as well as the
# severity of each feature's impact. The elapsed wall-clock time is printed.
shap_start = time.time()
fit_1.shap_summary_plot(train_hex);
shap_end = time.time()
print('Elapsed time [s]: ', np.round(shap_end - shap_start, 2))

In [None]:
# Score the training frame and pull the 'predict' column back as an array.
pred_train = fit_1.predict(train_hex)
train_pred_frame = pred_train.as_data_frame()
y_train_pred = train_pred_frame['predict'].values

# Store the predictions next to the actual values in the original data frame.
df_train['prediction'] = y_train_pred

In [None]:


# plot predictions vs actual (training data); low alpha keeps dense regions readable
p=sns.jointplot(data=df_train, x='target', y='prediction',
              joint_kws={'alpha' : 0.1})
p.fig.suptitle('Prediction vs Actual - Training Data')
# Label the joint axes through the grid object itself. plt.xlabel/plt.ylabel
# act on whichever axes pyplot currently considers active, and a joint plot
# holds three axes (joint + two marginals), so the labels could end up on a
# marginal axes instead of the main one.
p.set_axis_labels('Actual', 'Prediction')
plt.show()



In [None]:
# Score the test frame and pull the 'predict' column back as an array.
pred_test = fit_1.predict(test_hex)
test_pred_frame = pred_test.as_data_frame()
y_test_pred = test_pred_frame['predict'].values

# Distribution of the test-set predictions as a 100-bin histogram.
fig, ax = plt.subplots()
ax.hist(y_test_pred, bins=100)
ax.set_title('Predictions on Test Set')
ax.grid()
plt.show()


In [None]:
# Distribution of the training-set predictions as a 100-bin histogram.
fig, ax = plt.subplots()
ax.hist(y_train_pred, bins=100)
ax.set_title('Predictions on Training Data')
ax.grid()
plt.show()


In [None]:
# prepare submission: overwrite the 'target' column of the submission
# template with the test-set predictions.
# Item assignment is used instead of attribute assignment (df_sub.target = ...):
# attribute assignment only updates a column that already exists and would
# otherwise silently create a plain Python attribute, leaving the written
# file without the predictions.
df_sub['target'] = y_test_pred
df_sub.head(10)

In [None]:
# summary statistics of the 'target' column of the submission frame
df_sub.target.describe()


In [None]:
# save to file for submission: written as 'submission.csv' in the current
# working directory, without the index column
df_sub.to_csv('submission.csv', index=False)
