# Table of Contents
* [Data Preparation](#1)
* [Targets](#2)
* [Numerical Features](#3)
* [Time Features](#4)
* [Targets vs Features](#5)
* [Autocorrelations of Targets](#6)
* [Baseline Models](#7)
* [Predict on Test Set and Visualize Results](#8)

In [None]:
# packages

# standard
import numpy as np
import pandas as pd
import time

# plots
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

# ML tools
import h2o
from h2o.estimators import H2OGradientBoostingEstimator
from h2o.estimators import H2ORandomForestEstimator

In [None]:
# read the competition files: training data, test data and the submission template
df_train, df_test, df_sub = map(pd.read_csv, (
    '../input/tabular-playground-series-jul-2021/train.csv',
    '../input/tabular-playground-series-jul-2021/test.csv',
    '../input/tabular-playground-series-jul-2021/sample_submission.csv',
))

<a id='1'></a>
# Data Preparation

In [None]:
# preview the first rows of the training data
df_train.head()

In [None]:
# column overview of the training data (dtypes, non-null counts)
df_train.info()

In [None]:
# column overview of the test data (dtypes, non-null counts)
df_test.info()

In [None]:
# parse the timestamp column and derive calendar features,
# applying exactly the same steps to the training and the test frame
for frame in (df_train, df_test):
    frame['date_time'] = pd.to_datetime(frame.date_time)
    stamps = frame.date_time.dt
    for part in ('year', 'month', 'day', 'hour', 'weekday'):
        frame[part] = getattr(stamps, part)

In [None]:
# summary statistics of all training columns
try:
    # pandas versions that accept the keyword: summarise the datetime column like a numeric one
    train_summary = df_train.describe(include='all', datetime_is_numeric=True)
except TypeError:
    # the keyword is not accepted by every pandas version
    # (it was removed in pandas 2.0, where this treatment is the default)
    train_summary = df_train.describe(include='all')
train_summary

In [None]:
# summary statistics of all test columns
try:
    # pandas versions that accept the keyword: summarise the datetime column like a numeric one
    test_summary = df_test.describe(include='all', datetime_is_numeric=True)
except TypeError:
    # the keyword is not accepted by every pandas version
    # (it was removed in pandas 2.0, where this treatment is the default)
    test_summary = df_test.describe(include='all')
test_summary

<a id='2'></a>
# Targets

### We have three target variables here:

In [None]:
# names of the three target columns
targets = [
    'target_carbon_monoxide',
    'target_benzene',
    'target_nitrogen_oxides',
]

In [None]:
# summary statistics of the target columns
df_train[targets].describe()

### Time Series Plots

In [None]:
# scatter every target against time, one figure per target
my_alpha = 0.25  # marker transparency
for target_col in targets:
    fig, ax = plt.subplots(figsize=(16, 4))
    ax.scatter(df_train.date_time, df_train[target_col],
               s=4, color='darkred', alpha=my_alpha)
    ax.xaxis.set_major_locator(plt.MaxNLocator(20))  # limit the number of x-axis labels
    plt.xticks(rotation=90)
    ax.set_title(target_col)
    ax.grid()
    plt.show()

#### There seems to be a structural change for Nitrogen Oxides target around September 2010!

In [None]:
# incremental view: scatter the row-to-row change of every target against time
my_alpha = 0.25  # marker transparency
for target_col in targets:
    increments = df_train[target_col].diff()
    fig, ax = plt.subplots(figsize=(16, 4))
    ax.scatter(df_train.date_time, increments,
               s=4, color='darkred', alpha=my_alpha)
    ax.xaxis.set_major_locator(plt.MaxNLocator(20))  # limit the number of x-axis labels
    plt.xticks(rotation=90)
    ax.set_title(target_col + ' - incremental')
    ax.grid()
    plt.show()

### Target Correlations

In [None]:
# correlation between the targets: Pearson and Spearman
target_frame = df_train[targets]
corr_target_pearson = target_frame.corr(method='pearson')
corr_target_spearman = target_frame.corr(method='spearman')

# draw both matrices as annotated heatmaps on a fixed [-1, +1] colour scale
for heading, matrix in (('Pearson Correlation', corr_target_pearson),
                        ('Spearman Correlation', corr_target_spearman)):
    fig = plt.figure(figsize=(4, 3))
    sns.heatmap(matrix, vmin=-1, vmax=+1, cmap='RdYlGn', annot=True)
    plt.title(heading)
    plt.show()

#### => Targets show strong correlation!

In [None]:
# distributions / scatter plots
# (seaborn pair plot of the target columns against each other)
sns.pairplot(df_train[targets])
plt.show()

In [None]:
# horizontal boxplot per target (=> look for outliers)
for target_col in targets:
    plt.figure(figsize=(8, 2))
    plt.boxplot(df_train[target_col], vert=False)
    plt.title(target_col + ' - boxplot')
    plt.grid()
    plt.show()

In [None]:
# same boxplots after a log10 transformation of the target values
for target_col in targets:
    log_values = np.log10(df_train[target_col])
    plt.figure(figsize=(8, 2))
    plt.boxplot(log_values, vert=False)
    plt.title(target_col + ' - boxplot of log10(...)')
    plt.grid()
    plt.show()

In [None]:
# # check accumulation of values on 0.1 for Benzene.
# # these are the strange straight lines in the scatter plot above
# check = df_train[df_train.target_benzene <= 0.1]
# check.shape

In [None]:
# # let's try to remove those => score gets much worse... => version 16
# df_train = df_train[df_train.target_benzene>0.1]
# # redo scatter plots
# sns.pairplot(df_train[targets])
# plt.show()

<a id='3'></a>
# Numerical Features

In [None]:
# numerical feature columns (the time features are listed separately)
features_num = [
    'deg_C',
    'relative_humidity',
    'absolute_humidity',
    'sensor_1',
    'sensor_2',
    'sensor_3',
    'sensor_4',
    'sensor_5',
]

In [None]:
# histogram of every numerical feature: train on the left, test on the right
for feature in features_num:
    plt.figure(figsize=(14, 4))

    train_ax = plt.subplot(1, 2, 1)
    train_ax.hist(df_train[feature], bins=50, color='blue')
    train_ax.set_title(feature + ' [Train]')
    train_ax.grid()

    # the test panel shares the x-axis of the train panel
    test_ax = plt.subplot(1, 2, 2, sharex=train_ax)
    test_ax.hist(df_test[feature], bins=50, color='green')
    test_ax.set_title(feature + ' [Test]')
    test_ax.grid()

    plt.show()

In [None]:
# Pearson correlations among the numerical features, separately for train and test
corr_pearson_train = df_train[features_num].corr(method='pearson')
corr_pearson_test = df_test[features_num].corr(method='pearson')

# one annotated heatmap per data set on a fixed [-1, +1] colour scale
for matrix, heading in ((corr_pearson_train, 'Pearson Correlation - Train'),
                        (corr_pearson_test, 'Pearson Correlation - Test')):
    fig = plt.figure(figsize=(7, 5))
    sns.heatmap(matrix, vmin=-1, vmax=+1, cmap='RdYlGn', annot=True)
    plt.title(heading)
    plt.show()

In [None]:
# scatter plots of features (train)
# (seaborn pair plot of the numerical feature columns of the training data)
sns.pairplot(df_train[features_num])
plt.show()

In [None]:
# scatter plots of features (test)
# (seaborn pair plot of the numerical feature columns of the test data)
sns.pairplot(df_test[features_num])
plt.show()

### Time Series Plots

In [None]:
# time series of each numerical feature: train (blue) and test (green) on one time axis
my_alpha = 0.25  # set here so the cell does not rely on a value left over from an earlier plotting loop
for f in features_num:
    # one full-size axes per figure (previously only the upper half of the figure was used)
    fig, ax1 = plt.subplots(figsize=(14, 4))
    ax1.scatter(df_train.date_time, df_train[f], alpha=my_alpha, color='blue', s=4)
    ax1.scatter(df_test.date_time, df_test[f], alpha=my_alpha, color='green', s=4)
    ax1.set_title(f)
    ax1.grid()
    plt.show()  # render each figure explicitly, as the other plotting cells do

<a id='4'></a>
# Time Features

In [None]:
# calendar features derived from the timestamp column
features_time = [
    'year',
    'month',
    'day',
    'hour',
    'weekday',
]

In [None]:
# bar chart of value counts for every time feature: train on the left, test on the right
for feature in features_time:
    train_counts = df_train[feature].value_counts().sort_index()
    test_counts = df_test[feature].value_counts().sort_index()

    plt.figure(figsize=(14, 4))

    train_ax = plt.subplot(1, 2, 1)
    train_ax.bar(x=train_counts.index, height=train_counts.values, color='blue')
    train_ax.set_title(feature + ' [Train]')
    train_ax.grid()

    # the test panel shares the x-axis of the train panel
    test_ax = plt.subplot(1, 2, 2, sharex=train_ax)
    test_ax.bar(x=test_counts.index, height=test_counts.values, color='green')
    test_ax.set_title(feature + ' [Test]')
    test_ax.grid()

    plt.show()

<a id='5'></a>
# Targets vs Features

In [None]:
# plot targets vs features: one row per feature, one column per target,
# with the Pearson correlation of the pair shown in each panel title
all_features = features_num + features_time
# the grid shape follows the feature/target lists instead of a hard-coded 13 x 3;
# squeeze=False keeps axs two-dimensional even for a single row or column
fig, axs = plt.subplots(len(all_features), len(targets), figsize=(16,64), squeeze=False)
for row, f in enumerate(all_features):
    for col, t in enumerate(targets):
        current_ax = axs[row, col]
        corr_t = np.round(df_train[f].corr(df_train[t], method='pearson'),4)
        current_ax.scatter(df_train[f], df_train[t],
                           alpha=0.25, s=4,
                           color='darkred')
        t_short = t.replace('target_','')
        current_ax.set_title(t_short + ' vs ' + f + ', corr=' + str(corr_t))
        current_ax.grid()
plt.show()  # render explicitly, as the other plotting cells do

<a id='6'></a>
# Autocorrelations of Targets

In [None]:
# autocorrelation (lags -20..20) of the row-to-row increments of every target
for t in targets:
    # diff() yields NaN in the first position; dropna() removes the undefined
    # increments and keeps every valid one (slicing from 2 discarded one valid value)
    increments = df_train[t].diff().dropna()
    plt.figure(figsize=(10,4))
    plt.acorr(increments, maxlags=20)
    plt.title('Autocorrelations of increments of ' + t)
    plt.grid()
    plt.show()

<a id='7'></a>
# Baseline Models

### Trying to build first baseline models ignoring the time series structure...

In [None]:
# exclude one day in 2011 (is also in test set)
df_train = df_train[df_train.year==2010]
# drop=True discards the old index instead of storing it as an extra column,
# which also keeps this cell safe to run more than once
df_train = df_train.reset_index(drop=True)

In [None]:
# exclude August (values being unusually low)
df_train = df_train[df_train.month!=8]
# drop=True discards the old index instead of storing it as an extra column,
# which also keeps this cell safe to run more than once
df_train = df_train.reset_index(drop=True)

In [None]:
# number of training rows left after the adjustments above
n_train = len(df_train)
print('Training size (after adjustments):', n_train)

In [None]:
# manual cross-validation folds: consecutive blocks of rows share one fold id
rows_per_fold = 579
df_train['fold'] = np.floor(df_train.index / rows_per_fold)
# rows per fold, ordered by fold id
df_train['fold'].value_counts().sort_index()

In [None]:
# select predictors: all numerical and time features except 'year' and 'month'
# ('day' stays in; removing it as well was left as a disabled option)
excluded = ('year', 'month')
predictors = [name for name in features_num + features_time if name not in excluded]

print('Number of predictors: ', len(predictors))
print(predictors)

In [None]:
# start H2O
# (must run before any H2OFrame is created or any estimator is trained below)
h2o.init(max_mem_size='12G', nthreads=4) # Use maximum of 12 GB RAM and 4 cores

In [None]:
# upload data in H2O environment
# (wraps the pandas frames as H2OFrame objects; these are what the estimators train and predict on)
train_hex = h2o.H2OFrame(df_train)
test_hex = h2o.H2OFrame(df_test)

In [None]:
# Gradient Boosting model - target 1 (carbon monoxide)
target = 'target_carbon_monoxide'

gbm_params_1 = dict(
    ntrees=250,
    max_depth=9,
    min_rows=5,
    learn_rate=0.02,            # default: 0.1
    sample_rate=1,
    col_sample_rate=0.7,
    fold_column='fold',         # cross validation uses the manual folds
    score_each_iteration=True,
    stopping_metric='RMSE',
    stopping_rounds=5,
    stopping_tolerance=0.001,   # default 0.001
    seed=999,
)
fit_1 = H2OGradientBoostingEstimator(**gbm_params_1)

# train the model and report the elapsed wall-clock time
t1 = time.time()
fit_1.train(x=predictors, y=target, training_frame=train_hex)
t2 = time.time()
print('Elapsed time [s]: ', np.round(t2-t1,2))

In [None]:
# show cross validation metrics of the model for target 1 (fit_1)
fit_1.cross_validation_metrics_summary()

In [None]:
# variable importance for fit_1 via the SHAP summary plot, with timing
shap_start = time.time()
fit_1.shap_summary_plot(train_hex)
shap_seconds = time.time() - shap_start
print('Elapsed time [s]: ', np.round(shap_seconds, 2))

In [None]:
# Gradient Boosting model - target 2 (benzene)
target = 'target_benzene'
n_cv = 5  # not passed to the estimator below; the folds come from fold_column

gbm_params_2 = dict(
    ntrees=250,
    max_depth=9,
    min_rows=5,
    learn_rate=0.1,             # default: 0.1
    sample_rate=1,
    col_sample_rate=0.7,
    fold_column='fold',         # cross validation uses the manual folds
    score_each_iteration=True,
    stopping_metric='RMSE',
    stopping_rounds=5,
    stopping_tolerance=0.001,   # default 0.001
    seed=999,
)
fit_2 = H2OGradientBoostingEstimator(**gbm_params_2)

# train the model and report the elapsed wall-clock time
t1 = time.time()
fit_2.train(x=predictors, y=target, training_frame=train_hex)
t2 = time.time()
print('Elapsed time [s]: ', np.round(t2-t1,2))

In [None]:
# show cross validation metrics of the model for target 2 (fit_2)
fit_2.cross_validation_metrics_summary()

In [None]:
# variable importance for fit_2 via the SHAP summary plot, with timing
shap_start = time.time()
fit_2.shap_summary_plot(train_hex)
shap_seconds = time.time() - shap_start
print('Elapsed time [s]: ', np.round(shap_seconds, 2))

In [None]:
# Gradient Boosting model - target 3 (nitrogen oxides)
target = 'target_nitrogen_oxides'

gbm_params_3 = dict(
    ntrees=250,
    max_depth=9,
    min_rows=5,
    learn_rate=0.1,             # default: 0.1
    sample_rate=1,
    col_sample_rate=0.7,
    fold_column='fold',         # cross validation uses the manual folds
    score_each_iteration=True,
    stopping_metric='RMSE',
    stopping_rounds=5,
    stopping_tolerance=0.001,   # default 0.001
    seed=999,
)
fit_3 = H2OGradientBoostingEstimator(**gbm_params_3)

# train the model and report the elapsed wall-clock time
t1 = time.time()
fit_3.train(x=predictors, y=target, training_frame=train_hex)
t2 = time.time()
print('Elapsed time [s]: ', np.round(t2-t1,2))

In [None]:
# show cross validation metrics of the model for target 3 (fit_3)
fit_3.cross_validation_metrics_summary()

In [None]:
# variable importance for fit_3 via the SHAP summary plot, with timing
shap_start = time.time()
fit_3.shap_summary_plot(train_hex)
shap_seconds = time.time() - shap_start
print('Elapsed time [s]: ', np.round(shap_seconds, 2))

In [None]:
# average the cross-validated metrics over the three target models
cv_models = (fit_1, fit_2, fit_3)
rmse_1, rmse_2, rmse_3 = [model.rmse(xval=True) for model in cv_models]
rmsle_1, rmsle_2, rmsle_3 = [model.rmsle(xval=True) for model in cv_models]

print('Mean RMSE on CV:', (rmse_1 + rmse_2 + rmse_3)/3)
print('Mean RMSLE on CV:', (rmsle_1 + rmsle_2 + rmsle_3)/3)

<a id='8'></a>
# Predict on Test Set and Visualize Results

In [None]:
# predict on the test set with each model and bring the results back as pandas frames
pred_test_1, pred_test_2, pred_test_3 = [
    model.predict(test_hex).as_data_frame()
    for model in (fit_1, fit_2, fit_3)
]

In [None]:
# fill the submission template with the predictions, one target column per model
for column, prediction in (('target_carbon_monoxide', pred_test_1),
                           ('target_benzene', pred_test_2),
                           ('target_nitrogen_oxides', pred_test_3)):
    df_sub[column] = prediction
df_sub.head()

In [None]:
# the first row is already known as it is part of the training set!
# df_train was restricted to year 2010 further up, so it no longer contains that row;
# look it up in the unfiltered training file by the timestamp of the first test row
raw_train = pd.read_csv('../input/tabular-playground-series-jul-2021/train.csv')
first_test_time = pd.to_datetime(df_test.date_time.iloc[0])
raw_train[pd.to_datetime(raw_train.date_time) == first_test_time]

In [None]:
# overwrite the first submission row with the known target values
known_first_row = {
    'target_carbon_monoxide': 1.4,
    'target_benzene': 4.1,
    'target_nitrogen_oxides': 186.5,
}
for column, value in known_first_row.items():
    df_sub.loc[0, column] = value

In [None]:
# basic stats of submission
# (summary statistics of the submission frame after filling in the predictions)
df_sub.describe()

In [None]:
# pair plot of the predicted targets with a regression fit in every panel
reg_style = {'line_kws':{'color':'magenta'}, 'scatter_kws': {'alpha': 0.25}}
sns.pairplot(df_sub[targets], kind='reg', plot_kws=reg_style)
plt.show()

In [None]:
# visualize as time series: training targets (dark red) followed by the predictions (magenta)
my_alpha=0.25
# df_sub.date_time still holds the raw strings read from the CSV; parse a copy for plotting
# so both scatters use real datetimes, while the submission frame itself stays untouched
sub_time = pd.to_datetime(df_sub.date_time)
for t in targets:
    fig, ax = plt.subplots(figsize=(16,4))
    ax.scatter(df_train.date_time, df_train[t], alpha=my_alpha,
               color='darkred', s=4)
    ax.scatter(sub_time, df_sub[t], alpha=my_alpha,
               color='magenta', s=4)
    ax.xaxis.set_major_locator(plt.MaxNLocator(20)) # reduce number of x-axis labels
    plt.xticks(rotation=90)
    plt.title(t)
    plt.grid()
    plt.show()

In [None]:
# save submission file
# (index=False keeps the row index out of the CSV)
df_sub.to_csv('submission.csv', index=False)