# Tabular Playground
## Data loading and preprocessing

Following the same steps as the other notebook I will standardize the data.

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

# Read the training data with date_time parsed as timestamps and used as the index.
train = pd.read_csv(
    '../input/tabular-playground-series-jul-2021/train.csv',
    parse_dates=["date_time"],
).set_index('date_time')

# Separate the three pollutant targets from the feature columns.
target_cols = ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']
target = train[target_cols]
train = train.drop(target_cols, axis=1)

# Load the test set the same way.
test = pd.read_csv(
    '../input/tabular-playground-series-jul-2021/test.csv',
    parse_dates=["date_time"],
).set_index('date_time')

The Box-Cox transformation used below requires strictly positive inputs, so I convert the `deg_C` column from Celsius to Kelvin to keep it positive.

In [None]:
# Add the Celsius-to-Kelvin offset to deg_C in both frames.
CELSIUS_TO_KELVIN = 273.15
for frame in (train, test):
    frame['deg_C'] = frame['deg_C'] + CELSIUS_TO_KELVIN

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. The split is seeded (with the same
# seed value used elsewhere in this notebook) so that the validation set, and
# therefore every validation score, is identical on each Restart & Run All.
X_train, X_val, y_train, y_val = train_test_split(
    train, target, test_size=0.2, random_state=0
)

# Keep the column labels and row indices around so the scaled NumPy arrays
# can be wrapped back into DataFrames.
column_names, tr_index, val_index = X_train.columns, X_train.index, X_val.index

## Preprocessing

I try two types of scaling: the Box-Cox transformation and MinMax scaling to the interval $[0,1]$. Box-Cox seems to work better with the models.

In [None]:
from sklearn.preprocessing import PowerTransformer

# Fit the Box-Cox transform on the training split only; the validation split
# reuses the fitted parameters.
pt = PowerTransformer(method='box-cox')
X_train_sc = pd.DataFrame(data=pt.fit_transform(X_train), columns=column_names,
                          index=tr_index)
X_val_sc = pd.DataFrame(data=pt.transform(X_val), columns=column_names,
                        index=val_index)

# Every model in this notebook is trained on X_train_sc, so the test set has
# to go through the *same* fitted transformer. Refitting `pt` on the full
# training set before transforming `test` would scale the test features with
# different parameters than the ones the models were trained on.
test_sc = pd.DataFrame(data=pt.transform(test), columns=column_names,
                       index=test.index)

# Scaled copy of the full training set, produced by its own transformer so
# that `pt` keeps the parameters learned from X_train.
pt_full = PowerTransformer(method='box-cox')
train_sc = pd.DataFrame(data=pt_full.fit_transform(train), columns=column_names,
                        index=train.index)

# Histogram of every transformed feature; the titles are enlarged to stay
# readable on the 100x100 canvas.
fig = X_train_sc.hist(figsize=(100, 100), bins=30)
for axis in fig.ravel():
    axis.title.set_size(80)
plt.show()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scaling: ranges are learned from the training split and reused for
# the validation split.
minmax = MinMaxScaler()
X_train_m = pd.DataFrame(minmax.fit_transform(X_train),
                         index=X_train.index, columns=X_train.columns)
X_val_m = pd.DataFrame(minmax.transform(X_val),
                       index=X_val.index, columns=X_val.columns)

# The same scaler object is then refit on the full training set and applied
# to the test set.
train_m = pd.DataFrame(minmax.fit_transform(train),
                       index=train.index, columns=train.columns)
test_m = pd.DataFrame(minmax.transform(test),
                      index=test.index, columns=test.columns)

# Histogram per scaled feature, with titles and tick labels enlarged for the
# 100x100 canvas.
fig = X_train_m.hist(bins=30, figsize=(100, 100))
for axis in fig.ravel():
    axis.title.set_size(80)
    axis.tick_params(axis='both', which='major', labelsize=80)
plt.show()

The targets appear to be skewed, so I also try a logarithmic transformation. However, it doesn't change the result very much.

In [None]:
# log(1 + y) versions of the training and validation targets.
y_train_log, y_val_log = (np.log1p(part) for part in (y_train, y_val))

In [None]:
# Histogram of each log-transformed target, with enlarged subplot titles.
fig = y_train_log.hist(bins=30, figsize=(100, 100))
for axis in fig.ravel():
    axis.title.set_size(80)
plt.show()

## Neural Network

In [None]:
import tensorflow as tf
from tensorflow import keras
from keras import layers
import keras.backend as K
from keras.callbacks import EarlyStopping

# Hyper-parameters of the two fully connected blocks near the end of the network.
DROPOUT = 0.3  # dropout rate applied after each of those blocks
NSIZE = 512    # number of units in each of those Dense layers

# NOTE(review): the device string is hard-coded; presumably a GPU is expected
# to be available in the environment this notebook runs in.
with tf.device('/gpu:0'):
    # Seed TensorFlow before the layers are created.
    tf.random.set_seed(0)
    model = keras.Sequential([
        # Normalise the input features; one input per column of X_train.
        layers.BatchNormalization(input_shape=[X_train.shape[1]]),
        
        # Expand each sample to 4096 values and fold them into a 256 x 16
        # tensor (4096 / 256 = 16) so that 1-D convolutions can be applied.
        layers.Dense(4096, activation='relu'),
        layers.Reshape((256,-1)),
        
        # Grouped convolution that treats the first axis as channels
        # (data_format='channels_first').
        layers.Conv1D(filters=512, kernel_size=5, strides=1, padding='same',
                    data_format='channels_first', groups=16,
                    activation='relu'),
        # No data_format is given here, so the Keras default applies.
        layers.AveragePooling1D(pool_size=2,
                    strides=1, padding='same'),
        
        # This convolution reads the same tensor as channels_last instead.
        layers.Conv1D(filters=16, kernel_size=5, strides=1, padding='same',
                    data_format='channels_last',
                    activation='relu'),
        
        # Pooling switches back to the channels_first interpretation.
        layers.MaxPooling1D(pool_size=2,
                    strides=2, padding='same', data_format='channels_first'),
        
        layers.Flatten(),
        
        # Two identical blocks: Dense -> BatchNorm -> ReLU -> Dropout.
        layers.Dense(NSIZE),
        layers.BatchNormalization(),
        layers.Activation('relu'),
        layers.Dropout(DROPOUT),
        
        layers.Dense(NSIZE),
        layers.BatchNormalization(),
        layers.Activation('relu'),
        layers.Dropout(DROPOUT),
        
        # Three outputs, one per target column; ReLU keeps them non-negative.
        layers.Dense(3, activation='relu')
    ])

    def RMSLE(y_true, y_pred):
        """Return the square root of Keras's mean squared logarithmic error.

        Used as a metric so the training history reports a root-mean-squared
        log error alongside the (unrooted) MSLE loss.
        """
        msle = keras.losses.MeanSquaredLogarithmicError()
        return K.sqrt(msle(y_true, y_pred))

    # Optimise MSLE directly and track its square root as a metric.
    model.compile(
        optimizer='adam',
        loss='MeanSquaredLogarithmicError',
        metrics=[RMSLE]
    )

    # Stop after 15 epochs without an improvement of at least 0.0001 and roll
    # back to the best weights. No `monitor` is given, so the Keras default
    # quantity is watched. This callback object is reused by later cells.
    earlyStopping = EarlyStopping(
        min_delta=0.0001, 
        patience=15, 
        verbose=1,
        restore_best_weights=True
    )

    # Train on the Box-Cox-scaled training split, validating on the scaled
    # hold-out split after every epoch.
    history = model.fit(X_train_sc, y_train, validation_data=(X_val_sc, y_val),
             epochs=400,
             batch_size=256,
             callbacks=[earlyStopping]
    )

In [None]:
# Training vs. validation curves: loss on the top axis, RMSLE on the bottom one.
history_df = pd.DataFrame(history.history)
fig, ax = plt.subplots(2, figsize=(50,20))
for panel, curves in zip(ax, (['loss', 'val_loss'], ['RMSLE', 'val_RMSLE'])):
    history_df.loc[:, curves].plot(ax=panel)
    # Enlarge the text so it stays readable on the 50x20 figure.
    panel.title.set_fontsize(30)
    panel.legend(fontsize=30)
    panel.tick_params(axis='both', which='major', labelsize=30)
plt.show()

In [None]:
from sklearn.metrics import mean_squared_log_error

# Hold-out score of the single network: root of the mean squared log error.
preds = model.predict(X_val_sc)
val_msle = mean_squared_log_error(y_val, preds)
err = np.sqrt(val_msle)
print('Validation error:', '{0:.4f}'.format(err))

## Ensemble version

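We randomly partition the training split into three disjoint, equally sized subsets and train a fresh copy of the same architecture on each one. The ensemble prediction is the average of the three models' outputs.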

In [None]:
from random import sample, seed

# Randomly assign every training row to one of PARTITIONS disjoint subsets.
seed(0)
train_len = X_train_sc.shape[0]
PARTITIONS = 3

# Shuffle the row positions once, then cut the permutation into PARTITIONS
# chunks. np.array_split tolerates a row count that is not an exact multiple
# of PARTITIONS (reshape((PARTITIONS, -1)) would raise in that case), and
# looking up the index labels one chunk at a time avoids indexing a pandas
# Index with a 2-D array, which recent pandas versions reject.
shuffled_positions = np.array(sample(range(train_len), train_len))
sampling = [X_train_sc.index[chunk]
            for chunk in np.array_split(shuffled_positions, PARTITIONS)]

# Select the rows of each partition by index label (original row order is kept).
X_train_sc_part = {}
y_train_part = {}
for i in range(PARTITIONS):
    X_train_sc_part[i] = X_train_sc[X_train_sc.index.isin(sampling[i])]
    y_train_part[i] = y_train[y_train.index.isin(sampling[i])]

In [None]:
# One clone of the base architecture per partition, compiled with the same
# optimiser and loss as the single model plus an extra MSE metric.
models = {}
for part in range(PARTITIONS):
    member = keras.models.clone_model(model)
    member.compile(
        optimizer='adam',
        loss='MeanSquaredLogarithmicError',
        metrics=[RMSLE, 'MeanSquaredError'],
    )
    models[part] = member

In [None]:
# Training history of each ensemble member, keyed by partition number.
history = {}


class myCallback(keras.callbacks.Callback):
    """Print which partition is being trained and its losses when training ends.

    Args:
        partition: identifier of the partition, echoed when training starts.
    """

    def __init__(self, partition):
        super().__init__()
        self.curr_part = partition

    def on_train_begin(self, logs=None):
        print('Currently on partition:', self.curr_part)

    def on_train_end(self, logs=None):
        # `logs` defaults to None; fall back to an empty dict so a call
        # without logs prints None values instead of raising AttributeError.
        logs = logs or {}
        print('Loss:', logs.get('loss'), 'val_loss:', logs.get('val_loss'))


with tf.device('/gpu:0'):
    tf.random.set_seed(0)
    for i in range(PARTITIONS):
        callback = myCallback(i)
        # verbose=0 silences the per-epoch output; myCallback prints a short
        # summary per partition instead. Every member is validated on the
        # same hold-out split.
        history[i] = models[i].fit(X_train_sc_part[i], y_train_part[i],
                 validation_data=(X_val_sc, y_val),
                 epochs=400,
                 batch_size=128, verbose=0,
                 callbacks=[earlyStopping, callback]
        )

In [None]:
# One row of plots per ensemble member; columns show loss, RMSLE and MSE,
# each with its validation counterpart.
curve_groups = (
    ['loss', 'val_loss'],
    ['RMSLE', 'val_RMSLE'],
    ['mean_squared_error', 'val_mean_squared_error'],
)
fig, ax = plt.subplots(PARTITIONS,3, figsize=(50,20))
for j in range(PARTITIONS):
    history_df = pd.DataFrame(history[j].history)
    for i, curves in enumerate(curve_groups):
        panel = ax[j, i]
        history_df.loc[:, curves].plot(ax=panel)
        panel.title.set_fontsize(30)
        panel.legend(fontsize=30)
        panel.tick_params(axis='both', which='major', labelsize=30)
plt.show()

In [None]:
# Average the members' hold-out predictions and score the result.
preds = np.zeros(y_val.shape)
for i in range(PARTITIONS):
    preds = preds + models[i].predict(X_val_sc)
preds /= PARTITIONS

val_msle = mean_squared_log_error(y_val, preds)
err = np.sqrt(val_msle)
print('Validation error:', '{0:.4f}'.format(err))

## Prediction

It isn't my best.

In [None]:
# Single-model test predictions, labelled with the target column names and
# the test timestamps.
raw_preds = model.predict(test_sc)
preds = pd.DataFrame(raw_preds, index=test_sc.index, columns=target.columns)

In [None]:
## Ensemble
# Sum the members' test predictions, then divide by the member count.
ensemble_sum = np.zeros((test_sc.shape[0],3))
for i in range(PARTITIONS):
    ensemble_sum += models[i].predict(test_sc)
preds = pd.DataFrame(
    data=ensemble_sum / PARTITIONS,
    columns=target.columns,
    index=test_sc.index,
)

In [None]:
# Draw each training target and the matching test predictions on a shared
# axis, one axis per pollutant.
fig, ax = plt.subplots(3,1, figsize=(50,20))
plotted_columns = ('target_benzene', 'target_carbon_monoxide', 'target_nitrogen_oxides')
for panel, column in zip(ax, plotted_columns):
    target[column].plot(ax=panel)
    preds[column].plot(ax=panel)
plt.show()

In [None]:
# Move date_time back into a regular column and write the submission file.
submission = preds.reset_index()
submission.to_csv('submission.csv', index=False)

## ResNet version

In [None]:
# Basic ResNet Building Block
def resnet_layer(inputs, nsize, dropout, activation):
    """Apply Dense -> BatchNormalization -> Dropout -> (optional) Activation.

    Args:
        inputs: tensor fed into the Dense layer.
        nsize: number of units of the Dense layer.
        dropout: rate passed to the Dropout layer.
        activation: activation name applied last, or None to skip it.

    Returns:
        The output tensor of the last layer applied.
    """
    # All four layer objects are created up front, in the same order as
    # before, even though the Activation is only applied when requested.
    pipeline = [
        layers.Dense(nsize),
        layers.BatchNormalization(),
        layers.Dropout(dropout),
    ]
    nonlinearity = layers.Activation(activation)
    if activation is not None:
        pipeline.append(nonlinearity)

    out = inputs
    for layer in pipeline:
        out = layer(out)
    return out

In [None]:
# ResNet Model
def resnet(input_shape, depth, nsize, dropout, activation):
    """Build a fully connected residual network with three regression outputs.

    Args:
        input_shape: shape passed to the Keras Input layer.
        depth: must be of the form 9n + 2; n residual blocks are created for
            each of the three stacks.
        nsize: width of every Dense layer inside the residual blocks.
        dropout: dropout rate used by every resnet_layer.
        activation: activation used inside and after every residual block.

    Returns:
        An uncompiled keras Model mapping the inputs to a Dense(3, relu) output.

    Raises:
        ValueError: if depth is not of the form 9n + 2.
    """
    blocks_per_stack, remainder = divmod(depth - 2, 9)
    if remainder != 0:
        raise ValueError('depth should be 9n + 2 (eg 56 or 110 in [b])')
    num_res_blocks = int(blocks_per_stack)

    inputs = layers.Input(shape=input_shape)
    x = resnet_layer(inputs=inputs, nsize=nsize, dropout=dropout, activation=activation)

    # Three stacks of num_res_blocks identical residual blocks. The blocks do
    # not depend on the stack they belong to, so a single loop covers them all.
    for _ in range(3):
        for _ in range(num_res_blocks):
            branch = x
            # Three layers per block; the last one has no activation because
            # the non-linearity is applied after the skip connection.
            for layer_activation in (activation, activation, None):
                branch = resnet_layer(inputs=branch, nsize=nsize, dropout=dropout,
                                      activation=layer_activation)
            merged = keras.layers.add([x, branch])
            x = layers.Activation(activation)(merged)

    # Regression head: one non-negative output per target.
    outputs = layers.Dense(3, activation='relu')(x)

    return keras.models.Model(inputs=inputs, outputs=outputs)

In [None]:
# Re-seed TensorFlow, then build and compile the dense ResNet with the same
# optimiser, loss and metric as the first model.
tf.random.set_seed(0)
n_features = X_train_sc.shape[1]
resNetModel = resnet([n_features], 110, 512, 0.2, 'relu')

resNetModel.compile(optimizer='adam',
                    loss='MeanSquaredLogarithmicError',
                    metrics=[RMSLE])

In [None]:
# Train the dense ResNet with the same data, batch size and early stopping
# callback as the first model.
with tf.device('/gpu:0'):
    history = resNetModel.fit(
        X_train_sc, y_train,
        validation_data=(X_val_sc, y_val),
        batch_size=256,
        epochs=400,
        callbacks=[earlyStopping],
    )

In [None]:
# Learning curves of the dense ResNet: loss on top, RMSLE below.
fig, ax = plt.subplots(2, figsize=(50,20))
history_df = pd.DataFrame(history.history)
curve_pairs = (['loss', 'val_loss'], ['RMSLE', 'val_RMSLE'])
for panel, curves in zip(ax, curve_pairs):
    history_df.loc[:, curves].plot(ax=panel)
    panel.title.set_fontsize(30)
    panel.legend(fontsize=30)
    panel.tick_params(axis='both', which='major', labelsize=30)
plt.show()

In [None]:
# Hold-out RMSLE of the dense ResNet.
preds = resNetModel.predict(X_val_sc)
holdout_msle = mean_squared_log_error(y_val, preds)
err = np.sqrt(holdout_msle)
print('Validation error:', '{0:.4f}'.format(err))

In [None]:
# Dense-ResNet test predictions as a DataFrame indexed by the test timestamps.
raw_preds = resNetModel.predict(test_sc)
preds = pd.DataFrame(raw_preds, index=test_sc.index, columns=target.columns)

In [None]:
# Training targets and dense-ResNet test predictions, one axis per pollutant.
fig, ax = plt.subplots(3,1, figsize=(50,20))
for row, column in enumerate(
        ['target_benzene', 'target_carbon_monoxide', 'target_nitrogen_oxides']):
    target[column].plot(ax=ax[row])
    preds[column].plot(ax=ax[row])
plt.show()

In [None]:
# Write the dense-ResNet predictions, with date_time as a regular column.
submission = preds.reset_index()
submission.to_csv('submission.csv', index=False)

## ResNet Convolutional

In [None]:
# Basic ResNet Building Block
def resnet_layer_conv(inputs, num_filters = 16, kernel_size = 3, strides = 1, 
                      activation ='relu', batch_normalization = True):
    """Apply (BatchNorm -> Dropout) -> (Activation) -> Conv1D to `inputs`.

    Args:
        inputs: tensor to transform.
        num_filters: number of Conv1D filters.
        kernel_size: Conv1D kernel size.
        strides: Conv1D stride.
        activation: activation applied before the convolution, or None to skip it.
        batch_normalization: when true, BatchNormalization followed by
            Dropout(0.2) is applied first.

    Returns:
        The output tensor of the convolution.
    """
    # The convolution is created first and applied last.
    convolution = layers.Conv1D(
        filters=num_filters,
        kernel_size=kernel_size,
        strides=strides,
        padding='same',
        kernel_initializer='he_normal',
        kernel_regularizer=keras.regularizers.l2(1e-4),
    )

    out = inputs
    if batch_normalization:
        # Dropout is tied to the batch_normalization switch.
        out = layers.BatchNormalization()(out)
        out = layers.Dropout(0.2)(out)
    if activation is not None:
        out = layers.Activation(activation)(out)
    return convolution(out)

In [None]:
# ResNet Model
def resnetConv(input_shape, depth, activation):
    """Build a 1-D convolutional ResNet of bottleneck units with three outputs.

    The input vector is batch-normalised, expanded by a Dense(4096) layer and
    reshaped to 256 x 16 before the convolutional residual units are applied.

    Args:
        input_shape: shape passed to the Keras Input layer.
        depth: must be of the form 9n + 2; n bottleneck units are created for
            each of the three stages.
        activation: activation applied before the first convolution of each
            bottleneck unit (except the very first unit, which has none). The
            remaining convolutions use resnet_layer_conv's default activation.

    Returns:
        An uncompiled keras Model ending in a Dense(3, relu) regression head.

    Raises:
        ValueError: if depth is not of the form 9n + 2.
    """
    if (depth - 2) % 9 != 0:
        raise ValueError('depth should be 9n + 2 (eg 56 or 110 in [b])')
    # Start model definition.
    num_filters_in = 16
    num_res_blocks = int((depth - 2) // 9)

    inputs = layers.Input(shape = input_shape)
    x = layers.BatchNormalization(input_shape=input_shape)(inputs)
    x = layers.Dense(4096, activation='relu')(x)
    x = layers.Reshape((256,-1))(x)

    x = resnet_layer_conv(inputs = x)
    # Instantiate the stack of residual units
    for stage in range(3):
        for res_block in range(num_res_blocks):
            # Reset the per-unit settings on every iteration. Previously they
            # were only ever assigned in the "first unit" branch, so after
            # that unit they stayed None / False and the first convolution of
            # every later unit silently lost its BatchNorm, Dropout and
            # activation (and the caller's `activation` was never used).
            first_activation = activation
            batch_normalization = True
            strides = 1
            if stage == 0:
                num_filters_out = num_filters_in * 4
                if res_block == 0:  # first layer and first stage
                    # The tensor entering the very first unit comes straight
                    # out of a convolution, so it is not pre-activated again.
                    first_activation = None
                    batch_normalization = False
            else:
                num_filters_out = num_filters_in * 2
                if res_block == 0:  # first layer but not first stage
                    strides = 2    # downsample

            # NOTE(review): num_filters_in is never updated between stages, so
            # stages 1 and 2 output 32 filters while stage 0 outputs 64.
            # Confirm whether the filter count was meant to grow per stage.

            # bottleneck residual unit
            y = resnet_layer_conv(inputs = x,
                             num_filters = num_filters_in,
                             kernel_size = 1,
                             strides = strides,
                             activation = first_activation,
                             batch_normalization = batch_normalization)
            y = resnet_layer_conv(inputs = y,
                             num_filters = num_filters_in)
            y = resnet_layer_conv(inputs = y,
                             num_filters = num_filters_out,
                             kernel_size = 1)
            if res_block == 0:
                # linear projection residual shortcut connection to match
                # changed dims
                x = resnet_layer_conv(inputs = x,
                                 num_filters = num_filters_out,
                                 kernel_size = 1,
                                 strides = strides,
                                 activation = None,
                                 batch_normalization = False)
            x = keras.layers.add([x, y])

    # Add regressor on top.
    x = layers.MaxPooling1D(pool_size = 8)(x)
    x = layers.Flatten()(x)
    outputs = layers.Dense(3, activation ='relu')(x)

    # Instantiate model.
    model = keras.models.Model(inputs = inputs, outputs = outputs)
    return model

In [None]:
# Re-seed TensorFlow, then build and compile the convolutional ResNet with
# the same optimiser, loss and metric as the other models.
tf.random.set_seed(0)
n_features = X_train_sc.shape[1]
resNetModelConv = resnetConv([n_features], 110, 'relu')

resNetModelConv.compile(optimizer='adam',
                        loss='MeanSquaredLogarithmicError',
                        metrics=[RMSLE])

In [None]:
# Train the convolutional ResNet with the same data, batch size and early
# stopping callback as the other models.
with tf.device('/gpu:0'):
    history = resNetModelConv.fit(
        X_train_sc, y_train,
        validation_data=(X_val_sc, y_val),
        batch_size=256,
        epochs=400,
        callbacks=[earlyStopping],
    )

In [None]:
# Learning curves of the convolutional ResNet: loss on top, RMSLE below.
fig, ax = plt.subplots(2, figsize=(50,20))
history_df = pd.DataFrame(history.history)
for row, curves in enumerate((['loss', 'val_loss'], ['RMSLE', 'val_RMSLE'])):
    panel = ax[row]
    history_df.loc[:, curves].plot(ax=panel)
    panel.title.set_fontsize(30)
    panel.legend(fontsize=30)
    panel.tick_params(axis='both', which='major', labelsize=30)
plt.show()

In [None]:
# Hold-out RMSLE of the convolutional ResNet.
preds = resNetModelConv.predict(X_val_sc)
holdout_msle = mean_squared_log_error(y_val, preds)
err = np.sqrt(holdout_msle)
print('Validation error:', '{0:.4f}'.format(err))

In [None]:
# Convolutional-ResNet test predictions as a DataFrame indexed by the test
# timestamps.
raw_preds = resNetModelConv.predict(test_sc)
preds = pd.DataFrame(raw_preds, index=test_sc.index, columns=target.columns)

In [None]:
# Training targets and convolutional-ResNet test predictions, one axis per
# pollutant.
fig, ax = plt.subplots(3,1, figsize=(50,20))
columns_by_row = {
    0: 'target_benzene',
    1: 'target_carbon_monoxide',
    2: 'target_nitrogen_oxides',
}
for row, column in columns_by_row.items():
    target[column].plot(ax=ax[row])
    preds[column].plot(ax=ax[row])
plt.show()

In [None]:
# Write the convolutional-ResNet predictions, with date_time as a regular column.
submission = preds.reset_index()
submission.to_csv('submission.csv', index=False)