In [None]:
import os
import math
import datetime
import numpy as np 
import pandas as pd
import scipy as sc
import matplotlib.pyplot as plt
import numexpr
from PIL import Image
import seaborn as sns

# Single seed shared by every random number source configured in this notebook.
RANDOM_SEED = 111

# Seed NumPy's legacy global random state.
np.random.seed(RANDOM_SEED)

from numpy.random import default_rng
# Seeded NumPy Generator instance.
rng = default_rng(RANDOM_SEED)

from sklearn.metrics import roc_curve, auc, roc_auc_score, accuracy_score, mean_squared_log_error
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler, StandardScaler, OneHotEncoder, Binarizer, KBinsDiscretizer, QuantileTransformer, PolynomialFeatures
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, KFold, StratifiedKFold, StratifiedShuffleSplit, ShuffleSplit
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn import set_config

from statsmodels.tsa.seasonal import seasonal_decompose

import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.python.keras.losses import mean_squared_logarithmic_error

# Seed TensorFlow's global random generator with the shared seed.
tf.random.set_seed(RANDOM_SEED)

# Directory from which train.csv and test.csv are read.
INPUT_DIR = '/kaggle/input/tabular-playground-series-jul-2021'
# Batch size used when batching the tf.data datasets.
BATCH_SIZE = 1024

# Load dataset

In [None]:
def season(month):
  """Map a month number to a season code.

  Returns 0 for winter (12, 1, 2), 1 for spring (3, 4, 5), 2 for summer
  (6, 7, 8) and 3 (autumn) for every other value.
  """
  if month in (12, 1, 2):
    return 0
  if month in (3, 4, 5):
    return 1
  if month in (6, 7, 8):
    return 2
  return 3

def daytime(hour):
  """Return 0 (light) when hour is strictly between 5 and 17, else 1 (darkness)."""
  return 0 if 5 < hour < 17 else 1

# Read the train and test CSVs, indexed by their 'date_time' column.
train_df = pd.read_csv(os.path.join(INPUT_DIR, 'train.csv'), index_col='date_time')
test_df = pd.read_csv(os.path.join(INPUT_DIR, 'test.csv'), index_col='date_time')

# Turn the string index into real timestamps so .hour/.month etc. are available.
train_df.index = pd.to_datetime(train_df.index)
test_df.index = pd.to_datetime(test_df.index)

# Separate the three regression targets from the training features.
labels = train_df[['target_carbon_monoxide','target_benzene','target_nitrogen_oxides']]
train_df = train_df.drop(columns=labels.columns)

# Stack train and test rows so feature engineering is applied to both at once.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported equivalent.
total_df = pd.concat([train_df, test_df])

# Derived weather features, computed column-wise instead of with per-row lambdas.
# NOTE: np.log yields -inf/NaN (with a warning) for non-positive humidity, where
# math.log raised ValueError.
deg_c = total_df['deg_C']
abs_humidity = total_df['absolute_humidity']

total_df['dew_point'] = (17.27 * deg_c) / (237.7 + deg_c) + np.log(abs_humidity)
total_df['partial_pressure'] = ((237.7 + deg_c) * 286.8 * abs_humidity) / 100000
total_df['saturated_wvd'] = (abs_humidity * 100) / total_df['relative_humidity']

# Calendar features read straight from the DatetimeIndex.
timestamps = total_df.index
total_df['dt_hour'] = timestamps.hour
total_df['dt_weekday'] = timestamps.weekday
total_df['dt_month'] = timestamps.month
total_df['dt_season'] = timestamps.month.map(season)
total_df['dt_lights'] = timestamps.hour.map(daytime)

# Cyclical encoding of the month (period of 12 months).
month_angle = np.pi * (total_df['dt_month'] - 1) / 6
total_df['dt_month_s'] = np.sin(month_angle)
total_df['dt_month_c'] = np.cos(month_angle)

# The sine/cosine values are then replaced by integer category codes.
# NOTE(review): the codes depend on which float values compare equal - confirm
# this ordinal re-encoding is intended.
total_df['dt_month_s'] = total_df['dt_month_s'].astype('category').cat.codes
total_df['dt_month_c'] = total_df['dt_month_c'].astype('category').cat.codes

# Binary flags: hours 8..20 inclusive, and weekday codes 5 and 6.
total_df["dt_working_hours"] = total_df["dt_hour"].between(8, 20).astype("int")
total_df["dt_weekend"] = (total_df["dt_weekday"] >= 5).astype("int")

# Names of the five sensor columns.
sensors = [f'sensor_{number}' for number in range(1, 6)]

# Columns whose name contains 'dt_' are treated as categorical, the rest as numeric.
all_columns = total_df.columns
cat_cols = np.array([name for name in all_columns if 'dt_' in name])
num_cols = np.array([name for name in all_columns if 'dt_' not in name])
total_cols = np.concatenate((num_cols, cat_cols))

# Positional index of every categorical column inside total_df.
cat_cols_idx = [np.flatnonzero(all_columns == name)[0] for name in cat_cols]

# Analyse features

In [None]:
# Per-column summary statistics; `keys` labels the columns, which were previously
# shown as the anonymous 0..4.
pd.concat(
    (total_df.min(), total_df.max(), total_df.mean(), total_df.std(), total_df.nunique()),
    axis=1,
    keys=['min', 'max', 'mean', 'std', 'nunique'],
)

In [None]:
# Clustered heatmap of the pairwise correlations between all columns.
feature_corr = total_df.corr()
sns.clustermap(feature_corr, annot=True, square=True)
plt.show()

All input features, especially temperature (`deg_C`), are seasonal, so we can use windowed sequences with RNN models here.

In [None]:
# One subplot per numeric column across all train and test rows.
numeric_features = total_df[num_cols]
numeric_features.plot(subplots=True, layout=(3,4), figsize=(20,5))
plt.show()

In [None]:
# Same grid of subplots, but showing the row-to-row percentage change of each column.
numeric_returns = total_df[num_cols].pct_change()
numeric_returns.plot(subplots=True, layout=(3,4), figsize=(20,5))
plt.show()

NOTE: There is a temperature anomaly on "2011-02-06": a very low morning temperature followed by a sharp jump to the normal midday temperature.<br/>
I tried using the returns of the feature values instead of the absolute values, but got worse results.

In [None]:
# Percentage change of the temperature on the anomalous day.
anomaly_day = '2011-02-06'
total_df.loc[anomaly_day, 'deg_C'].pct_change().plot();

In [None]:
# Temperatures between 08:00 and 14:00 on the anomalous day.
total_df.loc[slice('2011-02-06T08:00', '2011-02-06T14:00'), 'deg_C']

In [None]:
# Same hours on the previous day, for comparison.
total_df.loc[slice('2011-02-05T08:00', '2011-02-05T14:00'), 'deg_C']

Correlation between seasonalities

In [None]:
# Extract the seasonal component of every numeric column, then correlate them.
# NOTE(review): period=255 does not match the 24h cycle discussed in the text -
# confirm the intended period.
seasonality_dict = {}
for column in num_cols:
    decomposition = seasonal_decompose(total_df[column], period=255)
    seasonality_dict[column] = decomposition.seasonal

seasonality_corr = pd.DataFrame(seasonality_dict).corr()

sns.clustermap(seasonality_corr, annot=True)
plt.show()

As we can see, in both cases the autocorrelation is highest at a lag of 24h.

In [None]:
from statsmodels.graphics import tsaplots

# Autocorrelation (up to 30 lags) of the temperature and of the first target.
first_target = labels.columns[0]
tsaplots.plot_acf(total_df["deg_C"], lags=30, title='deg_C')
tsaplots.plot_acf(labels[first_target], lags=30, title=first_target)
plt.show()

The rolling STD for 24h windows is not constant, so it is not a "white noise".

In [None]:
# Standard deviation of the 24-row rolling standard deviation of the temperature.
rolling_std_24 = total_df["deg_C"].rolling(24).std()
rolling_std_24.std()

...and it is not a "random walk" - p<0.05

In [None]:
from statsmodels.tsa.stattools import adfuller

# Augmented Dickey-Fuller test on the first target column.
first_target_series = labels[labels.columns[0]]
results = adfuller(first_target_series)
adf_statistic, p_value, critical_values = results[0], results[1], results[4]

print(f"ADF Statistic: {adf_statistic}")
print(f"p-value: {p_value}")
print("Critical Values:")
for level, threshold in critical_values.items():
    print("\t%s: %.3f" % (level, threshold))

In [None]:
# Per-target summary statistics; `keys` labels the columns, which were previously
# shown as the anonymous 0..3.
pd.concat(
    (labels.min(), labels.max(), labels.mean(), labels.nunique()),
    axis=1,
    keys=['min', 'max', 'mean', 'nunique'],
)

Target labels are highly correlated.

In [None]:
# Clustered heatmap of the correlations between the three targets.
label_corr = labels.corr()
sns.clustermap(label_corr, annot=True, figsize=(5,5))
plt.show()

We will split the dataset into the 24h windows

In [None]:
# Frequency spectrum of the temperature series (FFT of a real-valued signal).
temperature = total_df['deg_C']
fft = tf.signal.rfft(temperature)
f_per_dataset = np.arange(0, len(fft))

# Rescale "cycles per dataset" to "cycles per year"; each row is treated as one hour.
n_samples_h = len(temperature)
hours_per_year = 24*365.2524
years_per_dataset = n_samples_h/(hours_per_year)
f_per_year = f_per_dataset/years_per_dataset

# Log-scaled spectrum with the yearly and daily frequencies marked on the x axis.
plt.step(f_per_year, np.abs(fft))
plt.xscale('log')
plt.ylim(0, 50000)
plt.xlim([0.1, max(plt.xlim())])
plt.xticks([1, 365.2524], labels=['1/Year', '1/day'])
plt.xlabel('Frequency (log scale)');

# Preprocess features

In [None]:
# Pairwise interaction terms for the sensor columns; other numeric columns pass through.
sensor_interactions = ColumnTransformer(
  [('poly', PolynomialFeatures(interaction_only=True, include_bias=False), sensors)],
  remainder='passthrough',
)

# Numeric branch: sensor interactions, then ordinal binning into 32 bins.
numeric_pipeline = Pipeline([
  ('sensors', sensor_interactions),
  ('kbins', KBinsDiscretizer(n_bins=32, encode='ordinal')),
])

# Full preprocessing: numeric branch plus dense one-hot encoding of the 'dt_' columns.
# NOTE(review): the `sparse` argument was renamed `sparse_output` in newer
# scikit-learn releases - confirm against the installed version.
pipe_pre = ColumnTransformer(
  [
    ('num', numeric_pipeline, num_cols),
    ('cat', OneHotEncoder(sparse=False), cat_cols),
  ],
  remainder='passthrough',
)

# The transformers are fitted on the combined train+test frame.
total_data = pipe_pre.fit_transform(total_df).astype('float')

# Split back into train and test rows using the original training length.
n_train_rows = train_df.index.shape[0]
train_data, test_data = total_data[:n_train_rows], total_data[n_train_rows:]

Example of how the windowing is working.<br/>
There are many methods that do windowing. Each of them has pros and cons.

`test_data = skimage.util.view_as_windows(test_data, (16, 2000)).reshape((-1, 16, 2000, 1))`

`tf.keras.preprocessing.timeseries_dataset_from_array(train_data, exp_labels[1], sequence_length=window_size, batch_size=BATCH_SIZE)`

`tf.keras.preprocessing.sequence.TimeseriesGenerator`

```
exp_test_data = [np.roll(test_data, -i, axis=0) for i in range(window_size)]
exp_test_data = np.moveaxis(np.stack(exp_test_data),0,-1)
```
  
We will use the simplest:

In [None]:
# Toy demonstration of the windowing: every row of exp_data holds `wsize`
# consecutive values and is paired with the target of its last element.
wsize = 3
data = list(range(1, 13))
target = [-value for value in data]

# Column k is the series shifted by k steps, so row r is data[r:r+wsize].
n_windows = len(data) - wsize + 1
exp_data = np.column_stack([data[shift:shift + n_windows] for shift in range(wsize)])

exp_target = target[wsize-1:]

print(exp_data)
print(exp_target)

In [None]:
class CustomReduceLROnPlateau(tf.keras.callbacks.ReduceLROnPlateau):
  """ReduceLROnPlateau variant that reloads saved weights before a reduction.

  When the monitored value did not improve, the callback is not in cooldown and
  the patience counter is about to reach `patience`, the weights stored at
  `weights_path` are loaded into the model before the parent callback runs
  (presumably the epoch on which the parent lowers the learning rate).

  Args:
    *args, **kwargs: forwarded unchanged to `ReduceLROnPlateau`.
    weights_path: weights file to restore; defaults to "filepath.h5".
  """

  def __init__(self, *args, weights_path="filepath.h5", **kwargs):
    super().__init__(*args, **kwargs)
    self.weights_path = weights_path

  def on_epoch_end(self, epoch, logs=None):
    current = (logs or {}).get(self.monitor)
    # A missing metric used to reach monitor_op(None, best) and raise TypeError;
    # skip the reload and leave the handling to the parent callback.
    if current is not None:
      stalled = not self.monitor_op(current, self.best) and not self.in_cooldown()
      if stalled and self.wait + 1 >= self.patience:
        self.model.load_weights(self.weights_path)

    super().on_epoch_end(epoch, logs)


def create_windows(data, window_size):
  """Turn a row-indexed 2D table into overlapping windows of consecutive rows.

  For input with N rows and D columns the result has shape
  (N - window_size + 1, window_size, D); window r holds rows r .. r+window_size-1.
  """
  last_offset = window_size - 1
  # Copy number k of the data is shifted by k rows and trimmed to a common length.
  shifted = [data[offset:offset - last_offset] for offset in range(last_offset)]
  shifted.append(data[last_offset:])
  # dstack puts the shift on the last axis; swap it with the column axis.
  return np.swapaxes(np.dstack(shifted), 1, 2)


def append_label(data, target):
  """Append `target` to `data` as one extra entry along axis 1.

  Each target value is repeated across axis 2 so it matches the width of
  `data`. The width was hard-coded to 264; it is now read from `data.shape[2]`,
  which is the only width for which the concatenation could succeed.

  Args:
    data: array-like with at least 3 dimensions, e.g. (N, k, width).
    target: array-like whose first dimension is N.

  Returns:
    Array with one more entry along axis 1, e.g. (N, k + 1, width).

  Raises:
    ValueError: if `data` has fewer than 3 dimensions.
  """
  data = np.asarray(data)
  if data.ndim < 3:
    raise ValueError("data must have at least 3 dimensions, got %d" % data.ndim)

  width = data.shape[2]
  # (width, N, ...) -> (1, width, N, ...) -> (N, 1, width, ...)
  target = np.moveaxis(np.expand_dims([target] * width, axis=0), 2, 0)
  return np.append(data, target, axis=1)


def RMSLE(y_true, y_pred):
  """Root mean squared logarithmic error, usable as a Keras loss.

  Computes sqrt(mean((log(1 + y_pred) - log(1 + y_true))^2)) over all elements.
  `y_true` is cast to the dtype of `y_pred` so the function also works when
  called directly with mixed dtypes, and log1p is used for accuracy near zero.
  """
  y_pred = tf.convert_to_tensor(y_pred)
  y_true = tf.cast(y_true, y_pred.dtype)
  log_difference = tf.math.log1p(y_pred) - tf.math.log1p(y_true)
  return tf.sqrt(tf.reduce_mean(tf.square(log_difference)))


def train_model(train_tensor, test_tensor, model, val_tensor=None, epochs=10,
                learning_rate=0.01, weights_path='filepath.h5'):
  """Compile and fit `model`, restore its best weights and predict the test set.

  Args:
    train_tensor: batched dataset of (features, labels) used for fitting.
    test_tensor: batched dataset of features to predict.
    model: Keras model to train.
    val_tensor: dataset used for `val_loss`. Defaults to `train_tensor`, which
      reproduces the previous behaviour; pass a held-out dataset so that early
      stopping and checkpointing react to generalisation rather than training fit.
    epochs: maximum number of training epochs.
    learning_rate: learning rate of the wrapped Adam optimizer.
    weights_path: file where the best weights are checkpointed and reloaded from.

  Returns:
    Tuple (score, test_predict): the loss of the restored model on
    `train_tensor` and its predictions for `test_tensor`.
  """
  tf.keras.backend.clear_session()

  if val_tensor is None:
    val_tensor = train_tensor

  early_stop  = tf.keras.callbacks.EarlyStopping(patience=3, monitor='val_loss', mode='min', min_delta=0.00001)
  check_point = tf.keras.callbacks.ModelCheckpoint(filepath=weights_path, monitor='val_loss', mode='min', save_best_only=True, save_weights_only=True)

  optimizer = tfa.optimizers.SWA(tf.keras.optimizers.Adam(learning_rate))
  model.compile(loss=RMSLE, optimizer=optimizer)
  model.fit(train_tensor, validation_data=val_tensor, epochs=epochs, callbacks=[early_stop, check_point], verbose=1)
  # Continue with the checkpointed (lowest val_loss) weights, not the last epoch's.
  model.load_weights(weights_path)

  score = model.evaluate(train_tensor)
  test_predict = model.predict(test_tensor)

  return score, test_predict


# Number of consecutive rows per input window.
window_size = 24
# Batch size for the tf.data datasets (same value as the earlier definition).
BATCH_SIZE = 1024
# Number of columns in the preprocessed feature matrix.
dims = train_data.shape[1]

# Build 2D model

This is the simplest, flat DNN model that doesn't use windows.<br/>
All models are split into 3 identical branches that each return 1 target label (similar to sklearn's `MultiOutputRegressor`), because I received worse results when training with 1 shared branch:<br/>
`tf.keras.layers.Dense(3, activation="sigmoid")`<br/>
The activation function is always `relu`, because all models returned worse results when trained with `sigmoid` or `softmax`.<br/>
The 2D model returned the worst result.

In [None]:
# Display the pre-rendered diagram of the 2D model.
diagram_path = '../input/jul21plot/model_2d.png'
img_array = np.array(Image.open(diagram_path))
plt.figure(figsize=(20, 10))
plt.imshow(img_array)

In [None]:
def create_model_2d(dims):
  """Build the flat (non-windowed) network with three identical branches.

  Each branch is a stack of ReLU Dense layers of width dims, dims//2, dims//4
  and 1; the three single-unit outputs are concatenated into one output.
  """
  inp = tf.keras.layers.Input(shape=(dims,))

  def build_branch(features):
    hidden = features
    for units in (dims, dims//2, dims//4, 1):
      hidden = tf.keras.layers.Dense(units, activation="relu")(hidden)
    return hidden

  # Branches are created one after another, one per output column.
  outputs = [build_branch(inp) for _ in range(3)]
  y = tf.keras.layers.Concatenate()(outputs)
  return tf.keras.Model(inputs=inp, outputs=y)


# Row-wise (non-windowed) datasets; the last incomplete training batch is dropped.
train_tensor = (
  tf.data.Dataset.from_tensor_slices((train_data, labels))
  .batch(BATCH_SIZE, drop_remainder=True)
  .cache()
)
test_tensor = (
  tf.data.Dataset.from_tensor_slices(test_data)
  .batch(BATCH_SIZE)
  .cache()
)

model = create_model_2d(dims)
score, test_predict = train_model(train_tensor, test_tensor, model)

# Build 3D model

This is the RNN model that uses 24h windows for input features and one-step-prediction for target labels.<br/>
I tried other window sizes (8h, 12h, 7d, 11d, 30d), and they all returned worse results.<br/>
This model returned the best local result of `0.1564716398715973` (`0.2` in Private Score) after about 150 steps.

In [None]:
# Display the pre-rendered diagram of the 3D model.
diagram_path = '../input/jul21plot/model_3d.png'
img_array = np.array(Image.open(diagram_path))
plt.figure(figsize=(20, 10))
plt.imshow(img_array)

In [None]:
def simple_model_3d(dims, wsize):
  """Build a small two-layer SimpleRNN model with a single ReLU output unit.

  Args:
    dims: number of features per time step.
    wsize: number of time steps per window; also the width of the first RNN
      layer (the second uses wsize//2).

  Returns:
    The Keras model; its summary is printed as a side effect.
  """
  inp = tf.keras.layers.Input(shape=(wsize, dims))
  x = tf.keras.layers.SimpleRNN(wsize, return_sequences=True)(inp)
  x = tf.keras.layers.SimpleRNN(wsize//2)(x)
  y = tf.keras.layers.Dense(1, activation="relu")(x)
  model = tf.keras.Model(inputs=inp, outputs=y)
  # summary() prints by itself and returns None, so wrapping it in print()
  # emitted a stray "None" line.
  model.summary()
  return model


def create_model_3d(dims, wsize):
  """Build the windowed model with three identical bidirectional-LSTM branches.

  Each branch stacks two bidirectional LSTMs of width wsize (the second returns
  only its final state) followed by ReLU Dense layers of width wsize, wsize//2
  and 1. The three single-unit outputs are concatenated. plot_model is also
  called on the finished model.
  """
  inp = tf.keras.layers.Input(shape=(wsize, dims))

  def build_branch(sequence):
    hidden = tf.keras.layers.Bidirectional(
      tf.keras.layers.LSTM(wsize, return_sequences=True))(sequence)
    hidden = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(wsize))(hidden)
    for units in (wsize, wsize//2, 1):
      hidden = tf.keras.layers.Dense(units, activation="relu")(hidden)
    return hidden

  outputs = [build_branch(inp) for _ in range(3)]
  y = tf.keras.layers.Concatenate()(outputs)
  model = tf.keras.Model(inputs=inp, outputs=y)
  tf.keras.utils.plot_model(model, show_shapes=True, rankdir="TB")
  return model


# One label row per window: the label of the window's last row.
exp_labels = labels.values[window_size-1:]
exp_train_data = create_windows(train_data, window_size)

# Prefix the test rows with the last `window_size` training rows so that there is
# one window per test row.
# NOTE(review): test_data[1:] skips the first test row - presumably it repeats the
# last training row; confirm against the data.
test_with_history = np.concatenate([train_data[-window_size:], test_data[1:]])
exp_test_data = create_windows(test_with_history, window_size)

train_tensor = (
  tf.data.Dataset.from_tensor_slices((exp_train_data, exp_labels))
  .batch(BATCH_SIZE, drop_remainder=True)
  .cache()
)
test_tensor = (
  tf.data.Dataset.from_tensor_slices(exp_test_data)
  .batch(BATCH_SIZE)
  .cache()
)

model = create_model_3d(dims, window_size)
score, test_predict = train_model(train_tensor, test_tensor, model)

In [None]:
# Display the pre-rendered diagram of the 4D model.
diagram_path = '../input/jul21plot/model_4d.png'
img_array = np.array(Image.open(diagram_path))
plt.figure(figsize=(20, 10))
plt.imshow(img_array)

# Build 4D model

This is the RNN model that uses 24h windows for input features and 24h multi-step prediction for target labels.<br/>
An example is described here: https://mobiarch.wordpress.com/2020/11/13/preparing-time-series-data-for-rnn-in-tensorflow/<br/>
This model returned a local result of `0.3389951288700104` after about 150 steps.

In [None]:
def create_model_4d(dims, wsize):
  """Build the multi-step model: three branches emitting one value per time step.

  Each branch stacks two sequence-returning bidirectional LSTMs of width
  wsize//2, a time-distributed Dense layer of width wsize//2 and a single-unit
  ReLU Dense layer. The three per-step outputs are concatenated on the last axis.
  """
  inp = tf.keras.layers.Input(shape=(wsize, dims))

  def build_branch(sequence):
    hidden = sequence
    for _ in range(2):
      hidden = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(wsize//2, return_sequences=True))(hidden)
    hidden = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(wsize//2))(hidden)
    return tf.keras.layers.Dense(1, activation="relu")(hidden)

  outputs = [build_branch(inp) for _ in range(3)]
  y = tf.keras.layers.Concatenate()(outputs)
  return tf.keras.Model(inputs=inp, outputs=y)


# Features and labels are windowed identically, so every time step of a feature
# window has its own label row.
exp_labels = create_windows(labels, window_size)
exp_train_data = create_windows(train_data, window_size)

# Prefix the test rows with the last `window_size` training rows so that there is
# one window per test row.
# NOTE(review): test_data[1:] skips the first test row - presumably it repeats the
# last training row; confirm against the data.
exp_test_data = np.concatenate([train_data[-window_size:,:], test_data[1:]])
exp_test_data = create_windows(exp_test_data, window_size)

train_tensor = tf.data.Dataset.from_tensor_slices((exp_train_data, exp_labels)).batch(BATCH_SIZE, drop_remainder=True).cache()
test_tensor = tf.data.Dataset.from_tensor_slices(exp_test_data).batch(BATCH_SIZE).cache()

model = create_model_4d(dims, window_size)
score, test_predict = train_model(train_tensor, test_tensor, model)
# Predictions come back as (windows, time steps, targets). Keep the final time
# step of every window for all three targets. The former `[:, :, 0]` kept every
# time step of the first target only, which cannot fill the three target columns.
test_predict = test_predict[:, -1, :]

# Save results to CSV file

In [None]:
# Submission frame: the timestamp column followed by one column per target.
output_res = pd.DataFrame(test_predict, index=test_df.index, columns=labels.columns)
output_res.insert(0, 'date_time', test_df.index.values)
output_res.to_csv('./submission.csv', index=False)

# Validate the 3D model residuals

As we can see, the residuals still contain the autocorrelations and trend/seasonality patterns.

In [None]:
# Labels aligned with the windowed training predictions (one per window).
exp_labels = labels.values[window_size-1:]
train_predict = pd.read_csv('../input/predicted-train-ds/train_predict.csv')

# Subtract positionally on the raw values. Passing a DataFrame together with
# `columns=` to the DataFrame constructor selects columns by label, which gives
# all-NaN residuals whenever the CSV header differs from the target names.
residuals = pd.DataFrame(train_predict.values - exp_labels, columns=labels.columns)

print('mean: \n', residuals.mean(axis=0))
print('std: \n', residuals.std(axis=0))

In [None]:
# RMSLE of the stored training predictions. Arguments follow sklearn's
# (y_true, y_pred) order; the squared log error is symmetric, so the value is
# the same either way.
msle = mean_squared_log_error(exp_labels, train_predict)
loss = np.sqrt(msle)
print('loss function:', loss)

In [None]:
# Plot the residuals of each target in its own subplot.
residuals.plot(subplots=True, figsize=(20,5))
plt.show()

In [None]:
# Autocorrelation of the first target's residuals. The title now names the column
# that is actually plotted (it previously used column 2).
residual_column = residuals.columns[0]
tsaplots.plot_acf(residuals[residual_column], lags=30, title=residual_column)
plt.show()

In [None]:
# Decompose the first target's residuals once and plot the seasonal and trend parts.
residual_decomposition = seasonal_decompose(residuals[residuals.columns[0]], period=255)
residual_decomposition.seasonal.plot()
residual_decomposition.trend.plot()