In [1]:
import random
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import trange
import matplotlib.pyplot as plt
from datetime import datetime, date 
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder, StandardScaler, RobustScaler

import os; os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'


<hr>
<div>    
    <div style = "float:left; width:55%; overflow:hidden;">        
        <br><br><br><br>        
        <span style = "float:right;">
        <h2>Time Series Transformers</h2>
        <p>How to use Transformers for Time-series data? ü§î</p>
        <br>
        <b></b>
        <b>
        - üåé <a href="https://www.kaggle.com/c/tabular-playground-series-jan-2022/discussion/300195">Discussion Thread</a>
        <br>
        - üá∞ <a href="https://www.kaggle.com/c/tabular-playground-series-jan-2022">The Competition</a>
        </b>            
        </span>
    </div>
    <div style="float:right; width:35%; max-height:300px; overflow: hidden;">        
        <img src="https://i.ibb.co/5hrW6cC/Robotics.gif" style = "max-height: 300px;">         
    </div>
</div>
<hr>

_____

<br>

## Training Notebok [Here](https://www.kaggle.com/yamqwe/tutorial-time-series-transformer-time2vec)
<br>

_____

In [2]:
import gc
import os
import math
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import dateutil.easter as easter
from sklearn.linear_model import HuberRegressor
from sklearn.preprocessing import StandardScaler
import warnings; warnings.filterwarnings('ignore')

In [3]:
LR = 1e-05
WINDOW_SIZE = 60


original_train_df = pd.read_csv('./train.csv')
original_test_df = pd.read_csv('./test.csv')
gdp_df = pd.read_csv('./GDP_data_2015_to_2019_Finland_Norway_Sweden.csv')
gdp_df.set_index('year', inplace=True)

for df in [original_train_df, original_test_df]:
    df['date'] = pd.to_datetime(df.date)
    df.set_index('date', inplace=True, drop=False)
original_train_df.head(2)

Unnamed: 0_level_0,row_id,date,country,store,product,num_sold
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-01-01,0,2015-01-01,Finland,KaggleMart,Kaggle Mug,329
2015-01-01,1,2015-01-01,Finland,KaggleMart,Kaggle Hat,520


In [4]:
def smape_loss(y_true, y_pred): return np.abs(y_true - y_pred) / (y_true + np.abs(y_pred)) * 200

def engineer(df):
    def get_gdp(row):
        country = 'GDP_' + row.country
        return gdp_df.loc[row.date.year, country]
    new_df = pd.DataFrame({'gdp': np.log(df.apply(get_gdp, axis = 1)), 'wd4': df.date.dt.weekday == 4, 'wd56': df.date.dt.weekday >= 5})
    for country in ['Finland', 'Norway']: new_df[country] = df.country == country
    new_df['KaggleRama'] = df.store == 'KaggleRama'
    for product in ['Kaggle Mug', 'Kaggle Sticker']: new_df[product] = df['product'] == product
    dayofyear = df.date.dt.dayofyear
    for k in range(1, 20):
        new_df[f'sin{k}'] = np.sin(dayofyear / 365 * 2 * math.pi * k)
        new_df[f'cos{k}'] = np.cos(dayofyear / 365 * 2 * math.pi * k)
        new_df[f'mug_sin{k}'] = new_df[f'sin{k}'] * new_df['Kaggle Mug']
        new_df[f'mug_cos{k}'] = new_df[f'cos{k}'] * new_df['Kaggle Mug']
        new_df[f'sticker_sin{k}'] = new_df[f'sin{k}'] * new_df['Kaggle Sticker']
        new_df[f'sticker_cos{k}'] = new_df[f'cos{k}'] * new_df['Kaggle Sticker']
    return new_df

def rolling_window(df, y = None, window_size = 10):
    all_features, all_targets = [], []
    for i in range(0, len(df) - window_size):
        all_features.append(np.expand_dims(df[i: i + window_size].values, axis = 0))
        if y is not None: all_targets.append(np.expand_dims(y[i + window_size], axis = 0))
    if y is not None: return np.concatenate(all_features, axis = 0), np.concatenate(all_targets, axis = 0)
    else: return np.concatenate(all_features, axis = 0)


> ### See training notebook [here](https://www.kaggle.com/yamqwe/tutorial-time-series-transformer-time2vec)


# <span class="title-section w3-xxlarge" id="ts_trans">Time Series Transformers ü§ñ</span>
<hr> 

**Let's apply a Time-Series transformer with the above set-up!**

<!-- ![Multivariate Time Series Transformer Framework](https://storage.googleapis.com/kaggle-markpeng/GoogleBrainVentilator/mvts_transformer_architecture.png) -->

<img src="https://storage.googleapis.com/kaggle-markpeng/GoogleBrainVentilator/mvts_transformer_architecture.png" alt="Multivariate Time Series Transformer Framework" width="800"/>


_____
**Credits:** Parts of this implementation were insipred / based on: 

- **George Zerveas _et al._ (2021). "_A Transformer-based Framework for Multivariate Time Series Representation Learning_," Proceedings of the 27th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD '21).** - https://arxiv.org/abs/2010.02803
- **TensorFlow Transformer** - https://www.kaggle.com/cdeotte/tensorflow-transformer-0-112
- **Time2Vec** - https://towardsdatascience.com/the-time-series-transformer-2a521a0efad3

_____


As we all know, transformers are taking over the state-of-the-art title in any field they get into. It was just a matter of time until we got the first papers implementing them for time-series. 

This rest of the notebook implements a transformer model for learning the representation of a Time-series. 
It learns to attend both to preceding and succeeding segments in individual features, as well as the inter-dependencies between features.


**Differences between Transformers and Time-Series Transformers:**

- 1. In the paper, they use batchnorm rather then layernorm, this is because the problem with batchnorm in the first place was variation in input length for NLP. This is the reason for the inferior performance of batch normalization in NLP (i.e., sentences in most tasks) (Shen et al., 2020). 

- 2. In the paper they use a **Fully learnable positional encodings** rather then a fixed positionl encoding (As the classic BERT is using. It is observed that they perform better for all datasets presented in the paper.

- 3. In the paper they have an unsupervised trainin phased using Masked MLM input (sequencial masking starting from left). The model then try to predict all of the input vector but the loss given is only from the masked indices. The reconstruction loss is mse. 

- 4. In this notebook we apply residual skip connections as it is done on the implementation linked above: We use a constant "SKIP_CONNECTION_STRENGTH" and muliply the skip connection by it. This improves performance in practice. 

- 5. In this notebook we extend the "learnable embeddings" and apply Time2Vec ([paper](https://arxiv.org/abs/1907.05321)). This also improves performance in practice. 


<br>

### Architecture & Loss Function

The model is trained with keras for about 1,000 epochs using combined training and test sets. The loss function is **MAE loss** trained end2end to the targets. You could play around the hyperparameters for a larger model or using more hand-crafted features as the input.

![](https://i.ibb.co/TTDp6kZ/1-TWkm-OC8a-K-2-Tgc-CIm-m-KMg.png)

In [5]:
import os; os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
from tensorflow import keras
import pandas as pd, numpy as np
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Input, LSTM, Dense, Bidirectional, Concatenate, Reshape, Lambda

AttributeError: type object 'h5py.h5.H5PYConfig' has no attribute '__reduce_cython__'

In [9]:
#pip install h5py
#import tensorflow as tf

print(tensorflow.__version__)

NameError: name 'tensorflow' is not defined

In [None]:
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, feat_dim, num_heads, ff_dim, rate = 0.1):
        super(TransformerBlock, self).__init__()
        self.att = layers.MultiHeadAttention(num_heads = num_heads, key_dim = embed_dim)
        self.ffn = keras.Sequential( [layers.Dense(ff_dim, activation = "gelu"), layers.Dense(feat_dim),] )
        self.layernorm1 = layers.LayerNormalization(epsilon = 1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon = 1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training = training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training = training)
        return self.layernorm2(out1 + ffn_output)

<br>

### Time 2 Vec Embeddings

Mathematically speaking, implementing Time2Vec is quite easy:

![](https://i.ibb.co/yhphBqc/1-E8-Zi-ELOi-Gv-Nu-WCAHX8f-Rw.png)

[from https://arxiv.org/pdf/1907.05321.pdf]

<br>

Where k is the time2vec dimension, tau is a raw time series, F is a periodic activation function, omega and phi are a set of learnable parameters. 

On this notebook we set F to be a sin function in order to help the algorithm capture periodic behaviors in data. At the same time, the linear term represents the progression of time and can be used for capturing non-periodic patterns in the input that depend on time.

In [None]:
class Time2Vec(tf.keras.layers.Layer):
    def __init__(self, kernel_size = 1):
        super(Time2Vec, self).__init__(trainable = True, name = 'Time2VecLayer')
        self.k = kernel_size

    def build(self, input_shape):
        self.wb = self.add_weight(name = 'wb', shape = (input_shape[1],), initializer = 'uniform', trainable = True)
        self.bb = self.add_weight(name = 'bb', shape = (input_shape[1],), initializer = 'uniform', trainable = True)
        self.wa = self.add_weight(name = 'wa', shape = (1, input_shape[1], self.k), initializer = 'uniform', trainable = True)
        self.ba = self.add_weight(name = 'ba', shape = (1, input_shape[1], self.k), initializer = 'uniform', trainable = True)
        super(Time2Vec, self).build(input_shape)

    def call(self, inputs, **kwargs):
        bias = self.wb * inputs + self.bb
        dp = K.dot(inputs, self.wa) + self.ba
        wgts = K.sin(dp)
        ret = K.concatenate([K.expand_dims(bias, -1), wgts], -1)
        ret = K.reshape(ret, (-1, inputs.shape[1] * (self.k + 1)))
        return ret
    def compute_output_shape(self, input_shape): return (input_shape[0], input_shape[1] * (self.k + 1))

> ### See training notebook [here](https://www.kaggle.com/yamqwe/tutorial-time-series-transformer-time2vec)

# <span class="title-section w3-xxlarge" id="putting">Putting It All Together üèóÔ∏è</span>
<hr>

In [None]:
def get_model(input_shape, time2vec_dim = 3):
    inp = Input(input_shape)
    x = inp

    time_embedding = keras.layers.TimeDistributed(Time2Vec(time2vec_dim - 1))(x)
    x = Concatenate(axis = -1)([x, time_embedding])
    x = layers.LayerNormalization(epsilon = 1e-6)(x)
    x = layers.LayerNormalization(epsilon = 1e-6)(x)

    for k in range(1):
        x_old = x
        transformer_block = TransformerBlock(64, input_shape[-1] + ( input_shape[-1] * time2vec_dim), 8, 4096, 0.0)
        x = transformer_block(x)
        x = ((1.0 - 0.2) * x) + (0.2 * x_old)

    x = layers.Flatten()(x)
    x = layers.Dense(128, activation = "selu")(x)
    x = layers.Dropout(0.0)(x)
    x = Dense(1, activation = 'linear')(x)

    out = x
    model = Model(inp, out)
    model.compile(tf.keras.optimizers.Adam(LR), 'mae')
    return model

In [None]:
train_df = engineer(original_train_df)
train_df['date'] = original_train_df.date
train_df['num_sold'] = original_train_df.num_sold.astype(np.float32)
test_df = engineer(original_test_df)
features = test_df.columns
for df in [train_df, test_df]: df[features] = df[features].astype(np.float32)

# <span class="title-section w3-xxlarge" id="training">Inference üî•</span>
<hr>

**Inference Functions**

In [None]:
def predict_model(fold_idx, train_df, all_data):
    if os.path.exists(f'./ts-transformer-trained/ts_transformer_100_epochs_fold_{fold_idx}_model_recreate_3.h5'):
        preproc = StandardScaler()
        x_all = pd.DataFrame(preproc.fit_transform(all_data[features]), columns = features)
        all_data_ts = rolling_window(x_all, window_size = WINDOW_SIZE)
        model = get_model(all_data_ts.shape[1:], 3)
        model.load_weights(f'./ts-transformer-trained/ts_transformer_100_epochs_fold_{fold_idx}_model_recreate_3.h5')
        preds = model.predict(all_data_ts)
        test_preds = np.exp(preds[len(train_df) - WINDOW_SIZE:])
        return test_preds
    return None

def engineer_more(df):
    new_df = engineer(df)
    new_df = pd.concat([new_df, pd.DataFrame({f"dec{d}": (df.date.dt.month == 12) & (df.date.dt.day == d) for d in range(24, 32)})], axis = 1)
    new_df = pd.concat([new_df, pd.DataFrame({f"jan{d}": (df.date.dt.month == 1) & (df.date.dt.day == d) for d in range(1, 13)})], axis = 1)
    new_df = pd.concat([new_df, pd.DataFrame({f"may{d}": (df.date.dt.month == 5) & (df.date.dt.day == d) for d in list(range(1, 10)) + list(range(17, 25))})], axis = 1)
    new_df = pd.concat([new_df, pd.DataFrame({f"june{d}": (df.date.dt.month == 6) & (df.date.dt.day == d) for d in list(range(6, 14))})], axis = 1)
    wed_june_date = df.date.dt.year.map({2015: pd.Timestamp(('2015-06-24')), 2016: pd.Timestamp(('2016-06-29')), 2017: pd.Timestamp(('2017-06-28')), 2018: pd.Timestamp(('2018-06-27')), 2019: pd.Timestamp(('2019-06-26'))})
    new_df = pd.concat([new_df, pd.DataFrame({f"wed_june{d}": (df.date - wed_june_date == np.timedelta64(d, "D")) for d in list(range(-5, 6))})], axis = 1)
    sun_nov_date = df.date.dt.year.map({2015: pd.Timestamp(('2015-11-1')), 2016: pd.Timestamp(('2016-11-6')), 2017: pd.Timestamp(('2017-11-5')), 2018: pd.Timestamp(('2018-11-4')), 2019: pd.Timestamp(('2019-11-3'))})
    new_df = pd.concat([new_df, pd.DataFrame({f"sun_nov{d}": (df.date - sun_nov_date == np.timedelta64(d, "D")) for d in list(range(0, 10))})], axis = 1)
    easter_date = df.date.apply(lambda date: pd.Timestamp(easter.easter(date.year)))
    new_df = pd.concat([new_df, pd.DataFrame({f"easter{d}": (df.date - easter_date == np.timedelta64(d, "D")) for d in list(range(0, 9)) + list(range(50, 60)) + list(range(40, 46))})], axis = 1)
    return new_df.astype(np.float32)

**Inference**

In [None]:
train_df = engineer_more(original_train_df)
train_df['date'] = original_train_df.date
train_df['num_sold'] = original_train_df.num_sold.astype(np.float32)
test_df = engineer_more(original_test_df)
test_df.year = 2018
features = test_df.columns

RUNS = 10
LOSS_CORRECTION = 1
TRAIN_VAL_CUT = datetime(2018, 1, 1)
np.random.seed(202100)

test_pred_list = []
all_data = pd.concat([train_df, test_df])
total_start_time = datetime.now()
for run in range(RUNS):
    test_preds = predict_model(run, train_df, all_data)    
    if test_preds is not None: test_pred_list.append(test_preds * LOSS_CORRECTION)

# <span class="title-section w3-xxlarge" id="submit">Blending & Submitting To Kaggle üá∞</span>
<hr>

In [None]:
sub = original_test_df[['row_id']].copy()
sub['num_sold'] = np.mean(np.concatenate([np.expand_dims(i, axis = 0) for i in test_pred_list], axis = 0), axis = 0)

sub_blend_1 = pd.read_csv('./tps-01-2022/submission.csv')['num_sold'].values
sub_blend_2 = pd.read_csv('./tell-me-the-magic-number/submission.csv')['num_sold'].values
sub_blend_3 = pd.read_csv('./tpsjan22-03-linear-model/submission_linear_model_rounded.csv')['num_sold'].values

sub['num_sold'] = (sub_blend_1 * 0.8) + (sub_blend_2 * 0.2)
sub.to_csv('submission_updated.csv', index=False)

plt.figure(figsize=(16,3))
plt.hist(train_df['num_sold'], bins=np.linspace(0, 3000, 201), density=True, label='Training')
plt.hist(sub['num_sold'], bins=np.linspace(0, 3000, 201), density=True, rwidth=0.5, label='Test predictions')
plt.xlabel('num_sold')
plt.ylabel('Frequency')
plt.legend()
plt.show()
sub

**Some Rounding Post Processing**

In [None]:
sub_rounded = sub.copy()
sub_rounded['num_sold'] = sub_rounded['num_sold'].round()
sub_rounded.to_csv('submission2.csv', index=False)
plt.figure(figsize=(16,3))
plt.hist(train_df['num_sold'], bins=np.linspace(0, 3000, 201), density=True, label='Training')
plt.hist(sub_rounded['num_sold'], bins=np.linspace(0, 3000, 201), density=True, rwidth=0.5, label='Test predictions')
plt.xlabel('num_sold')
plt.ylabel('Frequency')
plt.legend()
plt.show()
sub_rounded

In [11]:
pip install tensorflow

Collecting numpy~=1.19.2
  Using cached numpy-1.19.5-cp38-cp38-win_amd64.whl (13.3 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.21.2
    Uninstalling numpy-1.21.2:
      Successfully uninstalled numpy-1.21.2
Note: you may need to restart the kernel to use updated packages.


ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Users\\HP\\AppData\\Local\\Temp\\pip-uninstall-3dpfev3g\\core\\_multiarray_tests.cp38-win_amd64.pyd'
Consider using the `--user` option or check the permissions.

