# TPS-22-03 with LSTM including Moving Average

In [None]:
import numpy as np 
import tensorflow as tf
import pandas as pd
import math
from tensorflow import keras
import time

## Common Parameters

In [None]:
# Number of past time steps in each model input window; the datasets below are
# windowed with sequence_length + 1 rows (history plus the row being predicted).
sequence_length = 32
# NOTE(review): this global does not appear to be referenced in the visible
# code; post_process_dataset falls back to its own batch_size default.
batch_size = 1024
# Columns that get a lookup layer and an embedding in get_model.
categorical_columns = ["x", "y", "direction", "hour", "month", "dayofweek", "key_index"]
# Names used for the sequence Input layers in get_model.
sequence_numeric_columns = ["ma5", "ma10", "ma30", "target"]
# Moving-average columns fed to the model as plain scalar inputs.
numeric_values = ["ma5", "ma10", "ma30"]
# Every "<x>_<y>_<direction>" key (the format built in feature_engineering);
# the position of a key in this list becomes its key_index.
keys = ['0_0_EB', '0_0_NB', '0_0_SB', '0_1_EB', '0_1_NB', '0_1_SB', '0_1_WB', '0_2_EB', '0_2_NB', '0_2_SB', '0_2_WB', '0_3_EB', '0_3_NB', '0_3_NE', '0_3_SB', '0_3_SW', '0_3_WB', '1_0_EB', '1_0_NB', '1_0_NE', '1_0_SB', '1_0_SW', '1_0_WB', '1_1_EB', '1_1_NB', '1_1_SB', '1_1_WB', '1_2_EB', '1_2_NB', '1_2_NE', '1_2_SB', '1_2_SW', '1_2_WB', '1_3_EB', '1_3_NB', '1_3_NE', '1_3_SB', '1_3_SW', '1_3_WB', '2_0_EB', '2_0_NB', '2_0_SB', '2_0_WB', '2_1_EB', '2_1_NB', '2_1_NE', '2_1_NW', '2_1_SB', '2_1_SE', '2_1_SW', '2_1_WB', '2_2_EB', '2_2_NB', '2_2_NE', '2_2_NW', '2_2_SB', '2_2_SE', '2_2_SW', '2_2_WB', '2_3_EB', '2_3_NB', '2_3_NE', '2_3_SB', '2_3_SW', '2_3_WB']
# When True the model is fitted from scratch; otherwise saved weights are loaded.
is_training = False

## Import dataset

In [None]:
# Load the training data and derive the integer encodings used for the
# string-valued categorical features.
train = pd.read_csv("../input/tabular-playground-series-mar-2022/train.csv")
# direction string -> index, numbered in order of first appearance in `train`.
direction_map = {name: index for index, name in enumerate(train.direction.unique())}
# "<x>_<y>_<direction>" key -> its position in `keys`.
key_map = {name: index for index, name in enumerate(keys)}

## Feature Engineering

In [None]:
 def feature_engineering(data):
     """Return a feature-engineered copy of a raw train/test frame.

     Adds a "key" column ("<x>_<y>_<direction>"), its integer "key_index",
     integer-encodes "direction", and derives "month", "dayofweek" and "hour"
     from "time". The "time" column is dropped from the result.

     The input frame is left untouched: all columns are written to a copy, so
     callers that keep a reference to the raw frame do not see it change.

     Raises:
         KeyError: if a key or direction is missing from key_map / direction_map.
     """
     data = data.copy()
     data["key"] = data["x"].astype(str) + "_" + data["y"].astype(str) + "_" + data["direction"]
     # Index the dicts explicitly so an unseen value fails loudly instead of
     # silently turning into NaN.
     data["key_index"] = data["key"].map(lambda item: key_map[item])
     data["direction"] = data["direction"].map(lambda item: direction_map[item])
     timestamps = pd.to_datetime(data["time"])
     data["month"] = timestamps.dt.month
     data["dayofweek"] = timestamps.dt.dayofweek
     data["hour"] = timestamps.dt.hour
     return data.drop(columns=["time"])

def calculate_moving_average(data, gaps=(5, 10, 30)):
    """Return a copy of `data` with rolling means of "congestion" added.

    For every window size `gap` in `gaps` a column named "ma<gap>" is added,
    holding the mean of the current and the previous gap - 1 congestion
    values. The first gap - 1 rows of each column are NaN.

    The columns are written to a copy, so passing a slice of another frame
    neither mutates that frame nor triggers pandas' SettingWithCopyWarning.

    Args:
        data: frame with a "congestion" column, in chronological order.
        gaps: window sizes to compute; defaults to the three used by the model.

    Returns:
        A new DataFrame with the extra "ma<gap>" columns.
    """
    result = data.copy()
    congestion = result["congestion"]
    for gap in gaps:
        # Assign by position (as the original list-based assignment did).
        result[f"ma{gap}"] = congestion.rolling(gap).mean().to_numpy()
    return result

In [None]:
# Replace the raw training frame with its feature-engineered version
# (adds key / key_index and calendar columns, drops "time").
train = feature_engineering(train)
# Preview the first 30 rows.
train.head(30)

## EDA

When the dataset is grouped by x, y and direction, every group contains 13059 records.

In [None]:
# Distinct per-key row counts; a single value means every key has the same number of rows.
set(train["key"].value_counts())

The target value is very volatile, even for the latest data.

In [None]:
# Plot congestion against its 30-step moving average for one key ("0_0_EB").
gap = 30
key = f"ma{gap}"
# Copy the filtered rows so the new column is added to an independent frame
# rather than to a selection of `train` (avoids SettingWithCopyWarning).
_0_0_EB = train[train.key == "0_0_EB"].copy()
moving_average = _0_0_EB["congestion"].rolling(gap).mean()
_0_0_EB[key] = moving_average.to_numpy()
# Skip the first `gap` rows, which cover the NaN warm-up of the rolling mean.
_0_0_EB[gap:][["congestion", key]].plot()

## Make Tensorflow Time Series Dataset

In [None]:
def preprocess(window):
    """Split one 2-D window into model features and the label.

    All rows except the last are treated as history; the last row is the step
    being predicted.

    Returns:
        A pair (features, label) where features is a 14-tuple:
          * columns 0..3 of the history rows (one sequence each), followed by
          * columns 0..9 of the last row (one scalar each),
        and label is the last column of the last row.

    NOTE(review): with the column order used by make_dataset
    (categorical_columns + numeric_values + ["congestion"]) the four sequence
    slots receive x, y, direction and hour, while get_model names its sequence
    inputs after ma5 / ma10 / ma30 / target. Confirm whether columns 7..10
    were intended; any change must go together with retraining the weights.
    """
    history = window[:-1]
    current = window[-1]
    sequence_features = [history[:, column] for column in range(4)]
    scalar_features = [current[column] for column in range(10)]
    return tuple(sequence_features + scalar_features), current[-1]
def make_dataset(df, sequence_length=32):
    """Build a tf.data pipeline of sliding windows over `df`.

    The frame is reduced to categorical_columns + numeric_values +
    ["congestion"] (in that order), cut into overlapping windows of
    sequence_length + 1 consecutive rows (stride 1, incomplete windows
    dropped), and each window is passed through preprocess.

    Returns:
        An unbatched dataset of (features, label) pairs.
    """
    window_size = sequence_length + 1
    feature_columns = categorical_columns + numeric_values + ["congestion"]
    rows = tf.data.Dataset.from_tensor_slices(df[feature_columns])
    windows = rows.window(window_size, shift=1, drop_remainder=True)
    # Each window is itself a dataset; batching it yields a single 2-D tensor.
    stacked = windows.flat_map(lambda rows_in_window: rows_in_window.batch(window_size))
    return stacked.map(preprocess)

In [None]:
%%time
# Chronological train/validation split per key. Moving averages are computed
# separately on each side of the split, then windowed datasets are built.
split_fraction = 0.9
# Leading rows dropped from each split before windowing; covers the NaN
# warm-up of the longest (30-step) moving average.
ma_warmup_rows = 30
train_datasets = []
valid_datasets = []
validation_datas = []
for key in keys:
    data = train[train.key == key]
    split_index = int(len(data) * split_fraction)
    # Hand explicit copies to calculate_moving_average: it adds columns, and
    # doing that on a slice of `data` raises pandas' SettingWithCopyWarning.
    train_data = calculate_moving_average(data.iloc[:split_index].copy())
    val_data = calculate_moving_average(data.iloc[split_index:].copy())
    train_datasets.append(
        make_dataset(train_data.iloc[ma_warmup_rows:], sequence_length=sequence_length))
    valid_datasets.append(
        make_dataset(val_data.iloc[ma_warmup_rows:], sequence_length=sequence_length))
    # The full validation frame is kept; its tail later seeds the inference state.
    validation_datas.append(val_data)

In [None]:
# Chain the per-key datasets, in list order, into one training dataset and
# one validation dataset.
train_dataset = None
for dataset in train_datasets:
    # `is None` is an identity check and never dispatches to a custom __eq__.
    train_dataset = dataset if train_dataset is None else train_dataset.concatenate(dataset)

valid_dataset = None
for dataset in valid_datasets:
    valid_dataset = dataset if valid_dataset is None else valid_dataset.concatenate(dataset)

In [None]:
def post_process_dataset(dataset, batch_size=1024, mode="train", shuffle_buffer_size=None):
    """Cache, optionally shuffle, batch and prefetch a dataset.

    The cache is placed before the shuffle on purpose: caching after shuffling
    stores the first epoch's order and replays exactly that order in every
    later epoch. With the cache first, the element stream is cached once and
    reshuffled on each pass.

    Args:
        dataset: unbatched tf.data dataset of (features, label) pairs.
        batch_size: number of elements per batch.
        mode: "train" enables shuffling; any other value keeps the order.
        shuffle_buffer_size: size of the shuffle buffer; defaults to
            batch_size (the previous hard-coded behaviour).

    Returns:
        The batched, prefetching dataset.
    """
    dataset = dataset.cache()
    if mode == "train":
        if shuffle_buffer_size is None:
            shuffle_buffer_size = batch_size
        dataset = dataset.shuffle(buffer_size=shuffle_buffer_size)
    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

In [None]:
# Batch both datasets; only the training dataset is shuffled (default mode="train").
train_dataset = post_process_dataset(train_dataset)
valid_dataset = post_process_dataset(valid_dataset, mode="valid")

## Model Development

### Create Lookup layers

In [None]:
%%time
# One IntegerLookup layer per categorical column, with the vocabulary taken
# from the distinct values of that column in `train`.
lookupLayersMap = {
    column: tf.keras.layers.IntegerLookup(vocabulary=list(train[column].unique()))
    for column in categorical_columns
}

In [None]:
def get_model():
    """Build and compile the two-branch Keras model.

    Sequence branch: one Input of shape (sequence_length, 1) per entry of
    sequence_numeric_columns, concatenated on the last axis and passed through
    two stacked LSTMs and a Dense layer.

    Dense branch: one scalar Input per categorical column (lookup + embedding)
    and one per entry of numeric_values (used as is), concatenated and passed
    through a Dense layer.

    Both branches are concatenated, passed through one more Dense layer and a
    single-unit output layer.

    Returns:
        A keras.Model whose inputs are ordered sequence inputs first, then
        categorical inputs, then numeric inputs; compiled with MSE loss, the
        Adam optimizer and MAE / MAPE metrics.

    NOTE(review): layers are created in a fixed order; reordering them may
    change how previously saved weights are restored — confirm before
    restructuring this function.
    """
    sequence_inputs = []
    sequence_vectors = []
    dense_inputs = []
    dense_vectors = []
    # Sequence branch: each sequence input is used directly as a feature channel.
    for column in sequence_numeric_columns:
        sequence_input = keras.Input(shape=(sequence_length, 1), name=f"{column}_sequnce_input")
        sequence_inputs.append(sequence_input)
        sequence_vectors.append(sequence_input)
    sequence_vector = keras.layers.Concatenate(axis=-1)(sequence_vectors)
    # The first LSTM returns the full sequence so the second one can consume it;
    # the second returns only its final output.
    sequence_vector = keras.layers.LSTM(128, return_sequences=True)(sequence_vector)
    sequence_vector = keras.layers.LSTM(64, return_sequences=False)(sequence_vector)
    sequence_vector = keras.layers.Dense(32, activation="relu")(sequence_vector)

    # Dense branch, categorical part: lookup index -> embedding -> flat vector.
    for column in categorical_columns:
        dense_input = keras.Input(shape=(1, ), name=f"{column}_dense_input")
        lookup = lookupLayersMap[column]
        vocab_size = len(lookup.get_vocabulary())
        # Embedding width grows with the square root of the vocabulary size.
        embed_dimension = math.ceil(np.sqrt(vocab_size))
        dense_vector = lookup(dense_input)
        dense_vector = keras.layers.Embedding(vocab_size, embed_dimension, input_length=1)(dense_vector)
        # Flatten everything after the batch axis into one vector.
        dense_vector = keras.layers.Reshape((-1,))(dense_vector)
        dense_vectors.append(dense_vector)
        dense_inputs.append(dense_input)
        
    # Dense branch, numeric part: scalar inputs are used without transformation.
    for column in numeric_values:
        dense_input = keras.Input(shape=(1, ), name=f"{column}_dense_input")
        dense_vectors.append(dense_input)
        dense_inputs.append(dense_input)
        
    dense_vector = keras.layers.Concatenate(axis=-1)(dense_vectors)
    dense_vector = keras.layers.Dense(32, activation="relu")(dense_vector)

    # Merge both branches and regress a single value.
    vector = keras.layers.Concatenate(axis=-1)([sequence_vector, dense_vector])
    vector = keras.layers.Dense(32, activation="relu")(vector)
    output = keras.layers.Dense(1)(vector)
    # Input order: sequence inputs, then categorical, then numeric; presumably
    # this has to line up with the feature tuple produced by preprocess.
    model = keras.Model(inputs=sequence_inputs + dense_inputs, outputs=output)
    model.compile(loss="mse", optimizer="adam", metrics=["mae", "mape"])
    return model

## Create Model

In [None]:
# Build the compiled model, print its layer summary and render the layer
# graph including tensor shapes.
model = get_model()
model.summary()
keras.utils.plot_model(model, show_shapes=True)

## Model Training

In [None]:
# Save weights only, and only when the validation MAE improves.
cp = keras.callbacks.ModelCheckpoint("model.tf", monitor="val_mae", save_best_only=True, save_weights_only=True)
# Stop after 10 epochs without improvement. "val_loss" is the Keras default
# monitor, spelled out here because it differs from the checkpoint's "val_mae".
es = keras.callbacks.EarlyStopping(monitor="val_loss", patience=10)
if is_training:
    model.fit(train_dataset, epochs=50, validation_data=valid_dataset, callbacks=[es, cp])
    # Continue with the best checkpoint rather than the last epoch's weights.
    model.load_weights("model.tf")
else:
    # Reuse weights produced by an earlier training run.
    model.load_weights("../input/tps2203-lstm-output-v2/model.tf")

## Model Evaluation

In [None]:
#print(model.evaluate(valid_dataset))

## Submission

In [None]:
def make_test_dataset(df, congestions, ma5s, ma10s, ma30s, sequence_length=32):
    """Build a one-window, batch-of-one dataset for a single prediction step.

    Args:
        df: the last sequence_length + 1 rows for one key; the final row is
            the step to predict.
        congestions: congestion history for that key (observed values followed
            by earlier predictions); the last sequence_length values are used.
        ma5s, ma10s, ma30s: moving-average histories; the last
            sequence_length + 1 values of each are used.
        sequence_length: number of history rows in the window.

    Returns:
        A dataset built by make_dataset (so training and inference share one
        windowing pipeline), batched with batch size 1.
    """
    window_size = sequence_length + 1
    data = df.copy()
    # The row being predicted has no congestion yet; 0 is a placeholder that
    # only ends up in the label slot produced by preprocess.
    data["congestion"] = congestions[-sequence_length:] + [0]
    data["ma5"] = ma5s[-window_size:]
    data["ma10"] = ma10s[-window_size:]
    data["ma30"] = ma30s[-window_size:]
    return make_dataset(data, sequence_length=sequence_length).batch(1)

In [None]:
# Seed the per-key inference state with the last sequence_length + 1 rows of
# each key's validation frame: the rows themselves plus plain-list histories
# of congestion and of every moving average.
df_map = {}
congestions_map = {}
ma5_map = {}
ma10_map = {}
ma30_map = {}
for key, validation_data in zip(keys, validation_datas):
    recent_rows = validation_data.iloc[-(sequence_length + 1):]
    df_map[key] = recent_rows
    for column, history_map in (
        ("ma5", ma5_map),
        ("ma10", ma10_map),
        ("ma30", ma30_map),
        ("congestion", congestions_map),
    ):
        history_map[key] = recent_rows[column].tolist()

In [None]:
%%time

# Recursive one-step-ahead inference: each test row is predicted from the
# per-key history, and the prediction is appended to that history before the
# next row of the same key is processed.
begin = time.time()
test = pd.read_csv("../input/tabular-playground-series-mar-2022/test.csv")
test = feature_engineering(test)
submission = pd.read_csv("../input/tabular-playground-series-mar-2022/sample_submission.csv")
window_size = sequence_length + 1
preditions = []
for i, (_, item) in enumerate(test.iterrows()):
    key = item["key"]
    # DataFrame.append was removed in pandas 2.0; build the one-row frame the
    # same way append did (transpose + infer_objects) and concatenate it.
    df = pd.concat([df_map[key], item.to_frame().T.infer_objects()])
    # Only the last window is ever read, so trimming here keeps the per-key
    # frame at a fixed size instead of growing by one row per prediction.
    df = df.iloc[-window_size:]
    congestions = congestions_map[key]
    ma5s = ma5_map[key]
    ma10s = ma10_map[key]
    ma30s = ma30_map[key]
    ds = make_test_dataset(
        df,
        congestions,
        ma5s,
        ma10s,
        ma30s,
        sequence_length=sequence_length)
    congestion = model.predict(ds)[0][0]
    preditions.append(congestion)
    # The history lists are the objects stored in the *_map dicts, so the
    # in-place appends below update the per-key state directly.
    congestions.append(congestion)
    # NOTE(review): this incremental update approximates the moving averages;
    # training used exact rolling means (calculate_moving_average). Confirm
    # whether mean(congestions[-gap:]) was intended here.
    ma5s.append(ma5s[-1] + (congestion - ma5s[-1]) / 5.0)
    ma10s.append(ma10s[-1] + (congestion - ma10s[-1]) / 10.0)
    ma30s.append(ma30s[-1] + (congestion - ma30s[-1]) / 30.0)
    df_map[key] = df
    if (i + 1) % 100 == 0:
        elapsed_time = time.time() - begin
        estimated_time = elapsed_time / (i + 1) * len(test)
        eta = estimated_time - elapsed_time
        print(f"ETA: {eta:.2f}s")
submission["congestion"] = np.round(preditions)
submission.to_csv("submission.csv", index=False)