<a href="https://colab.research.google.com/github/racamirko/ml_sketches/blob/main/01_multivar_timeseries_beijing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Multivariate time series forecasting

Based on the example from [Multivariate Time Series Forecasting with LSTMs in Keras](https://machinelearningmastery.com/multivariate-time-series-forecasting-lstms-keras/) (Machine Learning Mastery).

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow import keras

In [11]:
# Dataset information
url_beijing_pm25 = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/pollution.csv'
# Build the frame in a single chain instead of mutating an `iloc` slice in
# place: assigning a column into (or dropping from) a slice is what triggers
# pandas' SettingWithCopyWarning.
# NOTE(review): presumably the first 24 rows are skipped because they lack
# pm2.5 readings -- confirm. Any missing values in later rows are left as-is.
df_raw = (
  pd.read_csv(url_beijing_pm25)
  .iloc[24:, :]
  # Appending `pm25` (rather than renaming `pm2.5`) keeps the target as the
  # last column, preserving the original column order.
  .assign(pm25=lambda frame: frame['pm2.5'])
  .drop(columns=['No', 'pm2.5'])
)
df_raw.head(20)

Unnamed: 0,year,month,day,hour,DEWP,TEMP,PRES,cbwd,Iws,Is,Ir,pm25
24,2010,1,2,0,-16,-4.0,1020.0,SE,1.79,0,0,129.0
25,2010,1,2,1,-15,-4.0,1020.0,SE,2.68,0,0,148.0
26,2010,1,2,2,-11,-5.0,1021.0,SE,3.57,0,0,159.0
27,2010,1,2,3,-7,-5.0,1022.0,SE,5.36,1,0,181.0
28,2010,1,2,4,-7,-5.0,1022.0,SE,6.25,2,0,138.0
29,2010,1,2,5,-7,-6.0,1022.0,SE,7.14,3,0,109.0
30,2010,1,2,6,-7,-6.0,1023.0,SE,8.93,4,0,105.0
31,2010,1,2,7,-7,-5.0,1024.0,SE,10.72,0,0,124.0
32,2010,1,2,8,-8,-6.0,1024.0,SE,12.51,0,0,120.0
33,2010,1,2,9,-7,-5.0,1025.0,SE,14.3,0,0,132.0


## Normalize the data

Label-encode the wind direction (`cbwd`) and min-max scale every column to the range [0, 1].

In [8]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

In [19]:
# Integer-code the categorical wind direction, then min-max scale every column
# into [0, 1]. The fitted transformers stay bound to `wind_encoder` and
# `float_scaler`; the scaled frame gets a fresh 0-based index.
# NOTE(review): the scaler is fit on the whole dataset, i.e. before any
# train/test split, so held-out statistics influence the scaling.
wind_encoder = LabelEncoder()
float_scaler = MinMaxScaler(feature_range=(0, 1))
df_scaled = pd.DataFrame(
  float_scaler.fit_transform(
    df_raw.assign(cbwd=wind_encoder.fit_transform(df_raw['cbwd']))
  ),
  columns=df_raw.columns,
)
df_scaled

Unnamed: 0,year,month,day,hour,DEWP,TEMP,PRES,cbwd,Iws,Is,Ir,pm25
0,0.0,0.0,0.033333,0.000000,0.352941,0.245902,0.527273,0.666667,0.002290,0.000000,0.0,0.129779
1,0.0,0.0,0.033333,0.043478,0.367647,0.245902,0.527273,0.666667,0.003811,0.000000,0.0,0.148893
2,0.0,0.0,0.033333,0.086957,0.426471,0.229508,0.545455,0.666667,0.005332,0.000000,0.0,0.159960
3,0.0,0.0,0.033333,0.130435,0.485294,0.229508,0.563636,0.666667,0.008391,0.037037,0.0,0.182093
4,0.0,0.0,0.033333,0.173913,0.485294,0.229508,0.563636,0.666667,0.009912,0.074074,0.0,0.138833
...,...,...,...,...,...,...,...,...,...,...,...,...
43795,1.0,1.0,1.000000,0.826087,0.250000,0.278689,0.781818,0.333333,0.395659,0.000000,0.0,0.008048
43796,1.0,1.0,1.000000,0.869565,0.264706,0.262295,0.781818,0.333333,0.405588,0.000000,0.0,0.010060
43797,1.0,1.0,1.000000,0.913043,0.264706,0.262295,0.781818,0.333333,0.413996,0.000000,0.0,0.010060
43798,1.0,1.0,1.000000,0.956522,0.264706,0.245902,0.781818,0.333333,0.420866,0.000000,0.0,0.008048


## Create batches

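Let's build 5-step sliding windows: each sample holds 5 consecutive time steps of all feature columns, and its label is the `pm25` value at the step that follows. The random selection happens later, when the windows are split into train and test sets.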

In [30]:
# Peek at the first seven scaled rows as a plain NumPy array.
df_scaled.head(7).to_numpy()

array([[0.        , 0.        , 0.03333333, 0.        , 0.35294118,
        0.24590164, 0.52727273, 0.66666667, 0.00229001, 0.        ,
        0.        , 0.12977867],
       [0.        , 0.        , 0.03333333, 0.04347826, 0.36764706,
        0.24590164, 0.52727273, 0.66666667, 0.00381099, 0.        ,
        0.        , 0.14889336],
       [0.        , 0.        , 0.03333333, 0.08695652, 0.42647059,
        0.2295082 , 0.54545455, 0.66666667, 0.00533197, 0.        ,
        0.        , 0.15995976],
       [0.        , 0.        , 0.03333333, 0.13043478, 0.48529412,
        0.2295082 , 0.56363636, 0.66666667, 0.00839101, 0.03703704,
        0.        , 0.18209256],
       [0.        , 0.        , 0.03333333, 0.17391304, 0.48529412,
        0.2295082 , 0.56363636, 0.66666667, 0.00991199, 0.07407407,
        0.        , 0.138833  ],
       [0.        , 0.        , 0.03333333, 0.2173913 , 0.48529412,
        0.21311475, 0.56363636, 0.66666667, 0.01143297, 0.11111111,
        0.        ,

In [25]:
def create_batches(feats: pd.DataFrame, y_column_idx: int, batch_size: int = 5):
  """Slice a time-ordered frame into overlapping windows with next-step targets.

  Window ``k`` holds rows ``k .. k + batch_size - 1`` of every column except the
  target column; its label is the target column's value at row
  ``k + batch_size`` (the row right after the window).

  Args:
    feats: Frame whose rows are consecutive time steps.
    y_column_idx: Positional index of the target column. Negative values count
      from the last column.
    batch_size: Number of consecutive rows per window (must be >= 1).

  Returns:
    A ``(batches, y_metric)`` tuple of arrays with shapes
    ``(n_windows, batch_size, n_columns - 1)`` and ``(n_windows,)``, where
    ``n_windows = max(len(feats) - batch_size, 0)``. Both arrays are copies,
    so modifying them does not touch ``feats``.

  Raises:
    ValueError: If ``batch_size < 1`` or ``y_column_idx`` is out of range.
  """
  if batch_size < 1:
    raise ValueError(f"batch_size must be >= 1, got {batch_size}")
  n_rows, n_cols = feats.shape
  if not -n_cols <= y_column_idx < n_cols:
    raise ValueError(
        f"y_column_idx {y_column_idx} is out of range for {n_cols} columns")
  y_column_idx %= n_cols  # normalise negative positions
  # column selection
  feature_cols = [col for col in range(n_cols) if col != y_column_idx]
  features = feats.iloc[:, feature_cols].to_numpy()
  targets = feats.iloc[:, y_column_idx].to_numpy()
  # create batches: row_idx[k] lists the row positions belonging to window k,
  # so one fancy-indexing gather replaces a Python loop over every row.
  n_windows = max(n_rows - batch_size, 0)
  row_idx = np.arange(n_windows)[:, None] + np.arange(batch_size)[None, :]
  batches = features[row_idx]
  # The label of window k sits at row k + batch_size.
  y_metric = targets[batch_size:].copy()
  return batches, y_metric

In [26]:
# Locate the target column by name instead of a hard-coded position, so the
# call keeps working if the column order ever changes.
batches, labels = create_batches(df_scaled, df_scaled.columns.get_loc('pm25'), 5)

In [34]:
# (number of windows, time steps per window, feature columns)
batches.shape

(43795, 5, 11)

In [35]:
# one target value per window
labels.shape

(43795,)

## Split into train/validation

In [37]:
import random

In [47]:
def split_data(batches, labels, pct_test=.1, seed=None):
  """Randomly partition samples into a train set and a test set.

  Every sample is sent to the test set independently with probability
  ``pct_test``, so the test share is only approximately ``pct_test``.

  NOTE(review): because samples are assigned independently at random, inputs
  that are overlapping time windows can end up with neighbouring windows on
  opposite sides of the split.

  Args:
    batches: Array of samples, indexed along its first axis.
    labels: Array with one entry per sample, aligned with ``batches``.
    pct_test: Probability of assigning a sample to the test set.
    seed: Optional seed for a private random generator. When given, the split
      is reproducible and the global ``random`` state is left untouched. When
      ``None`` (default), the global ``random`` module state is used.

  Returns:
    ``(train_feats, train_labels, test_feats, test_labels)``.

  Raises:
    AssertionError: If ``batches`` and ``labels`` differ in sample count.
  """
  assert batches.shape[0] == labels.shape[0], (
      "batches and labels must contain the same number of samples")
  num_samples = batches.shape[0]
  draw = random.random if seed is None else random.Random(seed).random
  # dtype=float keeps the comparison well-defined even for zero samples
  selector_test = np.array([draw() for _ in range(num_samples)], dtype=float) < pct_test
  selector_train = ~selector_test
  train_feats = batches[selector_train]
  train_labels = labels[selector_train]
  test_feats = batches[selector_test]
  test_labels = labels[selector_test]
  return train_feats, train_labels, test_feats, test_labels

In [48]:
# Hold out roughly 10% of the windows for testing.
train_feats, train_labels, test_feats, test_labels = split_data(
  batches, labels, pct_test=.1
)

In [49]:
# windows that landed in the training set (about 90% with pct_test=.1)
train_feats.shape

(39309, 5, 11)

In [50]:
# should match the first dimension of train_feats
train_labels.shape

(39309,)

In [52]:
# the remaining windows, held out for testing
test_feats.shape

(4486, 5, 11)

## Train the network