In [1]:
!wget https://raw.githubusercontent.com/jbrownlee/Datasets/master/pollution.csv

--2022-09-28 12:20:33--  https://raw.githubusercontent.com/jbrownlee/Datasets/master/pollution.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2010492 (1.9M) [text/plain]
Saving to: ‘pollution.csv’


2022-09-28 12:20:33 (205 MB/s) - ‘pollution.csv’ saved [2010492/2010492]



In [2]:
!pip install icecream

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sktime
  Downloading sktime-0.13.4-py3-none-any.whl (7.0 MB)
[K     |████████████████████████████████| 7.0 MB 35.1 MB/s 
Collecting deprecated>=1.2.13
  Downloading Deprecated-1.2.13-py2.py3-none-any.whl (9.6 kB)
Installing collected packages: deprecated, sktime
Successfully installed deprecated-1.2.13 sktime-0.13.4
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting icecream
  Downloading icecream-2.1.3-py2.py3-none-any.whl (8.4 kB)
Collecting executing>=0.3.1
  Downloading executing-1.1.0-py2.py3-none-any.whl (22 kB)
Collecting colorama>=0.3.9
  Downloading colorama-0.4.5-py2.py3-none-any.whl (16 kB)
Collecting asttokens>=2.0.1
  Downloading asttokens-2.0.8-py2.py3-none-any.whl (23 kB)
Installing collected packages: executing, colorama, asttokens, icecream
Successfully installed asttokens-2.0.8 colorama-0.4.5 executin

In [46]:
from datetime import datetime

import numpy as np
import pandas as pd
import tensorflow as tf
from icecream import ic

In [47]:
def parse(x):
    return datetime.strptime(x, "%Y %m %d %H")

In [48]:
dataset = pd.read_csv(
    "pollution.csv",
    parse_dates=[["year", "month", "day", "hour"]],
    index_col=0,
    date_parser=parse,
)

In [49]:
dataset.drop("No", axis=1, inplace=True)

In [50]:
dataset.columns = [
    "pollution",
    "dew",
    "temp",
    "press",
    "wnd_dir",
    "wnd_spd",
    "snow",
    "rain",
]
dataset.index.name = "date"
# mark all NA values with 0
dataset["pollution"].fillna(0, inplace=True)
# drop the first 24 hours
dataset = dataset[24:]

In [51]:
dataset.head()

Unnamed: 0_level_0,pollution,dew,temp,press,wnd_dir,wnd_spd,snow,rain
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-01-02 00:00:00,129.0,-16,-4.0,1020.0,SE,1.79,0,0
2010-01-02 01:00:00,148.0,-15,-4.0,1020.0,SE,2.68,0,0
2010-01-02 02:00:00,159.0,-11,-5.0,1021.0,SE,3.57,0,0
2010-01-02 03:00:00,181.0,-7,-5.0,1022.0,SE,5.36,1,0
2010-01-02 04:00:00,138.0,-7,-5.0,1022.0,SE,6.25,2,0


In [52]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [53]:
encoder = LabelEncoder()
values = dataset.values

In [54]:
values[:, 4] = encoder.fit_transform(values[:, 4])

In [55]:
scaler = MinMaxScaler(feature_range=(0, 1))

In [56]:
scaled = scaler.fit_transform(values)

In [57]:
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    n_vars = 1 if type(data) is list else data.shape[1]
    df = pd.DataFrame(data)
    cols, names = list(), list()

    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [("var%d(t-%d)" % (j + 1, i)) for j in range(n_vars)]

    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [("var%d(t)" % (j + 1)) for j in range(n_vars)]
        else:
            names += [("var%d(t+%d)" % (j + 1, i)) for j in range(n_vars)]

    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg

In [58]:
reframed = series_to_supervised(scaled, 1, 1)

# dropping cols we dont want to predict
reframed.drop(reframed.columns[[9, 10, 11, 12, 13, 14, 15]], axis=1, inplace=True)

In [59]:
reframed.head()

Unnamed: 0,var1(t-1),var2(t-1),var3(t-1),var4(t-1),var5(t-1),var6(t-1),var7(t-1),var8(t-1),var1(t)
1,0.129779,0.352941,0.245902,0.527273,0.666667,0.00229,0.0,0.0,0.148893
2,0.148893,0.367647,0.245902,0.527273,0.666667,0.003811,0.0,0.0,0.15996
3,0.15996,0.426471,0.229508,0.545455,0.666667,0.005332,0.0,0.0,0.182093
4,0.182093,0.485294,0.229508,0.563636,0.666667,0.008391,0.037037,0.0,0.138833
5,0.138833,0.485294,0.229508,0.563636,0.666667,0.009912,0.074074,0.0,0.109658


In [64]:
n_train_hours = 365 * 24
train = reframed[:n_train_hours]
test = reframed[n_train_hours:]

In [65]:
train.head()

Unnamed: 0,var1(t-1),var2(t-1),var3(t-1),var4(t-1),var5(t-1),var6(t-1),var7(t-1),var8(t-1),var1(t)
1,0.129779,0.352941,0.245902,0.527273,0.666667,0.00229,0.0,0.0,0.148893
2,0.148893,0.367647,0.245902,0.527273,0.666667,0.003811,0.0,0.0,0.15996
3,0.15996,0.426471,0.229508,0.545455,0.666667,0.005332,0.0,0.0,0.182093
4,0.182093,0.485294,0.229508,0.563636,0.666667,0.008391,0.037037,0.0,0.138833
5,0.138833,0.485294,0.229508,0.563636,0.666667,0.009912,0.074074,0.0,0.109658


In [101]:
class WindowGenerator:
    def __init__(
        self,
        input_width,
        label_width,
        shift,
        train_df=train,
        test_df=test,
        label_columns=None,
    ):
        # Store the raw data.
        self.train_df = train_df
        self.test_df = test_df

        # Work out the label column indices.
        self.label_columns = label_columns
        if label_columns is not None:
            self.label_columns_indices = {
                name: i for i, name in enumerate(label_columns)
            }
        self.column_indices = {name: i for i, name in enumerate(train_df.columns)}
        ic(self.column_indices)

        # Work out the window parameters.
        self.input_width = input_width
        self.label_width = label_width
        self.shift = shift

        self.total_window_size = input_width + shift

        self.input_slice = slice(0, input_width)
        self.input_indices = np.arange(self.total_window_size)[self.input_slice]

        self.label_start = self.total_window_size - self.label_width
        self.labels_slice = slice(self.label_start, None)
        self.label_indices = np.arange(self.total_window_size)[self.labels_slice]

    def split_window(self, features):
        inputs = features[:, self.input_slice, :-1]
        labels = features[:, self.labels_slice, :]
        if self.label_columns is not None:
            labels = tf.stack(
                [
                    labels[:, :, self.column_indices[name]]
                    for name in self.label_columns
                ],
                axis=-1,
            )

        # Slicing doesn't preserve static shape information, so set the shapes
        # manually. This way the `tf.data.Datasets` are easier to inspect.
        inputs.set_shape([None, self.input_width, None])
        labels.set_shape([None, self.label_width, None])

        return inputs, labels

    def make_dataset(self, data):
        data = np.array(data, dtype=np.float32)
        ds = tf.keras.utils.timeseries_dataset_from_array(
            data=data,
            targets=None,
            sequence_length=self.total_window_size,
            sequence_stride=1,
            shuffle=True,
            batch_size=32,
        )

        ds = ds.map(self.split_window)

        return ds

    @property
    def train(self):
        return self.make_dataset(self.train_df)

    @property
    def val(self):
        return self.make_dataset(self.val_df)

    @property
    def test(self):
        return self.make_dataset(self.test_df)

    @property
    def example(self):
        """Get and cache an example batch of `inputs, labels` for plotting."""
        result = getattr(self, "_example", None)
        if result is None:
            # No example batch was found, so get one from the `.train` dataset
            result = next(iter(self.train))
        # And cache it for next time
        self._example = result
        return result

    def __repr__(self):
        return "\n".join(
            [
                f"Total window size: {self.total_window_size}",
                f"Input indices: {self.input_indices}",
                f"Label indices: {self.label_indices}",
                f"Label column name(s): {self.label_columns}",
            ]
        )

In [102]:
w1 = WindowGenerator(input_width=24, label_width=1, shift=1, label_columns=["var1(t)"])

ic| self.column_indices: {'var1(t)': 8,
                          'var1(t-1)': 0,
                          'var2(t-1)': 1,
                          'var3(t-1)': 2,
                          'var4(t-1)': 3,
                          'var5(t-1)': 4,
                          'var6(t-1)': 5,
                          'var7(t-1)': 6,
                          'var8(t-1)': 7}


In [103]:
w1

Total window size: 25
Input indices: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23]
Label indices: [24]
Label column name(s): ['var1(t)']

In [104]:
# Stack three slices, the length of the total window.
example_window = tf.stack(
    [
        np.array(train[: w1.total_window_size]),
        np.array(train[100 : 100 + w1.total_window_size]),
        np.array(train[200 : 200 + w1.total_window_size]),
    ]
)

example_inputs, example_labels = w1.split_window(example_window)

print("All shapes are: (batch, time, features)")
print(f"Window shape: {example_window.shape}")
print(f"Inputs shape: {example_inputs.shape}")
print(f"Labels shape: {example_labels.shape}")

All shapes are: (batch, time, features)
Window shape: (3, 25, 9)
Inputs shape: (3, 24, 8)
Labels shape: (3, 1, 1)


In [105]:
w1.train.element_spec

(TensorSpec(shape=(None, 24, 8), dtype=tf.float32, name=None),
 TensorSpec(shape=(None, 1, 1), dtype=tf.float32, name=None))

In [106]:
import keras

In [107]:
dense = keras.Sequential(
    [
        keras.layers.Flatten(),
        keras.layers.Dense(units=32, activation="sigmoid"),
        keras.layers.Dense(units=32, activation="sigmoid"),
        keras.layers.Dense(units=1),
        keras.layers.Reshape([1, -1]),
    ]
)

In [109]:
dense.compile(
    loss=keras.losses.MeanSquaredError(),
    optimizer=tf.keras.optimizers.Adam(),
    metrics=[keras.metrics.MeanAbsoluteError()],
)

In [110]:
dense.fit(w1.train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fe9752eab50>

In [111]:
dense.evaluate(w1.test)



[0.001509038032963872, 0.023750029504299164]