In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [7]:
DATA_DIR = '../../data/data_clean.csv'

In [16]:
df = pd.read_csv(DATA_DIR)

In [17]:
df.columns

Index(['Source.Name', 'date', 'hour', 'type', '北京', '天津', '石家庄', '唐山', '秦皇岛',
       '邯郸', '保定', '张家口', '承德', '廊坊', '沧州', '衡水', '邢台', 'tag'],
      dtype='object')

In [18]:
df = df.drop(['Source.Name','date','hour','tag'],axis=1)

In [19]:
df

Unnamed: 0,type,北京,天津,石家庄,唐山,秦皇岛,邯郸,保定,张家口,承德,廊坊,沧州,衡水,邢台
0,O3_24h,50.0,52.0,57.0,61.0,60.0,45.0,67.0,64.0,76.0,44.0,60.0,66.0,46.0
1,NO2_24h,63.0,57.0,76.0,63.0,47.0,67.0,62.0,26.0,42.0,33.0,58.0,68.0,61.0
2,SO2_24h,32.0,65.0,121.0,52.0,53.0,176.0,157.0,76.0,37.0,37.0,99.0,70.0,152.0
3,PM2.5_24h,51.0,65.0,141.0,61.0,42.0,140.0,181.0,33.0,35.0,57.0,71.0,108.0,142.0
4,O3_24h,50.0,52.0,57.0,61.0,60.0,45.0,67.0,64.0,76.0,44.0,60.0,66.0,46.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247004,PM2.5_24h,14.0,31.0,30.0,30.0,18.0,38.0,26.0,13.0,17.0,20.0,30.0,29.0,33.0
247005,O3_24h,92.0,123.0,114.0,113.0,112.0,130.0,121.0,104.0,105.0,133.0,117.0,120.0,123.0
247006,NO2_24h,23.0,31.0,37.0,21.0,21.0,19.0,42.0,11.0,25.0,20.0,21.0,25.0,23.0
247007,SO2_24h,2.0,9.0,6.0,7.0,6.0,8.0,6.0,3.0,11.0,4.0,11.0,10.0,6.0


In [22]:
O3 = df[df['type']=='O3_24h'].drop('type',axis=1)

In [26]:
O3['label'] = O3['北京'].shift(-15)

In [28]:
O3 = O3.dropna()

In [31]:
o3_np = np.array(O3)

In [50]:
one_array = O3['北京'].values

In [51]:
model = keras.Sequential()
# Add an Embedding layer expecting input vocab of size 1000, and
# output embedding dimension of size 64.
model.add(layers.Embedding(input_dim=1000, output_dim=64))

# Add a LSTM layer with 128 internal units.
model.add(layers.LSTM(128))

# Add a Dense layer with 10 units.
model.add(layers.Dense(10))

model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 64)          64000     
                                                                 
 lstm_2 (LSTM)               (None, 128)               98816     
                                                                 
 dense_2 (Dense)             (None, 10)                1290      
                                                                 
Total params: 164,106
Trainable params: 164,106
Non-trainable params: 0
_________________________________________________________________


In [52]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [53]:
from tensorflow.keras.preprocessing import timeseries_dataset_from_array

batch_size = 64
input_sequence_length = 15
forecast_horizon = 3
multi_horizon = False


def create_tf_dataset(
    data_array: np.ndarray,
    input_sequence_length: int,
    forecast_horizon: int,
    batch_size: int = 128,
    shuffle=True,
    multi_horizon=True,
):
    """Creates tensorflow dataset from numpy array.
    This function creates a dataset where each element is a tuple `(inputs, targets)`.
    `inputs` is a Tensor
    of shape `(batch_size, input_sequence_length, num_routes, 1)` containing
    the `input_sequence_length` past values of the timeseries for each node.
    `targets` is a Tensor of shape `(batch_size, forecast_horizon, num_routes)`
    containing the `forecast_horizon`
    future values of the timeseries for each node.
    Args:
        data_array: np.ndarray with shape `(num_time_steps, num_routes)`
        input_sequence_length: Length of the input sequence (in number of timesteps).
        forecast_horizon: If `multi_horizon=True`, the target will be the values of the timeseries for 1 to
            `forecast_horizon` timesteps ahead. If `multi_horizon=False`, the target will be the value of the
            timeseries `forecast_horizon` steps ahead (only one value).
        batch_size: Number of timeseries samples in each batch.
        shuffle: Whether to shuffle output samples, or instead draw them in chronological order.
        multi_horizon: See `forecast_horizon`.
    Returns:
        A tf.data.Dataset instance.
    """

    inputs = timeseries_dataset_from_array(
        np.expand_dims(data_array[:-forecast_horizon], axis=-1),
        None,
        sequence_length=input_sequence_length,
        shuffle=False,
        batch_size=batch_size,
    )

    target_offset = (
        input_sequence_length
        if multi_horizon
        else input_sequence_length + forecast_horizon - 1
    )
    target_seq_length = forecast_horizon if multi_horizon else 1
    targets = timeseries_dataset_from_array(
        data_array[target_offset:],
        None,
        sequence_length=target_seq_length,
        shuffle=False,
        batch_size=batch_size,
    )

    dataset = tf.data.Dataset.zip((inputs, targets))
    if shuffle:
        dataset = dataset.shuffle(100)

    return dataset.prefetch(16).cache()

In [54]:
dataset_tf = create_tf_dataset(one_array, input_sequence_length, forecast_horizon, batch_size)

In [55]:
dataset_tf

<CacheDataset element_spec=(TensorSpec(shape=(None, None, 1), dtype=tf.float64, name=None), TensorSpec(shape=(None, None), dtype=tf.float64, name=None))>