In [1]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow import keras
import pandas as pd
import sklearn

In [2]:
# read in data
df_NO2 = pd.read_csv('../air_weather_data/NO2_all_stations_cleaned.csv')
df_NO2['date'] = pd.to_datetime(df_NO2['date'], format = '%Y-%m-%d %H:%M') # change time column to pandas datetime
df_NO2 = df_NO2.set_index('date') # date column as index

df_weather = pd.read_csv('../air_weather_data/metdata.csv')
df_weather['date'] = pd.to_datetime(df_weather['date'], format = '%Y-%m-%d %H:%M') # change time column to pandas datetime
df_weather = df_weather.set_index('date') # date column as index

In [3]:
# replace zero and negative with NaNs and interpolate data
df_NO2[df_NO2 <= 0] = np.NaN
df_NO2.interpolate(inplace=True)

In [4]:
# split into train, val, and test sets
df_NO2_train = df_NO2['2016-01-01 00:00':'2019-01-01 00:00']
df_NO2_val = df_NO2['2019-01-01 01:00:00':'2020-01-01 00:00:00']
df_NO2_test = df_NO2['2021-01-25 16:00:00':]

df_weather_train = df_weather['2016-01-01 00:00':'2019-01-01 00:00']
df_weather_val = df_weather['2019-01-01 01:00:00':'2020-01-01 00:00:00']
df_weather_test = df_weather['2021-01-25 16:00:00':]

In [5]:
# index needed later
index_train = df_NO2_train.index 
index_val = df_NO2_val.index
index_test = df_NO2_test.index

In [6]:
df_train = df_NO2_train.merge(df_weather_train, on='date')
df_val = df_NO2_val.merge(df_weather_val, on='date')
df_test = df_NO2_test.merge(df_weather_test, on='date')

In [7]:
# input variables used in robust regression model
stations = ['NO$_2$, Stockholm Torkel Knutssonsgatan',
            'NO$_2$, Stockholm Hornsgatan 108 ',
            'NO$_2$, Stockholm Sveavägen 59 ',
            'NO$_2$, Stockholm E4/E20 Lilla Essingen',
            'Temperature', 
            'Relative humidity', 
            'Precipitation', 
            'Solar radiation',
            'Wind speed',
            'Sine day', 
            'Cosine day']

In [8]:
# create y and X matrix for train, val, and test sets
y_train = df_train['NO$_2$, Stockholm Torkel Knutssonsgatan']
X_train = df_train[stations]

y_val = df_val['NO$_2$, Stockholm Torkel Knutssonsgatan']
X_val = df_val[stations]

y_test = df_test['NO$_2$, Stockholm Torkel Knutssonsgatan']
X_test = df_test[stations]

In [9]:
from sklearn.preprocessing import MinMaxScaler
# two scalers, one for X and on for y
scaler1 = MinMaxScaler() 
scaler2 = MinMaxScaler()

# normalize train set
X_train = scaler1.fit_transform(X_train)
X_train = pd.DataFrame(X_train, columns=stations)
X_train.insert(0, 'date', index_train)
X_train.set_index('date', inplace=True)

y_train = scaler2.fit_transform(y_train.to_frame())
y_train = pd.DataFrame(y_train, columns = ['NO$_2$, Torkel Knutssonsgatan'])
y_train.insert(0, 'date', index_train)
y_train.set_index('date', inplace=True)

# normalize validation set
X_val = scaler1.transform(X_val) # fit is not used here to only use statistics from train data
X_val = pd.DataFrame(X_val, columns=stations)
X_val.insert(0, 'date', index_val)
X_val.set_index('date', inplace=True)

y_val = scaler2.transform(y_val.to_frame()) # fit is not used here to only use statistics from train data
y_val = pd.DataFrame(y_val, columns = ['NO$_2$, Torkel Knutssonsgatan'])
y_val.insert(0, 'date', index_val)
y_val.set_index('date', inplace=True)

# normalize test set (except y which is kept as it is since we want to predict on original scale)
X_test = scaler1.transform(X_test) # not fit here to use statistics from train data
X_test = pd.DataFrame(X_test, columns=stations)
X_test.insert(0, 'date', index_test)
X_test.set_index('date', inplace=True)

In [105]:
data = X_train.iloc[:,0].values
np.shape(data[:-12]), np.shape(data[12:])

((26293,), (26293,))

In [111]:
input_data = data[:-12], 
targets = data[12:],
dataset = tf.keras.preprocessing.timeseries_dataset_from_array(
    input_data,
    targets,
    sequence_length=1,
    sampling_rate=1)
for batch in dataset:
  inputs, targets = batch

ValueError: `sampling_rate` must be lower than the length of the data. Received: sampling_rate=1, for data of length 1

In [52]:
dataset_test = tf.keras.preprocessing.timeseries_dataset_from_array(
    X_test.values[:-12], 
    y_test.values[12:],
    sequence_length=12)

dataset_val = tf.keras.preprocessing.timeseries_dataset_from_array(
    X_val.values[:-12], 
    y_val.values[12:],
    sequence_length=12)

In [53]:

for samples, targets in dataset:
    print("samples shape:", samples.shape)
    print("targets shape:", targets.shape)
    break

samples shape: (128, 12, 11)
targets shape: (128, 1)


In [38]:
dense = tf.keras.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(units=32, activation='relu'),
    tf.keras.layers.Dense(units=1),
    tf.keras.layers.Reshape([1, -1]),

])

early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                  patience=5,
                                                  mode='min')

dense.compile(loss = tf.losses.MeanSquaredError(),
              optimizer= tf.optimizers.Adam(learning_rate=0.0001),
              metrics=[tf.metrics.MeanSquaredError()])

history_dense = dense.fit(dataset, 
                          epochs=30, 
                          validation_data=dataset_val,
                          callbacks = [early_stopping])

Epoch 1/30


ValueError: in user code:

    File "/Users/simoncarlen/opt/miniconda3/envs/DL/lib/python3.10/site-packages/keras/engine/training.py", line 878, in train_function  *
        return step_function(self, iterator)
    File "/Users/simoncarlen/opt/miniconda3/envs/DL/lib/python3.10/site-packages/keras/engine/training.py", line 867, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/simoncarlen/opt/miniconda3/envs/DL/lib/python3.10/site-packages/keras/engine/training.py", line 860, in run_step  **
        outputs = model.train_step(data)
    File "/Users/simoncarlen/opt/miniconda3/envs/DL/lib/python3.10/site-packages/keras/engine/training.py", line 808, in train_step
        y_pred = self(x, training=True)
    File "/Users/simoncarlen/opt/miniconda3/envs/DL/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/Users/simoncarlen/opt/miniconda3/envs/DL/lib/python3.10/site-packages/keras/layers/core/dense.py", line 139, in build
        raise ValueError('The last dimension of the inputs to a Dense layer '

    ValueError: Exception encountered when calling layer "sequential_1" (type Sequential).
    
    The last dimension of the inputs to a Dense layer should be defined. Found None. Full input shape received: (None, None)
    
    Call arguments received:
      • inputs=tf.Tensor(shape=(None, None, 11), dtype=float64)
      • training=True
      • mask=None


In [None]:
dense.evaluate(dataset_test)