# Load dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import math

In [4]:
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import mean_squared_error

In [5]:
from keras.layers import LSTM, Flatten

In [6]:
# Load dataset
df = pd.read_csv("datasets/ps6_trainvalid.csv")

In [7]:
df.head()

Unnamed: 0,datetime,temperature,humidity,pressure,weather,wind_direction,wind_speed
0,2012-10-01 12:00:00,,,,,,
1,2012-10-01 13:00:00,291.87,88.0,1013.0,mist,0.0,0.0
2,2012-10-01 14:00:00,291.868186,88.0,1013.0,sky is clear,0.0,0.0
3,2012-10-01 15:00:00,291.862844,88.0,1013.0,sky is clear,0.0,0.0
4,2012-10-01 16:00:00,291.857503,88.0,1013.0,sky is clear,0.0,0.0


### Preprocessing

In [8]:
# Dealing with missing data
missing_data = df[pd.isnull(df["temperature"])]
missing_data.shape

(3, 7)

In [9]:
# drop those data
df = df.dropna()

In [10]:
time = df['datetime']
temp_values = df['temperature'].values 

In [11]:
time.shape

(44671,)

In [12]:
temp_values.shape

(44671,)

In [13]:
temp_values

array([291.87      , 291.86818552, 291.86284446, ..., 296.51      ,
       297.09      , 296.69      ])

# The model which can forecast the 'future' 24 hours, 72 hours, .......

## Predicting future 24 hours

In [14]:
# split a univariate sequence into samples
def split_sequence(sequence, n_steps_in, n_steps_out):
    X, y = list(), list()
    # for i in range(len(sequence)):
    #     # find the end of this pattern
    #     end_ix = i + n_steps_in
    #     out_end_ix = end_ix + n_steps_out
    #     # check if we are beyond the sequence
    #     if out_end_ix > len(sequence):
    #         break
    #     # gather input and output parts of the pattern
    #     seq_x, seq_y = sequence[i:end_ix], sequence[end_ix:out_end_ix]
    #     X.append(seq_x)
    #     y.append(seq_y)
    i = 0
    while i + n_steps_in + n_steps_out <= len(sequence):
        end_ix = i + n_steps_in
        out_end_ix = i + n_steps_in + n_steps_out
        seq_x, seq_y = sequence[i:end_ix], sequence[end_ix:out_end_ix]
        X.append(seq_x)
        y.append(seq_y)
        i += n_steps_out
    return np.array(X), np.array(y)
 
# define input sequence
raw_seq = [10, 20, 30, 40, 50, 60, 70, 80, 90]
# choose a number of time steps
n_steps_in, n_steps_out = 3, 2
# split into samples
X, y = split_sequence(raw_seq, n_steps_in, n_steps_out)
# summarize the data
for i in range(len(X)):
	print(X[i], y[i])

[10 20 30] [40 50]
[30 40 50] [60 70]
[50 60 70] [80 90]


In [15]:
# choose a number of time steps
n_steps_in, n_steps_out = 24*5, 24

In [16]:
X, y = split_sequence(temp_values, n_steps_in, n_steps_out)

In [17]:
X.shape

(1856, 120)

In [18]:
# reshape from [samples, timesteps] into [samples, timesteps, features]
n_features = 1
X = X.reshape((X.shape[0], X.shape[1], n_features))

In [19]:
X.shape

(1856, 120, 1)

In [20]:
y.shape

(1856, 24)

In [21]:
y

array([[290.96, 292.28, 294.72, ..., 291.75, 294.55, 296.13],
       [297.4 , 298.21, 298.01, ..., 293.54, 294.48, 296.44],
       [296.83, 297.66, 297.68, ..., 291.25, 291.74, 293.93],
       ...,
       [291.58, 294.44, 296.58, ..., 290.15, 290.21, 290.41],
       [291.26, 292.65, 293.98, ..., 285.3 , 285.08, 287.77],
       [291.21, 293.38, 295.38, ..., 283.66, 284.14, 287.33]])

In [22]:
# Split training and validation set
n = len(X)
train_X = X[0:int(n*0.7),:]
val_X = X[int(n*0.7):,:]

train_y = y[0:int(n*0.7)]
val_y = y[int(n*0.7):]

print("Shape of training X: {}".format(train_X.shape))
print("Shape of validation X: {}".format(val_X.shape))

print("Shape of training y: {}".format(train_y.shape))
print("Shape of validation y: {}".format(val_y.shape))

Shape of training X: (1299, 120, 1)
Shape of validation X: (557, 120, 1)
Shape of training y: (1299, 24)
Shape of validation y: (557, 24)


In [23]:
# Normalization
train_X_mean = train_X.mean()
train_X_std = train_X.std()

train_y_mean = train_y.mean()
train_y_std = train_y.std()

In [24]:
normalized_train_X = (train_X - train_X_mean)/train_X_std
normalized_val_X = (val_X - train_X_mean)/train_X_std

normalized_train_y = (train_y - train_y_mean)/train_y_std
normalized_val_y = (val_y - train_y_mean)/train_y_std

In [25]:
# define model
RNN_model_2 = Sequential()
RNN_model_2.add(LSTM(50, activation='relu', return_sequences=True, input_shape=(n_steps_in, n_features)))
RNN_model_2.add(LSTM(50, activation='relu'))
RNN_model_2.add(Dense(n_steps_out))
RNN_model_2.compile(optimizer='adam', loss='mse')
print(RNN_model_2.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 120, 50)           10400     
_________________________________________________________________
lstm_1 (LSTM)                (None, 50)                20200     
_________________________________________________________________
dense (Dense)                (None, 24)                1224      
Total params: 31,824
Trainable params: 31,824
Non-trainable params: 0
_________________________________________________________________
None


In [26]:
# fit model
RNN_model_2.fit(normalized_train_X, normalized_train_y, epochs=100, validation_split=0.2, verbose=2)

Epoch 1/100
33/33 - 8s - loss: 24944970.0000 - val_loss: 0.8677
Epoch 2/100
33/33 - 4s - loss: 1225641426944.0000 - val_loss: 6826975744.0000
Epoch 3/100
33/33 - 4s - loss: 11088.0430 - val_loss: 0.8918
Epoch 4/100
33/33 - 4s - loss: 0.9230 - val_loss: 0.8891
Epoch 5/100
33/33 - 4s - loss: 0.9184 - val_loss: 0.8836
Epoch 6/100
33/33 - 4s - loss: 0.9130 - val_loss: 0.8783
Epoch 7/100
33/33 - 5s - loss: 0.9075 - val_loss: 0.8723
Epoch 8/100
33/33 - 5s - loss: 0.9017 - val_loss: 0.8664
Epoch 9/100
33/33 - 4s - loss: 0.8958 - val_loss: 0.8602
Epoch 10/100
33/33 - 4s - loss: 0.8890 - val_loss: 0.8533
Epoch 11/100
33/33 - 4s - loss: 681828032.0000 - val_loss: 0.8521
Epoch 12/100
33/33 - 4s - loss: 0.8932 - val_loss: 0.8523
Epoch 13/100
33/33 - 4s - loss: 0.8867 - val_loss: 0.8438
Epoch 14/100
33/33 - 4s - loss: 0.8782 - val_loss: 0.8357
Epoch 15/100
33/33 - 4s - loss: 0.8702 - val_loss: 0.8280
Epoch 16/100
33/33 - 4s - loss: 0.8626 - val_loss: 0.8207
Epoch 17/100
33/33 - 4s - loss: 0.8553 - 

<tensorflow.python.keras.callbacks.History at 0x7fb5b9374e50>

In [27]:
y_true = val_y
y_true.shape

(557, 24)

In [36]:
y_true.reshape(-1)

array([287.84, 288.38, 288.97, ..., 283.66, 284.14, 287.33])

In [29]:
normalized_y_pred = RNN_model_2.predict(normalized_val_X)
y_pred = normalized_y_pred * train_y.std() + train_y.mean()
y_pred.shape

(557, 24)

In [30]:
y_pred

array([[289.2087 , 289.20316, 289.1069 , ..., 289.9152 , 289.65912,
        289.65637],
       [289.44864, 289.36365, 289.2755 , ..., 290.21304, 289.94025,
        289.93015],
       [289.9357 , 289.75433, 289.65598, ..., 290.72186, 290.42764,
        290.4338 ],
       ...,
       [289.38782, 289.38452, 289.2753 , ..., 290.05548, 289.79996,
        289.82898],
       [290.2056 , 289.99292, 289.876  , ..., 290.68527, 290.44794,
        290.5622 ],
       [287.71494, 287.73465, 287.73624, ..., 288.59076, 288.34875,
        288.17776]], dtype=float32)

In [39]:
# Evaluation
valScore = mean_squared_error(y_true, y_pred)
print('Mean Squared Error is: %.2f' % (valScore))

Mean Squared Error is: 34.81


## Predicting future 72 hours

In [49]:
# choose a number of time steps
n_steps_in, n_steps_out = 24*5, 72

In [50]:
X, y = split_sequence(temp_values, n_steps_in, n_steps_out)

In [51]:
X.shape

(618, 120)

In [58]:
# reshape from [samples, timesteps] into [samples, timesteps, features]
n_features = 1
X = X.reshape((X.shape[0], X.shape[1], n_features))

In [59]:
X.shape

(618, 120, 1)

In [60]:
y.shape

(618, 72)

In [61]:
# Split training and validation set
n = len(X)
train_X = X[0:int(n*0.7),:]
val_X = X[int(n*0.7):,:]

train_y = y[0:int(n*0.7)]
val_y = y[int(n*0.7):]

print("Shape of training X: {}".format(train_X.shape))
print("Shape of validation X: {}".format(val_X.shape))

print("Shape of training y: {}".format(train_y.shape))
print("Shape of validation y: {}".format(val_y.shape))

Shape of training X: (432, 120, 1)
Shape of validation X: (186, 120, 1)
Shape of training y: (432, 72)
Shape of validation y: (186, 72)


In [62]:
# Normalization
train_X_mean = train_X.mean()
train_X_std = train_X.std()

train_y_mean = train_y.mean()
train_y_std = train_y.std()

In [63]:
normalized_train_X = (train_X - train_X_mean)/train_X_std
normalized_val_X = (val_X - train_X_mean)/train_X_std

normalized_train_y = (train_y - train_y_mean)/train_y_std
normalized_val_y = (val_y - train_y_mean)/train_y_std

In [64]:
# define model
RNN_model_2 = Sequential()
RNN_model_2.add(LSTM(50, activation='relu', return_sequences=True, input_shape=(n_steps_in, n_features)))
RNN_model_2.add(LSTM(50, activation='relu'))
RNN_model_2.add(Dense(n_steps_out))
RNN_model_2.compile(optimizer='adam', loss='mse')
print(RNN_model_2.summary())

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_4 (LSTM)                (None, 120, 50)           10400     
_________________________________________________________________
lstm_5 (LSTM)                (None, 50)                20200     
_________________________________________________________________
dense_2 (Dense)              (None, 72)                3672      
Total params: 34,272
Trainable params: 34,272
Non-trainable params: 0
_________________________________________________________________
None


In [65]:
# fit model
RNN_model_2.fit(normalized_train_X, normalized_train_y, epochs=100, validation_split=0.2, verbose=2)

Epoch 1/100
11/11 - 5s - loss: 0.9950 - val_loss: 0.9597
Epoch 2/100
11/11 - 1s - loss: 239.9470 - val_loss: 0.9091
Epoch 3/100
11/11 - 2s - loss: 0.9374 - val_loss: 0.9126
Epoch 4/100
11/11 - 1s - loss: 0.9344 - val_loss: 0.9004
Epoch 5/100
11/11 - 1s - loss: 0.9150 - val_loss: 0.8660
Epoch 6/100
11/11 - 1s - loss: 158931984.0000 - val_loss: 0.8775
Epoch 7/100
11/11 - 1s - loss: 0.9120 - val_loss: 0.8839
Epoch 8/100
11/11 - 1s - loss: 0.9134 - val_loss: 0.8786
Epoch 9/100
11/11 - 1s - loss: 0.9076 - val_loss: 0.8672
Epoch 10/100
11/11 - 1s - loss: 0.8981 - val_loss: 0.8492
Epoch 11/100
11/11 - 1s - loss: 0.8837 - val_loss: 10960421888.0000
Epoch 12/100
11/11 - 1s - loss: 470550.0625 - val_loss: 0.8841
Epoch 13/100
11/11 - 1s - loss: 0.9171 - val_loss: 0.8981
Epoch 14/100
11/11 - 1s - loss: 0.9217 - val_loss: 0.9003
Epoch 15/100
11/11 - 1s - loss: 0.9209 - val_loss: 0.8995
Epoch 16/100
11/11 - 1s - loss: 0.9192 - val_loss: 0.8974
Epoch 17/100
11/11 - 1s - loss: 0.9169 - val_loss: 0.895

<tensorflow.python.keras.callbacks.History at 0x7fb59fe957c0>

In [66]:
y_true = val_y
y_true.shape

(186, 72)

In [67]:
normalized_y_pred = RNN_model_2.predict(normalized_val_X)
y_pred = normalized_y_pred * train_y.std() + train_y.mean()
y_pred.shape

(186, 72)

In [68]:
# Evaluation
valScore = mean_squared_error(y_true, y_pred)
print('Mean Squared Error is: %.2f' % (valScore))

Mean Squared Error is: 39.39
