In [1]:
# Model definition
import os
import numpy as np
import pandas as pd

# SKLEARN
from sklearn.preprocessing  import ( StandardScaler, )
from sklearn.model_selection import ( train_test_split,KFold,cross_val_score, )

#KERAS 
from keras import backend
from keras import models
from keras import layers
from keras import optimizers
from keras.wrappers.scikit_learn import KerasRegressor

from keras.layers import (Dense,Flatten,Embedding, SimpleRNN, LSTM, GRU, )
from keras.models import (load_model,Sequential, )
from keras.applications import (VGG16,)
from keras.preprocessing.image import (ImageDataGenerator,image,)
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.datasets import (boston_housing,mnist, imdb,)
from keras.utils import to_categorical


import matplotlib.pyplot as plt
%matplotlib inline

#from utils import plot_history

Using TensorFlow backend.


In [16]:
# download from  https://s3.amazonaws.com/keras-datasets/jena_climate_2009_2016.csv.zip
jena = r'G:\STUDY\Chollet - Deep LEarning With Python - Keras\jena_climate_2009_2016.csv'

with open(jena) as file:
    data = file.read()

lines = data.split('\n')
header = lines[0].split(",")
lines = lines[1:]

print(header)
data = np.zeros((len(lines), len(header) - 1))
# not including DateTime first column
for i, line in enumerate(lines):
    data[i,:] = [float(x) for x in line.split(",")[1:]]

print(data.shape)

scaler = StandardScaler(copy = False)
scaler.fit(data)
scaler.transform(data)

# verify the scaler
#print (pd.DataFrame(data).describe())

['"Date Time"', '"p (mbar)"', '"T (degC)"', '"Tpot (K)"', '"Tdew (degC)"', '"rh (%)"', '"VPmax (mbar)"', '"VPact (mbar)"', '"VPdef (mbar)"', '"sh (g/kg)"', '"H2OC (mmol/mol)"', '"rho (g/m**3)"', '"wv (m/s)"', '"max. wv (m/s)"', '"wd (deg)"']
(420551, 14)
                 0             1             2             3             4   \
count  4.205510e+05  4.205510e+05  4.205510e+05  4.205510e+05  4.205510e+05   
mean  -3.518031e-13  6.626287e-15  4.186531e-13 -9.558809e-16  2.610831e-14   
std    1.000001e+00  1.000001e+00  1.000001e+00  1.000001e+00  1.000001e+00   
min   -9.046245e+00 -3.853589e+00 -3.867705e+00 -4.452138e+00 -3.827243e+00   
25%   -5.997241e-01 -7.230073e-01 -7.128898e-01 -7.006519e-01 -6.553871e-01   
50%    4.393434e-02 -3.579020e-03 -2.674296e-03  3.924517e-02  1.997881e-01   
75%    6.588794e-01  7.146621e-01  7.098929e-01  7.598276e-01  8.127952e-01   
max    3.127034e+00  3.303892e+00  3.274429e+00  2.697229e+00  1.456149e+00   

                 5             6 

In [33]:
# data is 10 mins interval
def generator(data, lookback, delay, start_index, end_index, shuffle=False, batch_size=128,step = 6):
    if end_index is None:
        end_index = len(data) - delay -1
        
    i = start_index + lookback
    while True:
        if shuffle:
            rows = np.random.randint(start_index + lookback, end_index, size = batch_size)
        else:
            if i + batch_size >= end_index:
                i = start_index + lookback
            rows = np.arange(i, min(i + batch_size, end_index))
            i += len(rows)
        samples = np.zeros((len(rows), lookback// step, data.shape[-1]))
        targets =  np.zeros((len(rows),))
        for j, row in enumerate(rows):
            indices = range(rows[j] - lookback, rows[j], step)
            samples[j] = data[indices]
            targets[j] = data[rows[j] + delay][1]
        yield samples, targets
        

In [34]:
step = 6
delay = 24 * 6         # points in 1 day
lookback = 10 * delay  # points in 10 days
batch_size = 128

In [35]:

train_gen = generator(data,lookback, delay, start_index = 0, end_index = 200000, 
                      shuffle=True,step=step,batch_size=batch_size)

valid_gen = generator(data,lookback, delay, start_index = 200001, end_index = 300000, 
                      step=step,batch_size=batch_size)

test_gen = generator(data,lookback, delay, start_index = 300001, end_index = None, 
                      step=step,batch_size=batch_size)


In [44]:
# FFN achieves 0.3 loss on validation  data
# GRU achieves 0.26 loss on validation data
def build_model(gru = True):
    model = Sequential()
    if gru:
        model.add(GRU(32,input_shape = (None,data.shape[-1])))
    else:
        model.add(Flatten(input_shape = (lookback//step, data.shape[-1])))
        model.add(Dense(32, activation="relu"))
    model.add(Dense(1))
    model.compile(optimizer='rmsprop', loss = 'mae')
    
    model.summary()
    return model

In [45]:
model = build_model()
history = model.fit_generator(train_gen, epochs = 5, steps_per_epoch = 500, validation_data = valid_gen, validation_steps=10)
model.save("gru_trained.h5")

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_1 (GRU)                  (None, 32)                4512      
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 33        
Total params: 4,545
Trainable params: 4,545
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
