In [42]:
%load_ext autoreload
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [43]:
#import dependencies
import numpy as np
# fix random seed for reproducibility
seed = 155
np.random.seed(seed)

In [44]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import time



In [45]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import sys

# add the 'src' directory as one where we can import modules
src_dir = os.path.join(os.getcwd(), os.pardir,'src')
sys.path.append(src_dir)

## Load dataset

In [46]:
%store -r ili_GLL

In [47]:
l = len(ili_GLL.columns)
print("The data contains {} features".format(l))
ili_GLL.head()

The data contains 8 features


Unnamed: 0,date,year,month,week,state,latitude,longitude,ili_activity_group
0,2010-10-09,2010,10,40,AK,61.370716,-152.404419,Minimal
1,2010-10-16,2010,10,41,AK,61.370716,-152.404419,Minimal
2,2010-10-23,2010,10,42,AK,61.370716,-152.404419,Minimal
3,2010-10-30,2010,10,43,AK,61.370716,-152.404419,Minimal
4,2010-11-06,2010,10,44,AK,61.370716,-152.404419,Minimal


In [48]:
nstates = len(ili_GLL.state.unique())
print("Number of location : {}".format(nstates))

Number of location : 46


## LSTM Data Preparation

In [140]:
%autoreload
import time
import os

from pandas import DataFrame
from pandas import Series
from pandas import concat
from pandas import read_csv
from pandas import datetime
from sklearn.metrics import mean_squared_error

import keras
#from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Input, concatenate
from keras.models import Model

#allow to use time distributed content
#from keras.layers import TimeDistributed
from keras.callbacks import TensorBoard

import numpy as np
from numpy import concatenate

from math import sqrt
    
import matplotlib.pyplot as plt

## Prepare Dataset

In [141]:
#we put everything together
#sepate by states
datasets = {}
states_label = ili_GLL.state.unique()
index = [0]
states_label = np.delete(states_label, index)

In [142]:
states_label

array(['AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DE', 'GA', 'HI', 'IA', 'ID',
       'IL', 'IN', 'KS', 'KY', 'MA', 'MD', 'ME', 'MI', 'MN', 'MO', 'MS',
       'MT', 'NC', 'ND', 'NE', 'NH', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR',
       'PA', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VT', 'WA', 'WI', 'WV',
       'WY'], dtype=object)

In [143]:
for s in states_label:
    datasets[s] = ili_GLL[(ili_GLL.state == s)]
    datasets[s].drop(['date'], 1, inplace=True)
    datasets[s].drop(['state'], 1, inplace=True)

nstates = len(datasets)
print("Number of location : {}".format(nstates))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Number of location : 45


In [144]:
datasets['NY'].head()

Unnamed: 0,year,month,week,latitude,longitude,ili_activity_group
7480,2010,10,40,42.165726,-74.948051,Minimal
7481,2010,10,41,42.165726,-74.948051,Minimal
7482,2010,10,42,42.165726,-74.948051,Minimal
7483,2010,10,43,42.165726,-74.948051,Minimal
7484,2010,10,44,42.165726,-74.948051,Minimal


In [145]:
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """
    Frame a time series as a supervised learning dataset.
    Arguments:
        data: Sequence of observations as a list or NumPy array.
        n_in: Number of lag observations as input (X).
        n_out: Number of observations as output (y).
        dropnan: Boolean whether or not to drop rows with NaN values.
    Returns:
       Pandas DataFrame of series framed for supervised learning.
    """
    n_vars = 1 if type(data) is list else data.shape[1]
    print("The data contains {} features".format(n_vars))
    df = pd.DataFrame(data)
    cols, names = list(), list()
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
    # put it all together
    agg = concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    #n_vars = len(agg.columns)
    #print("The reframed data contains {} features".format(n_vars))

    if dropnan:
        agg.dropna(inplace=True)
        return agg

In [146]:
# scale train and test data to [-1, 1]
def scale(datasets):
    # fit scaler
    scaler = MinMaxScaler(feature_range=(-1, 1))
    data_scaled = []
    for state in datasets:
        scaler = scaler.fit(datasets[state])
        #datasets[state] = datasets[state].reshape(datasets[state].shape[0], datasets[state].shape[1])
        data_scaled.append(scaler.transform(data_scaled))
    return scaler, data_scaled

In [147]:
# inverse scaling for a forecasted value
def invert_scale(scaler, X, yhat):
    new_row = [x for x in X] + [yhat]
    array = numpy.array(new_row)
    array = array.reshape(1, len(array))
    inverted = scaler.inverse_transform(array)
    return inverted[0, -1]

In [148]:
# One Hot encoder
def encode_category(data):
    #check the categories
    df = pd.get_dummies(data)
    return df

In [149]:
# parse the data into states
def parse_data(states_label, datasets, n_weeks = 260):
    """
    return a dictionary
    """
    parse_data = {}
    for state in states_label:
        data = datasets[state]
        names = list(data.columns.values)
        
        #drop date value
        if 'date' in names:
            data.drop(['date'], 1, inplace=True)
        # make sure that every states has the same number of weeks
        if(len(data)>= n_weeks):
            parse_data[state] = pd.get_dummies(data)
    return parse_data

In [150]:
def reframe_data(parse_data, n_weeks =1,n_features =1 ):
    reframed_data = []
    for state in parse_data:
        values=parse_data[state]
        # ensure all data is float
        values = values.astype('float32')
        # normalize features
        reframed = series_to_supervised(values, n_weeks, n_features)
        # we are predicting ili activity
        reframed_data.append(reframed)
    return reframed_data

In [174]:
# verbose=0 suppresses the file writing message
# note that the fit method expects a list of callbacks
start = time.time()
my_first_rnn_fitted = model.fit(
    train_features,
    train_label[0], #label for the targeted state
    validation_data= (
        test_features,
        test_label[0] ),
    epochs=2000,
    verbose=0,
    shuffle = False,
    batch_size=52,
    callbacks=[checkpoint],
    initial_epoch=0
)
end = time.time()
print "Model took %0.2f seconds to train"%(end - start)

NameError: name 'model' is not defined

In [217]:
def get_tt_data(datasets, n_total_years = 260, n_train_weeks = 156 ):
    train_features, train_label = list(), list()
    test_features, test_label = list(), list()
    for data in datasets:
        values = data.head(n_total_years).values
        train = values[:n_train_weeks, :]
        test = values[n_train_weeks:, :]
        # split into input and outputs
        train_X, train_y = train[:, :-1], train[:,-1]
        test_X, test_y = test[:, :-1], test[:, -1]
        # reshape input to be 3D [samples, timesteps, features]
        train_X = train_X.reshape((train_X.shape[0], 1, train_X.shape[1]))
        test_X = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))
        #print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)
        train_features.append(train_X)
        train_label.append(train_y)
        test_features.append(test_X)
        test_label.append(test_y)
    print("number of weeks in a year: {}".format(n_total_years))
    print("number of weeks in the training set : {}".format(n_train_weeks))
    return train_features, train_label, test_features, test_label

In [218]:
def create_model(labels, refraimed_data, batch_size = 52, n_neurons = 50):
    #dataLength =  4 weeks
    stateInputs = {}
    stateLayers = []
    i = 0
    for label in labels:
        data = refraimed_data[i]
        i+=1
        timesteps = data.shape[1]
        features = data.shape[2]
        print("timesteps: {}".format(timesteps))
        inputName = "{}_input".format(label)
        stateInputs[inputName] = Input(shape=(timesteps,features),
                                       batch_shape =(batch_size, timesteps, features), 
                                       name=inputName)
        
    for state in stateInputs:
        stateL = LSTM(n_neurons, return_sequences=False, stateful=True,
                            batch_input_shape=(batch_size, timesteps, features))(stateInputs[state])
        stateLayers.append(stateL)
    #combined the output
    output = keras.layers.concatenate(stateLayers)
    output = Dense(1, activation='relu', name='wheighthedAverage_output')(output)
    stateInput = stateInputs.values()
    
    model = Model(inputs = stateInput, outputs = [output])
    model.compile(optimizer='adam', loss='mse', metrics=['accuracy'])
    return model

In [219]:
# fit an LSTM network to training data
def fit_lstm(labels, 
             train_features, 
             train_label, 
             test_features, 
             test_label , 
           #  scaler, 
             nb_epoch, 
             timesteps = 1, batch_size = 52, n_neurons = 50):
    
    
    # prepare model
    model = create_model(labels, train_features)
    # fit model
    train_rmse, test_rmse = list(), list()
    
    for i in range(nb_epoch):
        model.fit(
            train_features,
            train_label[0], #label for the targeted state
           # validation_data= (
           #     test_features,
           #     test_label[0] ),
            epochs=2000,
            verbose=0,
            shuffle = False,
            batch_size=52,
            initial_epoch=0
        )        
        model.reset_states()
        
        # evaluate model on train data
        # raw_train = raw[-(len(train)+len(test)+1):-len(test)]
        train_rmse.append(evaluate(model, train_features, train_label, scaler, 0, batch_size))
        model.reset_states()
        # evaluate model on test data
        #raw_test = raw[-(len(test)+1):]
        test_rmse.append(evaluate(model, test_features, test_label, scaler, 0, batch_size))
        model.reset_states()
    history = DataFrame()
    history['train'], history['test'] = train_rmse, test_rmse
    return history

In [220]:
processed_dir = os.path.join(os.getcwd(),os.pardir, 'models')
processed_dir

'/Users/bbuildman/Documents/Developer/GitHub/001-BB-DL-ILI/notebooks/../models'

In [221]:
# evaluate the model on a dataset, returns RMSE in transformed units
def evaluate(model,
             test_features, 
             test_label ,
             scaler, 
             offset, 
             batch_size):
    # reshape
    reshaped = test_features.reshape(len(test_features), 1, 1)
    # forecast dataset
    output = model.predict(reshaped, batch_size=batch_size)
    # invert data transforms on forecast
    predictions = list()
    for i in range(len(output)):
        yhat = output[i,0]
        # invert scaling
        # yhat = invert_scale(scaler, X[i], yhat)
        # store forecast
        predictions.append(yhat)
    # report performance
    rmse = sqrt(mean_squared_error(raw_data[1:], predictions))
    return rmse

In [222]:
# run diagnostic experiments
def run():
    # load dataset
    series = parse_data(states_label, datasets, n_weeks = 260)
    labels = series.keys()
    print labels
    # transform data to be supervised learning
    reframed = reframe_data(series)
    # split data into train and test-sets
    train_features, train_label, test_features, test_label = get_tt_data(reframed)
    # transform the scale of the data
    # scaler, train_scaled, test_scaled = scale(train, test)
    # fit and evaluate model
    # train_trimmed = train_scaled[2:, :]
    # config
    repeats = 10
    nb_epoch = 10
    # run diagnostic tests
    for i in range(repeats):
        history = fit_lstm(labels, 
             train_features, 
             train_label, 
             test_features, 
             test_label , 
         #    scaler, 
             nb_epoch)
        pyplot.plot(history['train'], color='blue')
        pyplot.plot(history['test'], color='orange')
        print('%d) TrainRMSE=%f, TestRMSE=%f' % (i, history['train'].iloc[-1], history['test'].iloc[-1]))
    pyplot.savefig('epochs_diagnostic.png')
    

In [223]:
# entry point
run()

['WA', 'WI', 'WV', 'HI', 'TX', 'NE', 'NY', 'PA', 'VA', 'CO', 'CA', 'AL', 'AR', 'IL', 'GA', 'IN', 'AZ', 'CT', 'MD', 'OK', 'OH', 'UT', 'MO', 'MN', 'MT', 'SC', 'KY', 'OR', 'SD']
The data contains 10 features
The data contains 9 features
The data contains 9 features
The data contains 9 features
The data contains 9 features
The data contains 9 features
The data contains 10 features
The data contains 9 features
The data contains 9 features
The data contains 9 features
The data contains 9 features
The data contains 9 features
The data contains 10 features
The data contains 9 features
The data contains 9 features
The data contains 9 features
The data contains 9 features
The data contains 10 features
The data contains 10 features
The data contains 9 features
The data contains 9 features
The data contains 9 features
The data contains 9 features
The data contains 9 features
The data contains 8 features
The data contains 9 features
The data contains 10 features
The data contains 9 features
The dat

ValueError: Error when checking input: expected NE_input to have shape (52, 1, 17) but got array with shape (156, 1, 19)

In [None]:
# train_data.size % batch_size = 0
def plot_model_history(model_history):
    fig, axs = plt.subplots(1,2,figsize=(15,5))
    # summarize history for accuracy
    axs[0].plot(range(1,len(model_history.history['acc'])+1),model_history.history['acc'])
    axs[0].plot(range(1,len(model_history.history['val_acc'])+1),model_history.history['val_acc'])
    axs[0].set_title('Model Accuracy')
    axs[0].set_ylabel('Accuracy')
    axs[0].set_xlabel('Epoch')
    axs[0].set_xticks(np.arange(1,len(model_history.history['acc'])+1),len(model_history.history['acc'])/10)
    axs[0].legend(['train', 'val'], loc='best')
    # summarize history for loss
    axs[1].plot(range(1,len(model_history.history['loss'])+1),model_history.history['loss'])
    axs[1].plot(range(1,len(model_history.history['val_loss'])+1),model_history.history['val_loss'])
    axs[1].set_title('Model Loss')
    axs[1].set_ylabel('Loss')
    axs[1].set_xlabel('Epoch')
    axs[1].set_xticks(np.arange(1,len(model_history.history['loss'])+1),len(model_history.history['loss'])/10)
    axs[1].legend(['train', 'val'], loc='best')
    plt.show()