### Packages

In [1]:
%matplotlib inline

# plotting
import matplotlib as mpl
mpl.style.use('ggplot')
import matplotlib.pyplot as plt

# math and data manipulation
import numpy as np
import pandas as pd

# to handle paths
from pathlib import Path

# set random seeds 
from numpy.random import seed
from tensorflow import set_random_seed

# to handle warning
import warnings
warnings.filterwarnings('ignore')

# to handle pre-processing
from sklearn.preprocessing import MinMaxScaler


# modeling
import keras
from keras.models import Sequential
from keras.layers import LSTM, Dense

# progress bar
from tqdm import tqdm


# to save model
from keras.models import model_from_json

RANDOM_SEED = 2018
seed(RANDOM_SEED)
set_random_seed(RANDOM_SEED)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


### Functions


In [2]:
## create dummies for given cols set

def create_dummies(data, cols):
    data_dummies = pd.get_dummies(data[cols])
    data = pd.concat([data,data_dummies],axis=1)
    return data
    

In [3]:
### data pre-processing steps use for both train and test dataset
def data_preprocessing(data, cols=[]):
    data['date'] = data.timestamp.dt.date
    data['weekday'] = data.timestamp.dt.weekday
    cols = ['series_id','date','weekday'] + cols
    data = data.groupby(cols)['consumption'].sum().reset_index()
    data['timestamp'] = data['date']
    data =create_dummies(data, "weekday")
    return data
    

In [4]:
# convert series to supervised learning
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    n_vars = 1 if type(data) is list else data.shape[1]
    df = pd.DataFrame(data)
    cols, names = list(), list()
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg


def prepare_training_data(consumption_series, n_lags,n_features):
    """ Converts a series of consumption data into a
        lagged, scaled sample.
    """
    n_obs = n_lags*n_features
    # scale training data
    scaler = MinMaxScaler(feature_range=(-1, 1))
    #scaler = MinMaxScaler(feature_range=(0, 1))
    scaled = scaler.fit_transform(consumption_series)
    #consumption_vals = scaler.fit_transform(consumption_series.values.reshape(-1, 1))
    #consumption_vals = scaled.values.reshape(-1, 1)
    
    
    consumption_lagged1 = series_to_supervised(scaled, n_lags, 1)
    consumption_lagged = consumption_lagged1.values
    
    
    X, y = consumption_lagged[:, :n_obs], consumption_lagged[:, -n_features]
    X = X.reshape((X.shape[0], n_lags, n_features))
    
    return X, y, scaler


In [5]:
# update X to be latest data plus prediction
def generate_features_prediction(pred, X, lag, scaler):
    if (lag>1):
        X = X[no_features:]
        last_entry = list(X[-(no_features-1):])
    else:
        last_entry = list(X[1:])
        X = []
        
    weekday_ind = last_entry.index(1)
    weekday_ind = (weekday_ind+1)%7

    weekday_features = no_features*[-1]
    weekday_features[weekday_ind] = 1
    
    X = list(X)
    X.append(pred)

    for j in range(no_features-1):
        X.append(weekday_features[j])
    X = np.array(X)
    
    # revert scale back to original range
    a1 = no_features*[0]
    a1[0] = pred
    a1 = np.array(a1)
    pred_scaled = (scaler.inverse_transform(a1.reshape(1,-1)))[0][0]
   
    return X,pred_scaled
    

In [6]:
def generate_forecast(num_pred_days, shipment_data, model, scaler, lag, no_features, scaling = True, output_scaled=True):
    """ Uses last hour's prediction to generate next for num_pred_hours, 
        initialized by most recent cold start prediction. Inverts scale of 
        predictions before return.
    """
    # allocate prediction frame
    preds_scaled = np.zeros(num_pred_days)
    n = lag*no_features
    
    # initial X is last lag values from the cold start
    if (scaling):
        X = scaler.transform(shipment_data.values.ravel().reshape(-1, 1))[-n:]
    else:
        X = shipment_data.ravel()[-n:]
       
    for i in range(num_pred_days):
        # predict scaled value for next time step
        yhat = model.predict(X.reshape(1, lag,no_features), batch_size=1)[0][0]
        
        X,yhat_scaled = generate_features_prediction(yhat, X, lag, scaler)
        
        # revert scale back to original range
        if (output_scaled):
            preds_scaled[i] = yhat_scaled
        else:
            preds_scaled[i] = yhat
   
    return preds_scaled 

### Data Dictionary & Model Parameters

In [7]:
### Data Dictionary
data_path = Path('..', '01.Data')

### Features
cols_to_use = ['consumption', 0,1,2,3,4,5,6]
no_features = len(cols_to_use)


# model parameters
num_neurons = 8
lag = 1
num_passes_through_data = 10

# model store
model_path = Path('..', '05.Model')
model_path.mkdir(exist_ok=True, parents=True)

model_file_name = "model_02_Test_LSTM_Daily_Weekly-Multivariate-Lag_7_1-V2.h5"
existing_model_load = False
existing_model_file_name = "model_02_Test_LSTM_Daily_Weekly-Multivariate-Lag_7_1-V2.h5"

# result store
save_path = Path('..', '06.Test')
save_path.mkdir(exist_ok=True, parents=True)
result_file_name = "Lstm_check.csv"



### Loading dataset

In [8]:
### Consumption data

consumption_train = pd.read_csv(data_path / 'consumption_train.csv', index_col=0, parse_dates=['timestamp'])
consumption_train = data_preprocessing(consumption_train)
print ("consumption dataset -->", consumption_train.shape)

cold_start_test = pd.read_csv(data_path / 'cold_start_test.csv', index_col=0, parse_dates=['timestamp'])
cold_start_test = data_preprocessing(cold_start_test)
print ("cold start train dataset -->", cold_start_test.shape)

submission_format = pd.read_csv(data_path / 'submission_format.csv',index_col='pred_id',parse_dates=['timestamp'])
submission_format = data_preprocessing(submission_format,cols = ["prediction_window"])
print ("submission format -->", submission_format.shape)

consumption dataset --> (21224, 12)
cold start train dataset --> (4666, 12)
submission format --> (1963, 13)


In [9]:
### Pre-processing steps

cold_start_weekday_data = cold_start_test.groupby(['series_id'])[0,1,2,3,4,5,6].sum().reset_index()
cold_start_weekday_data['weekday'] = cold_start_weekday_data[0] + cold_start_weekday_data[1] +cold_start_weekday_data[2]+cold_start_weekday_data[3] +cold_start_weekday_data[4]
    
cold_start_weekday_data['weekend'] = cold_start_weekday_data[5] + cold_start_weekday_data[6]

cold_start_weekday_list = cold_start_weekday_data[(cold_start_weekday_data.weekday==0)]
cold_start_weekend_list = cold_start_weekday_data[(cold_start_weekday_data.weekend==0)]

cold_start_weekday_missing_list = list(cold_start_weekday_list.series_id)
cold_start_weekend_missing_list = list(cold_start_weekend_list.series_id)

In [10]:
submission_format_filter_daily_weekly = submission_format[submission_format.prediction_window != "hourly"]
print ("submission format-->", submission_format_filter_daily_weekly.shape)

# copy submission format and fill in values
my_submission = submission_format_filter_daily_weekly.copy()
print ("output format-->", my_submission.shape)
my_submission['consumption'] = 0

submission format--> (1721, 13)
output format--> (1721, 13)


### Modeling

In [11]:
# model initiate 

if (existing_model_load):
    json_file = open('model.json', 'r')
    loaded_model_json = json_file.read()
    json_file.close()
    model = model_from_json(loaded_model_json)
    # load weights into new model
    model.load_weights(model_path /existing_model_file_name)
    model.compile(loss='mean_absolute_error', optimizer='adam')
    print("Loaded model from disk")
    
else:
    
    batch_size = 1  # this forces the lstm to step through each time-step one at a time
    batch_input_shape=(batch_size, lag, no_features)

    # instantiate a sequential model
    model = Sequential()

    # add LSTM layer - stateful MUST be true here in 
    # order to learn the patterns within a series
    model.add(LSTM(units=num_neurons, 
                  batch_input_shape=batch_input_shape, 
                  stateful=True))

    model.add(Dense(3, activation='relu'))
    # followed by a dense layer with a single output for regression
    model.add(Dense(1))

    # compile
    model.compile(loss='mean_absolute_error', optimizer='adam')
    
    
    %%time
    num_training_series = consumption_train.series_id.nunique()


    for i in tqdm(range(num_passes_through_data), 
                  total=num_passes_through_data, 
                  desc='Learning Consumption Trends - Epoch'):

        # reset the LSTM state for training on each series
        for ser_id, ser_data in consumption_train.groupby('series_id'):

            # prepare the data
            X, y, scaler = prepare_training_data(ser_data[cols_to_use], lag, no_features)

            # fit the model: note that we don't shuffle batches (it would ruin the sequence)
            # and that we reset states only after an entire X has been fit, instead of after
            # each (size 1) batch, as is the case when stateful=False
            model.fit(X, y, epochs=1, batch_size=batch_size, verbose=0, shuffle=False)
            model.reset_states()
            
    # serialize model to JSON
    model_json = model.to_json()
    with open("model.json", "w") as json_file:
        json_file.write(model_json)

    model_file_name
    # serialize weights to HDF5
    model.save_weights(model_path /model_file_name)
    print("Saved model to disk")

Learning Consumption Trends - Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 10.7 µs


Learning Consumption Trends - Epoch: 100%|██████████| 10/10 [06:12<00:00, 37.25s/it]

Saved model to disk





In [12]:
%%time
pred_window_to_num_preds = {'hourly': 1, 'daily': 7, 'weekly': 14}
pred_window_to_num_days = {'hourly': 1, 'daily': 7, 'weekly': 2}
#pred_window_to_num_pred_hours = {'hourly': 24, 'daily': 7 * 24, 'weekly': 2 * 7 * 24}

num_test_series = my_submission.series_id.nunique()

model.reset_states()
count=0
for ser_id, pred_df in tqdm(my_submission.groupby('series_id'), 
                            total=num_test_series, 
                            desc="Forecasting from Cold Start Data"):
        
    # get info about this series' prediction window
    pred_window = pred_df.prediction_window.unique()[0]
    num_preds = pred_window_to_num_preds[pred_window]
    num_days = pred_window_to_num_days[pred_window]
    #num_pred_hours = pred_window_to_num_pred_hours[pred_window]
    
    # prepare cold start data
    series_data = cold_start_test[cold_start_test.series_id == ser_id]
    series_data = series_data[cols_to_use]
    cold_X, cold_y, scaler = prepare_training_data(series_data, lag, no_features)
    if (len(cold_X)):
        check_flag = 1
    else:
        check_flag=0
    check_flag = check_flag & (ser_id not in cold_start_weekday_missing_list) & (ser_id not in cold_start_weekend_missing_list)
    if (check_flag):
        count+=1

        # fine tune our lstm model to this site using cold start data    
        model.fit(cold_X, cold_y, epochs=1, batch_size=batch_size, verbose=0, shuffle=False)
        b = series_data[-lag:].values[0]
        c = scaler.transform(b.reshape(1,no_features))[0]
        preds = generate_forecast(num_preds, c, model, scaler, lag=lag, no_features=no_features, scaling = False)
    else:
        scaler = MinMaxScaler(feature_range=(-1, 1))
        max_value = max(series_data.consumption)
        series_data = series_data.tail(1)
        
        if ser_id in cold_start_weekday_missing_list:
            l1 = no_features*[0]
            l1[0] = 1.3*max_value
            l1 = pd.DataFrame(data=[l1], columns = series_data.columns)
            a = pd.concat([series_data,l1], axis=0)
            b = scaler.fit_transform(a)
            scaled = scaler.transform(series_data)
        elif ser_id in cold_start_weekend_missing_list:
            l1 = no_features*[0]
            l1[0] = 0.7*max_value
            l1 = pd.DataFrame(data=[l1], columns = series_data.columns)
            a = pd.concat([series_data,l1], axis=0)
            b = scaler.fit_transform(a)
            scaled = scaler.transform(series_data)
        else:
            print (ser_id)
            scaled = scaler.fit_transform(series_data)
        weekday_ind = list(series_data.values[0]).index(1)
        scaled[0][weekday_ind] =1
        
        # make hourly forecasts for duration of pred window
        preds = generate_forecast(num_preds,scaled, model, scaler, lag=lag, no_features=no_features, scaling = False)
        
        # reduce by taking sum over each sub window in pred window
    reduced_preds = [pred.sum() for pred in np.split(preds, num_days)]

        # store result in submission DataFrame
    ser_id_mask = my_submission.series_id == ser_id
    my_submission.loc[ser_id_mask, 'consumption'] = reduced_preds
    model.reset_states()

Forecasting from Cold Start Data: 100%|██████████| 383/383 [00:11<00:00, 34.45it/s]

CPU times: user 22.2 s, sys: 3.89 s, total: 26.1 s
Wall time: 11.1 s





In [13]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (1, 8)                    544       
_________________________________________________________________
dense_1 (Dense)              (1, 3)                    27        
_________________________________________________________________
dense_2 (Dense)              (1, 1)                    4         
Total params: 575
Trainable params: 575
Non-trainable params: 0
_________________________________________________________________


In [14]:
save_path = Path('..', '06.Test')


my_submission_final  = my_submission[(my_submission.consumption != 0)]

my_submission_final.to_csv(save_path / result_file_name, index_label='pred_id')
print ("Submission Done")

Submission Done
