# Introduction

Here I will use all the functions that I created in the previous notebook, but the hybrid model will now have an LSTM!

I have taken a lot of ideas from this Kaggle notebook: https://www.kaggle.com/code/dimitreoliveira/time-series-forecasting-with-lstm-autoencoders/notebook

As always, I changed the original code a little to make it more efficient. Namely, in the notebook linked above the input is created using a loop, which is very inefficient.

In [1]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression 

import seaborn as sns
from datetime import datetime


from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from tensorflow.keras import optimizers, Sequential, Model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense

import os

In [2]:
# Select the Keras backend through the environment.
# NOTE(review): `keras` is already imported in the first cell; confirm that
# setting the variable this late still has an effect, or move it above the imports.
os.environ.update({"KERAS_BACKEND": "tensorflow"})

In [3]:
import tensorflow as tf

# Report how many GPU devices TensorFlow can see.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

Num GPUs Available:  1


In [4]:
# Read the data
# This data you can find here: https://www.kaggle.com/c/home-data-for-ml-course/data
# NOTE(review): confirm this link — the columns read below (shop_id, item_id,
# item_cnt_day) look like a sales dataset rather than the one linked here.
data_dir ="sales_train"

# `date` is read as plain text and parsed exactly once below with an explicit
# day.month.year format. Asking read_csv to parse it as well (parse_dates)
# contradicted the 'str' dtype and left the day/month order to be guessed.
train = pd.read_csv(
    'sales_train.csv',
    dtype={
        'date': 'str',
        'date_block_num': 'int32',
        'shop_id': 'int32',
        'item_id': 'int32',
        'item_price': 'float32',
        'item_cnt_day': 'int32',
    },
)
train["date"] = pd.to_datetime(train["date"], format='%d.%m.%Y')

In [5]:
train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,2013-01-02,0,59,22154,999.0,1
1,2013-01-03,0,25,2552,899.0,1
2,2013-01-05,0,25,2552,899.0,-1
3,2013-01-06,0,25,2554,1709.050049,1
4,2013-01-15,0,25,2555,1099.0,1


In [6]:
# Collapse the daily records into one row per (date_block_num, shop_id, item_id),
# summing the daily counts.
group_keys = ['date_block_num', 'shop_id', 'item_id']
daily_counts = train[['date', *group_keys, 'item_cnt_day']].sort_values('date')
train_monthly = (
    daily_counts
    .groupby(group_keys, as_index=False)
    .agg({'item_cnt_day': ['sum']})
)
# The dict-of-lists aggregation yields two-level column labels; flatten them.
train_monthly.columns = [*group_keys, 'item_cnt']

# The Kaggle notebook filters here, but the explanation is not good enough to keep it:
# train_monthly = train_monthly.query('item_cnt >= 0 and item_cnt <= 20')

# Label: within each (shop_id, item_id) pair, ordered by date_block_num, take the
# item_cnt of the following row (NaN when the pair has no later row).
next_row_cnt = (
    train_monthly
    .sort_values('date_block_num')
    .groupby(['shop_id', 'item_id'])['item_cnt']
    .shift(-1)
)
train_monthly['item_cnt_month'] = next_row_cnt

display(train_monthly.head(4))
#display(train_monthly.describe())

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt,item_cnt_month
0,0,0,32,6,10.0
1,0,0,33,3,3.0
2,0,0,35,1,14.0
3,0,0,43,1,


In [7]:
# Wide layout: one row per (shop_id, item_id), one column per date_block_num.
# Combinations with no record in a month are filled with 0.
monthly_series = (
    train_monthly
    .pivot_table(
        values='item_cnt',
        index=['shop_id', 'item_id'],
        columns='date_block_num',
        fill_value=0,
    )
    .reset_index()
)
monthly_series.head()

date_block_num,shop_id,item_id,0,1,2,3,4,5,6,7,...,24,25,26,27,28,29,30,31,32,33
0,0,30,0.0,31.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,31,0.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,32,6.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,33,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,35,1.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Cast every column to integers. The pivot table shown above holds the monthly
# counts as floats (0.0, 31.0, ...), so they are converted column by column with
# a vectorised astype instead of calling int() on each element.
for col in monthly_series.columns:
    try:
        monthly_series[col] = monthly_series[col].astype("int64")
    except (TypeError, ValueError):
        # Best effort, as before: a column that cannot be represented as
        # integers (non-numeric text, NaN, inf) is left untouched. Only
        # conversion errors are caught, so unrelated failures still surface.
        continue

In [9]:
# Two overlapping 13-month windows over the date_block_num columns:
# blocks 21..33 and blocks 20..32, each prefixed with the two id columns.
id_columns = ["shop_id", "item_id"]
temp_columns1 = list(range(21, 34))
column1 = id_columns + temp_columns1
temp_columns2 = list(range(20, 33))
column2 = id_columns + temp_columns2

In [10]:
# Relabel both windows to the same relative positions 0..12 so they can be
# stacked on top of each other.
column_names = list(range(13))
window_header = ["shop_id", "item_id"] + column_names

monthly_series_new1 = monthly_series[column1]
monthly_series_new1.columns = window_header

monthly_series_new2 = monthly_series[column2]
monthly_series_new2.columns = window_header
#monthly_series_new2[monthly_series_new2["shop_id"]==2].head()


In [11]:
#monthly_series_new1[monthly_series_new1["shop_id"]==2]

Something worth mentioning: notice that each window spans 13 months, so the "0" and the "12" columns are indeed the same calendar month, one year apart!

In [12]:
# Stack the two relabelled windows row-wise; the original row index of each
# window is kept as-is.
window_frames = [monthly_series_new1, monthly_series_new2]
data_series = pd.concat(window_frames, axis=0)
data_series.head()

Unnamed: 0,shop_id,item_id,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0,30,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,31,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,32,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,33,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,35,0,0,0,0,0,0,0,0,0,0,0,0,0


In [13]:
# To speed things up, bluntly discard every row whose 13 monthly values are all
# zero. The justification is that those items are probably discontinued in that shop.
has_any_sale = (data_series[column_names] != 0).any(axis=1)
data_series = data_series.loc[has_any_sale]

In [14]:
# The last position of each window (column 12) is the value to predict.
data_series = data_series.rename(columns={12: "label"})

In [15]:
# The identifiers are not used as model inputs; keep only the monthly counts.
data_series = data_series.drop(columns=['item_id', 'shop_id'])

In [16]:
# Separate the target from the 12 input months, then hold out 10% for validation.
# Note: `train` is rebound here; it no longer refers to the raw frame read from the CSV.
labels = data_series['label']
del data_series['label']
train, valid, Y_train, Y_valid = train_test_split(
    data_series,
    labels.values,
    test_size=0.10,
    random_state=0,
)

In [17]:
# Report the size of each split, then preview the training inputs.
for split_name, split_frame in (("Train set", train), ("Validation set", valid)):
    print(split_name, split_frame.shape)
train.head()

Train set (321895, 12)
Validation set (35767, 12)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
302831,0,0,0,0,0,0,1,1,0,0,0,0
332129,0,6,9,3,1,3,0,1,1,0,0,1
417454,1,0,2,0,1,1,2,0,1,0,0,0
340934,1,0,0,0,0,0,0,0,0,0,0,0
52883,0,5,0,0,0,0,0,0,0,0,0,0


In [18]:
# Append a trailing axis of length 1 so each sample becomes
# (time steps, features), the 3-D layout passed to the LSTM below.
X_train = train.values[:, :, np.newaxis]
X_valid = valid.values[:, :, np.newaxis]

for caption, tensor in (("Train set reshaped", X_train), ("Validation set reshaped", X_valid)):
    print(caption, tensor.shape)

Train set reshaped (321895, 12, 1)
Validation set reshaped (35767, 12, 1)


In [36]:
# Training hyper-parameters.
epochs = 15
batch = 128
lr = 0.0001

# Input geometry read from the reshaped training tensor: 12 time steps, 1 feature.
serie_size, n_features = X_train.shape[1:]

# Three stacked LSTM layers (the first two return the full sequence so the next
# LSTM can consume it), followed by a small dense head with one linear output.
lstm_model = Sequential([
    LSTM(10, input_shape=(serie_size, n_features), return_sequences=True),
    LSTM(6, activation='relu', return_sequences=True),
    LSTM(1, activation='relu'),
    Dense(15, kernel_initializer='glorot_normal', activation='relu'),
    Dense(15, kernel_initializer='glorot_normal', activation='relu'),
    Dense(1),
])
lstm_model.summary()

# Mean squared error optimised with Adam at the learning rate set above.
adam = optimizers.Adam(lr)
lstm_model.compile(loss='mse', optimizer=adam)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_3 (LSTM)               (None, 12, 10)            480       
                                                                 
 lstm_4 (LSTM)               (None, 12, 6)             408       
                                                                 
 lstm_5 (LSTM)               (None, 1)                 32        
                                                                 
 dense_2 (Dense)             (None, 15)                30        
                                                                 
 dense_3 (Dense)             (None, 15)                240       
                                                                 
 dense_4 (Dense)             (None, 1)                 16        
                                                                 
Total params: 1,206
Trainable params: 1,206
Non-traina

In [None]:
# Train the network, evaluating on the held-out split after every epoch.
fit_options = dict(
    validation_data=(X_valid, Y_valid),
    batch_size=batch,
    epochs=epochs,
    verbose=2,
)
lstm_history = lstm_model.fit(X_train, Y_train, **fit_options)

In [21]:
import os
import pickle

# Persist the trained model and its per-epoch training history.
# The target folder is created first so that saving does not depend on
# 'keras/' already existing in the working directory.
os.makedirs('keras', exist_ok=True)
lstm_model.save('keras/lstm.keras')

# Only the plain `history` dict is pickled, not the History callback object.
with open('lstm_train_history_dict', 'wb') as file_pi:
    pickle.dump(lstm_history.history, file_pi)

In [22]:
# Reload the training history written by the previous cell.
# Only unpickle files you produced yourself: pickle can execute arbitrary code.
with open('lstm_train_history_dict', "rb") as history_file:
    history = pickle.load(history_file)
#model = keras.models.load_model('keras/model_brain_trainable.keras')

In [26]:
#history

In [24]:
# Peek at validation rows 10..24 (positional slice).
valid.iloc[10:25]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
102369,0,0,0,2,1,0,0,0,0,0,0,0
256822,2,0,0,0,1,1,0,0,0,0,1,0
226482,0,0,0,0,1,2,0,1,0,1,0,0
405942,0,0,0,0,0,0,1,0,0,0,0,0
162901,0,0,1,0,0,0,0,0,0,0,0,0
61475,0,0,0,0,0,0,0,0,1,0,0,0
359159,0,0,0,0,0,1,0,0,0,0,0,0
357491,0,1,1,0,0,1,0,0,0,0,0,0
229167,0,0,0,0,2,0,0,0,0,0,0,0
90818,0,0,0,1,0,0,0,0,0,0,0,0


In [25]:
# Predict the first ten validation samples. The reshaped tensor X_valid
# (samples, 12, 1) is used rather than the 2-D DataFrame `valid`, so inference
# receives the same input layout that was used for training.
pred = lstm_model.predict(X_valid[:10])
pred



array([[0.32603028],
       [0.11218613],
       [0.16677865],
       [0.16677865],
       [0.37823427],
       [0.11572798],
       [0.41687366],
       [0.5099525 ],
       [1.3837978 ],
       [0.11200292]], dtype=float32)

In [29]:
# Lift pandas' display limits so frames are shown without truncated
# columns or rows.
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, None)

In [31]:
# True labels of the first ten validation samples.
Y_valid[:10]

array([0, 0, 0, 0, 0, 0, 2, 1, 1, 0], dtype=int64)

In [32]:
# Predictions as a flat 1-D array.
pred.reshape(-1)

array([0.32603028, 0.11218613, 0.16677865, 0.16677865, 0.37823427,
       0.11572798, 0.41687366, 0.5099525 , 1.3837978 , 0.11200292],
      dtype=float32)

In [35]:
# Side-by-side view of predicted and true values for the first ten validation samples.
comparison = {
    "pred": pred.flatten(),
    "real": Y_valid[0:10],
}
pd.DataFrame(comparison)

Unnamed: 0,pred,real
0,0.32603,0
1,0.112186,0
2,0.166779,0
3,0.166779,0
4,0.378234,0
5,0.115728,0
6,0.416874,2
7,0.509952,1
8,1.383798,1
9,0.112003,0


We got a very bad result!
So what we are going to do is keep improving this model to see if we can make it work. Maybe we need a hybrid model like the one in the previous notebook: one model to catch the trend and another to learn these complex interactions.