## IMPORT LIBRARIES

In [2]:
# -------------------------- LIBRARIES NECESSARY IN THIS PROJECT  -------------------------- #

from datetime import datetime, timedelta,date
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from __future__ import division

import warnings
warnings.filterwarnings("ignore")

from chart_studio import plotly as py
from plotly import offline as pyoff, offline as pyoff, graph_objs as go

# Imports statsmodels.formula.api
import statsmodels.formula.api as smf

from sklearn.preprocessing import MinMaxScaler

import zipfile

from keras.layers import Dense
from keras.models import Sequential
from keras.layers import LSTM

aaaaa


## Data manipulation and cleansing<a name="preparation"></a>

In [3]:
# -------------------------- PATH OF THE DATASETS USED IN THE PROJECT  -------------------------- #
zf = zipfile.ZipFile('dataSets.zip')

df_sales = pd.read_csv(zf.open('train.csv'))

In [4]:
df_sales.head(10)

Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
1,2013-01-02,1,1,11
2,2013-01-03,1,1,14
3,2013-01-04,1,1,13
4,2013-01-05,1,1,10
5,2013-01-06,1,1,12
6,2013-01-07,1,1,10
7,2013-01-08,1,1,9
8,2013-01-09,1,1,12
9,2013-01-10,1,1,9


In [5]:
# Checks columns and their types
df_sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 913000 entries, 0 to 912999
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   date    913000 non-null  object
 1   store   913000 non-null  int64 
 2   item    913000 non-null  int64 
 3   sales   913000 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 27.9+ MB


In [6]:
# Converts column data into datetimelike values
df_sales['date'] = pd.to_datetime(df_sales['date'])

In [7]:
# Represents month in date field as its first day. For instance Dezember of 2010 will be 2010-12-01
df_sales['date'] = df_sales['date'].dt.year.astype('str') + '-' + df_sales['date'].dt.month.astype('str') + '-01'

# GroupBy sales per month/year
df_sales = df_sales.groupby('date').sales.sum().reset_index()

In [8]:
# Prints the dataframe to check whether the field date created is dateTimeLike
df_sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   date    60 non-null     object
 1   sales   60 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 1.1+ KB


In [9]:
# Converts column data into datetimelike values
df_sales['date'] = pd.to_datetime(df_sales['date'])

# Sorts the dataframe by date
df_sales = df_sales.sort_values("date")

## Data Transformation

In [10]:
# Plot a graph of sales per month to check whether the data is stationary
plot_data = [
    go.Scatter(
        x=df_sales['date'],
        y=df_sales['sales'],
    )
]
plot_layout = go.Layout(
        title='Montly Sales'
    )
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)

In [11]:
# As shown thoughout the gragh above the X varries over time, which means it is not stationary.

# Copies the main dataframe
df_diff = df_sales.copy()

# Creates a column containing the sales from the previous mounth
df_diff['prev_sales'] = df_diff['sales'].shift(1)

# Drops the null values
df_diff = df_diff.dropna()

# Creates a column with the different beetween the [actual month sold value -  previous month sold value]
df_diff['diff'] = (df_diff['sales'] - df_diff['prev_sales'])
df_diff.head(10)

Unnamed: 0,date,sales,prev_sales,diff
4,2013-02-01,459417,454904.0,4513.0
5,2013-03-01,617382,459417.0,157965.0
6,2013-04-01,682274,617382.0,64892.0
7,2013-05-01,763242,682274.0,80968.0
8,2013-06-01,795597,763242.0,32355.0
9,2013-07-01,855922,795597.0,60325.0
10,2013-08-01,766761,855922.0,-89161.0
11,2013-09-01,689907,766761.0,-76854.0
1,2013-10-01,656587,689907.0,-33320.0
2,2013-11-01,692643,656587.0,36056.0


In [12]:
# Plots sales diff to check whether X diff is stationary
plot_data = [
    go.Scatter(
        x=df_diff['date'],
        y=df_diff['diff'],
    )
]
plot_layout = go.Layout(
        title='Montly Sales Diff'
    )
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)

In [13]:
# Copies the dataFrame dropping the column prev_sales
df_supervised = df_diff.drop(['prev_sales'], axis=1)

# Adds the previous 12 months sold from the actual date
for inc in range(1, 13):
    field_name = 'lag_' + str(inc)
    df_supervised[field_name] = df_supervised['diff'].shift(inc)

# drops null values
df_supervised = df_supervised.dropna().reset_index(drop=True)    

In [14]:
df_supervised.head()

Unnamed: 0,date,sales,diff,lag_1,lag_2,lag_3,lag_4,lag_5,lag_6,lag_7,lag_8,lag_9,lag_10,lag_11,lag_12
0,2014-02-01,529117,3130.0,19380.0,-186036.0,36056.0,-33320.0,-76854.0,-89161.0,60325.0,32355.0,80968.0,64892.0,157965.0,4513.0
1,2014-03-01,704301,175184.0,3130.0,19380.0,-186036.0,36056.0,-33320.0,-76854.0,-89161.0,60325.0,32355.0,80968.0,64892.0,157965.0
2,2014-04-01,788914,84613.0,175184.0,3130.0,19380.0,-186036.0,36056.0,-33320.0,-76854.0,-89161.0,60325.0,32355.0,80968.0,64892.0
3,2014-05-01,882877,93963.0,84613.0,175184.0,3130.0,19380.0,-186036.0,36056.0,-33320.0,-76854.0,-89161.0,60325.0,32355.0,80968.0
4,2014-06-01,906842,23965.0,93963.0,84613.0,175184.0,3130.0,19380.0,-186036.0,36056.0,-33320.0,-76854.0,-89161.0,60325.0,32355.0


In [15]:
# Defines the regression formula
model = smf.ols(formula='diff ~ lag_1 + lag_2 + lag_3 + lag_4 + lag_5 + lag_6 + lag_7 + lag_8 + lag_9 + lag_10 + lag_11 + lag_12' , data=df_supervised)

# Fit the regression
model_fit = model.fit()

regression_adj_raq = model_fit.rsquared_adj

print("The model has a prediction of " + str(regression_adj_raq) + " according to the adjusted R-squared")

The model has a prediction of 0.9795722233296558 according to the adjusted R-squared


In [16]:
# Drops columns sales and date 
df_model = df_supervised.drop(['sales','date'], axis=1)

# Splits train and test, which test data set contains the last 6 months’ sales
train_set, test_set = df_model[0:-6].values, df_model[-6:].values

In [17]:
# Applies Min Max Scaler in train dataSet
scaler = MinMaxScaler(feature_range=(-1, 1))
scaler = scaler.fit(train_set)

# Reshapes training set
train_set = train_set.reshape(train_set.shape[0], train_set.shape[1])
train_set_scaled = scaler.transform(train_set)

# Reshapes test set
test_set = test_set.reshape(test_set.shape[0], test_set.shape[1])
test_set_scaled = scaler.transform(test_set)

### Building the LSTM model

In [18]:
# Applies the shape defined above [-1 to 1] in the Train model
X_train, y_train = train_set_scaled[:, 1:], train_set_scaled[:, 0:1]
X_train = X_train.reshape(X_train.shape[0], 1, X_train.shape[1])

# Applies the shape defined above [-1 to 1] in the Test model
X_test, y_test = test_set_scaled[:, 1:], test_set_scaled[:, 0:1]
X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])

In [19]:
# Creates a sequencial model
model = Sequential()

# It is a LSTM RNN model
model.add(LSTM(4, batch_input_shape=(1, X_train.shape[1], X_train.shape[2]), stateful=True))
model.add(Dense(1))

# Calculates the prediction error by mean_squared_error
model.compile(loss='mean_squared_error', optimizer='adam')

# Trains the model
model.fit(X_train, y_train, epochs=100, batch_size=1, verbose=0, shuffle=False)

<keras.callbacks.History at 0x2476ed01640>

In [20]:
# Predicts the sales
y_pred = model.predict(X_test, batch_size=1)

# Reshapes prediction sales[ y_pred ] from 0 to 1 
y_pred = y_pred.reshape(y_pred.shape[0], 1, y_pred.shape[1])

# Rebuild test set for inverse transform
pred_test_set=[]
for index in range(0,len(y_pred)):
    #print(np.concatenate([y_pred[index], X_test[index]], axis=1))
    pred_test_set.append(np.concatenate([y_pred[index], X_test[index]], axis=1))

# Reshapes pred_test_set
pred_test_set = np.array(pred_test_set)
pred_test_set = pred_test_set.reshape(pred_test_set.shape[0], pred_test_set.shape[2])

# Inverses transform, which means get the original data scaling
pred_test_set_inverted = scaler.inverse_transform(pred_test_set)

In [21]:
# Create dataframe that shows the predicted sales
result_list = []

# Gets the sales' date
sales_dates = list(df_sales[-7:].date)

# Gets the sales' value
act_sales = list(df_sales[-7:].sales)

# From all sales
for index in range(0, len(pred_test_set_inverted)):
    result_dict = {}

    # Gets the prediction value of the actual index
    result_dict['pred_value'] = int(pred_test_set_inverted[index][0] + act_sales[index])

    # Gets the prediction date of the actual index
    result_dict['date'] = sales_dates[index+1]

    # Inserts in a list
    result_list.append(result_dict)

df_result = pd.DataFrame(result_list)

df_result

Unnamed: 0,pred_value,date
0,1188254,2017-07-01
1,1021266,2017-08-01
2,928635,2017-09-01
3,913077,2017-10-01
4,918851,2017-11-01
5,688778,2017-12-01


In [22]:
# Merges with actual sales dataframe

df_sales_pred = pd.merge(df_sales, df_result, on='date', how='left')

# Plots actual and predicted values
plot_data = [
    go.Scatter(
        x=df_sales_pred['date'],
        y=df_sales_pred['sales'],
        name='actual'
    ),
        go.Scatter(
        x=df_sales_pred['date'],
        y=df_sales_pred['pred_value'],
        name='predicted'
    )
    
]
plot_layout = go.Layout(
        title='Sales Prediction'
    )
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)