In [3]:
import pandas as pd
import math as m
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from sklearn.model_selection import train_test_split

from time import time
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.stattools import acf
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [4]:
import pmdarima as pm

"""To display up to 50 columns of dataset"""
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 8)

"""To display all outputs of each cell"""
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


np.random.seed(100)

"""To get a timing of each function decorated"""
def timing(f):
    """
    Decorator for timing functions
    Usage:
    @timing
    def function(&):
        pass
    """
    
    def wrapper(*args, **kwargs):
        start = time()
        result = f(*args, **kwargs)
        end = time()
        print('function:%r took: %2.2f sec' % (f.__name__, end - start))
        return result
    return wrapper

class Trainer():
    def __init__(self, input_directory, df_): #input_directory = path to the folder containing df. 
        self.input_directory = input_directory
        self.df = df_
                
    @timing
    def read(self):
        """Read raw data"""
        self.df = pd.read_csv(self.input_directory + self.df)
    
    @timing
    def auto_arima(self, nb_store):
        #Date as index
        self.df = self.df[self.df["Open"] == 1]
        self.df = pd.DataFrame(self.df, columns=['Date','Store','Sales'])
        self.df.sort_values(['Store', 'Date'], ascending=[True, True], inplace=True)
        #Re_indexation 
        self.df = self.df.reset_index(drop='True')
        
        """put the date as index"""
        self.df["Date"] = pd.to_datetime(self.df["Date"].astype(str), format = '%Y/%m/%d')
        self.df = self.df.set_index("Date")
        
        self.subset = self.df[(self.df["Store"]==(nb_store))]
        self.subset = self.subset.drop(["Store"], axis = 1)

        # Split train and test set  
        self.train = self.subset.loc[:'2015-06-13', :]
        self.test = self.subset.loc['2015-06-14':, :]

        # Build Model
        self.model = pm.auto_arima(self.subset.Sales, start_p=1, start_q=1,
                          test='adf',       # use adftest to find optimal 'd'
                          max_p=3, max_q=3, # maximum p and q
                          m=12,              # frequency of series
                          d=0,           # let model determine 'd'
                          seasonal=True,   # No Seasonality
                          start_P=1, max_P=2,
                          D=1, 
                          trace=True,
                          error_action='ignore',  
                          suppress_warnings=True, 
                          stepwise=True)
        
        # Forecast
        self.n_periods = len(self.test)
        self.fc, self.confint = self.model.predict(n_periods=self.n_periods, return_conf_int=True)

        self.fc_series = pd.DataFrame(self.fc)
        self.fc_series.columns = ['predicted_SARIMA']
        self.fc_series = self.fc_series.reset_index(drop=True)
        self.fc_series = self.fc_series.set_index(self.test.index)
        self.test.columns = ['Sales']

        # Validate
        self.errors = abs(self.fc_series['predicted_SARIMA'] - self.test['Sales'])
        self.mae = round(np.mean(self.errors), 2)
        self.rmse = round(m.sqrt((1/len(self.test))*sum(self.errors**2)), 2)

        self.result = pd.merge(self.test, self.fc_series ,how = 'left',left_index = True, right_index = True)
        self.result["error_SARIMA"] = abs(self.result['predicted_SARIMA'] - self.result['Sales'])

        return self.result

In [5]:
project = Trainer("/Users/rfdanti/Documents/UoM/2_data_analytics/coursework/CODE/", "full_trainset_v2.csv")
project.read()
result_403 = project.auto_arima(403)
result_403 = result_403.reset_index(drop=False)
result_403.head()

function:'read' took: 2.13 sec
Fit ARIMA: order=(1, 0, 1) seasonal_order=(1, 1, 1, 12); AIC=13490.636, BIC=13518.491, Fit time=3.543 seconds
Fit ARIMA: order=(0, 0, 0) seasonal_order=(0, 1, 0, 12); AIC=13732.638, BIC=13741.923, Fit time=0.054 seconds
Fit ARIMA: order=(1, 0, 0) seasonal_order=(1, 1, 0, 12); AIC=13611.178, BIC=13629.748, Fit time=0.374 seconds
Fit ARIMA: order=(0, 0, 1) seasonal_order=(0, 1, 1, 12); AIC=13556.956, BIC=13575.526, Fit time=1.297 seconds
Fit ARIMA: order=(1, 0, 1) seasonal_order=(0, 1, 1, 12); AIC=13519.469, BIC=13542.681, Fit time=2.516 seconds
Fit ARIMA: order=(1, 0, 1) seasonal_order=(2, 1, 1, 12); AIC=13490.421, BIC=13522.918, Fit time=8.464 seconds
Fit ARIMA: order=(1, 0, 1) seasonal_order=(2, 1, 0, 12); AIC=13551.976, BIC=13579.831, Fit time=4.419 seconds
Fit ARIMA: order=(1, 0, 1) seasonal_order=(2, 1, 2, 12); AIC=13516.953, BIC=13554.093, Fit time=10.479 seconds
Fit ARIMA: order=(1, 0, 1) seasonal_order=(1, 1, 0, 12); AIC=13575.733, BIC=13598.945, F

Unnamed: 0,Date,Sales,predicted_SARIMA,error_SARIMA
0,2015-06-15,12441,6128.627049,6312.372951
1,2015-06-16,9790,8065.863294,1724.136706
2,2015-06-17,9176,7331.771196,1844.228804
3,2015-06-18,8040,7071.658547,968.341453
4,2015-06-19,7619,7175.703665,443.296335


In [12]:
project = Trainer("/Users/rfdanti/Documents/UoM/2_data_analytics/coursework/CODE/", "full_trainset_v2.csv")
project.read()
result_351 = project.auto_arima(351)
result_351 = result_351.reset_index(drop=False)
result_351.head()

function:'read' took: 2.25 sec
Fit ARIMA: order=(1, 0, 1) seasonal_order=(0, 0, 1, 12); AIC=13350.027, BIC=13373.317, Fit time=0.312 seconds
Fit ARIMA: order=(0, 0, 0) seasonal_order=(0, 0, 0, 12); AIC=13667.225, BIC=13676.541, Fit time=0.020 seconds
Fit ARIMA: order=(1, 0, 0) seasonal_order=(1, 0, 0, 12); AIC=13354.029, BIC=13372.661, Fit time=0.678 seconds
Fit ARIMA: order=(0, 0, 1) seasonal_order=(0, 0, 1, 12); AIC=13417.566, BIC=13436.198, Fit time=0.584 seconds
Fit ARIMA: order=(1, 0, 1) seasonal_order=(1, 0, 1, 12); AIC=13338.380, BIC=13366.328, Fit time=2.794 seconds
Fit ARIMA: order=(1, 0, 1) seasonal_order=(1, 0, 0, 12); AIC=13336.488, BIC=13359.778, Fit time=1.326 seconds
Fit ARIMA: order=(1, 0, 1) seasonal_order=(2, 0, 1, 12); AIC=13333.431, BIC=13366.037, Fit time=7.837 seconds
Fit ARIMA: order=(0, 0, 1) seasonal_order=(2, 0, 1, 12); AIC=13393.261, BIC=13421.209, Fit time=3.913 seconds
Fit ARIMA: order=(2, 0, 1) seasonal_order=(2, 0, 1, 12); AIC=13287.346, BIC=13324.610, Fi

Unnamed: 0,Date,Sales,predicted_SARIMA,error_SARIMA
0,2015-06-15,9850,6715.093758,3134.906242
1,2015-06-16,7449,6348.952442,1100.047558
2,2015-06-17,6829,5553.533569,1275.466431
3,2015-06-18,6628,5498.469978,1129.530022
4,2015-06-19,6716,5489.072939,1226.927061


In [13]:
result_351.to_csv("SARIMA_store351_6weeks.csv", index=False)

In [14]:
project = Trainer("/Users/rfdanti/Documents/UoM/2_data_analytics/coursework/CODE/", "full_trainset_v2.csv")
project.read()
result_279 = project.auto_arima(279)
result_279 = result_279.reset_index(drop=False)
result_279.head()

function:'read' took: 2.66 sec
Fit ARIMA: order=(1, 0, 1) seasonal_order=(0, 0, 1, 12); AIC=14260.924, BIC=14284.208, Fit time=0.344 seconds
Fit ARIMA: order=(0, 0, 0) seasonal_order=(0, 0, 0, 12); AIC=14635.887, BIC=14645.201, Fit time=0.021 seconds
Fit ARIMA: order=(1, 0, 0) seasonal_order=(1, 0, 0, 12); AIC=14245.976, BIC=14264.603, Fit time=0.574 seconds
Fit ARIMA: order=(0, 0, 1) seasonal_order=(0, 0, 1, 12); AIC=14307.935, BIC=14326.562, Fit time=0.556 seconds
Fit ARIMA: order=(1, 0, 0) seasonal_order=(0, 0, 0, 12); AIC=14343.359, BIC=14357.329, Fit time=0.154 seconds
Fit ARIMA: order=(1, 0, 0) seasonal_order=(2, 0, 0, 12); AIC=14244.064, BIC=14267.348, Fit time=3.232 seconds
Fit ARIMA: order=(1, 0, 0) seasonal_order=(2, 0, 1, 12); AIC=14215.690, BIC=14243.631, Fit time=3.489 seconds
Fit ARIMA: order=(0, 0, 0) seasonal_order=(2, 0, 1, 12); AIC=14435.978, BIC=14459.262, Fit time=3.095 seconds
Fit ARIMA: order=(2, 0, 0) seasonal_order=(2, 0, 1, 12); AIC=14217.800, BIC=14250.397, Fi

Unnamed: 0,Date,Sales,predicted_SARIMA,error_SARIMA
0,2015-06-15,19655,10799.676032,8855.323968
1,2015-06-16,11692,9620.93981,2071.06019
2,2015-06-17,12551,8422.759637,4128.240363
3,2015-06-18,10696,8126.53138,2569.46862
4,2015-06-19,10873,8877.243697,1995.756303


In [15]:
result_279.to_csv("SARIMA_store279_6weeks.csv", index=False)

In [16]:
project = Trainer("/Users/rfdanti/Documents/UoM/2_data_analytics/coursework/CODE/", "full_trainset_v2.csv")
project.read()
result_733 = project.auto_arima(733)
result_733 = result_733.reset_index(drop=False)
result_733.head()

function:'read' took: 2.68 sec
Fit ARIMA: order=(1, 0, 1) seasonal_order=(0, 0, 1, 12); AIC=16483.942, BIC=16508.182, Fit time=0.534 seconds
Fit ARIMA: order=(0, 0, 0) seasonal_order=(0, 0, 0, 12); AIC=16837.285, BIC=16846.981, Fit time=0.028 seconds
Fit ARIMA: order=(1, 0, 0) seasonal_order=(1, 0, 0, 12); AIC=16503.880, BIC=16523.272, Fit time=0.829 seconds
Fit ARIMA: order=(0, 0, 1) seasonal_order=(0, 0, 1, 12); AIC=16598.138, BIC=16617.530, Fit time=0.939 seconds
Fit ARIMA: order=(1, 0, 1) seasonal_order=(1, 0, 1, 12); AIC=16755.885, BIC=16784.973, Fit time=2.722 seconds
Fit ARIMA: order=(1, 0, 1) seasonal_order=(0, 0, 0, 12); AIC=16481.977, BIC=16501.369, Fit time=0.200 seconds
Fit ARIMA: order=(0, 0, 1) seasonal_order=(0, 0, 0, 12); AIC=16608.055, BIC=16622.599, Fit time=0.324 seconds
Fit ARIMA: order=(2, 0, 1) seasonal_order=(0, 0, 0, 12); AIC=16479.084, BIC=16503.324, Fit time=0.296 seconds
Fit ARIMA: order=(2, 0, 0) seasonal_order=(0, 0, 0, 12); AIC=16480.343, BIC=16499.735, Fi

Unnamed: 0,Date,Sales,predicted_SARIMA,error_SARIMA
0,2015-06-14,16452,14656.415472,1795.584528
1,2015-06-15,16710,15189.754797,1520.245203
2,2015-06-16,16007,15485.621578,521.378422
3,2015-06-17,15036,14265.267545,770.732455
4,2015-06-18,15269,14428.190578,840.809422


In [17]:
result_733.to_csv("SARIMA_store733_6weeks.csv", index=False)

In [2]:
trainset = pd.read_csv("full_trainset_v2.csv")

#Date as index
df = trainset[trainset["Open"] == 1]
df = pd.DataFrame(df, columns=['Date','Store','Sales'])
df.sort_values(['Store', 'Date'], ascending=[True, True], inplace=True)
df = df.reset_index(drop=True)
df = df.set_index('Date')

In [None]:
import pmdarima as pm
dt = time()
mae_list = []
rmse_list = []
for i in range(len(df["Store"].unique())):
    try:
        # Filter by Store
        subset = df[(df["Store"]==(i+1))]
        subset = subset.drop(["Store"], axis = 1)

        # Split train and test set  
        train = subset.loc[:'2015-06-13', :]
        test = subset.loc['2015-06-14':, :]

        # Build Model
        model = pm.auto_arima(subset.Sales, start_p=2, start_q=0,
                      test='adf',       # use adftest to find optimal 'd'
                      max_p=3, max_q=2, # maximum p and q
                      m=12,              # frequency of series
                      d=None,           # let model determine 'd'
                      seasonal=True,   # Seasonality
                      start_P=1, max_P=2, Q=1,
                      D=1, 
                      trace=True,
                      error_action='ignore',  
                      suppress_warnings=True, 
                      stepwise=True)
        # Forecast
        n_periods = len(test)
        fc, confint = model.predict(n_periods=n_periods, return_conf_int=True)
        
        fc_series = pd.DataFrame(fc)
        fc_series.columns = ['Sales']
        fc_series = fc_series.reset_index(drop=True)
        fc_series = fc_series.set_index(test.index)
        test.columns = ['Sales']

        # Validate
        errors = abs(fc_series['Sales'] - test['Sales'])
        mae = round(np.mean(errors), 2)
        rmse = round(m.sqrt((1/len(test))*sum(errors**2)), 2)

        mae_list.append(mae)
        rmse_list.append(rmse)
    
    except:
        mae_list.append(np.nan)
        rmse_list.append(np.nan)
        pass

duration = time() - dt
print(duration)
print(rmse_list)



Fit ARIMA: order=(2, 0, 0) seasonal_order=(1, 1, 1, 12); AIC=12356.254, BIC=12384.125, Fit time=4.021 seconds
Fit ARIMA: order=(0, 0, 0) seasonal_order=(0, 1, 0, 12); AIC=13049.551, BIC=13058.842, Fit time=0.056 seconds
Fit ARIMA: order=(1, 0, 0) seasonal_order=(1, 1, 0, 12); AIC=12509.540, BIC=12528.120, Fit time=1.234 seconds
Fit ARIMA: order=(0, 0, 1) seasonal_order=(0, 1, 1, 12); AIC=12552.645, BIC=12571.226, Fit time=2.472 seconds
Fit ARIMA: order=(2, 0, 0) seasonal_order=(0, 1, 1, 12); AIC=12363.684, BIC=12386.909, Fit time=2.867 seconds
Fit ARIMA: order=(2, 0, 0) seasonal_order=(2, 1, 1, 12); AIC=12349.369, BIC=12381.885, Fit time=13.848 seconds
Fit ARIMA: order=(2, 0, 0) seasonal_order=(2, 1, 0, 12); AIC=12454.375, BIC=12482.246, Fit time=6.350 seconds
Fit ARIMA: order=(2, 0, 0) seasonal_order=(2, 1, 2, 12); AIC=12363.857, BIC=12401.017, Fit time=17.232 seconds
Fit ARIMA: order=(2, 0, 0) seasonal_order=(1, 1, 0, 12); AIC=12499.872, BIC=12523.097, Fit time=1.594 seconds
Fit ARIM

In [None]:
rmse_list = pd.DataFrame(rmse_list, columns=["RMSE"])
mae_list = pd.DataFrame(mae_list, columns=["MAE"])

store = list(range(1,1116))
store_series = pd.DataFrame(store, columns=["Store"])
result = pd.merge(store_series, rmse_list, how = 'left',left_index = True, right_index = True)
result = pd.merge(result, mae_list, how = 'left',left_index = True, right_index = True)
result.head()

In [None]:
result.RMSE.mean()

In [None]:
result.RMSE.median()

In [None]:
result.to_csv("arima_result_6weeks.csv", index=False)