In [1]:
# select SARIMA orders with auto_arima and back-test 3-month forecasts of monthly sales counts
import time
import pandas as pd
from math import sqrt
from multiprocessing import cpu_count
from warnings import catch_warnings
from warnings import filterwarnings
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error

In [2]:
# Load the order lines of the top-20 products and aggregate them into a
# month-by-SKU table.
# Column 1 of the CSV is parsed as a datetime. The `infer_datetime_format`
# flag is intentionally not passed: it is deprecated since pandas 2.0, where
# consistent-format inference is already the default behaviour.
df = pd.read_csv('../data-processed/orderproducts_top20.csv', parse_dates=[1])

# crosstab counts the rows for every (order_date, product_sku) pair;
# resample('M').sum() then adds those counts up per calendar month.
prod_monthly = pd.crosstab(df['order_date'], df['product_sku']).resample('M').sum()

# Keep only January 2018 through March 2021 (label-based row slice).
prod_monthly = prod_monthly.loc['2018-01':'2021-03']

# One SKU label per column of the monthly table.
items = prod_monthly.columns.values

**Use auto_arima to find the best model, since it is much faster than a statsmodels grid search**

In [3]:
from pmdarima.arima import auto_arima

# Monthly series of one SKU, restricted to calendar years 2018 through 2020.
data = prod_monthly.loc['2018':'2020', 'EFX-FLY-BLK'].to_numpy()

# Stepwise search for the seasonal ARIMA orders with the lowest AIC.
# Differencing is fixed (d=1, D=1) with a 12-period season; trace=True prints
# every candidate model that is tried.
stepwise_model = auto_arima(
    data,
    seasonal=True, m=12,
    d=1, D=1,
    start_p=1, max_p=3,
    start_q=1, max_q=3,
    start_P=0, start_Q=0,
    stepwise=True,
    trace=True,
    error_action='ignore',
    suppress_warnings=True,
)
print('aic =', stepwise_model.aic())

Performing stepwise search to minimize aic
 ARIMA(1,1,1)(0,1,0)[12]             : AIC=165.938, Time=0.07 sec
 ARIMA(0,1,0)(0,1,0)[12]             : AIC=166.319, Time=0.02 sec
 ARIMA(1,1,0)(1,1,0)[12]             : AIC=165.435, Time=0.09 sec
 ARIMA(0,1,1)(0,1,1)[12]             : AIC=165.164, Time=0.15 sec
 ARIMA(0,1,1)(0,1,0)[12]             : AIC=163.954, Time=0.03 sec
 ARIMA(0,1,1)(1,1,0)[12]             : AIC=165.164, Time=0.10 sec
 ARIMA(0,1,1)(1,1,1)[12]             : AIC=167.164, Time=0.14 sec
 ARIMA(0,1,2)(0,1,0)[12]             : AIC=165.869, Time=0.08 sec
 ARIMA(1,1,0)(0,1,0)[12]             : AIC=164.449, Time=0.03 sec
 ARIMA(1,1,2)(0,1,0)[12]             : AIC=inf, Time=0.18 sec
 ARIMA(0,1,1)(0,1,0)[12] intercept   : AIC=165.952, Time=0.05 sec

Best model:  ARIMA(0,1,1)(0,1,0)[12]          
Total fit time: 0.929 seconds
aic = 163.95356575099117


#### Forecast the last 3 months of sales for the top 20 items

In [4]:
# Back-test every item: hold out the last HORIZON months, fit a SARIMA model
# on the remaining history and compare its forecast with the held-out values.
#
# Relies on names created by earlier cells: `prod_monthly` and `items` (load
# cell), `auto_arima` (model-selection cell), and `SARIMAX`,
# `mean_squared_error`, `sqrt` (import cell).
HORIZON = 3   # number of trailing months held out and forecast
trend = 'c'   # trend argument handed to SARIMAX
step_labels = [f'm+{step}' for step in range(1, HORIZON + 1)]

item_frames = []
for item in items:
    data = prod_monthly[item]
    train = data.iloc[:-HORIZON]
    test = data.iloc[-HORIZON:]

    # Let auto_arima pick the non-seasonal and seasonal orders on the
    # training window only, so the held-out months never influence selection.
    stepwise_model = auto_arima(train, start_p=1, start_q=1,
                                max_p=2, max_q=2, m=12,
                                start_P=0, seasonal=True,
                                d=1, D=1, trace=False,
                                error_action='ignore',
                                suppress_warnings=True,
                                stepwise=True)
    order, sorder = stepwise_model.order, stepwise_model.seasonal_order

    # Refit with statsmodels using the selected orders.
    model = SARIMAX(train, order=order, seasonal_order=sorder,
                    trend=trend, enforce_stationarity=False,
                    enforce_invertibility=False)
    model_fit = model.fit(disp=False)

    # forecast(steps=HORIZON) yields exactly HORIZON out-of-sample values.
    # The previous predict(len(train), len(train)+3) call was end-inclusive
    # and produced one value too many.
    forecast = model_fit.forecast(steps=HORIZON)

    # Negative predictions are clipped to zero; the rest are rounded to ints.
    adj_forecast = [max(0, int(round(x))) for x in forecast]

    # math.sqrt is used because numpy is never imported in this notebook.
    rmse = round(sqrt(mean_squared_error(test, adj_forecast)), 2)

    # One row per forecast step; the item-level RMSE is stored on the first
    # row only and the remaining rows carry the 'NA' placeholder.
    res = pd.DataFrame(
        {
            'item': item,
            'test': test.to_numpy(),
            'predict': adj_forecast,
            'rmse': [rmse] + ['NA'] * (HORIZON - 1),
        },
        index=step_labels,
    )
    item_frames.append(res)

# Concatenate once at the end instead of growing a frame inside the loop.
results = pd.concat(item_frames, axis=0) if item_frames else pd.DataFrame()
print(results)



                item  test  predict  rmse
m+1      EFX-FLY-BLK     2        0  3.32
m+2      EFX-FLY-BLK     2        0    NA
m+3      EFX-FLY-BLK     5        0    NA
m+1       M80-2B-BLK     1        4  4.51
m+2       M80-2B-BLK     1        7    NA
m+3       M80-2B-BLK     1        5    NA
m+1       M80-2G-BLK     5        3  2.16
m+2       M80-2G-BLK     4        7    NA
m+3       M80-2G-BLK     5        6    NA
m+1       M80-AC-BLK     0        4  2.65
m+2       M80-AC-BLK     0        2    NA
m+3       M80-AC-BLK     1        2    NA
m+1       M80-AD-BLK     3        6     3
m+2       M80-AD-BLK     1        4    NA
m+3       M80-AD-BLK     0        3    NA
m+1    M80-BTY-BLK-L     3        0  1.91
m+2    M80-BTY-BLK-L     1        0    NA
m+3    M80-BTY-BLK-L     1        0    NA
m+1    M80-BTY-BLK-S     0        0  1.29
m+2    M80-BTY-BLK-S     2        0    NA
m+3    M80-BTY-BLK-S     1        0    NA
m+1       M80-EB-BLK     3        0  1.91
m+2       M80-EB-BLK     1        

In [5]:
# Write the `results` table (index included) to the processed-data folder.
results.to_csv(path_or_buf='../data-processed/top20forecasts_SARIMA_01to03.csv')