In [1]:
# SARIMA order selection (via auto_arima) and forecasting of monthly sales counts
import os
import time
from math import sqrt
from multiprocessing import cpu_count
from warnings import catch_warnings
from warnings import filterwarnings

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.statespace.sarimax import SARIMAX

In [2]:
from sqlalchemy import create_engine

# The connection URL embeds the database user and password, so it is read from
# the environment instead of being stored in the notebook. Set it before
# starting the kernel, e.g.
#   export ISS_DB_URL='mysql+pymysql://<user>:<password>@<host>:3306/iss_project'
# A missing variable raises KeyError('ISS_DB_URL') here rather than failing
# later with an opaque connection error.
engine = create_engine(os.environ['ISS_DB_URL'])

In [3]:
# Read the top-20 order lines from the database and turn them into a
# month x SKU table of order-line counts.
df = pd.read_sql_table('orderproducts_top20', engine)

# One row per order date, one column per SKU, each cell counting the rows seen.
sku_counts_by_date = pd.crosstab(index=df['order_date'], columns=df['product_sku'])

# Aggregate to month-end buckets, then keep Jan 2018 .. Mar 2021 only.
prod_monthly = sku_counts_by_date.resample('M').sum().loc['2018-01':'2021-03']
items = prod_monthly.columns

**Use auto_arima to find the best model, since it is much faster than a statsmodels grid search**

In [4]:
from pmdarima.arima import auto_arima

# Trial run on a single SKU: 2018-2020 monthly counts of 'EFX-FLY-BLK'.
temp = prod_monthly.loc['2018':'2020', 'EFX-FLY-BLK'].to_numpy()

# Bounds handed to auto_arima: non-seasonal p/q start at 1 and are capped at 3,
# seasonal P/Q start at 0, both differencing orders are fixed at 1 and the
# seasonal period is 12 (monthly data).
search_space = dict(
    start_p=1, start_q=1,
    max_p=3, max_q=3,
    start_P=0, start_Q=0,
    d=1, D=1,
    seasonal=True, m=12,
)

# Select the best model using auto_arima; trace=True prints every candidate tried.
stepwise_model = auto_arima(
    temp,
    **search_space,
    trace=True,
    error_action='ignore',
    suppress_warnings=True,
    stepwise=True,
)
print('aic =', stepwise_model.aic())

Performing stepwise search to minimize aic
 ARIMA(1,1,1)(0,1,0)[12]             : AIC=165.938, Time=0.04 sec
 ARIMA(0,1,0)(0,1,0)[12]             : AIC=166.319, Time=0.01 sec
 ARIMA(1,1,0)(1,1,0)[12]             : AIC=165.435, Time=0.04 sec
 ARIMA(0,1,1)(0,1,1)[12]             : AIC=165.164, Time=0.06 sec
 ARIMA(0,1,1)(0,1,0)[12]             : AIC=163.954, Time=0.02 sec
 ARIMA(0,1,1)(1,1,0)[12]             : AIC=165.164, Time=0.05 sec
 ARIMA(0,1,1)(1,1,1)[12]             : AIC=167.164, Time=0.07 sec
 ARIMA(0,1,2)(0,1,0)[12]             : AIC=165.869, Time=0.03 sec
 ARIMA(1,1,0)(0,1,0)[12]             : AIC=164.449, Time=0.02 sec
 ARIMA(1,1,2)(0,1,0)[12]             : AIC=inf, Time=0.08 sec
 ARIMA(0,1,1)(0,1,0)[12] intercept   : AIC=165.952, Time=0.03 sec

Best model:  ARIMA(0,1,1)(0,1,0)[12]          
Total fit time: 0.452 seconds
aic = 163.95356575099117


#### Forecast the last 3 months of sales for the top 20 items

In [9]:
# Back-test per item: hold out the last HORIZON months, let auto_arima pick the
# (p,d,q)(P,D,Q,m) orders on the remaining history, refit that specification
# with statsmodels SARIMAX, and compare the forecast with the held-out months.
# items are the names of the top20 items
HORIZON = 3          # number of trailing months held out and forecast
trend = 'c'          # trend term passed to SARIMAX
step_labels = [f'm+{k}' for k in range(1, HORIZON + 1)]   # 'm+1', 'm+2', ...
test_predict = []    # not populated in this cell
item_frames = []     # one small result frame per item, concatenated once below

for item in items:
    data = prod_monthly[item]
    train = data.iloc[:-HORIZON]
    test  = data.iloc[-HORIZON:]

    stepwise_model = auto_arima(train, start_p=1, start_q=1,
                           max_p=2, max_q=2, m=12,
                           start_P=0, seasonal=True,
                           d=1, D=1, trace=False,
                           error_action='ignore',
                           suppress_warnings=True,
                           stepwise=True)

    order, sorder = stepwise_model.order, stepwise_model.seasonal_order

    model = SARIMAX(train, order=order, seasonal_order=sorder,
                            trend=trend, enforce_stationarity=False, enforce_invertibility=False)
    model_fit = model.fit(disp=False)

    # predict(start, end) treats `end` as inclusive, so asking for
    # len(train)..len(train)+HORIZON would yield HORIZON+1 values;
    # forecast(steps=...) returns exactly HORIZON out-of-sample steps.
    forecast = model_fit.forecast(steps=HORIZON)

    # Clip negative forecasts to zero and round to whole units.
    adj_forecast = [0 if x < 0 else int(round(x)) for x in forecast]

    rmse = round(np.sqrt(mean_squared_error(test, adj_forecast)), 2)
    # 'abs(diff)' is the absolute value of the *summed* signed errors, i.e. the
    # net miss over the whole horizon (over- and under-forecasts cancel out).
    diff = abs((test.to_numpy() - np.asarray(adj_forecast)).sum())

    res = pd.DataFrame(
        {'item': item, 'test': test.to_numpy(), 'predict': adj_forecast},
        index=step_labels,
    )
    # The per-item metrics are reported on the first row only; the remaining
    # rows carry empty strings so the table layout matches the exported files.
    blanks = [''] * (HORIZON - 1)
    res['rmse'] = [rmse] + blanks
    res['abs(diff)'] = [diff] + blanks
    item_frames.append(res)

# Concatenate once instead of growing a DataFrame inside the loop.
results = pd.concat(item_frames, axis=0) if item_frames else pd.DataFrame()
print(results)



                item  test  predict  rmse abs(diff)
m+1      EFX-FLY-BLK     2        0  3.83        10
m+2      EFX-FLY-BLK     2        0                
m+3      EFX-FLY-BLK     6        0                
m+1       M80-2B-BLK     1        4  4.24        12
m+2       M80-2B-BLK     1        7                
m+3       M80-2B-BLK     2        5                
m+1       M80-2G-BLK     5        3  2.71         2
m+2       M80-2G-BLK     4        7                
m+3       M80-2G-BLK     9        6                
m+1       M80-AC-BLK     0        4  2.65         5
m+2       M80-AC-BLK     0        2                
m+3       M80-AC-BLK     3        2                
m+1       M80-AD-BLK     3        6     3         9
m+2       M80-AD-BLK     1        4                
m+3       M80-AD-BLK     0        3                
m+1    M80-BTY-BLK-L     3        0  2.16         6
m+2    M80-BTY-BLK-L     1        0                
m+3    M80-BTY-BLK-L     2        0                
m+1    M80-B

In [10]:
# Save the hold-out forecasts table next to the other processed data files.
forecast_csv_path = '../data-processed/top20forecasts_SARIMA_01to03.csv'
results.to_csv(forecast_csv_path)

#### Writing to database

# Move the step labels from the index into a regular 'month' column so they are
# stored as data. The guard keeps the cell idempotent: without it, re-running
# the cell would reset the index again and add a second 'month' column.
if 'month' not in results.columns:
    results = results.reset_index().rename(columns={'index': 'month'})

# Replace the whole table on every run; the frame's index is not written.
results.to_sql(name='top20forecasts_SARIMA', con=engine, if_exists='replace', index=False)