In [None]:
'''
Author: Patrick Rudolph
Date: 1/8/20
Description: basic time series models: SEM, Holt, SARIMA
'''

In [None]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.tsa.stattools as ts
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf, month_plot, quarter_plot
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller

# Data directory: taken from the SALES_DATA_DIR environment variable when it is
# set, otherwise the original default location. os.path.join(..., '') guarantees
# a trailing separator, because the file name is appended with plain `+`.
filepath = os.path.join(os.environ.get('SALES_DATA_DIR', '/data2/users/prudolph/ids/'), '')

# default figure size passed as `figsize` to the plotting calls
plotsize = (13, 5)

In [None]:
# load the raw sales extract from the data directory
sales = pd.read_csv(f'{filepath}sales_053967_9490.csv')

In [None]:
# keep only the columns needed for the series; discard the identifiers and raw units
sales = sales.drop(['STORE_ID', 'ARTICLE_ID', 'UNITS'], axis='columns')

In [None]:
# give the month-end date column a shorter name
sales = sales.rename(columns={'MONTH_END_DATE': 'DATE'})

In [None]:
# parse the DATE column into pandas datetimes
sales['DATE'] = sales['DATE'].pipe(pd.to_datetime)

In [None]:
# preview the first rows after cleaning
sales.head(n=5)

In [None]:
# number of distinct dates in the series (missing values counted as one value)
sales['DATE'].nunique(dropna=False)

In [None]:
# missing values per column
sales.isna().sum()

In [None]:
# index the frame by date so it behaves as a time series
sales = sales.set_index(keys='DATE')

In [None]:
# preview the date-indexed frame
sales.head(n=5)

In [None]:
# line plot of the full series
sales.plot.line(figsize=plotsize);

In [None]:
# autocorrelation and partial autocorrelation up to 12 lags
plot_acf(x=sales.loc[:, 'UNITS_DAY'], lags=12);
plot_pacf(x=sales.loc[:, 'UNITS_DAY'], lags=12);

In [None]:
# seasonal sub-series plot, one group per calendar month
month_plot(x=sales);

In [None]:
# violin plot: distribution of UNITS_DAY for each calendar month
sns.violinplot(x=sales.index.month, y=sales['UNITS_DAY'])
# Pass the flag positionally: the old `b=` keyword was renamed to `visible=` in
# newer matplotlib and later removed, while the positional form works on both.
plt.grid(True);

In [None]:
# copy the first column out as a plain NumPy array
sales_array = sales.iloc[:, 0].to_numpy(copy=True)

In [None]:
# Additive decomposition with a seasonal period of 12 observations.
# `period=` replaces the deprecated `freq=` keyword, which current statsmodels
# releases no longer accept. It must be given explicitly here because a plain
# array carries no date index to infer it from.
ss_decomposition = seasonal_decompose(x=sales_array, model='additive', period=12)
estimated_trend = ss_decomposition.trend
estimated_seasonal = ss_decomposition.seasonal
estimated_residual = ss_decomposition.resid

In [None]:
# decomposition plot: original series and its components in four stacked panels
fig, axes = plt.subplots(nrows=4, ncols=1, sharex=True, sharey=False)
fig.set_size_inches(15, 10)

panels = [
    (sales_array, 'Original'),
    (estimated_trend, 'Trend'),
    (estimated_seasonal, 'Seasonality'),
    (estimated_residual, 'Residuals'),
]

# one panel per component, each with its own legend in the upper-left corner
for panel_ax, (component, caption) in zip(axes, panels):
    panel_ax.plot(component, label=caption)
    panel_ax.legend(loc='upper left')

In [None]:
# Split the series into 12 equal-sized consecutive chunks.
# The split runs on the underlying NumPy array: handing np.split a pandas
# Series routes through Series.swapaxes, which recent pandas deprecates.
# np.split still raises ValueError if the length is not divisible by 12.
chunks = np.split(sales['UNITS_DAY'].to_numpy(), indices_or_sections=12)

In [None]:
# per-chunk mean and variance, to see whether they drift over time
chunk_matrix = np.asarray(chunks)  # one row per chunk
mean_vals = chunk_matrix.mean(axis=1)
var_vals = chunk_matrix.var(axis=1)
vals = dict(mean_vals=mean_vals, var_vals=var_vals)
mean_var = pd.DataFrame(vals)
mean_var

In [None]:
# ADF stationarity test plus a plot of the rolling mean and standard deviation
def dftest(timeseries, window=12):
    """Run an Augmented Dickey-Fuller test and plot rolling statistics.

    Prints the test statistic, p-value, number of lags used, number of
    observations used and the critical values, then plots the series together
    with its rolling mean and rolling standard deviation.

    Parameters
    ----------
    timeseries : pandas.Series
        Series to test. It is passed to ``ts.adfuller`` and must support
        ``.rolling``.
    window : int, default 12
        Number of observations in each rolling window.

    Returns
    -------
    None
        Results are printed and plotted, not returned.
    """
    # named adf_result so the local no longer shadows the function's own name
    adf_result = ts.adfuller(timeseries)
    dfoutput = pd.Series(adf_result[0:4],
                         index=['Test Statistic', 'p-value', 'Lags Used', 'Observations Used'])
    # the fifth element holds the critical values, keyed by significance level
    for key, value in adf_result[4].items():
        dfoutput['Critical Value (%s)' % key] = value
    print(dfoutput)

    # rolling statistics over `window` observations
    rolmean = timeseries.rolling(window=window).mean()
    rolstd = timeseries.rolling(window=window).std()

    # plot the series with its rolling statistics
    plt.plot(timeseries, color='blue', label='Original')
    plt.plot(rolmean, color='red', label='Rolling Mean')
    plt.plot(rolstd, color='black', label='Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean and Standard Deviation')
    plt.grid()
    plt.show(block=False)

In [None]:
# ADF test and rolling-statistics plot for the daily-units series
dftest(timeseries=sales['UNITS_DAY'])

In [None]:
# stationarity check: Augmented Dickey-Fuller test on the raw array
(adf, pvalue, usedlag,
 nobs, critical_values, icbest) = adfuller(x=sales_array)
print(pvalue)