In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
from statsmodels.tsa.filters.hp_filter import hpfilter
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.holtwinters import SimpleExpSmoothing
from statsmodels.tsa.holtwinters import ExponentialSmoothing
import statsmodels.api as sm
from statsmodels.tsa.stattools import acovf,acf,pacf,pacf_yw,pacf_ols
from pandas.plotting import lag_plot
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf
from statsmodels.tsa.ar_model import AR,ARResults
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import month_plot,quarter_plot
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tools.eval_measures import rmse
from sklearn.preprocessing import MinMaxScaler
from keras.preprocessing.sequence import TimeseriesGenerator
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
# Ignore harmless warnings
import warnings
warnings.filterwarnings("ignore")


In [None]:
# Load the competition training set; the 'date' column is parsed as dates and used as the index.
df = pd.read_csv(
    '/kaggle/input/tabular-playground-series-jan-2022/train.csv',
    parse_dates=True,
    index_col='date',
)
df.head(5)

In [None]:
# Keep one series only (Finland / KaggleMart / Kaggle Mug), drop the row_id column,
# then release the full training frame.
df_kagglemug = df.loc[
    (df['country'] == 'Finland')
    & (df['store'] == 'KaggleMart')
    & (df['product'] == 'Kaggle Mug')
].drop(columns='row_id')
del df

In [None]:
# First rows of the filtered series.
df_kagglemug.head()

In [None]:
# Last rows of the filtered series.
df_kagglemug.tail()

#### We use the sales of {Finland, KaggleMart, Kaggle Mug} as the time series dataset for the analyses that follow.

In [None]:
# Inspect the index, which was built from the 'date' column when the CSV was read.
df_kagglemug.index

#### Resampling

In [None]:
# Monthly Means
df_kagglemug.resample(rule='M').mean().plot(figsize=(16,6))

In [None]:
# Yearly Means
df_kagglemug.resample(rule='A').mean()

In [None]:
df_kagglemug.resample(rule='A').mean().plot()

#### Resampling rule 'A' takes all of the data points in a given year and applies an aggregation function such as the mean.
#### Resampling rule 'M' applies to month.

In [None]:
# Bar chart of the mean number sold per calendar year.
yearly_mean_sold = df_kagglemug['num_sold'].resample('A').mean()
yearly_mean_sold.plot.bar(title='Yearly Mean of number of Kaggle Mug sold');

#### Kaggle Mug selling numbers increased over the years.

In [None]:
# Bar chart of the mean number sold per month.
# Aggregate with .mean() so the bars show what the title says.
title = 'Monthly Mean of number of Kaggle Mug sold'
df_kagglemug['num_sold'].resample('M').mean().plot.bar(figsize=(18,6), title=title,color='#1f77b4');

#### Timeshifting
#### .shift() forward
#### This method shifts the data by a given number of rows relative to the date index; the index itself is unchanged

In [None]:
# Shift every column down by one row and look at the first rows.
df_kagglemug.shift(1).head()

In [None]:
# Same shift, last rows.
df_kagglemug.shift(1).tail()

#### Rolling

In [None]:
df_kagglemug['num_sold'].plot(figsize=(16,6)).autoscale(axis='x',tight=True);

In [None]:
# 7 day rolling mean
df_kagglemug.rolling(window=7).mean().head(15)

In [None]:
df_kagglemug['num_sold'].plot(figsize=(16,6)).autoscale(axis='x',tight=True)
df_kagglemug.rolling(window=30).mean()['num_sold'].plot();

In [None]:
df_kagglemug['num_sold: 30 Day Mean'] = df_kagglemug['num_sold'].rolling(window=30).mean()
df_kagglemug[['num_sold','num_sold: 30 Day Mean']].plot(figsize=(16,6)).autoscale(axis='x',tight=True);

In [None]:
# The number of products sold graph for the year 2017
df_kagglemug['num_sold']['2017-01-01':'2017-12-31'].plot(figsize=(16,6)).autoscale(axis='x',tight=True);

#### What was the percent increase in value from Jan 2015 to Jan 2018?

In [None]:
# Percent change in units sold between 2015-01-01 and 2018-01-01.
sold_2015 = df_kagglemug.loc['2015-01-01', 'num_sold']
sold_2018 = df_kagglemug.loc['2018-01-01', 'num_sold']
100 * (sold_2018 - sold_2015) / sold_2015

#### Which day has the least value in 2018?

In [None]:
# Index label (date) of the smallest num_sold from 2018-01-01 onwards.
df_kagglemug.loc['2018-01-01':, 'num_sold'].idxmin()

#### Which 6 days have the highest values?

In [None]:
# Six rows with the highest num_sold, largest first.
df_kagglemug.sort_values(by='num_sold',ascending=False).head(6)

In [None]:
# Rolling mean over one year of daily observations (365 rows), so the column matches
# its "Yearly Mean" label; the first 364 rows are NaN until the window fills.
df_kagglemug['Yearly Mean'] = df_kagglemug['num_sold'].rolling(window=365).mean()
df_kagglemug[['num_sold','Yearly Mean']].plot(figsize=(16,6)).autoscale(axis='x',tight=True);

# Time Series Analysis with statsmodels

#### Trend and Cycle

In [None]:
# Hodrick-Prescott filter: the result is unpacked as (cyclical component, trend component).
# NOTE(review): lamb=1600 is the value usually suggested for quarterly data, while this
# series is daily — confirm the smoothing parameter is intended.
num_cycle, num_trend = hpfilter(df_kagglemug['num_sold'], lamb=1600)

In [None]:
# Keep both components as columns so they can be plotted next to the raw series.
df_kagglemug['trend'] = num_trend
df_kagglemug['cycle'] = num_cycle

In [None]:
# Trend against the raw series over the whole period.
df_kagglemug[['trend','num_sold']].plot(figsize=(16,6)).autoscale(axis='x',tight=True);

In [None]:
# Same comparison, restricted to 2018-01-01 onwards.
df_kagglemug[['trend','num_sold']]['2018-01-01':].plot(figsize=(16,6)).autoscale(axis='x',tight=True);

# Error, Trend and Seasonality models (ETS)

In [None]:
# Multiplicative decomposition of the sales series, then plot the decomposition result.
result = seasonal_decompose(df_kagglemug['num_sold'], model='multiplicative')  
result.plot();

# Simple moving average

In [None]:
# Simple moving averages over windows of 6 and 12 rows.
# NOTE(review): the series is daily, so these windows span 6 and 12 days despite the
# "month" in the column names — confirm the intended window length.
df_kagglemug['6-month-SMA'] = df_kagglemug['num_sold'].rolling(window=6).mean()
df_kagglemug['12-month-SMA'] = df_kagglemug['num_sold'].rolling(window=12).mean()

In [None]:
# The leading rows of each SMA column are NaN until its window is full.
df_kagglemug.head(15)

In [None]:
# Raw series against both moving averages.
df_kagglemug[['num_sold','6-month-SMA','12-month-SMA']].plot(figsize=(16,6))

# Exponentially weighted moving average

In [None]:
# Exponentially weighted moving average of num_sold with span=12 and adjust=False.
df_kagglemug['EWMA12'] = df_kagglemug['num_sold'].ewm(span=12,adjust=False).mean()

In [None]:
# Raw series against the EWMA.
df_kagglemug[['num_sold','EWMA12']].plot(figsize=(16,6))

In [None]:
# EWMA and the 12-row simple moving average side by side.
df_kagglemug[['num_sold','EWMA12','12-month-SMA']].plot(figsize=(16,6)).autoscale(axis='x',tight=True);

In [None]:
# The data is daily, so set the index frequency to 'D', then display the index to check it.
df_kagglemug.index.freq = 'D'
df_kagglemug.index

In [None]:
# Smoothing factor derived from a span of 12: alpha = 2 / (span + 1).
span = 12
alpha = 2/(span+1)

# EWMA and statsmodels' SimpleExpSmoothing, both using the same fixed alpha
# (optimized=False); the SES fitted values are shifted one row back with shift(-1).
df_kagglemug['EWMA12'] = df_kagglemug['num_sold'].ewm(alpha=alpha,adjust=False).mean()
df_kagglemug['SES12']=SimpleExpSmoothing(df_kagglemug['num_sold']).fit(smoothing_level=alpha,optimized=False).fittedvalues.shift(-1)
df_kagglemug.head()

In [None]:
# Exponential smoothing with an additive trend; fitted values shifted one row back.
df_kagglemug['DESadd12'] = ExponentialSmoothing(df_kagglemug['num_sold'], trend='add').fit().fittedvalues.shift(-1)
df_kagglemug.head()

In [None]:
# First 24 rows: raw series, EWMA and the additive-trend fit.
df_kagglemug[['num_sold','EWMA12','DESadd12']].iloc[:24].plot(figsize=(18,6)).autoscale(axis='x',tight=True);

#### Double Exponential Smoothing is a much better representation of the time series data

In [None]:
# Exponential smoothing with a multiplicative trend; fitted values shifted one row back.
df_kagglemug['DESmul12'] = ExponentialSmoothing(df_kagglemug['num_sold'], trend='mul').fit().fittedvalues.shift(-1)
df_kagglemug.head()

In [None]:
# Last rows; shift(-1) leaves the final row of the shifted columns empty.
df_kagglemug.tail()

In [None]:
# First 24 rows: raw series against the additive-trend fit.
df_kagglemug[['num_sold','DESadd12']].iloc[:24].plot(figsize=(18,6)).autoscale(axis='x',tight=True);

In [None]:
# Exponential smoothing with additive trend and additive seasonality.
# NOTE(review): seasonal_periods=12 on a daily series describes a 12-day season — confirm
# this is the intended period.
df_kagglemug['TESadd12'] = ExponentialSmoothing(df_kagglemug['num_sold'],trend='add',seasonal='add',seasonal_periods=12).fit().fittedvalues
df_kagglemug.head()

In [None]:
# Same model with multiplicative trend and seasonality.
df_kagglemug['TESmul12'] = ExponentialSmoothing(df_kagglemug['num_sold'],trend='mul',seasonal='mul',seasonal_periods=12).fit().fittedvalues
df_kagglemug.head()

In [None]:
# Whole period: raw series against the additive fit with seasonality.
df_kagglemug[['num_sold','TESadd12']].plot(figsize=(16,6)).autoscale(axis='x',tight=True);

In [None]:
# First 24 rows: raw series against the additive-trend fit without seasonality.
df_kagglemug[['num_sold','DESadd12']].iloc[:24].plot(figsize=(16,6)).autoscale(axis='x',tight=True);

In [None]:
# First 24 rows: raw series against the additive fit with seasonality.
df_kagglemug[['num_sold','TESadd12']].iloc[:24].plot(figsize=(16,6)).autoscale(axis='x',tight=True);

#### Triple Exponential Smoothing does a poorer job of fitting than Double Exponential Smoothing

# Autocorrelation and Partial Autocorrelation function

#### Autocorrelation is the correlation between two values in a time series. In other words, the time series data correlate with themselves. The number of intervals between the two observations is the lag. For example, the lag between the current and previous observation is one day in this dataset.
#### The autocorrelation function (ACF) assesses the correlation between observations in a time series for a set of lags.
#### In an ACF plot, each bar represents the size and direction of the correlation. Bars that extend beyond the shaded confidence band are statistically significant.

#### The partial autocorrelation function is similar to the ACF except that it displays only the correlation between two observations that the shorter lags between those observations do not explain. 

#### Further reading: https://statisticsbyjim.com/time-series/autocorrelation-partial-autocorrelation/

In [None]:
# Lag plot of the sales series (pandas' default lag).
lag_plot(df_kagglemug['num_sold']);

#### The plot displays strong autocorrelation

In [None]:
# 20 lags
# Autocorrelation of num_sold, drawn on an explicitly sized axes.
fig, ax = plt.subplots(figsize=(12,5))
plot_acf(df_kagglemug[["num_sold"]],title="lag",lags=20,ax=ax);

In [None]:
# 40 lags
# Same plot with a longer lag range.
fig, ax = plt.subplots(figsize=(12,5))
plot_acf(df_kagglemug[["num_sold"]],title="lag",lags=40,ax=ax);

#### The data is non-stationary, as a large number of lags pass before the ACF values drop off.

In [None]:
# Partial autocorrelation of num_sold; np.arange(lags) requests lags 0 .. lags-1.
lags=20
fig, ax = plt.subplots(figsize=(12,5))
plot_pacf(df_kagglemug['num_sold'].dropna(),title="lags",lags=np.arange(lags),ax=ax); 

In [None]:
# Same plot for lags 0 .. 39.
lags=40
fig, ax = plt.subplots(figsize=(12,5))
plot_pacf(df_kagglemug['num_sold'].dropna(),title="lags",lags=np.arange(lags),ax=ax); 

#### applying differencing to make the data stationary

In [None]:
from statsmodels.tsa.statespace.tools import diff

# First-order difference of num_sold (k_diff=1), stored as column 'd1' and plotted.
df_kagglemug['d1'] = diff(df_kagglemug['num_sold'],k_diff=1)
df_kagglemug['d1'].plot(figsize=(12,5));

In [None]:
# Partial autocorrelation of the differenced series; dropna() removes the missing
# values before plotting.
lags=40
fig, ax = plt.subplots(figsize=(12,5))
plot_pacf(df_kagglemug['d1'].dropna(),title="lags",lags=np.arange(lags),ax=ax); 

#### Partial autocorrelations work best with stationary data

# AR model

In [None]:
train_data = df_kagglemug['2015-01-01':'2017-12-31']
test_data = df_kagglemug['2018-01-01':'2018-12-31']

In [None]:
train_data.shape,test_data.shape

In [None]:
# Fit an autoregressive model on the training sales with method='mle'.
# NOTE(review): statsmodels deprecated `AR` in favour of `AutoReg` — confirm the installed
# version still provides it.
model = AR(train_data['num_sold'])
AR_model = model.fit(method='mle')

In [None]:
# Number of autoregressive lags in the fitted model.
print(AR_model.k_ar) # lags

In [None]:
# Fitted coefficients.
print(AR_model.params)

In [None]:
# Prediction range: from the first position after the training sample to the
# position matching the last test observation.
start = len(train_data)
end = start + len(test_data) - 1
predictions = AR_model.predict(start=start, end=end, dynamic=False)
predictions = predictions.rename('AR Predictions')
predictions

In [None]:
# compare predictions and actual values
# .iloc makes the positional lookup explicit: test_data is date-indexed (and the
# predictions presumably are too), and plain integer keys on a non-integer index are
# deprecated in pandas >= 2.1.
for idx in range(10):
    print("predicted values: ",round(predictions.iloc[idx],2),"----------", "actual values: ",test_data['num_sold'].iloc[idx])

In [None]:
test_data['num_sold'].plot(legend=True)
predictions.plot(legend=True,figsize=(16,6));

In [None]:
 mean_squared_error(test_data['num_sold'], predictions)

In [None]:
# Akaike information criterion of the fitted AR model.
AR_model.aic # AIC value

# Test for Stationarity

#### A time series is stationary if the mean and variance are fixed no matter where we take the observations. A time series that shows seasonality is not stationary

## Augmented Dickey-Fuller Test
#### H0 (null hypothesis): the time series is non-stationary. H1 (alternative hypothesis): the time series is stationary.

In [None]:
# Augmented Dickey-Fuller test on the full sales series, with autolag='AIC'.
adfTest = adfuller(df_kagglemug['num_sold'],autolag='AIC')
adfTest

In [None]:
# First four entries of the result; the following cell labels them as test statistic,
# p-value, lags and observations.
adfTest[0:4]

In [None]:
# Label the first four values returned by adfuller(), append one row per critical
# value, and print the resulting table.
print('Augmented Dickey-Fuller Test')

adf_labels = ['ADF test statistic','p-value','lags','observations']
adfTestValues = pd.Series(adfTest[0:4], index=adf_labels)

critical_values = adfTest[4]
for level in critical_values:
    adfTestValues[f'critical value ({level})'] = critical_values[level]
print(adfTestValues)

#### With a p-value of 0.2 we fail to reject the null hypothesis, so we cannot conclude that the time series is stationary and treat it as non-stationary.

In [None]:

# Seasonal plot by month, built from the monthly means of num_sold.
month_plot(df_kagglemug['num_sold'].resample(rule='M').mean());

In [None]:

# Seasonal plot by quarter, built from the quarterly means of num_sold.
quarter_plot(df_kagglemug['num_sold'].resample(rule='Q').mean());

# ARIMA and AUTO ARIMA model

#### ARIMA: AR is the autoregressive component, MA is the moving-average component, and I (integrated) is the differencing applied to make the series stationary.

#### Use pmdarima to determine the ARIMA orders

In [None]:
!pip install pmdarima

In [None]:
import pmdarima

In [None]:
from pmdarima import auto_arima


In [None]:
# Stepwise search for non-seasonal ARIMA orders (p and q start at 0 and are capped at 2;
# d=None leaves the differencing order to auto_arima).
# No seasonal period `m` is passed: it only applies to seasonal fits, and seasonal=False here.
stepwise_fit = auto_arima(df_kagglemug['num_sold'], start_p=0, start_q=0,
                          max_p=2, max_q=2,
                          seasonal=False,
                          d=None, trace=True,
                          error_action='ignore',
                          suppress_warnings=True,
                          stepwise=True)

stepwise_fit.summary()

#### Fit an ARIMA (1,1,2) model

In [None]:
# Fit ARIMA(1,1,2) on the training period and show the fit summary.
# NOTE(review): `ARIMA` is imported from statsmodels.tsa.arima_model, which newer
# statsmodels releases replace with statsmodels.tsa.arima.model — confirm the installed version.
model = ARIMA(train_data["num_sold"],order=(1,1,2))
results = model.fit()
results.summary()

#### obtain the predictions

In [None]:
# Prediction range: from the first position after the training sample to the
# position matching the last test observation.
start = len(train_data)
end = start + len(test_data) - 1
predictions = results.predict(start=start, end=end, dynamic=False, typ='levels')
predictions = predictions.rename('ARIMA Predictions')

In [None]:
# compare predictions and actual values
# .iloc makes the positional lookup explicit: test_data is date-indexed (and the
# predictions presumably are too), and plain integer keys on a non-integer index are
# deprecated in pandas >= 2.1.
for idx in range(10):
    print("predicted values: ",round(predictions.iloc[idx],2),"----------", "actual values: ",test_data['num_sold'].iloc[idx])

In [None]:
ax = test_data['num_sold'].plot(legend=True,figsize=(16,6),title="actual vs predictions")
predictions.plot(legend=True)
ax.autoscale(axis='x',tight=True)
ax.set(xlabel=" ", ylabel="numbers sold")

In [None]:
mean_squared_error(test_data['num_sold'], predictions)


In [None]:

rmse(test_data['num_sold'], predictions)

#### Retrain the model on the full data, and forecast the future

In [None]:
model = ARIMA(df_kagglemug['num_sold'],order=(1,1,2))
results = model.fit()
forecast_values = results.predict(len(df_kagglemug),len(df_kagglemug)+365-1,typ='levels').rename('ARIMA Forecast')

In [None]:
forecast_values

In [None]:
ax = df_kagglemug['num_sold'].plot(legend=True,figsize=(16,6),title=" ")
forecast_values.plot(legend=True)
ax.autoscale(axis='x',tight=True)
ax.set(xlabel=" ", ylabel="forecast value")

# Seasonal Autoregressive Integrated Moving Averages

In [None]:
from pylab import rcParams
# Set the default figure size in matplotlib's rcParams.
rcParams['figure.figsize'] = 18, 8

# Additive decomposition of the sales series, then plot the decomposition result.
decomposition = sm.tsa.seasonal_decompose(df_kagglemug["num_sold"], model='additive')
fig = decomposition.plot()
plt.show()

In [None]:
# Seasonal order search on the full series; only the summary of the selected model is shown.
# NOTE(review): m=12 on a daily series describes a 12-day season — confirm the period.
auto_arima(df_kagglemug["num_sold"],seasonal=True,m=12).summary() # season set to true and value of m set to 12

In [None]:
# Fit SARIMAX with order (3,1,3) and seasonal order (1,0,1,12) on the training period.
model = SARIMAX(train_data['num_sold'],order=(3,1,3),seasonal_order=(1,0,1,12))
results = model.fit()
results.summary()

In [None]:
# Checking diagnostic plots
results.plot_diagnostics(figsize=(10, 10))
plt.show()

#### The Normal Q-Q plot shows that the ordered residuals follow a distribution similar to the normal distribution

In [None]:
# Prediction range: from the first position after the training sample to the
# position matching the last test observation.
start = len(train_data)
end = start + len(test_data) - 1
predictions = results.predict(start=start, end=end, dynamic=False, typ='levels')
predictions = predictions.rename('Predictions')

In [None]:
# compare predictions and actual values
# .iloc makes the positional lookup explicit: test_data is date-indexed (and the
# predictions presumably are too), and plain integer keys on a non-integer index are
# deprecated in pandas >= 2.1.
for idx in range(10):
    print("predicted values: ",round(predictions.iloc[idx],2),"----------", "actual values: ",test_data['num_sold'].iloc[idx])

In [None]:
ax = test_data['num_sold'].plot(legend=True,figsize=(16,6),title="actual vs predictions")
predictions.plot(legend=True)
ax.autoscale(axis='x',tight=True)
ax.set(xlabel=" ", ylabel="numbers sold")

In [None]:
mean_squared_error(test_data['num_sold'], predictions)

In [None]:
rmse(test_data['num_sold'], predictions)

In [None]:
model = SARIMAX(df_kagglemug['num_sold'],order=(3,1,3),seasonal_order=(1,0,1,12))
results = model.fit()
forecast_values = results.predict(len(df_kagglemug),len(df_kagglemug)+365-1,typ='levels').rename('Forecast')

In [None]:
forecast_values

In [None]:
ax = df_kagglemug['num_sold'].plot(legend=True,figsize=(16,6),title=" ")
forecast_values.plot(legend=True)
ax.autoscale(axis='x',tight=True)
ax.set(xlabel=" ", ylabel="forecast value")

In [None]:
# Second SARIMAX candidate: same order (3,1,3), seasonal order (1,0,2,12), fitted on
# the training period.
model = SARIMAX(train_data['num_sold'],order=(3,1,3),seasonal_order=(1,0,2,12))
results = model.fit()
results.summary()

In [None]:
# Checking diagnostic plots
results.plot_diagnostics(figsize=(10, 10))
plt.show()

In [None]:
# Prediction range: from the first position after the training sample to the
# position matching the last test observation.
start = len(train_data)
end = start + len(test_data) - 1
predictions = results.predict(start=start, end=end, dynamic=False, typ='levels')
predictions = predictions.rename('Predictions')

In [None]:
# compare predictions and actual values
# .iloc makes the positional lookup explicit: test_data is date-indexed (and the
# predictions presumably are too), and plain integer keys on a non-integer index are
# deprecated in pandas >= 2.1.
for idx in range(10):
    print("predicted values: ",round(predictions.iloc[idx],2),"----------", "actual values: ",test_data['num_sold'].iloc[idx])

In [None]:
ax = test_data['num_sold'].plot(legend=True,figsize=(16,6),title="actual vs predictions")
predictions.plot(legend=True)
ax.autoscale(axis='x',tight=True)
ax.set(xlabel=" ", ylabel="numbers sold")

In [None]:
mean_squared_error(test_data['num_sold'], predictions)

In [None]:
rmse(test_data['num_sold'], predictions)

In [None]:
model = SARIMAX(df_kagglemug['num_sold'],order=(3,1,3),seasonal_order=(1,0,2,12))
results = model.fit()
forecast_values = results.predict(len(df_kagglemug),len(df_kagglemug)+365-1,typ='levels').rename('Forecast')

In [None]:
ax = df_kagglemug['num_sold'].plot(legend=True,figsize=(16,6),title=" ")
forecast_values.plot(legend=True)
ax.autoscale(axis='x',tight=True)
ax.set(xlabel=" ", ylabel="forecast value")

# LSTM model

In [None]:
# Weekly Means
# Resample num_sold to weekly frequency ('W') and plot the mean of each week.
df_kagglemug["num_sold"].resample(rule='W').mean().plot(figsize=(16,6))

In [None]:
# Min-max scaler for the LSTM inputs.
scaler = MinMaxScaler()

In [None]:
# Same date ranges as the earlier split, but kept as single-column frames ([["num_sold"]]).
train_data = df_kagglemug['2015-01-01':'2017-12-31'][["num_sold"]]
test_data = df_kagglemug['2018-01-01':'2018-12-31'][["num_sold"]]

In [None]:
# The scaler is fitted on the training period only.
scaler.fit(train_data)

In [None]:
# Apply the training-period scaling to both splits.
scaled_train = scaler.transform(train_data)
scaled_test = scaler.transform(test_data)

In [None]:
# Inspect the scaled training values.
scaled_train

#### The input to LSTM model is [samples, timesteps, features]
#### timesteps = 7, that is, we look back 7 days to predict the 8th day
#### univariate time series hence features = 1
#### TimeseriesGenerator creates the dataset for training the LSTM model.

In [None]:
# define generator
# Small demonstration with a window of 2 before the 7-step generator is built further down.
n_input = 2 # looking into 2 time steps back to predict at time t
n_features = 1
generator = TimeseriesGenerator(scaled_train, scaled_train, length=n_input, batch_size=1)

In [None]:
# Number of scaled training observations.
len(scaled_train)

In [None]:
# Number of batches the generator yields.
len(generator) # n_input = 2

In [None]:
# How the batch looks now?
X,y = generator[0]

In [None]:
# Inputs of the first batch (flattened) and its target.
X.flatten(),y

In [None]:
# looking into 7 timesteps back to predict at time t
# This generator (window of 7, batch size 1) is the one passed to the training call below.
n_input = 7
generator = TimeseriesGenerator(scaled_train, scaled_train, length=n_input, batch_size=1)

In [None]:
# How the batch looks now?
X,y = generator[0]

In [None]:
# Inputs of the first batch (flattened) and its target.
X.flatten(),y

In [None]:
# define model
# Two stacked LSTM layers of 50 units: the first returns the full sequence so that the
# second LSTM can consume it; a single Dense unit produces the one-step prediction.
model = Sequential()
model.add(LSTM(50, activation='relu', return_sequences=True, input_shape=(n_input, n_features)))
model.add(LSTM(50,  activation='relu',return_sequences=False))
model.add(Dense(1))
# Adam optimizer with mean squared error loss.
model.compile(optimizer='adam', loss='mse')

In [None]:
# Layer-by-layer overview of the network.
model.summary()

In [None]:
# fit model
# Model.fit accepts the generator directly; fit_generator is deprecated and has been
# removed from recent Keras releases.
model.fit(generator,epochs=30)

In [None]:
# Metrics recorded during training.
model.history.history.keys()

In [None]:
# Training loss per epoch.
loss_per_epoch = model.history.history['loss']
plt.plot(range(len(loss_per_epoch)),loss_per_epoch)

In [None]:
# Windows over the scaled test period, built with the same window length as in training.
test_ds = TimeseriesGenerator(scaled_test, scaled_test, length=n_input, batch_size=1)

In [None]:

# Loss (MSE, as compiled) on the test windows, in scaled units.
model.evaluate(test_ds)

In [None]:
# One prediction per test window (still in scaled units).
predictions = model.predict(test_ds)

In [None]:
# Target of every test window. batch_size=1, so each batch holds exactly one target;
# indexing by position keeps the walk over the generator explicitly finite.
labels = [test_ds[i][1][0][0] for i in range(len(test_ds))]

days = np.arange(0, predictions.shape[0])

# Actual and predicted values are both in min-max scaled units; label the axes and the
# lines so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(days, labels, label='actual (scaled)')
ax.plot(days, predictions[:,0], color='green', label='LSTM prediction (scaled)')
ax.set(xlabel='test window', ylabel='num_sold (min-max scaled)', title='LSTM: actual vs predictions')
ax.legend();