In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Introduction

In this notebook, we'll investigate the long-term trend in a time series. For the dataset at hand, this means the trend in the 365-day moving average. There are seasonal variations as well, but we'll focus for the moment on averages over 365 days. We need to remove this trend before tree models can predict seasonal variations within a year, because tree models cannot extrapolate beyond the range of target values seen during training.

In [None]:
from matplotlib import pyplot as plt

In [None]:
# Read the competition's train and test CSV files, parsing the 'date' column as datetimes.
train_data = pd.read_csv(
    '../input/tabular-playground-series-jan-2022/train.csv',
    parse_dates=['date'],
)
test_data = pd.read_csv(
    '../input/tabular-playground-series-jan-2022/test.csv',
    parse_dates=['date'],
)

In [None]:
# Every sales count must be strictly positive, since a logarithm is taken of it later on.
assert (train_data['num_sold'] > 0).all()

In [None]:
from statsmodels.tsa.stattools import adfuller, kpss

def adf_test(timeseries, significance=0.05):
    """Run the augmented Dickey-Fuller test and print the results with a verdict.

    ``adfuller`` is called with ``autolag="AIC"`` and ``regression='ctt'``. A p-value
    below ``significance`` is reported as "trend-stationary", anything else as
    "not trend-stationary".

    Parameters
    ----------
    timeseries : array-like
        Observations, handed to ``adfuller`` unchanged.
    significance : float, default 0.05
        Threshold against which the p-value is compared.

    Returns
    -------
    None
        All results are printed.
    """
    print("Results of Dickey-Fuller Test:")
    outcome = adfuller(timeseries, autolag="AIC", regression='ctt')

    # The first four entries are summarised under these labels; the fifth is the
    # mapping of critical values, appended one entry per level.
    labels = ["Test Statistic", "p-value", "#Lags Used", "Number of Observations Used"]
    dfoutput = pd.Series(outcome[0:4], index=labels)
    for level, critical in outcome[4].items():
        dfoutput["Critical Value (%s)" % level] = critical
    print(dfoutput)

    rejects_null = dfoutput['p-value'] < significance
    if rejects_null:
        print(f'At {significance} significance level, the time series is trend-stationary')
    else:
        print(f'At {significance} significance level, the time series is not trend-stationary')
    print()
        

def kpss_test(timeseries,significance=0.05):
    """Run the KPSS test and print the results with a verdict.

    ``kpss`` is called with ``regression="ct"`` and ``nlags="auto"``. A p-value below
    ``significance`` is reported as "not trend-stationary", anything else as
    "trend-stationary" (the opposite direction to ``adf_test``).

    Parameters
    ----------
    timeseries : array-like
        Observations, handed to ``kpss`` unchanged.
    significance : float, default 0.05
        Threshold against which the p-value is compared.

    Returns
    -------
    None
        All results are printed.
    """
    print("Results of KPSS Test:")
    outcome = kpss(timeseries, regression="ct", nlags="auto")

    # The first three entries are summarised under these labels; the fourth is the
    # mapping of critical values, appended one entry per level.
    labels = ["Test Statistic", "p-value", "Lags Used"]
    kpss_output = pd.Series(outcome[0:3], index=labels)
    for level, critical in outcome[3].items():
        kpss_output["Critical Value (%s)" % level] = critical
    print(kpss_output)

    rejects_null = kpss_output['p-value'] < significance
    if rejects_null:
        print(f'At {significance} significance level, the time series is not trend-stationary')
    else:
        print(f'At {significance} significance level, the time series is trend-stationary')
    print()
    
def test_for_stationarity(df,col,significance=0.05):
    """Run both the ADF and the KPSS test on one column of ``df`` and print the results.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to test and a ``'date'`` column used as the index
        of the series that is tested.
    col : str
        Name of the column to test.
    significance : float, default 0.05
        Significance level forwarded to both tests.
    """
    print(F'Column to be tested: {col}')
    print()
    # Build the date-indexed series once; both tests receive the same data.
    series = pd.Series(df[col].to_numpy(), index=df['date'])
    adf_test(series, significance=significance)
    kpss_test(series, significance=significance)

In [None]:
def get_df(df,country,store,product):
    """Return a copy of the rows of ``df`` matching one country/store/product combination.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``'country'``, ``'store'`` and ``'product'`` columns.
    country, store, product
        Values that the corresponding columns must equal.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows, keeping their original index labels.
    """
    matches = df['country'].eq(country) & df['store'].eq(store) & df['product'].eq(product)
    subset = df[matches]
    return subset.copy()

We'll consider the following combination of country, store and product, which is why I call this a case study. It is reasonable to expect that the trend would be different for different combinations.

In [None]:
# The single (country, store, product) combination examined in this case study.
country = 'Finland'
store = 'KaggleMart'
product = 'Kaggle Mug'

In [None]:
# Restrict the train and test data to the selected combination; get_df returns copies,
# so columns can be added to these frames without touching train_data / test_data.
df = get_df(train_data, country=country, store=store, product=product)
df_test = get_df(test_data, country=country, store=store, product=product)

# Working in log-space

Now we add new columns to the dataframe that will be useful later. `t` is the number of days since January 1, 2015. `log_num_sold` is the log-transformed target that we'll be working with. It is noteworthy that removing a trend additively in log-space corresponds to removing a trend multiplicatively in the original space. More specifically, if we remove a trend \\(\eta_t>0\\) from both \\(y_t\\) and \\(y_t'\\) (the two values compared by SMAPE), so that \\(y_t=\eta_t\bar{y}_t\\) and \\(y_t'=\eta_t\bar{y}_t'\\), the factor \\(\eta_t\\) cancels and the SMAPE term is unchanged, as shown below. We can therefore work exclusively in log-space without worrying about the trend that has been removed. If we removed a trend additively in the original space, this would not be possible.
$$
2\frac{|y_t-y_t'|}{|y_t|+|y_t'|}=2\frac{|\bar{y}_t-\bar{y}_t'|}{|\bar{y}_t|+|\bar{y}_t'|}
$$

In [None]:
# Day counter `t`: whole days elapsed since the first training date (t0).
# `.dt.days` yields integers directly. The previous
# `.astype('timedelta64[D]').astype(np.int)` chain relied on the `np.int` alias
# (removed in NumPy 1.24) and on a day-resolution timedelta cast that pandas 2.x rejects.
t0 = df.date.iloc[0]
df['t'] = (df.date - t0).dt.days.to_numpy()
# The test frame shares the same origin, so its `t` continues the training time axis.
# It is now an integer column too, consistent with the training frame.
df_test['t'] = (df_test.date - t0).dt.days.to_numpy()
# Log-transformed target; the rest of the notebook works on this column.
df['log_num_sold'] = np.log(df['num_sold'])
# Index both frames by `t` so the spline can later be evaluated on `df.index`.
df = df.set_index('t')
df_test = df_test.set_index('t')

Let's make some plots to get some insight.

In [None]:
# Three views of log_num_sold side by side: the raw series and two moving averages.
fig = plt.figure(figsize=(18, 6))
panels = [
    (131, df.log_num_sold, 'log_num_sold: raw plot'),
    (132, df.log_num_sold.rolling(365).mean(), 'log_num_sold: 365-day moving average'),
    (133, df.log_num_sold.rolling(183).mean(), 'log_num_sold: 183-day moving average'),
]
for position, series, title in panels:
    fig.add_subplot(position)
    plt.plot(df.date, series)
    plt.title(title)
plt.show()

From these plots, it is clear that there is a year-on-year up trend, while there is a seasonal variation within a year.

# Stationarity?

It is common to use some statistical tests to test for stationarity of a time series. We will use the augmented Dickey-Fuller Test and the KPSS Test.

In [None]:
# ADF and KPSS tests on the log-transformed sales, at the default 5% significance level.
test_for_stationarity(df, col='log_num_sold', significance=0.05)

Not surprisingly, the answer is no. The KPSS test cannot reject the null hypothesis that the time series is trend-stationary, while the Dickey-Fuller test cannot reject the null hypothesis that it is *not* trend-stationary. We can tell from the plots already that it is not stationary.

# Spline modeling

Looking at the 365-day moving average plot, it seems that we'd better go for a non-parametric model. We go with the PCHIP spline interpolator because it preserves local monotonicity and therefore tends to overshoot less. For the control points (knots) we use 5 evenly spaced time points over the 4-year period.

In [None]:
# Knot positions on the `t` axis (days since the first training date).
x0 = [0, 365, 731, 1096, 1460]
# Initial solution: every knot starts at the overall mean of log_num_sold.
z0 = np.full(shape=len(x0), fill_value=df['log_num_sold'].mean())

Instead of fitting the spline to the data points directly (which vary wildly as a stochastic process with a lot of outliers), we choose to fit the spline such that its 365-day moving average matches the 365-day moving average of the data. Otherwise, it is a pretty straightforward least squares fit.

In [None]:
# Length, in days, of the moving-average window used for fitting the year trend.
window = 365

In [None]:
from numba import njit

@njit(fastmath=True)
def moving_average(x,window):
    """Return the simple moving average of ``x`` over ``window`` consecutive samples.

    Only full windows are used, so the result has ``len(x) - window + 1`` elements and
    element ``k`` is the mean of ``x[k:k + window]``.

    Parameters
    ----------
    x : array
        One-dimensional array of values; it must support slicing and ``.copy()``.
    window : int
        Number of consecutive samples averaged per output element.
    """
    n_out = len(x) - window + 1
    # Start from the first sample of every window, then add the same slice shifted
    # by 1, 2, ..., window - 1 positions.
    total = x[:n_out].copy()
    for offset in range(1, window):
        total += x[offset:offset + n_out]
    return total / window

In [None]:
from scipy.interpolate import PchipInterpolator
def obj_fn(z,x0,x,y_ma,window):
    """Residuals between the moving average of a PCHIP spline and a target moving average.

    Parameters
    ----------
    z : array-like
        Spline values at the knots (the quantities being optimised).
    x0 : array-like
        Knot positions.
    x : array-like
        Points at which the spline is evaluated before averaging.
    y_ma : array-like
        Target moving average; it must match the averaged spline in length.
    window : int
        Moving-average window length.

    Returns
    -------
    array
        ``moving_average(spline(x), window) - y_ma``.
    """
    spline = PchipInterpolator(x0, z)
    spline_ma = moving_average(spline(x), window)
    return spline_ma - y_ma

In [None]:
from scipy.optimize import least_squares
# Fit target: the moving average of the observed log sales.
y_ma = moving_average(df['log_num_sold'].to_numpy(), window)
# Find the knot heights whose spline, once averaged over the same window, best
# matches y_ma in the least-squares sense, starting from z0.
result = least_squares(obj_fn, z0, args=(x0, df.index, y_ma, window))

Now add columns `year_trend` and `log_num_sold_1` which is `log_num_sold` with the year trend removed.

In [None]:
# Spline through the fitted knot heights.
cs = PchipInterpolator(x0, result.x)
# Evaluate the trend on the `t` index of both frames.
df['year_trend'] = cs(df.index)
df_test['year_trend'] = cs(df_test.index)
# Remainder of the log sales once the year trend is subtracted.
df['log_num_sold_1'] = df['log_num_sold'] - df['year_trend']
df

Let's make a plot of the 365-day moving averages to see how well the fit went.

In [None]:
# Overlay the moving average of the data and of the fitted trend to judge the fit.
for column in ('log_num_sold', 'year_trend'):
    plt.plot(df.date, df[column].rolling(window).mean())
plt.legend(['log_num_sold', 'year_trend'], loc='upper left')
plt.title('365-day moving averages')
plt.show()

Is the remainder `log_num_sold_1` stationary?

In [None]:
# ADF and KPSS tests on the remainder, at the default 5% significance level.
test_for_stationarity(df, col='log_num_sold_1', significance=0.05)

Not surprisingly, the answer is still no. We haven't removed the seasonal trend within a year yet. Let's make some plots to confirm that.

In [None]:
# Three views of the detrended series side by side: raw values and two moving averages.
fig = plt.figure(figsize=(18, 6))
panels = [
    (131, df.log_num_sold_1, 'log_num_sold_1: raw plot'),
    (132, df.log_num_sold_1.rolling(365).mean(), 'log_num_sold_1: 365-day moving average'),
    (133, df.log_num_sold_1.rolling(183).mean(), 'log_num_sold_1: 183-day moving average'),
]
for position, series, title in panels:
    fig.add_subplot(position)
    plt.plot(df.date, series)
    plt.title(title)
plt.show()

So we see from the 365-day moving average that over a 365-day period, `log_num_sold_1` is mostly noise. But the cyclical trend is obvious in the 183-day moving average.

Let's visualize the year trend in the original `num_sold` space

In [None]:
# Exponentiate the log-space trend to show it in the original num_sold units:
# blue for the training period, green for the test period.
for frame, line_style in ((df, 'b-'), (df_test, 'g-')):
    plt.plot(frame.date, np.exp(frame.year_trend), line_style)
plt.title('num_sold year trend')
plt.legend(['num_sold 2015 - 2018 (fitted)', 'num_sold 2019 (extrapolated)'], loc='upper left')
plt.show()

Finally, we can also estimate annual sales for year 2019 using the trend. 

In [None]:
# Annual totals per calendar year: the exponentiated trend over train + test dates,
# and the actual sales over the training dates only.
year_range = range(2015, 2020)
xx = [np.datetime64(F'{year}') for year in year_range]
df0 = pd.concat([df,df_test])


def _in_year(dates, year):
    """Boolean mask of the entries of ``dates`` between Jan 1 and Dec 31 of ``year``."""
    first_day = np.datetime64(F'{year}-01-01')
    last_day = np.datetime64(F'{year}-12-31')
    return (dates >= first_day) & (dates <= last_day)


yy_pred = [np.sum(np.exp(df0.year_trend[_in_year(df0.date, year)])) for year in year_range]
yy_actual = [np.sum(df.num_sold[_in_year(df.date, year)]) for year in year_range]

# The last entry of yy_actual covers a year with no rows in df, so it is left out.
plt.plot(xx[:-1],yy_actual[:-1],'cx-')
plt.plot(xx,yy_pred,'bo-')
plt.legend(['actual', 'fitted/extrapolated'], loc='upper left')
plt.title('Annual sales')

# One major tick per year, labelled with the four-digit year.
from matplotlib.dates import YearLocator, MonthLocator, DateFormatter
years = YearLocator()
yearsFmt = DateFormatter('%Y')
ax = plt.gca()
ax.xaxis.set_major_locator(years)
ax.xaxis.set_major_formatter(yearsFmt)

# What's Next?

In principle, the time series `log_num_sold_1` can already be fitted with your favorite tree model. It is not a stationary time series yet, but the over-the-years trend has been removed, so the tree model should be able to predict the seasonal cycles. For completeness' sake, I plan to detrend the time series completely in a future notebook by removing the cyclical trend. Hope to see you then!