In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for path in (os.path.join(dirname, name) for name in filenames):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from matplotlib import pyplot as plt

# Introduction

In this notebook, I will continue the effort started in [Detrending Case Study I](https://www.kaggle.com/siukeitin/tps012022-detrending-case-study-i), in which I modeled a long-term trend (over the years) using a spline. One of the reasons for using a spline was so that I could extrapolate the trend to 2019. Of course, extrapolation with splines is risky, as a purely mathematical construct need not predict "reality", even if that "reality" is fabricated. Since then, many contestants have used external GDP data to model a more plausible 2019 annual trend. I also think that this is more reliable than a purely mathematical spline extrapolation.

In the following, I'll describe a different approach to detrending that is more data-driven. Finding the trend and removing it from the training data turns out to be relatively trivial. The problem is how to estimate the trend for the test data. We will describe an attempt using external annual and quarterly GDP data.

In [None]:
from statsmodels.tsa.stattools import adfuller, kpss

def adf_test(timeseries, significance=0.05):
    """Run the augmented Dickey-Fuller test on ``timeseries`` and print a report.

    The test is called with ``regression='ctt'`` and AIC-based lag selection.
    The report lists the test statistic, p-value, lags used, number of
    observations and the critical values, followed by a verdict: a p-value
    below ``significance`` is reported as trend-stationary, otherwise as not
    trend-stationary. Nothing is returned.
    """
    print("Results of Dickey-Fuller Test:")
    adf_result = adfuller(timeseries, autolag="AIC", regression='ctt')

    # First four entries are scalars; the fifth is the critical-value mapping.
    report = {
        "Test Statistic": adf_result[0],
        "p-value": adf_result[1],
        "#Lags Used": adf_result[2],
        "Number of Observations Used": adf_result[3],
    }
    for level, threshold in adf_result[4].items():
        report["Critical Value (%s)" % level] = threshold
    dfoutput = pd.Series(report)

    print(dfoutput)
    verdict = (
        f'At {significance} significance level, the time series is trend-stationary'
        if dfoutput['p-value'] < significance
        else f'At {significance} significance level, the time series is not trend-stationary'
    )
    print(verdict)
    print()
        

def kpss_test(timeseries,significance=0.05):
    """Run the KPSS test on ``timeseries`` and print a report.

    The test is called with ``regression="ct"`` and automatic lag selection.
    The report lists the test statistic, p-value, lags used and the critical
    values, followed by a verdict: a p-value below ``significance`` is
    reported as not trend-stationary, otherwise as trend-stationary (the
    opposite reading from ``adf_test``). Nothing is returned.
    """
    print("Results of KPSS Test:")
    kpss_result = kpss(timeseries, regression="ct", nlags="auto")

    # First three entries are scalars; the fourth is the critical-value mapping.
    summary = dict(zip(["Test Statistic", "p-value", "Lags Used"], kpss_result[:3]))
    for level, threshold in kpss_result[3].items():
        summary["Critical Value (%s)" % level] = threshold
    kpss_output = pd.Series(summary)

    print(kpss_output)
    verdict = (
        f'At {significance} significance level, the time series is not trend-stationary'
        if kpss_output['p-value'] < significance
        else f'At {significance} significance level, the time series is trend-stationary'
    )
    print(verdict)
    print()
    
def test_for_stationarity(timeseries,significance=0.05):
    """Print the ADF report and then the KPSS report for ``timeseries``."""
    for run_test in (adf_test, kpss_test):
        run_test(timeseries, significance=significance)

In [None]:
def get_df(df,country,store,product):
    """Return a copy of the rows of ``df`` matching one country/store/product."""
    is_country = df['country'] == country
    is_store = df['store'] == store
    is_product = df['product'] == product
    return df[is_country & is_store & is_product].copy()

In [None]:
# Load the competition train and test splits, parsing `date` as datetimes.
train_data, test_data = (
    pd.read_csv(csv_path, parse_dates=['date'])
    for csv_path in (
        '../input/tabular-playground-series-jan-2022/train.csv',
        '../input/tabular-playground-series-jan-2022/test.csv',
    )
)

In [None]:
# The single (country, store, product) series studied in this notebook.
country = 'Finland'
store = 'KaggleMart'
product = 'Kaggle Mug'

In [None]:
# Restrict both splits to the selected series; get_df returns independent copies.
df, df_test = (
    get_df(split, country, store, product)
    for split in (train_data, test_data)
)

# Motivation for doing the detrending from scratch

But perhaps I should explain the reason for this detrending effort, when there are existing tools out there to perform detrending. Typically, it is taught that a time series can be decomposed into a trend component, a seasonal component and a residual component. To demonstrate that such an approach may not be desirable for the current dataset, let's try the well-known [STL decomposition](http://bit.ly/stl1990). We will use the implementation from [statsmodels](https://www.statsmodels.org). Since we would like to decompose the time series multiplicatively, we take the logarithm before passing in the data.

In [None]:
from statsmodels.tsa.seasonal import STL

# STL is additive, so decompose log(num_sold) and exponentiate each component
# to view the decomposition multiplicatively.
result = STL(np.log(df.num_sold), period= 365, seasonal=183).fit()
fig = plt.figure(figsize=(30, 6))
stl_panels = (
    (311, 'trend', result.trend),
    (312, 'seasonal', result.seasonal),
    (313, 'residual', result.resid),
)
for position, panel_title, component in stl_panels:
    fig.add_subplot(position)
    plt.plot(np.exp(component))
    plt.title(panel_title)
plt.show()

While the trend component looks reasonably smooth, the seasonal component seems to include a lot of high frequencies and looks much less smooth than the [textbook example](https://www.statsmodels.org/dev/examples/notebooks/generated/stl_decomposition.html) for the \\(\mbox{CO}_2\\) data. One big concern is that removing these high frequencies from the residual component might actually cause damage to the data when we eventually apply machine learning to the residual series. Another big question is, if we add the trend and seasonal components as features to the training data, how do we do that for the test data? There is no obvious way to do that. Training on the residual component would be pointless if we don't have the trend and seasonal components for the test data.

# Detrending the training data by monthly sales

We begin by adding some helper columns.

In [None]:
# Calendar helper columns. `date` was parsed as datetime when the CSV was read,
# so the vectorized `.dt` accessor replaces the per-row Python lambdas.
# `.dt.quarter` gives the same 1..4 value as (month - 1) // 3 + 1.
df['year'] = df.date.dt.year
df['month'] = df.date.dt.month
df['quarter'] = df.date.dt.quarter

In [None]:
# Display the frame, now including the year/month/quarter helper columns.
df

Now compute the quarterly sales (to be used later) and the monthly sales.

In [None]:
# Rows are the years 2015-2018, columns are quarters 1-4: total units sold.
quarterly_sales = np.array(
    [
        [df[(df.year == yr) & (df.quarter == qtr)]['num_sold'].sum() for qtr in range(1, 5)]
        for yr in range(2015, 2019)
    ],
    dtype=float,
)

# Turn each year's row into shares of that year's total (each row sums to 1).
quarterly_sales = quarterly_sales / np.sum(quarterly_sales, axis=1, keepdims=True)

In [None]:
# One [year, month, total units sold] record per calendar month of 2015-2018.
monthly_sales = [
    [yr, mo, df[(df.year == yr) & (df.month == mo)]['num_sold'].sum()]
    for yr in range(2015, 2019)
    for mo in range(1, 13)
]

monthly_sales_df = pd.DataFrame(monthly_sales, columns=['year', 'month', 'monthly_sales'])

In [None]:
# Attach each month's total to every daily row of that month.
# Dropping a pre-existing `monthly_sales` column first keeps this cell safe to
# re-run: joining a second time would otherwise raise on the overlapping name.
df = df.drop(columns='monthly_sales', errors='ignore').join(
    monthly_sales_df.set_index(['year', 'month']), on=['year', 'month']
)

Now simply remove the monthly sales from `num_sold`.

In [None]:
# Daily sales as a fraction of that month's total, rescaled by a constant 30.
# NOTE(review): 30 looks like a nominal month length - confirm the intended scale.
df['num_sold_resid'] = df['num_sold'].div(df['monthly_sales']).mul(30)
df

I claim that `num_sold_resid` is already detrended. Let's use statistical tests to ascertain that.

In [None]:
# Run the ADF and KPSS tests on the detrended series; each prints its own report.
test_for_stationarity(df['num_sold_resid'])

The p-value for the Dickey-Fuller test is actually very small (much smaller than the 0.05 threshold). Let's visualize what's left after removing the monthly sales.

In [None]:
# Detrended daily series over the whole training period.
fig, ax = plt.subplots(figsize=(30, 3))
ax.plot(df.date, df.num_sold_resid)
ax.set_title('num_sold_resid')
plt.show()

OK, looks pretty trendless. And what does the trend that we just removed look like?

In [None]:
# The removed trend: each day carries the total sales of its calendar month.
fig, ax = plt.subplots(figsize=(30, 3))
ax.plot(df.date, df.monthly_sales)
ax.set_title('monthly_sales')
plt.show()

This trend is piecewise constant, which may not look like the kind of smooth trends shown in textbooks, but we can be sure that no high-frequency information has been removed from the time series. For example, a spike due to a holiday is still in the data, to be learned by our regressor model.

# Feature engineering for the test data

So that was easy, detrending the training data. But we added a feature `monthly_sales` to the training data. We could remove that feature when we train on `num_sold_resid`, but we still need the feature for the test data because we need to combine the predictions from our regressor model with the monthly sales for the test data to produce our final predictions. To produce the `monthly_sales` column for the test data, we'll go through the following steps.

1. Estimate the annual sales for 2019 from the annual GDP data
2. Estimate the quarterly sales breakdown for 2019 from the quarterly GDP data
3. Estimate the monthly sales breakdown for 2019 from the quarterly sales breakdown from step 2 and past history (2015 - 2018) of monthly sales breakdown
4. Combine results from step 1 and step 3 to produce the monthly sales estimate in units sold

### Estimating the annual sales for 2019

The annual GDP data is highly (linearly) correlated with the annual sales data, so we just use linear regression fit and interpolate (since 2019 GDP actually drops).

In [None]:
from sklearn.linear_model import LinearRegression

df_gdp = pd.read_csv('../input/gdp-fin-nor-swe-20152019-multiple-sources/GDP_FIN_NOR_SWE_2015-2019_Multiple_Sources.csv')

# Keep the World Bank series in current prices / current exchange rates
# for the selected country.
gdp_row_filter = (
    (df_gdp['Measure'] == 'Current prices, current exchange rates')
    & (df_gdp['Data Source'] == 'World Bank')
    & (df_gdp['Country'] == country)
)
gdp_annual = df_gdp[gdp_row_filter].copy()

# Total units sold per training year.
annual_sales = df.groupby('year')['num_sold'].sum()

def project_sales(gdp,annual_sales,plot=True):
    """Project annual sales for the last GDP row from a linear fit on the earlier rows.

    Parameters
    ----------
    gdp : DataFrame
        Must have a ``Value`` column. Every row except the last is used, in
        order, as the regressor for ``annual_sales``; the last row is the
        point to project. The rows are assumed to be in chronological order
        and to line up one-to-one with ``annual_sales`` (not checked here).
    annual_sales : Series
        Observed annual sales, one value per fitted GDP row.
    plot : bool, default True
        If true, draw the observations, the fitted line and the projected
        point (with a +/- sigma error bar) on the current matplotlib axes.

    Returns
    -------
    (y_test, sigma) : tuple
        The projected sales value and the standard deviation of the
        in-sample residuals of the fit.
    """
    gdp_values = gdp['Value'].to_numpy()
    x = gdp_values[:-1].reshape((-1, 1))
    y = annual_sales.to_numpy()
    linreg = LinearRegression().fit(x, y)

    x_test = gdp_values[-1:].reshape((1, 1))
    y_test = linreg.predict(x_test)[0]

    # Spread of the in-sample residuals, used as a rough error bar.
    sigma = np.std(linreg.predict(x) - y)

    if plot:
        # Labels are attached to the artists themselves so the legend does not
        # depend on the order in which matplotlib collects its handles.
        plt.plot(x, y, 'x', label='GDP/annual sales data')
        plt.errorbar(x_test.ravel(), [y_test], fmt='o', capsize=3, yerr=sigma,
                     label='Projected 2019 annual sales')
        # Extend the drawn line to the projected point in case it lies outside
        # the range of the fitted data.
        lo = min(x.min(), x_test.min())
        hi = max(x.max(), x_test.max())
        plot_xrange = np.linspace(lo, hi, 100)
        plt.plot(plot_xrange, linreg.predict(plot_xrange.reshape((-1, 1))),
                 label='Fitted line')
        plt.legend(loc='upper left')
    return y_test, sigma

# Keep only the projected value; the residual spread (second return value) is
# discarded here. With the default plot=True this call also draws the fit.
annual_sales_2019,_ = project_sales(gdp_annual,annual_sales)

### Estimating quarterly sales breakdown

The quarterly GDP data does not correlate so well with the quarterly sales, so we cannot estimate the quarterly sales using linear regression as was done with the annual data. However, the quarterly sales breakdown (each quarter's share of the annual sales) can be predicted from the quarterly GDP data. For more details, please see [my other notebook](https://www.kaggle.com/siukeitin/tps012022-getting-quarterly-gdp-in-usd). In short, it involves determining the parameters for a projective transformation that maps the quarterly GDP breakdown to the quarterly sales breakdown.

In [None]:
gdp_quarterly_df = pd.read_csv('../input/gdp-fin-nor-swe-20152019-quarterly-imf/GDP_FIN_NOR_SWE_2015-2019_Quarterly_IMF.csv')

# One row per year for the selected country, columns Q1..Q4.
# The builtin `float` is used because the `np.float` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
gdp_quarterly = (
    gdp_quarterly_df[gdp_quarterly_df.Country == country][['Q1', 'Q2', 'Q3', 'Q4']]
    .to_numpy()
    .astype(float)
)

# Turn each year's row into shares of that year's total (each row sums to 1).
gdp_quarterly = gdp_quarterly / np.sum(gdp_quarterly, axis=1, keepdims=True)
gdp_quarterly

In [None]:
def obj_fn_q(x,gdp_quarterly,quarterly_sales):
    """Mean absolute error between rescaled GDP shares and sales shares.

    The four columns of ``gdp_quarterly`` are multiplied by ``(1, x[0], x[1],
    x[2])``, each row is renormalized to sum to 1, and the result is compared
    element-wise with ``quarterly_sales``.
    """
    # The first column's multiplier is fixed at 1; the other three come from x.
    scale = np.array([[1, x[0], x[1], x[2]]])
    scaled = scale * gdp_quarterly
    shares = scaled / np.sum(scaled, axis=1, keepdims=True)
    return np.mean(np.abs(shares - quarterly_sales))

In [None]:
from scipy.optimize import minimize
# Fit the three free column multipliers on all GDP rows except the last;
# quarterly_sales supplies the matching target shares.
result = minimize(
    obj_fn_q,
    (1, 1, 1),
    args=(gdp_quarterly[:-1, :], quarterly_sales),
    bounds=[(0, None)] * 3,
)
x1, x2, x3 = result.x

# Apply the fitted multipliers to the last GDP row and renormalize to shares.
quarterly_sales_2019 = np.array([1, x1, x2, x3]) * gdp_quarterly[-1, :]
quarterly_sales_2019 = quarterly_sales_2019 / quarterly_sales_2019.sum()
quarterly_sales_2019 # This is just the breakdown, not the actual sales

### Estimating monthly sales breakdown

Now we have the quarterly sales breakdown, but we need the monthly sales breakdown. On the other hand, we have 4 years of monthly sales examples from 2015 to 2018. We don't want to make up monthly sales patterns, but we are comfortable with convex combinations of them. In other words, a candidate monthly sales breakdown for 2019 is
$$
s = w_1\cdot s_{2015}+w_2\cdot s_{2016}+w_3\cdot s_{2017}+w_4\cdot s_{2018}
$$
where \\(w_1\\), \\(w_2\\), \\(w_3\\), \\(w_4\\) are parameters such that \\(w_1+w_2+w_3+w_4=1\\). Note that this modeling is more general than the commonly used exponential smoothing that has only one parameter (\\(\alpha\\)).

With the monthly breakdown \\(s\\) we can derive the corresponding quarterly breakdown \\(s_q\\) by combining every 3 months. We require \\(s_q\\) to match the quarterly sales breakdown from step 2 as closely as possible, in the least-squares (\\(\ell^2\\)) sense. Needless to say, the parameters \\(w_1\\), \\(w_2\\), \\(w_3\\), \\(w_4\\) are determined by constrained optimization.

In [None]:
from numba import njit

@njit(fastmath=True)
def obj_fn_m(w,data,sales_est):
    """Sum of squared differences between a weighted quarterly breakdown and ``sales_est``.

    ``data @ w`` combines the columns of ``data`` with the weights ``w``; the
    flattened result is summed in consecutive groups (indices 0-2, 3-5, 6-8
    and 9 onward) and compared with ``sales_est``.

    NOTE(review): presumably ``data`` is (12 months x n_years) and ``sales_est``
    has 4 entries - confirm against the caller.
    """
    # Weighted combination of the columns of `data`, flattened to 1-D.
    sales_m = (data@w).reshape((-1,))
    # Aggregate consecutive triples of entries into four totals.
    sales_q = np.array([sales_m[:3].sum(),sales_m[3:6].sum(),sales_m[6:9].sum(),sales_m[9:].sum()])
    # Squared-error objective.
    return np.power(sales_q-sales_est,2).sum()

In [None]:
# Arrange the monthly totals as a 12 x n_years matrix (one column per year) ...
monthly_totals = monthly_sales_df['monthly_sales'].to_numpy()
data = monthly_totals.reshape((-1, 12)).T
# ... and normalize each column to shares of that year's total.
data = data / data.sum(axis=0, keepdims=True)

In [None]:
from scipy.optimize import minimize, LinearConstraint

# Convex combination: each weight lies in [0, 1] and the four weights sum to 1.
weights_sum_to_one = LinearConstraint(np.ones((1, 4)), 1, 1)
result = minimize(
    obj_fn_m,
    (0.25, 0.25, 0.25, 0.25),
    args=(data, quarterly_sales_2019),
    method='trust-constr',
    bounds=[(0, 1)] * 4,
    constraints=weights_sum_to_one,
)

assert result.success
w = result.x
w

The result of the optimization says that we should combine mostly the monthly sales breakdown from 2018 and 2015.

### Combining step 1 and step 3

In [None]:
# Weighted monthly shares scaled by the projected annual total.
monthly_shares_2019 = np.ravel(data @ w)
monthly_sales_2019 = monthly_shares_2019 * annual_sales_2019
monthly_sales_2019

Let's visualize how well `monthly_sales_2019` matches `quarterly_sales_2019` when combining the months in each quarter.

In [None]:
from matplotlib import pyplot as plt
from matplotlib.ticker import FormatStrFormatter

fig = plt.figure(figsize=(8, 8))
quarter_positions = np.arange(4)
bar_width = 0.25

# Quarterly shares implied by the monthly estimate: sum each block of 3 months
# and divide by the annual total.
matched_breakdown = monthly_sales_2019.reshape((-1, 3)).sum(axis=1) / annual_sales_2019

plt.bar(quarter_positions, quarterly_sales_2019, width=bar_width)
plt.bar(quarter_positions + bar_width, matched_breakdown, width=bar_width)
plt.xticks(quarter_positions + bar_width, ['Q1', 'Q2', 'Q3', 'Q4'])
plt.legend(['Target (from GDP)', 'Matching'])
plt.title('2019 Quarterly Sales Distributions')

ax = plt.gca()
ax.set_yscale('log', base=0.1, subs=range(2, 10))
plt.tick_params(axis='y', which='minor')
ax.yaxis.set_minor_formatter(FormatStrFormatter("%.2f"))
plt.show()

Now add feature `monthly_sales` to the test data!

In [None]:
# Look up each test row's estimated monthly total by its month (1..12 -> 0..11).
# The lookup uses df_test's own dates: the original mapped over every row of
# test_data and relied on index alignment to discard the other series.
df_test['monthly_sales'] = monthly_sales_2019[df_test.date.dt.month.to_numpy() - 1]
df_test

In [None]:
fig, ax = plt.subplots(figsize=(30, 3))
ax.plot(df.date, df.monthly_sales, 'b-')

# Start the estimated segment at the last training point so the curves connect.
bridge_dates = [df.date.iloc[-1], *df_test.date]
bridge_sales = [df.monthly_sales.iloc[-1], *df_test.monthly_sales]
ax.plot(bridge_dates, bridge_sales, 'g-')

ax.set_title('monthly_sales (aggregated (2015-2018) and estimated (2019))')
plt.show()

# Conclusions

This concludes the effort to detrend the time series. The estimated monthly sales for 2019 need to be tested. A regressor model needs to be trained on `num_sold_resid`, its predictions combined with `monthly_sales` to produce predictions for `num_sold`. That would be the ultimate test. It would be my next task. If you are trying a similar approach, feel free to leave a comment.