In [None]:
!pip install pystan==2.19.1.1
!pip install prophet
!pip install neuralprophet[live]

In [None]:
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 50)
import matplotlib.pyplot as plt
import seaborn as sns
from prophet import Prophet
from neuralprophet import NeuralProphet

## 1. Introduction

The objective of this competition is to predict a full year of sales for three items at two stores located in three different countries. The three countries are Sweden, Finland and Norway. The two stores located in those countries are KaggleMart and KaggleRama. The three items sold in those stores are Kaggle Mug, Kaggle Hat and Kaggle Sticker.

In [None]:
# Categorical values that identify each individual sales series
countries = ['Sweden', 'Finland', 'Norway']
stores = ['KaggleMart', 'KaggleRama']
products = ['Kaggle Mug', 'Kaggle Hat', 'Kaggle Sticker']

# Read the competition files and convert the date column to datetime in one chain
df_train = (
    pd.read_csv('../input/tabular-playground-series-jan-2022/train.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)
df_test = (
    pd.read_csv('../input/tabular-playground-series-jan-2022/test.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

# Report the shape and the in-memory size (in MB) of both sets
print(f'Training Set Shape: {df_train.shape} - Memory Usage: {df_train.memory_usage().sum() / 1024 ** 2:.2f} MB')
print(f'Test Set Shape: {df_test.shape} - Memory Usage: {df_test.memory_usage().sum() / 1024 ** 2:.2f} MB')

## 2. Data Analysis

Sales of every country-store-product combination are visualized below. All of the sales series have very similar characteristics. Yearly and weekly seasonal fluctuations are quite strong, and the effects of holidays can be seen easily.

In [None]:
def visualize_ts(df, t, y, forecasts, start, end, country, store, product):

    """
    Plot one country-store-product series, optionally with forecast columns

    Parameters
    ----------
    df: pandas.DataFrame
        Frame that has 'country', 'store' and 'product' columns along with the t, y and forecast columns
    t: str
        Name of the time column; it becomes the x-axis
    y: str
        Name of the target column
    forecasts: list of str or None
        Names of forecast columns drawn on the same axis, or None to draw only the target
    start: str
        Inclusive lower bound compared against the time column
    end: str
        Exclusive upper bound compared against the time column
    country: str
        Value matched against the 'country' column
    store: str
        Value matched against the 'store' column
    product: str
        Value matched against the 'product' column
    """

    idx = (df[t] >= start) & (df[t] < end) & (df['country'] == country) & (df['store'] == store) & (df['product'] == product)
    # Filter and index the frame once instead of repeating it for every plotted line
    series = df.loc[idx].set_index(t)

    fig, ax = plt.subplots(figsize=(24, 6), dpi=100)
    ax.plot(series[y], linewidth=2, label=y)
    if forecasts is not None:
        for forecast in forecasts:
            ax.plot(series[forecast], linewidth=2, label=forecast)
    ax.tick_params(axis='x', labelsize=12.5, pad=10)
    ax.tick_params(axis='y', labelsize=12.5, pad=10)
    ax.set_title(f'[{start}, {end}) - {country} - {store} - {product}', size=20, pad=15)
    ax.legend(prop={'size': 18})
    plt.show()
    # The function is called in loops, so release the figure once it has been shown
    plt.close(fig)
    
    
# Every country-store-product combination, in the same order as nested loops would visit them
series_combinations = [(c, s, p) for c in countries for s in stores for p in products]

# Draw the whole training period of each series without any forecasts
for country, store, product in series_combinations:
    visualize_ts(df_train, 'date', 'num_sold', None, '2015-01-01', '2019-01-01', country, store, product)

## 3. Holidays

The highest fluctuations can be seen on New Year's Day and Easter. The effect of New Year's Day is very short and strong, whereas the effect of Easter is weaker and lasts longer.

In [None]:
# New Year's Day of every year covered by the training and test data.
# lower_window=-1 extends the holiday to the day before (New Year's Eve).
new_year = pd.DataFrame({
  'holiday': 'new_year',
  'ds': pd.to_datetime(['2015-01-01', '2016-01-01', '2017-01-01', '2018-01-01', '2019-01-01']),
  'lower_window': -1,
  'upper_window': 0,
})

# Easter Sunday of every year covered by the training and test data.
# upper_window=7 extends the holiday to the seven days that follow it.
easter = pd.DataFrame({
  'holiday': 'easter',
  'ds': pd.to_datetime(['2015-04-05', '2016-03-27', '2017-04-16', '2018-04-01', '2019-04-21']),
  'lower_window': 0,
  'upper_window': 7,
})

# ignore_index gives the combined frame unique row labels instead of 0..4 repeated twice
holidays = pd.concat((new_year, easter), ignore_index=True)
holidays

## 4. Metric

Submissions are evaluated on SMAPE (symmetric mean absolute percentage error) between forecasts and actual values. The advantages of SMAPE are that it can be interpreted as a percentage and that it is bounded between 0% and 200%.

In [None]:
def smape(y_true, y_pred):

    """
    Calculate symmetric mean absolute percentage error between actual values and forecasts

    Parameters
    ----------
    y_true: array-like of shape (n_samples)
        Actual values
    y_pred: array-like of shape (n_samples)
        Forecasts

    Returns
    -------
    score: float
        SMAPE as a percentage between 0 and 200

    Raises
    ------
    ValueError
        If y_true is empty
    """

    # Work on plain float arrays so lists and Series are compared by position
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    if y_true.size == 0:
        raise ValueError('smape requires at least one observation')

    numerator = 2 * np.abs(y_pred - y_true)
    denominator = np.abs(y_true) + np.abs(y_pred)
    # When both the actual value and the forecast are 0 the forecast is perfect,
    # so that term is 0 instead of the NaN that 0 / 0 would produce
    ratio = np.divide(numerator, denominator, out=np.zeros_like(numerator), where=denominator != 0)
    return np.mean(ratio) * 100

## 5. Prophet

In this section, Prophet is used for forecasting. [Prophet](https://facebook.github.io/prophet/) is a procedure for forecasting time series data based on an additive model where non-linear trends are fit with yearly, weekly, and daily seasonality, plus holiday effects. It works best with time series that have strong seasonal effects and several seasons of historical data.

Sales data between 2015-01-01 and 2018-01-01 are used as the training set and sales data between 2018-01-01 and 2019-01-01 are used as the validation set. Every country-store-product combination is modeled separately and the forecasts are combined afterwards.

In [None]:
# Training period is between 2015-01-01 and 2018-01-01
# Validation period is between 2018-01-01 and 2019-01-01
folds = [
    ('2015-01-01', '2018-01-01'),
    ('2018-01-01', '2019-01-01'),
]

for country in countries:
    for store in stores:
        for product in products:
            # Rows of the current country-store-product series; these masks do not
            # depend on the date windows, so they are built once per combination
            series_idx = (df_train['country'] == country) &\
                         (df_train['store'] == store) &\
                         (df_train['product'] == product)
            test_idx = (df_test['country'] == country) &\
                       (df_test['store'] == store) &\
                       (df_test['product'] == product)

            # Pair every fold with the one that follows it: the first is the training
            # window and the second is the validation window. The last fold is only
            # ever used for validation.
            for (start, end), (val_start, val_end) in zip(folds[:-1], folds[1:]):

                train_idx = series_idx &\
                            (df_train['date'] >= start) &\
                            (df_train['date'] < end)
                train = df_train.loc[train_idx, ['date', 'num_sold']].reset_index(drop=True)
                train = train.rename(columns={'date': 'ds', 'num_sold': 'y'})
                val_idx = series_idx &\
                          (df_train['date'] >= val_start) &\
                          (df_train['date'] < val_end)
                val = df_train.loc[val_idx, ['date', 'num_sold']].reset_index(drop=True)
                val = val.rename(columns={'date': 'ds', 'num_sold': 'y'})

                model = Prophet(
                    growth='linear',
                    holidays=holidays,
                    n_changepoints=10,
                    changepoint_range=0.4,
                    yearly_seasonality=True,
                    weekly_seasonality=True,
                    daily_seasonality=False,
                    seasonality_mode='additive',
                    seasonality_prior_scale=25,
                    holidays_prior_scale=100,
                    changepoint_prior_scale=0.01,
                    interval_width=0.5,
                    uncertainty_samples=False
                )
                model.fit(train)

                train_predictions = model.predict(train[['ds']])['yhat']
                val_predictions = model.predict(val[['ds']])['yhat']
                # Store validation forecasts next to the actual values for evaluation and plotting
                df_train.loc[val_idx, 'prophet_forecast'] = val_predictions.values

                train_score = smape(train['y'].values, train_predictions.values)
                val_score = smape(val['y'].values, val_predictions.values)
                # '.4f' prints four decimals (a bare '4f' would only set a minimum width)
                print(f'\nTraining Range [{start}, {end}) - {country} - {store} - {product} - Train SMAPE: {train_score:.4f}')
                print(f'Validation Range [{val_start}, {val_end}) - {country} - {store} - {product} - Validation SMAPE: {val_score:.4f}\n')

                # NOTE(review): the test forecast comes from the model fit on the training
                # window only; the validation window is not used for fitting
                test = df_test.loc[test_idx, ['date']].reset_index(drop=True)
                test = test.rename(columns={'date': 'ds'})
                test_predictions = model.predict(test[['ds']])['yhat']
                df_test.loc[test_idx, 'prophet_forecast'] = test_predictions.values


## 6. Neural Prophet

In this section, Neural Prophet is used for forecasting. [Neural Prophet](https://neuralprophet.com/html/contents.html) has a number of added features with respect to original Prophet. They are as follows.

* Gradient Descent for optimisation via using PyTorch as the backend
* Modelling autocorrelation of time series using AR-Net
* Modelling lagged regressors using a separate Feed-Forward Neural Network
* Configurable non-linear deep layers of the FFNNs
* Tuneable to specific forecast horizons (greater than 1)
* Custom losses and metrics

Sales data between 2015-01-01 and 2018-01-01 are used as the training set and sales data between 2018-01-01 and 2019-01-01 are used as the validation set. Every country-store-product combination is modeled separately and the forecasts are combined afterwards.

In [None]:
# Training period is between 2015-01-01 and 2018-01-01
# Validation period is between 2018-01-01 and 2019-01-01
folds = [
    ('2015-01-01', '2018-01-01'),
    ('2018-01-01', '2019-01-01'),
]

# Neural Prophet requires holidays to be in one-hot encoded format on all timesteps
# dtype is set explicitly so the indicator columns are integers on every pandas version
events = pd.concat((holidays['ds'], pd.get_dummies(holidays['holiday'], dtype=np.uint8)), axis=1)
event_columns = [column for column in events.columns if column != 'ds']


def add_event_columns(df):

    """
    Attach one-hot encoded event indicator columns to a frame that has a ds column

    Parameters
    ----------
    df: pandas.DataFrame
        Frame with a ds column of dates

    Returns
    -------
    df: pandas.DataFrame
        Left-merged copy of the frame in which every event column is 1 on the event date and 0 elsewhere
    """

    df = df.merge(events, on='ds', how='left')
    # Only the event columns are filled; filling the whole frame would also overwrite
    # missing values in y (e.g. the NaN placeholder of the test frame) with 0
    df[event_columns] = df[event_columns].fillna(0).astype(np.uint8)
    return df


for country in countries:
    for store in stores:
        for product in products:
            # Rows of the current country-store-product series; these masks do not
            # depend on the date windows, so they are built once per combination
            series_idx = (df_train['country'] == country) &\
                         (df_train['store'] == store) &\
                         (df_train['product'] == product)
            test_idx = (df_test['country'] == country) &\
                       (df_test['store'] == store) &\
                       (df_test['product'] == product)

            # Pair every fold with the one that follows it: the first is the training
            # window and the second is the validation window. The last fold is only
            # ever used for validation.
            for (start, end), (val_start, val_end) in zip(folds[:-1], folds[1:]):

                train_idx = series_idx &\
                            (df_train['date'] >= start) &\
                            (df_train['date'] < end)
                train = df_train.loc[train_idx, ['date', 'num_sold']].reset_index(drop=True)
                train = train.rename(columns={'date': 'ds', 'num_sold': 'y'})
                train = add_event_columns(train)
                val_idx = series_idx &\
                          (df_train['date'] >= val_start) &\
                          (df_train['date'] < val_end)
                val = df_train.loc[val_idx, ['date', 'num_sold']].reset_index(drop=True)
                val = val.rename(columns={'date': 'ds', 'num_sold': 'y'})
                val = add_event_columns(val)

                model = NeuralProphet(
                    growth='linear',
                    n_changepoints=10,
                    changepoints_range=0.4,
                    trend_reg=1,
                    trend_reg_threshold=False,
                    yearly_seasonality=True,
                    weekly_seasonality=True,
                    daily_seasonality=False,
                    seasonality_mode='additive',
                    seasonality_reg=1,
                    n_forecasts=365,
                    normalize='off'
                )
                # Same windows as the Prophet holidays frame: the day before New Year's Day
                # and the seven days after Easter
                model = model.add_events(['new_year'], mode='multiplicative', lower_window=-1)
                model = model.add_events(['easter'], mode='additive', upper_window=7)
                model.fit(train, freq='D')

                train_predictions = model.predict(train)['yhat1']
                val_predictions = model.predict(val)['yhat1']
                # Store validation forecasts next to the actual values for evaluation and plotting
                df_train.loc[val_idx, 'neural_prophet_forecast'] = val_predictions.values

                train_score = smape(train['y'].values, train_predictions.values)
                val_score = smape(val['y'].values, val_predictions.values)
                # '.4f' prints four decimals (a bare '4f' would only set a minimum width)
                print(f'\nTraining Range [{start}, {end}) - {country} - {store} - {product} - Train SMAPE: {train_score:.4f}')
                print(f'Validation Range [{val_start}, {val_end}) - {country} - {store} - {product} - Validation SMAPE: {val_score:.4f}\n')

                # NOTE(review): the test forecast comes from the model fit on the training
                # window only; the validation window is not used for fitting
                test = df_test.loc[test_idx, ['date']].reset_index(drop=True)
                test = test.rename(columns={'date': 'ds'})
                # The target is unknown for the test period, so y stays NaN
                test['y'] = np.nan
                test = add_event_columns(test)
                test_predictions = model.predict(test)['yhat1']
                df_test.loc[test_idx, 'neural_prophet_forecast'] = test_predictions.values


## 7. Evaluation

Forecasts look decent. The models were able to capture yearly and weekly seasonal fluctuations but struggle to fit some of the spikes.

In [None]:
# Overall validation score of each model across all series in the 2018 validation year
val_idx = (df_train['date'] >= '2018-01-01') & (df_train['date'] < '2019-01-01')
val_actuals = df_train.loc[val_idx, 'num_sold']

prophet_score = smape(val_actuals, df_train.loc[val_idx, 'prophet_forecast'])
print(f'Prophet - Validation SMAPE: {prophet_score:6f}')

neural_prophet_score = smape(val_actuals, df_train.loc[val_idx, 'neural_prophet_forecast'])
print(f'Neural Prophet - Validation SMAPE: {neural_prophet_score:6f}')

In [None]:
# Stack training and test rows so actual values and forecasts share one frame
df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)

# Every country-store-product combination, in the same order as nested loops would visit them
forecast_combinations = [(c, s, p) for c in countries for s in stores for p in products]

# Draw the validation and test periods of each series together with both forecasts
for country, store, product in forecast_combinations:
    visualize_ts(
        df_all,
        'date',
        'num_sold',
        ['prophet_forecast', 'neural_prophet_forecast'],
        '2018-01-01',
        '2020-01-01',
        country,
        store,
        product
    )

## 8. Submission

The test date range is selected and the data is converted to the submission format below. The forecasts of the Prophet and Neural Prophet models are blended and submitted.

In [None]:
# Rows that fall inside the 2019 test period
test_idx = (df_all['date'] >= '2019-01-01') & (df_all['date'] < '2020-01-01')

submission_columns = ['row_id', 'prophet_forecast', 'neural_prophet_forecast']
df_submission = df_all.loc[test_idx, submission_columns].reset_index(drop=True)

# Blend the two models with equal weights
blended_forecast = df_submission['prophet_forecast'].add(df_submission['neural_prophet_forecast']).div(2)
df_submission['num_sold'] = blended_forecast

df_submission[['row_id', 'num_sold']].to_csv('submission.csv', index=False)