In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from statsmodels.tsa.deterministic import DeterministicProcess

import matplotlib.pyplot as plt
import seaborn as sns

# Set Matplotlib defaults.
# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and
# Matplotlib 3.8 removed the old names, so use whichever spelling this
# installation provides (newer name first). If neither exists the default
# style is kept instead of raising.
for _style_name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid"):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
plt.rc("figure", autolayout=True, figsize=(13, 5))
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=16,
    titlepad=10,
)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Motivation

This notebook is based on what I learned from the fifth lesson in Kaggle's [Time Series](https://www.kaggle.com/learn/time-series) course, Hybrid Models, applied to the January 2022 Tabular Playground Series challenge. In my previous notebooks I applied linear models and a seasonal model to this challenge with mixed results. Hopefully a hybrid model is what's needed to improve further and advance a bit on the leaderboard.

### Load the training data set

In [None]:
# Locations of the competition files
TRAIN_CSV = '/kaggle/input/tabular-playground-series-jan-2022/train.csv'
TEST_CSV = '/kaggle/input/tabular-playground-series-jan-2022/test.csv'
SAMPLE_CSV = '/kaggle/input/tabular-playground-series-jan-2022/sample_submission.csv'

# Parse the date column, make it the index, and convert it to daily periods
train_df = (
    pd.read_csv(TRAIN_CSV, parse_dates=['date'])
    .set_index('date')
    .to_period('D')
)

In [None]:
def extract_subset(data_set, country, store, product):
    """
    Return the rows of `data_set` that belong to one sales series.

    A row is kept when its 'country', 'store' and 'product' columns all equal
    the given values. The result is a copy, so changing it leaves `data_set`
    untouched.
    """
    same_country = data_set['country'] == country
    same_store = data_set['store'] == store
    same_product = data_set['product'] == product
    return data_set.loc[same_country & same_store & same_product].copy()

# Pull out a single series (Finland / KaggleMart / Kaggle Mug) to prototype on
subset_df = extract_subset(train_df, 'Finland', 'KaggleMart', 'Kaggle Mug')
subset_df

### The BoostedHybrid class

The tutorial shows us how to define a `BoostedHybrid` class that will apply two different models to a data set and combine the results. This is a powerful idea that could be expanded to include any number of models, but I'd expect diminishing returns.

In [None]:
class BoostedHybrid:
    """
    Two-stage hybrid forecaster.

    `model_1` is fit on the target itself; `model_2` is then fit on the
    residuals `model_1` leaves behind. A prediction is the sum of both
    models' outputs.

    Both models only need `fit(X, y)` and `predict(X)` methods.
    """

    def __init__(self, model_1, model_2):
        self.model_1 = model_1
        self.model_2 = model_2
        self.y_columns = None  # target column names, set by fit()
        self.y_fit = None      # model_1's in-sample predictions (wide), set by fit()
        self.y_resid = None    # residuals model_2 was trained on (long), set by fit()

    def fit(self, X_1, X_2, y):
        """
        Fit `model_1` on (X_1, y), then `model_2` on (X_2, residuals).

        Parameters
        ----------
        X_1 : DataFrame of features for `model_1`; its index labels the fitted values.
        X_2 : features for `model_2`, one row per entry of the stacked
            (long-format) residuals.
        y : DataFrame of targets, indexed like `X_1`.

        Returns
        -------
        self, so calls can be chained.
        """
        # fit self.model_1
        self.model_1.fit(X_1, y)

        y_fit = pd.DataFrame(
            # make predictions with self.model_1
            self.model_1.predict(X_1),
            index=X_1.index, columns=y.columns,
        )

        # compute residuals and go wide -> long. stack() already returns a
        # Series here; a squeeze() would turn a single residual into a bare
        # scalar and break the fit below.
        y_resid = (y - y_fit).stack()

        # fit self.model_2 on residuals
        self.model_2.fit(X_2, y_resid)

        # Save column names for predict method
        self.y_columns = y.columns
        self.y_fit = y_fit
        self.y_resid = y_resid
        return self

    def predict(self, X_1, X_2):
        """
        Return `model_1`'s prediction plus `model_2`'s residual prediction.

        The result is a wide DataFrame indexed like `X_1`, with the target
        columns seen in `fit`.

        Raises
        ------
        RuntimeError
            If called before `fit`.
        """
        if self.y_columns is None:
            raise RuntimeError("BoostedHybrid.predict() called before fit()")

        y_pred = pd.DataFrame(
            # predict with self.model_1
            self.model_1.predict(X_1),
            index=X_1.index, columns=self.y_columns,
        )
        # wide to long; kept as a Series even for a single value so that
        # unstack() below still works for one-step forecasts
        y_pred = y_pred.stack()

        # add self.model_2 predictions to y_pred
        y_pred += self.model_2.predict(X_2)

        return y_pred.unstack()  # long to wide

We'll create a `BoostedHybrid` model for each of our 18 sales categories. We could pass different models for each category, but we'll start out keeping it simple and use the same models for all. Let's see what shape the data should take for each model.

In [None]:
# Target, kept as a one-column DataFrame
y = subset_df[['num_sold']]

# X_1: Features for Linear Regression
dp = DeterministicProcess(index=y.index, order=1)
X_1 = dp.in_sample()

# X_2: calendar features on the same index as X_1, without the trend column
X_2 = X_1.drop('trend', axis='columns').assign(
    day_of_week=X_1.index.dayofweek,
    day_of_month=X_1.index.day,
    day_of_year=X_1.index.dayofyear,
)

Now we'll split each of those data sets in two for training and validation.

In [None]:
# train on (2015, 2016, 2017) data, test on 2018
X_1_pre_2018 = X_1.loc[X_1.index < '2018-01-01']
X_1_2018 = X_1.loc[X_1.index > '2017-12-31']

# Each frame is masked with its own index, so the split stays correct even if
# X_1 and X_2 ever stop sharing an index.
X_2_pre_2018 = X_2.loc[X_2.index < '2018-01-01']
X_2_2018 = X_2.loc[X_2.index > '2017-12-31']

y_pre_2018 = y.loc[y.index < '2018-01-01']
y_2018 = y.loc[y.index > '2017-12-31']

### Train boosted hybrid

Create the hybrid model by instantiating `BoostedHybrid` with `LinearRegression()` and `XGBRegressor()` instances.

In [None]:
# Create LinearRegression + XGBRegressor hybrid with BoostedHybrid
model = BoostedHybrid(model_1=LinearRegression(), model_2=XGBRegressor())

# Train on the pre-2018 rows, forecast 2018, and floor the forecast at zero
model.fit(X_1_pre_2018, X_2_pre_2018, y_pre_2018)
y_pred = model.predict(X_1_2018, X_2_2018).clip(lower=0.0)

# Actual 2018 sales in gray with the hybrid forecast drawn on the same axes
axs = y_2018.plot(
    subplots=True, sharex=True, figsize=(13, 5), color='gray', alpha=0.5, legend=False,
    title='Finland KaggleMart Mug Sales',
)
y_pred.plot(subplots=True, color='C0', ax=axs, legend=False);

Some of the peaks aren't quite as high as expected, but this looks like a really good set of predictions for such a simple model. Let's look at how the same model does for Hat and Sticker sales.

In [None]:
subset_df = extract_subset(train_df, 'Finland', 'KaggleMart', 'Kaggle Hat')

y = subset_df[['num_sold']]

# X_1: Features for Linear Regression
dp = DeterministicProcess(index=y.index, order=1)
X_1 = dp.in_sample()

# X_2: calendar features for XGBoost (same index as X_1, trend column removed)
X_2 = X_1.copy()

X_2["day_of_week"] = X_2.index.dayofweek
X_2["day_of_month"] = X_2.index.day
X_2["day_of_year"] = X_2.index.dayofyear
X_2 = X_2.drop('trend', axis='columns')

# train on (2015, 2016, 2017) data, test on 2018
X_1_pre_2018 = X_1.loc[X_1.index < '2018-01-01']
X_1_2018 = X_1.loc[X_1.index > '2017-12-31']

# Mask X_2 with its own index rather than X_1's
X_2_pre_2018 = X_2.loc[X_2.index < '2018-01-01']
X_2_2018 = X_2.loc[X_2.index > '2017-12-31']

y_pre_2018 = y.loc[y.index < '2018-01-01']
y_2018 = y.loc[y.index > '2017-12-31']

# Create LinearRegression + XGBRegressor hybrid with BoostedHybrid
model = BoostedHybrid(
    model_1=LinearRegression(),
    model_2=XGBRegressor(),
)

# Fit and predict
model.fit(X_1_pre_2018, X_2_pre_2018, y_pre_2018)
y_pred = model.predict(X_1_2018, X_2_2018)

# Floor the forecast at zero
y_pred = y_pred.clip(0.0)

# Actual 2018 sales in gray with the hybrid forecast drawn on the same axes
axs = y_2018.plot(subplots=True, sharex=True, figsize=(13, 5), color='gray', alpha=0.5, legend=False,
                  title='Finland KaggleMart Hat Sales')
y_pred.plot(subplots=True, color='C0', ax=axs, legend=False);

In [None]:
subset_df = extract_subset(train_df, 'Finland', 'KaggleMart', 'Kaggle Sticker')

y = subset_df[['num_sold']]

# X_1: Features for Linear Regression
dp = DeterministicProcess(index=y.index, order=1)
X_1 = dp.in_sample()

# X_2: calendar features for XGBoost (same index as X_1, trend column removed)
X_2 = X_1.copy()

X_2["day_of_week"] = X_2.index.dayofweek
X_2["day_of_month"] = X_2.index.day
X_2["day_of_year"] = X_2.index.dayofyear
X_2 = X_2.drop('trend', axis='columns')

# train on (2015, 2016, 2017) data, test on 2018
X_1_pre_2018 = X_1.loc[X_1.index < '2018-01-01']
X_1_2018 = X_1.loc[X_1.index > '2017-12-31']

# Mask X_2 with its own index rather than X_1's
X_2_pre_2018 = X_2.loc[X_2.index < '2018-01-01']
X_2_2018 = X_2.loc[X_2.index > '2017-12-31']

y_pre_2018 = y.loc[y.index < '2018-01-01']
y_2018 = y.loc[y.index > '2017-12-31']

# Create LinearRegression + XGBRegressor hybrid with BoostedHybrid
model = BoostedHybrid(
    model_1=LinearRegression(),
    model_2=XGBRegressor(),
)

# Fit and predict
model.fit(X_1_pre_2018, X_2_pre_2018, y_pre_2018)
y_pred = model.predict(X_1_2018, X_2_2018)

# Floor the forecast at zero
y_pred = y_pred.clip(0.0)

# Actual 2018 sales in gray with the hybrid forecast drawn on the same axes
axs = y_2018.plot(subplots=True, sharex=True, figsize=(13, 5), color='gray', alpha=0.5, legend=False,
                  title='Finland KaggleMart Sticker Sales')
y_pred.plot(subplots=True, color='C0', ax=axs, legend=False);

There's definitely room for improvement, but this looks much better than my previous notebooks. Let's put this all together and see how it does on the entire data set.

In [None]:
# https://www.kaggle.com/c/web-traffic-time-series-forecasting/discussion/36414

def SMAPE(y_true, y_pred):
    """
    Symmetric mean absolute percentage error, in percent, rounded to 5 places.

    Each element contributes 200 * |y_true - y_pred| / (|y_true| + |y_pred|);
    elements where both values are zero contribute 0.

    Parameters
    ----------
    y_true, y_pred : array-likes of numbers with matching (or broadcastable)
        shapes. They are converted to float arrays, so values are compared by
        position.

    Returns
    -------
    The mean of the per-element errors, rounded to 5 decimal places.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Use magnitudes on both sides so opposite-sign pairs cannot cancel to zero
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 200.0
    abs_error = np.abs(y_true - y_pred)

    # Divide only where the denominator is non-zero; the remaining entries keep
    # the 0.0 they were initialised with, and no divide warning is emitted.
    diff = np.divide(
        abs_error,
        denominator,
        out=np.zeros_like(abs_error),
        where=denominator != 0,
    )
    return np.round(np.mean(diff), 5)

In [None]:
# Hold out 2018 for validation; the earlier years (2015, 2016, 2017) are training data
before_2018 = train_df.index < '2018-01-01'
during_2018 = train_df.index > '2017-12-31'
df_pre_2018 = train_df.loc[before_2018]
df_2018 = train_df.loc[during_2018]

In [None]:
# Display the held-out 2018 rows
df_2018

### Build the models and make predictions

In [None]:
# Category values iterated over by the loops below: one series per
# (country, product, store) combination, 3 * 3 * 2 = 18 in total.
countries = ['Finland', 'Norway', 'Sweden']
products = ['Kaggle Hat', 'Kaggle Mug', 'Kaggle Sticker']
stores = ['KaggleMart', 'KaggleRama']

def _calendar_features(index):
    """Build the day-of-week / day-of-month / day-of-year frame for `index`."""
    return pd.DataFrame(index=index).assign(
        day_of_week=index.dayofweek,
        day_of_month=index.day,
        day_of_year=index.dayofyear,
    )


def train_and_predict(train_df, test_df):
    """
    Fit one LinearRegression + XGBRegressor hybrid per series and forecast it.

    A series is one (country, product, store) combination taken from the
    module-level `countries`, `products` and `stores` lists. For each series
    the model is trained on its rows in `train_df` and asked for as many
    steps ahead as the series has rows in `test_df`.

    Parameters
    ----------
    train_df : DataFrame with 'country', 'store', 'product' and 'num_sold'
        columns, indexed by date.
    test_df : DataFrame with 'country', 'store', 'product' and 'row_id'
        columns, indexed by date.
        NOTE(review): forecasts are paired with `row_id` through the date
        index, so the test dates are presumably expected to follow directly
        after the training dates - confirm for new data.

    Returns
    -------
    DataFrame with columns 'row_id' and 'y_pred' (forecasts floored at zero)
    and a fresh 0..n-1 index.
    """
    pred_dfs = []

    for country in countries:
        for product in products:
            for store in stores:
                subset_df = extract_subset(train_df, country, store, product)
                test_sample = extract_subset(test_df, country, store, product)

                y = subset_df[['num_sold']]

                # X_1: Features for Linear Regression
                dp = DeterministicProcess(index=y.index, order=1)
                X_1 = dp.in_sample()
                # X_2: calendar features for XGBoost on the same index
                X_2 = _calendar_features(X_1.index)

                # Create features for a forecast.
                X_1_fore = dp.out_of_sample(steps=len(test_sample))
                X_2_fore = _calendar_features(X_1_fore.index)

                # Create LinearRegression + XGBRegressor hybrid with BoostedHybrid
                model = BoostedHybrid(
                    model_1=LinearRegression(),
                    model_2=XGBRegressor(),
                )

                # Fit and predict, flooring the forecast at zero
                model.fit(X_1, X_2, y)
                y_fore = model.predict(X_1_fore, X_2_fore).clip(0.0)

                pred_df = pd.DataFrame({'row_id': test_sample['row_id'],
                                        'y_pred': y_fore['num_sold']}).reset_index(drop=True)
                pred_dfs.append(pred_df)

    # ignore_index avoids repeating each series' 0..n-1 labels in the result
    return pd.concat(pred_dfs, ignore_index=True)

# Validation run: train on the pre-2018 rows and forecast the 2018 rows
preds_2018_df = train_and_predict(df_pre_2018, df_2018)
preds_2018_df

In [None]:
# Attach each forecast to its 2018 row via row_id
val_pred_df = pd.merge(df_2018, preds_2018_df, how='left', on='row_id')
val_pred_df

In [None]:
# Score the 2018 forecasts against the actual sales
actual = val_pred_df['num_sold'].to_numpy()
forecast = val_pred_df['y_pred'].to_numpy()
SMAPE(actual, forecast)

That's a big improvement over previous notebooks, but the estimated SMAPE score isn't always close to what I've seen on the leaderboard when making forecasts with new data. Let's load the test data and make a submission.

### Load the test data set

In [None]:
# Keep the row_id for the sample submission; index the rows by daily period
test_df = (
    pd.read_csv(TEST_CSV, parse_dates=['date'])
    .set_index('date')
    .to_period('D')
)
test_df

### Re-train models and make predictions using all training data

In [None]:
# Final run: train on every training row and forecast the test rows
preds_2019_df = train_and_predict(train_df, test_df)
preds_2019_df

In [None]:
# Attach each forecast to its test row via row_id
test_pred_df = pd.merge(test_df, preds_2019_df, how='left', on='row_id')
test_pred_df

Let's combine the predictions with the original data to see how the forecasts look.

In [None]:
# One figure per series: the training history followed by its forecast
for country in countries:
    for product in products:
        for store in stores:
            df = extract_subset(train_df, country, store, product)
            pred_df = extract_subset(test_pred_df, country, store, product)
            pred_df = pred_df.rename(columns={'y_pred': 'num_sold'})

            # test_pred_df no longer carries the dates (merging on a column
            # resets the index), so rebuild them as periods that continue from
            # the last training day. This gives history and forecast the same
            # index type and ties the range to the number of forecast rows
            # instead of a hard-coded year.
            pred_df.index = pd.period_range(start=df.index.max() + 1, periods=len(pred_df))

            title = '{} - {} - {}'.format(country, product, store)
            sales_and_pred = pd.concat([df[['num_sold']], pred_df[['num_sold']]])

            sales_and_pred.plot(title=title, legend=False)

These plots all look like reasonable forecasts. The models we used are extremely simple two-model hybrids without any hyperparameter tuning. There's still a lot of room for improvement by selecting different models for different products, tuning those models, and adding additional models to the `BoostedHybrid` class.

### Create the submission file

In [None]:
# Keep the id and forecast columns, naming the forecast `num_sold` in the CSV
submission_df = test_pred_df[['row_id', 'y_pred']].rename(columns={'y_pred': 'num_sold'})
submission_df.to_csv('./submission.csv', index=False)