In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.linear_model import LinearRegression

import matplotlib.pyplot as plt
import seaborn as sns

# Set Matplotlib defaults
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and
# newer releases no longer accept the old names, so use the new name when it
# is available and fall back to the legacy name on older installations.
_GRID_STYLE = (
    "seaborn-v0_8-whitegrid"
    if "seaborn-v0_8-whitegrid" in plt.style.available
    else "seaborn-whitegrid"
)
plt.style.use(_GRID_STYLE)
plt.rc("figure", autolayout=True, figsize=(13, 5))
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=16,
    titlepad=10,
)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the /kaggle/input directory tree and print the full path of every file found
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Motivation

This notebook is based on my learnings from the third lesson in Kaggle's [Time Series](https://www.kaggle.com/learn/time-series) course, *Seasonality*, applied to the January 2022 Tabular Playground Series challenge. There's a definite annual cycle to the sales of two out of three products that we're trying to predict, so I expect we'll be able to improve on the linear baseline by using seasonality in our models.

### Load the training data set

In [None]:
# Locations of the competition CSV files
TRAIN_CSV = '/kaggle/input/tabular-playground-series-jan-2022/train.csv'
TEST_CSV = '/kaggle/input/tabular-playground-series-jan-2022/test.csv'
SAMPLE_CSV = '/kaggle/input/tabular-playground-series-jan-2022/sample_submission.csv'

# Read the training data, parse the date column, and use it as a daily period index
train_df = (
    pd.read_csv(TRAIN_CSV, parse_dates=['date'])
    .set_index('date')
    .to_period('D')
)

In my previous two notebooks, I had a lot of code duplication that I'm trying to reduce. Let's start by defining a simple function to extract a subset of the data for a country, store, and product. This will be useful for creating charts and for building models.

In [None]:
def extract_subset(data_set, country, store, product):
    """Return a copy of the rows matching one country, store, and product.

    Parameters
    ----------
    data_set : DataFrame with 'country', 'store' and 'product' columns.
    country, store, product : values that the respective columns must equal.

    Returns
    -------
    A new DataFrame (a copy, so edits do not touch ``data_set``) containing
    only the matching rows, with their original index labels.
    """
    matches = (
        data_set['country'].eq(country)
        & data_set['store'].eq(store)
        & data_set['product'].eq(product)
    )
    return data_set.loc[matches].copy()

# Example: the rows for mugs sold at KaggleMart in Finland
subset_df = extract_subset(train_df, 'Finland', 'KaggleMart', 'Kaggle Mug')
subset_df

In [None]:
# Plot num_sold for the current subset against its date index
subset_df['num_sold'].plot(title='Time plot of Mug sales at KaggleMart in Finland');

I'm also stealing two functions for making seasonal plots and periodograms from the Kaggle Time Series [Seasonality tutorial](https://www.kaggle.com/ryanholbrook/seasonality/tutorial). Expand the following code section to view their source.

In [None]:
# annotations: https://stackoverflow.com/a/49238256/5769929
def seasonal_plot(X, y, period, freq, ax=None):
    """Draw one line per seasonal period and label each line at the right edge.

    Parameters
    ----------
    X : DataFrame holding the columns named by ``y``, ``period`` and ``freq``.
    y : name of the column plotted on the y-axis.
    period : name of the column whose distinct values each get their own line.
    freq : name of the column plotted on the x-axis.
    ax : Matplotlib axes to draw on; a new figure is created when omitted.

    Returns
    -------
    The axes that were drawn on.
    """
    if ax is None:
        ax = plt.subplots()[1]

    period_labels = X[period].unique()
    colors = sns.color_palette("husl", n_colors=X[period].nunique(),)
    ax = sns.lineplot(
        data=X,
        x=freq,
        y=y,
        hue=period,
        palette=colors,
        ci=False,
        legend=False,
        ax=ax,
    )
    ax.set_title(f"Seasonal Plot ({period}/{freq})")

    # Shared annotation settings: x is in axes coordinates (1 = right edge),
    # y is in data coordinates, and the text is nudged 6 points to the right.
    label_style = dict(
        xytext=(6, 0),
        xycoords=ax.get_yaxis_transform(),
        textcoords="offset points",
        size=14,
        va="center",
    )
    for drawn_line, label in zip(ax.lines, period_labels):
        last_y = drawn_line.get_ydata()[-1]
        ax.annotate(
            label,
            xy=(1, last_y),
            color=drawn_line.get_color(),
            **label_style,
        )
    return ax

def plot_periodogram(ts, detrend='linear', ax=None):
    """Plot the periodogram of a daily series on a cycles-per-year axis.

    Parameters
    ----------
    ts : array-like of observations, one per day.
    detrend : detrending mode forwarded to ``scipy.signal.periodogram``.
    ax : Matplotlib axes to draw on; a new figure is created when omitted.

    Returns
    -------
    The axes that were drawn on.
    """
    from scipy.signal import periodogram

    # Sampling frequency in observations (days) per year. This is the value
    # that pd.Timedelta("1Y") / pd.Timedelta("1D") produced; it is written as
    # a literal because pandas 2.x no longer accepts the ambiguous "Y" unit.
    fs = 365.2425
    frequencies, spectrum = periodogram(
        ts,
        fs=fs,
        detrend=detrend,
        window="boxcar",
        scaling='spectrum',
    )
    if ax is None:
        _, ax = plt.subplots()
    ax.step(frequencies, spectrum, color="purple")
    ax.set_xscale("log")
    # Tick positions are in cycles per year, matching the labels below
    ax.set_xticks([1, 2, 4, 6, 12, 26, 52, 104])
    ax.set_xticklabels(
        [
            "Annual (1)",
            "Semiannual (2)",
            "Quarterly (4)",
            "Bimonthly (6)",
            "Monthly (12)",
            "Biweekly (26)",
            "Weekly (52)",
            "Semiweekly (104)",
        ],
        rotation=30,
    )
    ax.ticklabel_format(axis="y", style="sci", scilimits=(0, 0))
    ax.set_ylabel("Variance")
    ax.set_title("Periodogram")
    return ax

Let's take a look at seasonal plots for the mug sales sample data over a week and over a year.

In [None]:
# Derive calendar columns from the date index for the seasonal plots
X = subset_df.assign(
    day=subset_df.index.dayofweek,        # days within a week: the x-axis (freq)
    week=subset_df.index.week,            # days within a week: the seasonal period (period)
    dayofyear=subset_df.index.dayofyear,  # days within a year: the x-axis (freq)
    year=subset_df.index.year,            # days within a year: the seasonal period (period)
)

# Top panel: one line per week; bottom panel: one line per year
fig, (ax0, ax1) = plt.subplots(2, 1, figsize=(11, 6))
seasonal_plot(X, y="num_sold", period="week", freq="day", ax=ax0)
seasonal_plot(X, y="num_sold", period="year", freq="dayofyear", ax=ax1);

In [None]:
# Periodogram of num_sold for the current subset
plot_periodogram(subset_df['num_sold']);

There's definitely very strong annual seasonality and weekly seasonality. There are several other time periods as well, but for this notebook I really just want to capture the annual seasons. There will be other notebooks in the series where I might explore other trends.

Let's see the sales, seasonal plots and periodograms for hat and sticker sales as well.

In [None]:
# Switch the working subset to hats at the same store and country, then plot num_sold
subset_df = extract_subset(train_df, 'Finland', 'KaggleMart', 'Kaggle Hat')
subset_df['num_sold'].plot(title='Time plot of Hat sales at KaggleMart in Finland');

In [None]:
# Derive calendar columns from the date index for the seasonal plots
X = subset_df.assign(
    day=subset_df.index.dayofweek,        # days within a week: the x-axis (freq)
    week=subset_df.index.week,            # days within a week: the seasonal period (period)
    dayofyear=subset_df.index.dayofyear,  # days within a year: the x-axis (freq)
    year=subset_df.index.year,            # days within a year: the seasonal period (period)
)

# Top panel: one line per week; bottom panel: one line per year
fig, (ax0, ax1) = plt.subplots(2, 1, figsize=(11, 6))
seasonal_plot(X, y="num_sold", period="week", freq="day", ax=ax0)
seasonal_plot(X, y="num_sold", period="year", freq="dayofyear", ax=ax1);

In [None]:
# Periodogram of num_sold for the current subset
plot_periodogram(subset_df['num_sold']);

In [None]:
# Switch the working subset to stickers at the same store and country, then plot num_sold
subset_df = extract_subset(train_df, 'Finland', 'KaggleMart', 'Kaggle Sticker')
subset_df['num_sold'].plot(title='Time plot of Sticker sales at KaggleMart in Finland');

In [None]:
# Derive calendar columns from the date index for the seasonal plots
X = subset_df.assign(
    day=subset_df.index.dayofweek,        # days within a week: the x-axis (freq)
    week=subset_df.index.week,            # days within a week: the seasonal period (period)
    dayofyear=subset_df.index.dayofyear,  # days within a year: the x-axis (freq)
    year=subset_df.index.year,            # days within a year: the seasonal period (period)
)

# Top panel: one line per week; bottom panel: one line per year
fig, (ax0, ax1) = plt.subplots(2, 1, figsize=(11, 6))
seasonal_plot(X, y="num_sold", period="week", freq="day", ax=ax0)
seasonal_plot(X, y="num_sold", period="year", freq="dayofyear", ax=ax1);

In [None]:
# Periodogram of num_sold for the current subset
plot_periodogram(subset_df['num_sold']);

For hat sales, the annual seasonality is even stronger than for other time periods. For sticker sales, the weekly seasonality period is much stronger.

### Estimate SMAPE score

To estimate our leaderboard score, we can train our models on the first three years of training data, then compute the SMAPE score for the last year of training data. I'll use the SMAPE function provided by CPMP during the [Web Traffic Time Series Forecasting](https://www.kaggle.com/c/web-traffic-time-series-forecasting/discussion/36414) competition a few years ago.

In [None]:
# https://www.kaggle.com/c/web-traffic-time-series-forecasting/discussion/36414

def SMAPE(y_true, y_pred):
    """Symmetric mean absolute percentage error, on a 0-200 scale.

    Each element contributes ``200 * |y_true - y_pred| / (|y_true| + |y_pred|)``;
    elements where both values are zero contribute 0.

    Parameters
    ----------
    y_true : array-like of actual values.
    y_pred : array-like of predicted values, broadcastable against ``y_true``.

    Returns
    -------
    The mean of the per-element errors, rounded to 5 decimal places.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Take the magnitude of BOTH terms: without it, a negative actual value
    # can cancel against the prediction and yield a zero or negative denominator.
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 200.0

    # Divide only where the denominator is non-zero; the remaining entries keep
    # their initial 0.0, which avoids 0/0 warnings and NaN intermediates.
    diff = np.zeros_like(denominator)
    np.divide(np.abs(y_true - y_pred), denominator, out=diff, where=denominator != 0)
    return np.round(np.mean(diff), 5)

In [None]:
# train on (2015, 2016, 2017) data, test on 2018
# Split on the date index: rows before 2018-01-01 are used for fitting,
# rows after 2017-12-31 are held out for validation.
df_pre_2018 = train_df.loc[train_df.index < '2018-01-01']
df_2018 = train_df.loc[train_df.index > '2017-12-31']

### Build the models and make predictions

In [None]:
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess

# Values of the 'country', 'product' and 'store' columns; train_and_predict
# fits one model for every combination of these.
countries = ['Finland', 'Norway', 'Sweden']
products = ['Kaggle Hat', 'Kaggle Mug', 'Kaggle Sticker']
stores = ['KaggleMart', 'KaggleRama']

def _build_deterministic_process(index, product):
    """Return the DeterministicProcess that generates the features for one product."""
    if product == 'Kaggle Sticker':
        # Use a simple linear model for sticker sales
        return DeterministicProcess(index=index, order=1)

    # Add an annual seasonal component for mug sales and for hats
    order = 1 if product == 'Kaggle Mug' else 2
    fourier = CalendarFourier(freq="A", order=order)  # sin/cos pairs for "A"nnual seasonality
    return DeterministicProcess(
        index=index,
        constant=True,               # dummy feature for bias (y-intercept)
        order=1,                     # trend (order 1 means linear)
        seasonal=True,               # weekly seasonality (indicators)
        additional_terms=[fourier],  # annual seasonality (fourier)
        drop=True,                   # drop terms to avoid collinearity
    )


def train_and_predict(train_df, test_df):
    """Fit one linear model per (country, product, store) and forecast the test rows.

    For every combination drawn from the module-level ``countries``,
    ``products`` and ``stores`` lists, the matching rows of ``train_df`` are
    used to fit a ``LinearRegression`` on deterministic time features, and a
    forecast is produced with as many steps as there are matching rows in
    ``test_df``. The forecast steps continue directly after the last training
    date, so the matching test rows are assumed to be the consecutive
    following dates.

    Parameters
    ----------
    train_df : DataFrame with 'country', 'store', 'product' and 'num_sold'
        columns; its index is passed to DeterministicProcess.
    test_df : DataFrame with 'country', 'store', 'product' and 'row_id' columns.

    Returns
    -------
    DataFrame with columns 'row_id' and 'y_pred' and a fresh 0..n-1 index
    (no repeated labels across the per-series chunks).
    """
    pred_dfs = []

    for country in countries:
        for product in products:
            for store in stores:
                df = extract_subset(train_df, country, store, product)
                test_sample = extract_subset(test_df, country, store, product)

                dp = _build_deterministic_process(df.index, product)

                # Features for the training dates, and for the forecast
                # horizon that immediately follows them
                X = dp.in_sample()
                X_fore = dp.out_of_sample(steps=len(test_sample))

                # Train the model on the target and make the forecast
                model = LinearRegression()
                model.fit(X, df['num_sold'])
                y_fore = model.predict(X_fore)

                pred_df = pd.DataFrame(
                    {'row_id': test_sample['row_id'], 'y_pred': y_fore}
                ).reset_index(drop=True)
                pred_dfs.append(pred_df)

    # ignore_index gives the combined frame one unique running index instead
    # of repeating each chunk's 0..k-1 labels
    return pd.concat(pred_dfs, ignore_index=True)

# Validation run: fit on the pre-2018 rows and forecast the held-out 2018 rows
preds_2018_df = train_and_predict(df_pre_2018, df_2018)
preds_2018_df

In [None]:
# Attach each prediction to its 2018 row via row_id; the left join keeps every 2018 row
val_pred_df = df_2018.merge(preds_2018_df, how='left', on='row_id')

In [None]:
# Extract actual and predicted sales as NumPy arrays and score them with SMAPE
actual = np.array(val_pred_df['num_sold'])
forecast = np.array(val_pred_df['y_pred'])
SMAPE(actual, forecast)

This is a much better score than we saw in previous notebooks! My linear baseline (and best score so far) was 15.6. Let's make a submission with these models.

### Load the test data set

In [None]:
# Keep the row_id for the sample submission
# The parsed 'date' column becomes the index; all other columns stay as regular columns
test_df = pd.read_csv(TEST_CSV, index_col='date', parse_dates=['date'])
test_df

### Re-train models and make predictions using all training data

In [None]:
# Final run: fit on the full training set and forecast the rows of the test set
preds_2019_df = train_and_predict(train_df, test_df)
preds_2019_df

In [None]:
# Attach each prediction to its test row via row_id; the left join keeps every test row
test_pred_df = test_df.merge(preds_2019_df, how='left', on='row_id')
test_pred_df

Let's combine the predictions with the original data to see how the forecasts look.

In [None]:
# One figure per (country, product, store): historical sales followed by the forecast
for country in countries:
    for product in products:
        for store in stores:
            df = extract_subset(train_df, country, store, product)
            pred_df = extract_subset(test_pred_df, country, store, product)
            pred_df = pred_df.rename(columns = {'y_pred': 'num_sold'})
            # Index the forecast rows with daily periods for 2019 so they share
            # the PeriodIndex type of train_df; a DatetimeIndex here would make
            # the concatenated index a mix of Period and Timestamp labels.
            pred_df.index = pd.period_range(start='2019-01-01', end='2019-12-31', freq='D')

            title = '{} - {} - {}'.format(country, product, store)
            sales_and_pred = pd.concat([df[['num_sold']], pred_df[['num_sold']]])

            sales_and_pred.plot(title=title, legend=False)

It looks like some of the 2019 predictions might be a little bit high, maybe due to the spike in sales at the end of 2018. This led to a LB score of 14.3, much higher than the estimate. Hopefully we'll see how to deal with this in one of the next lessons.

In [None]:
# Keep the id and forecast columns, rename the forecast to num_sold, and write the CSV
submission_df = test_pred_df[['row_id', 'y_pred']].rename(columns={'y_pred': 'num_sold'})
submission_df.to_csv('./submission.csv', index=False)