In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns

# Set Matplotlib defaults
# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and later
# dropped the old names, so pick whichever spelling this installation provides
# instead of hard-coding one that may raise on a fresh kernel.
_whitegrid_style = next(
    (
        style_name
        for style_name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid")
        if style_name in plt.style.available
    ),
    "default",  # neither spelling installed: keep Matplotlib's stock look
)
plt.style.use(_whitegrid_style)
plt.rc("figure", autolayout=True, figsize=(14, 6))
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)

# Shared keyword arguments for plotting the raw series with pandas' `.plot`.
plot_params = dict(
    color="0.75",
    style=".-",
    markeredgecolor="0.25",
    markerfacecolor="0.25",
    legend=False,
)

from sklearn.linear_model import LinearRegression

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the input directory.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for name in names_in_folder:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Motivation

This notebook is based on my learnings from the second lesson in Kaggle's [Time Series](https://www.kaggle.com/learn/time-series) course, *Trend*, applied to the January 2022 Tabular Playground Series challenge. I don't expect the results to be much improved over my [linear baseline](https://www.kaggle.com/bcruise/tps-jan-2022-linear-baseline/notebook), but it's worth a shot.

### Load the training data set

In [None]:
TRAIN_CSV = '/kaggle/input/tabular-playground-series-jan-2022/train.csv'
TEST_CSV = '/kaggle/input/tabular-playground-series-jan-2022/test.csv'
SAMPLE_CSV = '/kaggle/input/tabular-playground-series-jan-2022/sample_submission.csv'

# Read the training data, parse the dates, and index the frame by daily periods.
train_df = (
    pd.read_csv(TRAIN_CSV, parse_dates=['date'])
    .set_index('date')
    .to_period('D')
)
train_df

We'll start with just a small sample of the data so we can show a rolling average for one product, country, and store.

In [None]:
# Narrow the training data down to a single series:
# one product, then one country and one store within it.
hat_sales = train_df[train_df['product'] == 'Kaggle Hat'].copy()
in_finland = hat_sales['country'] == 'Finland'
at_kagglemart = hat_sales['store'] == 'KaggleMart'
data_sample = hat_sales.loc[in_finland & at_kagglemart, 'num_sold']

### Plotting a rolling average

In [None]:
# 365-day window with the average placed at the window's center; at least
# 183 observations (about half the window) are required to produce a value.
yearly_window = data_sample.rolling(window=365, center=True, min_periods=183)
trend = yearly_window.mean()  # could also be median, std, min, max, ...

# Draw the raw series first, then overlay the smoothed trend on the same axes.
ax = data_sample.plot(alpha=0.5, **plot_params)
ax = trend.plot(linewidth=3, legend=False, ax=ax)

The rolling average helps reveal whether there is an overall trend in the data: averaging over a window as long as the longest seasonal period (a year in this case) smooths out the shorter-term fluctuations. The data seems to be trending slightly upwards each year, but it looks fairly linear. This is why I suspect the results will not be much better than the baseline. Let's try a quadratic fit to see if it's any better.

In [None]:
from statsmodels.tsa.deterministic import DeterministicProcess

y = data_sample.copy()  # the target

# Instantiate `DeterministicProcess` with arguments appropriate for a quadratic trend model
dp = DeterministicProcess(index=data_sample.index, order=2)

# Create the feature set for the dates given in y.index
X = dp.in_sample()

# Create features for a 1-year forecast.
X_fore = dp.out_of_sample(steps=365)

model = LinearRegression()
model.fit(X, y)

# Wrap the raw prediction arrays in Series so they plot against the right dates.
y_pred = pd.Series(model.predict(X), index=X.index)
y_fore = pd.Series(model.predict(X_fore), index=X_fore.index)

# `y` holds the raw daily counts of one series, so title it as daily sales.
ax = y.plot(**plot_params, alpha=0.5, title="Daily Sales", ylabel="items sold")
ax = y_pred.plot(ax=ax, linewidth=3, label="Trend", color='C0')
ax = y_fore.plot(ax=ax, linewidth=3, label="Trend Forecast", color='C3')
# `plot_params` sets legend=False, so the labels above only show up once the
# legend is drawn explicitly.
_ = ax.legend()

Note the slight upwards curve of the trendline and forecast. The predictions for this model should be slightly higher than for a linear model.

### Estimate SMAPE score
To estimate our leaderboard score, we can train our models on the first three years of training data, then compute the SMAPE score for the last year of training data.

In [None]:
# Hold out 2018 for validation; everything before it is used for fitting.
is_before_2018 = train_df.index < '2018-01-01'
is_2018_onward = train_df.index > '2017-12-31'

df_pre_2018 = train_df[is_before_2018]
df_2018 = train_df[is_2018_onward]
df_2018

### Build the quadratic models and make predictions

In [None]:
from statsmodels.tsa.deterministic import DeterministicProcess

# Category values whose combinations define the individual series to model.
countries = [
    'Finland',
    'Norway',
    'Sweden',
]
products = [
    'Kaggle Hat',
    'Kaggle Mug',
    'Kaggle Sticker',
]
stores = [
    'KaggleMart',
    'KaggleRama',
]

def _select_series(frame, country, product, store):
    """Return the rows of `frame` that belong to one (country, product, store) series."""
    mask = ((frame['country'] == country) &
            (frame['product'] == product) &
            (frame['store'] == store))
    return frame.loc[mask]


def train_and_predict(train_df, test_df):
    """Fit a quadratic trend per (country, product, store) series and forecast it.

    For every combination drawn from the module-level `countries`, `products`
    and `stores` lists, a `LinearRegression` is fit on the trend features that
    `DeterministicProcess(order=2)` builds from the training rows' index. The
    model then predicts as many steps past the end of that index as there are
    matching rows in `test_df`.

    Forecast values are paired with `row_id` by position, so the test rows of
    each series are assumed to be in chronological order and to start right
    after the training rows. This is not checked here.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Needs the columns 'country', 'product', 'store' and 'num_sold'; its
        index is handed to `DeterministicProcess`.
    test_df : pandas.DataFrame
        Needs the columns 'country', 'product', 'store' and 'row_id'.

    Returns
    -------
    pandas.DataFrame
        Columns 'row_id' and 'y_pred' with a fresh 0..n-1 index. Combinations
        with no training rows or no test rows are skipped; if every combination
        is skipped, an empty frame with those two columns is returned.
    """
    pred_dfs = []

    for country in countries:
        for product in products:
            for store in stores:
                df = _select_series(train_df, country, product, store)
                test_sample = _select_series(test_df, country, product, store)

                # Nothing to fit on, or nothing to forecast: skip the series
                # rather than let the fit/predict calls fail on empty input.
                if df.empty or test_sample.empty:
                    continue

                y = df['num_sold']  # the target

                # Instantiate `DeterministicProcess` with arguments appropriate for a quadratic trend model.
                # If we use order=1 here, it will be exactly the same as a linear model.
                dp = DeterministicProcess(index=df.index, order=2)

                # Create the feature set for the dates given in y.index
                X = dp.in_sample()

                # Create features for a forecast, one step per test row.
                X_fore = dp.out_of_sample(steps=len(test_sample))

                # Train the model
                model = LinearRegression()
                model.fit(X, y)

                # Make the forecast
                y_fore = model.predict(X_fore)

                # Use the bare row_id values so both columns line up by position
                # on a plain 0..n-1 index.
                pred_dfs.append(pd.DataFrame({
                    'row_id': test_sample['row_id'].to_numpy(),
                    'y_pred': y_fore,
                }))

    if not pred_dfs:
        return pd.DataFrame(columns=['row_id', 'y_pred'])

    # ignore_index avoids repeating each per-series 0..n-1 index in the result.
    return pd.concat(pred_dfs, ignore_index=True)

# Validation run: fit on the years before 2018, forecast the 2018 rows.
preds_2018_df = train_and_predict(train_df=df_pre_2018, test_df=df_2018)
preds_2018_df

In [None]:
# Attach each forecast to its validation row through the shared row_id.
val_pred_df = pd.merge(df_2018, preds_2018_df, on='row_id', how='left')
val_pred_df

In [None]:
def smape(a, f):
    """Symmetric mean absolute percentage error (SMAPE), in percent.

    Each element contributes 2 * |f - a| / (|a| + |f|) * 100, and the
    contributions are averaged over `len(a)`. An element where both the actual
    and the forecast are 0 would be 0/0; it is scored as 0 (a perfect
    prediction) instead of turning the whole result into NaN.

    Parameters
    ----------
    a : array-like
        Actual values.
    f : array-like
        Forecast values, same shape as `a`.

    Returns
    -------
    float
        The SMAPE score.
    """
    a = np.asarray(a, dtype=float)
    f = np.asarray(f, dtype=float)

    numerator = 2 * np.abs(f - a)
    denominator = np.abs(a) + np.abs(f)

    # Divide only where the denominator is non-zero; the pre-filled zeros in
    # `out` are what the 0/0 elements end up contributing.
    ratios = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(denominator),
        where=denominator != 0,
    )
    return 1 / len(a) * np.sum(ratios * 100)

# Pull the true and predicted sales out as plain arrays and score them.
actual = val_pred_df['num_sold'].to_numpy()
forecast = val_pred_df['y_pred'].to_numpy()
smape(actual, forecast)

This score is (as expected) worse than using a linear model. We could try higher-order polynomials, but those are only likely to be even worse for forecasting.

There's not much use in making predictions to submit to the competition based on this model, but let's do it anyway. We've made it this far, and the hard part is already done.

### Load the test data set

In [None]:
# Index the test set by date; row_id stays a regular column because the
# submission file needs it.
test_df = pd.read_csv(TEST_CSV, parse_dates=['date']).set_index('date')
test_df

### Re-train models and make predictions using all training data

In [None]:
# Final run: fit on the full training set, forecast the competition test rows.
preds_2019_df = train_and_predict(train_df=train_df, test_df=test_df)
preds_2019_df

In [None]:
# Attach each forecast to its test row through the shared row_id.
test_pred_df = pd.merge(test_df, preds_2019_df, on='row_id', how='left')
test_pred_df

In [None]:
# Keep the two submission columns, naming the prediction column `num_sold`,
# then write the file.
submission_df = (
    test_pred_df[['row_id', 'y_pred']]
    .rename(columns={'y_pred': 'num_sold'})
)
submission_df.to_csv('./submission.csv', index=False)