In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found below the input directory
input_paths = (
    os.path.join(root, name)
    for root, _dirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Motivation

Since this month's Tabular Playground Series challenge has a time series dataset, this notebook is based on my learnings from the first lesson in Kaggle's [Time Series](https://www.kaggle.com/learn/time-series) course, *Linear Regression With Time Series*. I'm using multiple linear regression models (one for each country, product, store combination) without taking any seasonality into account.

The leaderboard results so far are very poor, but I like a nice low baseline to beat. I'll share other notebooks for this competition as I progress through the course.

### Load the training data set

In [None]:
# Locations of the competition files
TRAIN_CSV = '/kaggle/input/tabular-playground-series-jan-2022/train.csv'
TEST_CSV = '/kaggle/input/tabular-playground-series-jan-2022/test.csv'
SAMPLE_CSV = '/kaggle/input/tabular-playground-series-jan-2022/sample_submission.csv'

# Parse the dates while reading, then promote the date column to the index
train_df = pd.read_csv(TRAIN_CSV, parse_dates=['date']).set_index('date')
train_df

### View total sales figures by country, product, and store

In [None]:
# Total units sold for every (country, product, store) combination
sales_by_group = train_df.groupby(['country', 'product', 'store'])
sales_by_group['num_sold'].sum()

We're looking at sales for three different products in three different countries at two competing stores. Let's start by looking at plots for each product to see what trends we can spot.

### Plot sales for each product, country, and retailer

In [None]:
def sales_viz(df, country, product, store):
    """
    Plot the sales data for a given country, product, and store.

    Draws `num_sold` against `time_step` as a light grey line and overlays a
    seaborn regression plot (scatter points plus fitted line) on the same axes.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'country', 'store', 'time_step' and 'num_sold' columns.
        Rows are filtered by country and store only, so `df` should already
        be restricted to a single product.
    country : str
        Value matched against the 'country' column.
    product : str
        Used only to label the title and y-axis; it is NOT used for filtering.
    store : str
        Value matched against the 'store' column.
    """
    # Filter once and reuse the same rows for both layers of the plot
    subset = df.loc[(df['country'] == country) & (df['store'] == store)]

    _, ax = plt.subplots(figsize=(16, 8))
    ax.plot('time_step', 'num_sold', data=subset, color='0.75')
    # Pass the axes explicitly instead of relying on matplotlib's "current axes"
    sns.regplot(x='time_step', y='num_sold', data=subset,
                ci=None, scatter_kws=dict(color='0.25'), ax=ax)

    title = "Time Plot of {product} Sales at {store} in {country}".format(product=product,
                                                                          store=store,
                                                                          country=country)
    ax.set_title(title)
    ax.set_xlabel('Time')
    ax.set_ylabel('{product}s Sold'.format(product=product))
    

In [None]:
hat_sales = train_df.loc[train_df['product'] == 'Kaggle Hat'].copy()
# Create a time step feature that counts observations *within* each
# (country, store) series. A plain arange over the whole frame would advance
# once per row, i.e. several steps per date, because every date holds one row
# for each country/store pair.
hat_sales['time_step'] = hat_sales.groupby(['country', 'store']).cumcount().to_numpy()
hat_sales.shape

In [None]:
# One figure per country/store pair, in the same order as listing them by hand
for country in ('Finland', 'Norway', 'Sweden'):
    for store in ('KaggleMart', 'KaggleRama'):
        sales_viz(hat_sales, country, 'Hat', store)

In [None]:
mug_sales = train_df.loc[train_df['product'] == 'Kaggle Mug'].copy()
# Create a time step feature that counts observations *within* each
# (country, store) series. A plain arange over the whole frame would advance
# once per row, i.e. several steps per date, because every date holds one row
# for each country/store pair.
mug_sales['time_step'] = mug_sales.groupby(['country', 'store']).cumcount().to_numpy()
mug_sales.shape

In [None]:
# One figure per country/store pair, in the same order as listing them by hand
for country in ('Finland', 'Norway', 'Sweden'):
    for store in ('KaggleMart', 'KaggleRama'):
        sales_viz(mug_sales, country, 'Mug', store)

In [None]:
sticker_sales = train_df.loc[train_df['product'] == 'Kaggle Sticker'].copy()
# Create a time step feature that counts observations *within* each
# (country, store) series. A plain arange over the whole frame would advance
# once per row, i.e. several steps per date, because every date holds one row
# for each country/store pair.
sticker_sales['time_step'] = sticker_sales.groupby(['country', 'store']).cumcount().to_numpy()
sticker_sales.shape

In [None]:
# One figure per country/store pair, in the same order as listing them by hand
for country in ('Finland', 'Norway', 'Sweden'):
    for store in ('KaggleMart', 'KaggleRama'):
        sales_viz(sticker_sales, country, 'Sticker', store)

### Initial thoughts

We can see from the grouped totals that KaggleRama outsells KaggleMart in every country and for every product. The difference is nearly 2:1.

From the sales trends we can see that both companies have similar seasonality trends for hats and mugs in all three countries. Sticker sales do not seem to have the same seasonality, other than spikes during end-of-year holidays.

It's probably worthwhile to develop linear models for each country/product/store combination and combine the results to submit as a baseline.

### Estimate SMAPE score

To estimate our leaderboard score, we can train our models on the first three years of training data, then compute the SMAPE score for the last year of training data.

In [None]:
# Use one cutoff with complementary comparisons so the two partitions can
# never overlap and every dated row lands in exactly one of them.
VALIDATION_START = '2018-01-01'
df_pre_2018 = train_df.loc[train_df.index < VALIDATION_START]
df_2018 = train_df.loc[train_df.index >= VALIDATION_START]
df_2018

### Build the linear models

In [None]:
# Dimension values that train_linear_models and make_predictions iterate over;
# one model is handled per (country, product, store) combination.
countries = ['Finland', 'Norway', 'Sweden']
products = ['Kaggle Hat', 'Kaggle Mug', 'Kaggle Sticker']
stores = ['KaggleMart', 'KaggleRama']

In [None]:
def train_linear_models(data_df):
    """
    Fit one LinearRegression per (country, product, store) combination.

    The combinations come from the module-level `countries`, `products` and
    `stores` lists. For each one, the matching rows of `data_df` get a
    `time_step` column (0, 1, 2, ... in row order) which is the only feature
    used to predict `num_sold`.

    Returns
    -------
    (dict, dict)
        A dict of fitted models and a dict holding the number of training
        rows per combination, both keyed by (country, product, store).
    """
    fitted = {}
    row_counts = {}

    combos = [(c, p, s) for c in countries for p in products for s in stores]
    for key in combos:
        c, p, s = key
        mask = ((data_df['country'] == c) &
                (data_df['product'] == p) &
                (data_df['store'] == s))
        subset = data_df.loc[mask].copy()

        n_rows = len(subset.index)
        row_counts[key] = n_rows
        subset['time_step'] = np.arange(n_rows)  # time step feature

        # Fit num_sold as a linear function of the time step
        regressor = LinearRegression()
        regressor.fit(subset[['time_step']], subset['num_sold'])
        fitted[key] = regressor

    return fitted, row_counts

# Fit the validation models on the pre-2018 rows only. training_step_len keeps
# the number of training rows per combination so that predictions can continue
# the time step sequence where training stopped.
val_models, training_step_len = train_linear_models(df_pre_2018)
val_models

### Make predictions for 2018 sales

In [None]:
def make_predictions(data_df, training_step_len, models):
    """
    Predict `num_sold` for every (country, product, store) combination.

    The combinations come from the module-level `countries`, `products` and
    `stores` lists. For each one, the matching rows of `data_df` are given
    time steps that continue from the end of that combination's training data.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Rows to predict; must contain 'row_id', 'country', 'product' and
        'store' columns.
    training_step_len : dict
        Number of training rows per (country, product, store) key, used as
        the starting time step offset.
    models : dict
        Fitted models keyed by (country, product, store); each must expose
        `predict`.

    Returns
    -------
    pandas.DataFrame
        Columns 'row_id' and 'num_sold', grouped by combination, with a
        unique 0..n-1 index.
    """
    pred_dfs = []

    for country in countries:
        for product in products:
            for store in stores:
                key = (country, product, store)
                df = data_df.loc[(data_df['country'] == country) &
                                 (data_df['product'] == product) &
                                 (data_df['store'] == store)].copy()

                # Continue the time index where this combination's training data ended
                tsl = training_step_len[key]
                df['time_step'] = np.arange(tsl, tsl + len(df.index))

                y_pred = models[key].predict(df[['time_step']])
                pred_dfs.append(pd.DataFrame({'row_id': df['row_id'].to_numpy(),
                                              'num_sold': y_pred}))

    # ignore_index gives the combined frame unique labels instead of repeating
    # 0..k-1 once per combination
    return pd.concat(pred_dfs, ignore_index=True)

# Rename the prediction column so it stays distinct from the actual num_sold
# values when the two frames are merged.
preds_2018_df = make_predictions(df_2018, training_step_len, val_models)
preds_2018_df = preds_2018_df.rename(columns={'num_sold': 'y_pred'})
preds_2018_df

In [None]:
# Attach each prediction to its actual 2018 row via row_id
val_pred_df = pd.merge(df_2018, preds_2018_df, on='row_id', how='left')
val_pred_df

Now we're ready to compare the predictions to the actual number of each product sold. We'll need a [SMAPE (symmetric mean absolute percentage error)](https://en.wikipedia.org/wiki/Symmetric_mean_absolute_percentage_error) function to do that.

In [None]:
def smape(a, f):
    """
    Symmetric mean absolute percentage error, in percent (0 to 200).

    Parameters
    ----------
    a : array-like
        Actual values.
    f : array-like
        Forecast values.

    Returns
    -------
    float
        mean(2 * |f - a| / (|a| + |f|)) * 100. A pair where both the actual
        and the forecast are 0 contributes 0 (a perfect prediction) instead
        of turning the whole score into NaN through 0/0.

    Raises
    ------
    ZeroDivisionError
        If `a` is empty.
    """
    a = np.asarray(a, dtype=float)
    f = np.asarray(f, dtype=float)

    numerator = 2 * np.abs(f - a)
    denominator = np.abs(a) + np.abs(f)
    # Entries excluded by `where` keep the 0.0 already stored in `out`
    ratio = np.divide(numerator, denominator,
                      out=np.zeros_like(numerator), where=denominator != 0)
    return 1 / len(a) * np.sum(ratio * 100)

# Pull the two columns out as NumPy arrays and score the 2018 predictions
actual = val_pred_df['num_sold'].to_numpy()
forecast = val_pred_df['y_pred'].to_numpy()
smape(actual, forecast)

This is not a great score, but it's fine for a start. Our actual score should be a little bit better, since we'll be training on 4 years of data instead of only 3 before we make predictions for the test data.

### Load the test data set

In [None]:
# Keep the row_id for the sample submission
# row_id stays a regular column because the submission file needs it;
# only the parsed date column is moved into the index.
test_df = pd.read_csv(TEST_CSV, parse_dates=['date']).set_index('date')
test_df

### Re-train models on all training data

In [None]:
# Refit every model on the complete training set.
# NOTE: this rebinds training_step_len (previously the pre-2018 row counts) to
# the full-training-set row counts, so re-running the 2018 validation
# prediction cell after this one would use the wrong time step offsets.
train_models, training_step_len = train_linear_models(train_df)
train_models

### Make predictions for 2019 sales

In [None]:
# Time steps for the test rows continue from the end of the full training set,
# using the per-combination row counts returned by the retraining step.
preds_2019_df = make_predictions(test_df, training_step_len, train_models)
preds_2019_df

In [None]:
# Line the predictions up with the original test rows via row_id
test_pred_df = pd.merge(test_df, preds_2019_df, on='row_id', how='left')
test_pred_df

In [None]:
# Keep only the two submission columns and write them without the index
submission_df = test_pred_df.loc[:, ['row_id', 'num_sold']]
submission_df.to_csv('./submission.csv', index=False)