This notebook is a baseline application of [MINIROCKET](https://paperswithcode.com/paper/minirocket-a-very-fast-almost-deterministic). It uses the sktime library and simple trend and seasonality features. There are many more features that can be generated to increase performance. Please comment and upvote if you find it useful.

## Prep the environment

In [None]:
!pip install sktime --quiet --ignore-installed -U

In [None]:
import os

# Print the full path of every file found below the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

In [None]:
import random

import pandas as pd

# Fixed seed for Python's built-in `random` module, so repeated runs draw
# the same pseudo-random numbers from it.
seed = 53
random.seed(seed)

# Prepare Training Data

In [None]:
# Load only the columns needed for modelling. `parse_dates` alone handles the
# date column; the former `infer_datetime_format=True` argument is deprecated
# since pandas 2.0 (where it has no effect), so it is no longer passed.
train = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/train.csv',
    usecols=['store_nbr', 'family', 'date', 'sales'],
    dtype={
        'store_nbr': 'category',
        'family': 'category',
        'sales': 'float32',
    },
    parse_dates=['date'],
)
# Represent dates as daily periods and index every row by (store, family, day).
train['date'] = train.date.dt.to_period('D')
train = train.set_index(['store_nbr', 'family', 'date']).sort_index()
train.shape

In [None]:
from learntools.time_series.utils import plot_periodogram, seasonal_plot

# Restrict the training window to these dates to avoid the earthquake effect.
start_date = '2016-07-01'
end_date = '2017-08-15'

# Mean sales per day over all stores and families, limited to the window above.
daily_mean = train.groupby('date').mean().squeeze()
average_sales = daily_mean.loc[start_date:end_date]
plot_periodogram(average_sales)

There appears to be strong biweekly (paycheck-driven) seasonality and some weak annual seasonality. We therefore create trend and seasonal features based on annual and weekly frequencies.

In [None]:
# Wide target matrix: one row per date, one column per (store_nbr, family)
# pair, limited to the chosen training window.
sales_wide = train.unstack(['store_nbr', 'family'])
y = sales_wide.loc[start_date:end_date]
y.shape

In [None]:
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess, TimeTrend, Seasonality, Fourier

# Calendar Fourier terms: order 4 at weekly and order 6 at annual frequency.
fourier1 = CalendarFourier(freq='W', order=4)
fourier2 = CalendarFourier(freq='A', order=6)

# Deterministic features built on the training dates.
dp = DeterministicProcess(
    index=y.index,
    constant=True,                          # constant (intercept) column
    order=1,                                # trend of order 1
    seasonal=True,                          # seasonal terms
    additional_terms=[fourier1, fourier2],  # the Fourier terms defined above
    drop=True,                              # ask statsmodels to drop redundant terms
)

# In-sample feature matrix, one row per training date.
x = dp.in_sample()
x.shape

In [None]:
from sktime.datatypes._panel._convert import from_2d_array_to_nested, is_nested_dataframe

# sktime expects a nested DataFrame, so wrap the plain 2-D feature array.
# NOTE(review): presumably each row (date) becomes one instance whose "series"
# is that row's feature vector -- confirm against the sktime documentation.
X_2d = x.to_numpy()
X_nested = from_2d_array_to_nested(X_2d)

# Report what the conversion produced.
for message in (
    f"X_nested is a nested DataFrame: {is_nested_dataframe(X_nested)}",
    f"The cell contains a {type(X_nested.iloc[0,0])}.",
    f"The nested DataFrame has shape {X_nested.shape}",
):
    print(message)
X_nested.head()

# Transform features and fit

In [None]:
from sktime.transformations.panel.rocket import MiniRocket

# By default, MiniRocket uses ~10,000 kernels.
# Pass the notebook-wide `seed` explicitly: MiniRocket takes its own
# `random_state`, and seeding Python's `random` module alone does not reach it,
# so without this argument the transform is not reproducible between runs.
minirocket = MiniRocket(random_state=seed)
minirocket.fit(X_nested)
X_nested_transform = minirocket.transform(X_nested)
print(X_nested_transform.shape)

In [None]:
import numpy as np
from sklearn.linear_model import RidgeCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# `RidgeCV(normalize=True)` was deprecated in scikit-learn 1.0 and removed in
# 1.2, where it raises a TypeError. The replacement recommended by scikit-learn
# is a pipeline that scales the features first. To keep the same effective
# penalty as the old `normalize=True` behaviour, each alpha is multiplied by
# the number of training rows.
n_train_rows = X_nested_transform.shape[0]
model = make_pipeline(
    StandardScaler(with_mean=False),
    RidgeCV(alphas=np.logspace(-3, 3, 10) * n_train_rows, fit_intercept=True),
)
model.fit(X_nested_transform, y)

# Assess with competition metric


In [None]:
from sklearn.metrics import mean_squared_log_error
def RMSLE(y_true: np.ndarray, y_pred: np.ndarray) -> np.float64:
    """
    Compute the Root Mean Squared Log Error (RMSLE).

    :param y_true: Ground-truth values from the dataset
    :param y_pred: Predicted values to score against ``y_true``
    :return: Square root of the mean squared log error
    """
    msle = mean_squared_log_error(y_true, y_pred)
    return np.sqrt(msle)

In [None]:
# In-sample predictions on the transformed training features.
y_pred = model.predict(X_nested_transform)
# Negative predictions are set to zero before scoring.
is_negative = y_pred < 0
y_pred[is_negative] = 0.0
score = RMSLE(y, y_pred)
print(score)

Check the detrended and deseasonalized results

In [None]:
import matplotlib.pyplot as plt

# Residuals: what is left of the sales after subtracting the fitted values.
y_des = y - y_pred
y_des = pd.DataFrame(y_des, index=x.index, columns=y.columns)
# Long format: one row per (date, store_nbr, family) with a `sales` column.
y_des = y_des.stack(['store_nbr', 'family']).reset_index()
# Average only the numeric `sales` column. After reset_index the frame also
# holds the categorical `store_nbr` and `family` columns, and taking the mean
# of those raises a TypeError on pandas >= 2.0.
average_sales_des = y_des.groupby('date')['sales'].mean().loc[start_date:end_date]

# Periodogram of the original series (top) against the residuals (bottom).
fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True, sharey=True, figsize=(10, 7))
ax1 = plot_periodogram(average_sales, ax=ax1)
ax1.set_title("Sales Frequency Components")
ax2 = plot_periodogram(average_sales_des, ax=ax2);
ax2.set_title("Deseasonalized");

Check which product families are not well predicted

In [None]:
# Build the long-format predictions under a separate name instead of
# overwriting `y_pred`. Rebinding `y_pred` to a long DataFrame made this cell
# fail when run a second time and broke re-running the residual cell, which
# expects the wide prediction array.
y_pred_long = pd.DataFrame(y_pred, index=x.index, columns=y.columns)
y_pred_long = y_pred_long.stack(['store_nbr', 'family']).reset_index()
y_target = y.stack(['store_nbr', 'family']).reset_index().copy()
# Both frames are stacked the same way, so rows line up by position.
y_target['sales_pred'] = y_pred_long['sales'].clip(0.)
# RMSLE per product family.
results = y_target.groupby('family').apply(lambda r: RMSLE(r['sales'], r['sales_pred']))
results.plot(kind='barh', figsize=(4,8))

Future work should concentrate on food and drinks. 

# Prep Test Data

In [None]:
# Load the test set. `parse_dates` alone handles the date column; the former
# `infer_datetime_format=True` argument is deprecated since pandas 2.0 (where
# it has no effect), so it is no longer passed.
df_test = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/test.csv',
    dtype={
        'store_nbr': 'category',
        'family': 'category',
        'onpromotion': 'uint32',
    },
    parse_dates=['date'],
)
# Same daily-period index layout as the training frame.
df_test['date'] = df_test.date.dt.to_period('D')
df_test = df_test.set_index(['store_nbr', 'family', 'date']).sort_index()

# Create features for test set: the 16 steps that follow the training index.
X_test = dp.out_of_sample(steps=16)
X_test.index.name = 'date'

In [None]:
# Wrap the test-set features in the nested format, as was done for training.
X_2d = X_test.to_numpy()
X_nested = from_2d_array_to_nested(X_2d)

# Report what the conversion produced.
for message in (
    f"X_nested is a nested DataFrame: {is_nested_dataframe(X_nested)}",
    f"The cell contains a {type(X_nested.iloc[0,0])}.",
    f"The nested DataFrame has shape {X_nested.shape}",
):
    print(message)
X_nested.head()

In [None]:
# Apply the transformer that was already fitted on the training features;
# it must not be fitted again here.
X_nested_transform = minirocket.transform(X=X_nested)

In [None]:
## Make prediction and create submission file
y_pred = model.predict(X_nested_transform)

# Negative predictions are set to zero.
is_negative = y_pred < 0
y_pred[is_negative] = 0.0

# Wide frame: one row per forecast date, one column per (store_nbr, family).
y_submit = pd.DataFrame(y_pred, index=X_test.index, columns=y.columns)
y_submit.info()

# Reshape to long format, attach the test-set ids, keep only the two
# submission columns, and write the file.
y_submit = (
    y_submit
    .stack(['store_nbr', 'family'])
    .join(df_test['id'])
    .reindex(columns=['id', 'sales'])
)
y_submit.to_csv('submission.csv', index=False)