# Hybrid Model - Introduction #
- simple (linear) model recursively improved by the second (gradient boosting) model

- based on Kaggle Time Series Course: https://www.kaggle.com/learn/time-series

- motivation to share this notebook is presented in Discussion about course

Run the cell below to:
(1) import libraries;
(2) load data from files;
(3) set defaults for charts.

In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from statsmodels.tsa.deterministic import DeterministicProcess, CalendarFourier
from sklearn.metrics import mean_squared_log_error, mean_squared_error, mean_absolute_error
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.feature_selection import mutual_info_regression
from lightgbm import LGBMRegressor
from learntools.time_series.utils import (plot_lags, make_lags, make_leads,
                                          plot_multistep, make_multistep_target,
                                          plot_periodogram, seasonal_plot)
from pathlib import Path

# Load data from files
# NOTE: `infer_datetime_format` is deliberately not passed to `read_csv`; the
# option is deprecated since pandas 2.0, where format inference is the default.
comp_dir = Path('../input/store-sales-time-series-forecasting')

# Training data; 'date' is converted to a daily Period so that it lines up
# with the period-indexed tables built below.
train = pd.read_csv(comp_dir / 'train.csv',
                    parse_dates=['date'],
                    dtype={'store_nbr': 'category',
                           'family': 'category',
                           'sales': 'float32',
                           'onpromotion': 'uint64'})
train['date'] = train.date.dt.to_period(freq="D")

# Test data (no 'sales' dtype is declared for it).
test = pd.read_csv(comp_dir / 'test.csv',
                   parse_dates=['date'],
                   dtype={'store_nbr': 'category',
                          'family': 'category',
                          'onpromotion': 'uint32'})
test['date'] = test.date.dt.to_period(freq='D')

# Transactions keep a timestamp 'date' column here; it is converted to a
# Period later, where the table is used.
transactions = pd.read_csv(comp_dir / 'transactions.csv',
                           parse_dates=['date'],
                           dtype={'store_nbr': 'category',
                                  'transactions': 'uint32'})

stores = pd.read_csv(comp_dir / 'stores.csv')

# Oil prices, indexed by daily Period.
oil = pd.read_csv(comp_dir / 'oil.csv',
                  parse_dates=['date'])
oil = oil.set_index('date').to_period('D')

# Holidays and events, indexed by daily Period.
holidays_events = pd.read_csv(comp_dir / 'holidays_events.csv',
                              parse_dates=['date'],
                              dtype={'type': 'category',
                                     'locale': 'category',
                                     'locale_name': 'category',
                                     'description': 'category',
                                     'transferred': 'bool'})
holidays_events = holidays_events.set_index('date').to_period('D')


# Set Matplotlib defaults
sns.set_theme(color_codes=True)
plt.rc('figure',
       autolayout=True,
       figsize=(11, 4),
       titlesize=18,
       titleweight='bold')
plt.rc('axes',
       labelweight='bold',
       labelsize='large',
       titleweight='bold',
       titlepad=10,
       titlesize=16)
# Shared keyword arguments for the pandas `.plot` calls that draw the raw series.
plot_params = dict(color='0.75',
                   style='.-',
                   markeredgecolor='0.25',
                   markerfacecolor='0.25',
                   legend=False)

In the next cell we define some useful functions. Feel free to skip it and return here only when you need it.

In [None]:
# Share plot: EDA
def share_plot_eda(ts, START, END, shop, start_col, num_families, show_trend, show_model_1, show_improvement,
                   trend_frame=None, model_1_fit=None, model_1_predict=None, hybrid_predict=None):
    """Plot several families of one store at once, one subplot per family.

    ts: wide table of the series to plot, with (store, family) columns.
    START, END: date range of the plotted sales (and of the trend / fit overlays).
    shop: store key used to select columns of "ts".
    start_col, num_families: position of the first family and number of families to plot.
    show_trend: overlay "trend_frame['sales']" (required when True).
    show_model_1: overlay "model_1_fit['sales']" and "model_1_predict['sales']"
        (both required when True); a legend is drawn only in this case.
    show_improvement: overlay "hybrid_predict['sales']" (required when True).
    """
    families = ts[shop].columns[start_col:start_col + num_families]
    axs = ts[shop].loc(axis=1)[families].loc[START:END].plot(
        subplots=True, sharex=True, figsize=(11, 9), **plot_params, alpha=1)
    # Legend labels are collected in the order the lines are drawn, so every
    # overlay (including trend and hybrid) gets its own, correct label.
    labels = ['sales']
    if show_trend:
        _ = trend_frame['sales'][shop].loc(axis=1)[families].loc[START:END].plot(
            subplots=True, color='blue', alpha=0.5, linewidth=1.5, ax=axs, legend=False)
        labels.append('trend')
    if show_model_1:
        _ = model_1_fit['sales'][shop].loc(axis=1)[families].loc[START:END].plot(subplots=True, color='blue', ax=axs)
        _ = model_1_predict['sales'][shop].loc(axis=1)[families].plot(subplots=True, color='green', ax=axs)
        labels.extend(['fit', 'predict'])
    if show_improvement:
        _ = hybrid_predict['sales'][shop].loc(axis=1)[families].plot(subplots=True, color='red', ax=axs)
        labels.append('hybrid')
    for ax, family in zip(axs, families):
        if show_model_1:
            ax.legend(labels, loc='upper left')
        ax.set_title(family + " || " + shop)


# Share plot: residuals
def share_plot_residuals(ts, START, END, shop, start_col, num_families):
    """Draw residuals of several families of one store, one subplot per family.

    "ts" is the wide residuals table; "num_families" consecutive families are
    taken starting at column position "start_col" of store "shop", and the
    rows are restricted to the "START".."END" date range.
    """
    shop_frame = ts[shop]
    families = shop_frame.columns[start_col:start_col + num_families]
    selection = shop_frame.loc(axis=1)[families].loc[START:END]
    axes = selection.plot(subplots=True, sharex=True, figsize=(11, 9), alpha=0.5, style='.')
    # Title every subplot with "<family> || <shop>".
    for axis, family in zip(axes, families):
        axis.set_title(family + " || " + shop)



# Create features based on datetime index
def create_index_features(basic_set, steps, fourier_freq='M', fourier_order=4, seasonal=True, order=1,
                          holidays=True, oil_prices=True):
    """Create features derived from the datetime index of "basic_set".

    Uses DeterministicProcess and CalendarFourier to create:
    - a constant and a "trend-feature" of the given "order";
    - seasonal indicators ("seasonal");
    - Fourier features ("fourier_freq", "fourier_order") for long-time seasons;
    and optionally merges in:
    - indicators for holidays (global table "X_holidays");
    - oil prices (global table "oil").

    Returns (X_train, X_test): in-sample features for the index of "basic_set"
    and out-of-sample features for the "steps" periods that follow it, i.e.
    the validation or test set starts after the last day of the basic set.
    """
    fourier = CalendarFourier(freq=fourier_freq, order=fourier_order)
    dp = DeterministicProcess(index=basic_set.index,
                              constant=True,
                              order=order,
                              seasonal=seasonal,
                              additional_terms=[fourier],
                              drop=True)
    # Features for the first model: in-sample (train) and out-of-sample part
    X_train = dp.in_sample()
    X_test = dp.out_of_sample(steps=steps)
    X_test.index.rename('date', inplace=True)
    if holidays:
        # Days without a holiday have no match in X_holidays -> indicator 0
        X_train = X_train.merge(X_holidays, how='left', left_index=True, right_index=True).fillna(0)
        X_test = X_test.merge(X_holidays, how='left', left_index=True, right_index=True).fillna(0)
    if oil_prices:
        # Days without a quoted price take the previous known price, and the
        # leading gap (if any) takes the next one. `.ffill().bfill()` replaces
        # the deprecated `fillna(method=...)` spelling.
        X_train = X_train.merge(oil, how='left', left_index=True, right_index=True).ffill().bfill()
        X_test = X_test.merge(oil, how='left', left_index=True, right_index=True).ffill().bfill()
    return X_train, X_test


# Error function
def error_check(y_train_, y_fit_, y_valid_, y_pred_):
    """Print RMSLE, RMSE and MAE for train and validation data (nothing is returned)."""

    def _rooted(metric):
        # Turn a squared-error metric into its square-rooted version.
        return lambda truth, estimate: metric(truth, estimate) ** 0.5

    scorers = (
        ('RMSLE', _rooted(mean_squared_log_error)),
        ('RMSE', _rooted(mean_squared_error)),
        ('MAE', mean_absolute_error),
    )
    for label, scorer in scorers:
        # Both scores are computed before anything is printed for this metric.
        train_score = scorer(y_train_, y_fit_)
        valid_score = scorer(y_valid_, y_pred_)
        print(f'Training {label}: {train_score:.5f}')
        print(f'Validation {label}: {valid_score:.5f}')


# Prepare submission file
def prepare_submission(forecast, path):
    """Write the wide "forecast" table to "path" as a CSV with 'id' and 'sales' columns.

    The forecast is reshaped to long format and matched with the global "test"
    table on 'date', 'store_nbr' and 'family' to pick up the row ids.
    """
    long_forecast = forecast.unstack().reset_index()
    # The unstacked values end up in the last column; call it 'sales'.
    value_column = long_forecast.columns[-1]
    long_forecast = long_forecast.rename(columns={value_column: 'sales'})
    matched = test.merge(long_forecast, on=['date', 'store_nbr', 'family'])
    matched[['id', 'sales']].to_csv(path, index=False)


# Compute lags
def lets_lag(df, group, column, name, lag):
    """Return a one-column frame "name" holding "column" shifted by "lag" rows within each "group".

    Works on time series in long format; the result carries the index of "df".
    """
    shifted = df.groupby(group)[column].shift(lag)
    shifted.index = df.index
    return shifted.to_frame(name=name)


# Compute rolling means
def lets_roll(df, group, column, name, window, min_periods=1, center=False):
    """Compute the rolling mean of "column" within each "group".

    (In this notebook it is applied to the 1-lagged time series.)

    df: table in long format.
    group: key(s) passed to `groupby`.
    column: column label, or a one-element list of labels.
    name: name of the single column of the returned frame.
    window, min_periods, center: passed to `rolling`.

    Returns a one-column DataFrame aligned row by row with "df". `transform`
    keeps the original row order, so the result is correct even when "df" is
    not sorted by "group" (re-labelling a group-ordered result with `df.index`
    would silently misalign such rows).
    """
    rolled = df.groupby(group)[column].transform(
        lambda values: values.rolling(window=window, min_periods=min_periods, center=center).mean())
    if isinstance(rolled, pd.Series):
        # "column" was given as a plain label rather than a list
        rolled = rolled.to_frame()
    return rolled.rename(columns={rolled.columns[0]: name})


# Compute ewm
def lets_ewm(df, group, column, name, alpha):
    """Compute the exponentially weighted mean of "column" within each "group".

    (In this notebook it is applied to the 1-lagged time series.)

    df: table in long format.
    group: key(s) passed to `groupby`.
    column: column label, or a one-element list of labels.
    name: name of the single column of the returned frame.
    alpha: smoothing factor passed to `ewm`.

    Returns a one-column DataFrame aligned row by row with "df". `transform`
    keeps the original row order, so the result is correct even when "df" is
    not sorted by "group".
    """
    smoothed = df.groupby(group)[column].transform(
        lambda values: values.ewm(alpha=alpha).mean())
    if isinstance(smoothed, pd.Series):
        # "column" was given as a plain label rather than a list
        smoothed = smoothed.to_frame()
    return smoothed.rename(columns={smoothed.columns[0]: name})


# Create basic tables
def create_next_day_table(basic_set):
    """Build the feature tables for a one-day-ahead forecast.

    Given a base table in wide format (date index, ('sales', store, family)
    columns), appends a placeholder row for the next day, switches to long
    format and creates features: lags 1-7, rolling and exponentially weighted
    means of the 1-lagged series, calendar features, store attributes (global
    table "stores") and promotions (global table "onpromotion").

    Returns (before, after, target):
    - before: features for all days of "basic_set" with complete features;
    - after: features for the day right after the last day of "basic_set";
    - target: the 'sales' values for exactly the rows of "before".

    Note: the columns 'rolling_mean_6' and 'rolling_mean_12' hold 7-day and
    14-day rolling means; the names are kept so existing feature lists still work.
    """
    series_keys = ['store_nbr', 'family']
    row_keys = series_keys + ['date']
    last_day = basic_set.index.max()
    # Add the next day to the existing table (as a zero-filled float row, so
    # the table stays numeric) and transform the table to the long format
    next_day = pd.DataFrame(0.0, columns=basic_set.columns,
                            index=pd.period_range(start=last_day + 1, periods=1, name='date'))
    concat = pd.concat([basic_set, next_day]).fillna(0)
    concat = concat.stack(series_keys)
    concat = concat.reset_index()
    concat = concat.set_index(row_keys).sort_index()
    # Create 1 - 7 'lag' features; every helper result carries concat's index,
    # so the frames can be put side by side
    lag_frames = [lets_lag(df=concat, group=series_keys, column="sales", name=f"y_lag_{step}", lag=step)
                  for step in range(1, 8)]
    concat = pd.concat([concat] + lag_frames, axis=1)
    # Create rolling and ewm features of the 1-lagged series
    lagged = ['y_lag_1']
    smoothed_frames = [
        lets_roll(df=concat, group=series_keys, column=lagged, name="rolling_mean_3", window=3),
        lets_roll(df=concat, group=series_keys, column=lagged, name="rolling_mean_6", window=7),
        lets_roll(df=concat, group=series_keys, column=lagged, name="rolling_mean_12", window=14, min_periods=13),
        lets_ewm(df=concat, group=series_keys, column=lagged, alpha=0.1, name='ewm_01'),
        lets_ewm(df=concat, group=series_keys, column=lagged, alpha=0.3, name='ewm_03'),
        lets_ewm(df=concat, group=series_keys, column=lagged, alpha=0.9, name='ewm_09'),
    ]
    concat = pd.concat([concat] + smoothed_frames, axis=1)
    # Create features from index
    concat['day_of_week'] = concat.index.get_level_values(2).dayofweek
    concat['day_of_month'] = concat.index.get_level_values(2).day
    concat['month'] = concat.index.get_level_values(2).month
    concat['year'] = concat.index.get_level_values(2).year
    concat['fam'] = concat.index.get_level_values(1)
    concat['fam'], _ = concat['fam'].factorize()
    concat['store'] = concat.index.get_level_values(0)
    # Create features by merging with another data
    concat = concat.reset_index()
    # With 'stores'; both sides have a 'store_nbr' column, so the merge
    # suffixes them with _x (ours) and _y (from 'stores')
    concat['store'] = concat['store'].astype('int64')
    concat = concat.merge(stores[['store_nbr', 'type', 'cluster']],
                          how='left', left_on='store', right_on='store_nbr')
    concat = concat.drop(columns='store_nbr_y')
    concat = concat.rename(columns={'store_nbr_x': 'store_nbr'})
    concat['store_52'] = np.where(concat.store == 52, 1, 0)
    # With 'onpromotion'
    concat = concat.merge(onpromotion, on=row_keys)
    # Uncomment when you need to merge with X_holidays
    # concat = concat.merge(X_holidays, how='left', on='date')
    # concat.iloc[:, -13:] = concat.iloc[:, -13:].fillna(0)
    # Uncomment when you need to merge with 'holidays'
    # concat = concat.fillna(0)
    # concat = concat.merge(holidays, how='left', on='date')
    # fillna = concat.loc[:, 'is_holiday'].fillna(0).values
    # concat.loc[:, 'is_holiday'] = fillna
    # concat['is_holiday'] = concat.is_holiday.astype('int32')
    # concat = concat.drop(columns={'description'})
    # Uncomment when you need to merge with oil prices
    # concat = concat.merge(oil, how='left', on='date')
    # concat.loc[:, 'dcoilwtico'] = concat.loc[:, 'dcoilwtico'].ffill().bfill()
    # Fit-predict split
    before = concat[concat.date <= last_day].set_index(row_keys)
    after = concat[concat.date == (last_day + 1)].set_index(row_keys)
    # Create fit-table and target: keep only rows whose features are complete
    # (the first days lack lags / rolling means) and take the target from
    # exactly those rows, so both stay aligned whatever the first date is
    before_features = before.drop(columns='sales')
    complete = before_features.notna().all(axis=1).to_numpy()
    target = before.loc[complete, ['sales']]
    before = before_features[complete]
    # Create predict-table
    after = after.drop(columns='sales').dropna()
    return before, after, target


# Make prediction for the next day
def make_one_day_forecast(booster, features, before, after, target, clip):
    """Fit "booster" and predict the next day. Use in the recursive loop.

    booster: estimator with `fit` / `predict` methods.
    features: list of column names used as model inputs.
    before, target: training features and the corresponding target.
    after: features of the day to predict, indexed by (store_nbr, family, date).
    clip: when true, negative predictions are replaced by 0.0.

    Returns the prediction in wide format: one row per date, with
    ('sales', store_nbr, family) columns.
    """
    model = booster
    model.fit(before[features], target)
    # Predict once; clipping is applied to that single prediction
    model_pred = pd.DataFrame(model.predict(after[features]), index=after.index, columns=['sales'])
    if clip:
        model_pred = model_pred.clip(0.0)
    one_day_pred = model_pred.unstack(['store_nbr', 'family'])
    return one_day_pred


# Mutual information scores
def make_mi_scores(X, y, discrete_features='auto'):
    """Return the mutual information scores between the features "X" and the target "y".

    The result is a Series named "MI Scores", indexed by the columns of "X"
    and sorted from the highest score to the lowest.
    """
    raw_scores = mutual_info_regression(X, y, discrete_features=discrete_features)
    named_scores = pd.Series(raw_scores, name="MI Scores", index=X.columns)
    return named_scores.sort_values(ascending=False)


# Plot mi scores
def plot_mi_scores(scores):
    """Draw a horizontal bar chart of the scores, sorted in ascending order."""
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    plt.barh(positions, ordered)
    # One tick per bar, labelled with the feature name from the index.
    plt.yticks(positions, list(ordered.index))
    plt.title("Mutual Information Scores")


# Corrplot
def corrplot(df, method="pearson", annot=True, **kwargs):
    """Draw the correlation matrix of "df" as a clustered heatmap.

    "method" is passed to `DataFrame.corr`; "annot" and any extra keyword
    arguments are passed to `sns.clustermap`.
    """
    correlations = df.corr(method)
    sns.clustermap(correlations, vmin=-1.0, vmax=1.0, cmap="icefire", method="complete", annot=annot, **kwargs)

We also need to prepare our data (i.e. set indexes, transform to the wide format etc.).

In [None]:
# Prepare data
# Index the training data by (store_nbr, family, date) and pivot stores and
# families into columns: one row per date.
store_sales = train.set_index(['store_nbr', 'family', 'date']).sort_index()
store_sales_wide = store_sales[['sales', 'onpromotion']].unstack(['store_nbr', 'family'])  # wide format

# National and regional holidays between 2017-01-01 and 2017-08-31, keeping
# only the 'description' column; X_holidays is its one-hot encoding.
# NOTE(review): merging X_holidays by date assumes one holiday row per date - confirm.
holidays = holidays_events.query("locale in ['National', 'Regional']").loc['2017-01-01':'2017-08-31', ['description']] \
    .assign(description=lambda x: x.description.cat.remove_unused_categories())
X_holidays = pd.get_dummies(holidays)
# Added after the encoding above, so X_holidays has no 'is_holiday' column.
holidays['is_holiday'] = 1

# Same reshaping for the test data (only 'onpromotion' is taken from it).
store_sales_test = test.set_index(['store_nbr', 'family', 'date']).sort_index()
store_sales_test_wide = store_sales_test[['onpromotion']].unstack(['store_nbr', 'family'])  # wide format

# Promotions for train and test dates together, one row per (date, store_nbr, family).
onpromotion = pd.concat([store_sales_wide['onpromotion'], store_sales_test_wide['onpromotion']])
onpromotion = onpromotion.stack(['store_nbr', 'family'])  # back to long format
onpromotion = onpromotion.to_frame(name='onpromotion')

Now we are ready, so it is a good idea to do a short EDA to check (or just recall) what our data look like in general.

One can easily manipulate these plots by changing parameters of the function below. Shops are numbered from '1' to '54', and there are 33 families ordered alphabetically.

In [None]:
# EDA: Share-plots of data
# Raw sales only (no trend, fit or hybrid overlays): store '17', 4 families
# starting at column position 5.
share_plot_eda(ts=store_sales_wide['sales'], START='2017-01-01', END='2017-08-15',
               shop='17', start_col=5, num_families=4, show_trend=False, show_model_1=False, show_improvement=False)

Let's look at the trends. It seems that simple linear function will be quite a good choice here.

In [None]:
# Estimate trend
# Centered 90-row moving average; at least 45 observations are required, so
# the window may be incomplete near the ends of the series.
store_sales_trend = store_sales_wide[['sales']].rolling(window=90, center=True, min_periods=45).mean()

# Plot trend
# Same store and families as in the previous plot, with the trend overlaid.
share_plot_eda(ts=store_sales_wide['sales'], START='2017-01-01', END='2017-08-15',
               shop='17', start_col=5, num_families=4, show_trend=True, trend_frame=store_sales_trend,
               show_model_1=False, show_improvement=False)

Let's compare total daily transactions (used here as a proxy for total sales) with the 'holidays' data. This comparison will show that we can use the 'holidays' data as an informative feature in our model.

In [None]:
# EDA: Compare holidays with total transactions
# Sum only the numeric 'transactions' column: 'store_nbr' is categorical and
# cannot be summed (recent pandas versions raise instead of dropping it).
total = transactions.groupby('date')[['transactions']].sum().loc['2017'].reset_index()
total['date'] = total.date.dt.to_period(freq="D")
total = total.set_index('date')
holidays_restricted = holidays.loc['2017-01-01':'2017-08-15']

# Daily totals as a line, holidays marked on top of it.
# NOTE(review): `total.loc[...]` requires every holiday date to be present in
# `total` - confirm.
ax = total.plot(**plot_params)
plt.plot_date(holidays_restricted.index, total.loc[holidays_restricted.index], color='C3')
ax.set_title('National and Regional Holidays')
plt.show()
plt.clf()

# 1) Set parameters, create features based on datetime index and fit simple linear model 
Everything we did above was just a preparation. Let's begin then by setting special dates and splitting our basic dataset into the 'train' and 'validation' parts.

In [None]:
# Special dates
# The two anchor dates are looked up in `train` itself, so they have the same
# (daily Period) type as the 'date' column; the rest is integer arithmetic on
# them. Validation and test windows are 16 days long each (start + 15).
train_start = train.loc[train.date == '2017-01-01', 'date'].min()
train_end = train.loc[train.date == '2017-07-30', 'date'].min()
valid_start = train_end + 1
valid_end = valid_start + 15
test_start = valid_end + 1
test_end = test_start + 15
# One item per line: label, then the date.
print("Train start:", train_start,
      "Train end:", train_end,
      "Validation start:", valid_start,
      "Validation end:", valid_end,
      "Test start:", test_start,
      "Test end:", test_end,
      sep="\n")

In [None]:
# Basic sets: train & valid
# Label-based slices of the wide table (both ends inclusive), keeping only the
# 'sales' columns.
store_sales_train = store_sales_wide.loc[train_start: train_end][['sales']]
store_sales_valid = store_sales_wide.loc[valid_start: valid_end][['sales']]
# Train and validation periods together; used later for retraining.
store_sales_train_and_valid = store_sales_wide.loc[train_start: valid_end][['sales']]

Now we will use the function `create_index_features` to create features based on the datetime index. It may be beneficial to check how this function works (it was defined in the second cell of this notebook); one can also explore its output in the console. Then we will define the model (using the Ridge estimator - it seems to be better than Linear Regression), predict both on train and validation data and show the results.

In [None]:
# Create train & valid features
# steps=16 matches the 16-day validation window defined above.
X_train, X_valid = create_index_features(store_sales_train, steps=16, order=1, holidays=True, oil_prices=True,
                                         seasonal=True)

# Define the model and fit it
# No intercept is fitted by the estimator: create_index_features already
# requests a constant column (constant=True).
model_1 = Ridge(alpha=0.1, fit_intercept=False, random_state=1842)
# model_1 = LinearRegression(fit_intercept=False)
model_1.fit(X_train, store_sales_train)

# Predict on train data
# Negative predictions are clipped to 0.
model_1_fit = pd.DataFrame(model_1.predict(X_train), index=X_train.index, columns=store_sales_train.columns).clip(0.0)

# Predict on valid data
model_1_predict = pd.DataFrame(model_1.predict(X_valid), index=X_valid.index, columns=store_sales_train.columns) \
    .clip(0.0)

In [None]:
# Plot the result of the model
# Sales with the first model's fit (train period) and prediction overlaid.
share_plot_eda(ts=store_sales_wide['sales'], START='2017-05-01', END='2017-08-15',
               shop='17', start_col=5, num_families=4, show_trend=False,
               show_model_1=True, model_1_fit=model_1_fit, model_1_predict=model_1_predict, show_improvement=False)

Predictions look nice, but we also need to check them in general. Besides RMSLE (the competition's metric), we also use two other simple metrics: RMSE and MAE.

In [None]:
# Check error
# Prints RMSLE, RMSE and MAE of the first model on train and validation data.
error_check(y_train_=store_sales_train, y_fit_=model_1_fit, y_valid_=store_sales_valid, y_pred_=model_1_predict)

If we retrain the model defined above on all the data from the year 2017, i.e. on `store_sales_train_and_valid`, then predict on the test set and submit the result, we will get a test score of around 0.51. It seems quite good, but it is (nearly) the same as the score one can achieve in the third exercise ("Seasonality") of the Time Series Tutorial. So our aim is to improve this outcome.

This is the place where we start to run into trouble. I tried almost everything I had learned in the Tutorial about hybrid models and the ways they could improve the result. I started with the DirRec and Direct strategies: the scores were generally bad or even horrible; the best I was able to achieve was not to worsen the score of the first (Ridge) model. After some time it seemed to me that the only reasonable strategy was DirRec, but in a slightly different fashion than the one explained in the last lesson of the Time Series Tutorial. What difference do I mean? The 'RegressorChain' estimator uses the forecasts from the previous step as new lag features, but it also keeps every feature that was already used in the previous steps. Let's say we have 3 lag features at the beginning: 'lag1', 'lag2' and 'lag3'. After step 1 we have, in fact, four lag features: the old 'lag1', 'lag2', 'lag3' and the forecast, i.e. the new 'lag1' (the old 'lag1' actually becomes 'lag2' etc.; in particular, we now also have a 'lag4' feature!). When we don't use such 'remaining' features, our model seems to perform better in general. I tried to write some code; there were many trials. Finally I found a way to improve the result, and I will present it below.

In this version of my efforts I gave up using the HybridBooster class defined in the fifth lesson of the Tutorial. The idea of such a class was one of the most important things I learned from this amazing course, and I used such a class in my experiments with the DirRec and Direct strategies and also in other attempts. However, the way I eventually found seems to be clearer when we follow it step by step, i.e. without HybridBooster; at least it seems so to me.

# 2) Residuals and feature selection
At the beginning we need to compute the residuals of the first model. Then we will check:

(1) general statistics of the residuals (or rather of their absolute values);

(2) in which time series they are the greatest;

(3) what they look like.

In [None]:
# Compute residuals of the first model
# Actual sales minus the (clipped) in-sample fit, still in wide format.
residuals = store_sales_train - model_1_fit

# Long format:
residuals_long = residuals.stack(['store_nbr', 'family'])
residuals_long = residuals_long.rename(columns={'sales': 'residuals'})

# Greatest residuals
print("General description of residuals:")
print(np.abs(residuals_long).describe())

# Mean absolute residual per (store, family) series, largest first.
residuals_grouped = np.abs(residuals_long).groupby(['store_nbr', 'family']).mean().\
    sort_values(ascending=False, by='residuals')
print()
print("Greatest residuals on average:")
print(residuals_grouped.head())

In [None]:
# Plot residuals
# Residuals of store '17', 4 families starting at column position 5.
share_plot_residuals(ts=residuals['sales'], START='2017-01-01', END='2017-08-31', shop='17',start_col=5, num_families=4)

For EDA purposes we will use the function `create_next_day_table` which produces:

(1) train set `before`;

(2) column with the values of residuals: `target` (and also (3) validation set `after` - we will use it later).

We will check what the table with the train features looks like. You can see that it contains many features that we can use:

(1) lagged values;

(2) rolling means and exponentially weighted means of (1-step-lagged) time series;

(3) features derived from datetime index;

(4) other features such as (encoded) family, store, onpromotion etc.

In [None]:
# Create tables - just for EDA purposes
# The residuals of the first model serve as the base series here, so the lag
# features and the target are residuals.
before, after, target = create_next_day_table(basic_set=residuals[['sales']])

In [None]:
# What does the table look like? Show the first rows of the train features.
before.head()

As usual, one of the most difficult things is to choose the best features for our model. We will check:

(1) mutual information scores (you can compute them on a larger fraction `frac`, but the results will be similar);

(2) correlations between features and target;

(3) you can also check the relation between the selected feature (e.g. 'y_lag_1') and the target, i.e. residuals.

These things may be helpful to get some ideas about the features, at least for the good start.

In [None]:
# Mutual info
# Draw a 5% sample of the features and of the target.
# NOTE(review): the two samples are drawn independently; they pick the same
# rows only because both tables have the same length and the same
# random_state is used - confirm if either table changes.
sample_train = before.sample(frac=0.05, random_state=1842)
sample_target = target.sample(frac=0.05, random_state=1842)
sample_concat = pd.concat([sample_train, sample_target], axis=1)

# Only columns of the listed numeric dtypes are scored; the target is
# flattened to a 1-D array.
mi_scores = make_mi_scores(sample_train.select_dtypes(['uint64', 'int32', 'int64', 'float32', 'float64']),
                           np.ravel(sample_target))

In [None]:
# Plot mi scores
plot_mi_scores(mi_scores)

# Correlations between features and target
# sample_concat holds the sampled features together with the target column 'sales'.
corrplot(sample_concat.select_dtypes(['float32', 'float64', 'uint64']), annot=True)

In [None]:
# Relations of features and target
# Scatter plot of the 1-step lag against the target column 'sales' of the sample.
sns.relplot(x="y_lag_1", y="sales", data=sample_concat)

After endless trials I achieved the best score with the features you can see below, even though local validation was sometimes slightly better for other sets of features. It is not surprising, however - local validation is a bit arbitrary. Better validation strategies may be a further way to improve this model. I tried e.g. TimeSeriesSplit & GridSearch, but without satisfactory results.

I also tried other feature ideas; for example, I used the lags of the original time series (i.e. sales values) instead of the lags of the residuals. It worked worse. Rolling features were very promising, but turned out to be rather disappointing here. I also tried to move some features used by the first model, e.g. days of week or holidays, to the second model, but it didn't help either.

In [None]:
# Select features
# Columns produced by create_next_day_table that can be listed in `features`:
# 'y_lag_1', 'y_lag_2', 'y_lag_3', 'y_lag_4', 'y_lag_5', 'y_lag_6', 'y_lag_7'
# 'rolling_mean_3', 'rolling_mean_6', 'rolling_mean_12', 'ewm_01', 'ewm_03', 'ewm_09'
# 'day_of_week', 'day_of_month', 'month', 'year'
# 'fam', 'store', 'store_52', 'type', 'cluster', 'onpromotion'

# Inputs of the second model: the seven lags plus the store number and the
# store-52 indicator.
features = ['y_lag_1', 'y_lag_2', 'y_lag_3', 'y_lag_4', 'y_lag_5', 'y_lag_6', 'y_lag_7', 'store', 'store_52']

# 3) Improve the result by the hybrid model: DirRec Strategy
We need to define the basic set and the number of steps we want to predict. Later, i.e. when we predict on the test data, we will need to choose 16 steps, but here, for validation purposes, we can take fewer steps. The main reason to do so is that every step takes some time (around 40 seconds, so you need 2 minutes to complete step 3).

DirRec loop goes as follows:

(1) we start by establishing `basic_set` of residuals (train set);

(2) we make train features `before`, `target` and validation features `after` using `create_next_day_table` function that we have already seen in action;

(3) we make a forecast for the next day using `make_one_day_forecast` function;

(4) at last, one-day-forecast `one_day_pred`, concatenated with `basic_set`, becomes a new basic set.

In [None]:
# Define basic set
# Residuals of the first model on the train period.
basic_set = residuals[['sales']]

# Choose the number of steps
# Each step forecasts one more day.
n = 3

In [None]:
# DirRec strategy
# Every iteration rebuilds the feature tables from the current basic set, fits
# a fresh LGBMRegressor on them, forecasts one day and appends that forecast
# to the basic set, so it feeds the lag features of the next iteration.
for _ in range(n):
    t = time.time()
    before, after, target = create_next_day_table(basic_set=basic_set)
    one_day_pred = make_one_day_forecast(booster=LGBMRegressor(random_state=1842, n_estimators=100), features=features,
                                         before=before, after=after, target=target, clip=False)
    new_basic_set = pd.concat([basic_set, one_day_pred])
    basic_set = new_basic_set
    # Report the (0-based) step number and the seconds this step took.
    print("Step:")
    print(_)
    print("Time:")
    print(time.time() - t)

When the loop is over, we just get the predictions from the newest `basic_set` and add them to the corresponding part of the first model's prediction.

In [None]:
# Prediction of model_2, i.e. predicted residuals:
# The rows appended by the loop start at valid_start; `cut` is the last
# forecast day.
residuals_predict = basic_set.loc[valid_start:]
cut = residuals_predict.index.max()

# Add prediction of models 1 & 2
# Only the days up to `cut` have predicted residuals; negative sums are
# clipped to 0.
hybrid_predict = model_1_predict.loc[:cut] + residuals_predict
hybrid_predict = hybrid_predict.clip(0.0)

Now we can check the error of the hybrid model. Only the 'Validation' metrics differ from the earlier ones (i.e. from those of the Ridge model), because we didn't predict on the train set with the recursive strategy. Anyway, we can see some improvement. We will also check it on charts. The improvement is especially obvious in the problematic shop '52'.

In [None]:
# Check error
# Train metrics still come from the first model's fit; validation metrics use
# the hybrid prediction on the days up to `cut`.
error_check(y_train_=store_sales_train, y_fit_=model_1_fit, y_valid_=store_sales_valid.loc[:cut], y_pred_=hybrid_predict)

In [None]:
# Plot the results of the model
# Store '52': sales, first model's fit and prediction, and the hybrid prediction.
share_plot_eda(ts=store_sales_wide['sales'], START='2017-04-01', END='2017-08-31',
               shop='52', start_col=5, num_families=4, show_trend=False, show_model_1=True, show_improvement=True,
               model_1_fit=model_1_fit, model_1_predict=model_1_predict, hybrid_predict=hybrid_predict)

# 4) Retrain the model and submit the forecast
Now we need to retrain the model on the train set concatenated with the validation set. It seems to be a generally good idea to use the most 'up-to-date' data available (our validation set is of this kind); here, i.e. when we use a recursive strategy, it is necessary and crucial. Retraining is very simple: we just use `store_sales_train_and_valid` instead of `store_sales_train`.

In [None]:
# Retrain model on all data available: create train & test features
# The out-of-sample features now cover the 16 days after the validation period.
X_train, X_test = create_index_features(store_sales_train_and_valid, steps=16, order=1, holidays=True,
                                        oil_prices=True, seasonal=True)

# Define the model and fit it
model_1 = Ridge(alpha=0.1, fit_intercept=False, random_state=1842)
model_1.fit(X_train, store_sales_train_and_valid)

# Predict on train data
# This overwrites the model_1_fit computed for the validation experiment.
model_1_fit = pd.DataFrame(model_1.predict(X_train), index=X_train.index,
                           columns=store_sales_train_and_valid.columns).clip(0.0)

# Predict on test data
model_1_forecast = pd.DataFrame(model_1.predict(X_test), index=X_test.index,
                                columns=store_sales_train_and_valid.columns).clip(0.0)

In [None]:
# Plot the result of the model
# Store '52': sales, the retrained first model's fit and its test forecast.
share_plot_eda(ts=store_sales_wide['sales'], START='2017-04-01', END='2017-08-31',
               shop='52', start_col=5, num_families=4, show_trend=False,
               show_model_1=True, model_1_fit=model_1_fit, model_1_predict=model_1_forecast, show_improvement=False)

If you would like to submit the result of this simple model in the competition, you need to use `prepare_submission` function with argument forecast=model_1_forecast. We will use this function later, after hybrid improvement.

Now we are going to compute the residuals and make use of DirRec Loop again; this time we need to choose 16 steps, so it will take a while (around 10 minutes).

In [None]:
# Compute residuals of the first model
# Now over the train and validation periods together.
residuals = store_sales_train_and_valid - model_1_fit

# Define basic set
basic_set = residuals[['sales']]

# Choose the number of steps
# One step per day of the 16-day test period.
n = 16

In [None]:
# DirRec prediction
# Same loop as in the validation run: rebuild the feature tables, fit a fresh
# LGBMRegressor, forecast one day and append it to the basic set.
for _ in range(n):
    t = time.time()
    before, after, target = create_next_day_table(basic_set=basic_set)
    one_day_pred = make_one_day_forecast(booster=LGBMRegressor(random_state=1842, n_estimators=100), features=features,
                                         before=before, after=after, target=target, clip=False)
    new_basic_set = pd.concat([basic_set, one_day_pred])
    basic_set = new_basic_set
    # Report the (0-based) step number and the seconds this step took.
    print("Step:")
    print(_)
    print("Time:")
    print(time.time() - t)

In [None]:
# Prediction of model_2, i.e. predicted residuals:
# The rows appended by the loop start at test_start.
residuals_forecast = basic_set.loc[test_start:]

# Add prediction of models 1 & 2
# Negative sums are clipped to 0.
hybrid_forecast = model_1_forecast + residuals_forecast
hybrid_forecast = hybrid_forecast.clip(0.0)

In [None]:
# Plot the results of the model
# Store '52', 4 families starting at column position 27: sales, first model's
# fit and forecast, and the hybrid forecast.
share_plot_eda(ts=store_sales_wide['sales'], START='2017-04-01', END='2017-08-31',
               shop='52', start_col=27, num_families=4, show_trend=False, show_model_1=True, show_improvement=True,
               model_1_fit=model_1_fit, model_1_predict=model_1_forecast, hybrid_predict=hybrid_forecast)

As we can see, some improvements seem to make sense.

Finally, we can prepare our submission file and test our forecast. If you don't know how to do it, you can read the step-by-step instructions at the end of exercise 3 of the "Time Series Tutorial".

In [None]:
# Prepare submission file
# Writes the 'id' and 'sales' columns of the hybrid forecast to 'hybrid_forecast.csv'.
prepare_submission(forecast=hybrid_forecast,path='hybrid_forecast.csv')

# 5) Next steps
I hope I will find some way to improve the hybrid model's performance; however, I decided to share my work because at this point I don't see a clear way to do it. If someone can point one out, it would be much appreciated.

Nonetheless, it seems to me that it will be necessary to build not only one (hybrid), but many models to get significantly better results. I am going to try a few ideas: 'one model per store', 'one per family' or maybe even 'one per time series'. It will make things longer and more complicated, so I hope it is not the only way.