# Introduction

#### This notebook implements a Boosted Hybrid model based on Chapter 5 of the [Time Series Tutorial](https://www.kaggle.com/learn/time-series). The feature engineering is taken from [KDJ2020's Notebook](https://www.kaggle.com/code/dkomyagin/simple-ts-ridge-rf/notebook). I have purposely not included any EDA/charts in this notebook.

#### The idea of Hybrid Models is to use one algorithm to predict some components and another algorithm for the rest. This way we can always choose the best algorithm for each component. For Boosted Hybrid, we use one algorithm to fit the original series and then the second algorithm to fit the residual series. For more details please read this [chapter](https://www.kaggle.com/code/ryanholbrook/hybrid-models).

#### Please upvote if you find this useful.

# Import Libraries

In [None]:
import numpy as np
import pandas as pd

from sklearn.linear_model import Ridge
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess, Fourier

import warnings
warnings.filterwarnings("ignore")

# Fetching Dataset

In [None]:
# Directory holding the competition CSV files; file names are appended to it directly
path = "../input/store-sales-time-series-forecasting/"

In [None]:
# Load the training sales, reading only the columns used for modelling.
# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0 and is not needed to parse the `date` column.
df_train = pd.read_csv(path + 'train.csv',
                       usecols=['store_nbr', 'family', 'date', 'sales'],
                       dtype={'store_nbr': 'category', 'family': 'category', 'sales': 'float32'},
                       parse_dates=['date'])

# Convert timestamps to daily periods, then index by (store_nbr, family, date)
df_train['date'] = df_train['date'].dt.to_period('D')
df_train = df_train.set_index(['store_nbr', 'family', 'date']).sort_index()

In [None]:
# Load the test rows with the same key columns and dtypes as the training data.
# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0 and is not needed to parse the `date` column.
df_test = pd.read_csv(path + 'test.csv',
                      usecols=['store_nbr', 'family', 'date'],
                      dtype={'store_nbr': 'category', 'family': 'category'},
                      parse_dates=['date'])

# Convert timestamps to daily periods, then index by (store_nbr, family, date)
df_test['date'] = df_test['date'].dt.to_period('D')
df_test = df_test.set_index(['store_nbr', 'family', 'date']).sort_index()

# Feature Engineering

In [None]:
# Calendar frame with one row per day from 2013-01-01 to 2017-08-31
calendar = pd.DataFrame(index=pd.date_range('2013-01-01', '2017-08-31'))

# Oil moving average (mean over a rolling window of 7 rows of oil.csv)

data_oil = pd.read_csv(path + 'oil.csv', parse_dates=['date'], index_col='date')
data_oil['ma_oil'] = data_oil['dcoilwtico'].rolling(7).mean()

calendar = calendar.merge(data_oil, how='left', left_index=True, right_index=True)
# Forward-fill days without a moving-average value. The result is assigned
# back to the column: an in-place fill on `calendar['ma_oil']` acts on a
# temporary object and does not update `calendar` under pandas copy-on-write.
calendar['ma_oil'] = calendar['ma_oil'].ffill()

# Day of week

calendar['dofw'] = calendar.index.dayofweek

In [None]:
# Holiday Events

df_hev = pd.read_csv(path + 'holidays_events.csv', parse_dates=['date'])

# Good Friday correction: move the entry dated 2013-04-29 to 2013-03-29.
# Timestamps are compared explicitly instead of relying on a string key being
# coerced against the datetime column.
df_hev.loc[df_hev['date'] == pd.Timestamp('2013-04-29'), 'date'] = pd.Timestamp('2013-03-29')

df_hev = df_hev.set_index('date').sort_index()
df_hev = df_hev[df_hev['locale'] == 'National']
df_hev = df_hev.groupby(df_hev.index).first()  # Keep one event per day

In [None]:
# Work days

# Baseline: days with dofw 0-4 are working days, dofw 5-6 are not
calendar['wd'] = calendar['dofw'] <= 4

# Attach the national holiday/event columns (NaN on days without an entry)
calendar = calendar.merge(df_hev, how='left', left_index=True, right_index=True)

day_type = calendar['type']
is_holiday = day_type == 'Holiday'

# Event-driven overrides of the baseline, applied in order.
# `transferred` holds NaN on days without an event, so it is compared
# explicitly against True/False rather than negated.
overrides = [
    (day_type == 'Bridge', False),
    (day_type == 'Work Day', True),
    (day_type == 'Transfer', False),
    (is_holiday & (calendar['transferred'] == False), False),
    (is_holiday & (calendar['transferred'] == True), True),
]
for mask, is_workday in overrides:
    calendar.loc[mask, 'wd'] = is_workday

# Set up Training Data

In [None]:
# Training window (both bounds inclusive)
sdate, edate = '2017-01-01', '2017-08-15'

# Target matrix: pivot store_nbr and family into columns, leaving one row per
# date, then keep only the training window
sales_wide = df_train.unstack(['store_nbr', 'family'])
y = sales_wide.loc[sdate:edate]

In [None]:
# Deterministic Process

fourier = CalendarFourier(freq='W', order=4)

# Time-dependent features over the training dates: a first-order trend plus
# the Fourier terms above; no constant and no seasonal indicator columns.
dp = DeterministicProcess(
    index=y.index,
    constant=False,
    order=1,
    seasonal=False,
    additional_terms=[fourier],
    drop=True,
)
X = dp.in_sample()

# Adding calendar features to training feature matrix X.
# Raw values are copied (not index-aligned) because `calendar` and `X` are
# indexed by different objects.
train_calendar = calendar.loc[sdate:edate]
for feature, source in [('oil', 'ma_oil'), ('dofw', 'dofw'), ('wd', 'wd'), ('type', 'type')]:
    X[feature] = train_calendar[source].values

# One-hot encode day of week (first level dropped) and event type (all levels kept)
X = pd.get_dummies(X, columns=['dofw'], drop_first=True)
X = pd.get_dummies(X, columns=['type'], drop_first=False)

# Define Boosted Hybrid Model
Here we use one algorithm to fit the original series and then a second algorithm to fit the residuals left by the first; the final prediction is the sum of the two models' predictions.

In [None]:
# Container for the two algorithms of the hybrid model
class BoostedHybrid:
    """Pair of models used together as a boosted hybrid.

    Attributes are plain references to the objects passed in; nothing is
    fitted or validated at construction time.
    """

    def __init__(self, model_1, model_2):
        """Store the two models and reset the remembered target columns."""
        self.model_1, self.model_2 = model_1, model_2
        # Column names of the target frame; None until a fit routine sets them
        self.y_columns = None

In [None]:
# Training routine for the BoostedHybrid class
def fit(self, X, y):
    """Fit the first model on `y`, then the second model on its residuals.

    Parameters
    ----------
    X : DataFrame of features; its index is reused for the fitted values.
    y : DataFrame of targets; its columns are remembered in `self.y_columns`.
    """
    first_stage = self.model_1
    first_stage.fit(X, y)

    # In-sample predictions of the first model, labelled like the targets
    baseline = pd.DataFrame(first_stage.predict(X), index=X.index, columns=y.columns)

    # The second model learns what the first one left unexplained
    self.model_2.fit(X, y - baseline)

    # Remember the target column names for later predictions
    self.y_columns = y.columns

# Register the module-level `fit` function as a method of BoostedHybrid
setattr(BoostedHybrid, 'fit', fit)

In [None]:
# Prediction routine for the BoostedHybrid class
def predict(self, X):
    """Return the sum of both models' predictions for `X`.

    The result is a DataFrame indexed like `X` whose columns are
    `self.y_columns`.
    """
    def as_frame(values):
        # Label raw model output with the feature index and target columns
        return pd.DataFrame(values, index=X.index, columns=self.y_columns)

    return as_frame(self.model_1.predict(X)) + as_frame(self.model_2.predict(X))

# Register the module-level `predict` function as a method of BoostedHybrid
setattr(BoostedHybrid, 'predict', predict)

# Train Boosted Hybrid Model

In [None]:
# First stage: ridge regression; second stage: random forest.
# NOTE(review): the `normalize` argument of Ridge was deprecated in
# scikit-learn 1.0 and removed in 1.2, so this line needs an older
# scikit-learn - confirm the environment before upgrading.
ridge = Ridge(alpha=0.5, normalize=True)
forest = RandomForestRegressor(n_estimators=250, random_state=0)
model = BoostedHybrid(model_1=ridge, model_2=forest)

model.fit(X, y)

# In-sample predictions, floored at zero
y_pred = model.predict(X).clip(0.0)

In [None]:
# Reshape predictions and targets to long format: one row per (date, store_nbr, family)
y_pred = y_pred.stack(['store_nbr', 'family']).reset_index()
y_target = y.stack(['store_nbr', 'family']).reset_index()

# Both frames carry a fresh default index after reset_index, so this
# assignment pairs rows by position.
# NOTE(review): assumes both frames have the same rows in the same order - confirm.
y_target['sales_pred'] = y_pred['sales']

# Root mean squared log error per product family, computed once and reused
rmsle_by_family = y_target.groupby('family').apply(
    lambda a: np.sqrt(mean_squared_log_error(a['sales'], a['sales_pred'])))

# The printed value is the sum of the per-family RMSLEs (not a single MSLE)
print('Sum of per-family RMSLE:', rmsle_by_family.sum())
rmsle_by_family

# Testing Data and Predict

In [None]:
# Start and end date for test data
stest = '2017-08-16'
etest = '2017-08-31'

# Calendar rows of the forecast window; their count sets the forecast horizon
test_calendar = calendar.loc[stest:etest]
X_test = dp.out_of_sample(steps=len(test_calendar))

# Adding calendar features to test feature matrix X_test

X_test['oil']  = test_calendar['ma_oil'].values
X_test['dofw'] = test_calendar['dofw'].values
X_test['wd']   = test_calendar['wd'].values

X_test = pd.get_dummies(X_test, columns=['dofw'], drop_first=True)

# Give X_test exactly the training columns, in the training order. Dummy
# columns that exist only in X (the event `type_*` indicators) are filled
# with 0: no national level events in this period.
X_test = X_test.reindex(columns=X.columns, fill_value=0)

sales_pred = pd.DataFrame(model.predict(X_test), index=X_test.index, columns=y.columns)
# Floor forecasts at zero, as is done for the in-sample predictions: sales
# cannot be negative and the log-based metric is undefined for negative values
sales_pred = sales_pred.clip(0.0)
sales_pred = sales_pred.stack(['store_nbr', 'family'])

In [None]:
# Write the competition submission file.
# Forecasts are copied in by position, not matched on id.
# NOTE(review): assumes the rows of sample_submission.csv are in the same
# order as `sales_pred` - confirm.
submission_template = path + 'sample_submission.csv'
df_sub = pd.read_csv(submission_template, index_col='id')
df_sub.sales = sales_pred.to_numpy()
df_sub.to_csv('submission.csv')