# TPS - Jan 2022 - Time Series

In this notebook, I will try to implement what I have learned from the [time series course](https://www.kaggle.com/learn/time-series).

# Importing Libraries and Loading datasets

In [None]:
import math
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

import dateutil.easter as easter

from learntools.time_series.style import *  # plot style settings
from learntools.time_series.utils import plot_periodogram, make_lags, make_multistep_target, plot_multistep

from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess

from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import LinearRegression

from sklearn.multioutput import MultiOutputRegressor
from sklearn.multioutput import RegressorChain

In [None]:
# Load the competition data. The key columns are read as categoricals and the
# `date` column is parsed to datetimes while reading.
train_df = pd.read_csv(
    '../input/tabular-playground-series-jan-2022/train.csv',
    usecols=['country', 'store', 'product', 'date', 'num_sold'],
    dtype={
        'country': 'category',
        'store': 'category',
        'product': 'category',
        'num_sold': 'float32',
    },
    parse_dates=['date'],
)

test_df = pd.read_csv(
    '../input/tabular-playground-series-jan-2022/test.csv',
    usecols=['country', 'store', 'product', 'date'],
    dtype={
        'country': 'category',
        'store': 'category',
        'product': 'category',
    },
    parse_dates=['date'],
)

# Mean number of sales per date across all rows. The target column is selected
# explicitly so the categorical key columns are never passed to `mean()`.
average_sales = train_df.groupby('date')['num_sold'].mean()

# Explore Data

In [None]:
train_df.head(6)

In [None]:
print("Columns: {0}".format(list(train_df.columns)))

In [None]:
countries = train_df['country'].unique()
print("Countries: {0}".format(countries))

In [None]:
stores = train_df['store'].unique()
print("Stores: {0}".format(stores))

In [None]:
products = train_df['product'].unique()
print("Products: {0}".format(products))

In [None]:
start_date = train_df.date.min()
end_date = train_df.date.max()
print("Start and end date of the data: ({0}, {1})".format(start_date, end_date))

# Basic Data Check

In [None]:
print('Train data shape:', train_df.shape)
print('Test data shape:', test_df.shape)

In [None]:
# Count missing values per column and report only the columns that have any
# (an empty dict means nothing is missing).
missing_values_train = train_df.isna().sum()
print('Missing values in train data: {0}'.format(missing_values_train[missing_values_train > 0].to_dict()))

missing_values_test = test_df.isna().sum()
print('Missing values in test data: {0}'.format(missing_values_test[missing_values_test > 0].to_dict()))

In [None]:
duplicates_train = train_df.duplicated().sum()
print('Duplicates in train data: {0}'.format(duplicates_train))

duplicates_test = test_df.duplicated().sum()
print('Duplicates in test data: {0}'.format(duplicates_test))

# Helper functions

## Split data

In the time series course, the first lesson creates a time-step feature and a lag feature. The data used there contains only the dates and the target. Hence, to create the same features, I first need to split the data into the same format: one series per country/store/product combination. I am not sure whether doing that loses information, since I will fit a separate regression model on each piece.

In [None]:
def split_data(data):
    """Split `data` into one frame per (country, store, product) combination.

    The combinations are taken from the notebook-level `countries`, `stores`
    and `products` collections, iterated in that nesting order. Each returned
    frame keeps the rows (and the original index) of `data` that match one
    combination.
    """
    return [
        data.loc[
            (data['country'] == country_name)
            & (data['store'] == store_name)
            & (data['product'] == product_name)
        ]
        for country_name in countries
        for store_name in stores
        for product_name in products
    ]

In [None]:
splitted_train = split_data(train_df)
splitted_test = split_data(test_df)
splitted_train[0]

## Merge predictions

Function to merge predictions and sort the indices again, so that they can be used to create a submission after running the model.

In [None]:
def merge_predictions(test_predictions):
    """Concatenate the per-series predictions and order them by index.

    `test_predictions` is a list of Series; the result is a single Series
    sorted by index.
    """
    return pd.concat(test_predictions).sort_index()

## Create Submission

Very simple helper function to create a submission.

In [None]:
def create_submission(submission, predictions):
    """Write `predictions` to the CSV file `submission` and return the frame.

    The `row_id` column is the index of the notebook-level `test_df` shifted
    by the number of rows in `train_df`; `num_sold` holds `predictions`.
    """
    row_ids = test_df.index + len(train_df)
    submission_frame = pd.DataFrame({'row_id': row_ids, 'num_sold': predictions})
    submission_frame.to_csv(submission, index=False)
    return submission_frame

# Features and Modelling



# [Time-step feature](https://www.kaggle.com/ryanholbrook/linear-regression-with-time-series)

Let's apply what I have learned from the first lesson, the time-step feature.  

Start with the linear regression on number of sales.

In [None]:
# Scatter the sales of the first split series against an integer date code and
# overlay a linear regression line.
fig, ax = plt.subplots()

# Format the date columns so that it can be used with regplot
df = splitted_train[0].reset_index()
df = df.sort_values('date')
# Replace each date by an integer code, and keep a code -> date lookup for the tick labels
df['date_f'] = pd.factorize(df['date'])[0]
mapping = dict(zip(df['date_f'], df['date'].dt.date))

# Plot
ax.plot('date_f', 'num_sold', data=df, color='0.75')
ax = sns.regplot(x='date_f', y='num_sold', data=df, scatter_kws=dict(color='0.25'), line_kws=dict(linewidth=5))

# Set ticks, labels and titles
# One tick every 365 rows. `ticks` and `labels` are reused by the plotting
# cells further down the notebook.
ticks = np.array([df['date_f'][date] for date in range(0, len(df), 365)])
ax.set_xticks(ticks)
labels = pd.Series(ax.get_xticks()).map(mapping).fillna('')
ax.set_xticklabels(labels)
ax.set_title('Finland - KaggleMart - Kaggle Mug')
ax.set(xlabel='Date', ylabel='Number of sales');

Now create the time dummy and get the predictions.

In [None]:
# Fit one linear regression per split series, using only a time dummy as feature.
train_predictions, test_predictions = [], []
for train_part, test_part in zip(splitted_train, splitted_test):
    n_train, n_test = len(train_part), len(test_part)

    # Time dummy: consecutive integers, with the test steps continuing after the train steps
    X = pd.DataFrame({'time': np.arange(n_train)}, index=train_part.index)
    test_X = pd.DataFrame({'time': np.arange(n_train, n_train + n_test)}, index=test_part.index)
    y = train_part['num_sold']

    model = LinearRegression().fit(X, y)

    # Keep the original row indices so the predictions can be merged back later
    train_predictions.append(pd.Series(model.predict(X), index=X.index))
    test_predictions.append(pd.Series(model.predict(test_X), index=test_X.index))

Take a look at the predictions and use test predictions to create submission.

In [None]:
# Plot the actual sales of the first split series together with its fitted values
ax = splitted_train[0].loc[:, 'num_sold'].plot(**plot_params, alpha=0.5)
ax = train_predictions[0].plot(ax=ax, linewidth=5)

# Set ticks, labels and titles,
# Since splitted data has different indices, apply it to ticks
# NOTE(review): 18 is presumably the number of country/store/product series,
# i.e. the spacing between consecutive rows of one series in the original
# index — confirm against len(splitted_train).
ax.set_xticks(ticks * 18)
ax.set_xticklabels(labels)
ax.set_title('Finland - KaggleMart - Kaggle Mug')
ax.set(xlabel='Date', ylabel='Number of sales');

In [None]:
# Merge predictions
predictions = merge_predictions(test_predictions)
# Create submission
output = create_submission('submission_time-step.csv', predictions)
output

# [Trend feature](https://www.kaggle.com/ryanholbrook/trend)

From the second lesson create the trend feature.  

Make a moving average plot to estimate and identify the trend.

In [None]:
trend = average_sales.rolling(
    window=240,
    center=True,
    min_periods=120,
).mean()

ax = average_sales.plot(**plot_params, alpha=0.5)
ax = trend.plot(ax=ax, linewidth=5)

# Set titles
ax.set_title('Average sales')
ax.set(xlabel='Date', ylabel='Number of sales');

Create the feature and get the predictions.

In [None]:
# Fit one linear regression per split series on polynomial trend features.
train_predictions, test_predictions = [], []
for train_part, test_part in zip(splitted_train, splitted_test):
    y = train_part['num_sold']

    # Order-3 deterministic trend built on the index of the training target
    dp = DeterministicProcess(index=y.index, order=3)
    X = dp.in_sample()
    test_X = dp.out_of_sample(steps=len(test_part), forecast_index=test_part.index)

    model = LinearRegression().fit(X, y)

    # Keep the feature indices so the predictions can be merged back later
    train_predictions.append(pd.Series(model.predict(X), index=X.index))
    test_predictions.append(pd.Series(model.predict(test_X), index=test_X.index))

Take a look at the predictions and use test predictions to create submission.

In [None]:
ax = splitted_train[0].loc[:, 'num_sold'].plot(**plot_params, alpha=0.5)
ax = train_predictions[0].plot(ax=ax, linewidth=5)

# Set ticks, labels and titles,
# Since splitted data has different indices, apply it to ticks
ax.set_xticks(ticks * 18)
ax.set_xticklabels(labels)
ax.set_title('Finland - KaggleMart - Kaggle Mug')
ax.set(xlabel='Date', ylabel='Number of sales');

In [None]:
# Merge predictions
predictions = merge_predictions(test_predictions)
# Create submission
output = create_submission('submission_trend.csv', predictions)
output

# [Seasonality](https://www.kaggle.com/ryanholbrook/seasonality)

Now try to discover the trend and the seasonal patterns according to the third lesson.  

Let's examine the periodogram.

In [None]:
plot_periodogram(average_sales);

The periodogram suggests strong annual and weekly seasonality.

Use DeterministicProcess and CalendarFourier to create:
  * indicators for weekly seasons and
  * Fourier features of order 4 for yearly seasons.

In [None]:
y = average_sales.copy()
fourier = CalendarFourier(freq='Y', order=4)
dp = DeterministicProcess(
    index=y.index,
    constant=True,
    order=1,
    seasonal=True,
    additional_terms=[fourier],
    drop=True,
)
X = dp.in_sample()

Let's fit and check the seasonal model.

In [None]:
model = LinearRegression()
model.fit(X, y)
y_pred = pd.Series(model.predict(X), index=X.index)
ax = y.plot(**plot_params, alpha=0.5, title="Average Sales", ylabel="sales")
ax = y_pred.plot(ax=ax, label="Seasonal")
ax.legend();

Check the periodogram of the deseasonalized series.

In [None]:
y_deseason = y - y_pred
fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True, sharey=True, figsize=(10, 7))
ax1 = plot_periodogram(y, ax=ax1)
ax1.set_title("Sales Frequency Components")
ax2 = plot_periodogram(y_deseason, ax=ax2);
ax2.set_title("Deseasonalized");

Now create the features and get the predictions.

In [None]:
splitted_train_df = split_data(train_df)
splitted_test_df = split_data(test_df)

In [None]:
# Fit one seasonal linear model per split series.
train_predictions, test_predictions = [], []
for series_train, series_test in zip(splitted_train_df, splitted_test_df):
    # Index by date so the calendar-based terms can be generated
    y = series_train.set_index('date')['num_sold']
    forecast_dates = series_test.set_index('date').index

    # Constant + linear trend + seasonal indicators + yearly Fourier terms of order 4
    dp = DeterministicProcess(
        index=y.index,
        constant=True,
        order=1,
        seasonal=True,
        additional_terms=[CalendarFourier(freq='Y', order=4)],
        drop=True,
    )
    X = dp.in_sample()
    test_X = dp.out_of_sample(steps=len(forecast_dates), forecast_index=forecast_dates)

    model = LinearRegression().fit(X, y)

    # Predictions are indexed by date here, not by the original row index
    train_predictions.append(pd.Series(model.predict(X), index=X.index))
    test_predictions.append(pd.Series(model.predict(test_X), index=test_X.index))

Take a look at the predictions and use test predictions to create submission.

In [None]:
ax = splitted_train_df[0].loc[:, 'num_sold'].plot(**plot_params, alpha=0.5)
ax = pd.Series(data=train_predictions[0].values, index=splitted_train_df[0].index).plot(ax=ax)

# Set ticks, labels and titles,
# Since splitted data has different indices, apply it to ticks
ax.set_xticks(ticks * 18)
ax.set_xticklabels(labels)
ax.set_title('Finland - KaggleMart - Kaggle Mug')
ax.set(xlabel='Date', ylabel='Number of sales');

In [None]:
# Interleave the per-series forecasts into one flat array in which element
# `n_series * j + i` is the j-th forecast of series i. Stacking the series as
# columns and flattening row by row produces exactly that layout, without
# integer-indexing a date-indexed Series (a positional fallback that pandas
# has deprecated).
predictions = np.column_stack(
    [np.asarray(series_pred, dtype=float) for series_pred in test_predictions]
).ravel()
output = create_submission('submission_seasonality.csv', predictions)
output

## Create features for holidays

In the exercise of the third lesson, there is also a part about creating the holiday features.  

So, I have used [this notebook's](https://www.kaggle.com/ambrosm/tpsjan22-03-linear-model) feature engineering about holidays to reach the same goal. Credits to https://www.kaggle.com/ambrosm.

In [None]:
# Credits to https://www.kaggle.com/ambrosm/tpsjan22-03-linear-model, https://www.kaggle.com/ambrosm
# Feature engineering for holidays
def engineer_holidays(df):
    """Return `df` with 0/1 holiday indicator columns appended.

    `df` must provide a datetime `date` column and a `country` column. The
    added columns flag fixed calendar days (May, June, July, December), day
    offsets around the last Wednesday of June and the first Sunday of
    November, and day offsets relative to Easter; several of them apply to
    specific countries only.
    """
    dates = df.date
    month, day, year = dates.dt.month, dates.dt.day, dates.dt.year
    in_norway = df.country == 'Norway'
    not_norway = df.country != 'Norway'

    def fixed_days(prefix, month_no, days, where=True):
        # One indicator per calendar day of `month_no`, optionally restricted by `where`
        return pd.DataFrame({f"{prefix}{d}": (month == month_no) & (day == d) & where
                             for d in days}) * 1

    def day_offsets(prefix, anchor, offsets, where=True):
        # One indicator per day offset from the per-row `anchor` date
        gap = dates - anchor
        return pd.DataFrame({f"{prefix}{d}": (gap == np.timedelta64(d, "D")) & where
                             for d in offsets}) * 1

    # Last Wednesday of June
    last_wed_june = year.map({2015: pd.Timestamp(('2015-06-24')),
                              2016: pd.Timestamp(('2016-06-29')),
                              2017: pd.Timestamp(('2017-06-28')),
                              2018: pd.Timestamp(('2018-06-27')),
                              2019: pd.Timestamp(('2019-06-26'))})

    # First Sunday of November
    first_sun_nov = year.map({2015: pd.Timestamp(('2015-11-1')),
                              2016: pd.Timestamp(('2016-11-6')),
                              2017: pd.Timestamp(('2017-11-5')),
                              2018: pd.Timestamp(('2018-11-4')),
                              2019: pd.Timestamp(('2019-11-3'))})

    # Easter of each row's year
    easter_date = dates.apply(lambda date: pd.Timestamp(easter.easter(date.year)))

    indicator_blocks = [
        # May
        fixed_days("may", 5, range(1, 10)),
        fixed_days("may", 5, range(19, 26), in_norway),
        # June and July
        fixed_days("june", 6, range(8, 14), df.country == 'Sweden'),
        fixed_days("june", 6, range(22, 31), in_norway),
        fixed_days("july", 7, range(1, 3), in_norway),
        # Days around the last Wednesday of June and the first Sunday of November
        day_offsets("wed_june", last_wed_june, range(-4, 6), not_norway),
        day_offsets("sun_nov", first_sun_nov, range(0, 9), not_norway),
        # First half of December (Independence Day of Finland, 6th of December)
        fixed_days("dec", 12, range(6, 14), df.country == 'Finland'),
        # Easter
        day_offsets("easter", easter_date,
                    [*range(-2, 11), *range(40, 48), *range(50, 59)]),
    ]

    return pd.concat([df, *indicator_blocks], axis=1)

Now create the features and get the predictions.

In [None]:
# Fit one seasonal linear model per split series, with holiday indicators added.
train_predictions, test_predictions = [], []
for series_train, series_test in zip(splitted_train_df, splitted_test_df):
    train_by_date = series_train.set_index('date')
    test_by_date = series_test.set_index('date')

    y = train_by_date['num_sold']

    # Constant + linear trend + seasonal indicators + yearly Fourier terms of order 4
    dp = DeterministicProcess(
        index=y.index,
        constant=True,
        order=1,
        seasonal=True,
        additional_terms=[CalendarFourier(freq='Y', order=4)],
        drop=True,
    )

    # `engineer_holidays` needs `date` and `country` columns; add them for the
    # call and drop them again afterwards.
    X = (
        dp.in_sample()
        .assign(date=lambda frame: frame.index, country=train_by_date['country'])
        .pipe(engineer_holidays)
        .drop(columns=['date', 'country'])
    )
    test_X = (
        dp.out_of_sample(steps=len(test_by_date), forecast_index=test_by_date.index)
        .assign(date=lambda frame: frame.index, country=test_by_date['country'])
        .pipe(engineer_holidays)
        .drop(columns=['date', 'country'])
    )

    model = LinearRegression().fit(X, y)

    # Predictions are indexed by date here, not by the original row index
    train_predictions.append(pd.Series(model.predict(X), index=X.index))
    test_predictions.append(pd.Series(model.predict(test_X), index=test_X.index))

Take a look at the predictions and use test predictions to create submission.

In [None]:
ax = splitted_train_df[0].loc[:, 'num_sold'].plot(**plot_params, alpha=0.5)
ax = pd.Series(data=train_predictions[0].values, index=splitted_train_df[0].index).plot(ax=ax)

# Set ticks, labels and titles,
# Since splitted data has different indices, apply it to ticks
ax.set_xticks(ticks * 18)
ax.set_xticklabels(labels)
ax.set_title('Finland - KaggleMart - Kaggle Mug')
ax.set(xlabel='Date', ylabel='Number of sales');

In [None]:
# Interleave the per-series forecasts into one flat array in which element
# `n_series * j + i` is the j-th forecast of series i. Stacking the series as
# columns and flattening row by row produces exactly that layout, without
# integer-indexing a date-indexed Series (a positional fallback that pandas
# has deprecated).
predictions = np.column_stack(
    [np.asarray(series_pred, dtype=float) for series_pred in test_predictions]
).ravel()
output = create_submission('submission_seasonality_holidays.csv', predictions)
output

# [Time Series as Features](https://www.kaggle.com/ryanholbrook/time-series-as-features)

According to the fourth lesson, trend and seasonality will both create serial dependence that shows up in correlograms and lag plots.  
To isolate any purely cyclic behavior, we'll start by deseasonalizing the series.

In [None]:
df_train = splitted_train_df[0]
df_train = df_train.set_index('date')
y = df_train.loc[:, 'num_sold']

fourier = CalendarFourier(freq='Y', order=4)
dp = DeterministicProcess(
    constant=True,
    index=y.index,
    order=1,
    seasonal=True,
    drop=True,
    additional_terms=[fourier],
)
X_time = dp.in_sample()

model = LinearRegression(fit_intercept=False).fit(X_time, y)
deseason = y - model.predict(X_time)

In [None]:
ax = deseason.plot()
# Set ticks, labels and titles,
# Since splitted data has different indices, apply it to ticks
ax.set_xticklabels(labels)
ax.set_title('Finland - KaggleMart - Kaggle Mug (deseasonalized)')
ax.set(xlabel='Date', ylabel='Number of sales');

Create time series features and get the predictions.

In [None]:
# Create lag features
X_lags = make_lags(deseason, lags=1)

X = X_lags.dropna()
y, X = y.align(X, join='inner')

model = LinearRegression().fit(X, y)
y_pred = pd.Series(model.predict(X), index=X.index)

In [None]:
ax = y.plot(**plot_params, alpha=0.5)
ax = y_pred.plot(ax=ax)

# Set ticks, labels and titles,
# Since splitted data has different indices, apply it to ticks
ax.set_xticks(labels.to_list())
ax.set_xticklabels(labels)
ax.set_title('Finland - KaggleMart - Kaggle Mug')
ax.set(xlabel='Date', ylabel='Number of sales');

Create and add statistical features and get the predictions.

In [None]:
# Rolling statistics of the target, shifted by one day so that each row only
# uses past values. The columns are added to the lag-feature frame `X` built
# in the previous cell.
y_lag = df_train.loc[:, 'num_sold'].shift(1)

# 7-day mean of lagged target
X['mean_7'] = y_lag.rolling(7).mean()
# 14-day median of lagged target
X['median_14'] = y_lag.rolling(14).median()
# 7-day rolling standard deviation of lagged target
X['std_7'] = y_lag.rolling(7).std()

# Drop the leading rows whose rolling windows are incomplete and realign the target
X = X.dropna()
y, X = y.align(X, join='inner')

model = LinearRegression().fit(X, y)
y_pred = pd.Series(model.predict(X), index=X.index)

In [None]:
ax = y.plot(**plot_params, alpha=0.5)
ax = y_pred.plot(ax=ax)

# Set ticks, labels and titles,
# Since splitted data has different indices, apply it to ticks
ax.set_xticks(labels.to_list())
ax.set_xticklabels(labels)
ax.set_title('Finland - KaggleMart - Kaggle Mug')
ax.set(xlabel='Date', ylabel='Number of sales');

Now create the features and get the predictions.

In [None]:
# Fit one model per split series on lag and rolling-statistic features.
# NOTE(review): the training features are built from the deseasonalized
# residuals (`deseason`) and the actual lagged target, whereas the test
# features are built from the seasonal model's forecast (`test_X_y`). The two
# feature sets may therefore not be comparable — confirm this is intended.
train_predictions = []
test_predictions = []
for i in range(len(splitted_train_df)):
    df_train = splitted_train_df[i].copy()
    df_train = df_train.set_index('date')

    df_test = splitted_test_df[i].copy()
    df_test = df_test.set_index('date')

    # Target
    y = df_train.loc[:, 'num_sold']

    # Start by deseasonalizing the series 
    fourier = CalendarFourier(freq='Y', order=4)
    dp = DeterministicProcess(
        constant=True,
        index=y.index,
        order=1,
        seasonal=True,
        drop=True,
        additional_terms=[fourier],
    )
    X_time = dp.in_sample()
    test_X_time = dp.out_of_sample(steps=len(df_test), forecast_index=df_test.index)

    # The constant comes from `dp`, so the regression itself has no intercept
    model = LinearRegression(fit_intercept=False).fit(X_time, y)
    deseason = y - model.predict(X_time)
    test_X_y = pd.Series(model.predict(test_X_time), index=df_test.index)
    
    # Make features from `deseason`
    X = make_lags(deseason, lags=1)
    X.fillna(X.mean(), inplace=True)
    
    test_X = make_lags(test_X_y, lags=1)
    test_X.fillna(test_X_y.mean(), inplace=True)

    # Create lagged targets
    y_lag = df_train.loc[:, 'num_sold'].shift(1)
    test_X_y_lag = test_X_y.shift(1)

    # Each `fillna` below fills every NaN still present in the frame, so it
    # must run right after its column is added; the statement order matters.

    # 7-day mean of lagged target
    X['mean_7'] = y_lag.rolling(7).mean()
    X.fillna(y_lag.mean(), inplace=True)
    
    test_X['mean_7'] = test_X_y_lag.rolling(7).mean()
    test_X.fillna(test_X_y_lag.mean(), inplace=True)
    
    # 14-day median of lagged target
    X['median_14'] = y_lag.rolling(14).median()
    X.fillna(y_lag.median(), inplace=True)
    
    test_X['median_14'] = test_X_y_lag.rolling(14).median()
    test_X.fillna(test_X_y_lag.median(), inplace=True)
    
    # 7-day rolling standard deviation of lagged target
    X['std_7'] = y_lag.rolling(7).std()
    X.fillna(y_lag.std(), inplace=True)
    
    test_X['std_7'] = test_X_y_lag.rolling(7).std()
    test_X.fillna(test_X_y_lag.std(), inplace=True)
    
    # Train the model
    model = LinearRegression()
    model.fit(X, y)

    # Make predictions
    train_predictions.append(pd.Series(model.predict(X), index=X.index))
    test_predictions.append(pd.Series(model.predict(test_X), index=test_X.index))

Take a look at the predictions and use test predictions to create submission.

In [None]:
ax = splitted_train_df[0].loc[:, 'num_sold'].plot(**plot_params, alpha=0.5)
ax = pd.Series(data=train_predictions[0].values, index=splitted_train_df[0].index).plot(ax=ax)

# Set ticks, labels and titles,
# Since splitted data has different indices, apply it to ticks
ax.set_xticks(ticks * 18)
ax.set_xticklabels(labels)
ax.set_title('Finland - KaggleMart - Kaggle Mug')
ax.set(xlabel='Date', ylabel='Number of sales');

In [None]:
# Interleave the per-series forecasts into one flat array in which element
# `n_series * j + i` is the j-th forecast of series i. Stacking the series as
# columns and flattening row by row produces exactly that layout, without
# integer-indexing a date-indexed Series (a positional fallback that pandas
# has deprecated).
predictions = np.column_stack(
    [np.asarray(series_pred, dtype=float) for series_pred in test_predictions]
).ravel()
output = create_submission('submission_time-series.csv', predictions)
output

# Feature engineering from another notebook

Credits to https://www.kaggle.com/ambrosm/tpsjan22-03-linear-model, https://www.kaggle.com/ambrosm

In [None]:
gdp_df = pd.read_csv('../input/gdp-20152019-finland-norway-and-sweden/GDP_data_2015_to_2019_Finland_Norway_Sweden.csv')
gdp_df.set_index('year', inplace=True)

In [None]:
# Feature engineering for the end of the year
def engineer_end_of_year(df):
    """Return `df` with 0/1 indicators for late-December and early-January days.

    `df` must provide a datetime `date` column and a `country` column. Added
    columns, in order: `dec24`..`dec31` (all countries), `n-dec24`..`n-dec31`
    (Norway), `f-jan1`..`f-jan13` (Finland), `jan1`..`jan9` (Norway) and
    `s-jan1`..`s-jan14` (Sweden).
    """
    month, day = df.date.dt.month, df.date.dt.day
    in_december, in_january = month == 12, month == 1

    def day_flags(prefix, in_month, days, country=None):
        # One indicator per day of the month, optionally limited to one country
        scope = in_month if country is None else in_month & (df.country == country)
        return pd.DataFrame({f"{prefix}{d}": (day == d) & scope for d in days}) * 1

    indicator_blocks = [
        day_flags("dec", in_december, range(24, 32)),
        day_flags("n-dec", in_december, range(24, 32), 'Norway'),
        day_flags("f-jan", in_january, range(1, 14), 'Finland'),
        day_flags("jan", in_january, range(1, 10), 'Norway'),
        day_flags("s-jan", in_january, range(1, 15), 'Sweden'),
    ]
    return pd.concat([df, *indicator_blocks], axis=1)

In [None]:
# Feature engineering
def engineer(df):
    """Build the model feature matrix from the raw columns of `df`.

    Adds the log GDP of each row's country and year (looked up in the
    notebook-level `gdp_df`), Friday and weekend indicators, yearly sine and
    cosine terms, and the end-of-year and holiday indicators. The raw `date`,
    `country`, `store` and `product` columns are dropped from the result.
    """
    # GDP of the row's country in the year of the row's date
    gdp = df.apply(lambda row: gdp_df.loc[row.date.year, 'GDP_' + row.country], axis=1)
    weekday = df.date.dt.weekday
    calendar_features = pd.DataFrame({
        'gdp': np.log(gdp),
        'wd4': (weekday == 4) * 1,   # Friday
        'wd56': (weekday >= 5) * 1,  # Saturday and Sunday
    })
    df = pd.concat([df, calendar_features], axis=1)

    # Seasonal variations (Fourier series)
    # The three products have different seasonal patterns
    year_angle = df.date.dt.dayofyear / 365 * 2 * math.pi
    for k in (1, 2):
        df[f'sin{k}'] = np.sin(year_angle * k)
        df[f'cos{k}'] = np.cos(year_angle * k)

    # Apply the other engineering steps, then drop the raw columns
    df = engineer_holidays(engineer_end_of_year(df))
    return df.drop(['date', 'country', 'store', 'product'], axis=1)

Now create the features and get the predictions.

In [None]:
# Fit one CatBoostRegressor per split series on the engineered features.
train_predictions, test_predictions = [], []
for series_train, series_test in zip(splitted_train_df, splitted_test_df):
    y = series_train['num_sold']

    # Engineered features for the training rows (target removed first) and the test rows
    X = engineer(series_train.drop(columns='num_sold'))
    test_X = engineer(series_test)

    # CatBoostRegressor is used here instead of linear regression
    model = CatBoostRegressor(silent=True)
    model.fit(X, y)

    # Keep the original row indices so the predictions can be merged back later
    train_predictions.append(pd.Series(model.predict(X), index=X.index))
    test_predictions.append(pd.Series(model.predict(test_X), index=test_X.index))

Take a look at the predictions and use test predictions to create submission.

In [None]:
ax = splitted_train[0].loc[:, 'num_sold'].plot(**plot_params, alpha=0.5)
ax = train_predictions[0].plot(ax=ax)

# Set ticks, labels and titles,
# Since splitted data has different indices, apply it to ticks
ax.set_xticks(ticks * 18)
ax.set_xticklabels(labels)
ax.set_title('Finland - KaggleMart - Kaggle Mug')
ax.set(xlabel='Date', ylabel='Number of sales');

In [None]:
# Merge predictions
predictions = merge_predictions(test_predictions)
# Create submission
output = create_submission('submission_fe_other.csv', predictions)
output

# [Hybrid Models](https://www.kaggle.com/ryanholbrook/hybrid-models)

As the fifth lesson suggests, create a boosted hybrid by implementing a new Python class.  
Also, add the fit and predict methods to give it a scikit-learn like interface.

In [None]:
class BoostedHybrid:
    """Two-stage model: `model_2` is fitted on the residuals of `model_1`.

    Both models must expose `fit(X, y)` and `predict(X)`. The final prediction
    is the sum of the two models' predictions.
    """

    def __init__(self, model_1, model_2):
        self.model_1 = model_1
        self.model_2 = model_2

    def fit(self, X_1, X_2, y):
        """Fit `model_1` on `(X_1, y)` and `model_2` on `X_2` and the residuals.

        `X_1` must have an `index`, which is used to align the first-stage
        fitted values with `y`. Returns `self`, so calls can be chained in the
        scikit-learn style.
        """
        # Fit self.model_1
        self.model_1.fit(X_1, y)

        # Fitted values of self.model_1, aligned with y through X_1's index
        y_fit = pd.Series(self.model_1.predict(X_1), index=X_1.index,
                          dtype=np.float32, name='num_sold')

        # Fit self.model_2 on what self.model_1 could not explain
        y_resid = y - y_fit
        self.model_2.fit(X_2, y_resid)

        return self

    def predict(self, X_1, X_2):
        """Return the sum of the `model_1` and `model_2` predictions."""
        # A new array is built instead of adding in place, so the array
        # returned by model_1 is never modified.
        return self.model_1.predict(X_1) + self.model_2.predict(X_2)

Now create the features and get the predictions.

In [None]:
# Fit one LinearRegression + CatBoostRegressor hybrid per split series.
train_predictions, test_predictions = [], []
for series_train, series_test in zip(splitted_train_df, splitted_test_df):
    y = series_train['num_sold']

    # Engineered features for the training rows (target removed first) and the test rows
    X = engineer(series_train.drop(columns='num_sold'))
    test_X = engineer(series_test)

    # The linear model only sees GDP; every other feature goes to the booster
    linear_columns = ['gdp']
    X_1, test_X_1 = X[linear_columns], test_X[linear_columns]
    X_2, test_X_2 = X.drop(columns=linear_columns), test_X.drop(columns=linear_columns)

    model = BoostedHybrid(
        model_1=LinearRegression(),
        model_2=CatBoostRegressor(silent=True),
    )
    model.fit(X_1, X_2, y)

    # Keep the original row indices so the predictions can be merged back later
    train_predictions.append(pd.Series(model.predict(X_1, X_2), index=X_1.index))
    test_predictions.append(pd.Series(model.predict(test_X_1, test_X_2), index=test_X_1.index))

Take a look at the predictions and use test predictions to create submission.

In [None]:
ax = splitted_train[0].loc[:, 'num_sold'].plot(**plot_params, alpha=0.5)
ax = train_predictions[0].plot(ax=ax)

# Set ticks, labels and titles,
# Since splitted data has different indices, apply it to ticks
ax.set_xticks(ticks * 18)
ax.set_xticklabels(labels)
ax.set_title('Finland - KaggleMart - Kaggle Mug')
ax.set(xlabel='Date', ylabel='Number of sales');

In [None]:
# Merge predictions
predictions = merge_predictions(test_predictions)
# Create submission
output = create_submission('submission_hybrid.csv', predictions)
output