# **Acknowledgements:**
* Kaggle's [time series course](https://www.kaggle.com/learn/time-series).
* [TPS2201_Hybrid_Time_Series notebook](https://www.kaggle.com/teckmengwong/tps2201-hybrid-time-series) by [Teck Meng Wong](https://www.kaggle.com/teckmengwong).
* [Many great notebooks](https://www.kaggle.com/ambrosm/code) by [AmbrosM](https://www.kaggle.com/ambrosm).

# Libraries

In [None]:
import gc
import os
import math
import random
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

import holidays
from datetime import date
import dateutil.easter as easter

from collections import defaultdict
le = defaultdict(LabelEncoder)

import warnings

# Silence every warning for the whole notebook run.
warnings.simplefilter('ignore')
# `np.warnings` was only an accidental re-export of the stdlib `warnings`
# module and was removed in NumPy 1.24, so call the stdlib function directly.
warnings.filterwarnings('ignore')

In [None]:
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator, FormatStrFormatter, PercentFormatter
import seaborn as sns
from learntools.time_series.style import *  # plot style settings

# Set Matplotlib defaults
# NOTE(review): newer Matplotlib releases renamed the bundled seaborn styles
# (e.g. "seaborn-v0_8-whitegrid") — confirm the style name against the
# installed version.
plt.style.use("seaborn-whitegrid")
# Default figure size and automatic tight layout for every figure.
plt.rc("figure", autolayout=True, figsize=(12, 8))
# Bold, slightly larger axis labels and titles.
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=16,
    titlepad=10,
)
# Keyword arguments for a light-grey line with dark point markers and no legend.
plot_params = dict(
    color="0.75",
    style=".-",
    markeredgecolor="0.25",
    markerfacecolor="0.25",
    legend=False,
)
# IPython magic: render inline figures at high ("retina") resolution.
%config InlineBackend.figure_format = 'retina'

# Fine tuning

In [None]:
# -----------------------------------------------------------------
# Notebook-wide configuration.
SEED = 42            # seed passed to seed_everything()
N_ESTIMATORS = 1000  # n_estimators for the boosted models; also used as max_iter for HuberRegressor
VERBOSE = 0          # verbosity level forwarded to the estimators

ID = "row_id"  # index column of test.csv and of the pseudo-label files
INPUT = "../input/tabular-playground-series-jan-2022"  # competition data directory
PSEUDO_DIR = "../input/tps-jan-2022-pseudo-labels/pseudo_labels_1.csv"   # first pseudo-label file
PSEUDO_DIR2 = "../input/tps-jan-2022-pseudo-labels/pseudo_labels_2.csv"  # second pseudo-label file

# Column names for the date column and the calendar features derived from it.
DATE = "date"
YEAR = "year"
MONTH = "month"
WEEK = "week"
DAY = "day"
DAYOFYEAR = "dayofyear"
DAYOFMONTH = "dayofMonth"
DAYOFWEEK = "dayofweek"
WEEKDAY = "weekday"

In [None]:
def seed_everything(seed=42):
    """Seed the global random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator, and
    stores the seed in the ``PYTHONHASHSEED`` environment variable.

    Parameters
    ----------
    seed : int, default 42
        Seed value applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)

# Seed the global generators once, before any data is loaded or any model is built.
seed_everything(SEED)

# Loss function SMAPE

In [None]:
# https://www.kaggle.com/c/web-traffic-time-series-forecasting/discussion/36414
def smape_loss(y_true, y_pred):
    """Return the symmetric mean absolute percentage error (SMAPE) in percent.

    Each element contributes ``200 * |y_true - y_pred| / (|y_true| + |y_pred|)``;
    elements where both values are zero contribute ``0``. The result is the
    mean over all elements.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values, broadcastable against ``y_true``.
        Inputs are compared by position (pandas indexes are ignored).

    Returns
    -------
    float
        Non-negative SMAPE; the best value is 0.0.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    numerator = np.abs(y_true - y_pred)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 200.0

    # Divide only where the denominator is non-zero; the remaining entries
    # keep their initial 0.0, so no divide-by-zero warning or NaN is produced.
    diff = np.zeros_like(denominator)
    np.divide(numerator, denominator, out=diff, where=denominator != 0)
    return np.mean(diff)


In [None]:
def plot_periodogram(ts, detrend='linear', ax=None):
    """Plot the periodogram of a daily series with calendar-frequency ticks.

    Parameters
    ----------
    ts : array-like
        Series values, assumed to be sampled once per day.
    detrend : str, default 'linear'
        Detrending mode forwarded to ``scipy.signal.periodogram``.
    ax : matplotlib Axes, optional
        Axes to draw on; a new figure and axes are created when omitted.

    Returns
    -------
    matplotlib Axes
        The axes holding the plot.
    """
    from scipy.signal import periodogram

    # Samples per year for daily data. This is the value that
    # pd.Timedelta("1Y") / pd.Timedelta("1D") used to return; pandas 2.x no
    # longer accepts the ambiguous "Y" unit, so the constant is spelled out.
    fs = 365.2425
    frequencies, spectrum = periodogram(
        ts,
        fs=fs,
        detrend=detrend,
        window="boxcar",
        scaling='spectrum',
    )
    if ax is None:
        _, ax = plt.subplots()
    ax.step(frequencies, spectrum, color="purple")
    ax.set_xscale("log")
    # Ticks are expressed in cycles per year.
    ax.set_xticks([1, 2, 4, 6, 12, 26, 52, 104])
    ax.set_xticklabels(
        [
            "Annual (1)",
            "Semiannual (2)",
            "Quarterly (4)",
            "Bimonthly (6)",
            "Monthly (12)",
            "Biweekly (26)",
            "Weekly (52)",
            "Semiweekly (104)",
        ],
        rotation=30,
    )
    ax.ticklabel_format(axis="y", style="sci", scilimits=(0, 0))
    ax.set_ylabel("Variance")
    ax.set_title("Periodogram")
    return ax

# Data/Feature Engineering

In [None]:
def fourier_features(index, freq, order):
    """Build sine/cosine Fourier features for a regularly spaced index.

    Parameters
    ----------
    index : pandas.Index or sequence
        Index of the series; only its length and labels are used.
    freq : float
        Period of the seasonality, in number of observations.
    order : int
        Number of sin/cos pairs to generate.

    Returns
    -------
    pandas.DataFrame
        ``2 * order`` columns named ``sin_{freq}_{i}`` / ``cos_{freq}_{i}``,
        indexed by ``index``.
    """
    steps = np.arange(len(index), dtype=np.float32)
    base_angle = 2 * np.pi * (1 / freq) * steps

    columns = {}
    for harmonic in range(1, order + 1):
        columns[f"sin_{freq}_{harmonic}"] = np.sin(harmonic * base_angle)
        columns[f"cos_{freq}_{harmonic}"] = np.cos(harmonic * base_angle)
    return pd.DataFrame(columns, index=index)

# Compute Fourier features to the 4th order (8 new features) for a
# series y with daily observations and annual seasonality:
#
# fourier_features(y, freq=365.25, order=4)

def get_basic_ts_features(df):
    """Add GDP, one-hot, calendar, seasonal and holiday-window features.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a datetime64 ``date`` column plus ``country``, ``store``
        and ``product`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the input columns followed by the engineered ones.
        Columns created before the first ``pd.concat`` are also written into
        the passed-in ``df``, so the argument itself is modified as well.
    """
    # Yearly GDP table indexed by year; looked up through `GDP_<country>` columns.
    gdp_df = pd.read_csv('../input/gdp-20152019-finland-norway-and-sweden/GDP_data_2015_to_2019_Finland_Norway_Sweden.csv')
    gdp_df.set_index('year', inplace=True)

    def get_gdp(row):
        # GDP of the row's country in the row's year.
        country = 'GDP_' + row.country
        return gdp_df.loc[row.date.year, country]

    df['gdp'] = np.log1p(df.apply(get_gdp, axis=1))

    # One-hot style boolean indicators for each categorical level.
    for country in ['Finland', 'Norway', 'Sweden']:
        df[country] = df.country == country
    for store in ['KaggleMart', 'KaggleRama']:
        df[store] = df['store'] == store
    for product in ['Kaggle Mug', 'Kaggle Hat', 'Kaggle Sticker']:
        df[product] = df['product'] == product

    # Accessors reused by every feature below.
    dates = df[DATE]
    year_of_row = dates.dt.year
    month = dates.dt.month
    day = dates.dt.day
    weekday = dates.dt.weekday
    country_col = df['country']

    # Plain calendar features.
    df[YEAR] = year_of_row
    df[MONTH] = month
    # `Series.dt.week` was removed in pandas 2.0; isocalendar().week is its
    # replacement. Cast back to int64, the dtype `.dt.week` used to return.
    df[WEEK] = dates.dt.isocalendar().week.astype('int64')
    df[DAY] = day
    df[WEEKDAY] = weekday

    df['wd4'] = weekday == 4    # Friday
    df['wd56'] = weekday >= 5   # Saturday / Sunday

    # Sinusoids with k cycles per 365-day year, plus their interactions with
    # every country / store / product indicator.
    dayofyear = dates.dt.dayofyear
    interactions = [
        ('Finland', 'Finland'),
        ('Norway', 'Norway'),
        ('Sweden', 'Sweden'),
        ('storeMart', 'KaggleMart'),
        ('storeRama', 'KaggleRama'),
        ('mug', 'Kaggle Mug'),
        ('hat', 'Kaggle Hat'),
        ('sticker', 'Kaggle Sticker'),
    ]
    for k in [7, 14, 21, 28, 30, 31, 91]:
        df[f'sin{k}'] = np.sin(dayofyear / 365 * 2 * math.pi * k)
        df[f'cos{k}'] = np.cos(dayofyear / 365 * 2 * math.pi * k)
        for prefix, flag in interactions:
            df[f'{prefix}_sin{k}'] = df[f'sin{k}'] * df[flag]
            df[f'{prefix}_cos{k}'] = df[f'cos{k}'] * df[flag]

    # End of year
    # Dec
    for d in range(24, 32):
        df[f"dec{d}"] = (month == 12) & (day == d)
    for d in range(24, 32):
        df[f"n-dec{d}"] = (month == 12) & (day == d) & (country_col == 'Norway')
    # Jan
    for d in range(1, 14):
        df[f"f-jan{d}"] = (month == 1) & (day == d) & (country_col == 'Finland')
    for d in range(1, 10):
        df[f"n-jan{d}"] = (month == 1) & (day == d) & (country_col == 'Norway')
    for d in range(1, 15):
        df[f"s-jan{d}"] = (month == 1) & (day == d) & (country_col == 'Sweden')
    # May
    for d in range(1, 10):
        df[f"may{d}"] = (month == 5) & (day == d)
    for d in range(19, 26):
        df[f"may{d}"] = (month == 5) & (day == d) & (country_col == 'Norway')
    # June
    for d in range(8, 14):
        df[f"june{d}"] = (month == 6) & (day == d) & (country_col == 'Sweden')

    # The anchor-date maps below cover 2015-2019 only; rows from other years
    # map to NaT, so all of their window indicators are False.

    # Swedish Rock Concert (anchor = last day of the festival)
    # Jun 3, 2015 – Jun 6, 2015
    # Jun 8, 2016 – Jun 11, 2016
    # Jun 7, 2017 – Jun 10, 2017
    # Jun 6, 2018 – Jun 10, 2018
    # Jun 5, 2019 – Jun 8, 2019
    swed_rock_fest = year_of_row.map({2015: pd.Timestamp('2015-06-6'),
                                      2016: pd.Timestamp('2016-06-11'),
                                      2017: pd.Timestamp('2017-06-10'),
                                      2018: pd.Timestamp('2018-06-10'),
                                      2019: pd.Timestamp('2019-06-8')})
    df = pd.concat([df, pd.DataFrame({f"swed_rock_fest{d}":
                                      (dates - swed_rock_fest == np.timedelta64(d, "D")) & (country_col == 'Sweden')
                                      for d in range(-3, 3)})], axis=1)

    # Last Wednesday of June
    wed_june_date = year_of_row.map({2015: pd.Timestamp('2015-06-24'),
                                     2016: pd.Timestamp('2016-06-29'),
                                     2017: pd.Timestamp('2017-06-28'),
                                     2018: pd.Timestamp('2018-06-27'),
                                     2019: pd.Timestamp('2019-06-26')})
    for d in range(-4, 6):
        df[f"wed_june{d}"] = (dates - wed_june_date == np.timedelta64(d, "D")) & (country_col != 'Norway')

    # First Sunday of November
    sun_nov_date = year_of_row.map({2015: pd.Timestamp('2015-11-1'),
                                    2016: pd.Timestamp('2016-11-6'),
                                    2017: pd.Timestamp('2017-11-5'),
                                    2018: pd.Timestamp('2018-11-4'),
                                    2019: pd.Timestamp('2019-11-3')})
    df = pd.concat([df, pd.DataFrame({f"sun_nov{d}":
                                      (dates - sun_nov_date == np.timedelta64(d, "D")) & (country_col == 'Norway')
                                      for d in range(0, 9)})], axis=1)

    # First half of December (Independence Day of Finland, 6th of December)
    df = pd.concat([df, pd.DataFrame({f"dec{d}":
                                      (month == 12) & (day == d) & (country_col == 'Finland')
                                      for d in range(6, 14)})], axis=1)

    # Easter: indicators for day offsets relative to Easter Sunday of the row's year.
    easter_date = dates.apply(lambda stamp: pd.Timestamp(easter.easter(stamp.year)))
    easter_offsets = list(range(-2, 11)) + list(range(40, 48)) + list(range(50, 59))
    df = pd.concat([df, pd.DataFrame({f"easter{d}":
                                      (dates - easter_date == np.timedelta64(d, "D"))
                                      for d in easter_offsets})], axis=1)

    return df

In [None]:
def feature_engineer(df):
    """Run the full feature-engineering pipeline on ``df`` and return the result.

    Currently this is only the step implemented by ``get_basic_ts_features``.
    """
    return get_basic_ts_features(df)

In [None]:
# Print every (date, name) entry of Norway's 2019 holiday calendar.
norway_holidays_2019 = holidays.Norway(years=[2019], observed=True)
for ptr in norway_holidays_2019.items():
    print(ptr)

In [None]:
from pathlib import Path

def load_data():
    """Read the competition train and test CSV files from ``INPUT``.

    Returns
    -------
    df_train : pandas.DataFrame
        Training rows restricted to ``date``, ``country``, ``store``,
        ``product`` and ``num_sold``; the three categorical columns are read
        as ``category`` and the target as ``float32``.
    df_test : pandas.DataFrame
        Test rows indexed by ``ID``.
    column_y : str
        Name of the target column, taken as the first column that exists in
        the training frame but not in the test frame.
    """
    data_dir = Path(INPUT)

    # `infer_datetime_format` is deprecated in pandas 2.0 and not needed:
    # `parse_dates` already converts the date column.
    df_train = pd.read_csv(
        data_dir / "train.csv", parse_dates=[DATE],
        usecols=['date', 'country', 'store', 'product', 'num_sold'],
        dtype={
            'country': 'category',
            'store': 'category',
            'product': 'category',
            'num_sold': 'float32',
        },
    )

    df_test = pd.read_csv(data_dir / "test.csv", index_col=ID, parse_dates=[DATE])

    column_y = df_train.columns.difference(df_test.columns)[0]  # column_y target_col label_col

    # Guarantee datetime64 dates even if the parser left the column as strings.
    df_train[DATE] = pd.to_datetime(df_train[DATE])
    df_test[DATE] = pd.to_datetime(df_test[DATE])

    return df_train, df_test, column_y


In [None]:
def process_data(df_train, df_test):
    """Apply ``feature_engineer`` to the training frame, then to the test frame.

    Returns the two engineered frames as ``(train, test)``.
    """
    engineered_train, engineered_test = (
        feature_engineer(frame) for frame in (df_train, df_test)
    )
    return engineered_train, engineered_test

# Load Data #

And now we can call the data loader and get the processed data splits:

In [None]:
%%time
# Read the raw CSV files, then run feature engineering on both splits.
train_df, test_df, column_y = load_data()   
train_df, test_df = process_data(train_df, test_df)

# Data Pipeline

In [None]:
# Work on copies whose date column is converted from timestamps to daily
# periods; train_df / test_df keep their original timestamp dates.
train_data = train_df.assign(**{DATE: train_df[DATE].dt.to_period('D')})
test_data = test_df.assign(**{DATE: test_df[DATE].dt.to_period('D')})

## Pseudolabeling

In [None]:
# Load the first pseudo-label file, indexed by row_id.
df_pseudolabels = pd.read_csv(PSEUDO_DIR, index_col=ID)
# Attach the test dates (aligned on the row_id index) and save a dated copy.
df_pseudolabels[DATE] = pd.to_datetime(test_df[DATE])
df_pseudolabels.to_csv("pseudo_labels_v0.csv", index=True)

# Use the pseudo-labels as the target of the test rows, then append those
# rows to the training data so the models are also fitted on the test period.
test_data[column_y] = df_pseudolabels[column_y].astype(np.float32)
train_data = pd.concat([train_data, test_data], axis=0)
# NOTE(review): test_data holds Period dates while train_df still holds
# Timestamps, so this concat mixes both types in train_df's date column —
# confirm that is intended.
train_df = pd.concat([train_df, test_data], axis=0)

In [None]:
# Date-indexed, date-sorted views of the combined training data and of the test data.
X = train_data.set_index([DATE]).sort_index()
X_test = test_data.set_index([DATE]).sort_index()

In [None]:
# Re-index the combined training data by (date, country, store, product).
train_data = train_data.set_index(['date', 'country', 'store', 'product']).sort_index()

In [None]:
kaggle_sales_2015 = (
    train_data
    .groupby(['country', 'store', 'product', 'date'])
    .mean()
    .unstack(['country', 'store', 'product'])
    .loc['2015']
)

In [None]:
kaggle_sales_2016 = (
    train_data
    .groupby(['country', 'store', 'product', 'date'])
    .mean()
    .unstack(['country', 'store', 'product'])
    .loc['2016']
)

In [None]:
kaggle_sales_2017 = (
    train_data
    .groupby(['country', 'store', 'product', 'date'])
    .mean()
    .unstack(['country', 'store', 'product'])
    .loc['2017']
)

In [None]:
kaggle_sales_2018 = (
    train_data
    .groupby(['country', 'store', 'product', 'date'])
    .mean()
    .unstack(['country', 'store', 'product'])
    .loc['2018']
)

In [None]:
frames = [kaggle_sales_2015, kaggle_sales_2016, kaggle_sales_2017, kaggle_sales_2018]
kaggle_sales = pd.concat(frames)

In [None]:
# Display the combined 2015-2018 table.
kaggle_sales

In [None]:
# Run a garbage-collection pass to release intermediates that are no longer referenced.
gc.collect()

In [None]:
# Check NA
# Count missing values per column and print only the columns that have any.
missing_val = X.isnull().sum()
print(missing_val[missing_val > 0])

In [None]:
# Show up to five sampled rows for every distinct target value.
train_data.groupby(column_y).apply(lambda s: s.sample(min(len(s), 5)))

In [None]:
# Plot the num_sold columns of the 2015-2018 table on one large figure.
fig_dims = (50,30)
ax = kaggle_sales.num_sold.plot(title='Sales Trends', figsize=fig_dims)
_ = ax.set(ylabel="Numbers sold")

# What is Seasonality? #

We say that a time series exhibits **seasonality** whenever there is a regular, periodic change in the mean of the series. Seasonal changes generally follow the clock and calendar -- repetitions over a day, a week, or a year are common. Seasonality is often driven by the cycles of the natural world over days and years or by conventions of social behavior surrounding dates and times.
### Choosing Fourier features with the Periodogram

How many Fourier pairs should we actually include in our feature set? We can answer this question with the periodogram. The **periodogram** tells you the strength of the frequencies in a time series. Specifically, the value on the y-axis of the graph is `(a ** 2 + b ** 2) / 2`, where `a` and `b` are the coefficients of the sine and cosine at that frequency (as in the *Fourier Components* plot above).

<figure style="padding: 1em;">
<img src="https://i.imgur.com/PK6WEe3.png" width="600" alt="">
<figcaption style="text-align: center; font-style: italic"><center>Periodogram for the <em>Wiki Trigonometry</em> series.</center></figcaption>
</figure>

From left to right, the periodogram drops off after *Quarterly*, four times a year. That was why we chose four Fourier pairs to model the annual season. The *Weekly* frequency we ignore since it's better modeled with indicators.

### Computing Fourier features (optional)

Knowing how Fourier features are computed isn't essential to using them, but if seeing the details would clarify things, the hidden cell below illustrates how a set of Fourier features could be derived from the index of a time series. (We'll use a library function from `statsmodels` for our applications, however.)

Now let's look at the periodogram:

In [None]:
# Periodogram of the target column; the trailing semicolon suppresses the notebook's echo of the returned Axes.
plot_periodogram(X[column_y]);

The periodogram agrees with the seasonal plots above: a strong semiweekly season and a weaker annual season. The weekly season we'll model with indicators and the annual season with Fourier features. From right to left, the periodogram falls off between Bimonthly (6) and Monthly (12), so let's use 10 Fourier pairs.

We'll create our seasonal features using DeterministicProcess, the same utility we used in Lesson 2 to create trend features. To use two seasonal periods (weekly and annual), we'll need to instantiate one of them as an "additional term":

# Components and Residuals #

So that we can design effective hybrids, we need a better understanding of how time series are constructed. We've studied up to now three patterns of dependence: trend, seasons, and cycles. Many time series can be closely described by an additive model of just these three components plus some essentially unpredictable, entirely random *error*:

```
series = trend + seasons + cycles + error
```

Each of the terms in this model we would then call a **component** of the time series.

The **residuals** of a model are the difference between the target the model was trained on and the predictions the model makes -- the difference between the actual curve and the fitted curve, in other words. Plot the residuals against a feature, and you get the "left over" part of the target, or what the model failed to learn about the target from that feature.

In [None]:
# Plot all num_sold_true and num_sold_pred (five years) for one country-store-product combination
def plot_five_years_combination(engineer, country='Norway', store='KaggleMart', product='Kaggle Hat'):
    """Plot predicted and observed ``num_sold`` over 2015-2019 for one series.

    A daily frame for the given country / store / product is built, passed
    through ``engineer``, label-encoded with the fitted encoders in ``le``,
    scaled with ``preproc`` and scored with the module-level ``model``. The
    predictions are mapped back from log space with ``expm1`` and drawn
    together with the matching rows of ``train_df``.

    Parameters
    ----------
    engineer : callable
        Feature-engineering function taking and returning a DataFrame.
    country, store, product : str
        Identify the series to plot.

    Returns
    -------
    pandas.Series
        The predicted ``num_sold`` values, indexed by date.
    """
    all_days = pd.date_range('2015-01-01', '2019-12-31', freq='D')
    frame = pd.DataFrame({'row_id': 0,
                          'date': all_days,
                          'country': country,
                          'store': store,
                          'product': product})
    # Keep `date` both as the index and as a regular column.
    frame.set_index('date', inplace=True, drop=False)
    frame = engineer(frame)
    frame[GROUP_INDEX] = frame[GROUP_INDEX].apply(lambda col: le[col.name].transform(col))

    scaled_features = preproc.transform(frame[features])
    frame['num_sold'] = np.expm1(model.predict(scaled_features))

    same_series = (
        (train_df.country == country)
        & (train_df.store == store)
        & (train_df['product'] == product)
    )
    train_subset = train_df[same_series]

    plt.figure(figsize=(24, 8))
    plt.plot(frame[DATE], frame.num_sold, label='prediction', alpha=0.5)
    plt.scatter(train_subset[DATE], train_subset.num_sold, label='true', alpha=0.5, color='red', s=2)
    plt.grid(True)
    plt.grid(which='minor', linestyle=':', linewidth='0.5', color='black')
    plt.legend()
    plt.title(f'{country} {store} {product} Predictions and true for five years')
    plt.show()
    return frame['num_sold']

In [None]:
def plot_true_vs_prediction(df_true, df_hat):
    """Scatter ``log1p`` of predictions and of reference values against row position.

    Parameters
    ----------
    df_true : array-like
        Reference (true or pseudo-label) values, drawn in red.
    df_hat : array-like
        Predicted values, drawn in blue.
    """
    plt.figure(figsize=(20, 13))
    # (values, legend label, colour, marker size); predictions are drawn first.
    layers = (
        (df_hat, 'prediction', 'blue', 3),
        (df_true, 'Pseudo/true', 'red', 7),
    )
    for values, label, colour, size in layers:
        plt.scatter(np.arange(len(values)), np.log1p(values), label=label, alpha=0.5, color=colour, s=size)
    plt.legend()
    plt.title(f'Predictions VS Pseudo-label {column_y} (LOG)')
    plt.show()

In [None]:
def plot_residuals(y_residuals):
    """Scatter ``log1p`` of the given residuals against their position.

    NOTE(review): ``log1p`` yields NaN for residuals below -1, so such points
    are presumably not drawn — confirm this is acceptable.
    """
    positions = np.arange(len(y_residuals))
    log_residuals = np.log1p(y_residuals)

    plt.figure(figsize=(13, 3))
    plt.scatter(positions, log_residuals, label='residuals', alpha=0.1, color='blue', s=5)
    plt.legend()
    plt.title(f'Linear Model residuals {column_y} (LOG)')
    plt.tight_layout()
    plt.show()

In [None]:
def plot_oof(y_true, y_predict):
    """Scatter predictions against true values with a dashed identity line.

    Parameters
    ----------
    y_true : array-like
        Values for the x-axis.
    y_predict : array-like
        Values for the y-axis.
    """
    plt.figure(figsize=(5, 5))
    plt.scatter(y_true, y_predict, s=1, color='r', alpha=0.5)

    # Diagonal spanning the x-range that the scatter established.
    x_low, x_high = plt.xlim()
    plt.plot([x_low, x_high], [x_low, x_high], '--', color='k')

    plt.gca().set_aspect('equal')
    plt.xlabel('y_true')
    plt.ylabel('y_pred')
    plt.title('OOF Predictions')
    plt.show()

In [None]:
def find_min_SMAPE(y_true, y_predict):
    """Grid-search the multiplicative factor for ``y_predict`` that minimises SMAPE.

    Factors from 0.97 up to (but excluding) 1.02 in steps of 0.0001 are tried.
    The best score and factor are printed, and the SMAPE-versus-factor curve
    is plotted with the minimum highlighted.

    Parameters
    ----------
    y_true : array-like
        Reference values.
    y_predict : numpy.ndarray or pandas.Series
        Predictions to rescale; not modified.

    Returns
    -------
    float
        The factor with the lowest SMAPE (the largest such factor on ties).

    Raises
    ------
    ValueError
        If SMAPE is NaN for every candidate factor.
    """
    weights = np.arange(0.97, 1.02, 0.0001)
    losses = np.array([np.mean(smape_loss(y_true, y_predict * weight)) for weight in weights])

    # Without this guard the search below would find no match at all.
    if np.isnan(losses).all():
        raise ValueError('SMAPE is NaN for every candidate scaling factor')

    min_SMAPE = np.nanmin(losses)
    # Take the last index reaching the minimum, as the original scan did.
    best = np.flatnonzero(losses == min_SMAPE)[-1]
    loss_correction = weights[best]
    print(f'min SMAPE {min_SMAPE:.5f}')
    print(f'loss_correction: {loss_correction:.5f}')

    plt.figure(figsize=(5, 3))
    # Labels are required for plt.legend() to have something to show.
    plt.plot(weights, losses, label='SMAPE')
    plt.scatter([loss_correction], [min_SMAPE], color='g', label='minimum')
    plt.ylabel(f'SMAPE')
    plt.xlabel(f'loss_correction: {loss_correction:.5f}')
    plt.legend()
    plt.title(f'min SMAPE:{min_SMAPE:.5f} scaling')
    plt.show()

    return loss_correction

In [None]:
def evaluate_SMAPE(y_va, y_va_pred):
    """Report SMAPE before and after the best multiplicative correction.

    Parameters
    ----------
    y_va : array-like
        Reference values.
    y_va_pred : numpy.ndarray
        Predictions. **Modified in place**: on return they are multiplied by
        the correction factor.

    Returns
    -------
    float
        The correction factor found by ``find_min_SMAPE``.
    """
    smape_before_correction = np.mean(smape_loss(y_va, y_va_pred))
    loss_correction = find_min_SMAPE(y_va, y_va_pred)
    # Deliberately in place: model_fit_eval plots y_va_pred after this call
    # and therefore sees the corrected values.
    y_va_pred *= loss_correction

    print(f"SMAPE (before correction: {smape_before_correction:.5f})")
    print(f'Min SMAPE: {np.mean(smape_loss(y_va, y_va_pred))}')

    return loss_correction

In [None]:
# Categorical columns that are label-encoded below.
GROUP_INDEX = ['country', 'store', 'product']

# Target series
y = X.loc[:, column_y]

# X_1: Features for Linear Regression
# NOTE(review): newer pandas versions deprecate the "A" alias in favour of "Y" — confirm against the installed version.
fourier = CalendarFourier(freq="A", order=10)  # 10 sin/cos pairs for Annual seasonality

dp = DeterministicProcess(
    index=X.index,
    constant=True,               # dummy feature for bias (y-intercept)
    order=1,                     # trend (order 1 means linear)
    seasonal=True,               # weekly seasonality (indicators)
    additional_terms=[fourier],  # annual seasonality (fourier)
    drop=True,                   # drop terms to avoid collinearity
)

X_1 = dp.in_sample()  # create features for the dates in X.index


# X_2: all engineered columns except the target (used for the stacked hybrid models)
X_2 = X.drop(column_y, axis=1)
# Fit one LabelEncoder per categorical column (kept in the `le` dict) and encode the training data
X_2[GROUP_INDEX] = X_2[GROUP_INDEX].apply(lambda x: le[x.name].fit_transform(x))
# Reuse the fitted encoders to encode the test data
X_test[GROUP_INDEX] = X_test[GROUP_INDEX].apply(lambda x: le[x.name].transform(x))

# Label encoding for seasonality
# X_2["day"] = X_2.index.dayofyear  # values are day of the month

In [None]:
# Column order used for every scaler / model call.
features = X_2.columns

In [None]:
# Deterministic features for the 365 periods following the end of X.index.
X_test_1 = dp.out_of_sample(365)

# Hybrid Models
Linear regression excels at extrapolating trends, but can't learn interactions. XGBoost excels at learning interactions, but can't extrapolate trends. We'll learn how to create "hybrid" forecasters that combine complementary learning algorithms and let the strengths of one make up for the weakness of the other.

In [None]:
from pyearth import Earth
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, HuberRegressor, RidgeCV
from sklearn.svm import LinearSVR
from sklearn.ensemble import StackingRegressor

In [None]:
# You'll add fit and predict methods to this minimal class
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted

class BoostedHybrid(BaseEstimator, RegressorMixin):
    """Two-stage residual regressor.

    ``model_1`` is fitted on the target; ``model_2`` is then fitted, on the
    same feature matrix, on the residuals ``model_1`` leaves behind. A
    prediction is the sum of both models' predictions.

    Parameters
    ----------
    model_1 : estimator
        First-stage regressor.
    model_2 : estimator
        Second-stage regressor, trained on the first stage's residuals.
    scaler : object
        Stored on the instance; not used by ``fit`` or ``predict``.
    """

    def __init__(self, model_1, model_2, scaler):
        self.model_1 = model_1
        self.model_2 = model_2
        self.scaler = scaler
        self.y_columns = None  # placeholder; never assigned anywhere else in this class

    def fit(self, X, y):
        """Fit both stages on ``X`` / ``y``.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Training features, used by both stages.
        y : array-like, shape (n_samples,)
            Target values.

        Returns
        -------
        self : object
            The fitted estimator. The validated target, both stages'
            in-sample fits and both residual vectors are kept as ``y``,
            ``y_fit``, ``y_resid``, ``y_fit2`` and ``y_resid2``.
        """
        X, y = check_X_y(X, y, accept_sparse=True)

        # Stage 1: fit on the target and keep what it cannot explain.
        first_stage = self.model_1
        first_stage.fit(X, y)
        stage1_fit = first_stage.predict(X)
        stage1_resid = y - stage1_fit

        # Stage 2: fit on the stage-1 residuals; what remains is treated as noise.
        second_stage = self.model_2
        second_stage.fit(X, stage1_resid)
        stage2_fit = second_stage.predict(X)
        stage2_resid = stage1_resid - stage2_fit

        # Kept for later inspection (e.g. residual plots).
        self.y = y
        self.y_fit = stage1_fit
        self.y_resid = stage1_resid
        self.y_fit2 = stage2_fit
        self.y_resid2 = stage2_resid

        self.is_fitted_ = True

        return self

    def predict(self, X):
        """Predict as ``model_1.predict(X) + model_2.predict(X)``.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Samples to score.

        Returns
        -------
        y : ndarray, shape (n_samples,)
            Sum of the two stages' predictions.
        """
        X = check_array(X, accept_sparse=True)
        check_is_fitted(self, 'is_fitted_')

        combined = self.model_1.predict(X)
        # Accumulate the residual model's output into the stage-1 array.
        combined += self.model_2.predict(X)

        return combined


# Training

In [None]:
# Training uses every date up to the end of 2019, i.e. including the
# pseudo-labelled test year.
TRAIN_END_DATE = "2019-12-31"
# The validation window (2015-2018) lies inside the training window, so the
# validation scores below are in-sample scores.
VALID_START_DATE = "2015-01-01"
VALID_END_DATE = "2018-12-31"

# Label-based slicing on the sorted date index.
y_train, y_valid = y[:TRAIN_END_DATE], y[VALID_START_DATE:VALID_END_DATE]
X1_train, X1_valid = X_1[:TRAIN_END_DATE], X_1[VALID_START_DATE:VALID_END_DATE]
X2_train, X2_valid = X_2.loc[:TRAIN_END_DATE], X_2.loc[VALID_START_DATE:VALID_END_DATE]

## Using StandardScaler

In [None]:
# Shared scaler: fitted on the training features in the training loop and
# reused to transform the validation, test and demo features.
preproc = StandardScaler()

In [None]:
def model_fit_eval(hybrid_model, X_train, y_train, X_valid, y_valid, scaler):
    """Fit a model, evaluate it on the validation split and on the pseudo-labelled test set.

    Targets are expected in ``log1p`` space; predictions are mapped back with
    ``expm1`` before scoring. Besides its arguments the function reads the
    module-level ``df_pseudolabels``, ``X_test``, ``features`` and ``column_y``.

    Parameters
    ----------
    hybrid_model : estimator
        Model exposing ``fit`` and ``predict``; fitted in place.
    X_train, X_valid : array-like
        Feature matrices, passed to the model as given.
    y_train, y_valid : pandas.Series
        ``log1p``-transformed targets.
    scaler : transformer
        Already fitted; used to transform ``X_test[features]``.

    Returns
    -------
    numpy.ndarray
        Uncorrected test-set predictions in the original target scale.
    """
    hybrid_model.fit(X_train, y_train)

    ###### Validation split
    y_va = np.expm1(y_valid.copy())
    # Predict once (the extra, discarded predict call was removed).
    y_va_pred = np.expm1(hybrid_model.predict(X_valid))
    # Prints the scores and rescales y_va_pred in place, so the plots below
    # show the corrected predictions. The returned factor is not needed here.
    evaluate_SMAPE(y_va, y_va_pred)

    ###### Visualize and evaluate
    plot_oof(y_va, y_va_pred)
    plot_true_vs_prediction(y_va, y_va_pred)

    ###### Validate against the 2019 pseudo-labels
    y_va = df_pseudolabels[column_y].values.reshape(-1, 1)

    # Inference on the 2019 test data
    y_va_pred = np.expm1(hybrid_model.predict(scaler.transform(X_test[features])))

    smape_before_correction = np.mean(smape_loss(y_va, y_va_pred.reshape(-1, 1)))
    print(f'***********Test Data*****************')
    # The correction is only reported here; the returned predictions stay unscaled.
    loss_correction = find_min_SMAPE(y_va, y_va_pred.reshape(-1, 1))

    print(f'SMAPE (before correction: {smape_before_correction:.5f})')
    print(f'Min SMAPE: {np.mean(smape_loss(y_va, y_va_pred.reshape(-1, 1)*loss_correction))}')

    return y_va_pred

In [None]:
# One entry per seed: the test-set predictions and the fitted stacking model.
test_pred_list = []
model_list = []

# Train one stacked ensemble per seed; the test predictions are averaged later.
for seed in range(15):
    # (name, estimator) pairs handed to StackingRegressor for this seed.
    estimator_stack = []

    # CatBoost settings.
    param1 = {
        'loss_function': 'MultiRMSE',
        'eval_metric': 'MultiRMSE',
        'n_estimators': N_ESTIMATORS,
        'od_type': 'Iter',
        'od_wait': 20,
        'random_state': seed,
        'verbose': VERBOSE
    }

    # First-stage candidates (fitted on the target).
    # Try different combinations of the algorithms above KNeighborsRegressor
    models_1 = [
        Earth(verbose=VERBOSE),
        Ridge(random_state=seed),
        HuberRegressor(epsilon=1.20, max_iter=N_ESTIMATORS),
        # LinearSVR(max_iter=N_ESTIMATORS, random_state=seed, verbose=VERBOSE),
    ]

    # Second-stage candidates (fitted on the first stage's residuals).
    # NOTE(review): tree_method='gpu_hist' presumably needs a GPU-enabled
    # XGBoost build — confirm for the target environment.
    models_2 = [
        XGBRegressor(
            objective='reg:squarederror',
            tree_method='gpu_hist',
            eval_metric=smape_loss,
            n_estimators=N_ESTIMATORS,
            random_state=seed
        ),
        LGBMRegressor(
            objective='regression',
            n_estimators=N_ESTIMATORS,
            random_state=seed
        ),
        CatBoostRegressor(**param1),
    ]

    # Build one BoostedHybrid for every (first stage, second stage) pair.
    for model_1 in models_1:
        for model_2 in models_2:
            model1_name = type(model_1).__name__
            model2_name = type(model_2).__name__

            hybrid_model = BoostedHybrid(
                    model_1 = model_1,
                    model_2 = model_2,
                    scaler = preproc
            )

            print(f'******************Stacking {model1_name:>15} with {model2_name:<18}*************************')
            estimator_stack.append((f'model_{model1_name}_{model2_name}', hybrid_model))


    # Fit the shared scaler on the training features, then stack all hybrids
    # under a RidgeCV meta-learner and evaluate the result.
    X2 = preproc.fit_transform(X2_train[features])
    model = StackingRegressor(estimators=estimator_stack, final_estimator=RidgeCV(), n_jobs=-1, verbose=VERBOSE)
    test_pred = model_fit_eval(
        model,
        X2,
        np.log1p(y_train),
        preproc.transform(X2_valid[features]),
        np.log1p(y_valid),
        preproc
    )
    
    model_list.append(model)
    test_pred_list.append(test_pred)

# Inference validation

In [None]:
# Plot five years of predictions versus observed values for every
# country / product / store combination. `model` is the stacking model left
# over from the last iteration of the training loop; `y_fit` keeps only the
# predictions of the last combination plotted.
for country in np.unique(train_df['country']):
    for product in np.unique(train_df['product']):
        for store in np.unique(train_df['store']):
            y_fit = plot_five_years_combination(feature_engineer, country=country, product=product, store=store)

# Inference year 2019 test data

In [None]:
# Average the test predictions of all seeds.
y_pred = sum(test_pred_list) / len(test_pred_list) # model.predict(X_test[features])

In [None]:
%%time
LOSS_CORRECTION = 1

###### Preprocess the validation data
# Pseudo-labels as a column vector, used as the reference for the test period.
y_va = df_pseudolabels[column_y].values.reshape(-1, 1)

# Inference for validation
# Work on a copy so the in-place scaling below leaves y_pred untouched.
y_va_pred = y_pred.copy().reshape(-1, 1) #model.predict(X_test[features])

# Evaluation: Execution time and SMAPE
# Both values are computed before the correction, so they are identical.
smape_before_correction = np.mean(smape_loss(y_va, y_va_pred))
smape = np.mean(smape_loss(y_va, y_va_pred))
LOSS_CORRECTION = find_min_SMAPE(y_va, y_va_pred)
y_va_pred *= LOSS_CORRECTION

print(f" SMAPE: {smape:.5f} (before correction: {smape_before_correction:.5f})")
# SMAPE after applying the correction factor.
print(np.mean(smape_loss(y_va, y_va_pred)))

In [None]:
# For every fitted stacking model, plot the residuals of its first hybrid
# estimator. The first two plots use the ensemble-level y_va / y_va_pred and
# are therefore identical on every iteration. The loop also rebinds the
# module-level name `model`.
for model in model_list:
    plot_oof(y_va, y_va_pred)
    plot_true_vs_prediction(y_va, y_va_pred)
    plot_residuals(model.estimators_[0].y_resid)
    plot_residuals(model.estimators_[0].y_resid2)
    # plot_residuals(model.y_resid3)

# Submission
Once you're satisfied with everything, it's time to create your final predictions! This cell will:

- use the best trained model to make predictions from the test set
- save the predictions to a CSV file


In [None]:
from math import ceil, floor, sqrt
# from https://www.kaggle.com/fergusfindley/ensembling-and-rounding-techniques-comparison
def geometric_round(arr):
    """Round each value up or down around the geometric mean of its neighbours.

    A value is rounded down when it lies below ``sqrt(floor(x) * ceil(x))``
    and up otherwise.

    Parameters
    ----------
    arr : array-like
        Values to round.

    Returns
    -------
    numpy.ndarray
        Rounded values (floating-point dtype).
    """
    lower = np.floor(arr)
    upper = np.ceil(arr)
    threshold = np.sqrt(lower * upper)
    return np.where(arr < threshold, lower, upper)

In [None]:
# Sample submission: supplies the row ids and the number of rows.
sub = pd.read_csv("../input/tabular-playground-series-jan-2022/sample_submission.csv")

In [None]:
# Inference for test
# Collect three prediction vectors: this notebook's seed-averaged prediction
# and the two pseudo-label files.
test_prediction_list = []
test_prediction_list.append(y_pred) # * LOSS_CORRECTION)

test_prediction_list.append(df_pseudolabels[column_y].values)

df_pseudolabels1 = pd.read_csv(PSEUDO_DIR2, index_col=ID)
test_prediction_list.append(df_pseudolabels1[column_y].values)

# Element-wise median across the three vectors; from here on the name refers
# to a single array, not a list.
test_prediction_list = np.median(test_prediction_list, axis=0) # median is better https://www.kaggle.com/saraswatitiwari/tabular-playground-series-22

if len(test_prediction_list) > 0:
    # Create the submission file
    submission = pd.DataFrame(data=np.zeros((sub.shape[0], 2)), index=sub.index.tolist(), columns=[ID,column_y])
    submission[ID] = sub[ID]
    submission[column_y] = test_prediction_list
    # Round to whole units with geometric rounding before writing the file.
    submission[column_y] = geometric_round(submission[column_y]).astype(int) # https://www.kaggle.com/c/tabular-playground-series-jan-2022/discussion/299162
    submission.to_csv('submission.csv', index=False)

    # Plot the distribution of the test predictions
    # next to the distribution of the training target.
    plt.figure(figsize=(16,3))
    plt.hist(
        train_df[column_y],
        bins=np.linspace(0, 3000, 201),
        density=True,
        label='Training')
    plt.hist(
        submission[column_y],
        bins=np.linspace(0, 3000, 201),
        density=True,
        rwidth=0.5,
        label='Test predictions')
    plt.xlabel(column_y)
    plt.ylabel('Frequency')
    plt.legend()
    plt.show()

In [None]:
# Show the first and last 30 rows of the submission.
display(submission.head(30))
display(submission.tail(30))