log
- ceil to round up prediction
- long form seems to be better for multi-series prediction than wide form, since each point is a separate sample,
while in wide form the same feature set is used for all the different series
- modified the boosted hybrid class to use hyperparameter tuning
- log transform the target (for feature-transforming model) improves the result a bit
- use gdp (since the model is using the same trend coeff for all the series) improves the score significantly
- scoring (SMAPE vs default) in CV search: seems to return the same hyperparams

what to try next
- stacking/ensemble (catboost, etc)
- hyperparams tuning strategy
- use all features for both X1,X2 in boosted model
- plot resid vs date 
- feature - moving average, ema, etc
- feature - lag
- feature - holiday effect instead of only the holiday

trying:
- hyperparams tuning strategy

In [None]:
!pip install scikit-learn  -U
!pip install scikit-optimize -U

In [None]:
import itertools
from math import ceil
from functools import partial

import pandas as pd
import numpy as np
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from statsmodels.tsa.deterministic import DeterministicProcess
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
import seaborn as sns
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

# Raise pandas' display limits so large frames are shown without truncation.
for _option, _value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(_option, _value)

In [None]:
# modified from https://www.kaggle.com/c/web-traffic-time-series-forecasting/discussion/36414
def smape_loss(y_true, y_pred, ne=False):
    """
    Symmetric mean absolute percentage error (SMAPE), expressed in percent.

    Each element contributes ``200 * |y_true - y_pred| / (|y_true| + |y_pred|)``;
    elements where both values are zero contribute 0. The result is the mean
    over all elements.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)
        Ground truth (correct) target values.
    y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)
        Estimated target values.
    ne : bool, default=False
        If True, return the negated loss so that larger values are better.

    Returns
    -------
    loss : float
        Mean SMAPE over all elements (negated when ``ne`` is True).
        The un-negated value is non-negative; the best value is 0.0.
    """
    # Work on float arrays: this accepts lists/scalars and avoids wrap-around
    # when both inputs are unsigned integers.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    denominator = (np.abs(y_true) + np.abs(y_pred)) / 200.0
    abs_error = np.abs(y_true - y_pred)
    # Divide only where the denominator is non-zero; the remaining entries keep
    # the 0.0 they were initialised with (no divide-by-zero warning).
    diff = np.divide(
        abs_error,
        denominator,
        out=np.zeros_like(denominator),
        where=denominator != 0,
    )

    result = np.mean(diff)
    return -result if ne else result



In [None]:
# Input files.
TRAIN_PATH = '/kaggle/input/tabular-playground-series-jan-2022/train.csv'
TEST_PATH = '/kaggle/input/tabular-playground-series-jan-2022/test.csv'

# Column dtypes passed to pd.read_csv. The test mapping is the train mapping
# without the target column.
TRAIN_DTYPE = {
    'row_id': 'uint32',
    'country': 'category',
    'store': 'category',
    'product': 'category',
    'num_sold': 'uint32',
}
TEST_DTYPE = {
    column: dtype
    for column, dtype in TRAIN_DTYPE.items()
    if column != 'num_sold'
}

# Column roles.
CAT_COLS = ['country', 'store', 'product']
TS_COLS = list(CAT_COLS)  # columns that together identify one series
TARGET_COL = 'num_sold'
DATE_COL = 'date'
DATE_FREQ = 'D'
ID_COL = 'row_id'

# Modelling switches.
FOURIER_ORDER = 10  # order used for every CalendarFourier term
LOG_TARGET = True   # passed to BoostedHybrid as log_transform

In [None]:
# Load the competition data. `infer_datetime_format` is no longer passed: it is
# deprecated in pandas >= 2.0 and `parse_dates` alone already parses the column.
train = pd.read_csv(TRAIN_PATH, dtype=TRAIN_DTYPE, parse_dates=[DATE_COL])
test = pd.read_csv(TEST_PATH, dtype=TEST_DTYPE, parse_dates=[DATE_COL])

# Represent dates as periods of frequency DATE_FREQ.
train[DATE_COL] = train[DATE_COL].dt.to_period(DATE_FREQ)
test[DATE_COL] = test[DATE_COL].dt.to_period(DATE_FREQ)

# Index every row by (series identifiers..., date) and stack train on top of
# test so features can be built for both in one pass.
train = train.set_index(TS_COLS+[DATE_COL]).sort_index()
test = test.set_index(TS_COLS+[DATE_COL]).sort_index()
data = pd.concat([train,test])

# External datasets. Later cells read `year` and `GDP_<country>` columns from
# the GDP file and `date`, `country`, `holiday` columns from the holiday file.
gdp = pd.read_csv("../input/gdp-20152019-finland-norway-and-sweden/GDP_data_2015_to_2019_Finland_Norway_Sweden.csv", )
# NOTE(review): index_col=0 presumably drops a saved row-number column — confirm.
holiday = pd.read_csv("../input/nordic-holidays/nordic_holidays.csv",index_col=0, parse_dates=[DATE_COL])
holiday[DATE_COL] = holiday[DATE_COL].dt.to_period(DATE_FREQ)


In [None]:
# Wide form of the target: drop the row id, then move the series-identifier
# index levels (TS_COLS) into the columns so each row is one date.
y = data.drop(columns='row_id').unstack(TS_COLS)

## features

#### trend, seasonality

In [None]:
# Calendar Fourier terms at three frequencies ("A", "M", "W"), all of the same order.
fourier_A, fourier_M, fourier_W = (
    CalendarFourier(freq, order=FOURIER_ORDER) for freq in ("A", "M", "W")
)

# Deterministic (date-only) features built on the date index of the wide target.
dp = DeterministicProcess(
    index=y.index,
    constant=False,  # no intercept column
    order=1,         # order of the time trend: 1 = linear, 2 = quadratic, ...
    seasonal=True,   # together with period=7: weekly seasonal indicators
    period=7,
    additional_terms=[fourier_A, fourier_M, fourier_W],
    drop=True,       # drop perfectly collinear columns
)

X = dp.in_sample()

In [None]:
# Preview the first rows of the deterministic (trend/seasonality) features.
X.head(10)

In [None]:
# Long form of X: one row per (series, date) in `data`, each carrying the
# date-level deterministic features (the same features repeat for every series).
series_dates = data.reset_index()[TS_COLS + [DATE_COL]].set_index('date')
X = series_dates.merge(X, how='left', left_index=True, right_index=True)

In [None]:
# Re-index the long-form features by (date, series identifiers), then split
# them into the trend column and everything else (the seasonal terms).
_X_long = X.reset_index().set_index([DATE_COL] + TS_COLS)
X_trend = _X_long[['trend']]
X_seasonal = _X_long.drop(columns=['trend'])

#### series label

In [None]:
# One-hot encode the series identity: join the TS_COLS values of every row into
# a single "a-b-c" name, then expand that name into indicator columns that share
# X_seasonal's index.
series_name = (
    pd.DataFrame(index=X_seasonal.index)
    .reset_index()[TS_COLS]
    .apply(lambda row: '-'.join(row), axis=1)
    .to_frame(name='series_name')
    .set_index(X_seasonal.index)
    .series_name
)
X_series = pd.get_dummies(series_name, drop_first=False)

# Alternative kept for reference: integer label encoding instead of one-hot.
# # label
# le = LabelEncoder()
# X_series = pd.DataFrame(index=X_seasonal.index).reset_index()[TS_COLS]
# X_series = X_series.apply(lambda x: '-'.join(x), axis=1).to_frame(name='series_name')
# X_series = X_series.set_index(X_seasonal.index)
# X_series = X_series.assign(series_name=lambda x: le.fit_transform(x.series_name))

In [None]:
# Non-trend feature set: seasonal terms plus the series indicators, matched on
# their shared index.
X_non_trend = X_seasonal.merge(
    X_series,
    how='left',
    left_index=True,
    right_index=True,
)

#### holiday

In [None]:
# One indicator column per holiday name, indexed by (date, country).
# NOTE(review): if one (date, country) pair has several holiday rows, the left
# merge below would duplicate feature rows — confirm against the holiday file.
holiday = pd.get_dummies(
    holiday.set_index(['date', 'country']).holiday,
    drop_first=False,
)

# Move store/product out of the index so both sides are keyed by (date, country),
# then attach the holiday indicators. Rows without a matching holiday get NaN in
# the holiday columns (left join).
X_non_trend = X_non_trend.reset_index(['store', 'product']).merge(
    holiday,
    how='left',
    left_index=True,
    right_index=True,
)

In [None]:
# Restore the (date, series identifiers) index and sort it. X_trend and y are
# both sorted on this same index, and the later `.loc[start:end]` date slices and
# the positional pairing of feature rows with targets rely on a sorted index.
X_non_trend = X_non_trend.reset_index().set_index([DATE_COL]+TS_COLS).sort_index()

#### gdp

In [None]:
# Reshape the GDP table from one column per country (GDP_<country>) into one
# row per (year, country).
results = [
    [yr, ctry, np.squeeze(gdp[gdp.year == yr][f'GDP_{ctry}'])]
    for yr in gdp.year
    for ctry in ['Finland', 'Norway', 'Sweden']
]
gdp = pd.DataFrame(results, columns=['year', 'country', 'gdp']).set_index(['year', 'country'])

# Attach log-GDP to every row by (year, country), restore the sorted
# (date, series identifiers) index, and keep GDP as the only trend feature:
# the linear `trend` column and the helper `year` column are dropped.
X_trend = X_trend.reset_index()
X_trend = (
    X_trend.assign(year=X_trend['date'].dt.year)
    .set_index(['year', 'country'])
    .merge(gdp, how='left', left_index=True, right_index=True)
    .assign(gdp=lambda frame: np.log(frame['gdp']))
    .reset_index()
    .set_index([DATE_COL] + TS_COLS)
    .sort_index()
    .drop(columns=['trend', 'year'])
)

In [None]:
# Back to long form: move the TS_COLS column levels into the row index and sort,
# giving a (date, series identifiers) index.
# NOTE(review): stack presumably drops rows whose values are all NaN, so the
# test-period rows (no target) may disappear from y here — confirm.
y = y.stack(TS_COLS).sort_index()

#### visualize the data

In [None]:
# Preview the trend feature set.
X_trend.head(10)

In [None]:
# Preview the non-trend feature set (seasonal terms, series and holiday indicators).
X_non_trend.head(10)

In [None]:
# Preview the long-form target.
y.head(10)

## Model and Prediction

In [None]:
# Negated SMAPE as a plain callable (larger is better).
ne_smape_loss = partial(smape_loss, ne=True)

# Scorer for hyperparameter search. It is built from the named function with
# greater_is_better=False, which negates the loss exactly like `ne=True` does,
# but keeps a `__name__` on the wrapped function (a functools.partial has none,
# which breaks the scorer's repr).
cv_score = make_scorer(smape_loss, greater_is_better=False)

class BoostedHybrid:
    """Two-stage model: model 1 fits the target, model 2 fits model 1's residuals.

    Both stages are tuned with GridSearchCV. Model 1 is trained on the
    (optionally log1p-transformed) target; model 2 is trained on the residuals
    measured on the original target scale. Predictions are the sum of the two.

    Parameters
    ----------
    model_1, model_2 : estimators
        Estimators handed to GridSearchCV for stage 1 and stage 2.
    model_1_param_grid, model_2_param_grid : dict or list of dict
        Parameter grids for the respective stage.
    log_transform : bool
        If True, stage 1 is trained on log1p(y) and its predictions are mapped
        back with expm1.
    """

    def __init__(self, model_1, model_2, model_1_param_grid, model_2_param_grid, log_transform):
        self.model_1 = model_1
        self.model_2 = model_2
        self.model_1_param_grid = model_1_param_grid
        self.model_2_param_grid = model_2_param_grid
        self.log_transform = log_transform

    def fit(self, X_1, X_2, y):
        """Tune and fit both stages.

        Parameters
        ----------
        X_1 : features for model 1.
        X_2 : features for model 2 (rows paired positionally with ``X_1``/``y``).
        y : target on its original scale.

        Returns
        -------
        self
            The fitted instance. Fitted searches are stored as ``model_1_gcv``
            and ``model_2_gcv``; in-sample fits and residuals of both stages are
            stored as ``model_{1,2}_y_fit`` / ``model_{1,2}_y_resid``.
        """
        # Stage 1: time-ordered CV splits, scored with the module-level scorer.
        # Note that scoring happens on the transformed target when log_transform
        # is enabled, because that is what the search is fitted on.
        self.model_1_gcv = GridSearchCV(
            self.model_1,
            self.model_1_param_grid,
            cv=TimeSeriesSplit(),
            verbose=3,
            scoring=cv_score,
        )
        self.model_1_gcv.fit(X_1, self.log_transform_switch(y, inverse=False))

        # Squeeze exactly as predict() does so both code paths treat model 1's
        # output shape the same way.
        model_1_y_fit = self.log_transform_switch(
            np.squeeze(self.model_1_gcv.best_estimator_.predict(X_1)),
            inverse=True,
        )
        model_1_y_resid = y - model_1_y_fit

        # Stage 2: plain 5-fold CV with the estimator's default scorer.
        # NOTE(review): stage 1 uses TimeSeriesSplit while this uses cv=5 —
        # confirm the difference is intended.
        self.model_2_gcv = GridSearchCV(
            self.model_2,
            self.model_2_param_grid,
            cv=5,
            verbose=3,
        )
        self.model_2_gcv.fit(X_2, model_1_y_resid)
        model_2_y_fit = self.model_2_gcv.best_estimator_.predict(X_2)
        model_2_y_resid = model_1_y_resid - model_2_y_fit

        self.model_1_y_fit = model_1_y_fit
        self.model_1_y_resid = model_1_y_resid
        self.model_2_y_fit = model_2_y_fit
        self.model_2_y_resid = model_2_y_resid

        print(f"model_1_best_params_: {self.model_1_gcv.best_params_}")
        print(f"model_2_best_params_: {self.model_2_gcv.best_params_}")
        return self

    def predict(self, X_1, X_2):
        """Return model 1's prediction (original scale) plus model 2's residual prediction."""
        y_pred = self.log_transform_switch(
            np.squeeze(self.model_1_gcv.best_estimator_.predict(X_1)),
            inverse=True,
        )
        # Out-of-place add: works when model 1 returns integers and never
        # mutates an array owned by the estimator.
        return y_pred + self.model_2_gcv.best_estimator_.predict(X_2)

    def log_transform_switch(self, value, inverse):
        """Apply log1p (or expm1 when ``inverse``) if log_transform is enabled.

        When log_transform is disabled, ``value`` is returned unchanged.
        """
        if not self.log_transform:
            return value
        return np.expm1(value) if inverse else np.log1p(value)

In [None]:
# Split by date: 2015-2018 for training, 2019 for testing.
_train_span = slice("2015-01-01", "2018-12-31")
_test_span = slice("2019-01-01", "2019-12-31")

y_train, y_test = y.loc[_train_span].num_sold, y.loc[_test_span].num_sold
X_trend_train, X_trend_test = X_trend.loc[_train_span], X_trend.loc[_test_span]
X_non_trend_train, X_non_trend_test = X_non_trend.loc[_train_span], X_non_trend.loc[_test_span]

In [None]:
# Stage-1 (trend) model: standardise the features, then ridge regression.
trend_pipe = Pipeline([
    ('preprocessor', StandardScaler()),
    ('ridge', Ridge()),
])
# Regularisation strengths to search over.
trend_param_grid = [
    {'ridge__alpha': [0.01, 0.1, 0.5, 1.0, 10.]},
]

# Stand-alone search for the trend model, kept for reference.
# trend_gcv = GridSearchCV(
#     trend_pipe, 
#     trend_param_grid, 
#     cv=TimeSeriesSplit(), 
#     verbose=3, 
#     scoring=cv_score,
#                   )
# trend_gcv.fit(X_trend_train, y_train)
# trend_best_model = trend_gcv.best_estimator_

# print(f"gcv.best_params_: {trend_gcv.best_params_}")
# print(f"gcv.best_score_: {trend_gcv.best_score_}")
# print(f"smaple_loss: {smape_loss(y_train, trend_best_model.predict(X_trend_train))}")

In [None]:
# Stage-2 (non-trend) model: gradient-boosted trees.
non_trend_pipe = Pipeline([
    ('xgb', XGBRegressor()),
])
# Grid: two learning rates at a fixed number of trees.
non_trend_param_grid = {
    'xgb__learning_rate': [.03, 0.1],
    'xgb__n_estimators': [1000],
}

# Stand-alone search for the non-trend model, kept for reference.
# non_trend_gcv = GridSearchCV(
#     non_trend_pipe, 
#     non_trend_param_grid, 
#     cv=TimeSeriesSplit(),
#     verbose=3, 
# #                    scoring=cv_score,
#                   )
# y_resid = y_train - trend_best_model.predict(X_trend_train)
# non_trend_gcv.fit(X_non_trend_train, y_resid)
# non_trend_best_model = non_trend_gcv.best_estimator_

In [None]:
# Build the two-stage model from the trend and non-trend pipelines, fit it on
# the training period and report the in-sample SMAPE.
boosted_model = BoostedHybrid(
    model_1=trend_pipe,
    model_2=non_trend_pipe,
    model_1_param_grid=trend_param_grid,
    model_2_param_grid=non_trend_param_grid,
    log_transform=LOG_TARGET,
)
boosted_model.fit(X_trend_train, X_non_trend_train, y_train)
_train_smape = smape_loss(y_train, boosted_model.predict(X_trend_train, X_non_trend_train))
print(f"smaple_loss: {_train_smape}")

In [None]:
# In-sample predictions are kept as-is; test predictions are rounded up.
y_train_pred = boosted_model.predict(X_1=X_trend_train, X_2=X_non_trend_train)
y_test_pred = np.ceil(
    boosted_model.predict(X_1=X_trend_test, X_2=X_non_trend_test)
)

In [None]:
# Actual vs predicted values for the training period, with the index levels
# turned back into ordinary columns.
train_result = (
    pd.DataFrame(index=X_trend_train.index)
    .assign(y_train=y_train, y_train_pred=y_train_pred)
    .reset_index()
)

## Error analysis

In [None]:
# Predicted vs actual on the training set, both axes limited to 0-3000.
plt.figure(figsize=(16, 16))
ax = sns.scatterplot(x="y_train_pred", y="y_train", data=train_result, legend='brief')
ax.set_xlim(0, 3000)
ax.set_ylim(0, 3000)

In [None]:
def get_ts(df, date_start, date_end, country, store, product, set_index='date'):
    """Return the rows of one series within an inclusive date range.

    Parameters
    ----------
    df : DataFrame with `date`, `country`, `store` and `product` columns.
    date_start, date_end : inclusive bounds compared against the `date` column.
    country, store, product : values identifying the series.
    set_index : column label (or list of labels), bool or None, default 'date'
        Column(s) to use as the index of the result. A falsy value leaves the
        index untouched; `True` is treated as 'date'.

    Returns
    -------
    DataFrame containing only the matching rows.
    """
    ts_df = df.query(f"date >='{date_start}' and date<='{date_end}' and country=='{country}' and store=='{store}' and product=='{product}'")
    if set_index:
        # Honour the requested column instead of always indexing by 'date';
        # a bare True keeps the previous behaviour.
        index_col = 'date' if set_index is True else set_index
        ts_df = ts_df.set_index(index_col)
    return ts_df

In [None]:
# Every (country, store, product) combination present in the training results.
combinations = list(itertools.product(
    train_result['country'].unique(),
    train_result['store'].unique(),
    train_result['product'].unique(),
))

# One panel per series, three panels per row: actual vs predicted over time.
n_rows = int(ceil(len(combinations) / 3))
fig, axes = plt.subplots(n_rows, 3, sharex=False, sharey=False, figsize=(16*3, 9*3))
for ts_comb, ax in zip(combinations, axes.flatten()):
    ts_df = get_ts(train_result, '2015-01-01', '2018-12-31', *ts_comb)
    panel_title = '-'.join(ts_comb)
    ts_df['y_train'].plot(alpha=0.5, label='train', title=panel_title, ylabel="items sold", ax=ax)
    ts_df['y_train_pred'].plot(ax=ax, label='pred')

In [None]:
# In-sample residuals (actual minus predicted), as a one-column frame on the
# training index.
resid = y_train - y_train_pred
resid_df = pd.DataFrame(data={'resid': resid}, index=y_train.index)

In [None]:
# Residual distribution. displot is a figure-level function that creates its
# own figure, so the size is set through height/aspect (9 high, 16:9) instead
# of a preceding plt.figure call, which would only leave an empty figure behind.
sns.displot(data=resid_df, x="resid", kde=True, stat='density', height=9, aspect=16 / 9)

In [None]:
# Box plot of the in-sample residuals.
plt.figure(figsize=(16, 9))
resid_values = resid_df["resid"]
ax = sns.boxplot(y=resid_values)

## Submission

In [None]:
# Overlay the histograms of train and test predictions on the same axes.
plt.figure(figsize=(16, 9))
ax = pd.Series(y_train_pred).hist(label='train')
pd.Series(y_test_pred).hist(ax=ax, label='test')
ax.set_title('Distributions of predctions')
ax.legend()

In [None]:
# Put the test predictions on the test-period feature index, join with `test`
# on the shared index to pick up the row ids, and write id/target pairs ordered
# by row id.
submission = pd.DataFrame({'num_sold': y_test_pred}, index=X_trend_test.index)
submission_output = (
    submission.merge(test, how='inner', left_index=True, right_index=True)
    .reset_index(drop=True)
    .sort_values(ID_COL)[[ID_COL, TARGET_COL]]
)
submission_output.to_csv('submission.csv', index=False)
submission_output.head()