In [None]:
# Start from a clean interactive namespace so the notebook behaves the same on
# every top-to-bottom run: `-s` is a soft reset (namespace only, history kept)
# and `-f` skips the confirmation prompt.
%reset -sf

In [None]:
# Upgrade scikit
#!pip uninstall scikit-learn -y
!pip install -U scikit-learn

In [None]:
# Patch Xeon Intel OneAPI Scikit accelerator
!pip install scikit-learn-intelex
from sklearnex import patch_sklearn
patch_sklearn()

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.ensemble import HistGradientBoostingRegressor

import matplotlib.pyplot as plt

In [None]:
# Make IPython display the value of every top-level expression in a cell rather
# than only the last one. Later cells end with two bare names (e.g. `train`
# followed by `test`) and rely on this setting to show both.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [None]:
# The data is expected in an `input` directory that sits next to the current
# working directory.
path = Path.cwd().parent / 'input'
# Recursive listing of everything below `path` (directories included). The
# entries come back in whatever order the filesystem yields them; nothing here
# sorts them.
files = [entry for entry in path.rglob('*')]
files

In [None]:
def _pick_csv(name, fallback_index):
    """Return the entry of `files` called `name`, else `files[fallback_index]`.

    `files` comes from `Path.rglob`, whose ordering is filesystem dependent, so
    a fixed position is not a reliable way to find a particular file. The name
    is only trusted when it matches exactly one entry; otherwise the original
    positional choice is used unchanged.
    """
    hits = [f for f in files if f.name == name]
    return hits[0] if len(hits) == 1 else files[fallback_index]

# NOTE(review): assumes the competition files are named train.csv / test.csv;
# confirm against the listing printed by the previous cell.
train = pd.read_csv(_pick_csv('train.csv', 2))
# Unparseable dates become NaT instead of raising (errors='coerce').
train['date'] = pd.to_datetime(train['date'], errors='coerce')

test = pd.read_csv(_pick_csv('test.csv', 3))
test['date'] = pd.to_datetime(test['date'], errors='coerce')

train
test

In [None]:
def pre_process(df):
    """Derive calendar features and integer-encode the categorical columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns `date`, `row_id`, `country`, `store` and
        `product`. `date` may already be datetime-typed or hold anything
        `pd.to_datetime` can parse. Any other column is passed through.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) without `date` and `row_id`,
        with `country`/`store`/`product` replaced by integer category codes,
        and with these columns appended in order: `year`, `quarter`, `month`,
        `week` (ISO week number), `day`, `dayofyear`, `daysinmonth`,
        `dayofweek`, `weekend` (1 for Saturday/Sunday, else 0), `weekday`.

    Notes
    -----
    Category codes are derived from the values present in `df` alone, so two
    separately processed frames only share an encoding when they contain the
    same set of values in each of those columns.
    """
    df = df.copy()
    # Parse up front so every `.dt` access below also works for string dates
    # (the original converted only after the first three `.dt` lookups).
    df['date'] = pd.to_datetime(df['date'])
    dates = df['date'].dt
    #
    df['year'] = dates.year
    df['quarter'] = dates.quarter
    df['month'] = dates.month
    # `Series.dt.week` was removed in pandas 2.0; the ISO calendar week is its
    # documented replacement. It comes back as nullable UInt32, so cast to the
    # dtype the old accessor produced: int64, or float64 when NaT is present.
    iso_week = dates.isocalendar().week
    if iso_week.notna().all():
        df['week'] = iso_week.astype('int64')
    else:
        df['week'] = iso_week.astype('float64')
    df['day'] = dates.day
    df['dayofyear'] = dates.dayofyear
    df['daysinmonth'] = dates.days_in_month
    df['dayofweek'] = dates.dayofweek
    # In leap years shift every day from Feb 29 (day 60) onwards back by one so
    # that a given calendar date maps to the same day-of-year in every year.
    df.loc[dates.is_leap_year & (df['dayofyear'] >= 60), 'dayofyear'] -= 1
    df['weekend'] = (dates.weekday >= 5).astype(int)
    df['weekday'] = dates.weekday
    #
    for column in ('country', 'store', 'product'):
        df[column] = pd.Categorical(df[column]).codes
    #
    df = df.drop(columns=['date', 'row_id'])
    return df

# Run both splits through the same feature transform; the raw frames stay
# available under their original names.
train_, test_ = (pre_process(frame) for frame in (train, test))

train_
test_

In [None]:
# Helpers

def lag_data(data, num_lags):
    """Append `num_lags` row-shifted copies of `data` as extra columns.

    For each lag k = 1..num_lags the whole frame is shifted down by k rows; the
    k leading rows that this empties are back-filled from the first shifted
    row. The shifted frames are concatenated to the right of `data`.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to lag. It is not modified.
    num_lags : int
        Number of lagged copies to append; 0 returns a plain copy of `data`.

    Returns
    -------
    pandas.DataFrame
        `data` followed by the lagged copies, i.e. `(num_lags + 1)` times the
        original number of columns. The lagged columns keep their original
        names, so column labels repeat.

    Notes
    -----
    The shift follows row order across the entire frame; there is no grouping,
    so rows belonging to different series are lagged into each other.
    """
    # `fillna(method='bfill')` is deprecated since pandas 2.1; `.bfill()` is the
    # equivalent dedicated method.
    lags = [data.shift(step).bfill() for step in range(1, num_lags + 1)]
    return pd.concat([data, *lags], axis=1)

def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, on a 0-200 scale.

    Computes mean(|y_true - y_pred| / ((|y_true| + |y_pred|) / 200)). Elements
    where both values are zero contribute 0 instead of dividing by zero.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values (lists, NumPy arrays or pandas Series).
        They are compared position by position after conversion to float
        arrays.

    Returns
    -------
    float
        The mean of the element-wise errors.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # Both magnitudes belong in the denominator; using the signed `y_true`
    # shrinks it (or zeroes it) whenever an actual value is negative.
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 200.0
    # Divide only where the denominator is non-zero; the remaining slots keep
    # the 0.0 they were initialised with, so no divide-by-zero warning is raised.
    diff = np.divide(
        np.abs(y_true - y_pred),
        denominator,
        out=np.zeros_like(denominator),
        where=denominator != 0,
    )
    return np.mean(diff)

In [None]:
# Estimator and params
# Keyword arguments for the gradient-boosting regressor, kept in one mapping so
# they can be inspected or reused. The long decimals are reproduced verbatim.
ts_params = dict(
    loss='squared_error',
    learning_rate=0.2900707070707071,
    max_iter=60,
    max_leaf_nodes=52,
    max_depth=86,
    min_samples_leaf=7,
    l2_regularization=0.590040404040404,
    max_bins=224,
    tol=0.009000000000000001,
    random_state=32,
)

# Unfitted estimator; the trailing bare name displays its configuration.
ts_est = HistGradientBoostingRegressor(**ts_params)
ts_est

In [None]:
# Forecast and see

# Full data: every column except the target is a feature.
train_X, train_y = train_.drop('num_sold', axis=1), train_['num_sold']

# Add two lagged copies of the feature columns to both splits.
train_X = lag_data(train_X, 2)
test__ = lag_data(test_, 2)

# Fit and forecast
ts_est.fit(train_X, train_y)
forec = ts_est.predict(test__)
# Store the forecast on the raw test frame so it can be plotted next to train.
test['num_sold'] = forec

# Vis
train['type'] = 'train'
test['type'] = 'test'

# `DataFrame.append` was removed in pandas 2.0; `pd.concat` stacks the two
# frames the same way (original indices kept).
temp = pd.concat([train, test])
# One figure per country / store / product combination: the history and the
# forecast are drawn as two separate lines on the same axes.
for c in temp['country'].unique():
    for s in temp['store'].unique():
        for p in temp['product'].unique():
            fig, ax = plt.subplots(figsize=(15,6))
            mask = (temp['country']==c) & (temp['store']==s) & (temp['product']==p)
            sub = temp[mask].set_index('date')
            ax.plot(sub['num_sold'][sub['type']=='train'], label=f'{c} | {s} | {p}')
            ax.plot(sub['num_sold'][sub['type']=='test'])
            ax.legend()
            plt.show()
            # Release exactly the figure created above; closing by handle does
            # not depend on which figure pyplot currently considers "current".
            plt.close(fig)

In [None]:
# I'll be looking at ways to enhance this.
# Feel free to edit it yourself.

In [None]:
# ~~~ END ~~~