# Let's go!
## Imports and Set Up
___

In [None]:
import warnings
from itertools import product
from pathlib import Path
from copy import deepcopy

import holidays
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from joblib import Parallel, delayed

from sklearn.base import (
    BaseEstimator, TransformerMixin, RegressorMixin
)
from sklearn.compose import (
    make_column_transformer, make_column_selector,
)

from sklearn.compose import (
    make_column_transformer,
    TransformedTargetRegressor
)
from sklearn.ensemble import (
    VotingRegressor,
    HistGradientBoostingRegressor
)
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import (
    ElasticNet, LinearRegression,
    HuberRegressor
)
from sklearn.metrics import (
    mean_absolute_percentage_error,
)
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import (
    cross_val_predict, cross_val_score,
    LeavePGroupsOut, TimeSeriesSplit,
    GridSearchCV
)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import (
    MinMaxScaler, OneHotEncoder,
    KBinsDiscretizer,
    OrdinalEncoder,
    FunctionTransformer,
    PolynomialFeatures,
    SplineTransformer
)
from sklearn.utils.validation import (
    check_X_y, check_array, check_is_fitted
)

In [None]:
# Silence FutureWarning messages for the rest of the notebook.
warnings.filterwarnings("ignore", category=FutureWarning)

# Allow pandas to display up to 500 rows / columns before truncating.
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)

sns.set_style("ticks")

# Input files are resolved relative to the grandparent of the current
# working directory (``Path.cwd().parents[1]``).
INPUT_PATH = Path.cwd().parents[1] / 'kaggle/input/playground-series-s5e1'
TRAIN_PATH = INPUT_PATH / "train.csv"
TEST_PATH = INPUT_PATH / "test.csv"
SUB_PATH = INPUT_PATH / "sample_submission.csv"

## Markdown
___
**Observations** 
* Target has `int` dtype
* Target contains `nan`s
* Target distribution is positively skewed
* Time-wise, *looks* like all test data occur after train data
* No missing values, each entry corresponds to "num_sold" of a given product at the given shop in the given country (5 x 3 x 6 = 90 combinations)
* Looks like evaluation metric does not account for missing values

**Assumptions**
* `nan` in target is equivalent to `0` (i.e., absence is due to lack of sales)
* no hierarchical data

**To Do**
* [X] **EDA**
    * [X] Confirm if test data contains same categories as train data
* [ ] **FE**
    * [X] Time feature preprocessing
    * [X] Encode Christmas and other festive seasons in Western countries
    * ~[ ] Country preprocessing: hemisphere, nordic or not?~
    * ~[ ] Bring basic country data~
    * [X] Decide how to model iso week # (OneHot or Ordinal)
    * [ ] Figure out how to apply rolling/lagging features
* [ ] **Modelling**
    * [X] Train without missing entries?
    * [X] Consider preprocessing target with `TransformedTargetRegressor`
    * ~[ ] Use `TimeSeriesSplit` or `LeavePGroupsOut` for cross val~
    * [X] Use a baseline model to compare
    * [X] Try an ensemble of linear models trained on different levels of grouping
    * [X] Try an outlier-resistant linear model
    * [X] Try bayesian models?
    * [X] Consider rounding predictions
    * [X] 90 linear models using one-hot encoded months, days(weekends?) and normalized years?

In [None]:
# Train and test frames are indexed by the parsed "date" column; the sample
# submission is loaded as-is and later filled with predictions.
X_data = pd.read_csv(TRAIN_PATH, index_col="date", parse_dates=True)
X_test = pd.read_csv(TEST_PATH, index_col="date", parse_dates=True)
y_test = pd.read_csv(SUB_PATH)

In [None]:
# Features: everything except the row id and the target.
X_train = X_data.drop(columns=["id", "num_sold"]).copy()
X_test.drop(columns="id", inplace=True)

# Target: missing sales are replaced by 1 (not 0) so that a log transform
# stays defined. Rows with an originally missing target are excluded from
# fitting further down via ``notnull_mask``.
y_train = X_data["num_sold"].copy()
y_train.fillna(1, inplace=True) ## fill with ones to allow log

## EDA
___
### dtype, nunique, notnulls

In [None]:
# One row per feature column: dtype, number of distinct values and number of
# non-null entries, ordered by decreasing cardinality.
info_df = (
    pd.DataFrame(
        [
            X_train.dtypes,
            X_train.nunique(),
            X_train.notnull().sum(axis=0)
        ],
        index=["dtype", "nunique", "not_null"]
    )
    .T
    .sort_values("nunique", ascending=False)
)
info_df

### Categories

In [None]:
# The three categorical keys that identify one sales series.
cat_cols = ["country", "store", "product"]
# Number of non-null targets per (country, store, product) combination,
# shown as a single wide row.
(
    X_data
    .groupby(cat_cols)["num_sold"]
    .count()
    .to_frame()
    .T
)

### Target

In [None]:
# Summary statistics of the target after missing values were filled with 1.
y_train.describe()

In [None]:
# Four views of the target distribution, left to right:
#   0: full range in bins of 100
#   1: small values (<= 100), one bin per integer
#   2: values above 100 in bins of 10
#   3: natural log of the strictly positive values
fig, axes = plt.subplots(1, 4, figsize=(16,3), tight_layout=True)
sns.histplot(
    y_train,
    binrange=(0,6000),
    binwidth=100,
    ax=axes[0]
);
sns.histplot(
    y_train[y_train <= 100],
    discrete=True,
    # binwidth=,
    ax=axes[1]
);
sns.histplot(
    y_train[y_train > 100],
    binrange=(100,5940),
    binwidth=10,
    ax=axes[2]
);
sns.histplot(
    np.log(y_train[y_train > 0]),
    # binrange=(100,5940),
    # binwidth=10,
    ax=axes[3]
);

In [None]:
def yearly_line_plot(data, color=None):
    """Overlay one ``num_sold`` line per calendar year on the current axes.

    Each yearly chunk is shifted back in time by the number of distinct dates
    seen in the preceding chunks, so every year is drawn over the same
    x-range as the first one. Years are coloured with successive shades of
    the "Blues" palette and labelled with the year number.

    ``color`` is accepted but not used; it lets the function be called
    through ``FacetGrid.map_dataframe``.
    """
    by_year = data.resample("Y")
    shades = sns.color_palette("Blues", n_colors=by_year.ngroups)

    offset_days = 0
    for position, (period_end, chunk) in enumerate(by_year):
        shifted_sales = chunk["num_sold"].shift(offset_days, freq="D")
        sns.lineplot(
            shifted_sales,
            color=shades[position],
            label=period_end.year
        )
        # Move the next year further back by this year's number of dates.
        offset_days = offset_days - chunk.index.nunique()


In [None]:
    # One figure per country: a grid of yearly-overlaid sales lines with
    # products as rows and stores as columns.
    for country in X_data["country"].unique():
        q_ = (
            f"(country =='{country}')"
            # "and (store == 'Stickers for Less')"
            # "and (product == 'Kaggle')"
        )
        
        g = sns.FacetGrid(
            X_data.query(q_),
            row="product",
            col="store",
            aspect=4,
            height=2
        );
        g.map_dataframe(yearly_line_plot)
        g.add_legend();
        g.figure.suptitle(country, y=1.025)

In [None]:
# Mean num_sold per year, one line per product.
(
    X_data
    .assign(year=X_data.index.year)
    .groupby(["product", "year"])["num_sold"]
    .mean()
    .unstack(level=0)
    .plot()
);

### Missing Target

In [None]:
# Number of rows with a missing target for each (country, store, product).
X_data[X_data["num_sold"].isna()].groupby(cat_cols).size()

In [None]:
def yearly_heatmap(data, **heatmap_kws):
    """Draw a year-by-day-of-year heatmap of ``num_sold``.

    Parameters
    ----------
    data : DataFrame
        Must have a datetime index and a ``num_sold`` column.
    **heatmap_kws
        Forwarded unchanged to ``sns.heatmap``.

    Notes
    -----
    Cells whose summed sales are exactly 0 are replaced by NaN so they are
    left blank in the heatmap. Summing a cell that only holds missing values
    also gives 0, so missing targets end up blank as well.
    """
    pre_heat_df = pd.DataFrame(
        {
            "dayofyear": data.index.dayofyear.to_numpy(),
            "year": data.index.year.to_numpy(),
            "num_sold": data["num_sold"].to_numpy(),
        }
    )
    heat_df = (
        pre_heat_df
        # The string name avoids the deprecated "callable as aggfunc" path
        # that ``np.sum`` triggers; the aggregation itself is the same.
        .pivot_table(
            columns="dayofyear", index="year", aggfunc="sum"
        )
        .replace(0.0, np.nan)
        # Drop the outer "num_sold" level so columns are plain day numbers.
        .droplevel(0, axis=1)
    )
    heat_df.index = heat_df.index.astype(int)
    heat_df.columns = heat_df.columns.astype(int)

    sns.heatmap(
        heat_df,
        **heatmap_kws
    )

In [None]:
# if RUN_SLOW_CELLS:
# For Canada and Kenya, show Holographic Goose sales as one heatmap per store
# (Discount Stickers excluded), with a colour scale shared within the figure.
for country in ["Canada", "Kenya"]:
    q_ = (
        f"(country =='{country}')"
        "and (store != 'Discount Stickers')"
        "and (product == 'Holographic Goose')"
    )
    
    g = sns.FacetGrid(
        X_data.query(q_),
        # col="country",
        row="store",
        aspect=8,
        height=2
    );
    g.map_dataframe(
        yearly_heatmap,
        # Fix the colour limits to the range of the selected subset so all
        # facets are comparable.
        vmin=X_data.query(q_)["num_sold"].min(),
        vmax=X_data.query(q_)["num_sold"].max(),
        cbar=False,
        lw=0.005,
        linecolor="k"
    )
    g.add_legend();
    g.figure.suptitle(country, y=1.025)
    # Single colour bar for the whole figure, taken from the first facet.
    g.figure.colorbar(
        g.figure.axes[0].collections[0],
        ax=g.figure.axes,
        orientation='vertical',
        aspect=100,
        fraction=0.025,
        pad=0.01
    );


In [None]:
# Evaluate the metric with the target compared against itself.
mean_absolute_percentage_error(y_train, y_train)

## FE
___

In [None]:
def transform_day(df):
    """Encode the weekday of every index entry as three boolean flags.

    Returns an array of shape ``(len(df), 3)`` whose columns are, in order:
    Monday-Thursday, Friday, weekend.
    """
    weekday = df.index.dayofweek
    flags = (
        weekday < 4,   # mon-thu
        weekday == 4,  # fri
        weekday > 4,   # weekend
    )
    return np.column_stack(flags)

# Wrap transform_day as a transformer that reports fixed output column names
# and returns a pandas DataFrame.
day_names_out = ["mon-thu", "fri", "weekend"]
day_tr = FunctionTransformer(
    transform_day,
    feature_names_out = lambda self, names_in: day_names_out 
)
day_tr.set_output(transform="pandas")

# Quick shape check on the training data.
day_tr.fit_transform(X_train).shape

In [None]:
# One holiday calendar per country present in the data.
country_holidays = {
    'Canada':    holidays.Canada(),
    'Finland':   holidays.Finland(),
    'Italy':     holidays.Italy(),
    'Kenya':     holidays.Kenya(),
    'Norway':    holidays.Norway(),
    'Singapore': holidays.Singapore(),
}

def transform_holiday(df):
    """Flag rows whose date is a public holiday in the row's country.

    Parameters
    ----------
    df : DataFrame
        Dates are read from the index; the country is read from the
        ``country`` column, which must be present.

    Returns
    -------
    ndarray of shape ``(len(df), 1)``
        ``True`` where the date is in the country's calendar. Countries
        without an entry in ``country_holidays`` always give ``False``.
    """
    # Many rows share the same (country, date) pair, so each pair is looked
    # up in the calendar only once. Iterating the index and the column
    # directly also avoids the per-row Series that ``iterrows`` builds.
    lookup_cache = {}
    is_holiday = []
    for date, ctry in zip(df.index, df['country']):
        key = (ctry, date)
        if key not in lookup_cache:
            ctry_calendar = country_holidays.get(ctry)
            # Compare against None explicitly: a calendar is a mapping and
            # may be falsy while it is still empty.
            lookup_cache[key] = (
                ctry_calendar is not None and date in ctry_calendar
            )
        is_holiday.append(lookup_cache[key])
    return np.array(is_holiday).reshape(-1, 1)

# Wrap transform_holiday as a transformer with a single "is_holiday" column
# returned as a pandas DataFrame.
holiday_tr = FunctionTransformer(
    transform_holiday,
    feature_names_out=lambda self, names_in: ["is_holiday"]
)
holiday_tr.set_output(transform="pandas")

# How many training rows are / are not holidays.
holiday_tr.fit_transform(X_train).value_counts()

In [None]:
def transform_day_of_year(df):
    """Return sine and cosine encodings of the day of year.

    The phase uses a fixed period of 365 days. Output has shape
    ``(len(df), 2)`` with the sine in column 0 and the cosine in column 1.
    """
    phase = 2 * np.pi * df.index.dayofyear / 365
    return np.stack([np.sin(phase), np.cos(phase)], axis=1)

# Wrap transform_day_of_year as a transformer with two named outputs.
day_of_year_tr = FunctionTransformer(
    transform_day_of_year, 
    feature_names_out=lambda self, names_in: ["doy_sin", "doy_cos"]
)
# Quick shape check on the training data.
day_of_year_tr.fit_transform(X_train).shape

In [None]:
def transform_week(df):
    """Return the ISO week number of each index entry as one column.

    Output has shape ``(len(df), 1)``.
    """
    iso_week = df.index.isocalendar().week
    stacked = np.array([iso_week])  # shape (1, n)
    return stacked.reshape(-1, 1)

# Wrap transform_week as a transformer with a single "week" column.
week_tr = FunctionTransformer(
    transform_week,
    feature_names_out = lambda self, names_in: ["week"],
)
week_tr.set_output(transform="pandas")

# Week pipeline: ISO week -> degree-2 polynomial terms without the constant
# column -> min-max scaling.
week_pl = make_pipeline(
    week_tr,
    PolynomialFeatures(
        degree=2, include_bias=False,
    ),
    MinMaxScaler()
)

# Quick shape check on the training data.
week_pl.fit_transform(X_train).shape

In [None]:
# def transform_festive_week(df):
#     return np.array(
#         [
#             df.index.isocalendar().week == 52,
#             df.index.isocalendar().week == 53,
#             df.index.isocalendar().week == 1,
#         ]
#     ).T

# festive_week_tr = FunctionTransformer(
#     transform_festive_week,
#     feature_names_out = lambda self, names_in: ["week52", "week53", "week1"],
# )
# festive_week_tr.set_output(transform="pandas")

# festive_week_tr.fit_transform(X_train).shape


In [None]:
# def transform_month(df):
#     return np.array(
#         [
#             np.sin(2 * np.pi * df.index.month / 12),
#             np.cos(2 * np.pi * df.index.month / 12)
#         ]
#     ).T

# month_tr = FunctionTransformer(
#     transform_month,
#     feature_names_out = lambda self, names_in: ["month_sin", "month_cos"],
# )
# month_tr.set_output(transform="pandas")

# month_tr.fit_transform(X_train).shape

In [None]:
def transform_month(df):
    """Return the calendar month (1-12) of each index entry as one column.

    Output has shape ``(len(df), 1)``.
    """
    months = df.index.month
    return np.asarray(months).reshape(-1, 1)

# Wrap transform_month as a transformer with a single "month" column.
month_tr = FunctionTransformer(
    transform_month,
    feature_names_out = lambda self, names_in: ["month"] 
)
month_tr.set_output(transform="pandas")

# Month pipeline: month number -> dense one-hot encoding with no category
# dropped.
month_pl = make_pipeline(
    month_tr,
    OneHotEncoder(drop=None, sparse_output=False)
)

# Quick shape check on the training data.
month_pl.fit_transform(X_train).shape

In [None]:
def transform_season(df):
    """Return the meteorological season of each index entry as one column.

    March-May is "spring", June-August "summer", September-November
    "autumn" and every other month "winter". Output is a string array of
    shape ``(len(df), 1)``.
    """
    # Month -> season lookup; months not listed here fall back to "winter".
    season_by_month = {
        **dict.fromkeys((3, 4, 5), "spring"),
        **dict.fromkeys((6, 7, 8), "summer"),
        **dict.fromkeys((9, 10, 11), "autumn"),
    }
    labels = [season_by_month.get(month, "winter") for month in df.index.month]
    return np.array(labels).reshape(-1, 1)

# Wrap transform_season as a transformer with a single "season" column.
season_tr = FunctionTransformer(
    transform_season,
    feature_names_out=lambda self, names_in: ["season"]
)
season_tr.set_output(transform="pandas")

# Season pipeline: season label -> dense one-hot encoding with no category
# dropped.
season_pl = make_pipeline(
    season_tr,
    OneHotEncoder(drop=None, sparse_output=False)
)

In [None]:
# Preview the one-hot encoded seasons for the training data.
season_pl.fit_transform(X_train)

In [None]:
# def transform_q(df):
#     return np.array(
#         [
#             df.index.quarter,
#         ]
#     ).T

# q_tr = FunctionTransformer(
#     transform_q,
#     feature_names_out = lambda self, names_in: ["quarter"] 
# )
# q_tr.set_output(transform="pandas")

# q_pl = make_pipeline(
#     q_tr,
#     OneHotEncoder(drop=None, sparse_output=False)
# )

# q_pl.fit_transform(X_train).shape

In [None]:
def transform_year(df, min_year=2010, max_year=2019):
    """Return the squared, range-scaled year of each index entry.

    The year is mapped linearly so that ``min_year`` becomes 0 and
    ``max_year`` becomes 1, then squared. Output has shape ``(len(df), 1)``.
    """
    year_span = max_year - min_year
    scaled_year = (df.index.year - min_year) / year_span
    return np.asarray(scaled_year ** 2).reshape(-1, 1)

# Wrap transform_year as a transformer with a single "year^2" column.
year_tr = FunctionTransformer(
    transform_year,
    feature_names_out = lambda self, names_in: ["year^2"],
)
year_tr.set_output(transform="pandas")

year_tr.fit_transform(X_train)
# Distribution of the scaled-year feature over the test period.
pd.DataFrame(year_tr.transform(X_test))["year^2"].value_counts()

In [None]:
def periodic_spline_transformer(period, n_splines=None, degree=3):
    """Build a ``SplineTransformer`` with periodic extrapolation.

    Parameters
    ----------
    period : number
        Length of one cycle; knots are spread evenly over ``[0, period]``.
    n_splines : int, optional
        Number of splines. Defaults to ``period``.
    degree : int, default=3
        Polynomial degree of the splines.
    """
    spline_count = period if n_splines is None else n_splines
    # periodic and include_bias is True
    knot_count = spline_count + 1
    knot_grid = np.linspace(0, period, knot_count).reshape(knot_count, 1)
    return SplineTransformer(
        degree=degree,
        n_knots=knot_count,
        knots=knot_grid,
        extrapolation="periodic",
        include_bias=True,
    )

In [None]:
# NOTE: this repeats the definition from the previous cell; the body is
# the same, and this later definition is the one in effect from here on.
def periodic_spline_transformer(period, n_splines=None, degree=3):
    """Return a periodic ``SplineTransformer`` with knots on ``[0, period]``.

    ``n_splines`` defaults to ``period``; the transformer is built with
    ``n_splines + 1`` evenly spaced knots, periodic extrapolation and a bias
    term.
    """
    if n_splines is None:
        n_splines = period
    n_knots = n_splines + 1  # periodic and include_bias is True
    return SplineTransformer(
        degree=degree,
        n_knots=n_knots,
        knots=np.linspace(0, period, n_knots).reshape(n_knots, 1),
        extrapolation="periodic",
        include_bias=True,
    )
def transform_splines(df):
    """Return periodic spline features for day of week and month.

    The day-of-week basis (period 7) comes first, followed by the month
    basis (period 12), stacked column-wise.
    """
    weekday_column = df.index.dayofweek.values.reshape(-1, 1)
    month_column = df.index.month.values.reshape(-1, 1)

    # The knots are given explicitly, so fitting inside the transform does
    # not learn anything from the data passed in.
    weekday_basis = periodic_spline_transformer(7).fit_transform(weekday_column)
    month_basis = periodic_spline_transformer(12).fit_transform(month_column)

    return np.hstack([weekday_basis, month_basis])

# Output names: 7 day-of-week spline columns followed by 12 month spline
# columns.
splines_names_out =  [f"dow_spline{x}" for x in range(7)]
splines_names_out +=  [f"month_spline{x}" for x in range(12)]

spline_tr = FunctionTransformer(
    transform_splines,
    feature_names_out = lambda self, names_in: splines_names_out,
)
spline_tr.set_output(transform="pandas")

# Preview the spline features for the training data.
spline_tr.fit_transform(X_train)

In [None]:
# Baseline: one global ElasticNet on one-hot encoded categories plus the
# calendar features. Every date-based transformer is handed ``cat_cols``
# only so that it receives a frame; the transformers defined above read the
# dates from that frame's index (the holiday one also reads "country").
baseline_model = make_pipeline(
    make_column_transformer(
        (OneHotEncoder(dtype=int), cat_cols),  # for grouping by
        (day_tr, cat_cols),
        (holiday_tr, cat_cols),
        (day_of_year_tr, cat_cols),
        (week_pl, cat_cols),
        # (festive_week_tr, cat_cols),
        # (month_pl, cat_cols),
        # (q_pl, cat_cols),
        (year_tr, cat_cols),
        (spline_tr, cat_cols)
    ),
    ElasticNet(alpha=.1, l1_ratio=0.75)
    # HistGradientBoostingRegressor()
)
# baseline_model.fit(X_train, y_train)

In [None]:
class ResidualEncoder(BaseEstimator, TransformerMixin):
    """Add a per-group, per-day-of-year residual quantile feature.

    A copy of ``model`` is fitted on the training data; its residuals are
    averaged for every combination of ``cols`` and day of year, and each
    group's averages are cut into 11 quantile bins labelled from -1 to 1.
    ``transform`` appends the bin label as column ``resid_qbin``.

    Parameters
    ----------
    model : estimator
        Regressor with ``fit`` / ``predict``. It is never fitted in place; a
        deep copy is fitted and stored as ``model_``.
    cols : list of str
        Columns of ``X`` that identify a group.

    Attributes
    ----------
    model_ : estimator
        Fitted copy of ``model``.
    mean_ : DataFrame
        Mean residual (``resid_mean``) indexed by ``cols`` + ``doy``.
    qcut_ : DataFrame
        Quantile-bin label (``resid_qbin``) with the same index as ``mean_``.
    """
    def __init__(self, model, cols):
        # Constructor arguments are stored untouched: scikit-learn's
        # ``clone`` / ``get_params`` require that ``__init__`` neither copies
        # nor modifies them. The copy is made in ``fit`` instead.
        self.model = model
        self.cols = cols

    def fit(self, X, y):
        """Fit the inner model and learn the residual bins.

        ``X`` must have a datetime index and contain every column in
        ``cols``; ``y`` must have one value per row of ``X``.
        """
        def q_group(s):
            # 11 quantile bins labelled -1.0, -0.8, ..., 1.0.
            qcut_ = pd.qcut(s, 11, labels=np.arange(-1,1.2, 0.2))
            qcut_.name = "qcut"
            return qcut_.astype(float)

        group_cols = list(self.cols)
        X_ = X.copy()
        self.model_ = deepcopy(self.model)
        self.model_.fit(X_, y)
        # Subtract by position: the date index is shared by many rows, so
        # label-based alignment between ``y`` and ``X_`` is not reliable.
        X_["residual"] = np.asarray(y) - self.model_.predict(X)
        X_["doy"] = X_.index.dayofyear
        self.mean_ = (
            X_
            .groupby(group_cols + ["doy"])["residual"]
            .mean()
            .to_frame("resid_mean")
        )
        self.qcut_ = (
            self.mean_
            .groupby(group_cols)["resid_mean"]
            .transform(q_group)
            .to_frame("resid_qbin")
        )
        return self

    def transform(self, X):
        """Return ``X`` with the learned ``resid_qbin`` column appended.

        Rows whose (group, day-of-year) pair was not seen in ``fit`` get 0.
        Note that ``fillna(0)`` is applied to the whole frame, so any other
        missing value in ``X`` is replaced by 0 as well. The original index
        is restored after the merge.
        """
        check_is_fitted(self, "qcut_")
        X_ = X.copy()
        idx_ = X_.index
        X_["doy"] = X_.index.dayofyear
        # Left merge keeps the row order of ``X_`` but drops its index.
        X_ = pd.merge(
            X_, self.qcut_,
            on=list(self.cols) + ["doy"],
            how="left"
        )
        return (
            X_
            .drop(columns="doy")
            .fillna(0)
            .set_index(idx_)
        )

In [None]:
# Main preprocessing: the categorical keys are ordinal-encoded (kept as
# single columns so they can be used for grouping later), followed by the
# calendar features. Output is a pandas DataFrame.
pre_proc = make_column_transformer(
    (OrdinalEncoder(dtype=int), cat_cols),  # for grouping by
    # (day_tr, cat_cols),
    (holiday_tr, cat_cols),
    (day_of_year_tr, cat_cols),
    (week_pl, cat_cols),
    # (festive_week_tr, cat_cols),
    # (month_pl, cat_cols),
    # (q_pl, cat_cols),
    (season_pl, cat_cols),
    (year_tr, cat_cols),
    (spline_tr, cat_cols),
)

pre_proc.set_output(transform="pandas")
# X_test_pp.sample(5).T

In [None]:
# Fit the preprocessing on the training features only, then apply it to both
# sets.
X_train_pp = pre_proc.fit_transform(X_train)
X_test_pp = pre_proc.transform(X_test)

## Modelling
___

In [None]:
class GroupRegression(BaseEstimator, RegressorMixin):
    """
    A scikit-learn style estimator that fits separate regression models 
    for each unique combination of categorical columns (groupby_cols).
    
    Parameters
    ----------
    groupby_cols : list
        Column names in X to use for grouping. A separate model will be
        fit for each unique combination of these columns.

    base_estimator : estimator, default=None
        If None, uses ElasticNet(**base_estimator_kws) as the default model.
        Otherwise, use your own regressor. A passed estimator is configured
        in place with ``base_estimator_kws`` and stored as-is; each group is
        fitted on a deep copy of it.

    n_jobs : int, default=-1
        Number of CPU cores for parallel fitting (-1 uses all of them).

    **base_estimator_kws : dict
        Keyword args passed to the default ElasticNet if base_estimator is
        None, or applied to ``base_estimator`` through ``set_params``
        otherwise.

    Attributes
    ----------
    estimators_ : dict
        Fitted estimator for every group key seen in ``fit``.
    n_features_in_ : int
        Number of columns of X excluding the grouping columns.
    """
    def __init__(
        self,
        groupby_cols,
        base_estimator=None,
        n_jobs=-1,
        **base_estimator_kws
    ):
        self.n_jobs = n_jobs
        self.groupby_cols = groupby_cols
        if base_estimator is None:
            self.base_estimator = ElasticNet(**base_estimator_kws)
        else:
            # Keep the very same object: scikit-learn's ``clone`` verifies
            # that constructor arguments are stored by identity.
            self.base_estimator = base_estimator
            self.base_estimator.set_params(**base_estimator_kws)
        self.base_estimator_kws = base_estimator_kws

    def _check_group_columns(self, X):
        """Raise KeyError if any grouping column is missing from X."""
        for col in self.groupby_cols:
            if col not in X.columns:
                raise KeyError(f"X does not contain the grouping column: {col}")

    @staticmethod
    def _fit_group(group_key, estimator, X_local, y_local):
        """Fit one estimator on one group's data; return ``(key, estimator)``."""
        estimator.fit(X_local, y_local)
        return group_key, estimator

    def fit(self, X, y):
        """Fit separate estimators for each group."""
        check_X_y(X, y, dtype=None)  # Allow non-numerical grouping columns
        X = pd.DataFrame(X).reset_index(drop=True)
        # ``np.asarray`` accepts a Series as well as a plain array or list.
        y = pd.Series(np.asarray(y), index=X.index)

        self._check_group_columns(X)

        self.n_features_in_ = X.shape[1] - len(self.groupby_cols)

        # Each task receives only its own estimator copy and its own slice of
        # the data, so neither ``self`` nor the full target is shipped to
        # every worker.
        results = Parallel(n_jobs=self.n_jobs)(
            delayed(self._fit_group)(
                group_key,
                deepcopy(self.base_estimator),
                df.drop(columns=self.groupby_cols),
                y.loc[df.index],
            )
            for group_key, df in X.groupby(self.groupby_cols)
        )

        self.estimators_ = dict(results)

        return self

    def _predict_group(self, group_key, df):
        """Predict one group, falling back to a constant for unseen groups."""
        est = self.estimators_.get(group_key)
        if est is not None:
            return est.predict(df.drop(columns=self.groupby_cols))

        # Return defaults for Hologoose in Kenya and Canada.
        # NOTE(review): these checks compare against country *names*. When
        # the grouping columns hold encoded integers (as with the
        # ordinal-encoded columns used in this notebook) they never match
        # and the fallback is 1.0 -- confirm this is intended.
        key_parts = group_key if isinstance(group_key, tuple) else (group_key,)
        default = np.ones(len(df), dtype=float)
        if "Canada" in key_parts:
            default *= 200
        if "Kenya" in key_parts:
            default *= 5
        return default

    def predict(self, X):
        """Predict using the group-specific model if available.

        Groups that were not seen in ``fit`` get a constant fallback
        prediction (see ``_predict_group``).
        """
        check_is_fitted(self, 'estimators_')
        X = pd.DataFrame(X).reset_index(drop=True)

        self._check_group_columns(X)

        y_pred = np.zeros(X.shape[0], dtype=float)

        # After ``reset_index`` the index labels are row positions, so they
        # can be used directly to write into ``y_pred``.
        for group_key, df in X.groupby(self.groupby_cols):
            y_pred[df.index] = self._predict_group(group_key, df)

        return y_pred

    def get_params(self, deep=True):
        """Return parameters, including nested base_estimator params."""
        params = super().get_params(deep=False)
        if deep and hasattr(self.base_estimator, 'get_params'):
            for k, v in self.base_estimator.get_params(deep=True).items():
                params[f'base_estimator__{k}'] = v
        return params

    def set_params(self, **params):
        """Set parameters, parsing out base_estimator__ params."""
        base_estimator_params = {}
        for key, val in list(params.items()):
            if key.startswith('base_estimator__'):
                base_estimator_params[key[len('base_estimator__'):]] = val
                del params[key]
        # Own parameters first, so a replaced ``base_estimator`` receives the
        # nested values below.
        super().set_params(**params)
        if base_estimator_params and hasattr(self.base_estimator, 'set_params'):
            self.base_estimator.set_params(**base_estimator_params)
        return self


In [None]:
from sklearn.linear_model import BayesianRidge
from sklearn.metrics import make_scorer
from skopt import BayesSearchCV
from skopt.space import Real, Integer

In [None]:
 # Names of the ordinal-encoded key columns as they come out of ``pre_proc``.
 groupby_cols_pp = [
    "ordinalencoder__country",
    "ordinalencoder__store",
    "ordinalencoder__product"
]

# groupby_cols = [
#     "functiontransformer__ordinalencoder__country",
#     "functiontransformer__ordinalencoder__store",
#     "functiontransformer__ordinalencoder__product"
# ]

# kernel_ = Nystroem(
#     kernel="rbf", gamma=1.0, n_components=100,
#     random_state=17, n_jobs=-1
# )
# kernel_.set_output(transform="pandas")

# post_proc = make_column_transformer(
#     (FunctionTransformer(feature_names_out="one-to-one"), groupby_cols_pp),
#     remainder = kernel_
# )
# post_proc.set_output(transform="pandas")
# main_pipeline = make_pipeline(
#     post_proc,
#     GroupRegression(
#         groupby_cols, n_jobs=-1,
#         alpha=.1, l1_ratio=0.9
#     )
# )

# main_pipeline.fit(X_train_pp, y_train)

In [None]:
# One ElasticNet per (country, store, product) series; ``alpha`` and
# ``l1_ratio`` are forwarded to the default ElasticNet.
gr_lin = GroupRegression(
    groupby_cols_pp, n_jobs=-1,
    alpha=.1, l1_ratio=0.5
)

# gr = GroupRegression(
#     groupby_cols_pp, n_jobs=-1,
#     base_estimator=HistGradientBoostingRegressor(),
#     l2_regularization=0.01,
#     learning_rate=0.05,
#     max_bins=64,
#     max_depth=2,
#     max_iter=400,
#     min_samples_leaf=2,
#     random_state=17
# )

In [None]:
# True for rows whose target was present in the raw data (before fillna).
notnull_mask = X_data["num_sold"].notnull()

In [None]:
# Residual encoder built on the per-group linear model.
# NOTE: the name ``re`` is also the name of the standard-library regex module.
re = ResidualEncoder(gr_lin, groupby_cols_pp)

In [None]:
# Fit on rows with an observed target only.
re.fit(
    X_train_pp.loc[notnull_mask],
    y_train.loc[notnull_mask]
)

In [None]:
# Append the residual-bin feature to both sets (overwrites the frames).
# NOTE(review): the encoder was fitted on the whole training period, so any
# later time-based validation on X_train_pp uses a feature derived from
# targets inside the validation window -- confirm this leakage is acceptable.
X_train_pp = re.transform(X_train_pp)
X_test_pp = re.transform(X_test_pp)

### Validation 1

In [None]:
# gr = GroupRegression(
#     groupby_cols_pp, n_jobs=-1,
#     base_estimator=HistGradientBoostingRegressor(),
#     l2_regularization=0.01,
#     learning_rate=0.05,
#     max_bins=64,
#     max_depth=2,
#     max_iter=400,
#     min_samples_leaf=2,
#     random_state=17
# )

In [None]:
# "2010":"2014"
# gr.fit(
#     X_train_pp.loc[notnull_mask].loc["2010":"2014"],
#     y_train.loc[notnull_mask].loc["2010":"2014"]
# )

In [None]:
# y_pred = gr.predict(X_train_pp[notnull_mask].loc["2015":])
# y_true = y_train[notnull_mask].loc["2015":]
# mean_absolute_percentage_error(y_true, y_pred)

### Validation 2

In [None]:
def custom_time_splits(
    X, notnull_mask=None, train_on_null=True,
    n_splits=5, train_months=36, val_months=6,
    test_months=18, shift_months=6
):
    """
    Generate rolling time-based (train, test) splits.

    With the defaults there are 5 splits, each laid out as:
      - Train = 3 years
      - Gap   = 0.5 year (the former validation window, not returned)
      - Test  = 1.5 years
    Each split is shifted by 0.5 year from the previous split.

    Parameters
    ----------
    X : DataFrame
        Must have a datetime index.
    notnull_mask : array-like of bool, optional
        One flag per row of X; rows flagged False are always excluded from
        the test indices. Defaults to all True.
    train_on_null : bool, default=True
        If False, rows flagged False are excluded from the train indices too.
    n_splits, train_months, val_months, test_months, shift_months : int
        Number of splits and window lengths in months.

    Yields
    ------
    (train_idx, test_idx) : tuple of integer position arrays
        Two-element tuples, the format scikit-learn expects for ``cv=``.

    Raises
    ------
    ValueError
        If ``notnull_mask`` does not have one entry per row of X.
    """
    dates = X.index

    # The earliest date in the dataset
    min_date = dates.min()

    # Work on a plain boolean array so the mask combines by position,
    # whatever index it carries.
    if notnull_mask is None:
        keep = np.ones(len(X), dtype=bool)
    else:
        keep = np.asarray(notnull_mask, dtype=bool)
        if keep.shape != (len(X),):
            raise ValueError(
                "notnull_mask must have exactly one entry per row of X"
            )

    for i in range(n_splits):
        # Compute start/end boundaries for each window
        train_start = min_date + pd.DateOffset(months=shift_months * i)
        train_end   = train_start + pd.DateOffset(months=train_months)

        # The validation window only acts as a gap between train and test.
        test_start  = train_end + pd.DateOffset(months=val_months)
        test_end    = test_start + pd.DateOffset(months=test_months)

        # Create boolean masks (end boundaries are exclusive)
        train_mask = (dates >= train_start) & (dates < train_end)
        if not train_on_null:
            train_mask = train_mask & keep
        test_mask = (dates >= test_start) & (dates < test_end) & keep

        # Convert masks to integer indices
        yield (np.flatnonzero(train_mask), np.flatnonzero(test_mask))


In [None]:
# Best mean test score seen so far across calls to evaluate_time_splits.
BEST_SCORE_LOG = {"mean_test_score":np.inf}
def evaluate_time_splits(
    X, y, splitter, model_factory,
    metric=mean_absolute_percentage_error
):
    """
    Iterates over time-based splits, trains a model for each split,
    logs train/validation/test scores, and returns a results DataFrame.

    Parameters
    ----------
    X, y : DataFrame, Series
        Indexed positionally with the indices produced by ``splitter``.
    splitter : iterable
        Yields either ``(train_idx, val_idx, test_idx)`` or
        ``(train_idx, test_idx)``. For two-element splits the validation
        score is recorded as NaN.
    model_factory : callable
        Called with no arguments once per split; must return a fresh,
        unfitted model with ``fit`` and ``predict``.
    metric : callable, default=mean_absolute_percentage_error
        ``metric(y_true, y_pred)``; lower is treated as better.

    Returns
    -------
    DataFrame
        One row per split with columns ``split``, ``train_score``,
        ``val_score`` and ``test_score``.

    Side effects
    ------------
    Prints the mean scores and updates ``BEST_SCORE_LOG`` when the mean test
    score improves on the best one recorded so far.
    """
    cv_records = []

    for i, split in enumerate(splitter, start=1):
        if len(split) == 3:
            train_idx, val_idx, test_idx = split
        else:
            # Two-element splits carry no validation window.
            train_idx, test_idx = split
            val_idx = None

        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_test,  y_test  = X.iloc[test_idx],  y.iloc[test_idx]

        model = model_factory()
        model.fit(X_train, y_train)

        train_score = metric(y_train, model.predict(X_train))
        if val_idx is None:
            val_score = np.nan
        else:
            val_score = metric(y.iloc[val_idx], model.predict(X.iloc[val_idx]))
        test_score  = metric(y_test,  model.predict(X_test))

        cv_records.append({
            'split': i,
            'train_score': train_score,
            'val_score': val_score,
            'test_score': test_score
        })

    results_df = pd.DataFrame(cv_records)
    # Mean of every score column (the split counter is left out).
    report_df = results_df.drop(columns="split").mean().to_frame().T
    mean_test_score = report_df["test_score"].values[0]
    # The dict is updated in place, so no ``global`` statement is needed.
    if BEST_SCORE_LOG["mean_test_score"] > mean_test_score:
        BEST_SCORE_LOG["mean_test_score"] = mean_test_score
        print(report_df, "best yet!")
    else:
        print(report_df)
    return results_df


In [None]:
def format_cv_results(
    cv_dfs, param_product:"itertools.product"=None, param_names:list=None
):
    """
    Concatenates CV DataFrames (columns: split, train_score, val_score, test_score)
    into a wide format with one row per run, columns for each split's scores,
    plus mean/std columns for train/val/test.

    Parameters
    ----------
    cv_dfs : iterable of DataFrame
        One results frame per run.
    param_product : iterable of tuples, optional
        Parameter values of each run, in the same order as ``cv_dfs``.
        When omitted, no parameter columns are added.
    param_names : list of str, optional
        Column names for the values in each parameter tuple, position by
        position.

    Returns
    -------
    DataFrame with one row per run.
    """
    names = [] if param_names is None else list(param_names)
    if param_product is None:
        runs = ((df, ()) for df in cv_dfs)
    else:
        runs = zip(cv_dfs, param_product)

    records = []
    for i, (df, params) in enumerate(runs, start=1):
        row_dict = {'run': i}
        # Pair every name with the value at the same position (previously
        # each name received the first value).
        for param_name, param_value in zip(names, params):
            row_dict[param_name] = param_value

        per_split = zip(
            df['split'], df['train_score'], df['val_score'], df['test_score']
        )
        for split_no, train_sc, val_sc, test_sc in per_split:
            s = int(split_no)
            row_dict[f'train_split{s}'] = train_sc
            row_dict[f'val_split{s}']   = val_sc
            row_dict[f'test_split{s}']  = test_sc

        row_dict['train_mean'] = df['train_score'].mean()
        row_dict['train_std']  = df['train_score'].std()
        row_dict['val_mean']   = df['val_score'].mean()
        row_dict['val_std']    = df['val_score'].std()
        row_dict['test_mean']  = df['test_score'].mean()
        row_dict['test_std']   = df['test_score'].std()
        records.append(row_dict)

    return pd.DataFrame(records)

In [None]:
# splitter_ = custom_time_splits(
#     X_train_pp,
#     notnull_mask=notnull_mask,
#     train_on_null=False
# )
# results_df = evaluate_time_splits(
#     X_train, y_train, splitter_,
#     lambda: deepcopy(baseline_model)
# )

In [None]:
# splitter_ = custom_time_splits(
#     X_train_pp,
#     notnull_mask=notnull_mask,
#     train_on_null=False
# )
# results_df = evaluate_time_splits(
#     X_train_pp, y_train, splitter_,
#     lambda: deepcopy(gr)
# )

In [None]:
# Search space for the ElasticNet base estimator.
# NOTE: ``param_spaces`` is reassigned further down in this cell, so this
# definition is not the one left in effect.
param_spaces = {
    'base_estimator__alpha': Real(1e-2, 1e2, prior='log-uniform'),
    'base_estimator__l1_ratio': [.1, .5, .7, .9, .95, .99, 1],
}

# param_spaces = {
#     "base_estimator__learning_rate": Real(1e-2, 1e0, prior='log-uniform'),
#     "base_estimator__max_iter": Integer(1e1, 1e3, prior="log-uniform"),
#     "base_estimator__max_depth": Integer(2, 12, prior="uniform"),
#     "base_estimator__min_samples_leaf": Integer(2, 300, prior="uniform"),
#     "base_estimator__l2_regularization": Real(1e-3, 1e2, prior="log-uniform"),
#     "base_estimator__max_bins": Integer(8, 64, prior="log-uniform", base=2),
# }

# Narrowed search space for a gradient-boosting base estimator; this is the
# assignment that ``param_spaces`` holds after the cell has run.
param_spaces = {
    "base_estimator__learning_rate": Real(0.01, 0.06, prior='log-uniform'),
    "base_estimator__max_iter": Integer(200, 600, prior="log-uniform"),
    "base_estimator__max_depth": Integer(2, 4, prior="uniform"),
    "base_estimator__min_samples_leaf": Integer(2, 50, prior="uniform"),
    "base_estimator__l2_regularization": Real(1e-2, 1e1, prior="log-uniform"),
    "base_estimator__max_bins": Integer(32, 128, prior="log-uniform", base=2),
}


# param_spaces = {
#     # 'columntransformer__remainder__coef0': None,
#     # 'columntransformer__remainder__degree': None,
#     'columntransformer__remainder__gamma': Real(1e-2, 1e0, prior='log-uniform'),
#     # 'columntransformer__remainder__kernel': 'rbf',
#     'columntransformer__remainder__n_components': Integer(100, 200),
    
#     'groupregression__base_estimator__alpha': [0.01],
#     'groupregression__base_estimator__l1_ratio': [0.99],
# }

# MAPE as a scorer; ``greater_is_better=False`` marks it as a loss, so the
# scorer reports the negated value.
mape_scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)


In [None]:
# splitter_ = custom_time_splits(
#     X_train_pp,
#     notnull_mask=notnull_mask,
#     train_on_null=False
# )
# search = BayesSearchCV(
#     estimator=gr,
#     search_spaces=param_spaces,
#     scoring=mape_scorer,            # or another metric
#     n_iter=10,                      # number of parameter settings to sample
#     cv=splitter_,                   # or your custom time-based CV
#     n_jobs=-1,                      # parallel
#     random_state=425,
#     return_train_score=True
# )

# search.fit(X_train_pp, y_train)
# pd.DataFrame(search.cv_results_).sort_values("rank_test_score").head()

In [None]:
# Set the ElasticNet hyper-parameters on the per-group linear model.
gr_lin.set_params(**{"base_estimator__alpha": 0.01, "base_estimator__l1_ratio":0.9})

In [None]:
# Keyword-argument sets for GroupRegression. "groupby_cols",
# "base_estimator" and "n_jobs" are its own arguments; every other key is
# collected by ``**base_estimator_kws`` and applied to the base estimator.

# Gradient-boosting configuration 0.
params0 = {
    "groupby_cols": groupby_cols_pp,
    "base_estimator": HistGradientBoostingRegressor(),
    "l2_regularization": 0.001,
    "learning_rate": 0.040055,
    "max_bins": 64,
    "max_depth": 2,
    "max_iter": 363,
    "min_samples_leaf": 2,
    "n_jobs":-1,
    "random_state": 17
}

# Gradient-boosting configuration 1.
params1 = {
    "groupby_cols": groupby_cols_pp,
    "base_estimator": HistGradientBoostingRegressor(),
    'l2_regularization': 0.02361333850069575,
    'learning_rate': 0.017910156239942278,
    'max_bins': 52,
    'max_depth': 3,
    'max_iter': 260,
    'min_samples_leaf': 11,
    "n_jobs":-1,
    "random_state": 17
}

# Gradient-boosting configuration 2.
params2 = {
    "groupby_cols": groupby_cols_pp,
    "base_estimator": HistGradientBoostingRegressor(),
    'l2_regularization': 0.8420811471068569,
    'learning_rate': 0.039984339306610134,
    'max_bins': 104,
    'max_depth': 2,
    'max_iter': 237,
    'min_samples_leaf': 11,
    "n_jobs":-1,
    "random_state": 17
}

# Gradient-boosting configuration 3 (used for the submission below).
params3 = {
    "groupby_cols": groupby_cols_pp,
    "base_estimator": HistGradientBoostingRegressor(),
    'l2_regularization': 0.601718,
    'learning_rate': 0.017478,
    'max_bins': 113,
    'max_depth': 2,
    'max_iter': 570,
    'min_samples_leaf': 49,
    "n_jobs":-1,
    "random_state": 425
}

# Linear configuration: ``base_estimator=None`` selects the default
# ElasticNet, built with the given alpha and l1_ratio.
lin_params = {
    "groupby_cols": groupby_cols_pp,
    "base_estimator": None,
    'alpha': 0.01,
    'l1_ratio': 0.9,
}

In [None]:
# Time-based cross-validation of the per-group linear model; rows with a
# missing target are excluded from both training and scoring.
# NOTE(review): X_train_pp already contains the residual feature fitted on
# the full training period -- scores may be optimistic; confirm.
splitter_ = custom_time_splits(
    X_train_pp,
    notnull_mask=notnull_mask,
    train_on_null=False
)
# gr.set_params(**search.best_params_)
# Mean of the negated MAPE over the splits (closer to 0 is better).
cross_val_score(
    gr_lin, X_train_pp, y_train,
    scoring="neg_mean_absolute_percentage_error",
    cv=splitter_
).mean()

In [None]:
# vr = VotingRegressor(
#     [
#         ("config0", GroupRegression(**params0)),
#         ("config1", GroupRegression(**params1)),
#         ("config2", GroupRegression(**params2))
#     ]
# )


In [None]:
# alphas = [0.01, 0.05, 0.1, 0.2]
# l1_ratios = [0.8, 0.85]
# epsilons = [1, 1.35, 10, 25, 50, 100]
# max_iters = [100, 1000]
# tols = [1e-4, 1e-6]

# results_df_log = []
# for a, e, mi in product(alphas, epsilons, max_iters):
#     print(a, e, mi)

#     splitter_ = custom_time_splits(
#         X_train,
#         notnull_mask=notnull_mask,
#         train_on_null=False
#     )

#     gr = GroupRegression(
#         groupby_cols,
#         n_jobs=-1,
#         base_estimator=HuberRegressor(epsilon=e, max_iter=mi, alpha=a)
#         # alpha=alpha,
#         # l1_ratio=l1,
#         # max_iter=max_iter,
#         # tol=tol
#     )

#     results_df = evaluate_time_splits(
#         X_train_pp, y_train, splitter_,
#         lambda: deepcopy(gr)
#     )
#     results_df_log.append(results_df)
# format_cv_results(results_df_log).sort_values("test_mean")

## Diagnostics
___

In [None]:
# Refit the per-group linear model on all rows with an observed target.
gr_lin.set_params(**{"base_estimator__alpha": 0.01, "base_estimator__l1_ratio":0.9})
gr_lin.fit(X_train_pp.loc[notnull_mask], y_train.loc[notnull_mask])

In [None]:
# Heatmap of the fitted coefficients: one row per group model, one column
# per feature. ``columns[3:]`` skips the first three columns of X_train_pp,
# which are the grouping columns dropped before fitting.
fig, ax =plt.subplots(figsize=(20,14))
df = pd.DataFrame(
    [e.coef_ for e in gr_lin.estimators_.values()],
    index=gr_lin.estimators_.keys(),
    columns=X_train_pp.columns[3:]
)
# Only coefficients with absolute value above 5 are shown; the rest are
# masked to NaN.
sns.heatmap(df[np.abs(df)>5], ax=ax, cmap="Spectral_r", center=0, annot=True, fmt=".0f");

In [None]:
# maep_ = abs(y_true - y_pred) / y_true
# maep_.hist();

## Submission
___

In [None]:
# Fit one gradient-boosting and one linear per-group model on all rows with
# an observed target, and collect their test predictions.
y_preds = []
for params in [params3, lin_params]:
    gr_ = GroupRegression(**params)
    gr_.fit(
        X_train_pp.loc[notnull_mask],
        y_train.loc[notnull_mask]
    )
    y_preds.append(gr_.predict(X_test_pp))

In [None]:
# gr_lin.fit(X_train_pp.loc[notnull_mask], y_train.loc[notnull_mask])
y_test["num_sold"] = np.array(y_preds).T.mean(axis=1)  #gr_lin.predict(X_test_pp)  # np.array(y_preds).T.mean(axis=1)
y_test.to_csv('submission.csv', index=False)