# Let's go!
## Imports and Set Up
___

In [None]:
import warnings
from pathlib import Path
from copy import deepcopy

import numpy as np
import pandas as pd
from scipy.stats import norm, uniform, gennorm
import matplotlib.pyplot as plt
import seaborn as sns
from plotly.express import parallel_coordinates

from time import time
import pprint
import joblib
from functools import partial
#

from sklearn.base import (
    BaseEstimator, TransformerMixin, RegressorMixin
)
from sklearn.compose import (
    make_column_transformer, make_column_selector,
    TransformedTargetRegressor
)
from sklearn.ensemble import (
    HistGradientBoostingRegressor,
    VotingRegressor,
    StackingRegressor
)
from sklearn.impute import (
    SimpleImputer
)
from sklearn.linear_model import (
    ElasticNet, LinearRegression
)
from sklearn.metrics import (
    PredictionErrorDisplay,
    mean_squared_error,
    get_scorer_names,
    make_scorer
)
from sklearn.model_selection import (
    cross_val_predict, cross_val_score,
    LeavePGroupsOut, ShuffleSplit, KFold,
    GridSearchCV
)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import (
    MinMaxScaler, OneHotEncoder,
    OrdinalEncoder,
    FunctionTransformer,
    PolynomialFeatures
)
from sklearn.utils.validation import (
    check_X_y, check_array, check_is_fitted
)
from skopt import BayesSearchCV
from skopt.callbacks import DeadlineStopper, DeltaYStopper
from skopt.space import Real, Integer


In [None]:
# Silence FutureWarning messages for the rest of the notebook.
warnings.filterwarnings("ignore", category=FutureWarning)

# Let pandas render up to 500 rows / columns before truncating the display.
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)

sns.set_style("ticks")

# Input files are resolved relative to the grandparent of the current working
# directory, so these paths depend on where the kernel was started.
INPUT_PATH = Path.cwd().parents[1] / 'kaggle/input/playground-series-s5e2'
TRAIN_PATH_CORE = INPUT_PATH / "train.csv"
TRAIN_PATH_EXTRA = INPUT_PATH / "training_extra.csv"
TEST_PATH = INPUT_PATH / "test.csv"
SUB_PATH = INPUT_PATH / "sample_submission.csv"

In [None]:
# Load the raw CSV files. Despite its name, `y_test` is the sample-submission
# frame; its "Price" column is overwritten with predictions in the Submission
# section before being written to disk.
X_data_core = pd.read_csv(TRAIN_PATH_CORE)
X_data_extra = pd.read_csv(TRAIN_PATH_EXTRA)
X_test = pd.read_csv(TEST_PATH)
y_test = pd.read_csv(SUB_PATH)

In [None]:
# Stack core and extra training data (core rows first) under a fresh 0..n-1 index.
X_data = pd.concat(
    [
        X_data_core.copy(),
        X_data_extra.copy()
    ],
    ignore_index = True
)

In [None]:
# Feature frames: drop the identifier and the target from each source, then
# stack them in the same core-then-extra order used for X_data.
X_train_core = X_data_core.drop(columns=["id", "Price"]).copy()
X_train_extra = X_data_extra.drop(columns=["id", "Price"]).copy()
X_train = pd.concat([X_train_core, X_train_extra], ignore_index = True)

In [None]:
# Targets. `y_train` is taken from the stacked X_data, so it shares the
# core-then-extra row order (and 0..n-1 index) of X_train.
y_train_core = X_data_core["Price"].copy()
y_train_extra = X_data_extra["Price"].copy()
y_train = X_data["Price"].copy()

In [None]:
# Drop the identifier column by reassignment instead of in place, and tolerate
# its absence, so that re-running this cell does not raise a KeyError once
# "id" has already been removed.
X_test = X_test.drop(columns="id", errors="ignore")

## Markdown
___
**Observations** 
* Price capped at 150.
* The distribution of `HistGradientBoostingRegressor` predictions is far from the target distribution.

**Assumptions**
* [ ] assume

**To Do**
* [ ] **EDA**
    * [ ] adversarial validation
    * [X] check if missing data correlates with target
    * [ ] check if groups with 0 std have more than 1 entry
* [ ] **FE**
    * [X] compartments per size?
    * [ ] target encoding
    * [ ] residual binning
    * [ ] meta features
    * [ ] feature importance with SHAP
* [ ] **Modelling**
    * [ ] remove entries with target capped at 150?
    * [ ] Use a clustering technique to arrive at representative samples?
    * [ ] cap predictions at 150
    * [ ] stratified cv?
    * [ ] cv groups?
    * [ ] nested cv?
    * [ ] submit vote-predictions with final models from cross validation
    * [ ] **try CatBoost**
    * [ ] **try XGBoost**
    * [ ] **try LightGBM**
    * [ ] Stacking
    * [ ] Target transformation ("calibration")
    * [ ] Post process predicted test target with group's meta means (if such groups have 0 std)

## EDA
___
### dtype, nunique, notnulls

In [None]:
# Per-column overview of X_train: dtype, number of distinct values and
# non-null count, one row per column, sorted by cardinality (highest first).
info_df = (
    pd.DataFrame(
        [
            X_train.dtypes,
            X_train.nunique(),
            X_train.notnull().sum(axis=0)
        ],
        index=["dtype", "nunique", "not_null"]
    )
    .T
    .sort_values("nunique", ascending=False)
)
info_df

In [None]:
# Distribution of weight capacity in the core training data, one panel per brand.
sns.displot(X_data_core, x="Weight Capacity (kg)", col="Brand");

In [None]:
# Compartment counts in the core training data, one panel per brand,
# with the variable treated as discrete.
sns.displot(X_data_core, x="Compartments", col="Brand", discrete=True);

In [None]:
# Frequency of each Size level, with missing values counted as their own entry.
X_data_core["Size"].value_counts(dropna=False)

In [None]:
# Ordinal-encode the non-missing Size values in the listed order
# (X-Small < Small < Medium < Large) and count how often each code occurs.
pd.Series(
    OrdinalEncoder(
        categories=[["X-Small", "Small", "Medium", "Large"]]
                  )
    .fit_transform(X_data_core[["Size"]].dropna()).ravel()
).value_counts()

In [None]:
# 0/1 missingness indicators for the core-train columns that contain any NaN.
missing_X = X_train_core.loc[:, X_train_core.isna().any()].isna().astype(int)
# Attach the matching core target *after* the integer cast: casting the joined
# frame would truncate Price to whole numbers before correlations are computed.
missing_X = missing_X.join(y_train_core)
# Correlation between missingness indicators and the target.
sns.heatmap(
    missing_X.corr(),
    vmin=-1, vmax=1, cmap="Spectral_r",
    annot=True, fmt=".2f", annot_kws = {"fontsize":"x-small"}
);

### Target

In [None]:
# True if the target contains at least one missing value.
y_train.isnull().any()

In [None]:
# Summary statistics of the target.
y_train.describe()

In [None]:
# Price distribution over core + extra training data, one panel per brand.
sns.displot(X_data, x="Price", col="Brand")

In [None]:
 # Row counts of weight capacity split into 4 equal-width bins.
 # NOTE(review): this line carries a stray leading space; it would be an
 # IndentationError if the cell were exported to a plain script — consider removing.
 pd.cut(X_data["Weight Capacity (kg)"], 4).value_counts()

In [None]:
# GroupBy over all categorical-like attributes plus Compartments.
# dropna=False keeps rows whose grouping keys contain NaN as their own groups.
# Despite the `_df` suffix this is a GroupBy object, not a DataFrame.
grouped_df = (
    X_data
    # .assign(wc_bins=lambda df: pd.cut(df["Weight Capacity (kg)"], 4))
    .groupby(
        [
            "Brand", "Material", "Style", "Color",
            "Size", "Waterproof",
            "Laptop Compartment", "Compartments",
            # "wc_bins"
        ], 
        dropna=False
    )
)

In [None]:
# Groups whose mean Price is at least 150.
grouped_df["Price"].agg(["mean"]).query("mean>=150")

In [None]:
# Per-group Price statistics. Aggregate only the Price column (instead of
# aggregating every column and then selecting Price) and derive the
# null-key flags from a single reset_index() frame rather than recomputing it.
price_stats = grouped_df["Price"].agg(["mean", "std", "count"])
# Null mask over the grouping keys (plus the mean column) for each group.
group_nulls = price_stats["mean"].reset_index().isnull()

stats_df = (
    price_stats
        # std is NaN for single-row groups; those become 0 here.
        .fillna(0)
        .assign(
            null_any = group_nulls.any(axis=1).values,
            null_count = group_nulls.sum(axis=1).values,
            big_group = lambda df: df["count"] >= 38,
        )
)
stats_df.columns.name="stat"

In [None]:
# Smallest group size among groups with no null grouping keys.
stats_df.query("null_count==0")["count"].min()

In [None]:
# Largest group size among groups whose (NaN-filled) Price std equals 0.
stats_df.query("std==0")["count"].max()

In [None]:
# Total number of rows that belong to groups flagged as big_group.
stats_df.query("big_group")["count"].sum()

In [None]:
# Share of rows that fall into big groups, computed from stats_df so it stays
# correct when the data or the big_group threshold changes.
# NOTE(review): presumably the original literals (3288503/3994318) were the
# big-group row count and the total row count copied from cell outputs — confirm.
stats_df.query("big_group")["count"].sum() / stats_df["count"].sum()

In [None]:
# Histogram of every numeric statistic in stats_df (melted to long form, one
# panel per statistic), stacked by the number of null grouping keys.
sns.displot(
    stats_df.melt(
        var_name="stat", id_vars=["null_any", "null_count", "big_group"]
    ), x="value", col="stat", col_wrap=3, hue="null_count", multiple="stack",
    palette="Spectral_r",
    height=5, ec="k"
)


In [None]:
# greenish_palette = sns.color_palette(["#2A4A06", "#75A84C"])  # Dark green and muted emerald
# Note: greenish_palette is defined here but the plot below uses "Spectral_r".
greenish_palette = sns.blend_palette(["#2A4A06", "#75A84C"], as_cmap=True)

# Group mean vs. group std of Price for a reproducible sample of 10,000 groups.
# Sorting by null_count (descending) controls the order in which rows are passed to the plot.
sns.relplot(
    (
        stats_df
        .sample(10_000, random_state=17)
        .sort_values("null_count", ascending=False)
    ),
    x="mean", y="std", 
    hue="null_count",
    alpha=.5,
    style="big_group",
    palette="Spectral_r",
    ec="k"
)

In [None]:
# RMSE obtained when each row's Price is replaced by the mean Price of its
# group, with the means computed on these same rows.
mean_squared_error(
    y_train, grouped_df["Price"].transform("mean")
) ** .5

## FE
---

In [None]:
# Columns fed to the one-hot encoder and to the numeric/ratio transformers.
# "Size" is absent from both lists; it is ordinal-encoded separately in the
# column transformer.
cat_cols = ["Brand", "Material", "Laptop Compartment", "Waterproof", "Style", "Color"]
num_cols = ["Weight Capacity (kg)", "Compartments"]

In [None]:
def ratio_(x):
    """Return the first column divided by the second as a single-column 2-D result.

    DataFrames yield a one-column DataFrame; any other input is indexed like a
    2-D array and yields an array of shape (n_rows, 1).
    """
    if not isinstance(x, pd.DataFrame):
        numerator, denominator = x[:, 0], x[:, 1]
        return (numerator / denominator).reshape(-1, 1)
    quotient = x.iloc[:, 0] / x.iloc[:, 1]
    return quotient.to_frame()

def nulls_(x):
    """Return two per-row columns: a 0/1 "any null" flag and the null count.

    Non-DataFrame input is wrapped in a DataFrame first. The two output
    columns are unnamed, so they end up labelled 0 and 1.
    """
    frame = x if isinstance(x, pd.DataFrame) else pd.DataFrame(x)
    null_mask = frame.isnull()
    has_null = null_mask.any(axis=1).astype(int)
    null_total = null_mask.sum(axis=1)
    # Clear the names so the concatenated columns get positional labels.
    for column in (has_null, null_total):
        column.name = None
    return pd.concat([has_null, null_total], axis=1)

def metas_(x, grouper = None, meta_cols=None):
    """Look up the per-group Price mean and std for every row of `x`.

    Parameters
    ----------
    x : DataFrame or array-like
        Rows to annotate; must contain the columns listed in `meta_cols`.
    grouper : pandas GroupBy
        GroupBy object keyed on `meta_cols` and exposing a "Price" column.
        Its statistics are the ones joined onto `x`.
    meta_cols : list of str, optional
        Key columns used for the lookup. Defaults to the eight attribute
        columns listed below.

    Returns
    -------
    DataFrame with columns "mean" and "std", one row per row of `x`, with a
    fresh positional index. Rows whose key combination is not present in
    `grouper` get NaN.

    Raises
    ------
    ValueError
        If `grouper` is not supplied.
    """
    if not isinstance(x, pd.DataFrame):
        x = pd.DataFrame(x)
    if grouper is None:
        raise ValueError("GroupBy object was not passed.")
    if meta_cols is None:
        meta_cols = [
            "Brand", "Material", "Style", "Color",
            "Size", "Waterproof", "Laptop Compartment",
            "Compartments", 
        ]
    # Use the GroupBy that was passed in; the previous version validated
    # `grouper` but then silently read the notebook-global `grouped_df`.
    group_stats = grouper["Price"].agg(["mean", "std"])
    return (
        x
        .set_index(meta_cols)
        .join(group_stats)[["mean", "std"]]
        .reset_index(drop=True)
    )

In [None]:
# Wrap the helper functions as transformers with fixed output feature names.
ratio_ft = FunctionTransformer(
    ratio_, feature_names_out=lambda self, names_in: ["ratio"]
)

nulls_ft = FunctionTransformer(
    nulls_, feature_names_out=lambda self, names_in: ["nulls_any", "nulls_sum"]
)

# Binds the GroupBy built during EDA; it is not wired into the column
# transformer below (the corresponding entry there is commented out).
metas_ft = FunctionTransformer(
    metas_, kw_args={"grouper":grouped_df},
    feature_names_out=lambda self, names_in: ["meta_means", "meta_stds"]
)

In [None]:
# Column-wise preprocessing:
#   * Size      -> ordinal codes in the listed order, NaN listed as the last category
#   * cat_cols  -> one-hot columns, with the NaN category dropped for each column
#   * num_cols  -> passed through unchanged by an identity transformer
#   * num_cols  -> ratio of the first listed column to the second (ratio_ft)
#   * all X_train columns -> per-row "any null" flag and null count (nulls_ft)
# NOTE(review): Compartments is already listed in num_cols, so the remainder
# comment below looks stale — confirm whether any column actually reaches it.
pre_proc_t = make_column_transformer(
    (OrdinalEncoder(categories=[["X-Small", "Small", "Medium", "Large", np.nan]],), ["Size"]),
    (OneHotEncoder(drop=[np.nan]*len(cat_cols), sparse_output=False), cat_cols),
    (FunctionTransformer(lambda x:x, feature_names_out="one-to-one"), num_cols),
    (ratio_ft, num_cols),
    (nulls_ft, X_train.columns),
    # (metas_ft, X_train.columns),
    remainder = "passthrough"  # Compartments already ordinally encoded
)

pre_proc_t.fit(X_train, y_train)

In [None]:
# np.isclose(
#     metas_ft.fit_transform(X_train).values.ravel(),
#     grouped_df["Price"].transform("mean").values.ravel()
# ).all()

In [None]:
# Names of the columns produced by the fitted preprocessor.
pre_proc_t.get_feature_names_out()

## Modelling
___

In [None]:
# Baseline pipeline: preprocessing followed by a histogram gradient-boosting
# regressor with a fixed random seed.
model_pl = make_pipeline(
    pre_proc_t,
    HistGradientBoostingRegressor(
        scoring = "neg_root_mean_squared_error",
        random_state=1717,
    )
    # SimpleImputer(strategy="most_frequent"),
    # ElasticNet()
)

In [None]:
# Fit the baseline pipeline on the core training data only.
model_pl.fit(X_train_core, y_train_core)

In [None]:
# Search space for the Bayesian hyper-parameter search. Keys carry the
# pipeline step prefix `histgradientboostingregressor__` so they address the
# regressor inside model_pl.
param_spaces = {
    "histgradientboostingregressor__learning_rate": Real(1e-2, 1e0, prior='log-uniform'),
    "histgradientboostingregressor__max_iter": Integer(1e1, 1e4, prior="log-uniform"),
    "histgradientboostingregressor__max_depth": Integer(2, 12, prior="uniform"),
    "histgradientboostingregressor__min_samples_leaf": Integer(2, 300, prior="uniform"),
    "histgradientboostingregressor__l2_regularization": Real(0, 1e2, prior="uniform"),
    "histgradientboostingregressor__max_bins": Integer(32, 255, prior="log-uniform", base=2),
}

In [None]:
# RMSE scorer; greater_is_better=False marks it as a loss for model selection.
rmse_scorer = make_scorer(
    lambda y, y_pred: mean_squared_error(y, y_pred)**.5, greater_is_better=False
)
# Shuffled 5-fold split with a fixed seed, used by the search below.
cv = KFold(5, shuffle=True, random_state=171717)

In [None]:
# Reporting util for different optimizers
def report_perf(optimizer, X, y, title="model", callbacks=None):
    """
    Fit a search optimizer, print a timing/score summary, return the best params.

    optimizer = a sklearn or a skopt optimizer (must expose fit, cv_results_,
                best_score_, best_index_ and best_params_)
    X = the training set
    y = our target
    title = a string label for the experiment
    callbacks = optional callbacks, forwarded to fit as `callback=`
    """
    started_at = time()

    # Only forward the `callback` keyword when callbacks were supplied.
    fit_kwargs = {} if callbacks is None else {"callback": callbacks}
    optimizer.fit(X, y, **fit_kwargs)

    results = pd.DataFrame(optimizer.cv_results_)
    top_score = optimizer.best_score_
    top_score_std = results.iloc[optimizer.best_index_]["std_test_score"]
    winner = optimizer.best_params_

    elapsed = time() - started_at
    n_candidates = len(optimizer.cv_results_['params'])
    summary = (
        f"{title} took {elapsed:.2f} seconds, "
        f"candidates checked: {n_candidates}.\n"
        f"Best CV score: {top_score}" + u" \u00B1"+f"{top_score_std:.3f}"
    )
    print(summary)

    print('Best parameters:')
    pprint.pprint(winner)
    print()
    return winner

In [None]:
# Bayesian search over param_spaces: 50 iterations, scored with rmse_scorer on
# the shared 5-fold split, using a Gaussian-process ("GP") base estimator.
# The search is only configured here; it is fitted later via report_perf.
search = BayesSearchCV(
    estimator=model_pl,
    search_spaces=param_spaces,
    scoring=rmse_scorer,
    n_iter=50,
    cv=cv,
    n_jobs=-1,
    verbose=0,
    optimizer_kwargs={'base_estimator': 'GP'},
    random_state=1717171717,
    return_train_score=True
)

# search.fit(X_train, y_train)
# pd.DataFrame(search.cv_results_).sort_values("rank_test_score").head()

In [None]:
def verbose_callback(search_cv, param_names):
    """Print the most recently evaluated candidate and its objective value.

    Parameters
    ----------
    search_cv : optimisation result object
        Object handed to the callback after each step; it must expose
        `x_iters` (evaluated points, in order) and `func_vals` (their
        objective values).
    param_names : sequence of str
        Labels paired positionally with the values of the evaluated point.

    Prints the iteration number, the formatted parameters and the score,
    and flags the score when it is the lowest objective value seen so far.
    """
    # `search_cv.x` is the best point found so far, not the latest one, so
    # pairing it with `func_vals[-1]` mislabels the score. The point that was
    # just evaluated is the last entry of `x_iters`.
    latest_point = search_cv.x_iters[-1]
    current_score = search_cv.func_vals[-1]
    iteration = len(search_cv.func_vals)

    # Build a formatted string for the parameters. Values are read straight
    # from the point (no pandas round-trip), so integers are not coerced to floats.
    lines = []
    for key, val in zip(param_names, latest_point):
        if isinstance(val, float) and val.is_integer():
            val_str = f"{int(val)}"
        elif isinstance(val, float):
            val_str = f"{val:.6f}"
        else:
            val_str = str(val)
        lines.append(f"  {key}: {val_str}")

    formatted_params = "\n".join(lines)

    flag = " BEST SCORE YET!" if current_score <= min(search_cv.func_vals) else ""

    print(f"Iteration {iteration}:")
    print("Parameters:")
    print(formatted_params)
    print(f"Score (objective value): {current_score:.3f}{flag}\n")

# Parameter labels for the callback, sorted alphabetically.
# NOTE(review): presumably this matches the order in which the optimizer
# stores point values — confirm against BayesSearchCV before trusting the printout.
hgbr_params = sorted((
    "learning_rate",
    "max_iter",
    "max_depth",
    "min_samples_leaf",
    "l2_regularization",
    "max_bins",
))

# Callback with the labels pre-bound, so it can be called with the result only.
verbose_callback_partial = partial(verbose_callback, param_names=hgbr_params)
hgbr_params

In [None]:
# Running the optimizer
overdone_control = DeltaYStopper(delta=0.0001)
# We stop if the gain of the optimization becomes too small
time_limit_control = DeadlineStopper(total_time=60*60*11)
# We impose a time limit (6 hours)
best_params = report_perf(
    search, X_train, y_train,'HGBRegressor',
    callbacks=[verbose_callback_partial, overdone_control, time_limit_control]
)

In [None]:
# Persist the search history. Column names keep only the text after the last
# "ssor__", which strips the pipeline step prefix from parameter columns.
cv_results = pd.DataFrame(search.cv_results_)
cv_results.columns = [c.split("ssor__")[-1] for c in cv_results.columns]
cv_results.to_csv("hgbr_cv_results.csv", index=False)


In [None]:
# Configure the pipeline with the best parameters found by the search.
model_pl.set_params(**search.best_params_)

In [None]:
# cv_score_ = cross_val_score(
#     model_pl, X_train_core, y_train_core,
#     scoring="neg_root_mean_squared_error",
#     cv=KFold(5, shuffle=True, random_state=171717),
#     # cv=ShuffleSplit(5, random_state=171717),
#     n_jobs=-1
# )
# cv_score_

In [None]:
# Out-of-fold predictions on the core training data, using a 5-fold split
# configured identically to `cv` (same shuffle and seed).
y_pred_cv = cross_val_predict(
    model_pl, X_train_core, y_train_core,
    cv=KFold(5, shuffle=True, random_state=171717),
    n_jobs=-1
)

In [None]:
# RMSE of the out-of-fold predictions on the core training data.
mean_squared_error(y_train_core, y_pred_cv) ** .5

In [None]:
# Frozen distributions used for target calibration: a uniform fitted to the
# full training target, and a normal / generalized normal fitted to the
# out-of-fold predictions.
UNI_ = uniform(*uniform.fit(y_train))
NORM_ = norm(*norm.fit(y_pred_cv))
GENNORM_ = gennorm(*gennorm.fit(y_pred_cv))

In [None]:
# Summary statistics of the out-of-fold predictions.
pd.Series(y_pred_cv).describe()

In [None]:
# Density histogram of the out-of-fold predictions with the fitted
# generalized-normal pdf overlaid on the 75-90 range.
x_ = np.linspace(75,90,1000)
sns.histplot(y_pred_cv, stat="density")
sns.lineplot(x=x_, y=GENNORM_.pdf(x_), color="k");

In [None]:
# Quantile mapping between the two fitted distributions: the forward direction
# sends a target value through the uniform cdf and then the generalized-normal
# ppf; the inverse direction reverses those two steps.
y_ft = FunctionTransformer(
    func = lambda y: GENNORM_.ppf(UNI_.cdf(y)),
    inverse_func = lambda y: UNI_.ppf(GENNORM_.cdf(y))
)

# ttr = TransformedTargetRegressor(
#     regressor = HistGradientBoostingRegressor(
#         scoring="neg_root_mean_squared_error",
#         random_state=1717
#     ),
#     transformer = y_ft
# )

# model_pl = make_pipeline(pre_proc_t, ttr)

# NOTE(review): target values where the uniform cdf is exactly 0 or 1
# presumably map to -inf/+inf under the ppf — confirm before training on
# y_train_t.
y_train_t =  y_ft.transform(y_train)

# Round-trip sanity check. It is the last expression of the cell so that its
# result is displayed; previously it was computed and silently discarded.
np.isclose(y_ft.inverse_transform(y_ft.transform(y_train)), y_train).all()

In [None]:
# Target vs. raw out-of-fold predictions (overlaid histograms), then their RMSE.
sns.histplot(y_train_core, binwidth=5); 
sns.histplot(y_pred_cv, binwidth=5);
mean_squared_error(y_train_core, y_pred_cv) ** .5

In [None]:
# Target vs. predictions mapped back through y_ft's inverse transform
# (overlaid histograms), then the RMSE of the mapped predictions.
sns.histplot(y_train_core, binwidth=5); 
sns.histplot(y_ft.inverse_transform(y_pred_cv), binwidth=5);
mean_squared_error(y_train_core, y_ft.inverse_transform(y_pred_cv)) ** .5

In [None]:
# Linearly rescale the out-of-fold predictions to the range [15, 150] once and
# reuse the result for both the plot and the RMSE (it was previously computed twice).
y_pred_cv_scaled = MinMaxScaler((15,150)).fit_transform(y_pred_cv.reshape(-1,1)).ravel()
sns.histplot(y_train_core, binwidth=5); 
sns.histplot(y_pred_cv_scaled, binwidth=5);
mean_squared_error(y_train_core, y_pred_cv_scaled) ** .5

## Submission
___

In [None]:
# Refit the pipeline on all training data (core + extra), predict the test
# set into the sample-submission frame and write it to disk.
model_pl.fit(X_train, y_train)
y_test["Price"] = model_pl.predict(X_test)
# y_test["Price"] = metas_ft.transform(X_test)["mean"].values
# y_test.loc[y_test["Price"].isnull(), "Price"] = y_train.mean()
y_test.to_csv('submission.csv', index=False)