# Let's go!
## Imports and Set Up
___

In [None]:
!pip install skrub skore optuna_integration --upgrade scikit-learn

In [None]:
# %load_ext cuml.accel

In [None]:
import warnings
from copy import deepcopy
from pathlib import Path
from typing import Union, Optional, Final

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, check_is_fitted, clone
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.ensemble import (
    HistGradientBoostingClassifier, HistGradientBoostingRegressor,
    ExtraTreesRegressor, RandomForestRegressor
)
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.metrics import (
    root_mean_squared_log_error, root_mean_squared_error, roc_auc_score, make_scorer
)
from sklearn.model_selection import cross_val_predict, cross_val_score, KFold, GroupKFold
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline

import skrub, skore

import catboost as cb

import optuna
from optuna.samplers import TPESampler

In [None]:
# --- Run switches and shared constants ---------------------------------------
RANDOM_STATE: Final[int] = 1_717_171_717  # seed handed to every seeded component below
EXPLORE_BABY_EXPLORE: Final[bool] = False  # True -> also execute the time-consuming EDA cells
OPTIMIZE_BABY_OPTIMIZE: Final[bool] = True  # True -> run the hyperparameter search cells
N_JOBS: Final[int] = -1  # forwarded as n_jobs to the cross-validation / Optuna calls

# --- Library-wide display and warning settings -------------------------------
warnings.filterwarnings(action="ignore", category=FutureWarning)

# Show up to 500 rows / columns before pandas truncates a frame.
for _display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(_display_option, 500)

sns.set_theme(style="ticks")

In [None]:
# The competition data directory is resolved two levels above the working directory.
INPUT_PATH = Path.cwd().parents[1].joinpath("kaggle", "input", "playground-series-s5e5")
TRAIN_PATH, TEST_PATH, SUB_PATH = (
    INPUT_PATH / file_name
    for file_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [None]:
# X_data: full training table, X_test: test features,
# y_test: sample submission, later filled with the predictions.
X_data, X_test, y_test = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH, SUB_PATH)
)

In [None]:
# Features: every training column except the row id and the target.
# (DataFrame.drop already returns a new frame, so no extra copy is needed.)
X_train = X_data.drop(columns=["id", "Calories"])

# The target is modelled on the log1p scale; the submission cell maps the
# predictions back with np.expm1.
y_train = np.log1p(X_data["Calories"])

# Rebind instead of dropping in place: with inplace=True a second execution of
# this cell raised KeyError because "id" was already gone.
X_test = X_test.drop(columns="id", errors="ignore")

## Notes
___
**Observations** 
* 

**Assumptions/Ideas**
* 

**To Do**
* [ ] **EDA**
    * [X] adversarial validation
    * [ ] target
* [ ] **FE**
    * [ ] weight per height
    * [ ] temp increase: (temp - min(temp)) / duration
    * [ ] heart rate times duration
    * [ ] 
    * [ ] target encoding
    * [ ] residual binning (by OHE columns)
    * [ ] meta features:
        * [ ] frequency encoding 
    * [ ] feature importance
    * [ ] consider dim reduction/clustering
* [ ] **Modelling**
    * [ ] cv groups?
    * [ ] nested cv?
    * [ ] submit vote-predictions with final models from cross validation
    * [ ] tune HGBR
    * [ ] tune CatBoost
    * [ ] ensemble

**Lessons Learnt**
* 

## Exploratory Data Analysis
--- 
### Overview

In [None]:
# Interactive overview of the raw training table; the id column is left out.
report = skrub.TableReport(X_data.drop(columns="id"))
report

In [None]:
if EXPLORE_BABY_EXPLORE:
    # Bin edges 20, 30, ..., 80 -> left-closed age bins labelled "20s" ... "70s".
    range_ = range(20, 90, 10)
    age_labels = [f"{decade}s" for decade in range_[:-1]]

    X_data_binned = X_data.assign(
        age_group=pd.cut(X_data["Age"], range_, right=False, labels=age_labels),
        duration=X_data["Duration"].astype(int),
    )
    # 10k rows drawn without replacement, sampling weights taken from the Age column.
    X_data_sampled = X_data_binned.sample(
        10_000, replace=False, weights="Age", random_state=RANDOM_STATE, axis=0
    )
    # Long format: one row per (sample, measured variable) for faceting.
    X_data_melt = X_data_sampled.melt(
        id_vars=["Sex", "duration", "age_group"],
        value_vars=["Calories", "Body_Temp", "Heart_Rate"],
    )

    # Grid of box plots: one row per age group, one column per variable, split by sex.
    g = sns.catplot(
        data=X_data_melt,
        kind="box",
        x="duration",
        y="value",
        hue="Sex",
        row="age_group",
        col="variable",
        sharey=False,
    )
    g.tick_params(axis="x", rotation=90)

### Adversarial Validation [PASS]

In [None]:
if EXPLORE_BABY_EXPLORE:
    # Stack train and test rows and label each row with the set it came from.
    X_av = pd.concat([X_train, X_test]).copy()
    y_av = ["train"] * len(X_train) + ["test"] * len(X_test)

    # Integer-code the object-typed columns so the classifier can consume them.
    for col in X_av.columns[X_av.dtypes == object]:
        X_av[col] = pd.factorize(X_av[col], sort=True)[0]

    hgbc_av = HistGradientBoostingClassifier(random_state=RANDOM_STATE)
    cv_preds_av = cross_val_predict(
        estimator=hgbc_av,
        X=X_av,
        y=y_av,
        cv=KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE),
        method="predict_proba",
        n_jobs=N_JOBS,
    )

    # An AUC near 0.5 means the classifier cannot tell train rows from test rows.
    av_auc = roc_auc_score(y_true=y_av, y_score=cv_preds_av[:, 1])
    print(av_auc)  # 0.49920813329866665

In [None]:
if EXPLORE_BABY_EXPLORE:
    # Long format with the originating set attached, one row per (sample, column).
    X_av_long = X_av.assign(dataset=y_av).melt(id_vars="dataset", var_name="column")

    # Per-column density histograms, train vs. test, each normalised on its own.
    g = sns.displot(
        data=X_av_long,
        x="value",
        hue="dataset",
        col="column",
        col_wrap=4,
        height=4,
        stat="density",
        common_norm=False,
        bins=50,
        facet_kws={"sharey": False, "sharex": False},
    )

## Feature Engineering
---

In [None]:
def fe(df, hr_rest=60):
    """
    Derive workout features from the raw columns and return them in a new frame.

    Columns read from ``df``:
      'Sex'         → "male"/"female" (re-encoded as 1/0 in the result)
      'Age'         → years
      'Height'      → cm
      'Weight'      → kg
      'Duration'    → minutes
      'Heart_Rate'  → bpm (average)
      'Body_Temp'   → °C (end workout)
    hr_rest: assumed resting heart rate (bpm), subtracted to get 'hr_reserve'.

    ``df`` itself is not modified. The result holds the input columns followed by
    the engineered ones in exactly the order they are listed in the body.
    """
    base = df.copy()
    base["Sex"] = base["Sex"].map({"male": 1, "female": 0})

    sex, age = base["Sex"], base["Age"]
    height, weight = base["Height"], base["Weight"]
    duration, heart_rate = base["Duration"], base["Heart_Rate"]

    # Intermediate quantities that feed several features.
    # Mifflin–St Jeor BMR: 10*W + 6.25*H - 5*A, then +5 (male) or -161 (female).
    bmr = 10 * weight + 6.25 * height - 5 * age + (5 * sex - 161 * (1 - sex))
    hr_max = 220 - age
    rel_effort = heart_rate / hr_max
    hr_reserve = heart_rate - hr_rest
    temp_delta = base["Body_Temp"] - 37

    # Insertion order of this dict defines the output column order.
    engineered = {
        # anthropometrics
        "bmi": weight / (height / 100) ** 2,
        "bsa_du_bois": 0.007184 * weight**0.425 * height**0.725,
        "bsa_mosteller": np.sqrt(height * weight / 3600),
        "wt_ht_ratio": weight / height,
        "wt_log": np.log(weight + 1e-6),
        "ht_sqrt": np.sqrt(height),
        # basal metabolic rate
        "bmr": bmr,
        "bmr_per_min": bmr / 1440,
        # heart rate
        "hr_max": hr_max,
        "rel_effort": rel_effort,
        "hr_reserve": hr_reserve,
        "hr_reserve_frac": hr_reserve / hr_max,
        "hr_rel_effort": duration * rel_effort,
        "hr_reserve_dur": hr_reserve * duration,
        "hr_weight": heart_rate * weight,
        "total_heart_beats": heart_rate * duration,
        "hr_sq": heart_rate ** 2,
        "hr_sqrt": np.sqrt(heart_rate),
        "hr_log": np.log(heart_rate + 1e-6),
        # thermal
        "temp_delta": temp_delta,
        "temp_store": temp_delta * weight,
        "temp_delta_dur": temp_delta * duration,
        "temp_delta_hr": temp_delta * heart_rate,
        "temp_increase_rate": temp_delta / duration,
        "temp_delta_sq": temp_delta**2,
        # duration
        "dur_sq": duration ** 2,
        "dur_log": np.log(duration + 1e-6),
        # age interactions
        "age_sq": age ** 2,
        "age_hr": age * heart_rate,
        # mixed interactions
        "weight_dur": weight * duration,
        "height_hr": height * heart_rate,
        "rel_effort_bmr": rel_effort * bmr,
        "sex_hr": sex * heart_rate,
        "sex_bmr": sex * bmr,
    }

    # Idea kept for later: a composite effort index
    #   (bmr_per_min + k1 * rel_effort + k2 * temp_delta) * Duration
    # with k1, k2 tuned via regression.

    return base.assign(**engineered)


In [None]:
# Run the same feature engineering on the training and the test features.
X_train_pp, X_test_pp = (fe(raw_frame) for raw_frame in (X_train, X_test))

In [None]:
# skrub.TableReport(X_train_pp)

## Modelling
---

In [None]:
# Histogram gradient boosting with a fixed hyperparameter set.
hgbr = HistGradientBoostingRegressor(
    learning_rate=0.10718881083516875,
    l2_regularization=90.04903358862431,
    max_iter=905,
    max_depth=7,
    max_bins=242,
    min_samples_leaf=217,
    scoring="neg_root_mean_squared_error",
    random_state=RANDOM_STATE,
)

# CatBoost starts from its defaults; the search / stored parameters are set on it further down.
cbr = cb.CatBoostRegressor(verbose=False, random_seed=RANDOM_STATE)

# rfr = RandomForestRegressor(random_state=RANDOM_STATE)

In [None]:
def rmsle(y_true, y_pred):
    """Root mean squared logarithmic error between targets and predictions.

    Predictions are floored at 1 before the log transform, so any prediction
    below 1 is scored as if it were exactly 1.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values; must be greater than -1 for the log to be defined.
    y_pred : array-like
        Predicted values (NumPy array, pandas Series, list, ...). Compared with
        ``y_true`` position by position.

    Returns
    -------
    float
        ``sqrt(mean((log1p(clip(y_pred, 1)) - log1p(y_true)) ** 2))``.
    """
    # np.asarray / np.clip accept any array-like. The previous ``y_pred.clip(min=1)``
    # only worked for NumPy arrays: pandas' ``clip`` takes ``lower=`` and lists
    # have no ``clip`` at all.
    pred = np.clip(np.asarray(y_pred, dtype=float), 1, None)
    true = np.asarray(y_true, dtype=float)
    # log1p is the numerically stable form of log(1 + x).
    return np.sqrt(np.mean((np.log1p(pred) - np.log1p(true)) ** 2))

# Wrap rmsle as a scikit-learn scorer; it is a loss, hence greater_is_better=False.
rmsle_scorer = make_scorer(score_func=rmsle, greater_is_better=False)

In [None]:
# Sequential feature selection driven by the tuned HGBR, scored with 3-fold CV.
sfs = SequentialFeatureSelector(
    estimator=hgbr,
    n_features_to_select="auto",
    tol=1e-6,
    scoring="neg_root_mean_squared_error",
    cv=3,
    n_jobs=-1,
)

In [None]:
# if OPTIMIZE_BABY_OPTIMIZE:
#     sfs.fit(X_train_pp, y_train)  # note that support_ will be boolean mask
# else:
    # # stock hgbr, tol=1e-5
    # sfs.support_ = [
    #     False, False, False, False, False, False,  True, False, False,
    #     False, False, False, False, False, False,  True,  True, False,
    #     True,  True,  True, False, False, False, False, False, False,
    #     False, False, False, False, False,  True, False, False,  True,
    #     False, False,  True,  True, False
    # ]

# Stored selection result (optimised hgbr, tol=1e-6, processed target): one flag
# per column of the frame returned by fe(), in column order. Kept as a boolean
# ndarray, the type a fitted selector exposes as support_.
sfs.support_ = np.array([
    False, False, False, False, False,  True, False, False, False,
    False, False, False, False, False, False, False, False, False,
    True,  True,  True, False,  True, False, False, False, False,
    False, False, False,  True, False, False, False, False,  True,
    True, False, False,  True,  True
], dtype=bool)


def _select_supported(frame, mask):
    """Return the columns of ``frame`` flagged in ``mask``.

    A frame that already has exactly as many columns as the mask selects is
    returned unchanged, so re-running this cell does not fail on frames that a
    previous execution already reduced. Any other width raises ValueError.
    """
    n_columns = frame.shape[1]
    if n_columns == mask.size:
        return frame.loc[:, mask]
    if n_columns == int(mask.sum()):
        # Width matches the selected subset: treated as already filtered.
        return frame
    raise ValueError(
        f"Expected {mask.size} columns (or {int(mask.sum())} if already selected), "
        f"got {n_columns}."
    )


X_train_pp = _select_supported(X_train_pp, sfs.support_)
X_test_pp = _select_supported(X_test_pp, sfs.support_)
sfs.support_

In [None]:
# Interactive skrub overview of the training features kept after column selection.
report_pp = skrub.TableReport(X_train_pp)
report_pp

In [None]:
# def objective(trial):
#     learning_rate = trial.suggest_float("learning_rate", 1e-2, 1, log=True)
#     l2_regularization = trial.suggest_float("l2_regularization", 0, 1e2, log=False)
#     max_iter = trial.suggest_int("max_iter", 10, 1000, log=False)
#     max_depth = trial.suggest_int("max_depth", 2, 12, log=False)
#     max_bins = trial.suggest_int("max_bins", 32, 255, log=False)
#     min_samples_leaf = trial.suggest_int("min_samples_leaf", 2, 300, log=False)
    
#     hgbr.set_params(
#         learning_rate=learning_rate,
#         l2_regularization=l2_regularization,
#         min_samples_leaf=min_samples_leaf,
#         max_depth=max_depth,
#         max_iter=max_iter,
#         max_bins=max_bins
#     )

#     cv_scores = cross_val_score(
#         hgbr, X_train_pp, y_train,
#         n_jobs=N_JOBS,
#         verbose=2,
#         scoring=rmsle_scorer
#     )
#     return - cv_scores.mean()

In [None]:
def objective(trial):
    """Optuna objective: cross-validated RMSE of CatBoost for one sampled configuration.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the CatBoost hyperparameters.

    Returns
    -------
    float
        Mean RMSE over the cross-validation folds (to be minimised), computed on
        ``X_train_pp`` / ``y_train`` with ``cross_val_score``'s default splitter.
    """
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform /
    # suggest_uniform; the sampled ranges are unchanged.
    params = {
        "iterations": trial.suggest_int("iterations", 10, 1000),
        "depth": trial.suggest_int("depth", 1, 8),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 1.0, log=True),
        "random_strength": trial.suggest_float("random_strength", 1e-9, 10.0, log=True),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 1.0),
        "border_count": trial.suggest_int("border_count", 1, 255),
        "l2_leaf_reg": trial.suggest_int("l2_leaf_reg", 2, 30),
    }

    # Work on a private clone: study.optimize runs trials concurrently (n_jobs),
    # so mutating the shared global ``cbr`` could let one trial be scored with
    # another trial's parameters.
    model = clone(cbr)
    model.set_params(**params)

    cv_scores = cross_val_score(
        model, X_train_pp, y_train,
        n_jobs=N_JOBS,
        scoring="neg_root_mean_squared_error"
    )
    # The scorer returns negated RMSE; flip the sign so that lower is better.
    return - cv_scores.mean()

In [None]:
# Seeded TPE sampler; its first 30 trials are start-up trials before TPE takes over.
sampler = TPESampler(seed=RANDOM_STATE, n_startup_trials=30)
study = optuna.create_study(sampler=sampler, direction="minimize")

In [None]:
if OPTIMIZE_BABY_OPTIMIZE:
    # Stop after 60 trials or 6 hours, whichever limit is reached first.
    study.optimize(
        func=objective,
        n_trials=60,
        timeout=6 * 60 * 60,
        n_jobs=N_JOBS,
        show_progress_bar=True,
    )

In [None]:
if not OPTIMIZE_BABY_OPTIMIZE:
    # No search in this run: fall back to stored CatBoost parameters.
    # Stored HGBR parameters, kept for reference:
    # best_params = {
    #     'learning_rate': 0.03422794020038449,
    #     'l2_regularization': 23.644550174414004,
    #     'max_iter': 790,
    #     'max_depth': 6,
    #     'max_bins': 185,
    #     'min_samples_leaf': 291
    # }
    best_params = dict(
        iterations=916,
        depth=8,
        learning_rate=0.3113330330400601,
        random_strength=1.6421426577407286e-06,
        bagging_temperature=0.2732614883982688,
        border_count=189,
        l2_leaf_reg=5,
    )
else:
    # Take the best configuration found by the Optuna study.
    best_params = study.best_params
    print("Best parameters for current outer fold:", best_params)

# Configure the shared CatBoost model with the chosen hyperparameters.
cbr.set_params(**best_params)

In [None]:
# if OPTIMIZE_BABY_OPTIMIZE:
#     optuna.visualization.plot_parallel_coordinate(study)
#     optuna.visualization.plot_optimization_history(study)

In [None]:
# Fit the configured CatBoost model on the full (selected-feature) training set.
cbr.fit(X=X_train_pp, y=y_train)

## Diagnostics
---

In [None]:
# cvp = cross_val_predict(
#     cbr, X_train_pp, y_train,
#     n_jobs=N_JOBS,
#     verbose=2,
# )
# root_mean_squared_log_error(y_train, cvp.clip(min=1))

In [None]:
# Cross-validated (negated) RMSE of the configured CatBoost model.
cvs = cross_val_score(
    estimator=cbr,
    X=X_train_pp,
    y=y_train,
    scoring="neg_root_mean_squared_error",
    n_jobs=N_JOBS,
    verbose=2,
    # cv=KFold(shuffle=True, random_state=RANDOM_STATE)
)
cvs

## Submission
---

In [None]:
def test_vote(model, X_train, y_train, X_test, cv):
    model_ = deepcopy(model)
    preds = []
    for train_idx, _ in cv.split(X_train, y_train):
        model_.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
        preds.append(model_.predict(X_test))
        
    preds = np.array(preds).mean(axis=0)
    
    return preds

In [None]:
# preds = test_vote(cbr, X_train_pp, y_train, X_test_pp, KFold(3))
# preds = hgbr.predict(X_test_pp)

# preds = preds.clip(1)

In [None]:
# male_model = deepcopy(hgbr)
# female_model = deepcopy(hgbr)

# Two independent copies of the configured CatBoost model, one per sex.
male_model, female_model = deepcopy(cbr), deepcopy(cbr)

In [None]:
# Row labels of the training samples belonging to each sex.
male_idx = X_data.index[X_data["Sex"] == "male"]
female_idx = X_data.index[X_data["Sex"] == "female"]

In [None]:
# Fit each per-sex model on its own subset of the training rows.
male_model.fit(X=X_train_pp.loc[male_idx], y=y_train.loc[male_idx])
female_model.fit(X=X_train_pp.loc[female_idx], y=y_train.loc[female_idx])

In [None]:
# male_preds = male_model.predict(X_test_pp.loc[X_test["Sex"] == "male"])
# female_preds = female_model.predict(X_test_pp.loc[X_test["Sex"] == "female"])

In [None]:
# Fold-averaged predictions for the male test rows, trained on male training rows only.
male_preds = test_vote(
    model=male_model,
    X_train=X_train_pp.loc[male_idx],
    y_train=y_train.loc[male_idx],
    X_test=X_test_pp.loc[X_test["Sex"] == "male"],
    cv=KFold(n_splits=3),
)

In [None]:
# Fold-averaged predictions for the female test rows, trained on female training rows only.
female_preds = test_vote(
    model=female_model,
    X_train=X_train_pp.loc[female_idx],
    y_train=y_train.loc[female_idx],
    X_test=X_test_pp.loc[X_test["Sex"] == "female"],
    cv=KFold(n_splits=3),
)

In [None]:
# Write each sex's predictions into a temporary "Calories" column so they land
# on the matching test rows, then pop the column to leave X_test unchanged.
sex_of_test_row = X_test["Sex"]
for sex_label, sex_preds in (("male", male_preds), ("female", female_preds)):
    X_test.loc[sex_of_test_row == sex_label, "Calories"] = sex_preds

# The models were trained on log1p(Calories); expm1 maps back to the original scale.
# (Optional clipping at 1 on the log scale is currently disabled.)
preds = np.expm1(X_test.pop("Calories"))

In [None]:
# Fill the sample-submission frame with the predictions and write it out.
y_test["Calories"] = preds  # alternative: hgbr.predict(X_test)
y_test.to_csv(path_or_buf="submission.csv", index=False)
y_test.head(5)

## Graveyard
---

In [None]:
# std_cols = ["Weight", "Height", "Heart_Rate"]
# norm_cols = ["Age", "Duration", "Body_Temp", "total_heart_beats"]

# col_tf = make_column_transformer(
#     (StandardScaler(), std_cols),
#     (MinMaxScaler(), norm_cols),
#     remainder="passthrough"
# )

# col_tf.fit(X_train_pp)
# col_tf.transform(X_test_pp)