# Let's go!
## Imports and Set Up
___

In [None]:
!pip install skrub skore optuna_integration --upgrade scikit-learn

In [None]:
import warnings
from copy import deepcopy
from pathlib import Path
from typing import Union, Optional, Literal, Final

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, check_is_fitted
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor
from sklearn.metrics import PredictionErrorDisplay, root_mean_squared_error, roc_auc_score
from sklearn.model_selection import cross_val_predict, cross_val_score, KFold, GroupKFold
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.pipeline import make_pipeline

import skrub, skore

import catboost as cb

import wandb
import optuna
from optuna.samplers import TPESampler
from optuna_integration.wandb import WeightsAndBiasesCallback

from kaggle_secrets import UserSecretsClient

In [None]:
# Notebook-wide switches and constants.
# RANDOM_STATE is the seed handed to the estimators, the shuffled KFold and the sampling calls below.
RANDOM_STATE: Final[int] = 17_17_17_17_17
EXPLORE_BABY_EXPLORE: Final[bool] = False  # whether to run the time-consuming EDA cells
OPTIMIZE_BABY_OPTIMIZE: Final[bool] = False  # whether to run the hyperparameter optimization cells
N_JOBS: Final[int] = -1  # forwarded as `n_jobs` to the scikit-learn / Optuna calls below

TASK_TYPE: Final[Literal[None, "GPU"]] = None  # CatBoost `task_type`; set to "GPU" to enable GPU training
# Descriptive run metadata. In this file it is only passed as `config=` to `wandb.init`;
# no cell reads values back from it, so it must be kept in sync with the code by hand.
WANDB_CONFIG: Final[dict] = {
    "model": "cat_pl",
    "features": {
        "categorical": "onehot",
        "numeric": "passthrough",
        "special": ["episode_number"],
        # "dropped": ["Podcast_Name"],
        "new": [
            "popularity_ratio",
            "adds_per_min",
            # "resid_bin",
            # "Episode_Length_minute_median",
            # "Episode_Length_minute_std",
            # "Episode_Length_minute_mean",
            # "Episode_Length_minute_min",
            # "Episode_Length_minute_max",
        ]
    },
    "cv": {
        "type": "GroupKFold",
        "cv_groups": "Podcast_Name",
        "nested_cv": False,
        "outer_folds": 0,  # only applicable when nested CV is used
        "inner_folds": 5
    },
    "optimization": {
        "optimizer": "optuna",
        "n_trials": 50,  # total number of optimization trials performed
        "objective_metric": "neg_root_mean_squared_error",
        # "search_space": {
        #     "iterations": {"type": "int", "min": 10, "max": 1000, "scale": "linear"},
        #     "depth": {"type": "int", "min": 1, "max": 8, "scale": "linear"},
        #     "learning_rate": {"type": "float", "min": 1e-2, "max": 1, "scale": "log"},
        #     "random_strength": {"type": "float", "min": 1e-9, "max": 10, "scale": "log"},
        #     "bagging_temperature": {"type": "float", "min": 0, "max": 1, "scale": "linear"},
        #     "border_count": {"type": "int", "min": 1, "max": 255, "scale": "linear"},
        #     "l2_leaf_reg": {"type": "int", "min": 2, "max": 30, "scale": "linear"}
        # }
        # # HGBR
        # {
        #     "lr": {"type": "float", "min": 1e-2, "max": 1, "scale": "log"},
        #     "l2_regularization": {"type": "float", "min": 0, "max": 1e2, "scale": "linear"},
        #     "max_iter": {"type": "int", "min": 10, "max": 1000, "scale": "linear"},
        #     "max_depth": {"type": "int", "min": 2, "max": 12, "scale": "linear"},
        #     "max_bins": {"type": "int", "min": 32, "max": 255, "scale": "linear"},
        #     "min_samples_leaf": {"type": "int", "min": 2, "max": 300, "scale": "linear"}
        # }
    },
    "runtime": {
        "seed": RANDOM_STATE,
        "n_jobs": N_JOBS
    },
    # NOTE(review): the estimators built in this notebook are regressors
    # (HistGradientBoostingRegressor / CatBoostRegressor); the note text says otherwise.
    "notes": (
        "Stacked HGBC and Catboost Classifier"
    )
}

# Silence FutureWarning messages for the whole session.
warnings.filterwarnings("ignore", category=FutureWarning)

# Let pandas display up to 500 rows / columns before truncating.
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)

sns.set_theme(style="ticks")

In [None]:
# Competition data directory, resolved from the grandparent of the current working directory.
# NOTE(review): presumably the working directory is two levels below the filesystem root
# that contains `kaggle/input` (e.g. /kaggle/working) — confirm when running elsewhere.
INPUT_PATH = Path.cwd().parents[1] / 'kaggle/input/playground-series-s5e4'
TRAIN_PATH = INPUT_PATH / "train.csv"
TEST_PATH = INPUT_PATH / "test.csv"
SUB_PATH = INPUT_PATH / "sample_submission.csv"

# for f in INPUT_PATH.iterdir():
#     print(f)

In [None]:
# Raw training table (features + target), raw test table, and the sample submission.
# `y_test` is later used as the submission template: its target column is overwritten
# with predictions before being written to disk.
X_data = pd.read_csv(TRAIN_PATH)
X_test = pd.read_csv(TEST_PATH)
y_test = pd.read_csv(SUB_PATH)

In [None]:
# Separate features from the target; `X_data` itself keeps every column for the EDA cells.
X_train = X_data.drop(columns=["id", "Listening_Time_minutes"]).copy()
y_train = X_data["Listening_Time_minutes"].copy()
# The test table is modified in place: its `id` column is dropped here.
X_test.drop(columns="id", inplace=True)

## Markdown
___
**Observations** 
* there are 2568 entries where "Episode_Length_minutes" < "Listening_Time_minutes"
    * pct change is lower than -1% for 642 entries, suggesting some "re-listens" (up to 93 repeats!)
* each podcast (podcast name) covers multiple genres
* there are two odd entries in test data with episode length > 7000 mins

**Assumptions/Ideas**
* use grouping by podcast name

**To Do**
* [ ] **EDA**
    * [X] adversarial validation
    * [X] check if missing data correlates with target
    * [ ] target
* [ ] **FE**
    * [ ] weekend or workday
    * [X] popularity percentage ratio
    * [ ] target encoding
    * [X] residual binning (by OHE columns)
    * [ ] meta features:
        * [ ] nans
        * [X] episode length stats by podcast name
        * [ ] frequency encoding 
    * [X] feature importance
    * [ ] consider dim reduction/clustering
    * [X] drop podcast name?
* [ ] **Modelling**
    * [ ] cap predictions at "Episode_Length_minutes"
    * [X] cv groups?
        * [X] by podcast names
    * [ ] nested cv?
    * [X] submit vote-predictions with final models from cross validation
    * [X] tune HGBR
    * [X] tune CatBoost
    * [X] ensemble

**Lessons Learnt**
* A depth-first approach, where I fine-tune a model after each feature alteration, is probably inefficient.
* I should have logged not just the CV mean, but also its spread (std).

## Exploratory Data Analysis
--- 
### Overview

In [None]:
# Interactive overview of the training table (target included, `id` excluded).
skrub.TableReport(X_data.drop(columns="id"))

In [None]:
# X_train.groupby("Podcast_Name")["Genre"].nunique()
# Five test rows with the largest episode length (used to spot the outliers noted above).
X_test.nlargest(5, "Episode_Length_minutes")

### Adversarial Validation [PASS]

In [None]:
if EXPLORE_BABY_EXPLORE:
    # Adversarial validation: stack train and test features, label each row by its origin,
    # and check whether a classifier can tell the two apart.
    X_av = pd.concat([X_train, X_test]).copy()
    y_av = ["train"] * len(X_train) + ["test"] * len(X_test)
    # Replace every object-dtype column by integer codes (categories sorted before coding).
    for col in X_av.columns[X_av.dtypes == object]:
        X_av[col], _ = pd.factorize(X_av[col], sort=True)

    hgbc_av = HistGradientBoostingClassifier(random_state=RANDOM_STATE)
    # Out-of-fold class probabilities from shuffled 5-fold cross-validation.
    cv_preds_av = cross_val_predict(
        hgbc_av, X_av, y_av,
        cv=KFold(5, shuffle=True, random_state=RANDOM_STATE),
        n_jobs=N_JOBS, method='predict_proba'
        )

    # expect ~0.5 if there is no difference
    print(roc_auc_score(y_true=y_av, y_score=cv_preds_av[:,1]))  # 0.499698298352

In [None]:
if EXPLORE_BABY_EXPLORE:
    # Per-column density histograms of train vs. test values.
    # Relies on `X_av` / `y_av` created by the adversarial-validation cell (same flag).
    g = sns.displot(
        (
            X_av
            .assign(dataset = y_av)
            # long format: one row per (dataset, column, value)
            .melt(id_vars="dataset", var_name="column")
            # plot a fixed-size random subset of the long table
            .sample(1_000_000, random_state=RANDOM_STATE)
        ),
        x="value", hue="dataset", col="column", col_wrap=2, height=4,
        # common_norm=False: each dataset's histogram is normalised on its own
        stat="density", common_norm=False, bins=50,
        facet_kws=dict(sharey=False, sharex=False)
    )

### Missing Data Correlation with Target [NOPE]

In [None]:
if EXPLORE_BABY_EXPLORE:
    # Boolean missingness indicators for the columns that contain at least one NaN.
    missing_X = X_train.loc[:, X_train.isna().any()].isna()
    # Append the target and cast the whole frame to int.
    # NOTE(review): the cast also applies to the target column, so its values are
    # converted to integers before the correlation is computed — confirm this is intended.
    missing_X = missing_X.join(y_train).astype(int)
    sns.heatmap(
        missing_X.corr(), vmin=-1, vmax=1, cmap="Spectral_r", annot=True, fmt=".2f"
    );

In [None]:
# Training rows whose `Number_of_Ads` value is missing.
X_data[X_data["Number_of_Ads"].isnull()]

### Podcasts with Listening Time longer than Duration

In [None]:
def explore_neg_diff(data: Optional[pd.DataFrame] = None) -> pd.Series:
    """Relative length/listening gap for rows listened to longer than the episode lasts.

    Keeps the rows where ``Listening_Time_minutes`` exceeds ``Episode_Length_minutes``
    and returns ``(length - listening) / length`` for them, so every finite value
    is negative.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Table holding both columns. When omitted, the notebook-level ``X_data``
        is used, which is what the original zero-argument call did.

    Returns
    -------
    pandas.Series
        One value per selected row, indexed like those rows. Values that are not
        strictly greater than ``-inf`` (``-inf`` from a zero episode length, or NaN)
        are replaced by 0.
    """
    if data is None:
        data = X_data
    # Filter once and reuse the subset instead of re-applying the mask per column.
    over_listened = data.loc[
        data["Episode_Length_minutes"] < data["Listening_Time_minutes"]
    ]
    gap = over_listened["Episode_Length_minutes"] - over_listened["Listening_Time_minutes"]
    rel_gap = gap / over_listened["Episode_Length_minutes"]
    return rel_gap.where(rel_gap > -np.inf, 0)

# Relative gaps for the over-listened rows; the cell output is how many such rows exist.
neg_ratio = explore_neg_diff()
len(neg_ratio)

In [None]:
# Rows whose relative gap is below -50, i.e. the listening time exceeds the episode
# length by more than 50 episode lengths; transposed so each row is shown as a column.
X_data.loc[neg_ratio[neg_ratio < -50].index].T

## Feature Engineering
---

In [None]:
def extract_eps_num(data: Union[pd.DataFrame, np.ndarray]):
    """Parse the integer episode number out of the ``Episode_Title`` text.

    Accepts either a DataFrame with an ``Episode_Title`` column or a bare
    ``(n_rows, 1)`` array holding the titles. Returns a one-column integer
    DataFrame with the digits that follow the literal ``"Episode "``.
    """
    frame = data
    if isinstance(frame, np.ndarray):
        # A bare array carries no column names, so label its single column.
        frame = pd.DataFrame(frame, columns=["Episode_Title"])
    titles = frame["Episode_Title"]
    digits = titles.str.extract(r"Episode ([\d]*)")
    return digits.astype(int)

# scikit-learn wrapper around `extract_eps_num`; the callback always reports a single
# output feature named "episode_number", whatever the input feature names are.
extract_eps_num_ft = FunctionTransformer(
    extract_eps_num, feature_names_out=lambda self, names_in: ["episode_number"]
)

# extract_eps_num_ft.fit_transform(X_train)
# extract_eps_num_ft.transform(X_test)

In [None]:
def ratio(data: Union[pd.DataFrame, np.ndarray]):
    """Compute guest popularity divided by host popularity.

    Parameters
    ----------
    data : pandas.DataFrame or numpy.ndarray
        Either a DataFrame containing ``Host_Popularity_percentage`` and
        ``Guest_Popularity_percentage``, or an ``(n_rows, 2)`` array whose columns
        are in exactly that order (host first, guest second).

    Returns
    -------
    pandas.DataFrame
        Single column ``popularity_ratio``. Division follows pandas semantics,
        so a zero or missing host popularity yields inf / NaN rather than an error.
    """
    if isinstance(data, np.ndarray):
        # Column labels must be a flat list: a nested list builds MultiIndex columns,
        # which breaks the column lookups and `.to_frame` below.
        data = pd.DataFrame(
            data,
            columns=["Host_Popularity_percentage", "Guest_Popularity_percentage"],
        )
    ratio_ = data["Guest_Popularity_percentage"] / data["Host_Popularity_percentage"]
    return ratio_.to_frame("popularity_ratio")

# scikit-learn wrapper around `ratio`; always reports one output feature, "popularity_ratio".
ratio_ft = FunctionTransformer(
    ratio, feature_names_out=lambda self, names_in: ["popularity_ratio"]
)

# ratio_ft.fit_transform(X_train)
# ratio_ft.transform(X_test)

In [None]:
def adds_per_min(data: Union[pd.DataFrame, np.ndarray]):
    """Compute the number of ads per minute of episode length.

    Parameters
    ----------
    data : pandas.DataFrame or numpy.ndarray
        Either a DataFrame containing ``Episode_Length_minutes`` and ``Number_of_Ads``,
        or an ``(n_rows, 2)`` array whose columns are in exactly that order
        (episode length first, number of ads second).

    Returns
    -------
    pandas.DataFrame
        Single column ``adds_per_min``. Division follows pandas semantics,
        so a zero or missing episode length yields inf / NaN rather than an error.
    """
    if isinstance(data, np.ndarray):
        # Column labels must be a flat list: a nested list builds MultiIndex columns,
        # which breaks the column lookups and `.to_frame` below.
        data = pd.DataFrame(
            data,
            columns=["Episode_Length_minutes", "Number_of_Ads"],
        )
    adds_per_min_ = data["Number_of_Ads"] / data["Episode_Length_minutes"]
    return adds_per_min_.to_frame("adds_per_min")

# scikit-learn wrapper around `adds_per_min`; always reports one output feature, "adds_per_min".
adds_per_min_ft = FunctionTransformer(
    adds_per_min, feature_names_out=lambda self, names_in: ["adds_per_min"]
)

# adds_per_min_ft.fit_transform(X_train)
# adds_per_min_ft.transform(X_test)

In [None]:
class PodcastMetaFeatures(BaseEstimator, TransformerMixin):
    """Turn group-level summary statistics of a numeric column into features.

    ``fit`` aggregates ``value_col`` for every combination of ``group_cols`` and
    also over the whole fitting table. ``transform`` looks up the statistics of
    each row's group and falls back to the global statistics wherever the lookup
    leaves a NaN (e.g. a group that was not present during ``fit``).

    Parameters
    ----------
    value_col : str, default="Episode_Length_minutes"
        Numeric column to summarise.
    group_cols : list of str, optional
        Columns defining the groups. ``None`` means ``["Genre", "Episode_Sentiment"]``.
    stats : list of str, optional
        Aggregation names understood by pandas ``agg``.
        ``None`` means ``["mean", "median", "std", "min", "max"]``.

    Attributes
    ----------
    podcast_stats_ : pandas.DataFrame
        Per-group statistics, indexed by the group columns, one column per statistic.
    global_stats_ : pandas.Series
        The same statistics computed over all fitting rows, indexed by statistic name.
    """

    # Defaults live in immutable tuples and are resolved lazily, so that ``__init__``
    # stores its arguments untouched (scikit-learn estimator convention) and no
    # mutable default object is shared between instances.
    _DEFAULT_GROUP_COLS = ("Genre", "Episode_Sentiment")
    _DEFAULT_STATS = ("mean", "median", "std", "min", "max")

    def __init__(
        self,
        value_col: str = "Episode_Length_minutes",
        group_cols: Optional[list] = None,
        stats: Optional[list] = None
    ):
        self.group_cols = group_cols
        self.value_col = value_col
        self.stats = stats

    def _resolved_group_cols(self) -> list:
        """Return the grouping columns as a fresh list, applying the default if unset."""
        if self.group_cols is None:
            return list(self._DEFAULT_GROUP_COLS)
        return list(self.group_cols)

    def _resolved_stats(self) -> list:
        """Return the statistic names as a fresh list, applying the default if unset/empty."""
        # An empty list falls back to the defaults, as the original `stats or [...]` did.
        if not self.stats:
            return list(self._DEFAULT_STATS)
        return list(self.stats)

    def _convert_to_df(self, X):
        """
        If `np.ndarray` is passed, assume it contains the following columns:
            - the 1st corresponds to episode length (value column),
            - other columns are categorical, to be used for grouping.
        Because of the categorical columns, the array will be `object` dtype.
        Hence, conversion of the `value_col` to float is needed.

        A copy is always returned, so the caller's data is never modified.
        """
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(
                data=X, columns=[self.value_col, *self._resolved_group_cols()]
            )
            X[self.value_col] = X[self.value_col].astype(float)
        return X.copy()

    def fit(self, X, y=None):
        """Learn per-group and global statistics of ``value_col``; ``y`` is ignored."""
        stats = self._resolved_stats()
        df = self._convert_to_df(X)
        grouped = df.groupby(self._resolved_group_cols())[self.value_col]

        # Statistics per combination of the grouping columns
        self.podcast_stats_ = grouped.agg(stats)

        # Global fallback statistics over all rows
        self.global_stats_ = df[self.value_col].agg(stats)
        return self

    def transform(self, X):
        """Return one feature column per statistic for every row of ``X``."""
        check_is_fitted(self)
        stats = self._resolved_stats()
        group_cols = self._resolved_group_cols()
        df = self._convert_to_df(X)
        # Only the grouping columns are needed for the lookup. Selecting the statistic
        # columns explicitly (instead of dropping the known inputs) keeps the output
        # width correct even when X carries additional columns.
        merged = df[group_cols].merge(
            self.podcast_stats_,
            left_on=group_cols,
            right_index=True,
            how="left",
        )
        # `global_stats_` is keyed by statistic name, so each column gets its own fallback.
        out = merged[stats].fillna(self.global_stats_)
        out.columns = self.get_feature_names_out()
        return out

    def get_feature_names_out(self, input_features=None):
        """Return the output names, ``"<value_col>_<stat>"`` per statistic."""
        return [f"{self.value_col}_{stat}" for stat in self._resolved_stats()]

In [None]:
# Quick manual check of the transformer: fit on the training columns, transform the test columns.
# `df1` is not used by any later cell in this file.
pmf = PodcastMetaFeatures()
pmf.fit(X_train[["Episode_Length_minutes", "Genre", "Episode_Sentiment"]])
df1 = pmf.transform(X_test[["Episode_Length_minutes", "Genre", "Episode_Sentiment"]])

In [None]:
def pass_through(data: Union[pd.DataFrame, np.ndarray]):
    """Identity function: return `data` unchanged (same object, no copy)."""
    return data

# Transformer that forwards its input columns untouched and reports the input
# feature names as its output feature names.
pass_through_ft = FunctionTransformer(
    pass_through, feature_names_out=lambda self, names_in: names_in
)

# pass_through_ft.fit_transform(X_train)
# pass_through_ft.transform(X_test)

In [None]:
# Dense one-hot encoder; categories unseen during fit are ignored rather than raising.
ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
# Instantiated for experiments; its entry in the column transformer below is commented out.
pmf = PodcastMetaFeatures(stats=["std", "median"])

# Categorical columns to one-hot encode.
ohe_cols = [
    "Podcast_Name",
    "Genre", 
    "Episode_Sentiment",
    "Publication_Day",
    "Publication_Time",
    ]
# Numeric columns forwarded unchanged.
pass_through_cols = [
    "Episode_Length_minutes",
    "Host_Popularity_percentage",
    "Guest_Popularity_percentage",
    "Number_of_Ads"
]
# Feature pipeline shared by every model below. Some columns feed more than one
# transformer (e.g. the popularity columns are used for the ratio and also passed through).
col_tf = make_column_transformer(
    (extract_eps_num_ft, ["Episode_Title"]),
    (ratio_ft, ["Host_Popularity_percentage", "Guest_Popularity_percentage"]),
    (adds_per_min_ft, ["Episode_Length_minutes", "Number_of_Ads"]),
    # (pmf, ["Episode_Length_minutes", "Genre", "Episode_Sentiment"]),
    (pass_through_ft, pass_through_cols),
    (ohe, ohe_cols),
    remainder="passthrough",
    n_jobs=N_JOBS
)

# Smoke test: fit on train, transform test, then list the produced feature names
# (only the last expression is shown as the cell output).
col_tf.fit_transform(X_train)
col_tf.transform(X_test)
col_tf.get_feature_names_out()

## Modelling
---

In [None]:
# Histogram gradient-boosting regressor with hard-coded hyperparameters.
# NOTE(review): the values presumably come from an earlier tuning run — confirm;
# they differ from the commented-out HGBR `best_params` further down.
hgbr = HistGradientBoostingRegressor(
    scoring = "neg_root_mean_squared_error",
    random_state=RANDOM_STATE,
    learning_rate=0.12518740640368126,
    l2_regularization=3.4687150262266466,
    max_iter=858,
    max_depth=9,
    max_bins=222,
    min_samples_leaf=237
    )

# Feature pipeline followed by the regressor; used as a base model of the stacker below.
hgbr_pl = make_pipeline(col_tf, hgbr)

# hgbr_pl.fit(X_train, y_train)
# hgbr_pl.predict(X_test)

# cvp = cross_val_predict(
#     hgbr_pl, X_train, y_train,
#     groups=X_train["Podcast_Name"],
#     cv=GroupKFold(3),
#     n_jobs=N_JOBS,
#     verbose=2,
# )

In [None]:
# temp = X_train.copy()
# temp["residual"] = cvp - y_train

In [None]:
# group_stats = (
#     temp
#     .groupby(ohe_cols)["residual"]
#     .median()
#     .to_frame("median")
#     .assign(
#         resid_bin = lambda df: pd.qcut(df["median"], q=10, labels=False, duplicates="drop")
#     )
# )
# group_stats

In [None]:
# X_train = X_train.merge(
#     group_stats["resid_bin"],
#     left_on=ohe_cols,
#     right_index=True,
#     how="left"
# )
# X_test = X_test.merge(
#     group_stats["resid_bin"],
#     left_on=ohe_cols,
#     right_index=True,
#     how="left"
# )
# X_test["resid_bin"] = X_test["resid_bin"].fillna(0).astype(int)

In [None]:
# CatBoost pipeline with library-default hyperparameters; this is the object the
# Optuna objective tunes. It is rebuilt with `best_params` in a later cell.
cat_pl = make_pipeline(
    col_tf,
    cb.CatBoostRegressor(
        random_seed=RANDOM_STATE,
        verbose=False,
        task_type=TASK_TYPE
    )
)

# cat_pl.fit(X_train, y_train)
# cat_pl.predict(X_test)

In [None]:
# # HGBR
# def objective(trial):
#     learning_rate = trial.suggest_float("lr", 1e-2, 1, log=True)
#     l2_regularization = trial.suggest_float("l2_regularization", 0, 1e2, log=False)
#     max_iter = trial.suggest_int("max_iter", 10, 1000, log=False)
#     max_depth = trial.suggest_int("max_depth", 2, 12, log=False)
#     max_bins = trial.suggest_int("max_bins", 32, 255, log=False)
#     min_samples_leaf = trial.suggest_int("min_samples_leaf", 2, 300, log=False)
#     hgbr_pl.set_params(
#         histgradientboostingregressor__learning_rate=learning_rate,
#         histgradientboostingregressor__l2_regularization=l2_regularization,
#         histgradientboostingregressor__min_samples_leaf=min_samples_leaf,
#         histgradientboostingregressor__max_depth=max_depth,
#         histgradientboostingregressor__max_iter=max_iter,
#         histgradientboostingregressor__max_bins=max_bins
#     )
#     cv_scores = cross_val_score(
#         hgbr_pl, X_train, y_train,
#         groups=X_train["Podcast_Name"],
#         scoring="neg_root_mean_squared_error",
#         cv=GroupKFold(),
#         n_jobs=N_JOBS,
#         )
#     return - cv_scores.mean()

In [None]:
def objective(trial):
    """Optuna objective: grouped-CV RMSE of the CatBoost pipeline for one sampled configuration.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the CatBoost hyperparameters.

    Returns
    -------
    float
        Mean RMSE over the ``GroupKFold`` splits (groups = ``Podcast_Name``); lower is better.
    """
    # Local import: `clone` is not among the notebook's top-level imports.
    from sklearn.base import clone

    # `suggest_float` replaces the deprecated `suggest_uniform` / `suggest_loguniform`;
    # names and ranges are unchanged.
    params = {
        "catboostregressor__iterations": trial.suggest_int("iterations", 10, 1000),
        "catboostregressor__depth": trial.suggest_int("depth", 1, 8),
        "catboostregressor__learning_rate": trial.suggest_float("learning_rate", 0.01, 1.0, log=True),
        "catboostregressor__random_strength": trial.suggest_float("random_strength", 1e-9, 10.0, log=True),
        "catboostregressor__bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 1.0),
        "catboostregressor__border_count": trial.suggest_int("border_count", 1, 255),
        "catboostregressor__l2_leaf_reg": trial.suggest_int("l2_leaf_reg", 2, 30),
    }
    # Configure a private copy instead of mutating the shared `cat_pl`: the study is run
    # with several concurrent trials, and in-place `set_params` on one shared pipeline
    # would let one trial overwrite another trial's hyperparameters before scoring.
    candidate = clone(cat_pl).set_params(**params)
    cv_scores = cross_val_score(
        candidate, X_train, y_train,
        groups=X_train["Podcast_Name"],
        scoring="neg_root_mean_squared_error",
        cv=GroupKFold(),
        n_jobs=N_JOBS,
        )
    # The scorer returns negated RMSE; flip the sign so the study can minimise it.
    return - cv_scores.mean()

In [None]:
# TPE sampler configured with 25 start-up trials; the study minimises the objective (RMSE).
# NOTE(review): no seed is passed to the sampler, so the search is presumably not
# reproducible even though RANDOM_STATE is logged as the run seed — consider seeding it.
sampler=TPESampler(n_startup_trials=25)
study=optuna.create_study(direction="minimize", sampler=sampler)

In [None]:
if OPTIMIZE_BABY_OPTIMIZE:
    # Authenticate with the API key stored in Kaggle secrets and open a run that
    # records WANDB_CONFIG as its configuration.
    user_secrets = UserSecretsClient()
    wandb.login(key=user_secrets.get_secret("WANDB_API_KEY"))
    wandb.init(project="kaggle_ps_s5_e4", entity="semyonb_kaggle", config=WANDB_CONFIG)

    # Optuna callback that reports each trial's objective value under the name "rmse".
    # NOTE(review): project/entity are given both to `wandb.init` above and here via
    # `wandb_kwargs` — confirm this does not open a second run.
    wandb_callback = WeightsAndBiasesCallback(
        metric_name="rmse",
        wandb_kwargs={"project": "kaggle_ps_s5_e4", "entity":"semyonb_kaggle"}
    )

In [None]:
if OPTIMIZE_BABY_OPTIMIZE:
    # Run the search: at most 50 trials or 11 hours (timeout is in seconds), whichever
    # comes first. The trial count is hard-coded here and duplicated in
    # WANDB_CONFIG["optimization"]["n_trials"]; keep the two in sync.
    study.optimize(
        objective, n_trials=50, timeout=11 * 3600, n_jobs=N_JOBS,
        show_progress_bar=True,
        callbacks=[wandb_callback]
    )
    wandb.finish()

In [None]:
# Pick the CatBoost hyperparameters: freshly optimised ones when the search ran in this
# session, otherwise the hard-coded values below. The keys are plain CatBoostRegressor
# argument names, since `best_params` is unpacked into its constructor in the next cell.
if OPTIMIZE_BABY_OPTIMIZE:
    best_params = study.best_params
    print("Best parameters for current outer fold:", best_params)

    # Set the pipeline with the best found hyperparameters.
else:
    # # HGBR
    # best_params = {
    #     'lr': 0.24417156591631992,
    #     'l2_regularization': 78.61745467504426,
    #     'max_iter': 845,
    #     'max_depth': 6,
    #     'max_bins': 203,
    #     'min_samples_leaf': 279
    # }
    # NOTE(review): presumably the result of an earlier optimisation run — confirm.
    best_params = {
        'iterations': 996,
        'depth': 8,
        'learning_rate': 0.122876252138869,
        'random_strength': 6.79840365046226e-07,
        'bagging_temperature': 0.010319960715253762,
        'border_count': 254,
        'l2_leaf_reg': 17
    }


In [None]:
# hgbr_pl.set_params(
#         histgradientboostingregressor__learning_rate=best_params["lr"],
#         histgradientboostingregressor__l2_regularization=best_params["l2_regularization"],
#         histgradientboostingregressor__max_iter=best_params["max_iter"],
#         histgradientboostingregressor__max_depth=best_params["max_depth"],
#         histgradientboostingregressor__max_bins=best_params["max_bins"],
#         histgradientboostingregressor__min_samples_leaf=best_params["min_samples_leaf"]
#     )

# Rebuild the CatBoost pipeline from scratch with the selected hyperparameters;
# this replaces the default-parameter `cat_pl` defined earlier.
cat_pl = make_pipeline(
    col_tf,
    cb.CatBoostRegressor(
        random_seed=RANDOM_STATE,
        verbose=False,
        task_type=TASK_TYPE,
        **best_params
    )
)

In [None]:
# if OPTIMIZE_BABY_OPTIMIZE:
#     optuna.visualization.plot_parallel_coordinate(study)
#     optuna.visualization.plot_optimization_history(study)

In [None]:
# hgbr_pl.fit(X_train, y_train)
# Fit the tuned CatBoost pipeline on the full training set.
# NOTE(review): the stacking model below is fitted separately on the same data, so this
# standalone fit presumably only matters if `cat_pl` is used on its own — confirm.
cat_pl.fit(X_train, y_train)

In [None]:
from sklearn.ensemble import StackingRegressor

In [None]:
# Stack the two pipelines; no final estimator or cv is specified, so the
# StackingRegressor defaults apply.
# NOTE(review): the default internal CV is presumably not grouped by Podcast_Name,
# unlike the GroupKFold used for tuning — confirm this is acceptable.
boss_pl = StackingRegressor([("Hist", hgbr_pl), ("Cat", cat_pl)])
boss_pl.fit(X_train, y_train)

## Diagnostics
---

In [None]:
# cvs = cross_val_score(
#     cat_pl, X_train, y_train,
#     groups=X_train["Podcast_Name"],
#     scoring="neg_root_mean_squared_error",
#     cv=GroupKFold(3),
#     n_jobs=N_JOBS,
#     verbose=2
# )
# cvs

In [None]:
# def group_cv():
#     cv = GroupKFold()
#     groups = X_train["Podcast_Name"]
#     for train_idx, test_idx in cv.split(X_train, y_train, groups=groups):
#         yield train_idx, test_idx
# cv = group_cv()

In [None]:
# cv_report = skore.CrossValidationReport(cat_pl, X_train, y_train, cv, n_jobs=N_JOBS)
# cv_report.help()

In [None]:
# cv_report.metrics.report_metrics(aggregate=None)

In [None]:
# cv_pred_err_plot = cv_report.metrics.prediction_error(data_source="train")

In [None]:
# cv_pred_err_plot.plot(kind="actual_vs_predicted")

In [None]:
# cv_pred_err_plot.plot(kind="residual_vs_predicted")

In [None]:
# est_rep_split1 = cv_report.estimator_reports_[1]
# perm_imp_split1 = est_rep_split1.feature_importance.permutation(
#     scoring="rmse", seed=RANDOM_STATE, n_jobs=N_JOBS
# )
# perm_imp_split1.T.boxplot(vert=False)

In [None]:
# Coefficients of the stacker's fitted final estimator, i.e. how it weights the base
# models (listed as "Hist", then "Cat" when the stacker was built).
boss_pl.final_estimator_.coef_

## Submission
---

In [None]:
def test_vote(model, X_train, y_train, X_test, cv):
    """Average test predictions of `model` refitted on every CV training split.

    Parameters
    ----------
    model : estimator
        Object with ``fit(X, y)`` and ``predict(X)``. A deep copy is fitted, so the
        caller's instance is left untouched.
    X_train : pandas.DataFrame
        Training features; must contain ``Podcast_Name``, which is passed to the
        splitter as ``groups``.
    y_train : pandas.Series
        Training target, row-aligned with ``X_train``.
    X_test : pandas.DataFrame
        Rows to predict.
    cv : splitter
        Object whose ``split(X, y, groups=...)`` yields (train, validation) positions.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the per-split predictions for ``X_test``.
    """
    model_ = deepcopy(model)
    groups = X_train["Podcast_Name"]
    fold_preds = []
    for train_idx, _ in cv.split(X_train, y_train, groups=groups):
        # Splitters yield positional indices, so select by position (`.iloc`).
        # Label-based `.loc` only works when the index happens to be 0..n-1.
        model_.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
        fold_preds.append(model_.predict(X_test))

    return np.mean(fold_preds, axis=0)


# The name matches pytest's `test_*` discovery pattern; opt out so this helper is never
# collected as a test if the code is imported by a test module.
test_vote.__test__ = False

In [None]:
# preds = test_vote(boss_pl, X_train, y_train, X_test, GroupKFold())

In [None]:
# Fill the sample-submission template with the stacker's predictions and write it out.
# The trailing `# preds` marks the alternative source: the fold-averaged predictions
# from the commented-out `test_vote` call above.
y_test["Listening_Time_minutes"] = boss_pl.predict(X_test)  # preds
y_test.to_csv('submission.csv', index=False)
y_test.head()

## Graveyard
---
### Quick Encoding with TV
* OHE raises warning
* Need new feature creation

In [None]:
# tv = skrub.TableVectorizer(
#     specific_transformers=[(extract_eps_num_ft, ["Episode_Title"])],
#     cardinality_threshold=50  # apply one-hot encoding to podcast name
# )

# tv.fit_transform(X_train)
# tv.transform(X_test)

### Reviewing CV Schemes

In [None]:
# cv = KFold(10)  # GroupKFold()
# cvs = cross_val_score(
#     hgbr_pl, X_train, y_train,
#     # groups=X_train["Genre"],
#     scoring="neg_root_mean_squared_error",
#     cv=cv,
#     n_jobs=-1,
#     verbose=2
# )

# cv = GroupKFold(10)
# cvs_group_by_genre = cross_val_score(
#     hgbr_pl, X_train, y_train,
#     groups=X_train["Genre"],
#     scoring="neg_root_mean_squared_error",
#     cv=cv,
#     n_jobs=-1,
#     verbose=2
# )

# # Pick this one
# cvs_group_by_name = cross_val_score(
#     hgbr_pl, X_train, y_train,
#     groups=X_train["Podcast_Name"],
#     scoring="neg_root_mean_squared_error",
#     cv=cv,
#     n_jobs=-1,
#     verbose=2
# )


# train_groups = X_train["Podcast_Name"] + " - " + X_train["Genre"]  # 407 unique groups
# cvs_group_by_name_genre = cross_val_score(
#     hgbr_pl, X_train, y_train,
#     groups=train_groups,
#     scoring="neg_root_mean_squared_error",
#     cv=cv,
#     n_jobs=-1,
#     verbose=2
# )


# cv_results = pd.DataFrame(
#     {
#         "base": - cvs,
#         "groupby_genre": - cvs_group_by_genre,
#         "groupby_name": - cvs_group_by_name,
#         "groupby_name_genre": - cvs_group_by_name_genre
#     }
# )
# ax = sns.boxplot(
#     cv_results.melt(), y="variable", x="value", orient="horizontal"
# )
# sns.swarmplot(
#     cv_results.melt(), y="variable", x="value", orient="horizontal", ax=ax, color="k"
# )
# cv_results.mean()

# cv_results.to_csv("cv_results.csv", index=False)