### Импорты

In [1]:
%load_ext jupyter_black

In [2]:
from graphviz import Digraph
from IPython.display import display
from typing import Any, Callable, Dict, List, Union
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, TargetEncoder
from sklearn.tree import DecisionTreeRegressor as SDecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor as SGradientBoostingRegressor
import os
import matplotlib.pyplot as plt
import numpy as np
import numpy.typing as npt
import pandas as pd

In [3]:
def root_mean_squared_error(x1: npt.ArrayLike, x2: npt.ArrayLike) -> float:
    return np.sqrt(np.power(x1 - x2, 2).mean())

### Дерево регрессии

In [4]:
class DecisionTreeRegressor(BaseEstimator, RegressorMixin):
    """
    Decision tree regressor, something to build gradient boosting algorithms off of.
    """

    SPLIT_CRITERIA = {
        "mae": lambda y_true, y_pred: np.mean(np.abs(y_true - y_pred)),
        "mse": lambda y_true, y_pred: np.mean((y_true - y_pred) ** 2),
    }

    def __init__(
        self,
        max_depth: int = None,
        min_samples_split: int = None,
        min_samples_leaf: int = None,
        criterion: str = None,
    ) -> None:
        if criterion is None:
            self.criterion = DecisionTreeRegressor.SPLIT_CRITERIA["mse"]
        else:
            self.criterion = DecisionTreeRegressor.SPLIT_CRITERIA[criterion.lower()]
        self.max_depth = -1 if max_depth is None else max_depth
        self.min_samples_split = 2 if min_samples_split is None else min_samples_split
        self.min_samples_leaf = 1 if min_samples_leaf is None else min_samples_leaf

    def fit(
        self,
        X: npt.ArrayLike,
        y: npt.ArrayLike,
        features: list = None,
        cat_features: list = None,
    ) -> None:
        """Fit the decision tree to passed data."""

        if isinstance(X, pd.DataFrame) or isinstance(X, pd.Series):
            X = X.to_numpy()
            y = y.to_numpy()

        if features is None:
            self.features_ = [f"x_{i}" for i in range(X.shape[1])]
        else:
            self.features_ = features

        if cat_features is None:
            self.cat_features_ = []
        else:
            self.cat_features_ = cat_features

        self.tree_ = DecisionTreeRegressor.build_tree_(
            X,
            y,
            self.features_,
            depth=0,
            max_depth=self.max_depth,
            min_samples_leaf=self.min_samples_leaf,
            min_samples_split=self.min_samples_split,
            cat_features=self.cat_features_,
            criterion=self.criterion,
        )

        self.n_leaves_ = DecisionTreeRegressor.count_leaves_(self.tree_)

    def predict(self, X: npt.ArrayLike) -> npt.ArrayLike:
        """Predict class labels for given data."""
        ans = np.zeros((X.shape[0],))
        for i in range(ans.shape[0]):
            node = self.tree_
            while not node["is_leaf"]:
                # Get the index a the feature on which the split was performed
                feature_idx = self.features_.index(node["feature"])

                if node["feature"] in self.cat_features_:
                    # If the category has not appeared in training set, the tree
                    # traversal is terminated and the current node value is used
                    if node["children"].get(X[i, feature_idx], False):
                        node = node["children"].get(X[i, feature_idx], False)
                    else:
                        break
                else:
                    if X[i, feature_idx] <= node["threshold"]:
                        node = node["children"]["lower"]
                    else:
                        node = node["children"]["upper"]

            ans[i] = node["value"]
        return ans

    def get_n_leaves(self):
        return self.n_leaves_

    # =============================================================================
    # Tree construction
    # =============================================================================

    def build_tree_(
        X: npt.ArrayLike,
        y: npt.ArrayLike,
        features: list,
        depth: int,
        max_depth: int,
        min_samples_split: int,
        min_samples_leaf: int,
        cat_features: list,
        criterion: Callable,
    ) -> Dict[str, Any]:
        """Recursively build a regression tree."""

        default_value = np.mean(y)

        # Terminate if there are no more features to split on
        if X.shape[1] == 0 or len(features) == 0:
            return {"value": default_value, "depth": depth, "is_leaf": True}

        # Terminate if all the targets are duplicates of eachother
        if np.unique(y).shape[0] == 1:
            return {"value": default_value, "depth": depth, "is_leaf": True}

        # Terminate if all the datapoints are duplicates of eachother
        if np.unique(X, axis=0).shape[0] == 1:
            return {"value": default_value, "depth": depth, "is_leaf": True}

        # Terminate if node does not contain enough elements to split
        if y.shape[0] < min_samples_split:
            return {"value": default_value, "depth": depth, "is_leaf": True}

        # Terminate if max tree depth is reached
        if depth == max_depth:
            return {"value": default_value, "depth": depth, "is_leaf": True}

        best_id, best_feature, threshold = DecisionTreeRegressor.select_best_feature_(
            X, y, features, cat_features, criterion
        )
        new_features = [feature for feature in features if feature != best_feature]

        tree = {
            "depth": depth,
            "feature": best_feature,
            "is_leaf": False,
            "value": default_value,
            "children": {},
        }

        if best_feature in cat_features:
            categories = np.unique(X[:, best_id])

            for category in categories:

                mask = X[:, best_id] == category

                X_sub, y_sub = (
                    np.delete(X, best_id, axis=1)[mask],
                    y[mask],
                )

                # If there is not enough samples to make a leaf the split is aborted and
                # the node is considered a leaf
                if len(y_sub) < min_samples_leaf:
                    tree["is_leaf"] = True
                    tree["children"] = {}
                    return tree

                tree["children"][category] = DecisionTreeRegressor.build_tree_(
                    X_sub,
                    y_sub,
                    new_features,
                    depth=depth + 1,
                    max_depth=max_depth,
                    min_samples_leaf=min_samples_leaf,
                    min_samples_split=min_samples_split,
                    cat_features=cat_features,
                    criterion=criterion,
                )
        else:
            mask_left = X[:, best_id] <= threshold
            mask_right = X[:, best_id] > threshold

            tree["threshold"] = threshold

            # If there is not enough samples to make a leaf the split is aborted and
            # the node is considered a leaf
            if (
                mask_left.sum() < min_samples_leaf
                or mask_right.sum() < min_samples_leaf
            ):
                tree["is_leaf"] = True
                return tree

            X_sub = np.delete(X, best_id, axis=1)

            tree["children"]["lower"] = DecisionTreeRegressor.build_tree_(
                X_sub[mask_left],
                y[mask_left],
                new_features,
                depth=depth + 1,
                max_depth=max_depth,
                min_samples_leaf=min_samples_leaf,
                min_samples_split=min_samples_split,
                cat_features=cat_features,
                criterion=criterion,
            )

            tree["children"]["upper"] = DecisionTreeRegressor.build_tree_(
                X_sub[mask_right],
                y[mask_right],
                new_features,
                depth=depth + 1,
                max_depth=max_depth,
                min_samples_leaf=min_samples_leaf,
                min_samples_split=min_samples_split,
                cat_features=cat_features,
                criterion=criterion,
            )
        return tree

    def feature_score_(
        X: npt.ArrayLike,
        y: npt.ArrayLike,
        feature_idx: int,
        is_cat_feature: bool,
        criterion: Callable,
    ) -> float:
        """Calculate score for a given feature."""

        if is_cat_feature is None or is_cat_feature == False:

            uniques = np.unique(X[:, feature_idx])

            # Splits are not done if all the feature values are the same
            if uniques.shape[0] == 1:
                return {"value": np.inf, "threshold": None}

            thresholds = [
                0.5 * (curr + prev) for prev, curr in zip(uniques, uniques[1:])
            ]
            split_scores = []

            for theta in thresholds:

                mask_left = X[:, feature_idx] <= theta
                mask_right = X[:, feature_idx] > theta

                # Elements lower and higher than threshold are compared to their respective means
                y_pred = np.where(
                    mask_left, np.mean(y[mask_left]), np.mean(y[mask_right])
                )
                split_scores += [criterion(y_pred, y)]

            best_split_id = np.argmin(split_scores)
            best_threshold = thresholds[best_split_id]
            return {"value": split_scores[best_split_id], "threshold": best_threshold}

        else:
            categories = np.unique(X[:, feature_idx])
            y_pred = np.zeros_like(y)

            for category in categories:

                mask = X[:, feature_idx] == category
                # Elements in each(surviving!) category are compared to their respective means
                y_pred = np.where(mask, np.mean(y[mask]), y_pred)

            score = criterion(y_pred, y)
            return {"value": score, "threshold": None}

    def select_best_feature_(
        X: npt.ArrayLike,
        y: npt.ArrayLike,
        features: list,
        cat_features: list,
        criterion: Callable,
    ) -> list:
        """Select the feature with the highest information gain."""
        scores = [
            DecisionTreeRegressor.feature_score_(
                X, y, i, feature in cat_features, criterion
            )
            for i, feature in enumerate(features)
        ]

        best_idx = np.argmin([score["value"] for score in scores])
        return [best_idx, features[best_idx], scores[best_idx]["threshold"]]

    # =============================================================================
    # Tree pruning
    # =============================================================================

    def count_leaves_(tree: Dict) -> int:
        """Count the number of leaf nodes in a (sub)tree."""
        if tree["is_leaf"]:
            return 1

        total_leaves = 0
        for child in tree["children"]:
            total_leaves += DecisionTreeRegressor.count_leaves_(tree["children"][child])

        return total_leaves

    # =============================================================================
    # Tree visualization
    # =============================================================================

    def show_tree(self):
        """Visualize the decision tree."""
        dot = DecisionTreeRegressor.visualize_tree_(self.tree_, self.features_)
        display(dot)

    def visualize_tree_(
        tree: Dict[str, Any],
        feature_names: list,
        dot: Digraph = None,
        parent: str = None,
        edge_label: str = None,
    ) -> Digraph:
        """Recursively visualize the decision tree using Graphviz."""
        if dot is None:
            dot = Digraph(comment="Decision Tree Regessor")

        # Create a unique node ID
        node_id = str(id(tree))

        # Add the current node
        if tree["is_leaf"]:
            node_label = f"Value: {tree['value']:0.2f}"
        else:
            node_label = f"Feature: {tree['feature']}"
            if tree.get("threshold", False):
                node_label += f"<={tree['threshold']:0.2f}"
        dot.node(node_id, node_label)

        # Connect to parent node if exists
        if parent is not None:
            dot.edge(parent, node_id, label=edge_label)

        # Recursively add children
        if "children" in tree:
            for value, child in tree["children"].items():
                DecisionTreeRegressor.visualize_tree_(
                    child,
                    feature_names,
                    dot,
                    node_id,
                    str(value) if not isinstance(value, str) else value,
                )

        return dot

### Градиентный бустинг

In [5]:
class GradientBoostingRegressor(BaseEstimator, RegressorMixin):

    def __init__(
        self,
        learning_rate: float = None,
        max_depth: int = None,
        n_estimators: int = None,
        min_samples_split: int = None,
        min_samples_leaf: int = None,
        criterion: str = None,
        tol: float = None,
    ) -> None:
        self.criterion = criterion
        self.tol = 1e-4 if tol is None else tol
        self.max_depth = 4 if max_depth is None else max_depth
        self.n_estimators = 100 if n_estimators is None else n_estimators
        self.learning_rate = 0.1 if learning_rate is None else learning_rate
        self.min_samples_split = 2 if min_samples_split is None else min_samples_split
        self.min_samples_leaf = 1 if min_samples_leaf is None else min_samples_leaf

    def fit(
        self,
        X: npt.ArrayLike,
        y: npt.ArrayLike,
    ) -> None:
        """Fit the regressor to data. Note that it requires the categorical features
        to be already encoded since otherwise the won't be processed.
        This is done in order to reduce the tree sizes and increase learning speeds."""
        self.trees_ = [
            DecisionTreeRegressor(
                max_depth=0,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                criterion=self.criterion,
            )
        ]
        self.trees_[0].fit(X, y)
        curr_guess = self.trees_[0].predict(X)
        prev_guess = np.copy(y)
        residue = np.sqrt(np.power((prev_guess - curr_guess), 2).sum())
        num_estimators = 1

        while num_estimators < self.n_estimators:
            if residue < self.tol:
                break

            self.trees_ += [
                DecisionTreeRegressor(
                    max_depth=self.max_depth,
                    min_samples_split=self.min_samples_split,
                    min_samples_leaf=self.min_samples_leaf,
                    criterion=self.criterion,
                )
            ]
            self.trees_[-1].fit(X, y - curr_guess, cat_features=cat_features)
            prev_guess = np.copy(curr_guess)
            curr_guess += self.learning_rate * self.trees_[-1].predict(X)
            residue = np.sqrt(np.power((prev_guess - curr_guess), 2).sum())
            num_estimators += 1

    def predict(self, X: npt.ArrayLike) -> npt.ArrayLike:
        ans = self.trees_[0].predict(X)
        for tree in self.trees_[1:]:
            ans += tree.predict(X) * self.learning_rate
        return ans

### Сравнение различных методов на датасете

Рассмотрим признаки, имеющиеся в датасете:

In [None]:
df = pd.read_csv(os.path.join(os.getcwd(), "data", "spotify-songs", "train.csv"))
df.columns

Удалим признаки с дублирующие информацию или не несущие практической пользы:

In [None]:
drop_list = [
    "type",
    "track_href",
    "track_href",
    "uri",
    "track_album_name",
    "analysis_url",
    "track_id",
    "track_name",
    "track_artist",
    "track_album_id",
    "track_album_release_date",
    "id",
    "playlist_id",
    "playlist_name",
]

df = df.drop(
    drop_list + ["Unnamed: 0"],
    axis=1,
)
cat_features = [
    "playlist_genre",
    "time_signature",
    "playlist_subgenre",
    "mode",
    "key",
]

In [None]:
df.head()

Очистим датасет от пустых и повторяющихся строк:

In [None]:
print("Размер датасета до чистки: ", df.shape)
df = df.dropna()
print("Размер датасета после удаления пустых строк: ", df.shape)
df = df.drop_duplicates()
print("Размер датасета после удаления повторяющихся строк: ", df.shape)

Из-за ограничений имплементации дерева регрессии, созданного в рамках данной работы, категориальные признаки не получится использовать напрямую --- их придется кодировать ординальным кодированием (меняем `str` на `int`, при этом `int`'ы восприниматся как названия категорий, не их порядок/номер). Также полезным будет убрать значения категориальных признаков, которые встречаются крайне редко($\leq 2$ раз на всем сете): 

In [None]:
X, y = (df.drop("popularity", axis=1), df["popularity"])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=12389014
)

# Change unpopular values to "other"
for feature in cat_features:
    for val, count in X_train[feature].value_counts().items():
        if count <= 2:
            X_train.loc[X_train[feature] == val, feature] = "other"
            X_test.loc[X_test[feature] == val, feature] = "other"

encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
for feature in cat_features:
    encoder.fit(X_train[[feature]])
    X_train[[feature]] = encoder.transform(X_train[[feature]])
    X_test[[feature]] = encoder.transform(X_test[[feature]])

X_train.head()

In [None]:
X_test.head()

In [None]:
regressor = DecisionTreeRegressor(
    max_depth=200, min_samples_leaf=3, min_samples_split=10
)
sregressor = SDecisionTreeRegressor(
    max_depth=200, min_samples_leaf=3, min_samples_split=10
)

regressor.fit(
    X_train.to_numpy(),
    y_train.to_numpy().flatten(),
    features=list(X_train.columns),
    cat_features=cat_features,
)

sregressor.fit(
    X_train,
    y_train,
)

print(
    f"RMSE(dummy regressor): {root_mean_squared_error(y_test, np.ones_like(y_test)*y_train.mean()):0.4f}"
)
print(
    f"RMSE(DecisionTreeRegressor): {root_mean_squared_error(y_test, regressor.predict(X_test.to_numpy())):0.4f}"
)
print(
    f"RMSE(Sklearn DecisionTreeRegressor): {root_mean_squared_error(y_test, sregressor.predict(X_test)):0.4f}"
)

### Поиск лучших параметров для регрессионного дерева

In [6]:
df = pd.read_csv(os.path.join(os.getcwd(), "data", "spotify-songs", "train.csv"))

drop_list = [
    "type",
    "track_href",
    "track_href",
    "uri",
    "track_album_name",
    "analysis_url",
    "track_id",
    "track_name",
    "track_artist",
    "track_album_id",
    "track_album_release_date",
    "id",
    "playlist_id",
    "playlist_name",
]

df = df.drop(
    drop_list + ["Unnamed: 0"],
    axis=1,
)
cat_features = [
    "playlist_genre",
    "time_signature",
    "playlist_subgenre",
    "mode",
    "key",
]
num_features = [
    col for col in df.columns if (not col in cat_features) and col != "popularity"
]
df = df.dropna()
df = df.drop_duplicates()
X, y = (df.drop("popularity", axis=1), df["popularity"])

n_folds = 2
kf = KFold(n_splits=n_folds, shuffle=True)
scores = []

enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
params_grid = [
    {"max_depth": 200, "min_samples_split": 2, "min_samples_leaf": 1},
    {"max_depth": 200, "min_samples_split": 6, "min_samples_leaf": 5},
    {"max_depth": 200, "min_samples_split": 10, "min_samples_leaf": 1},
    {"max_depth": 200, "min_samples_split": 10, "min_samples_leaf": 2},
    {"max_depth": 200, "min_samples_split": 10, "min_samples_leaf": 3},
    {"max_depth": 100, "min_samples_split": 10, "min_samples_leaf": 3},
    {"max_depth": 50, "min_samples_split": 10, "min_samples_leaf": 3},
    {"max_depth": 10, "min_samples_split": 10, "min_samples_leaf": 3},
    {"max_depth": 200, "min_samples_split": 10, "min_samples_leaf": 4},
    {"max_depth": 200, "min_samples_split": 12, "min_samples_leaf": 5},
    {"max_depth": 200, "min_samples_split": 15, "min_samples_leaf": 5},
]

for params in params_grid:
    dtree = DecisionTreeRegressor(**params)
    local_scores = np.zeros(n_folds)
    for idx, (train_ids, val_ids) in enumerate(kf.split(df)):

        X_train, X_val, y_train, y_val = (
            X.iloc[train_ids].copy(),
            X.iloc[val_ids].copy(),
            y.iloc[train_ids].copy(),
            y.iloc[val_ids].copy(),
        )

        for feature in cat_features:
            for val, count in X_train[feature].value_counts().items():
                if count <= 2:
                    X_train.loc[X_train[feature] == val, feature] = "other"
                    X_val.loc[X_val[feature] == val, feature] = "other"

        enc.fit(X_train[cat_features])
        X_train[cat_features] = enc.transform(X_train[cat_features])
        X_val[cat_features] = enc.transform(X_val[cat_features])

        dtree.fit(
            X_train.to_numpy(),
            y_train.to_numpy(),
            features=list(X.columns),
            cat_features=cat_features,
        )

        local_scores[idx] = root_mean_squared_error(
            y_val, dtree.predict(X_val.to_numpy())
        )

    print(
        f"Scores for {params}:\n\t{local_scores}\nMean score: {local_scores.mean():0.4f}\tstd: {local_scores.std():0.4f}\n\n"
    )
    scores += [local_scores.mean()]

best_tree_params = params_grid[np.argmin(scores)]
print(f"Best parameter set for a RegressionTree: {best_tree_params}")

Scores for {'max_depth': 200, 'min_samples_split': 2, 'min_samples_leaf': 1}:
	[15.14138396 14.56156765]
Mean score: 14.8515	std: 0.2899


Scores for {'max_depth': 200, 'min_samples_split': 6, 'min_samples_leaf': 5}:
	[19.9463916 19.8297314]
Mean score: 19.8881	std: 0.0583


Scores for {'max_depth': 200, 'min_samples_split': 10, 'min_samples_leaf': 1}:
	[14.32201009 12.92747257]
Mean score: 13.6247	std: 0.6973


Scores for {'max_depth': 200, 'min_samples_split': 10, 'min_samples_leaf': 2}:
	[12.88444371 13.16525306]
Mean score: 13.0248	std: 0.1404


Scores for {'max_depth': 200, 'min_samples_split': 10, 'min_samples_leaf': 3}:
	[12.7806464  13.08928303]
Mean score: 12.9350	std: 0.1543


Scores for {'max_depth': 100, 'min_samples_split': 10, 'min_samples_leaf': 3}:
	[13.01920023 12.39410481]
Mean score: 12.7067	std: 0.3125


Scores for {'max_depth': 50, 'min_samples_split': 10, 'min_samples_leaf': 3}:
	[13.76449921 12.53628452]
Mean score: 13.1504	std: 0.6141


Scores for {'max_depth': 

### Реализация `K-Fold Target encoder`

In [7]:
class KFoldTargetEncoder(BaseEstimator, TransformerMixin):
    def __init__(
        self,
        cv: int = None,
        handle_unknown: str = None,
        unknown_value: float = None,
        shuffle: bool = None,
    ) -> None:
        self.encoded_missing_value_ = np.nan
        self.cv_ = 5 if cv is None else cv
        self.shuffle_ = False if shuffle is None else shuffle

        # Check if `handle_unknown` is set to a known value:
        if handle_unknown is None:
            self.handle_unknown_ = "mean"
        else:
            if handle_unknown == "mean":
                self.handle_unknown_ = "mean"
            elif handle_unknown == "use_encoded_value":
                self.handle_unknown_ = "use_encoded_value"
                self.unknown_value_ = 0.0 if unknown_value is None else unknown_value

    def fit(
        self,
        data: Union[npt.ArrayLike, pd.Series, pd.DataFrame],
        target: npt.ArrayLike,
    ):
        self.data_ = data
        self.target_ = target

        if isinstance(data, pd.DataFrame) or isinstance(data, pd.Series):
            self.data_ = self.data_.to_numpy(dtype=np.object_)

        self.transformed_data_ = np.zeros_like(self.data_, dtype=float)

        if isinstance(target, pd.DataFrame):
            self.target_ = self.target_.to_numpy(dtype=np.object_)

        if self.handle_unknown_ == "mean":
            self.unknown_value_ = self.target_.mean()

        kf = KFold(n_splits=self.cv_, shuffle=self.shuffle_)

        for train_idx, val_idx in kf.split(self.data_):

            X_train, X_val = (self.data_[train_idx, :], self.data_[val_idx, :])
            y_train = self.target_[train_idx]

            for feature in range(X_train.shape[1]):
                for unique_val in set(X_val[:, feature]):
                    train_mask = X_train[:, feature] == unique_val
                    weight = (
                        train_mask.sum()
                        * (
                            y_train[train_mask].mean()
                            if len(y_train[train_mask]) >= 1
                            else 0.0
                        )
                        + y_train.mean() * (X_train.shape[0] - train_mask.sum())
                    ) / X_train.shape[0]
                    self.transformed_data_[val_idx, feature] = np.where(
                        X_val[:, feature] == unique_val,
                        weight,
                        self.transformed_data_[val_idx, feature],
                    )

        return self

    def fit_transform(
        self,
        data: Union[npt.ArrayLike, pd.Series, pd.DataFrame],
        target: npt.ArrayLike,
    ) -> npt.ArrayLike:
        self.fit(data, target)
        return self.transformed_data_

    def transform(self, X: npt.ArrayLike) -> npt.ArrayLike:

        ans = np.zeros_like(X)

        for feature in range(self.data_.shape[1]):
            for unique_val in set(X[:, feature]):
                mask = self.data_[:, feature] == unique_val
                weight = (
                    self.unknown_value_
                    if mask.sum() == 0
                    else self.target_[mask].mean()
                )
                ans[:, feature] = np.where(
                    X[:, feature] == unique_val, weight, ans[:, feature]
                )
        return ans
        # for col in self.columns_:

        #     encoded = col + "_" + "kfold_target"
        #     col_mean = self.data_[[col, encoded]].groupby(col).mean().reset_index()

        #     replacements = {}
        #     for index, row in col_mean.iterrows():
        #         replacements[row[col]] = row[encoded]

        #     # X.insert(len(X.columns), encoded, X[col])
        #     X.insert(
        #         len(X.columns),
        #         encoded,
        #         X[col].map(
        #             lambda x: (
        #                 self.unknown_value_
        #                 if not x in replacements
        #                 else replacements[x]
        #             )
        #         ),
        #     )
        #     X[col].where(X[col].isin(replacements), other=self.unknown_value_)
        #     X = X.replace({encoded: replacements}).infer_objects(copy=False)

        #     if self.inplace_:
        #         X = X.drop([col], axis=1)

        # return X

### Тестируем бустинг с `K-Fold Target Encoder`

In [9]:
drop_list_fold = [
    "type",
    "track_href",
    "track_href",
    "uri",
    "analysis_url",
    "track_id",
    "track_album_id",
    "track_name",
    "track_album_name",
    "id",
    "playlist_id",
]

cat_features_fold = [
    "playlist_name",
    "playlist_genre",
    "track_artist",
    "time_signature",
    "playlist_subgenre",
    "track_album_release_date",
    "mode",
    "key",
]

df_train = pd.read_csv(os.path.join(os.getcwd(), "data", "spotify-songs", "train.csv"))
df_train = df_train.drop(
    drop_list_fold + ["Unnamed: 0"],
    axis=1,
)
df_train = df_train.dropna()
df_train = df_train.drop_duplicates()

df_train.loc[:, "track_album_release_date"] = (
    pd.to_datetime(
        df_train["track_album_release_date"], format="mixed", yearfirst=True
    ).dt.year
).astype(str)

gbX, gby = (df_train.drop("popularity", axis=1), df_train["popularity"])

gbX_train, gbX_test, gby_train, gby_test = train_test_split(
    gbX, gby, train_size=0.7, random_state=42
)

# encoder = TargetEncoder(cv=5, target_type="continuous")
# gbX_train.loc[:, cat_features_fold] = encoder.fit_transform(
#     gbX_train[cat_features_fold], gby_train
# )
# gbX_test.loc[:, cat_features_fold] = encoder.transform(gbX_test[cat_features_fold])
encoder = KFoldTargetEncoder(cv=5)
gbX_train.loc[:, cat_features_fold] = encoder.fit_transform(
    gbX_train[cat_features_fold].to_numpy(),
    gby_train.to_numpy(),
)
gbX_test.loc[:, cat_features_fold] = encoder.transform(
    gbX_test[cat_features_fold].to_numpy()
)
# encoder = KFoldTargetEncoder(inplace=True, n_folds=10)
# encoder.fit(gbX_train[cat_features_fold], gby_train)
# gbX_train = gbX_train.join(
#     encoder.data_.drop(cat_features_fold + ["popularity"], axis=1), how="outer"
# ).drop(cat_features_fold, axis=1)
# gbX_test = gbX_test.join(
#     encoder.transform(gbX_test[cat_features_fold]), how="outer"
# ).drop(cat_features_fold, axis=1)

In [10]:
gbX_train.head()

Unnamed: 0,energy,tempo,danceability,playlist_genre,loudness,liveness,valence,track_artist,time_signature,speechiness,playlist_name,track_album_release_date,instrumentalness,mode,key,duration_ms,acousticness,playlist_subgenre
3086,0.779,128.028,0.937,54.88526,-3.539,0.0408,0.95,54.676091,54.939152,0.329,55.083493,54.654061,6.8e-05,54.42528,54.57865,155625.0,0.22,55.083493
499,0.538,146.933,0.759,55.135882,-8.256,0.126,0.831,54.676091,54.939152,0.384,54.76064,54.391507,0.0,54.42528,54.773891,184490.0,0.151,54.76064
1353,0.878,129.996,0.521,54.407245,-4.424,0.124,0.563,54.671788,54.939152,0.0273,54.652924,54.025048,0.000299,54.926903,54.433759,219723.0,0.158,54.652924
436,0.541,140.021,0.662,54.88526,-7.236,0.208,0.167,54.676091,54.939152,0.0663,54.61174,54.025048,0.0,54.926903,54.630279,144583.0,0.449,54.683622
365,0.598,90.029,0.695,54.619229,-8.186,0.112,0.376,54.679724,54.939152,0.136,54.619229,54.654061,6e-06,54.926903,54.773891,127333.0,0.33,54.619229


In [11]:
gbX_test.head()

Unnamed: 0,energy,tempo,danceability,playlist_genre,loudness,liveness,valence,track_artist,time_signature,speechiness,playlist_name,track_album_release_date,instrumentalness,mode,key,duration_ms,acousticness,playlist_subgenre
3042,0.703,108.123,0.669,50.680851,-6.113,0.157,0.762,48.714286,55.284762,0.0581,46.984,54.842262,1.9e-05,54.581478,52.937824,164213.0,0.0788,46.984
1005,0.802,96.015,0.729,30.702479,-6.346,0.0905,0.705,54.994447,55.284762,0.179,27.833333,48.43662,4e-06,55.32618,52.907258,233177.0,0.0924,27.833333
2038,0.581,130.033,0.613,50.680851,-8.588,0.25,0.551,76.5,55.284762,0.0424,69.333333,48.43662,0.000345,54.581478,53.568182,239560.0,0.537,69.333333
3156,0.327,90.025,0.707,50.487179,-17.176,0.155,0.181,70.0,55.284762,0.0687,50.487179,54.842262,0.0148,54.581478,53.568182,176000.0,0.185,50.487179
1454,0.814,107.094,0.688,53.136364,-4.614,0.274,0.761,54.994447,55.284762,0.0452,53.136364,54.842262,0.0,55.32618,56.259494,183193.0,0.0196,62.214765


In [12]:
dt_regressor = DecisionTreeRegressor(**best_tree_params)
dt_regressor.fit(
    gbX_train.to_numpy(dtype=float),
    gby_train.to_numpy(dtype=float),
)

print(
    f"RMSE(DecisionTreeRegressor): {root_mean_squared_error(gby_test, dt_regressor.predict(gbX_test.to_numpy())):0.4f}"
)

RMSE(DecisionTreeRegressor): 19.6699


In [17]:
gbbig_regressor = GradientBoostingRegressor(
    n_estimators=25,
    max_depth=5,
    min_samples_leaf=3,
    min_samples_split=6,
    learning_rate=0.1,
    tol=1e-4,
)
gbbig_regressor.fit(
    gbX_train.to_numpy(dtype=float),
    gby_train.to_numpy(dtype=float),
)

sbig_regressor = SGradientBoostingRegressor(
    n_estimators=25,
    max_depth=5,
    min_samples_leaf=3,
    min_samples_split=6,
    learning_rate=0.1,
    tol=1e-4,
)
sbig_regressor.fit(
    gbX_train,
    gby_train,
)

print(
    f"RMSE(dummy regressor): {root_mean_squared_error(gby_test, np.ones_like(gby_test)*gby_train.mean()):0.4f}"
)
print(
    f"RMSE(GradientBoostingRegressor): {root_mean_squared_error(gby_test, gbbig_regressor.predict(gbX_test.to_numpy(dtype=float))):0.4f}"
)
print(
    f"RMSE(GradientBoostingRegressor | train set): {root_mean_squared_error(gby_train, gbbig_regressor.predict(gbX_train.to_numpy(dtype=float))):0.4f}"
)
print(
    f"RMSE(Sklearn GradientBoostingRegressor): {root_mean_squared_error(gby_test, sbig_regressor.predict(gbX_test)):0.4f}"
)
print(
    f"RMSE(Sklearn GradientBoostingRegressor | train set): {root_mean_squared_error(gby_train, sbig_regressor.predict(gbX_train)):0.4f}"
)

RMSE(dummy regressor): 20.5391
RMSE(GradientBoostingRegressor): 16.7948
RMSE(GradientBoostingRegressor | train set): 11.5138
RMSE(Sklearn GradientBoostingRegressor): 16.6782
RMSE(Sklearn GradientBoostingRegressor | train set): 10.4195


In [None]:
len(gbbig_regressor.trees_)

### Сбор результатов на отправку

Для бустинга:

In [None]:
drop_list_gb = [
    "type",
    "track_href",
    "track_href",
    "uri",
    "analysis_url",
    "track_id",
    "track_album_id",
    "track_name",
    "track_album_name",
    "id",
    "playlist_id",
]

cat_features_gb = [
    "playlist_name",
    "playlist_genre",
    "track_artist",
    "time_signature",
    "playlist_subgenre",
    "track_album_release_date",
    "mode",
    "key",
]

df_train = pd.read_csv(os.path.join(os.getcwd(), "data", "spotify-songs", "train.csv"))
df_train = df_train.drop(
    drop_list_gb + ["Unnamed: 0"],
    axis=1,
)
df_train = df_train.dropna()
df_train = df_train.drop_duplicates()

df_test = pd.read_csv(os.path.join(os.getcwd(), "data", "spotify-songs", "test.csv"))
df_test = df_test.drop(
    drop_list_gb,
    axis=1,
)
df_test = df_test.dropna()

df_train.loc[:, "track_album_release_date"] = (
    pd.to_datetime(
        df_train["track_album_release_date"], format="mixed", yearfirst=True
    ).dt.year
).astype(str)

df_test.loc[:, "track_album_release_date"] = (
    pd.to_datetime(
        df_test["track_album_release_date"], format="mixed", yearfirst=True
    ).dt.year
).astype(str)

gbX, gby = (df_train.drop("popularity", axis=1), df_train["popularity"])

encoder = TargetEncoder(cv=5, target_type="continuous")
gbX.loc[:, cat_features_gb] = encoder.fit_transform(gbX[cat_features_fold], gby)
df_test.loc[:, cat_features_gb] = encoder.transform(df_test[cat_features_fold])

gb_regressor = GradientBoostingRegressor(
    n_estimators=100,
    max_depth=5,
    min_samples_leaf=3,
    min_samples_split=10,
    learning_rate=0.1,
    tol=1e-4,
)
gb_regressor.fit(
    gbX.to_numpy(dtype=float),
    gby.to_numpy(dtype=float),
)

y_pred = pd.DataFrame(
    gb_regressor.predict(df_test.to_numpy(dtype=float)),
    index=pd.Index(df_test.index, name="Id"),
    columns=["popularity"],
)

In [None]:
y_pred.to_csv(
    path_or_buf=os.path.join(os.getcwd(), "data", "spotify-songs", "submission.csv")
)