### Импорты

In [1]:
from typing import Any, Callable, Dict, List, Union
import numpy as np
import numpy.typing as npt
import pandas as pd

In [2]:
# Locations of the Kaggle input CSVs and of the submission file written at the
# end of the notebook.
# NOTE(review): "sporify-songs" looks like a misspelling of "spotify", but it
# presumably matches the real dataset slug -- confirm before changing the path.
train_file = "/kaggle/input/sporify-songs/train.csv"
test_file = "/kaggle/input/sporify-songs/test.csv"
submission_path = "/kaggle/working/submission.csv"

def root_mean_squared_error(x1: npt.ArrayLike, x2: npt.ArrayLike) -> float:
    """Return the root mean squared error between two equally shaped array-likes.

    Both arguments are coerced to float arrays first, so plain Python
    sequences are accepted as well as numpy arrays. Values are compared
    positionally.
    """
    diff = np.asarray(x1, dtype=float) - np.asarray(x2, dtype=float)
    return float(np.sqrt(np.mean(np.square(diff))))

### Дерево регрессии

In [3]:
class DecisionTreeRegressor():
    """
    Decision tree regressor, something to build gradient boosting algorithms off of.

    The fitted tree is stored in ``tree_`` as nested dictionaries. Every node
    has the keys ``"value"`` (mean target of the samples that reached it),
    ``"depth"`` and ``"is_leaf"``. Internal nodes additionally carry
    ``"feature"`` and ``"children"``; numeric splits also store ``"threshold"``
    and use the child keys ``"lower"`` / ``"upper"``, while categorical splits
    have one child per category value.

    The column chosen for a split is removed from the data handed to the
    children, so a feature is used at most once on any root-to-leaf path.
    """

    SPLIT_CRITERIA = {
        "mae": lambda y_true, y_pred: np.mean(np.abs(y_true - y_pred)),
        "mse": lambda y_true, y_pred: np.mean((y_true - y_pred) ** 2),
    }

    def __init__(
        self,
        max_depth: int = None,
        min_samples_split: int = None,
        min_samples_leaf: int = None,
        criterion: str = None,
    ) -> None:
        """Store the hyper-parameters.

        Args:
            max_depth: Depth at which growth stops. ``None`` is stored as -1,
                which never equals a node depth, i.e. no depth limit.
            min_samples_split: Minimum samples a node needs to be split
                (defaults to 2).
            min_samples_leaf: Minimum samples every child must receive
                (defaults to 1).
            criterion: Key of ``SPLIT_CRITERIA`` (case-insensitive); defaults
                to ``"mse"``.

        Raises:
            KeyError: If ``criterion`` is not a key of ``SPLIT_CRITERIA``.
        """
        key = "mse" if criterion is None else criterion.lower()
        self.criterion = DecisionTreeRegressor.SPLIT_CRITERIA[key]
        self.max_depth = -1 if max_depth is None else max_depth
        self.min_samples_split = 2 if min_samples_split is None else min_samples_split
        self.min_samples_leaf = 1 if min_samples_leaf is None else min_samples_leaf

    def fit(
        self,
        X: npt.ArrayLike,
        y: npt.ArrayLike,
        features: list = None,
        cat_features: list = None,
    ) -> None:
        """Fit the decision tree to passed data.

        Args:
            X: 2-D feature matrix (array-like or pandas object).
            y: Targets, one per row of ``X``.
            features: Column names, in the column order of ``X``. Defaults to
                ``x_0, x_1, ...``.
            cat_features: Names from ``features`` to split per category
                instead of by threshold.
        """
        # X and y are converted independently: a DataFrame X may well come
        # with a plain ndarray y (and vice versa).
        if isinstance(X, (pd.DataFrame, pd.Series)):
            X = X.to_numpy()
        if isinstance(y, (pd.DataFrame, pd.Series)):
            y = y.to_numpy()
        X = np.asarray(X)
        y = np.asarray(y)

        if features is None:
            self.features_ = [f"x_{i}" for i in range(X.shape[1])]
        else:
            self.features_ = features

        self.cat_features_ = [] if cat_features is None else cat_features

        self.tree_ = DecisionTreeRegressor.build_tree_(
            X,
            y,
            self.features_,
            depth=0,
            max_depth=self.max_depth,
            min_samples_leaf=self.min_samples_leaf,
            min_samples_split=self.min_samples_split,
            cat_features=self.cat_features_,
            criterion=self.criterion,
        )

        self.n_leaves_ = DecisionTreeRegressor.count_leaves_(self.tree_)

    def predict(self, X: npt.ArrayLike) -> npt.ArrayLike:
        """Predict target values for given data.

        ``X`` must have its columns in the same order as the data passed to
        ``fit``. Returns a 1-D float array with one prediction per row.
        """
        if isinstance(X, (pd.DataFrame, pd.Series)):
            X = X.to_numpy()
        X = np.asarray(X)

        # Resolve feature name -> column position once instead of per node.
        # ``setdefault`` keeps the first position of a repeated name, which is
        # what ``list.index`` would return.
        feature_index = {}
        for position, name in enumerate(self.features_):
            feature_index.setdefault(name, position)
        cat_features = set(self.cat_features_)

        ans = np.zeros((X.shape[0],))
        for i in range(ans.shape[0]):
            node = self.tree_
            while not node["is_leaf"]:
                feature = node["feature"]
                value = X[i, feature_index[feature]]

                if feature in cat_features:
                    # If the category has not appeared in training set, the tree
                    # traversal is terminated and the current node value is used
                    child = node["children"].get(value)
                    if child is None:
                        break
                    node = child
                elif value <= node["threshold"]:
                    node = node["children"]["lower"]
                else:
                    node = node["children"]["upper"]

            ans[i] = node["value"]
        return ans

    def get_n_leaves(self):
        """Return the number of leaves of the fitted tree."""
        return self.n_leaves_

    # =============================================================================
    # Tree construction
    # =============================================================================

    @staticmethod
    def make_leaf_(value: float, depth: int) -> Dict[str, Any]:
        """Create a leaf node predicting ``value``."""
        return {"value": value, "depth": depth, "is_leaf": True}

    @staticmethod
    def build_tree_(
        X: npt.ArrayLike,
        y: npt.ArrayLike,
        features: list,
        depth: int,
        max_depth: int,
        min_samples_split: int,
        min_samples_leaf: int,
        cat_features: list,
        criterion: Callable,
    ) -> Dict[str, Any]:
        """Recursively build a regression tree.

        ``features`` names the columns of ``X`` in order. Returns the node
        dictionary for the (sub)tree rooted at ``depth``.
        """

        default_value = np.mean(y)
        leaf = DecisionTreeRegressor.make_leaf_(default_value, depth)

        # Terminate if there are no more features to split on
        if X.shape[1] == 0 or len(features) == 0:
            return leaf

        # Terminate if all the targets are duplicates of each other
        if np.unique(y).shape[0] == 1:
            return leaf

        # Terminate if all the datapoints are duplicates of each other
        if np.unique(X, axis=0).shape[0] == 1:
            return leaf

        # Terminate if node does not contain enough elements to split
        if y.shape[0] < min_samples_split:
            return leaf

        # Terminate if max tree depth is reached
        if depth == max_depth:
            return leaf

        best_id, best_feature, threshold = DecisionTreeRegressor.select_best_feature_(
            X, y, features, cat_features, criterion
        )
        is_categorical = best_feature in cat_features

        # A numeric feature without a threshold means no usable split was
        # found (e.g. every column is constant apart from NaNs).
        if not is_categorical and threshold is None:
            return leaf

        new_features = [feature for feature in features if feature != best_feature]

        tree = {
            "depth": depth,
            "feature": best_feature,
            "is_leaf": False,
            "value": default_value,
            "children": {},
        }

        column = X[:, best_id]
        # The split column is dropped, so descendants cannot reuse it.
        X_sub = np.delete(X, best_id, axis=1)
        child_kwargs = dict(
            depth=depth + 1,
            max_depth=max_depth,
            min_samples_leaf=min_samples_leaf,
            min_samples_split=min_samples_split,
            cat_features=cat_features,
            criterion=criterion,
        )

        if is_categorical:
            categories = np.unique(column)

            # If there is not enough samples to make a leaf in any category the
            # split is aborted and the node is considered a leaf. Checking all
            # categories up front avoids growing subtrees that would then be
            # thrown away.
            if any(
                np.count_nonzero(column == category) < min_samples_leaf
                for category in categories
            ):
                tree["is_leaf"] = True
                return tree

            for category in categories:
                mask = column == category
                tree["children"][category] = DecisionTreeRegressor.build_tree_(
                    X_sub[mask], y[mask], new_features, **child_kwargs
                )
        else:
            mask_left = column <= threshold
            mask_right = column > threshold

            tree["threshold"] = threshold

            # If there is not enough samples to make a leaf the split is aborted and
            # the node is considered a leaf
            if (
                mask_left.sum() < min_samples_leaf
                or mask_right.sum() < min_samples_leaf
            ):
                tree["is_leaf"] = True
                return tree

            tree["children"]["lower"] = DecisionTreeRegressor.build_tree_(
                X_sub[mask_left], y[mask_left], new_features, **child_kwargs
            )
            tree["children"]["upper"] = DecisionTreeRegressor.build_tree_(
                X_sub[mask_right], y[mask_right], new_features, **child_kwargs
            )
        return tree

    @staticmethod
    def feature_score_(
        X: npt.ArrayLike,
        y: npt.ArrayLike,
        feature_idx: int,
        is_cat_feature: bool,
        criterion: Callable,
    ) -> Dict[str, Any]:
        """Calculate score for a given feature.

        Returns a dict with ``"value"`` (criterion value of the best split,
        lower is better, ``inf`` when the column cannot be split) and
        ``"threshold"`` (split point for numeric features, ``None`` otherwise).
        """
        column = X[:, feature_idx]

        if not is_cat_feature:
            uniques = np.unique(column)

            # Splits are not done if all the feature values are the same
            if uniques.shape[0] <= 1:
                return {"value": np.inf, "threshold": None}

            # Candidate thresholds are the midpoints between consecutive
            # distinct values.
            thresholds = [
                0.5 * (curr + prev) for prev, curr in zip(uniques, uniques[1:])
            ]
            split_scores = []

            for theta in thresholds:
                mask_left = column <= theta
                mask_right = column > theta

                # Elements lower and higher than threshold are compared to their respective means
                y_pred = np.where(
                    mask_left, np.mean(y[mask_left]), np.mean(y[mask_right])
                )
                split_scores.append(criterion(y_pred, y))

            best_split_id = int(np.argmin(split_scores))
            return {
                "value": split_scores[best_split_id],
                "threshold": thresholds[best_split_id],
            }

        y_pred = np.zeros_like(y)
        for category in np.unique(column):
            mask = column == category
            # Elements in each category are compared to their respective means
            y_pred = np.where(mask, np.mean(y[mask]), y_pred)

        return {"value": criterion(y_pred, y), "threshold": None}

    @staticmethod
    def select_best_feature_(
        X: npt.ArrayLike,
        y: npt.ArrayLike,
        features: list,
        cat_features: list,
        criterion: Callable,
    ) -> list:
        """Select the feature whose best split has the lowest criterion value.

        Returns ``[column index, feature name, threshold]``; the threshold is
        ``None`` for categorical features.
        """
        scores = [
            DecisionTreeRegressor.feature_score_(
                X, y, i, feature in cat_features, criterion
            )
            for i, feature in enumerate(features)
        ]

        best_idx = int(np.argmin([score["value"] for score in scores]))
        return [best_idx, features[best_idx], scores[best_idx]["threshold"]]

    # =============================================================================
    # Tree pruning
    # =============================================================================

    @staticmethod
    def count_leaves_(tree: Dict) -> int:
        """Count the number of leaf nodes in a (sub)tree."""
        if tree["is_leaf"]:
            return 1

        return sum(
            DecisionTreeRegressor.count_leaves_(child)
            for child in tree["children"].values()
        )

### Градиентный бустинг

In [38]:
class GradientBoostingRegressor():
    """Gradient boosting on regression trees.

    The first estimator is a depth-0 tree, i.e. a constant equal to the target
    mean. Every following tree is fitted to the current residuals and its
    prediction is added scaled by ``learning_rate``.
    """

    def __init__(
        self,
        learning_rate: float = None,
        max_depth: int = None,
        n_estimators: int = None,
        min_samples_split: int = None,
        min_samples_leaf: int = None,
        criterion: str = None,
        tol: float = None,
    ) -> None:
        """Store the hyper-parameters.

        Args:
            learning_rate: Shrinkage applied to every tree after the first
                (default 0.1).
            max_depth: Depth limit of the residual trees (default 4).
            n_estimators: Maximum number of trees, including the initial
                constant one (default 100).
            min_samples_split: Forwarded to every tree (default 2).
            min_samples_leaf: Forwarded to every tree (default 1).
            criterion: Split criterion name forwarded to every tree.
            tol: Boosting stops early once the Euclidean norm of the last
                update to the training predictions drops below this value
                (default 1e-4).
        """
        self.criterion = criterion
        self.tol = 1e-4 if tol is None else tol
        self.max_depth = 4 if max_depth is None else max_depth
        self.n_estimators = 100 if n_estimators is None else n_estimators
        self.learning_rate = 0.1 if learning_rate is None else learning_rate
        self.min_samples_split = 2 if min_samples_split is None else min_samples_split
        self.min_samples_leaf = 1 if min_samples_leaf is None else min_samples_leaf

    def fit(
        self,
        X: npt.ArrayLike,
        y: npt.ArrayLike,
    ) -> None:
        """Fit the regressor to data.

        Note that it requires the categorical features to be already encoded:
        no categorical feature list is passed to the trees, so every column is
        split by threshold. This is done in order to reduce the tree sizes and
        increase learning speeds.
        """
        # Work on plain arrays so pandas inputs behave like ndarrays; y is
        # flattened so that ``y - curr_guess`` stays element-wise.
        if isinstance(X, (pd.DataFrame, pd.Series)):
            X = X.to_numpy()
        if isinstance(y, (pd.DataFrame, pd.Series)):
            y = y.to_numpy()
        X = np.asarray(X)
        y = np.asarray(y, dtype=float).ravel()

        # Depth-0 tree: a single leaf predicting the mean of y.
        base = DecisionTreeRegressor(
            max_depth=0,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            criterion=self.criterion,
        )
        base.fit(X, y)
        self.trees_ = [base]

        curr_guess = base.predict(X)
        prev_guess = np.copy(y)
        residue = np.sqrt(np.power((prev_guess - curr_guess), 2).sum())

        while len(self.trees_) < self.n_estimators:
            if residue < self.tol:
                break

            tree = DecisionTreeRegressor(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                criterion=self.criterion,
            )
            # Each new tree models what the ensemble still gets wrong.
            tree.fit(X, y - curr_guess)
            self.trees_.append(tree)

            prev_guess = np.copy(curr_guess)
            curr_guess += self.learning_rate * tree.predict(X)
            residue = np.sqrt(np.power((prev_guess - curr_guess), 2).sum())

    def predict(self, X: npt.ArrayLike) -> npt.ArrayLike:
        """Return the ensemble prediction for every row of ``X``.

        ``X`` must have the same column order as the data used in ``fit``.
        """
        if isinstance(X, (pd.DataFrame, pd.Series)):
            X = X.to_numpy()
        X = np.asarray(X)

        ans = self.trees_[0].predict(X)
        for tree in self.trees_[1:]:
            ans += tree.predict(X) * self.learning_rate
        return ans

### Реализация `OrdinalEncoder`

In [27]:
class OrdinalEncoder():
    """Encode categorical columns as integer codes.

    For every column the distinct training values are sorted (``np.unique``)
    and numbered from 0. ``transform`` returns a float array of those codes.
    """

    def __init__(
        self,
        handle_unknown: str = None,
        unknown_value: int = None,
    ) -> None:
        """Configure how categories unseen during ``fit`` are treated.

        Args:
            handle_unknown: ``"error"`` (default) makes ``transform`` raise
                on unseen categories; ``"use_encoded_value"`` encodes them as
                ``unknown_value``.
            unknown_value: Code for unseen categories (default -1). Only used
                with ``handle_unknown="use_encoded_value"``.

        Raises:
            ValueError: If ``handle_unknown`` is not one of the supported
                options.
        """
        self.encoded_missing_value_ = np.nan

        if handle_unknown is None or handle_unknown == "error":
            self.handle_unknown_ = "error"
        elif handle_unknown == "use_encoded_value":
            self.handle_unknown_ = "use_encoded_value"
            self.unknown_value_ = -1 if unknown_value is None else unknown_value
        else:
            raise ValueError(
                "handle_unknown must be 'error' or 'use_encoded_value', "
                f"got {handle_unknown!r}"
            )

    def fit(
        self,
        data: Union[npt.ArrayLike, pd.DataFrame],
    ):
        """Learn the category -> code mapping of every column of ``data``."""
        self.feature_dict_ = {}
        if isinstance(data, pd.DataFrame):
            data = data.to_numpy()
        data = np.asarray(data)

        for feature in range(data.shape[1]):
            uniques = np.unique(data[:, feature])
            self.feature_dict_[feature] = {val: idx for idx, val in enumerate(uniques)}

        return self

    def fit_transform(
        self,
        data: Union[npt.ArrayLike, pd.Series, pd.DataFrame],
    ) -> npt.ArrayLike:
        """Fit on ``data`` and return its encoded version."""
        self.fit(data)
        return self.transform(data)

    def transform(self, X: npt.ArrayLike) -> npt.ArrayLike:
        """Replace every value of ``X`` by the code learned in ``fit``.

        Returns:
            Float array with the shape of ``X``.

        Raises:
            ValueError: If an unseen category is met and ``handle_unknown``
                is ``"error"``.
        """
        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()
        X = np.asarray(X)

        # The output is always numeric, whatever the dtype of X is.
        ans = np.empty(X.shape, dtype=float)

        for feature in range(X.shape[1]):
            mapping = self.feature_dict_[feature]
            column = X[:, feature]

            if self.handle_unknown_ == "error":
                unseen = sorted({str(value) for value in column if value not in mapping})
                if unseen:
                    raise ValueError(
                        f"Found unknown categories {unseen} in column {feature} "
                        "during transform"
                    )
                ans[:, feature] = [mapping[value] for value in column]
            else:
                # Membership decides what is unknown; the code 0 is a valid
                # encoding and must not be mistaken for "missing".
                fallback = self.unknown_value_
                ans[:, feature] = [mapping.get(value, fallback) for value in column]
        return ans

### Реализация `K-Fold Target encoder`

In [35]:
### KFold(n_splits=self.cv_, shuffle=self.shuffle_)

class KFold():
    """Split sample indices into consecutive train/test folds."""

    def __init__(self, n_splits: int=None, shuffle: bool=None, random_seed: int=None):
        """Store the split configuration.

        Args:
            n_splits: Number of folds (default 2).
            shuffle: Whether to shuffle the indices before splitting
                (default False).
            random_seed: Seed of the shuffling generator; ``None`` gives a
                different permutation on every call.
        """
        self.n_splits_ = 2 if n_splits is None else n_splits
        self.shuffle_ = False if shuffle is None else shuffle
        self.random_seed_ = random_seed

    def split(self, data):
        """Return a list of ``(train_indices, test_indices)`` pairs.

        Every sample lands in exactly one test fold. When ``len(data)`` is not
        divisible by ``n_splits`` the first ``len(data) % n_splits`` folds get
        one extra sample, so no sample is left out.
        """
        n_samples = len(data)
        indices = np.arange(n_samples)

        if self.shuffle_:
            rng = np.random.default_rng(seed=self.random_seed_)
            rng.shuffle(indices)

        base_size, n_larger = divmod(n_samples, self.n_splits_)

        folds = []
        start = 0
        for i in range(self.n_splits_):
            stop = start + base_size + (1 if i < n_larger else 0)
            test_indices = indices[start:stop]
            train_indices = np.concatenate([indices[:start], indices[stop:]])
            folds.append((train_indices, test_indices))
            start = stop
        return folds

In [36]:
class KFoldTargetEncoder():
    """Target encoder that encodes the training data out-of-fold.

    During ``fit`` the rows are split into ``cv`` folds; the categories of each
    fold are encoded using target statistics of the remaining folds only.
    ``transform`` encodes new data with the per-category target mean over the
    whole training set.
    """

    def __init__(
        self,
        cv: int = None,
        handle_unknown: str = None,
        unknown_value: float = None,
        shuffle: bool = None,
    ) -> None:
        """Store the configuration.

        Args:
            cv: Number of folds (default 5).
            handle_unknown: ``"mean"`` (default) encodes categories unseen in
                training with the global target mean; ``"use_encoded_value"``
                encodes them with ``unknown_value``.
            unknown_value: Encoding of unseen categories (default 0.0). Only
                used with ``handle_unknown="use_encoded_value"``.
            shuffle: Whether the folds are shuffled (default False).

        Raises:
            ValueError: If ``handle_unknown`` is not one of the supported
                options.
        """
        self.encoded_missing_value_ = np.nan
        self.cv_ = 5 if cv is None else cv
        self.shuffle_ = False if shuffle is None else shuffle

        if handle_unknown is None or handle_unknown == "mean":
            self.handle_unknown_ = "mean"
        elif handle_unknown == "use_encoded_value":
            self.handle_unknown_ = "use_encoded_value"
            self.unknown_value_ = 0.0 if unknown_value is None else unknown_value
        else:
            raise ValueError(
                "handle_unknown must be 'mean' or 'use_encoded_value', "
                f"got {handle_unknown!r}"
            )

    @staticmethod
    def _as_2d_array(data):
        """Return ``data`` as a 2-D numpy array (a 1-D input becomes one column)."""
        if isinstance(data, (pd.DataFrame, pd.Series)):
            data = data.to_numpy(dtype=np.object_)
        data = np.asarray(data)
        if data.ndim == 1:
            data = data.reshape(-1, 1)
        return data

    def fit(
        self,
        data: Union[npt.ArrayLike, pd.Series, pd.DataFrame],
        target: npt.ArrayLike,
    ):
        """Learn the encoding and compute the out-of-fold encoding of ``data``.

        The result for the training rows is stored in ``transformed_data_``.
        For a validation row with category ``c`` the encoding is

            (n_c * mean_c + (N - n_c) * mean_all) / N

        where all statistics are taken over the training part of the split:
        ``N`` rows in total, ``n_c`` of them with category ``c``.
        """
        self.data_ = self._as_2d_array(data)

        # Fold indices are positional, so pandas targets (Series included)
        # are converted to arrays before being indexed with them.
        if isinstance(target, (pd.DataFrame, pd.Series)):
            target = target.to_numpy()
        self.target_ = np.asarray(target, dtype=float)

        global_mean = self.target_.mean()
        if self.handle_unknown_ == "mean":
            self.unknown_value_ = global_mean

        # Start from the global mean so that a cell no fold assigns still
        # gets a sensible encoding instead of 0.
        self.transformed_data_ = np.full(self.data_.shape, global_mean, dtype=float)

        kf = KFold(n_splits=self.cv_, shuffle=self.shuffle_)

        for train_idx, val_idx in kf.split(self.data_):

            X_train, X_val = (self.data_[train_idx, :], self.data_[val_idx, :])
            y_train = self.target_[train_idx]
            n_train = X_train.shape[0]
            train_mean = y_train.mean()

            for feature in range(X_train.shape[1]):
                for unique_val in set(X_val[:, feature]):
                    train_mask = X_train[:, feature] == unique_val
                    n_category = train_mask.sum()
                    category_mean = (
                        y_train[train_mask].mean() if n_category >= 1 else 0.0
                    )
                    weight = (
                        n_category * category_mean
                        + train_mean * (n_train - n_category)
                    ) / n_train
                    self.transformed_data_[val_idx, feature] = np.where(
                        X_val[:, feature] == unique_val,
                        weight,
                        self.transformed_data_[val_idx, feature],
                    )

        return self

    def fit_transform(
        self,
        data: Union[npt.ArrayLike, pd.Series, pd.DataFrame],
        target: npt.ArrayLike,
    ) -> npt.ArrayLike:
        """Fit the encoder and return the out-of-fold encoding of ``data``."""
        self.fit(data, target)
        return self.transformed_data_

    def transform(self, X: npt.ArrayLike) -> npt.ArrayLike:
        """Encode ``X`` with the per-category target means of the training data.

        Categories that did not occur in the training data are encoded with
        ``unknown_value_``. Returns a float array with the shape of ``X``.
        """
        X = self._as_2d_array(X)

        # The output is always numeric, whatever the dtype of X is.
        ans = np.full(X.shape, self.unknown_value_, dtype=float)

        for feature in range(self.data_.shape[1]):
            for unique_val in set(X[:, feature]):
                mask = self.data_[:, feature] == unique_val
                weight = (
                    self.unknown_value_
                    if mask.sum() == 0
                    else self.target_[mask].mean()
                )
                ans[:, feature] = np.where(
                    X[:, feature] == unique_val, weight, ans[:, feature]
                )
        return ans

### Сбор результатов бустинга на отправку

In [39]:
# Columns removed from both train and test before fitting the boosting model.
# NOTE(review): "track_href" is listed twice -- presumably a copy-paste slip.
drop_list_gb = [
    "type",
    "track_href",
    "track_href",
    "uri",
    "analysis_url",
    "track_id",
    "track_album_id",
    "track_name",
    "track_album_name",
    "id",
    "playlist_id",
]

# Columns that are target-encoded before being passed to the model.
cat_features_gb = [
    "playlist_name",
    "playlist_genre",
    "track_artist",
    "time_signature",
    "playlist_subgenre",
    "track_album_release_date",
    "mode",
    "key",
]

# Training data: drop unused columns, rows with missing values and duplicates.
# NOTE(review): "Unnamed: 0" is dropped from train only -- presumably the test
# file has no such column; confirm, since train and test must end up with the
# same feature columns in the same order.
df_train = pd.read_csv(train_file)
df_train = df_train.drop(
    drop_list_gb + ["Unnamed: 0"],
    axis=1,
)
df_train = df_train.dropna()
df_train = df_train.drop_duplicates()

# NOTE(review): dropna() on the test set removes rows, and those rows then get
# no prediction -- confirm the test file has no missing values.
df_test = pd.read_csv(test_file)
df_test = df_test.drop(
    drop_list_gb,
    axis=1,
)
df_test = df_test.dropna()

# Reduce the release date to its year, stored as a string category.
df_train.loc[:, "track_album_release_date"] = (
    pd.to_datetime(
        df_train["track_album_release_date"], format="mixed", yearfirst=True
    ).dt.year
).astype(str)

df_test.loc[:, "track_album_release_date"] = (
    pd.to_datetime(
        df_test["track_album_release_date"], format="mixed", yearfirst=True
    ).dt.year
).astype(str)

# Split into features and target.
gbX, gby = (df_train.drop("popularity", axis=1), df_train["popularity"])

# Train rows receive the encoder's fit_transform output; test rows are encoded
# with the fitted encoder's transform.
encoder = KFoldTargetEncoder(cv=5)
gbX.loc[:, cat_features_gb] = encoder.fit_transform(gbX[cat_features_gb].to_numpy(), gby.to_numpy())
df_test.loc[:, cat_features_gb] = encoder.transform(df_test[cat_features_gb].to_numpy())

gb_regressor = GradientBoostingRegressor(
    n_estimators=100,
    max_depth=5,
    min_samples_leaf=40,
    min_samples_split=100,
    learning_rate=0.1,
    tol=1e-4,
)
gb_regressor.fit(
    gbX.to_numpy(dtype=float),
    gby.to_numpy(dtype=float),
)

# Predictions for the test rows, indexed by the test frame's index (named "Id").
y_pred_gb = pd.DataFrame(
    gb_regressor.predict(df_test.to_numpy(dtype=float)),
    index=pd.Index(df_test.index, name="Id"),
    columns=["popularity"],
)

### Сбор результатов дерева на отправку

In [28]:
# Columns removed from both train and test before fitting the single tree.
# NOTE(review): "track_href" is listed twice -- presumably a copy-paste slip.
drop_list_dt = [
    "type",
    "track_href",
    "track_href",
    "uri",
    "track_album_name",
    "analysis_url",
    "track_id",
    "track_name",
    "track_artist",
    "track_album_id",
    "id",
    "playlist_id",
    "playlist_name",
]

# Columns the tree splits per category (after ordinal encoding).
cat_features_dt = [
    "playlist_genre",
    "time_signature",
    "playlist_subgenre",
    "mode",
    "key",
    "track_album_release_date",
]

# Training data: drop unused columns, rows with missing values and duplicates.
# This rebinds df_train / df_test.
df_train = pd.read_csv(train_file)
df_train = df_train.drop(
    drop_list_dt + ["Unnamed: 0"],
    axis=1,
)
df_train = df_train.dropna()
df_train = df_train.drop_duplicates()

# NOTE(review): dropna() on the test set removes rows, and those rows then get
# no prediction -- confirm the test file has no missing values.
df_test = pd.read_csv(test_file)
df_test = df_test.drop(
    drop_list_dt,
    axis=1,
)
df_test = df_test.dropna()

# Reduce the release date to its year, stored as a string category.
df_train.loc[:, "track_album_release_date"] = (
    pd.to_datetime(
        df_train["track_album_release_date"], format="mixed", yearfirst=True
    ).dt.year
).astype(str)

df_test.loc[:, "track_album_release_date"] = (
    pd.to_datetime(
        df_test["track_album_release_date"], format="mixed", yearfirst=True
    ).dt.year
).astype(str)

# Split into features and target.
dtX, dty = (df_train.drop("popularity", axis=1), df_train["popularity"])

# Categories occurring at most twice in the training data are merged into a
# single "other" category, in train and test alike.
# NOTE(review): for numeric-coded columns (presumably time_signature, mode,
# key) this writes a string into a numeric column -- confirm that is intended.
for feature in cat_features_dt:
    for val, count in dtX[feature].value_counts().items():
        if count <= 2:
            dtX.loc[dtX[feature] == val, feature] = "other"
            df_test.loc[df_test[feature] == val, feature] = "other"

# Ordinal-encode the categorical columns; categories unseen in train map to -1.
enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
dtX[cat_features_dt] = enc.fit_transform(dtX[cat_features_dt].to_numpy())
df_test[cat_features_dt] = enc.transform(df_test[cat_features_dt].to_numpy())

# NOTE(review): presumably the result of a hyper-parameter search not shown here.
best_tree_params = {'max_depth': 50, 'min_samples_split': 10, 'min_samples_leaf': 3}
dt_regressor = DecisionTreeRegressor(**best_tree_params)

dt_regressor.fit(    
    dtX.to_numpy(dtype=float),
    dty.to_numpy(dtype=float),
    features=list(dtX.columns),
    cat_features=cat_features_dt,
)

# Predictions for the test rows, indexed by the test frame's index (named "Id").
# NOTE(review): unlike fit, the test matrix is passed without dtype=float.
y_pred_dt = pd.DataFrame(
    dt_regressor.predict(df_test.to_numpy()),
    index=pd.Index(df_test.index, name="Id"),
    columns=["popularity"],
)

In [None]:
# Write the decision-tree predictions to the submission file.
y_pred_dt.to_csv(submission_path)