In [1]:
%load_ext jupyter_black

In [2]:
from collections import Counter
from copy import deepcopy
from graphviz import Digraph
from IPython.display import display
from typing import Any, Callable, Dict, List, Union
from scipy.optimize import minimize
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import accuracy_score, root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.tree import DecisionTreeRegressor as SDecisionTreeRegressor
import os
import matplotlib.pyplot as plt
import numpy as np
import numpy.typing as npt
import pandas as pd

In [13]:
class DecisionTreeRegressor(BaseEstimator, RegressorMixin):
    """
    Decision tree regressor, something to build gradient boosting algorithms off of.

    The fitted tree is stored as nested dictionaries. Every node carries the
    keys ``"value"`` (mean target of the samples that reached the node),
    ``"depth"`` and ``"is_leaf"``; internal nodes additionally carry
    ``"feature"``, ``"children"`` and, for numerical splits, ``"threshold"``.

    Numerical features are split in two by a threshold (children ``"lower"``
    and ``"upper"``); categorical features get one child per category value.
    A feature is removed from the candidate list once it has been used, so
    each feature appears at most once on any root-to-leaf path.
    """

    # Split quality measures: lower is better.
    SPLIT_CRITERIA = {
        "mae": lambda y_true, y_pred: np.mean(np.abs(y_true - y_pred)),
        "mse": lambda y_true, y_pred: np.mean((y_true - y_pred) ** 2),
    }

    def __init__(
        self,
        max_depth: int = None,
        min_samples_split: int = None,
        min_samples_leaf: int = None,
        criterion: Union[str, Callable] = None,
    ) -> None:
        """
        Parameters
        ----------
        max_depth : int, optional
            Maximum depth of the tree; ``None`` is stored as ``-1`` and means
            "no limit" (the depth counter never equals ``-1``).
        min_samples_split : int, optional
            Minimum number of samples a node needs to be split (default 2).
        min_samples_leaf : int, optional
            Minimum number of samples every child must receive (default 1).
        criterion : str or callable, optional
            Key of ``SPLIT_CRITERIA`` (case-insensitive, default ``"mse"``) or
            a callable ``criterion(y_true, y_pred) -> float``.
        """
        if criterion is None:
            self.criterion = DecisionTreeRegressor.SPLIT_CRITERIA["mse"]
        elif callable(criterion):
            # An already resolved criterion is accepted as-is. This is what
            # get_params()/sklearn.clone() hand back to the constructor.
            self.criterion = criterion
        else:
            self.criterion = DecisionTreeRegressor.SPLIT_CRITERIA[criterion.lower()]
        self.max_depth = -1 if max_depth is None else max_depth
        self.min_samples_split = 2 if min_samples_split is None else min_samples_split
        self.min_samples_leaf = 1 if min_samples_leaf is None else min_samples_leaf

    def fit(
        self,
        X: npt.ArrayLike,
        y: npt.ArrayLike,
        features: list = None,
        cat_features: list = None,
        sample_weight: list = None,
    ) -> "DecisionTreeRegressor":
        """
        Fit the decision tree to passed data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)
        features : list, optional
            Column names of ``X``; defaults to ``x_0, x_1, ...``.
        cat_features : list, optional
            Names of the columns that must be treated as categorical.
        sample_weight : array-like, optional
            Stored and sliced alongside the data while the tree is built, but
            not used when scoring splits or computing node values.

        Returns
        -------
        self
        """
        X = np.asarray(X)
        y = np.asarray(y)

        if features is None:
            self.features_ = [f"x_{i}" for i in range(X.shape[1])]
        else:
            # A real list is required: predict() relies on list.index().
            self.features_ = list(features)

        self.cat_features_ = [] if cat_features is None else list(cat_features)

        # Boolean-mask slicing in build_tree_ needs an ndarray, not a list.
        self.weights_ = None if sample_weight is None else np.asarray(sample_weight)

        self.tree_ = DecisionTreeRegressor.build_tree_(
            X,
            y,
            self.features_,
            depth=0,
            max_depth=self.max_depth,
            min_samples_leaf=self.min_samples_leaf,
            min_samples_split=self.min_samples_split,
            cat_features=self.cat_features_,
            criterion=self.criterion,
            weights=self.weights_,
        )

        self.n_leaves_ = DecisionTreeRegressor.count_leaves_(self.tree_)
        return self

    def predict(self, X: npt.ArrayLike) -> npt.ArrayLike:
        """Predict target values for given data.

        ``X`` must have its columns in the same order as the data passed to
        ``fit``. Returns a float array of shape (n_samples,).
        """
        X = np.asarray(X)
        ans = np.zeros((X.shape[0],))
        for i in range(ans.shape[0]):
            node = self.tree_
            while not node["is_leaf"]:
                # Get the index of the feature on which the split was performed
                feature_idx = self.features_.index(node["feature"])

                if node["feature"] in self.cat_features_:
                    # If the category has not appeared in training set, the tree
                    # traversal is terminated and the current node value is used
                    child = node["children"].get(X[i, feature_idx])
                    if child is None:
                        break
                    node = child
                elif X[i, feature_idx] <= node["threshold"]:
                    node = node["children"]["lower"]
                else:
                    node = node["children"]["upper"]

            ans[i] = node["value"]
        return ans

    def get_n_leaves(self):
        """Return the number of leaves of the fitted tree."""
        return self.n_leaves_

    # =============================================================================
    # Tree construction
    # =============================================================================

    @staticmethod
    def build_tree_(
        X: npt.ArrayLike,
        y: npt.ArrayLike,
        features: list,
        depth: int,
        max_depth: int,
        min_samples_split: int,
        min_samples_leaf: int,
        cat_features: list,
        criterion: Callable,
        weights: list,
    ) -> Dict[str, Any]:
        """
        Recursively build a regression tree.

        ``features`` names the columns of ``X`` in order; both shrink by one
        entry per level because the feature used for a split is removed before
        recursing. Returns the node dictionary of the (sub)tree root.
        """

        default_value = np.mean(y)
        leaf = {"value": default_value, "depth": depth, "is_leaf": True}

        # Terminate if there are no more features to split on
        if X.shape[1] == 0 or len(features) == 0:
            return leaf

        # Terminate if all the targets are duplicates of each other
        if np.unique(y).shape[0] == 1:
            return leaf

        # Terminate if all the datapoints are duplicates of each other
        if np.unique(X, axis=0).shape[0] == 1:
            return leaf

        # Terminate if node does not contain enough elements to split
        if y.shape[0] < min_samples_split:
            return leaf

        # Terminate if max tree depth is reached
        if depth == max_depth:
            return leaf

        best_id, best_feature, threshold = DecisionTreeRegressor.select_best_feature_(
            X, y, features, cat_features, criterion, weights
        )
        new_features = [feature for feature in features if feature != best_feature]

        tree = {
            "depth": depth,
            "feature": best_feature,
            "is_leaf": False,
            "value": default_value,
            "children": {},
        }

        # Arguments shared by every recursive call below.
        common = dict(
            depth=depth + 1,
            max_depth=max_depth,
            min_samples_leaf=min_samples_leaf,
            min_samples_split=min_samples_split,
            cat_features=cat_features,
            criterion=criterion,
        )

        column = X[:, best_id]
        # The chosen feature is not available to the descendants.
        X_sub = np.delete(X, best_id, axis=1)

        if best_feature in cat_features:
            splits = [(category, column == category) for category in np.unique(column)]

            # If any category lacks the samples to make a leaf, the split is
            # aborted and the node is considered a leaf. Checking every
            # category first avoids building children that would be discarded.
            if any(mask.sum() < min_samples_leaf for _, mask in splits):
                tree["is_leaf"] = True
                tree["children"] = {}
                return tree

            for category, mask in splits:
                tree["children"][category] = DecisionTreeRegressor.build_tree_(
                    X_sub[mask],
                    y[mask],
                    new_features,
                    weights=weights if weights is None else weights[mask],
                    **common,
                )
        else:
            # No usable threshold was found for the selected numerical feature
            # (its score was not finite), so the node cannot be split.
            if threshold is None:
                tree["is_leaf"] = True
                return tree

            mask_left = column <= threshold
            mask_right = column > threshold

            tree["threshold"] = threshold

            # If there is not enough samples to make a leaf the split is aborted and
            # the node is considered a leaf
            if (
                mask_left.sum() < min_samples_leaf
                or mask_right.sum() < min_samples_leaf
            ):
                tree["is_leaf"] = True
                tree["children"] = {}
                return tree

            tree["children"]["lower"] = DecisionTreeRegressor.build_tree_(
                X_sub[mask_left],
                y[mask_left],
                new_features,
                weights=weights if weights is None else weights[mask_left],
                **common,
            )

            tree["children"]["upper"] = DecisionTreeRegressor.build_tree_(
                X_sub[mask_right],
                y[mask_right],
                new_features,
                weights=weights if weights is None else weights[mask_right],
                **common,
            )
        return tree

    @staticmethod
    def feature_score_(
        X: npt.ArrayLike,
        y: npt.ArrayLike,
        feature_idx: int,
        is_cat_feature: bool,
        criterion: Callable,
    ) -> Dict[str, Any]:
        """
        Score the best split on a single feature (lower is better).

        Returns a dict with ``"value"`` (criterion of the best split) and
        ``"threshold"`` (split point for numerical features, ``None`` for
        categorical ones or when the feature is constant).
        """
        column = X[:, feature_idx]

        if not is_cat_feature:
            uniques = np.unique(column)

            # Splits are not done if all the feature values are the same
            if uniques.shape[0] < 2:
                return {"value": np.inf, "threshold": None}

            # Candidate thresholds are midpoints between consecutive unique values.
            thresholds = [
                0.5 * (curr + prev) for prev, curr in zip(uniques, uniques[1:])
            ]
            split_scores = []

            for theta in thresholds:
                mask_left = column <= theta
                mask_right = column > theta

                # Elements lower and higher than threshold are compared to their respective means
                y_pred = np.where(
                    mask_left, np.mean(y[mask_left]), np.mean(y[mask_right])
                )
                split_scores.append(criterion(y, y_pred))

            best_split_id = np.argmin(split_scores)
            return {
                "value": split_scores[best_split_id],
                "threshold": thresholds[best_split_id],
            }

        # np.where promotes to float, so an integer ``y`` does not truncate the means.
        y_pred = np.zeros_like(y)
        for category in np.unique(column):
            mask = column == category
            # Elements in each category are compared to their respective means
            y_pred = np.where(mask, np.mean(y[mask]), y_pred)

        return {"value": criterion(y, y_pred), "threshold": None}

    @staticmethod
    def select_best_feature_(
        X: npt.ArrayLike,
        y: npt.ArrayLike,
        features: list,
        cat_features: list,
        criterion: Callable,
        weights: list,
    ) -> list:
        """
        Select the feature whose best split has the lowest criterion value.

        Returns ``[column index, feature name, threshold]``. ``weights`` is
        accepted for interface symmetry but is not used in the scoring.
        """
        scores = [
            DecisionTreeRegressor.feature_score_(
                X, y, i, feature in cat_features, criterion
            )
            for i, feature in enumerate(features)
        ]

        best_idx = np.argmin([score["value"] for score in scores])
        return [best_idx, features[best_idx], scores[best_idx]["threshold"]]

    # =============================================================================
    # Tree pruning
    # =============================================================================

    @staticmethod
    def count_leaves_(tree: Dict) -> int:
        """Count the number of leaf nodes in a (sub)tree."""
        if tree["is_leaf"]:
            return 1

        return sum(
            DecisionTreeRegressor.count_leaves_(child)
            for child in tree["children"].values()
        )

    # =============================================================================
    # Tree visualization
    # =============================================================================

    def show_tree(self):
        """Visualize the decision tree."""
        dot = DecisionTreeRegressor.visualize_tree_(self.tree_, self.features_)
        display(dot)

    @staticmethod
    def visualize_tree_(
        tree: Dict[str, Any],
        feature_names: list,
        dot: Digraph = None,
        parent: str = None,
        edge_label: str = None,
    ) -> Digraph:
        """
        Recursively visualize the decision tree using Graphviz.

        ``dot``, ``parent`` and ``edge_label`` are filled in by the recursion;
        external callers pass only ``tree`` and ``feature_names``.
        """
        if dot is None:
            dot = Digraph(comment="Decision Tree Regessor")

        # Create a unique node ID
        node_id = str(id(tree))

        # Add the current node
        if tree["is_leaf"]:
            node_label = f"Value: {tree['value']:0.2f}"
        else:
            node_label = f"Feature: {tree['feature']}"
            # Explicit None check: a threshold of exactly 0.0 must still be shown.
            if tree.get("threshold") is not None:
                node_label += f"<={tree['threshold']:0.2f}"
        dot.node(node_id, node_label)

        # Connect to parent node if exists
        if parent is not None:
            dot.edge(parent, node_id, label=edge_label)

        # Recursively add children
        if "children" in tree:
            for value, child in tree["children"].items():
                DecisionTreeRegressor.visualize_tree_(
                    child,
                    feature_names,
                    dot,
                    node_id,
                    str(value) if not isinstance(value, str) else value,
                )

        return dot

In [3]:
class SymmetricTreeRegressor(BaseEstimator, RegressorMixin):
    """Placeholder for a symmetric tree regressor; not implemented yet."""


class CatBoostRegressor(BaseEstimator, RegressorMixin):
    """
    Boosted ensemble of regression trees (work in progress).

    Only the constructor and the input validation of ``fit`` exist so far;
    no training or prediction logic is implemented.
    """

    def __init__(self, estimator: object = None, n_estimators: int = None):
        """
        Parameters
        ----------
        estimator : object, optional
            Base learner; defaults to the ``DecisionTreeRegressor`` defined in
            this notebook.
        n_estimators : int, optional
            Number of base learners (default 50).
        """
        self.n_estimators = 50 if n_estimators is None else n_estimators
        self.estimator = DecisionTreeRegressor if estimator is None else estimator
        self.classifier_weights_ = np.zeros(self.n_estimators)
        # NOTE(review): every entry references the same ``estimator`` object (a
        # class by default), not an independent instance. Confirm the intent
        # when the training loop is written.
        self.classifiers_ = [self.estimator for i in range(self.n_estimators)]

    def fit(self, X: npt.ArrayLike, y: npt.ArrayLike):
        """Validate the input shapes. TODO: the boosting loop is not implemented.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` have a different number of samples.
        """
        if X.shape[0] != y.shape[0]:
            raise ValueError(f"Incorrect input array shape: {X.shape} and {y.shape}")

        n_samples = X.shape[0]

                

Рассмотрим признаки, имеющиеся в датасете:

In [74]:
# Read the training set and list the columns it provides.
train_csv_path = os.path.join(os.getcwd(), "data", "spotify-songs", "train.csv")
df = pd.read_csv(train_csv_path)
df.columns

Index(['Unnamed: 0', 'energy', 'tempo', 'danceability', 'playlist_genre',
       'loudness', 'liveness', 'valence', 'track_artist', 'time_signature',
       'speechiness', 'track_href', 'uri', 'track_album_name', 'playlist_name',
       'analysis_url', 'track_id', 'track_name', 'track_album_release_date',
       'instrumentalness', 'track_album_id', 'mode', 'key', 'duration_ms',
       'acousticness', 'id', 'playlist_subgenre', 'type', 'playlist_id',
       'popularity'],
      dtype='object')

Удалим признаки, дублирующие информацию или не несущие практической пользы:

In [75]:
# Identifier, URL and free-text columns that are dropped before modelling.
drop_list = [
    "type",
    "track_href",
    "uri",
    "track_album_name",
    "analysis_url",
    "track_id",
    "track_name",
    "track_artist",
    "track_album_id",
    "track_album_release_date",
    "id",
    "playlist_id",
    "playlist_name",
]

# "Unnamed: 0" is dropped together with the columns above.
df = df.drop(
    drop_list + ["Unnamed: 0"],
    axis=1,
)

# Columns handled as categorical by the custom regression tree.
cat_features = [
    "playlist_genre",
    "time_signature",
    "playlist_subgenre",
    "mode",
    "key",
]

In [76]:
# Preview the first rows of the frame after the unused columns were dropped.
df.head()

Unnamed: 0,energy,tempo,danceability,playlist_genre,loudness,liveness,valence,time_signature,speechiness,instrumentalness,mode,key,duration_ms,acousticness,playlist_subgenre,popularity
0,0.00948,67.237,0.188,wellness,-37.23,0.109,0.0738,4.0,0.0376,0.96,0.0,7.0,151340.0,0.971,yoga,46
1,0.413,94.938,0.494,pop,-10.432,0.193,0.273,4.0,0.0254,6.7e-05,1.0,11.0,211979.0,0.682,global,93
2,0.386,142.127,0.519,world,-12.732,0.375,0.313,4.0,0.0375,0.773,0.0,3.0,124620.0,0.68,chinese,34
3,0.898,132.027,0.779,gospel,-4.589,0.182,0.675,4.0,0.085,0.142,1.0,8.0,1034000.0,0.022,modern,30
4,0.409,170.071,0.714,electronic,-6.476,0.0883,0.497,4.0,0.0787,0.0,1.0,0.0,136324.0,0.0821,french,17


Очистим датасет от пустых значений:

In [77]:
# Report the frame size around the removal of rows with missing values.
print("Размер датасета до чистки: ", df.shape)
df = df.dropna(axis=0, how="any")
print("Размер датасета после чистки: ", df.shape)

Размер датасета до чистки:  (3623, 16)
Размер датасета после чистки:  (3622, 16)


Из-за ограничений имплементации дерева регрессии, созданного в рамках данной работы, категориальные признаки не получится использовать напрямую --- их придется кодировать ординальным кодированием (меняем `str` на `int`, при этом `int`'ы воспринимаются как названия категорий, а не как их порядок/номер). Также полезным будет заменить на `"other"` значения категориальных признаков, которые встречаются крайне редко ($\leq 2$ раз в обучающей выборке):

In [78]:
# Categories seen at most this many times in the training split are merged
# into a single "other" category.
RARE_CATEGORY_MAX_COUNT = 2

X, y = (df.drop("popularity", axis=1), df["popularity"])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=42
)
# Work on independent copies so the in-place edits below neither touch `df`
# nor trigger SettingWithCopyWarning.
X_train, X_test = X_train.copy(), X_test.copy()

# Change unpopular values to "other". Rarity is measured on the training split
# only and the same replacement is applied to the test split.
for feature in cat_features:
    counts = X_train[feature].value_counts()
    rare_values = counts.index[counts <= RARE_CATEGORY_MAX_COUNT]
    if len(rare_values) == 0:
        continue
    X_train.loc[X_train[feature].isin(rare_values), feature] = "other"
    X_test.loc[X_test[feature].isin(rare_values), feature] = "other"

# Each column gets its own category list. Categories unseen during fitting are
# encoded as -1.
encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
encoder.fit(X_train[cat_features])
X_train[cat_features] = encoder.transform(X_train[cat_features])
X_test[cat_features] = encoder.transform(X_test[cat_features])

X_train.head()

Unnamed: 0,energy,tempo,danceability,playlist_genre,loudness,liveness,valence,time_signature,speechiness,instrumentalness,mode,key,duration_ms,acousticness,playlist_subgenre
2934,0.448,95.05,0.599,25.0,-6.312,0.106,0.168,1.0,0.0232,0.0,1.0,8.0,263400.0,0.163,68.0
772,0.571,89.982,0.413,12.0,-7.329,0.0885,0.338,2.0,0.0485,0.126,1.0,9.0,250960.0,0.00236,69.0
2310,0.482,100.013,0.631,9.0,-6.761,0.0819,0.677,2.0,0.121,0.0,0.0,1.0,204453.0,0.428,34.0
3438,0.759,86.037,0.694,9.0,-5.231,0.255,0.777,2.0,0.124,0.0,0.0,2.0,175347.0,0.0756,34.0
2853,0.864,129.933,0.787,14.0,-4.665,0.107,0.877,2.0,0.0926,9e-05,1.0,7.0,155077.0,0.232,27.0


In [79]:
# Preview the encoded test split.
X_test.head()

Unnamed: 0,energy,tempo,danceability,playlist_genre,loudness,liveness,valence,time_signature,speechiness,instrumentalness,mode,key,duration_ms,acousticness,playlist_subgenre
1703,0.861,128.99,0.704,25.0,-4.221,0.0992,0.636,2.0,0.0493,0.0615,0.0,2.0,213394.0,0.587,75.0
1417,0.907,140.982,0.518,25.0,-3.732,0.0838,0.961,2.0,0.0343,1.2e-05,1.0,5.0,179013.0,0.00221,75.0
1074,0.706,123.036,0.804,2.0,-6.953,0.139,0.569,2.0,0.133,0.000658,1.0,7.0,143439.0,0.134,55.0
670,0.0851,129.103,0.307,6.0,-20.471,0.101,0.0388,2.0,0.0399,0.872,0.0,9.0,769213.0,0.968,30.0
1770,0.00251,136.472,0.151,33.0,-40.069,0.111,0.0546,1.0,0.0456,0.967,1.0,6.0,173000.0,0.977,80.0


In [80]:
# Both trees are given the same hyperparameters so the comparison is like for like.
tree_params = dict(max_depth=200, min_samples_leaf=2, min_samples_split=10)

regressor = DecisionTreeRegressor(**tree_params)
# scikit-learn permutes the features randomly at every split, so the seed is
# fixed to make the reported score reproducible.
sregressor = SDecisionTreeRegressor(**tree_params, random_state=42)

regressor.fit(
    X_train.to_numpy(),
    y_train.to_numpy().flatten(),
    features=list(X_train.columns),
    cat_features=cat_features,
)

# The scikit-learn tree receives the ordinal codes as plain numeric columns.
sregressor.fit(
    X_train,
    y_train,
)

# Baseline: always predict the mean popularity of the training split.
baseline_pred = np.full(y_test.shape, y_train.mean())
custom_pred = regressor.predict(X_test.to_numpy())
sklearn_pred = sregressor.predict(X_test)

print(
    f"RMSE(dummy regressor): {root_mean_squared_error(y_test, baseline_pred):0.4f}"
)
print(
    f"RMSE(DecisionTreeRegressor): {root_mean_squared_error(y_test, custom_pred):0.4f}"
)
print(
    f"RMSE(Sklearn DecisionTreeRegressor): {root_mean_squared_error(y_test, sklearn_pred):0.4f}"
)

RMSE(dummy regressor): 19.8065
RMSE(DecisionTreeRegressor): 11.7590
RMSE(Sklearn DecisionTreeRegressor): 16.1393


In [87]:
# Rebuild the full training frame and the test frame with the same cleaning steps.
df_train = pd.read_csv(os.path.join(os.getcwd(), "data", "spotify-songs", "train.csv"))
df_train = df_train.drop(
    drop_list + ["Unnamed: 0"],
    axis=1,
)
df_train = df_train.dropna()
df_test = pd.read_csv(os.path.join(os.getcwd(), "data", "spotify-songs", "test.csv"))
df_test = df_test.drop(
    drop_list,
    axis=1,
)
# NOTE(review): rows dropped here get no prediction in the submission. Confirm
# that the test set has no missing values.
df_test = df_test.dropna()

# Merge categories that are rare in the training data into "other".
for feature in cat_features:
    for val, count in df_train[feature].value_counts().items():
        if count <= 2:
            df_train.loc[df_train[feature] == val, feature] = "other"
            df_test.loc[df_test[feature] == val, feature] = "other"

encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
for feature in cat_features:
    encoder.fit(df_train[[feature]])
    df_train[[feature]] = encoder.transform(df_train[[feature]])
    df_test[[feature]] = encoder.transform(df_test[[feature]])

# The tree looks features up by column position, so the test matrix is built
# from the training feature list, in the same order. A missing column raises a
# KeyError instead of silently shifting the features.
feature_names = [feature for feature in df_train.columns if feature != "popularity"]

big_regressor = DecisionTreeRegressor(
    max_depth=200, min_samples_leaf=2, min_samples_split=10
)
big_regressor.fit(
    df_train[feature_names].to_numpy(),
    df_train["popularity"].to_numpy(),
    features=feature_names,
    cat_features=cat_features,
)

y_pred = pd.DataFrame(
    big_regressor.predict(df_test[feature_names].to_numpy()),
    index=pd.Index(df_test.index, name="Id"),
    columns=["popularity"],
)

In [88]:
# Preview the predicted popularity values, indexed by the submission Id.
y_pred.head()

Unnamed: 0_level_0,popularity
Id,Unnamed: 1_level_1
0,42.666667
1,87.4375
2,55.2
3,54.05
4,52.0


In [93]:
# Save the predictions, then compare them with an earlier submission file.
submission_dir = os.path.join(os.getcwd(), "data", "spotify-songs")
submission_path = os.path.join(submission_dir, "submission.csv")

y_pred.to_csv(path_or_buf=submission_path)

y1 = pd.read_csv(os.path.join(submission_dir, "submission_first.csv"))
y2 = pd.read_csv(submission_path)
# Column-wise sum of the signed differences (new minus first).
print((y2 - y1).sum())

Id             0.000000
popularity    73.460972
dtype: float64
