In [1]:
%load_ext jupyter_black

In [5]:
from copy import deepcopy
from collections import Counter
from typing import Dict, Any, Union
from IPython.display import display
from graphviz import Digraph
from scipy.optimize import minimize
from sklearn.datasets import load_iris, make_moons
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier as SklearnDecisionTreeClassifier
import os
import matplotlib.pyplot as plt
import numpy as np
import numpy.typing as npt
import pandas as pd

In [3]:
def entropy(y: npt.ArrayLike, weights: npt.ArrayLike = None) -> float:
    """Calculate the (optionally weighted) Shannon entropy of a target array.

    Parameters
    ----------
    y : array-like
        Class labels. Any array-like is accepted (list, ndarray, Series).
    weights : array-like, optional
        Non-negative sample weights, one per label. When omitted every
        sample counts equally.

    Returns
    -------
    float
        Entropy in nats. Empty input, or input whose total weight is not
        positive, has entropy 0.0.

    Raises
    ------
    ValueError
        If `weights` does not have the same number of elements as `y`.
    """
    y = np.asarray(y).ravel()
    if y.shape[0] == 0:
        return 0.0

    if weights is None:
        _, counts = np.unique(y, return_counts=True)
        probs = counts / y.shape[0]
    else:
        weights = np.asarray(weights, dtype=float).ravel()
        if weights.shape[0] != y.shape[0]:
            raise ValueError("`weights` must have the same length as `y`")
        total_weight = weights.sum()
        if total_weight <= 0:
            # No probability mass at all: avoid 0/0 and report zero entropy.
            return 0.0
        # Sum the weights per class in one vectorised pass.
        _, codes = np.unique(y, return_inverse=True)
        probs = np.bincount(codes.ravel(), weights=weights) / total_weight

    # The small epsilon keeps log() finite for classes with zero probability.
    return float(-np.sum(probs * np.log(probs + 1e-10)))


class DecisionTreeClassifier(BaseEstimator, ClassifierMixin):
    """
    Decision tree classifier built recursively in the spirit of C4.5.

    The fitted tree is stored as nested dictionaries. Every node has the keys
    ``"majority_class"`` and ``"error"`` (number of training samples in the
    node that do not belong to the majority class). Internal nodes additionally
    have ``"feature"``, ``"children"`` (mapping edge label -> child node) and,
    for numeric splits, ``"threshold"``.

    Categorical features produce one child per distinct value; numeric features
    produce a binary ``<=`` / ``>`` split. A feature is removed from the
    candidate set once it has been used on a path, so each feature is used at
    most once between the root and a leaf.

    The tree supports sample weights, cost-complexity pruning and Graphviz
    visualisation (when used in a frontend that can display it).
    """

    def __init__(
        self,
        max_depth: int = None,
        min_samples_split: int = None,
        min_samples_leaf: int = None,
    ):
        # -1 means "no depth limit": recursion only stops on depth when the
        # remaining depth is exactly 0, which a negative counter never reaches.
        self.max_depth = -1 if max_depth is None else max_depth
        self.min_samples_split = 2 if min_samples_split is None else min_samples_split
        self.min_samples_leaf = 1 if min_samples_leaf is None else min_samples_leaf

    def fit(
        self,
        X: npt.ArrayLike,
        y: npt.ArrayLike,
        features: list = None,
        cat_features: list = None,
        sample_weight: list = None,
    ) -> "DecisionTreeClassifier":
        """Fit the decision tree to passed data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data. A pandas DataFrame is accepted; its column names
            are used as feature names when `features` is not given.
        y : array-like of shape (n_samples,)
            Class labels.
        features : list, optional
            Feature names, one per column of `X`. Defaults to the DataFrame
            columns or ``x_0, x_1, ...``.
        cat_features : list, optional
            Names of the features to be treated as categorical.
        sample_weight : array-like of shape (n_samples,), optional
            Per-sample weights. Defaults to equal weights.

        Returns
        -------
        DecisionTreeClassifier
            The fitted estimator (``self``).

        Raises
        ------
        ValueError
            If the inputs are empty or their shapes are inconsistent.
        """
        if features is None and hasattr(X, "columns"):
            features = list(X.columns)

        X = np.asarray(X)
        y = np.asarray(y)

        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if y.shape[0] == 0:
            raise ValueError("Cannot fit a decision tree on an empty dataset")
        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y must contain the same number of samples")

        if features is None:
            features = [f"x_{i}" for i in range(X.shape[1])]
        self._features = list(features)
        if len(self._features) != X.shape[1]:
            raise ValueError("`features` must name every column of X")

        self._cat_features = [] if cat_features is None else list(cat_features)

        if sample_weight is None:
            # Unit weights reproduce the unweighted entropy / majority exactly.
            weights = np.ones(X.shape[0])
        else:
            weights = np.asarray(sample_weight, dtype=float)
            if weights.shape != (X.shape[0],):
                raise ValueError("`sample_weight` must have shape (n_samples,)")
        self._weights = weights

        # String-like labels need an object array; everything else keeps the
        # dtype of the labels so predictions are not silently truncated.
        self._prediction_dtype = "object" if y.dtype.kind in "OUS" else y.dtype

        self.classes_ = np.unique(y)
        self.n_classes_ = self.classes_.shape[0]

        self._tree = DecisionTreeClassifier._c45_algorithm(
            X,
            y,
            self._features,
            max_depth=self.max_depth,
            min_samples_leaf=self.min_samples_leaf,
            min_samples_split=self.min_samples_split,
            cat_features=self._cat_features,
            weights=weights,
        )
        self._n_leaves = DecisionTreeClassifier._compute_subtree_leaves(self._tree)
        return self

    def predict(self, X: npt.ArrayLike) -> npt.ArrayLike:
        """Predict class labels for given data.

        `X` must have the same column layout as the data passed to `fit`.
        A categorical value that was not seen while fitting the corresponding
        node is answered with that node's majority class.
        """
        X = np.asarray(X)
        # Resolve names to column positions once (first occurrence wins, like
        # list.index) instead of scanning the feature list at every node.
        feature_index = {}
        for idx, name in enumerate(self._features):
            feature_index.setdefault(name, idx)

        ans = np.empty((X.shape[0],), dtype=self._prediction_dtype)
        for i in range(X.shape[0]):
            node = self._tree
            while "children" in node:
                value = X[i, feature_index[node["feature"]]]
                if node["feature"] in self._cat_features:
                    child = node["children"].get(value)
                    if child is None:
                        # Unseen category: stop here and use this node's label.
                        break
                else:
                    threshold = node["threshold"]
                    side = "<=" if float(value) <= threshold else ">"
                    child = node["children"][f"{side}{threshold:0.2f}"]
                node = child
            ans[i] = node["majority_class"]
        return ans

    # =============================================================================
    # Tree construction
    # =============================================================================

    @staticmethod
    def _c45_algorithm(
        X: npt.ArrayLike,
        y: npt.ArrayLike,
        features: list,
        max_depth: int,
        min_samples_split: int,
        min_samples_leaf: int,
        cat_features: list,
        weights: list,
    ) -> Dict[str, Any]:
        """Recursively build a C4.5-style decision tree.

        `X`, `y` and `weights` describe the samples that reached this node and
        `features` names the columns of `X`. Returns the node dictionary (see
        the class docstring for its layout).
        """
        if weights is None:
            weights = np.ones(y.shape[0])

        majority_class = DecisionTreeClassifier._majority_class(y, weights)
        error = np.sum(y != majority_class)
        leaf = {"majority_class": majority_class, "error": error}

        # Pure node, or no features left to split on.
        if np.unique(y).shape[0] == 1 or X.shape[1] < 1:
            return leaf

        # Too few samples to split, or the depth budget is exhausted.
        if len(y) < min_samples_split or max_depth == 0:
            return leaf

        best_id, best_feature, threshold = DecisionTreeClassifier._select_best_feature(
            X, y, features, cat_features, weights
        )

        # The chosen feature is not reused further down this path.
        new_features = list(features)
        del new_features[best_id]
        X_rest = np.delete(X, best_id, axis=1)

        tree = {
            "feature": best_feature,
            "majority_class": majority_class,
            "children": {},
            "error": error,
        }

        if best_feature in cat_features:
            column = X[:, best_id]
            for value in np.unique(column):
                mask = column == value
                y_sub, weights_sub = y[mask], weights[mask]
                if len(y_sub) < min_samples_leaf:
                    # Too small to grow further: make the child a leaf.
                    child_majority_class = DecisionTreeClassifier._majority_class(
                        y_sub, weights_sub
                    )
                    tree["children"][value] = {
                        "majority_class": child_majority_class,
                        "error": np.sum(y_sub != child_majority_class),
                    }
                else:
                    tree["children"][value] = DecisionTreeClassifier._c45_algorithm(
                        X_rest[mask],
                        y_sub,
                        new_features,
                        max_depth=max_depth - 1,
                        min_samples_leaf=min_samples_leaf,
                        min_samples_split=min_samples_split,
                        cat_features=cat_features,
                        weights=weights_sub,
                    )
            return tree

        # Numeric feature: binary split, or a leaf when no valid split exists.
        if threshold is None:
            return leaf

        mask = X[:, best_id].astype(float) <= threshold
        n_left = int(np.sum(mask))
        n_right = mask.shape[0] - n_left
        min_side = max(min_samples_leaf, 1)
        if n_left < min_side or n_right < min_side:
            # Returning a leaf (rather than an internal node without children)
            # keeps `predict` and the pruning helpers consistent.
            return leaf

        tree["threshold"] = threshold
        for label, side in (
            (f"<={threshold:0.2f}", mask),
            (f">{threshold:0.2f}", np.logical_not(mask)),
        ):
            tree["children"][label] = DecisionTreeClassifier._c45_algorithm(
                X_rest[side],
                y[side],
                new_features,
                max_depth=max_depth - 1,
                min_samples_leaf=min_samples_leaf,
                min_samples_split=min_samples_split,
                cat_features=cat_features,
                weights=weights[side],
            )

        return tree

    @staticmethod
    def _majority_class(y: npt.ArrayLike, weights: npt.ArrayLike = None):
        """Return the most frequent class, or the heaviest one when weighted."""
        if weights is None:
            return Counter(y).most_common(1)[0][0]
        classes, codes = np.unique(y, return_inverse=True)
        totals = np.bincount(
            codes.ravel(),
            weights=np.asarray(weights, dtype=float),
            minlength=classes.shape[0],
        )
        return classes[np.argmax(totals)]

    @staticmethod
    def _rows_entropy(class_weights: np.ndarray) -> np.ndarray:
        """Entropy of every row of a (n_rows, n_classes) class-weight matrix."""
        totals = class_weights.sum(axis=1, keepdims=True)
        probs = np.divide(
            class_weights,
            totals,
            out=np.zeros_like(class_weights, dtype=float),
            where=totals > 0,
        )
        return -np.sum(probs * np.log(probs + 1e-10), axis=1)

    @staticmethod
    def _numeric_split_gain(
        column: np.ndarray, y: np.ndarray, weights: np.ndarray
    ) -> Dict[str, Any]:
        """Find the threshold minimising the weighted conditional entropy.

        Every boundary between two distinct consecutive sorted values is a
        candidate. All candidates are scored at once from cumulative per-class
        weight sums. NaN values never form a boundary and always fall on the
        ``>`` side, matching how `predict` compares against the threshold.
        """
        no_split = {"value": np.inf, "threshold": None}
        total_weight = weights.sum()
        if column.shape[0] < 2 or total_weight <= 0:
            return no_split

        order = np.argsort(column, kind="stable")
        sorted_col = column[order]
        lower, upper = sorted_col[:-1], sorted_col[1:]
        boundaries = np.nonzero(lower < upper)[0]
        if boundaries.size == 0:
            return no_split

        _, codes = np.unique(y, return_inverse=True)
        codes = codes.ravel()[order]
        per_class = np.zeros((column.shape[0], int(codes.max()) + 1))
        per_class[np.arange(column.shape[0]), codes] = weights[order]
        cumulative = np.cumsum(per_class, axis=0)

        left = cumulative[boundaries]
        # Clip guards against tiny negative values from floating-point error.
        right = np.clip(cumulative[-1] - left, 0.0, None)
        values = (
            left.sum(axis=1) * DecisionTreeClassifier._rows_entropy(left)
            + right.sum(axis=1) * DecisionTreeClassifier._rows_entropy(right)
        ) / total_weight

        best = int(np.argmin(values))
        low = float(lower[boundaries[best]])
        high = float(upper[boundaries[best]])
        midpoint = (low + high) / 2.0
        # Fall back to the lower value if rounding pushed the midpoint onto
        # (or past) the upper value, so `<= threshold` selects the same rows.
        threshold = midpoint if low <= midpoint < high else low
        return {"value": float(values[best]), "threshold": threshold}

    @staticmethod
    def _gain_function(
        X: npt.ArrayLike,
        y: npt.ArrayLike,
        feature_idx: int,
        cat_feature: bool,
        weights: npt.ArrayLike,
    ) -> Dict[str, Any]:
        """Score a split on one feature.

        Returns a dict with ``"value"`` — the weighted conditional entropy of
        `y` after the split (lower is better, ``inf`` when no split exists) —
        and ``"threshold"`` (``None`` for categorical features).
        """
        y = np.asarray(y)
        if weights is None:
            weights = np.ones(y.shape[0])
        else:
            weights = np.asarray(weights, dtype=float)

        if not cat_feature:
            return DecisionTreeClassifier._numeric_split_gain(
                X[:, feature_idx].astype(float), y, weights
            )

        total_weight = weights.sum()
        if total_weight <= 0:
            return {"value": np.inf, "threshold": None}

        column = X[:, feature_idx]
        value = 0.0
        for category in np.unique(column):
            mask = column == category
            branch_weight = weights[mask].sum()
            if branch_weight <= 0:
                # A branch without probability mass contributes nothing.
                continue
            value += (
                branch_weight / total_weight * entropy(y[mask], weights=weights[mask])
            )
        return {"value": float(value), "threshold": None}

    @staticmethod
    def _select_best_feature(
        X: npt.ArrayLike,
        y: npt.ArrayLike,
        features: list,
        cat_features: list,
        weights: list,
    ) -> list:
        """Select the feature whose split leaves the lowest conditional entropy
        (equivalently, the highest information gain).

        Returns ``[column index, feature name, threshold]``; the threshold is
        ``None`` for categorical features and when no numeric split exists.
        """
        gains = [
            DecisionTreeClassifier._gain_function(
                X, y, i, feature in cat_features, weights
            )
            for i, feature in enumerate(features)
        ]
        # argmin needs a real sequence: given a generator it wraps the
        # generator object itself and always answers 0.
        best_idx = int(np.argmin([gain["value"] for gain in gains]))
        return [best_idx, features[best_idx], gains[best_idx]["threshold"]]

    # =============================================================================
    # Tree pruning
    # =============================================================================

    def get_pruned_tree(self, alpha: float) -> "DecisionTreeClassifier":
        """Return a pruned copy of this classifier, leaving `self` untouched.

        The copy is pruned with cost-complexity pruning using `alpha`.
        """
        ans = DecisionTreeClassifier(
            max_depth=self.max_depth,
            min_samples_leaf=self.min_samples_leaf,
            min_samples_split=self.min_samples_split,
        )
        ans._tree = DecisionTreeClassifier._cost_comprexity_pruning(
            self._tree, alpha, inplace=False
        )
        ans._features = deepcopy(self._features)
        ans._cat_features = deepcopy(self._cat_features)
        ans._prediction_dtype = deepcopy(self._prediction_dtype)
        ans._n_leaves = DecisionTreeClassifier._compute_subtree_leaves(ans._tree)
        if hasattr(self, "classes_"):
            ans.classes_ = deepcopy(self.classes_)
            ans.n_classes_ = self.n_classes_
        return ans

    def prune_tree(self, alpha: float) -> None:
        """Prune the underlying decision tree in place using cost-complexity
        pruning with the predefined `alpha`."""
        self._tree = DecisionTreeClassifier._cost_comprexity_pruning(
            self._tree, alpha, inplace=True
        )
        self._n_leaves = DecisionTreeClassifier._compute_subtree_leaves(self._tree)

    @staticmethod
    def _compute_subtree_error(tree: Dict) -> int:
        """Calculate the total misclassification error of a (sub)tree,
        i.e. the sum of the errors stored in its leaves."""
        if "children" not in tree:
            return tree["error"]
        return sum(
            DecisionTreeClassifier._compute_subtree_error(child)
            for child in tree["children"].values()
        )

    @staticmethod
    def _compute_subtree_leaves(tree: Dict) -> int:
        """Count the number of leaf nodes in a (sub)tree."""
        if "children" not in tree:
            return 1
        return sum(
            DecisionTreeClassifier._compute_subtree_leaves(child)
            for child in tree["children"].values()
        )

    @staticmethod
    def _collect_pruning_candidates(tree: Dict, candidates: list) -> list:
        """Collect non-leaf nodes with their effective alpha values.

        Appends ``(node, effective_alpha)`` tuples to `candidates` (children
        before parents) and returns the same list. The effective alpha is
        ``(R(t) - R(T_t)) / (|T_t| - 1)``: the error increase per leaf removed
        when the subtree rooted at the node is collapsed into a leaf.
        """
        if "children" not in tree:
            return candidates

        subtree_error = DecisionTreeClassifier._compute_subtree_error(tree)
        subtree_leaves = DecisionTreeClassifier._compute_subtree_leaves(tree)
        # A single-child chain has one leaf; clamp so the division is defined.
        removed_leaves = max(subtree_leaves - 1, 1)
        effective_alpha = (tree["error"] - subtree_error) / removed_leaves

        for child in tree["children"].values():
            DecisionTreeClassifier._collect_pruning_candidates(child, candidates)

        candidates.append((tree, effective_alpha))
        return candidates

    @staticmethod
    def _cost_comprexity_pruning(
        tree: Dict, alpha: float, inplace: bool = None
    ) -> dict:
        """Weakest-link (cost-complexity) pruning of a tree dictionary.

        Repeatedly collapses the internal node with the smallest effective
        alpha until that value exceeds `alpha` or only a leaf is left. With
        ``inplace=True`` the passed tree is modified; otherwise a deep copy is
        pruned. The pruned tree is returned in both cases.
        """
        tree_to_prune = tree if inplace else deepcopy(tree)

        while True:
            candidates = DecisionTreeClassifier._collect_pruning_candidates(
                tree_to_prune, []
            )
            if not candidates:
                break

            weakest_subtree, weakest_alpha = min(candidates, key=lambda item: item[1])
            if weakest_alpha > alpha:
                break

            # Turn the node into a genuine leaf: every helper recognises
            # leaves by the absence of the "children" key.
            for key in ("children", "feature", "threshold"):
                weakest_subtree.pop(key, None)

        return tree_to_prune

    # =============================================================================
    # Tree visualization
    # =============================================================================

    def show_tree(self):
        """Visualize the decision tree."""
        dot = DecisionTreeClassifier._visualize_tree(self._tree, self._features)
        display(dot)

    @staticmethod
    def _visualize_tree(
        tree: Dict[str, Any],
        feature_names: list,
        dot: Digraph = None,
        parent: str = None,
        edge_label: str = None,
    ) -> Digraph:
        """Recursively visualize the decision tree using Graphviz.

        Adds the node `tree` (and, recursively, its children) to `dot`,
        creating the graph on the first call, and returns the graph.
        """
        if dot is None:
            dot = Digraph(comment="Decision Tree")

        # The dict's identity is unique per node, which makes a handy node ID.
        node_id = str(id(tree))

        if "children" not in tree:
            node_label = f"Class: {tree['majority_class']}\nError: {tree['error']}"
        else:
            node_label = f"Feature: {tree['feature']}\nError: {tree['error']}"
        dot.node(node_id, node_label)

        # Connect to parent node if exists
        if parent is not None:
            dot.edge(parent, node_id, label=edge_label)

        # Recursively add children
        if "children" in tree:
            for value, child in tree["children"].items():
                DecisionTreeClassifier._visualize_tree(
                    child, feature_names, dot, node_id, str(value)
                )

        return dot

    # =============================================================================
    # Utility functions
    # =============================================================================

In [4]:
class AdaBoostClassifierScratch(BaseEstimator, ClassifierMixin):
    """Multi-class AdaBoost built on top of an arbitrary base classifier.

    Each round fits an independent copy of `estimator` on re-weighted samples.
    A round whose weighted error is below ``1 - 1/n_classes`` receives the
    vote weight ``log((1 - err) / err) + log(n_classes - 1)`` and increases the
    weights of the samples it misclassified; any other round gets a zero vote
    and resets the sample weights to uniform.

    Parameters
    ----------
    estimator : object, optional
        Base classifier exposing ``fit(X, y, sample_weight=...)`` and
        ``predict(X)``. Defaults to ``DecisionTreeClassifier()``.
    n_estimators : int, optional
        Number of boosting rounds (default 50).
    """

    def __init__(self, estimator: object = None, n_estimators: int = None):
        self.n_estimators = 50 if n_estimators is None else n_estimators
        self.estimator = DecisionTreeClassifier() if estimator is None else estimator
        self.classifier_weights_ = np.zeros(self.n_estimators)
        # Each round needs its OWN estimator; repeating one object would make
        # every round overwrite the model fitted by the previous one.
        self.classifiers_ = [
            deepcopy(self.estimator) for _ in range(self.n_estimators)
        ]

    def fit(self, X: npt.ArrayLike, y: npt.ArrayLike, **fit_params):
        """Fit the boosted ensemble.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, passed unchanged to the base estimator.
        y : array-like of shape (n_samples,)
            Class labels.
        **fit_params
            Extra keyword arguments forwarded to every base estimator's
            ``fit`` (for example ``cat_features=...``).

        Returns
        -------
        AdaBoostClassifierScratch
            The fitted estimator (``self``).
        """
        y = np.asarray(y)
        n_samples = y.shape[0]
        self.classes_ = np.unique(y)
        n_classes = self.classes_.shape[0]

        # Start from fresh, unfitted copies so repeated fits are independent.
        self.classifiers_ = [
            deepcopy(self.estimator) for _ in range(self.n_estimators)
        ]
        self.classifier_weights_ = np.zeros(self.n_estimators)
        self.classifier_sample_weights_ = np.zeros((self.n_estimators, n_samples))
        self.classifier_sample_weights_[0, :] = np.ones(n_samples) / n_samples

        for i in range(self.n_estimators):
            sample_weights = self.classifier_sample_weights_[i, :]
            self.classifiers_[i].fit(X, y, sample_weight=sample_weights, **fit_params)
            print(f"#{i+1}...OK!")
            preds = np.asarray(self.classifiers_[i].predict(X))
            errors_mask = preds != y

            # the error term is artificially increased so that the method continues working if classifier does perfect classification
            classifier_error = (
                np.sum(sample_weights[errors_mask]) / sample_weights.sum()
            ) + 1e-10

            print(f"Error term: {classifier_error:0.2f}")
            is_last_round = i == self.n_estimators - 1
            if classifier_error < 1.0 - 1.0 / n_classes:
                self.classifier_weights_[i] = (
                    np.log(1.0 - classifier_error)
                    - np.log(classifier_error)
                    + np.log(n_classes - 1.0)
                )
                if not is_last_round:
                    # Boost the misclassified samples, then renormalise.
                    boosted = sample_weights * np.exp(
                        self.classifier_weights_[i] * errors_mask
                    )
                    self.classifier_sample_weights_[i + 1, :] = boosted / boosted.sum()
            else:
                # No better than random guessing: ignore this round's vote.
                print("Bad error value. Resetting the weights...")
                self.classifier_weights_[i] = 0.0
                if not is_last_round:
                    self.classifier_sample_weights_[i + 1, :] = (
                        np.ones(n_samples) / n_samples
                    )

        return self

    def predict(self, X):
        """Predict labels by a weighted vote of the fitted base classifiers.

        Returns an array with the dtype of ``classes_``; ties go to the class
        that comes first in ``classes_``.
        """
        n_samples = len(X)
        votes = np.zeros((n_samples, self.classes_.shape[0]))
        for weight, classifier in zip(self.classifier_weights_, self.classifiers_):
            if weight == 0.0:
                # A zero-weight classifier cannot influence the vote.
                continue
            preds = np.asarray(classifier.predict(X))
            for class_idx, pred_class in enumerate(self.classes_):
                votes[:, class_idx] += weight * (preds == pred_class)
        return self.classes_[np.argmax(votes, axis=1)]

In [45]:
# Load the train/test CSV files from ./data/sporify-songs (relative to the
# current working directory).
songs_dir = os.path.join(os.getcwd(), "data", "sporify-songs")
train_path = os.path.join(songs_dir, "train.csv")
test_path = os.path.join(songs_dir, "test.csv")
df = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

# Columns that exist in the train frame but not in the test frame.
trainonly_cols = [col for col in df.columns if col not in df_test.columns]
print("Столбцы возникающие только в train сете: ", trainonly_cols)

# "Unnamed: 0" is not used any further, so it is removed here.
df = df.drop(columns="Unnamed: 0")

Столбцы возникающие только в train сете:  ['Unnamed: 0', 'popularity']


In [46]:
# Report the (rows, columns) shape of the training frame.
print("Размер датасета: ", df.shape)

Размер датасета:  (3623, 29)


In [47]:
# List every column name currently present in the training frame.
print("Множество столбцов в датасете:\n", df.columns)

Множество столбцов в датасете:
 Index(['energy', 'tempo', 'danceability', 'playlist_genre', 'loudness',
       'liveness', 'valence', 'track_artist', 'time_signature', 'speechiness',
       'track_href', 'uri', 'track_album_name', 'playlist_name',
       'analysis_url', 'track_id', 'track_name', 'track_album_release_date',
       'instrumentalness', 'track_album_id', 'mode', 'key', 'duration_ms',
       'acousticness', 'id', 'playlist_subgenre', 'type', 'playlist_id',
       'popularity'],
      dtype='object')


Логично будет избавиться от категориальных признаков, имеющих уникальные значения для каждого из треков (например, `analysis_url`), а также от признаков, которые обозначают одни и те же данные (к примеру, `playlist_id` и `playlist_name`).

In [48]:
# Histogram of repetition counts: for each k, how many distinct `id` values
# occur in exactly k rows (k = 1 means the value belongs to a single row).
df["id"].value_counts().value_counts()

count
1    3244
2     171
3       8
4       3
Name: count, dtype: int64

In [49]:
# Same repetition histogram for `playlist_id`: index = rows per playlist id,
# value = number of playlist ids with that many rows.
df["playlist_id"].value_counts().value_counts()

count
11     7
20     5
6      5
9      5
41     4
15     4
14     4
10     4
22     4
19     3
34     3
38     3
16     3
18     3
17     3
13     3
30     3
50     2
51     2
7      2
33     2
32     2
2      2
4      2
23     2
8      2
3      2
21     2
27     2
48     1
42     1
47     1
82     1
73     1
103    1
125    1
119    1
104    1
149    1
182    1
148    1
62     1
56     1
71     1
69     1
63     1
74     1
81     1
25     1
35     1
37     1
39     1
40     1
12     1
Name: count, dtype: int64

In [50]:
# Repetition histogram for `playlist_name`, to compare against `playlist_id`.
df["playlist_name"].value_counts().value_counts()

count
11     7
20     5
6      5
15     5
9      5
41     4
14     4
10     4
22     4
23     3
30     3
34     3
16     3
18     3
17     3
13     3
19     3
50     2
51     2
7      2
33     2
38     2
2      2
4      2
27     2
8      2
3      2
21     2
32     2
48     1
42     1
47     1
82     1
73     1
103    1
125    1
119    1
104    1
149    1
182    1
148    1
62     1
56     1
71     1
69     1
63     1
74     1
81     1
25     1
35     1
37     1
39     1
40     1
12     1
Name: count, dtype: int64

In [51]:
# Repetition histogram for `track_album_name`.
df["track_album_name"].value_counts().value_counts()

count
1     2572
2      272
3       58
4       32
5       12
6        4
12       3
8        3
18       1
9        1
10       1
17       1
7        1
Name: count, dtype: int64

In [52]:
# Repetition histogram for `track_name`.
df["track_name"].value_counts().value_counts()

count
1    3109
2     208
3      26
4       5
Name: count, dtype: int64

In [53]:
# Repetition histogram for `track_id`.
df["track_id"].value_counts().value_counts()

count
1    3245
2     171
3       8
4       3
Name: count, dtype: int64

In [54]:
# Repetition histogram for `analysis_url`.
df["analysis_url"].value_counts().value_counts()

count
1    3244
2     171
3       8
4       3
Name: count, dtype: int64

In [55]:
# Repetition histogram for `uri`.
df["uri"].value_counts().value_counts()

count
1    3244
2     171
3       8
4       3
Name: count, dtype: int64

In [56]:
# Remove the identifier-like and album/track metadata columns, plus `type`,
# so they are not offered to the model as features.
df = df.drop(
    columns=[
        "track_name",
        "track_id",
        "id",
        "track_href",
        "track_album_name",
        "track_album_id",
        "track_album_release_date",
        "analysis_url",
        "uri",
        "playlist_id",
        "type",
    ]
)

In [63]:
# Collapse artists that appear in fewer than 3 rows into a single "other"
# bucket. The frequency table is computed once (recomputing it for every row
# is quadratic), and a missing artist also falls into "other".
artist_counts = df["track_artist"].value_counts()
df["track_artist"] = df["track_artist"].where(
    df["track_artist"].map(artist_counts) >= 3, "other"
)

In [66]:
# Number of rows per artist label in the current `track_artist` column.
df["track_artist"].value_counts()

track_artist
other                      2784
Bad Bunny                    25
Ren Avel                     24
Bnxn                         16
Seyi Vibez                   15
                           ... 
Benjamin                      3
Felicia Takman                3
Lady Gaga                     3
Dropkick Murphys              3
21 Savage, Metro Boomin       3
Name: count, Length: 179, dtype: int64

In [67]:
# Columns meant to be treated as categorical — presumably the `cat_features`
# argument of DecisionTreeClassifier.fit.
# NOTE(review): no visible cell passes this list to a model — confirm usage.
cat_features = [
    "playlist_genre",
    "track_artist",
    "playlist_name",
    "playlist_subgenre",
    "key",
    "mode",
    "time_signature",
]

In [68]:
# Preview the first rows of the current frame.
df.head()

Unnamed: 0,energy,tempo,danceability,playlist_genre,loudness,liveness,valence,track_artist,time_signature,speechiness,playlist_name,instrumentalness,mode,key,duration_ms,acousticness,playlist_subgenre,popularity
0,0.00948,67.237,0.188,wellness,-37.23,0.109,0.0738,other,4.0,0.0376,Yoga & Meditation,0.96,0.0,7.0,151340.0,0.971,yoga,46
1,0.413,94.938,0.494,pop,-10.432,0.193,0.273,Gigi Perez,4.0,0.0254,Global Top 50,6.7e-05,1.0,11.0,211979.0,0.682,global,93
2,0.386,142.127,0.519,world,-12.732,0.375,0.313,Yume.Play,4.0,0.0375,Chinese Traditional,0.773,0.0,3.0,124620.0,0.68,chinese,34
3,0.898,132.027,0.779,gospel,-4.589,0.182,0.675,other,4.0,0.085,Modern Gospel,0.142,1.0,8.0,1034000.0,0.022,modern,30
4,0.409,170.071,0.714,electronic,-6.476,0.0883,0.497,other,4.0,0.0787,French Touch,0.0,1.0,0.0,136324.0,0.0821,french,17


In [69]:
# train_test_split returns the splits grouped per input array:
# (X_train, X_test, y_train, y_test). Unpacking them in any other order
# silently swaps features and targets.
# NOTE(review): test_size=0.7 holds out 70% of the rows for testing — confirm
# that this (rather than 0.3) is intended.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop("popularity", axis=1), df["popularity"], test_size=0.7, random_state=42
)

In [70]:
# Boost four default-configured decision trees, then score the predictions on
# the test split with plain accuracy.
# NOTE(review): the call below passes only (X, y) to `fit`, so the
# `cat_features` list defined earlier never reaches the trees and the string
# columns are handled as numeric features — confirm how they should be passed.
adaboost = AdaBoostClassifierScratch(
    estimator=DecisionTreeClassifier(),
    n_estimators=4,
)
adaboost.fit(X_train, y_train)
y_pred = adaboost.predict(X_test)
accuracy_score(y_test, y_pred)