In [257]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
import matplotlib.pyplot as plt

In [181]:
# 勾配法と言いつつ、勾配は計算していない。残差による再学習を繰り返す。
class GBDT:
    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.trees = []

    def fit(self, X, y):
        self.F0 = y.mean()
        # 初期予測値は、目的変数の平均値
        Fm = np.repeat(self.F0, y.shape[0])

        # 残差を0に近づけるように学習を繰り返す
        # ポイントは、yを予測するモデルを作っているのではなく、残差を予測するモデルを作っているということ。
        # つまり、残差を予測するモデルを作って、それを予測値に足していくことで、残差を0に近づけている。
        for _ in range(self.n_estimators):
            # 残差を計算
            residual = y - Fm

            # 残差に対する決定木の学習
            tree = DecisionTreeRegressor(max_depth=self.max_depth)
            tree.fit(X, residual)
            self.trees.append(tree)

            # 予測値を更新
            # predictは残差の予測値を返す。つまりFが残差分更新されると、次の残差は0に近づく。
            Fm += self.learning_rate * tree.predict(X)

    def predict(self, X):
        Fm = np.repeat(self.F0, X.shape[0])
        # 予測値は、初期予測値 + 残差の予測値の総和
        # 前のモデルの残差の残差を予測するモデルを作ったので、それらを足し合わせることで、より正確な残差を予測できる。
        # それを最初の予測値に足せば、予測値 + 正確な残差 = 正確な予測値が得られる。
        pred = Fm + self.learning_rate * np.sum([tree.predict(X) for tree in self.trees], axis=0)
        return np.where(pred > 0, 1, 0)

In [182]:
data = load_breast_cancer()
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [183]:
gbdt = GBDT()
gbdt.fit(X_train, y_train)
y_pred = gbdt.predict(X_test)
scratch_score = accuracy_score(y_test, y_pred)

gbdt = GradientBoostingClassifier()
gbdt.fit(X_train, y_train)
y_pred = gbdt.predict(X_test)
sklearn_score = accuracy_score(y_test, y_pred)

print(f"scratch: {scratch_score}, sklearn: {sklearn_score}")
print(f"diff: {scratch_score - sklearn_score}")

scratch: 0.8362573099415205, sklearn: 0.9590643274853801
diff: -0.12280701754385959


In [263]:
class Lad_GBDT(object):
    def __init__(self, n_estimators: int = 100, max_depth: int = 3) -> None:
        self.max_depth = max_depth
        self.n_estimators = n_estimators
        self.trees = []
        self.regions = []
        self.gammas = []
        self.f0 = 0

    def __compute_gammas(self, y_pred: np.array, y: np.array, fm: np.array) -> tuple[np.array, dict]:
        # gammaは、残差を元に予測した値を使って、fmを更新する変化量
        gamma_jm = np.zeros(y.shape[0])
        regions = np.unique(y_pred)
        gamma = {}
        for r in regions:
            idx = y_pred == r
            fm_r = fm[idx]
            y_r = y[idx]
            gamma_r = np.median(y_r - fm_r)
            gamma_jm[idx] = gamma_r
            gamma[r] = gamma_r

        self.regions.append(regions)
        return (gamma_jm, gamma)

    def fit(self, X: np.array, y :np.array) -> None:
        self.trees = []
        self.regions = []
        self.tree_weights = []

        self.f0 = np.median(y)
        fm = np.repeat(self.f0, y.shape[0])

        for _ in range(self.n_estimators):
            residual = y - fm
            y_tilda = np.sign(residual)

            tree = DecisionTreeRegressor(max_depth=self.max_depth)
            tree.fit(X, y_tilda)

            y_pred = tree.predict(X)
            gamma_jm, gamma = self.__compute_gammas(y_pred, y, fm)
            fm += gamma_jm
            self.trees.append(tree)
            self.gammas.append(gamma)
        
    def predict(self, X: np.array) -> np.array:
        fm = np.repeat(self.f0, X.shape[0])
        for tree, gamma, regions in zip(self.trees, self.gammas, self.regions):
            y_pred = tree.predict(X)
            for r in regions:
                idx = y_pred == r
                fm[idx] += gamma[r] 
        return(fm)

In [264]:
lad = Lad_GBDT()
lad.fit(X_train, y_train)
y_pred = lad.predict(X_test)
accuracy_score(y_test, y_pred)

0.9590643274853801