In [77]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import ParameterGrid
import matplotlib.pyplot as plt

from goss import SimpleGOSS

data = pd.read_csv("./data/boston.csv")
X = data.drop("target", axis=1)
y = data["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# best_score = 1000
# iter = 0
# for i in range(200):
#     print(i)
model = SimpleGOSS(n_trees=50, learning_rate=0.01, a=0.1, b=0.1, max_depth=8, random_state=42, max_bin=5)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
score = mean_squared_error(y_test, y_pred)
# if score < best_score:
#     best_score = score
#     iter = i+1
print("MSE:", mean_squared_error(y_test, y_pred))

MSE: 29.935871759791905


In [14]:
params = {'n_trees': [50, 70, 100, 150, 200, 300],
          'learning_rate': [0.001, 0.005, 0.01, 0.05, 0.1],
          'a': [0.1, 0.2, 0.3, 0.4],
          'b': [0.1, 0.2, 0.3, 0.4],
          'max_depth': list(np.arange(3, 10)),
          'max_bin': [5, 10, 20, 50, 100, 255],
          }
grid = ParameterGrid(params)

best_score = 10000
best_param = None

total = len(grid)
for i, param in enumerate(grid):
    print(f'{i+1}/{total}')
    model = SimpleGOSS(**param)
    model.fit(X_train, y_train)
    y_pred_test = model.predict(X_test)
    score = mean_squared_error(y_test, y_pred_test)

    if score < best_score:
        best_score = score
        best_param = param

1/3360
2/3360
3/3360
4/3360
5/3360
6/3360
7/3360
8/3360
9/3360
10/3360
11/3360
12/3360
13/3360
14/3360
15/3360
16/3360
17/3360
18/3360
19/3360
20/3360
21/3360
22/3360
23/3360
24/3360
25/3360
26/3360
27/3360
28/3360
29/3360
30/3360
31/3360
32/3360
33/3360
34/3360
35/3360
36/3360
37/3360
38/3360
39/3360
40/3360
41/3360
42/3360
43/3360
44/3360
45/3360
46/3360
47/3360
48/3360
49/3360
50/3360
51/3360
52/3360
53/3360
54/3360
55/3360
56/3360
57/3360
58/3360
59/3360
60/3360
61/3360
62/3360
63/3360
64/3360
65/3360
66/3360
67/3360
68/3360
69/3360
70/3360
71/3360
72/3360
73/3360
74/3360
75/3360
76/3360
77/3360
78/3360
79/3360
80/3360
81/3360
82/3360
83/3360
84/3360
85/3360
86/3360
87/3360
88/3360
89/3360
90/3360
91/3360
92/3360
93/3360
94/3360
95/3360
96/3360
97/3360
98/3360
99/3360
100/3360
101/3360
102/3360
103/3360
104/3360
105/3360
106/3360
107/3360
108/3360
109/3360
110/3360
111/3360
112/3360
113/3360
114/3360
115/3360
116/3360
117/3360
118/3360
119/3360
120/3360
121/3360
122/3360
123/3360
1

In [16]:
best_param

{'a': 0.1, 'b': 0.1, 'learning_rate': 0.001, 'max_depth': 8, 'n_trees': 50}

In [None]:
model.plot_grad_with_iter(10)

In [None]:
model.plot_grads()

In [None]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

def load_boston_dataset():
    data = fetch_openml(name='Boston', version=1, as_frame=False)
    X, y = data.data, data.target
    features = data.feature_names
    return X, y, features

In [None]:
X, y, feats = load_boston_dataset()
df = pd.DataFrame(X, columns=feats)
df['target'] = y

df.to_csv('boston.csv', index=False)

In [None]:
pd.Series(y)

In [None]:
class SimpleGOSS:
    def __init__(self, n_trees=100, learning_rate=0.01, a=0.2, b=0.1, max_depth=4):
        self.n_trees = n_trees
        self.learning_rate = learning_rate
        self.a = a
        self.b = b
        self.max_depth = max_depth
        self.trees = []
        self.costs = []

    # MSE
    def _calc_cost(self, y, y_pred):
        return np.mean((y - y_pred) ** 2)

    # MSEの負の勾配
    # MSEの勾配は: -2 * (y - y_pred) / len(y)
    # MSEの負の勾配を使っても良いが、定数倍は勾配降下法の学習率で調整できるため、簡潔に残差を使う。
    # ただし残差はMSEの負の勾配に比例するので、採用して問題ない。
    def _calc_gradients(self, y, y_pred):
        return y - y_pred
        # return 2 * (y - y_pred) / len(y)

    def _goss_sampling(self, X, top_n, rand_n, grads):
        top_indices = np.argpartition(np.abs(grads), -top_n)[-top_n:]
        rand_indices = np.setdiff1d(np.arange(len(X)), top_indices)
        # 論文に忠実な実装
        # rand_indices = np.random.choice(np.setdiff1d(np.arange(len(X)), top_indices), size=rand_n, replace=False)
        # こっちの方が精度が出る
        print(f"top_indices: {top_indices}, rand_indices: {rand_indices}")
        rand_indices = np.random.choice(rand_indices, size=rand_n, replace=False, p=np.abs(grads[rand_indices]) / np.abs(grads[rand_indices]).sum())
        used_indices = np.concatenate([top_indices, rand_indices])
        return used_indices

    def fit(self, X, y):
        # X = np.array(X)
        # y = np.array(y)
        np.random.seed(42)

        self.F0 = y.mean()
        Fm = np.repeat(self.F0, X.shape[0])

        top_n = int(self.a * len(X))
        rand_n = int(self.b * len(X))

        for _ in range(self.n_trees):
            grads = self._calc_gradients(y, Fm)

            used_indices = self._goss_sampling(X, top_n, rand_n, grads)

            # 重みを計算
            # ランダムサンプリングしたデータに重みをつける
            # 論文に忠実な実装
            # top_wight = np.repeat(self.a / top_n, top_n)
            # rand_weight = np.repeat((1 - self.a) / self.b, rand_n)
            # weight = np.concatenate([top_wight, rand_weight])
            # こっちの方が精度が出る
            weight = np.abs(grads[used_indices])

            tree = DecisionTreeRegressor(max_depth=self.max_depth, random_state=42)
            tree.fit(X[used_indices], grads[used_indices], sample_weight=weight)

            self.costs.append(self._calc_cost(y[used_indices], Fm[used_indices]))

            # Fmを更新
            Fm += self.learning_rate * tree.predict(X)

            self.trees.append(tree)
        return self

    def predict(self, X):
        Fm = np.repeat(self.F0, X.shape[0])
        pred = Fm + self.learning_rate * np.sum([tree.predict(X) for tree in self.trees], axis=0)
        return pred

In [9]:
from sklearn.model_selection import ParameterGrid

X, y = load_boston_dataset()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

params = {'n_trees': [50, 70, 100, 150, 200, 300], 'learning_rate': [0.001, 0.005, 0.01, 0.05, 0.1], 'a': [0.1, 0.2, 0.3, 0.4], 'b': [0.1, 0.2, 0.3, 0.4], 'max_depth': list(np.arange(3, 10))}
grid = ParameterGrid(params)

best_score = 10000
best_param = None

total = len(grid)
for i, param in enumerate(grid):
    print(f'{i+1}/{total}')
    model = SimpleGOSS(**param)
    model.fit(X_train, y_train)
    y_pred_test = model.predict(X_test)
    score = mean_squared_error(y_test, y_pred_test)

    if score < best_score:
        best_score = score
        best_param = param

NameError: name 'load_boston_dataset' is not defined

In [None]:
print(best_score)
print(best_param)

In [5]:
# X, y = load_boston_dataset()
data = pd.read_csv("boston.csv")
X = data.drop("target", axis=1)
y = data["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

goss = SimpleGOSS(n_trees=70, learning_rate=0.1, a=0.2, b=0.4, max_depth=7)
goss.fit(X_train, y_train)
y_pred_train = goss.predict(X_train)
y_pred_test = goss.predict(X_test)

print(f"Train error: {mean_squared_error(y_train, y_pred_train):.2f}")
print(f"Test error: {mean_squared_error(y_test, y_pred_test):.2f}")

plt.plot(goss.costs)

FileNotFoundError: [Errno 2] No such file or directory: 'boston.csv'

In [None]:
tree = DecisionTreeRegressor(max_depth=5)
tree.fit(X_train, y_train)
y_pred_train = tree.predict(X_train)
y_pred_test = tree.predict(X_test)
print(f"Train error: {mean_squared_error(y_train, y_pred_train):.2f}")
print(f"Test error: {mean_squared_error(y_test, y_pred_test):.2f}")

In [None]:
class GBDT:
    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.trees = []
        self.costs = []
    
    def _calc_cost(self, y, y_pred):
        return np.mean(np.abs(y - y_pred))
    
    def _calc_gradients(self, y, y_pred):
        return y - y_pred

    def fit(self, X, y):
        self.F0 = y.mean()
        Fm = np.repeat(self.F0, y.shape[0])

        for _ in range(self.n_estimators):
            # 残差を計算
            grads = self._calc_gradients(y, Fm)
            self.costs.append(self._calc_cost(y, Fm))

            tree = DecisionTreeRegressor(max_depth=self.max_depth, random_state=42)
            tree.fit(X, grads)
            self.trees.append(tree)

            Fm += self.learning_rate * tree.predict(X)

    def predict(self, X):
        Fm = np.repeat(self.F0, X.shape[0])
        pred = Fm + self.learning_rate * np.sum([tree.predict(X) for tree in self.trees], axis=0)
        return pred

In [None]:
tree = GBDT(max_depth=6)
tree.fit(X_train, y_train)
y_pred_train = tree.predict(X_train)
y_pred_test = tree.predict(X_test)
print(f"Train error: {mean_squared_error(y_train, y_pred_train):.2f}")
print(f"Test error: {mean_squared_error(y_test, y_pred_test):.2f}")

plt.plot(tree.costs)