In [77]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import ParameterGrid
import matplotlib.pyplot as plt

from goss import SimpleGOSS

data = pd.read_csv("./data/boston.csv")
X = data.drop("target", axis=1)
y = data["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# best_score = 1000
# iter = 0
# for i in range(200):
#     print(i)
model = SimpleGOSS(n_trees=50, learning_rate=0.01, a=0.1, b=0.1, max_depth=8, random_state=42, max_bin=5)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
score = mean_squared_error(y_test, y_pred)
# if score < best_score:
#     best_score = score
#     iter = i+1
print("MSE:", mean_squared_error(y_test, y_pred))

MSE: 29.935871759791905


In [78]:
params = {'n_trees': [50, 70, 100, 150, 200, 300],
          'learning_rate': [0.001, 0.005, 0.01, 0.05, 0.1],
          'a': [0.1, 0.2, 0.3, 0.4],
          'b': [0.1, 0.2, 0.3, 0.4],
          'max_depth': list(np.arange(3, 10)),
          'max_bin': [5, 10, 20, 50, 100, 255],
          }
grid = ParameterGrid(params)

best_score = 10000
best_param = None

total = len(grid)
for i, param in enumerate(grid):
    print(f'{i+1}/{total}')
    model = SimpleGOSS(**param)
    model.fit(X_train, y_train)
    y_pred_test = model.predict(X_test)
    score = mean_squared_error(y_test, y_pred_test)

    if score < best_score:
        best_score = score
        best_param = param

1/20160
2/20160
3/20160
4/20160
5/20160
6/20160
7/20160
8/20160
9/20160
10/20160
11/20160
12/20160
13/20160
14/20160
15/20160
16/20160
17/20160
18/20160
19/20160
20/20160
21/20160
22/20160
23/20160
24/20160
25/20160
26/20160
27/20160
28/20160
29/20160
30/20160
31/20160
32/20160
33/20160
34/20160
35/20160
36/20160
37/20160
38/20160
39/20160
40/20160
41/20160
42/20160
43/20160
44/20160
45/20160
46/20160
47/20160
48/20160
49/20160
50/20160
51/20160
52/20160
53/20160
54/20160
55/20160
56/20160
57/20160
58/20160
59/20160
60/20160
61/20160
62/20160
63/20160
64/20160
65/20160
66/20160
67/20160
68/20160
69/20160
70/20160
71/20160
72/20160
73/20160
74/20160
75/20160
76/20160
77/20160
78/20160
79/20160
80/20160
81/20160
82/20160
83/20160
84/20160
85/20160
86/20160
87/20160
88/20160
89/20160
90/20160
91/20160
92/20160
93/20160
94/20160
95/20160
96/20160
97/20160
98/20160
99/20160
100/20160
101/20160
102/20160
103/20160
104/20160
105/20160
106/20160
107/20160
108/20160
109/20160
110/20160
111/2016

In [79]:
print(best_score)
print(best_param)

16.46282935409331
{'a': 0.3, 'b': 0.1, 'learning_rate': 0.05, 'max_bin': 20, 'max_depth': 8, 'n_trees': 50}


In [None]:
model.plot_grad_with_iter(10)

In [None]:
model.plot_grads()

In [None]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

def load_boston_dataset():
    data = fetch_openml(name='Boston', version=1, as_frame=False)
    X, y = data.data, data.target
    features = data.feature_names
    return X, y, features

In [None]:
X, y, feats = load_boston_dataset()
df = pd.DataFrame(X, columns=feats)
df['target'] = y

df.to_csv('boston.csv', index=False)

In [None]:
pd.Series(y)

In [None]:
class SimpleGOSS:
    def __init__(self, n_trees=100, learning_rate=0.01, a=0.2, b=0.1, max_depth=4):
        self.n_trees = n_trees
        self.learning_rate = learning_rate
        self.a = a
        self.b = b
        self.max_depth = max_depth
        self.trees = []
        self.costs = []

    # MSE
    def _calc_cost(self, y, y_pred):
        return np.mean((y - y_pred) ** 2)

    # MSEの負の勾配
    # MSEの勾配は: -2 * (y - y_pred) / len(y)
    # MSEの負の勾配を使っても良いが、定数倍は勾配降下法の学習率で調整できるため、簡潔に残差を使う。
    # ただし残差はMSEの負の勾配に比例するので、採用して問題ない。
    def _calc_gradients(self, y, y_pred):
        return y - y_pred
        # return 2 * (y - y_pred) / len(y)

    def _goss_sampling(self, X, top_n, rand_n, grads):
        top_indices = np.argpartition(np.abs(grads), -top_n)[-top_n:]
        rand_indices = np.setdiff1d(np.arange(len(X)), top_indices)
        # 論文に忠実な実装
        # rand_indices = np.random.choice(np.setdiff1d(np.arange(len(X)), top_indices), size=rand_n, replace=False)
        # こっちの方が精度が出る
        print(f"top_indices: {top_indices}, rand_indices: {rand_indices}")
        rand_indices = np.random.choice(rand_indices, size=rand_n, replace=False, p=np.abs(grads[rand_indices]) / np.abs(grads[rand_indices]).sum())
        used_indices = np.concatenate([top_indices, rand_indices])
        return used_indices

    def fit(self, X, y):
        # X = np.array(X)
        # y = np.array(y)
        np.random.seed(42)

        self.F0 = y.mean()
        Fm = np.repeat(self.F0, X.shape[0])

        top_n = int(self.a * len(X))
        rand_n = int(self.b * len(X))

        for _ in range(self.n_trees):
            grads = self._calc_gradients(y, Fm)

            used_indices = self._goss_sampling(X, top_n, rand_n, grads)

            # 重みを計算
            # ランダムサンプリングしたデータに重みをつける
            # 論文に忠実な実装
            # top_wight = np.repeat(self.a / top_n, top_n)
            # rand_weight = np.repeat((1 - self.a) / self.b, rand_n)
            # weight = np.concatenate([top_wight, rand_weight])
            # こっちの方が精度が出る
            weight = np.abs(grads[used_indices])

            tree = DecisionTreeRegressor(max_depth=self.max_depth, random_state=42)
            tree.fit(X[used_indices], grads[used_indices], sample_weight=weight)

            self.costs.append(self._calc_cost(y[used_indices], Fm[used_indices]))

            # Fmを更新
            Fm += self.learning_rate * tree.predict(X)

            self.trees.append(tree)
        return self

    def predict(self, X):
        Fm = np.repeat(self.F0, X.shape[0])
        pred = Fm + self.learning_rate * np.sum([tree.predict(X) for tree in self.trees], axis=0)
        return pred

In [9]:
from sklearn.model_selection import ParameterGrid

X, y = load_boston_dataset()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

params = {'n_trees': [50, 70, 100, 150, 200, 300], 'learning_rate': [0.001, 0.005, 0.01, 0.05, 0.1], 'a': [0.1, 0.2, 0.3, 0.4], 'b': [0.1, 0.2, 0.3, 0.4], 'max_depth': list(np.arange(3, 10))}
grid = ParameterGrid(params)

best_score = 10000
best_param = None

total = len(grid)
for i, param in enumerate(grid):
    print(f'{i+1}/{total}')
    model = SimpleGOSS(**param)
    model.fit(X_train, y_train)
    y_pred_test = model.predict(X_test)
    score = mean_squared_error(y_test, y_pred_test)

    if score < best_score:
        best_score = score
        best_param = param

NameError: name 'load_boston_dataset' is not defined

In [None]:
print(best_score)
print(best_param)

In [5]:
# X, y = load_boston_dataset()
data = pd.read_csv("boston.csv")
X = data.drop("target", axis=1)
y = data["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

goss = SimpleGOSS(n_trees=70, learning_rate=0.1, a=0.2, b=0.4, max_depth=7)
goss.fit(X_train, y_train)
y_pred_train = goss.predict(X_train)
y_pred_test = goss.predict(X_test)

print(f"Train error: {mean_squared_error(y_train, y_pred_train):.2f}")
print(f"Test error: {mean_squared_error(y_test, y_pred_test):.2f}")

plt.plot(goss.costs)

FileNotFoundError: [Errno 2] No such file or directory: 'boston.csv'

In [None]:
tree = DecisionTreeRegressor(max_depth=5)
tree.fit(X_train, y_train)
y_pred_train = tree.predict(X_train)
y_pred_test = tree.predict(X_test)
print(f"Train error: {mean_squared_error(y_train, y_pred_train):.2f}")
print(f"Test error: {mean_squared_error(y_test, y_pred_test):.2f}")

In [None]:
class GBDT:
    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.trees = []
        self.costs = []
    
    def _calc_cost(self, y, y_pred):
        return np.mean(np.abs(y - y_pred))
    
    def _calc_gradients(self, y, y_pred):
        return y - y_pred

    def fit(self, X, y):
        self.F0 = y.mean()
        Fm = np.repeat(self.F0, y.shape[0])

        for _ in range(self.n_estimators):
            # 残差を計算
            grads = self._calc_gradients(y, Fm)
            self.costs.append(self._calc_cost(y, Fm))

            tree = DecisionTreeRegressor(max_depth=self.max_depth, random_state=42)
            tree.fit(X, grads)
            self.trees.append(tree)

            Fm += self.learning_rate * tree.predict(X)

    def predict(self, X):
        Fm = np.repeat(self.F0, X.shape[0])
        pred = Fm + self.learning_rate * np.sum([tree.predict(X) for tree in self.trees], axis=0)
        return pred

In [None]:
tree = GBDT(max_depth=6)
tree.fit(X_train, y_train)
y_pred_train = tree.predict(X_train)
y_pred_test = tree.predict(X_test)
print(f"Train error: {mean_squared_error(y_train, y_pred_train):.2f}")
print(f"Test error: {mean_squared_error(y_test, y_pred_test):.2f}")

plt.plot(tree.costs)