In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
import networkx as nx

from efb import SimpleEFB

In [251]:
# Load ./data/boston.csv; the "target" column is the label and every other
# column is used as a feature.
data = pd.read_csv("./data/boston.csv")
y = data["target"]
X = data.drop(columns=["target"])

# Hold out 20% of the rows for evaluation, with a fixed seed so the split is
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Disabled experiment: one-hot encode CHAS or RAD (optionally adding a constant
# ADD column) and split on the encoded frame instead of the raw one.
# X_onehot = pd.concat([X, pd.get_dummies(data["CHAS"], prefix="CHAS", dtype=int)], axis=1)
# X_onehot.drop("CHAS", axis=1, inplace=True)

# X_onehot = pd.concat([X, pd.get_dummies(data["RAD"], prefix="RAD", dtype=int)], axis=1)
# X_onehot.drop("RAD", axis=1, inplace=True)
# X_onehot['ADD'] = 1

# X_train, X_test, y_train, y_test = train_test_split(X_onehot, y, test_size=0.2, random_state=42)

In [329]:
# Fit the SimpleEFB model (imported from the local `efb` module) on the raw
# training split and report its mean squared error on the held-out split.
efb = SimpleEFB(n_trees=700, learning_rate=0.002, max_depth=9, random_state=42, max_bin=4)
efb.fit(X_train, y_train)
y_pred = efb.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"mse: {mse}")

mse: 22.682990593076475


In [252]:
max_bin = 255

def _get_bins(X: pd.Series) -> int:
    unique_cnt = X.nunique()
    return min(unique_cnt, max_bin)

In [253]:
def _create_weighted_feature_graph(X: pd.DataFrame) -> nx.Graph:
    """Build the feature-conflict graph used for exclusive feature bundling.

    Every column of ``X`` becomes a node. Each column is discretised with
    ``pd.cut`` into ``_get_bins(column)`` equal-width bins, and bin 0 is
    treated as that feature's "zero" value. Two features conflict on a row
    when both have a nonzero bin on that row; the number of such rows is
    stored as the ``weight`` of the edge joining the two features. Pairs that
    never conflict get no edge.

    Args:
        X: Feature matrix with unique column names. The binned codes are cast
            with ``astype(int)``, so columns are expected to be numeric and
            free of missing values.

    Returns:
        An undirected ``networkx`` graph with one node per column and one
        weighted edge per conflicting feature pair.
    """
    feats = list(X.columns)

    G = nx.Graph()
    G.add_nodes_from(feats)

    # Bin each column exactly once (instead of once per feature pair) and keep
    # only what the conflict count needs: the "bin is nonzero" mask.
    nonzero = {
        feat: pd.cut(X[feat], bins=_get_bins(X[feat]), labels=False).astype(int) != 0
        for feat in feats
    }

    for i, feature_i in enumerate(feats):
        for feature_j in feats[i + 1:]:
            # A conflict is a row where both features are nonzero at the same
            # time. Bin ids of two different features are not comparable, so
            # the ids themselves are deliberately not compared.
            conflicts = int((nonzero[feature_i] & nonzero[feature_j]).sum())

            if conflicts > 0:
                G.add_edge(feature_i, feature_j, weight=conflicts)
    return G

In [254]:
def _greedy_bundling(G: nx.Graph, total_sample_cnt, threshold=None) -> dict:
    """Greedily partition the nodes of a conflict graph into bundles.

    Nodes are visited in descending order of weighted degree. A node joins the
    first existing bundle whose summed edge weight to that node does not
    exceed ``threshold``; if no bundle qualifies, the node starts a new one.

    Args:
        G: Graph whose edges carry a ``weight`` attribute.
        total_sample_cnt: Number of samples; only used to derive the default
            threshold.
        threshold: Maximum total edge weight tolerated between a node and a
            bundle. Defaults to ``int(total_sample_cnt / 10000)``.

    Returns:
        Dict mapping consecutive integer bundle ids (starting at 0) to the
        list of node names in that bundle, in insertion order.
    """
    if threshold is None:
        threshold = int(total_sample_cnt / 10000)

    by_weighted_degree = sorted(
        G.degree(weight='weight'),
        key=lambda node_and_degree: node_and_degree[1],
        reverse=True,
    )

    bundles = {}
    for node, _ in by_weighted_degree:
        neighbours = G[node]
        for members in bundles.values():
            # Total edge weight between this node and the bundle's members.
            conflict_weight = sum(
                neighbours[other]['weight']
                for other in members
                if other in neighbours
            )
            if conflict_weight <= threshold:
                members.append(node)
                break
        else:
            # No existing bundle could take the node: open a new one.
            bundles[len(bundles)] = [node]

    return bundles

In [321]:
def _merge_exclusive_feature(X: pd.DataFrame, bundles: dict) -> pd.DataFrame:
    df = pd.DataFrame()

    for i, feats in enumerate(bundles.values()):
        bin_ranges = {}
        total_bin = 0

        # offsetを計算
        for feat in feats:
            # 各特徴量のbin数を加算していくことで、offsetを計算する
            total_bin += _get_bins(X[feat])
            bin_ranges[feat] = total_bin
        
        bin = pd.Series(np.zeros(len(X), dtype=int))
        for feat in feats:
            bin_df = pd.cut(X[feat], bins=_get_bins(X[feat]), labels=False)
            zero_mask = bin_df == 0

            # bin値が0の場合は、offsetを加算しない
            # そもそもここで加算している特徴量同士は、同時にnonzero値を取らないことが前提であるため、
            # offsetの加算により、どの特徴量のbin値かが一意に特定できる。0にoffsetを加算すると、この一意性が失われる。
            bin_df += bin_ranges[feat]
            bin_df[zero_mask] = 0

            bin += bin_df

        df[i] = bin

    return df

In [322]:
# Trivial bundling: every feature sits alone in its own single-element bundle.
features = list(X_train.columns)
b = {idx: [name] for idx, name in enumerate(features)}

random_state = 42
np.random.seed(random_state)

# Work on positionally re-indexed (0..n-1) copies of the training split.
X = X_train.reset_index(drop=True)
y = y_train.reset_index(drop=True)

# Constant initial prediction: the mean target, repeated once per training row.
F0 = y.mean()
Fm = np.repeat(F0, X.shape[0])

# Build the conflict graph and the greedy bundles from the raw features.
G = _create_weighted_feature_graph(X)
bundles = _greedy_bundling(G, X.shape[0])

# NOTE(review): the merge uses the singleton mapping `b`, not the `bundles`
# computed just above - confirm this is intended.
X = _merge_exclusive_feature(X, b)

# X = np.array(X)
# y = np.array(y)

In [323]:
# Per-column maximum of the merged training matrix, i.e. the largest merged
# bin value produced for each bundle column.
X.max()

0     509
1      49
2     143
3       3
4     159
5     509
6     509
7     509
8      17
9     127
10     89
11    509
12    509
dtype: int64

In [325]:
# Baseline: a depth-5 decision tree trained on the raw (unbundled) features.
baseline_tree = DecisionTreeRegressor(max_depth=5, random_state=random_state)
baseline_tree.fit(X_train, y_train)
y_pred = baseline_tree.predict(X_test)
print(mean_squared_error(y_test, y_pred))

8.553906584646844


In [326]:
# Re-index the held-out split positionally and merge its features with the
# same singleton bundle mapping `b` that was used for the training matrix.
X_t = X_test.reset_index(drop=True)
y_t = y_test.reset_index(drop=True)
X_test_bundled = _merge_exclusive_feature(X_t, b)

# Train trees of depth 1..10 on the merged training matrix and print the
# held-out MSE for each depth.
# NOTE(review): _merge_exclusive_feature bins whichever frame it receives, so
# the test rows are binned independently of the training rows; this may be why
# the error grows with depth - confirm.
for depth in range(1, 11):
    tree = DecisionTreeRegressor(max_depth=depth, random_state=random_state)
    tree.fit(X, y_train)
    y_pred = tree.predict(X_test_bundled)
    print(mean_squared_error(y_t, y_pred))

75.77190795068026
76.64684038569845
601.7251348039217
886.2545098039217
886.2545098039217
886.2545098039217
886.2545098039217
886.2545098039217
886.2545098039217
886.2545098039217


In [302]:
# Display the held-out targets (they still carry the original row labels).
y_test

173    23.6
274    32.4
491    13.6
72     22.8
452    16.1
       ... 
412    17.9
436     9.6
411    17.2
86     22.5
75     21.4
Name: target, Length: 102, dtype: float64

In [303]:
# Display the raw held-out features for comparison with the merged version.
X_test

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
173,0.09178,0.0,4.05,0.0,0.510,6.416,84.1,2.6463,4.0,296.0,16.6,395.50,9.04
274,0.05644,40.0,6.41,1.0,0.447,6.758,32.9,4.0776,3.0,254.0,17.6,396.90,3.53
491,0.10574,0.0,27.74,0.0,0.609,5.983,98.8,1.8681,3.0,711.0,20.1,390.11,18.07
72,0.09164,0.0,10.81,0.0,0.413,6.065,7.8,5.2873,3.0,305.0,19.2,390.91,5.52
452,5.09017,0.0,18.10,0.0,0.713,6.297,91.8,2.3682,8.0,666.0,20.2,385.09,17.27
...,...,...,...,...,...,...,...,...,...,...,...,...,...
412,18.81100,0.0,18.10,0.0,0.597,4.628,100.0,1.5539,8.0,666.0,20.2,28.79,34.37
436,14.42080,0.0,18.10,0.0,0.740,6.461,93.3,2.0026,8.0,666.0,20.2,27.49,18.05
411,14.05070,0.0,18.10,0.0,0.597,6.657,100.0,1.5275,8.0,666.0,20.2,35.05,21.22
86,0.05188,0.0,4.49,0.0,0.449,6.015,45.1,4.4272,2.0,247.0,18.5,395.99,12.86


In [304]:
# Display the merged (binned and offset) held-out features.
X_test_bundled

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0,0,48,0,66,162,157,110,13,47,43,145,118
1,0,19,52,3,59,169,110,125,12,43,46,145,101
2,0,0,85,0,77,152,170,103,12,77,55,144,144
3,0,0,59,0,55,154,87,137,12,47,52,144,107
4,113,0,70,0,88,159,164,108,17,74,55,143,142
...,...,...,...,...,...,...,...,...,...,...,...,...,...
97,143,0,70,0,75,122,171,99,17,74,55,77,192
98,134,0,70,0,91,163,165,104,17,74,55,76,144
99,133,0,70,0,75,167,171,99,17,74,55,78,153
100,0,0,49,0,59,153,121,128,11,43,49,145,129


In [334]:
# Preview the learning-rate values used by the hyper-parameter search grid.
np.arange(0.001, 0.5, 0.005)

array([0.001, 0.006, 0.011, 0.016, 0.021, 0.026, 0.031, 0.036, 0.041,
       0.046, 0.051, 0.056, 0.061, 0.066, 0.071, 0.076, 0.081, 0.086,
       0.091, 0.096, 0.101, 0.106, 0.111, 0.116, 0.121, 0.126, 0.131,
       0.136, 0.141, 0.146, 0.151, 0.156, 0.161, 0.166, 0.171, 0.176,
       0.181, 0.186, 0.191, 0.196, 0.201, 0.206, 0.211, 0.216, 0.221,
       0.226, 0.231, 0.236, 0.241, 0.246, 0.251, 0.256, 0.261, 0.266,
       0.271, 0.276, 0.281, 0.286, 0.291, 0.296, 0.301, 0.306, 0.311,
       0.316, 0.321, 0.326, 0.331, 0.336, 0.341, 0.346, 0.351, 0.356,
       0.361, 0.366, 0.371, 0.376, 0.381, 0.386, 0.391, 0.396, 0.401,
       0.406, 0.411, 0.416, 0.421, 0.426, 0.431, 0.436, 0.441, 0.446,
       0.451, 0.456, 0.461, 0.466, 0.471, 0.476, 0.481, 0.486, 0.491,
       0.496])

In [335]:
from sklearn.model_selection import ParameterGrid

def simple_hyperparameter_search(X_train, y_train, X_test, y_test, param=None):
    """Exhaustively search a SimpleEFB hyper-parameter grid.

    Every combination in the grid is used to construct a ``SimpleEFB``, which
    is fitted on ``(X_train, y_train)`` and scored by mean squared error on
    ``(X_test, y_test)``. Progress is printed every 30 candidates.

    Args:
        X_train: Training features passed to ``SimpleEFB.fit``.
        y_train: Training targets passed to ``SimpleEFB.fit``.
        X_test: Features the candidates are scored on.
        y_test: Targets the candidates are scored on.
        param: Optional grid specification accepted by
            ``sklearn.model_selection.ParameterGrid`` (a dict mapping a
            parameter name to the values to try). When omitted, the built-in
            grid over ``n_trees``, ``learning_rate`` and ``max_depth`` is
            used; it contains 198000 combinations, each needing a full fit.

    Returns:
        Tuple ``(best_param, best_score)``: the parameter dict with the lowest
        MSE and that MSE. If no candidate yields a comparable score, this is
        ``(None, inf)``.

    Note:
        The score is computed on the same data that is passed as the test
        split, so the returned MSE is an optimistic estimate for the selected
        parameters.
    """
    if param is None:
        param = {
            "n_trees": np.arange(10, 1000, 10),
            "learning_rate": np.arange(0.001, 0.5, 0.005),
            "max_depth": np.arange(3, 100, 5),
        }
    param_grid = ParameterGrid(param)

    # Start from +inf rather than a fixed number so that the best candidate is
    # always recorded, however large its error is.
    best_score = float("inf")
    best_param = None
    total = len(param_grid)
    for i, p in enumerate(param_grid):
        if i % 30 == 0:
            print(f"{i}/{total}")
        efb = SimpleEFB(**p)
        efb.fit(X_train, y_train)
        y_pred = efb.predict(X_test)
        score = mean_squared_error(y_test, y_pred)

        if score < best_score:
            best_score = score
            best_param = p

    return best_param, best_score

In [336]:
# Launch the exhaustive grid search. The default grid enumerates 198000
# candidates, each requiring a full model fit, so this cell is long-running.
simple_hyperparameter_search(X_train, y_train, X_test, y_test)

0/198000
30/198000
Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/Users/shuomura/workspace/math/venv/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3505, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/ps/ryfzkqr13636fgz22v2x5h1h0000gn/T/ipykernel_97970/587577965.py", line 1, in <module>
    simple_hyperparameter_search(X_train, y_train, X_test, y_test)
  File "/var/folders/ps/ryfzkqr13636fgz22v2x5h1h0000gn/T/ipykernel_97970/3656766139.py", line 18, in simple_hyperparameter_search
    efb.fit(X_train, y_train)
  File "/Users/shuomura/workspace/math/lightgbm/efb.py", line 139, in fit
  File "/Users/shuomura/workspace/math/venv/lib/python3.11/site-packages/sklearn/tree/_classes.py", line 1247, in fit
    super().fit(
  File "/Users/shuomura/workspace/math/venv/lib/python3.11/site-packages/sklearn/tree/_classes.py", line 379, in fit
    builder.build(self.tree_, X, y, sample_weight)
KeyboardInterrupt

During handling of the above exception, another e

In [30]:
# Fit SimpleEFB with shallower trees (max_depth=4) and more of them
# (n_trees=1000), then compare the error on the training split with the error
# on the held-out split.
efb = SimpleEFB(n_trees=1000, learning_rate=0.002, max_depth=4, random_state=42, max_bin=4)
efb.fit(X_train, y_train)
y_pred_train = efb.predict(X_train)
y_pred_test = efb.predict(X_test)

# Report both errors with four decimal places.
print(f"Train MSE: {mean_squared_error(y_train, y_pred_train):.4f}")
print(f"Test MSE: {mean_squared_error(y_test, y_pred_test):.4f}")

Train MSE: 15.2900
Test MSE: 21.6233
