-
Notifications
You must be signed in to change notification settings - Fork 1
/
HHCART.py
167 lines (142 loc) · 5.97 KB
/
HHCART.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
from sklearn.base import BaseEstimator
from scipy.linalg import norm
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
import numpy as np
from copy import deepcopy
class HHCARTNode:
def __init__(self, depth, labels, **kwargs):
self.depth = depth
self.labels = labels
self.is_leaf = kwargs.get('is_leaf', False)
self._split_rules = kwargs.get('split_rules', None)
self._weights = kwargs.get('weights', None)
self._left_child = kwargs.get('left_child', None)
self._right_child = kwargs.get('right_child', None)
if not self.is_leaf:
assert self._split_rules
assert self._left_child
assert self._right_child
def get_child(self, datum):
if self.is_leaf:
raise StandardError("Leaf node does not have children.")
X = deepcopy(datum)
if X.dot(np.array(self._weights[:-1]).T) - self._weights[-1] < 0:
return self.left_child
else:
return self.right_child
@property
def label(self):
if not hasattr(self, '_label'):
self._label = np.mean(self.labels)
return self._label
@property
def split_rules(self):
if self.is_leaf:
raise StandardError("Leaf node does not have split rule.")
return self._split_rules
@property
def left_child(self):
if self.is_leaf:
raise StandardError("Leaf node does not have split rule.")
return self._left_child
@property
def right_child(self):
if self.is_leaf:
raise StandardError("Leaf node does not have split rule.")
return self._right_child
class HouseHolderCART(BaseEstimator):
def __init__(self, impurity, segmentor, method='eig', tau=1e-4, **kwargs):
self.impurity = impurity
self.segmentor = segmentor
self.method = method
self.tau = tau
self._max_depth = kwargs.get('max_depth', None)
self._min_samples = kwargs.get('min_samples', 2)
self._alpha = kwargs.get('alpha', None) #only for linreg method
self._root = None
self._nodes = []
def _terminate(self, X, y, cur_depth):
if self._max_depth != None and cur_depth == self._max_depth:
# maximum depth reached.
return True
elif y.size < self._min_samples:
# minimum number of samples reached.
return True
elif np.unique(y).size == 1:
return True
else:
return False
def _generate_leaf_node(self, cur_depth, y):
node = HHCARTNode(cur_depth, y, is_leaf=True)
self._nodes.append(node)
return node
def _generate_node(self, X, y, cur_depth):
if self._terminate(X, y, cur_depth):
return self._generate_leaf_node(cur_depth, y)
else:
n_objects, n_features = X.shape
impurity_best, sr, left_indices, right_indices = self.segmentor(X, y, self.impurity)
#generate Housholder matrix
if self.method == 'eig':
extractor = PCA(n_components=1)
extractor.fit(X, y)
mu = extractor.components_[0]
if self.method == 'ridge':
reg_model = Ridge(alpha=self._alpha)
reg_model.fit(X, y)
mu = reg_model.coef_
I = np.diag(np.ones(n_features))
check_ = np.sqrt(((I - mu) ** 2).sum(axis=1))
if (check_ > self.tau).sum() > 0:
i = np.argmax(check_)
e = np.zeros(n_features)
e[i] = 1.0
w = (e - mu) / norm(e - mu)
householder_matrix = I - 2 * w[:, np.newaxis].dot(w[:, np.newaxis].T)
X_house = X.dot(householder_matrix)
impurity_house, sr_house, left_indices_house, right_indices_house = self.segmentor(X_house, y, self.impurity)
if impurity_best > impurity_house:
impurity_best = impurity_house
left_indices = left_indices_house
right_indices = right_indices_house
sr = sr_house
else:
householder_matrix = I
if not sr:
return self._generate_leaf_node(cur_depth, y)
i, treshold = sr
weights = np.zeros(n_features + 1)
weights[:-1] = householder_matrix[:, i]
weights[-1] = treshold
X_left, y_left = X[left_indices], y[left_indices]
X_right, y_right = X[right_indices], y[right_indices]
node = HHCARTNode(cur_depth, y,
split_rules=sr,
weights=weights,
left_child=self._generate_node(X_left, y_left, cur_depth + 1),
right_child=self._generate_node(X_right, y_right, cur_depth + 1),
is_leaf=False)
self._nodes.append(node)
return node
def fit(self, X, y):
self._root = self._generate_node(X, y, 0)
def predict(self, X):
def predict_single(datum):
cur_node = self._root
while not cur_node.is_leaf:
cur_node = cur_node.get_child(datum)
return cur_node.label
if not self._root:
raise StandardError("Decision tree has not been trained.")
size = X.shape[0]
predictions = np.empty((size, ), dtype=float)
for i in range(size):
predictions[i] = predict_single(X[i, :])
return predictions
def score(self, data, labels):
if not self._root:
raise StandardError("Decision tree has not been trained.")
predictions = self.predict(data)
correct_count = np.count_nonzero(predictions == labels)
return correct_count / labels.shape[0]