import copy as cp
import warnings

import numpy as np

from skmultiflow.core import BaseSKMObject, ClassifierMixin, MetaEstimatorMixin
from skmultiflow.drift_detection import ADWIN
from skmultiflow.lazy import KNNADWINClassifier
from skmultiflow.utils import check_random_state
from skmultiflow.utils.utils import get_dimensions


def OnlineBoosting(base_estimator=KNNADWINClassifier(), n_estimators=10, drift_detection=True,
                   random_state=None):  # pragma: no cover
    warnings.warn("'OnlineBoosting' has been renamed to 'OnlineBoostingClassifier' in v0.5.0.\n"
                  "The old name will be removed in v0.7.0", category=FutureWarning)
    return OnlineBoostingClassifier(base_estimator=base_estimator,
                                    n_estimators=n_estimators,
                                    drift_detection=drift_detection,
                                    random_state=random_state)


class OnlineBoostingClassifier(BaseSKMObject, ClassifierMixin, MetaEstimatorMixin):
    r""" Online Boosting ensemble classifier.

    Online Boosting [1]_ is the online version of the boosting ensemble method
    (AdaBoost). AdaBoost focuses more on difficult examples: the examples
    misclassified by the current classifier :math:`h_m` are given more weight
    in the training set of the following learner :math:`h_{m+1}`.

    In the online context there is no training dataset, only a stream of
    samples, so drawing samples with replacement cannot be trivially executed.
    The strategy adopted by the Online Boosting algorithm is to simulate this
    task by training each arriving sample K times, where K is drawn from a
    binomial distribution. Since the data stream can be considered infinite,
    and with infinite samples the binomial distribution :math:`Binomial(p, N)`
    tends to a :math:`Poisson(\lambda)` distribution with
    :math:`\lambda = Np`, K is drawn from :math:`Poisson(\lambda)` instead.
    :math:`\lambda` is computed by tracking the total weights of the correctly
    classified and misclassified examples.

    This online ensemble learner method is improved by the addition of an
    ADWIN change detector. ADWIN stands for Adaptive Windowing. It keeps
    updated statistics of a variable-sized window, so it can detect changes
    and perform cuts in its window to better adapt the learning algorithms.

    Parameters
    ----------
    base_estimator: skmultiflow.core.BaseSKMObject or sklearn.BaseEstimator
        (default=KNNADWINClassifier) Each member of the ensemble is an
        instance of the base estimator.

    n_estimators: int, optional (default=10)
        The size of the ensemble, in other words, how many classifiers to
        train.

    drift_detection: bool, optional (default=True)
        A drift detector (ADWIN) can be used by the method to track the
        performance of the classifiers and adapt when a drift is detected.

    random_state: int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Raises
    ------
    NotImplementedError: A few of the functions described here are not
        implemented since they have no application in this context.

    ValueError: A ValueError is raised if the 'classes' parameter is not
        passed in the first partial_fit call.

    References
    ----------
    .. [1] B. Wang and J. Pineau, "Online Bagging and Boosting for Imbalanced
       Data Streams," in IEEE Transactions on Knowledge and Data Engineering,
       vol. 28, no. 12, pp. 3353-3366, 1 Dec. 2016.
       doi: 10.1109/TKDE.2016.2609424

    Examples
    --------
    >>> # Imports
    >>> from skmultiflow.data import SEAGenerator
    >>> from skmultiflow.meta import OnlineBoostingClassifier
    >>>
    >>> # Setup a data stream
    >>> stream = SEAGenerator(random_state=1)
    >>>
    >>> # Setup variables to control loop and track performance
    >>> n_samples = 0
    >>> correct_cnt = 0
    >>> max_samples = 200
    >>>
    >>> # Setup the Online Boosting Classifier
    >>> online_boosting = OnlineBoostingClassifier()
    >>>
    >>> # Train the classifier with the samples provided by the data stream
    >>> while n_samples < max_samples and stream.has_more_samples():
    >>>     X, y = stream.next_sample()
    >>>     y_pred = online_boosting.predict(X)
    >>>     if y[0] == y_pred[0]:
    >>>         correct_cnt += 1
    >>>     online_boosting.partial_fit(X, y, classes=stream.target_values)
    >>>     n_samples += 1
    >>>
    >>> # Display results
    >>> print('{} samples analyzed.'.format(n_samples))
    >>> print('Online Boosting performance: {}'.format(correct_cnt / n_samples))
    """

    def __init__(self,
                 base_estimator=KNNADWINClassifier(),
                 n_estimators=10,
                 drift_detection=True,
                 random_state=None):
        super().__init__()
        self.base_estimator = base_estimator
        self._init_n_estimators = n_estimators
        self.random_state = random_state
        self.drift_detection = drift_detection
        # default values
        self.ensemble = None
        self.actual_n_estimators = None
        self.classes = None
        self._random_state = None
        self.adwin_ensemble = None
        self.lam_sc = None
        self.lam_sw = None
        self.epsilon = None
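
    # Per-member state set up below: lam_sc[i] / lam_sw[i] accumulate the
    # weight of the samples member i classified correctly / wrongly,
    # epsilon[i] is its running error lam_sw[i] / (lam_sw[i] + lam_sc[i]),
    # and adwin_ensemble[i] monitors its accuracy for drift detection.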
    def __configure(self):
        if hasattr(self.base_estimator, "reset"):
            self.base_estimator.reset()
        self.actual_n_estimators = self._init_n_estimators
        self.adwin_ensemble = []
        for i in range(self.actual_n_estimators):
            self.adwin_ensemble.append(ADWIN())
        self.ensemble = [cp.deepcopy(self.base_estimator) for _ in range(self.actual_n_estimators)]
        self._random_state = check_random_state(self.random_state)
        self.lam_sc = np.zeros(self.actual_n_estimators)
        self.lam_sw = np.zeros(self.actual_n_estimators)
        self.epsilon = np.zeros(self.actual_n_estimators)

    def reset(self):
        self.__configure()

    def partial_fit(self, X, y, classes=None, sample_weight=None):
        """ Partially fits the model, based on the X and y matrix.

        Since it's an ensemble learner, if X and y matrices of more than one
        sample are passed, the algorithm will partial fit the model one sample
        at a time.

        Each sample is trained by each classifier a total of K times, where K
        is drawn from a Poisson(lambda) distribution. lambda is updated after
        every example, using :math:`lambda_{sc}` if the estimator correctly
        classifies the example, or :math:`lambda_{sw}` otherwise.

        Parameters
        ----------
        X : numpy.ndarray of shape (n_samples, n_features)
            The features to train the model.

        y: numpy.ndarray of shape (n_samples)
            An array-like with the class labels of all samples in X.

        classes: numpy.ndarray, optional (default=None)
            Array with all possible/known class labels. This is an optional
            parameter, except for the first partial_fit call where it is
            compulsory.

        sample_weight: Array-like
            Instance weight. If not provided, uniform weights are assumed.
            Usage varies depending on the base estimator.

        Raises
        ------
        ValueError: A ValueError is raised if the 'classes' parameter is not
            passed in the first partial_fit call, or if it is passed in
            further calls but differs from the initial classes list passed.

        Returns
        -------
        self
        """
        if self.ensemble is None:
            self.__configure()

        if self.classes is None:
            if classes is None:
                raise ValueError("The first partial_fit call should pass all the classes.")
            else:
                self.classes = classes

        if self.classes is not None and classes is not None:
            if set(self.classes) != set(classes):
                raise ValueError("The classes passed to the partial_fit function differ "
                                 "from those passed earlier.")

        self.__adjust_ensemble_size()

        r, _ = get_dimensions(X)
        for j in range(r):
            change_detected = False
            lam = 1
            for i in range(self.actual_n_estimators):
                k = self._random_state.poisson(lam)
                if k > 0:
                    for b in range(k):
                        self.ensemble[i].partial_fit([X[j]], [y[j]], classes, sample_weight)

                if self.ensemble[i].predict([X[j]])[0] == y[j]:
                    self.lam_sc[i] += lam
                    self.epsilon[i] = self.lam_sw[i] / (self.lam_sw[i] + self.lam_sc[i])
                    if self.epsilon[i] != 0:
                        lam = lam / (2 * (1 - self.epsilon[i]))
                else:
                    self.lam_sw[i] += lam
                    self.epsilon[i] = self.lam_sw[i] / (self.lam_sw[i] + self.lam_sc[i])
                    if self.epsilon[i] != 0:
                        lam = lam / (2 * self.epsilon[i])
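
                # Track each member's accuracy with its own ADWIN detector; a
                # change is only flagged when the detector signals a cut and
                # its error estimate got worse, as checked below.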
                if self.drift_detection:
                    try:
                        pred = self.ensemble[i].predict(X)
                        error_estimation = self.adwin_ensemble[i].estimation
                        for k in range(r):
                            if pred[k] is not None:
                                self.adwin_ensemble[i].add_element(int(pred[k] == y[k]))
                        if self.adwin_ensemble[i].detected_change():
                            if self.adwin_ensemble[i].estimation > error_estimation:
                                change_detected = True
                    except ValueError:
                        change_detected = False

            if change_detected and self.drift_detection:
                max_threshold = 0.0
                i_max = -1
                for i in range(self.actual_n_estimators):
                    if max_threshold < self.adwin_ensemble[i].estimation:
                        max_threshold = self.adwin_ensemble[i].estimation
                        i_max = i
                if i_max != -1:
                    # Reset the member with the highest estimated error
                    self.ensemble[i_max].reset()
                    self.adwin_ensemble[i_max] = ADWIN()

        return self

    def __adjust_ensemble_size(self):
        if len(self.classes) != len(self.ensemble):
            if len(self.classes) > len(self.ensemble):
                for i in range(len(self.ensemble), len(self.classes)):
                    self.ensemble.append(cp.deepcopy(self.base_estimator))
                    self.actual_n_estimators += 1
                    self.adwin_ensemble.append(ADWIN())
                self.lam_sw = np.zeros(self.actual_n_estimators)
                self.lam_sc = np.zeros(self.actual_n_estimators)
                self.epsilon = np.zeros(self.actual_n_estimators)

    def predict(self, X):
        """ predict

        The predict function aggregates the weighted votes from all its
        learners (via predict_proba) to find the most likely prediction for
        each sample in the matrix X.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            A matrix of the samples we want to predict.

        Returns
        -------
        numpy.ndarray
            A numpy.ndarray with the label prediction for all the samples in X.
        """
        r, c = get_dimensions(X)
        proba = self.predict_proba(X)
        predictions = []
        if proba is None:
            return None
        for i in range(r):
            predictions.append(np.argmax(proba[i]))
        return np.asarray(predictions)

    def predict_proba(self, X):
        """ predict_proba

        Predicts the probability of each sample belonging to each one of the
        known classes.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            A matrix of the samples we want to predict.

        Raises
        ------
        ValueError: A ValueError is raised if the number of classes in the
            base_estimator learner differs from that of the ensemble learner.

        Returns
        -------
        numpy.ndarray
            An array of shape (n_samples, n_classes), in which each outer
            entry is associated with the X entry of the same index. The list
            at index [i] contains len(self.target_values) elements, each of
            which represents the probability that the i-th sample of X belongs
            to a certain class label.
        """
        proba = []
        r, c = get_dimensions(X)
        if self.ensemble is None:
            return np.zeros((r, 1))
        # Context manager to catch errors raised by numpy as RuntimeWarning
        with warnings.catch_warnings():
            warnings.filterwarnings('error')
            for i in range(self.actual_n_estimators):
                try:
                    partial_proba = self.ensemble[i].predict_proba(X)
                    if len(partial_proba[0]) > max(self.classes) + 1:
                        raise ValueError("The number of classes in the base learner is larger "
                                         "than in the ensemble.")

                    if len(proba) < 1:
                        for n in range(r):
                            proba.append([0.0 for _ in partial_proba[n]])

                    for n in range(r):
                        for k in range(len(partial_proba[n])):
                            try:
                                proba[n][k] += np.log((1 - self.epsilon[i]) /
                                                      self.epsilon[i]) * partial_proba[n][k]
                            except IndexError:
                                proba[n].append(partial_proba[n][k])
                except RuntimeWarning:
                    # Skip this member: numpy raised the division by zero in
                    # the log-odds weight as a RuntimeWarning (epsilon is 0)
                    continue
                except ValueError:
                    return np.zeros((r, 1))
                except TypeError:
                    return np.zeros((r, 1))

        # normalizing probabilities
        sum_proba = []
        for k in range(r):
            sum_proba.append(np.sum(proba[k]))
        aux = []
        for i in range(len(proba)):
            if sum_proba[i] > 0.:
                aux.append([x / sum_proba[i] for x in proba[i]])
            else:
                aux.append(proba[i])
        return np.asarray(aux)
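

if __name__ == "__main__":  # pragma: no cover
    # Minimal usage sketch, not part of the original module. It assumes
    # scikit-multiflow is installed and mirrors the docstring example, adding
    # a look at the normalized ensemble votes returned by predict_proba.
    from skmultiflow.data import SEAGenerator

    stream = SEAGenerator(random_state=1)
    clf = OnlineBoostingClassifier(n_estimators=5, random_state=1)

    n_samples, correct_cnt = 0, 0
    while n_samples < 200 and stream.has_more_samples():
        X, y = stream.next_sample()
        if clf.predict(X)[0] == y[0]:
            correct_cnt += 1
        clf.partial_fit(X, y, classes=stream.target_values)
        n_samples += 1

    X, y = stream.next_sample()
    print('{} samples analyzed.'.format(n_samples))
    print('Accuracy: {:.3f}'.format(correct_cnt / n_samples))
    print('Votes for the next sample: {}'.format(clf.predict_proba(X)))
    print('Predicted label: {} (true: {})'.format(clf.predict(X)[0], y[0]))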