import copy as cp
import warnings
import numpy as np
from skmultiflow.core import BaseSKMObject, ClassifierMixin, MetaEstimatorMixin
from skmultiflow.lazy import KNNADWINClassifier
from skmultiflow.utils import check_random_state, get_dimensions
def OzaBagging(base_estimator=KNNADWINClassifier(), n_estimators=10,
random_state=None): # pragma: no cover
warnings.warn("'OzaBagging' has been renamed to 'OzaBaggingClassifier' in v0.5.0.\n"
"The old name will be removed in v0.7.0", category=FutureWarning)
return OzaBaggingClassifier(base_estimator=base_estimator,
n_estimators=n_estimators,
random_state=random_state)
class OzaBaggingClassifier(BaseSKMObject, ClassifierMixin, MetaEstimatorMixin):
""" Oza Bagging ensemble classifier.
Parameters
----------
base_estimator: skmultiflow.core.BaseSKMObject or sklearn.BaseEstimator
(default=KNNADWINClassifier) Each member of the ensemble is an instance of
the base estimator.
n_estimators: int (default=10)
The size of the ensemble, in other words, how many classifiers to train.
random_state: int, RandomState instance or None, optional (default=None)
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used by `np.random`.
Raises
------
ValueError: A ValueError is raised if the 'classes' parameter is
not passed in the first partial_fit call.
Notes
-----
Oza Bagging [1]_ is an ensemble learning method introduced by Oza and
Russell in 'Online Bagging and Boosting'. It is an adaptation of the
well-known batch Bagging ensemble method that can effectively handle
data streams.
In the traditional batch Bagging algorithm, M classifiers are trained on
M different datasets, each created by drawing N samples with replacement
from the N-sized training set.
In the online context there is no fixed training dataset, only a stream
of samples, so drawing samples with replacement cannot be performed
directly. The strategy adopted by the Online Bagging algorithm is to
simulate this by training each classifier on every arriving sample K
times, where K is drawn from a binomial distribution. Since the data
stream can be considered infinite, and with infinite samples the binomial
distribution tends to a Poisson(1) distribution, Oza and Russell use
Poisson(1) as a good approximation of 'drawing with replacement'.
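As a rough sketch of that sampling step (illustrative only; ``rng`` and ``k``
below are stand-in names, not attributes of this class), each ensemble member
replays an incoming sample ``k`` times, with ``k`` drawn from Poisson(1):
>>> import numpy as np
>>> rng = np.random.RandomState(42)   # stand-in for the internal random state
>>> for member in range(3):           # one independent draw per ensemble member
...     k = rng.poisson(lam=1.0)      # number of times this member trains on the sample
...     # the member would then call partial_fit on the same sample k times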
References
----------
.. [1] N. C. Oza, “Online Bagging and Boosting,” in 2005 IEEE International Conference
on Systems, Man and Cybernetics, 2005, vol. 3, no. 3, pp. 2340–2345.
Examples
--------
>>> # Imports
>>> from skmultiflow.meta import OzaBaggingClassifier
>>> from skmultiflow.lazy import KNNClassifier
>>> from skmultiflow.data import SEAGenerator
>>> # Setting up the stream
>>> stream = SEAGenerator(1, noise_percentage=0.07)
>>> # Setting up the OzaBagging classifier to work with KNN as base estimator
>>> clf = OzaBaggingClassifier(base_estimator=KNNClassifier(n_neighbors=8,
... max_window_size=2000, leaf_size=30), n_estimators=2)
>>> # Keeping track of sample count and correct prediction count
>>> sample_count = 0
>>> corrects = 0
>>> # Pre training the classifier with 200 samples
>>> X, y = stream.next_sample(200)
>>> clf = clf.partial_fit(X, y, classes=stream.target_values)
>>> for i in range(2000):
... X, y = stream.next_sample()
... pred = clf.predict(X)
... clf = clf.partial_fit(X, y)
... if pred is not None:
... if y[0] == pred[0]:
... corrects += 1
... sample_count += 1
>>>
>>> # Displaying the results
>>> print(str(sample_count) + ' samples analyzed.')
2000 samples analyzed.
>>> print('OzaBaggingClassifier performance: ' + str(corrects / sample_count))
OzaBaggingClassifier performance: 0.9095
"""
def __init__(self, base_estimator=KNNADWINClassifier(), n_estimators=10, random_state=None):
super().__init__()
# default values
self.ensemble = None
self.actual_n_estimators = None
self.classes = None
self._random_state = None # This is the actual random_state object used internally
self.base_estimator = base_estimator
self.n_estimators = n_estimators
self.random_state = random_state
self.__configure()
def __configure(self):
if hasattr(self.base_estimator, "reset"):
self.base_estimator.reset()
self.actual_n_estimators = self.n_estimators
self.ensemble = [cp.deepcopy(self.base_estimator) for _ in range(self.actual_n_estimators)]
self._random_state = check_random_state(self.random_state)
def reset(self):
self.__configure()
return self
def partial_fit(self, X, y, classes=None, sample_weight=None):
"""
Partially (incrementally) fit the model.
Parameters
----------
X : numpy.ndarray of shape (n_samples, n_features)
The features to train the model.
y: numpy.ndarray of shape (n_samples)
An array-like with the class labels of all samples in X.
classes: numpy.ndarray, optional (default=None)
Array with all possible/known class labels. This is an optional parameter, except
for the first partial_fit call where it is compulsory.
sample_weight: numpy.ndarray of shape (n_samples), optional (default=None)
Sample weights. If not provided, uniform weights are assumed. Usage varies depending
on the base estimator.
Raises
------
ValueError
A ValueError is raised if the 'classes' parameter is not passed in the first
partial_fit call, or if it is passed in later calls but differs from the
classes passed initially.
Returns
-------
OzaBaggingClassifier
self
Notes
-----
Since this is an ensemble learner, if X and y contain more than one sample,
the algorithm partially fits the model one sample at a time.
Each classifier is trained on each sample a total of K times, where K
is drawn from a Poisson(1) distribution.
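A minimal sketch of a batch update (the names below, such as ``X_batch``, are
illustrative; the five samples are processed internally one at a time):
>>> import numpy as np
>>> from skmultiflow.meta import OzaBaggingClassifier
>>> from skmultiflow.lazy import KNNClassifier
>>> X_batch = np.random.rand(5, 3)            # 5 samples with 3 features each
>>> y_batch = np.random.randint(2, size=5)    # binary labels for the 5 samples
>>> clf = OzaBaggingClassifier(base_estimator=KNNClassifier(), n_estimators=3)
>>> clf = clf.partial_fit(X_batch, y_batch, classes=[0, 1])   # looped over sample by sample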
"""
if self.classes is None:
if classes is None:
raise ValueError("The first partial_fit call should pass all the classes.")
else:
self.classes = classes
if self.classes is not None and classes is not None:
if set(self.classes) == set(classes):
pass
else:
raise ValueError("The classes passed to the partial_fit function"
"differ from those passed earlier.")
self.__adjust_ensemble_size()
r, _ = get_dimensions(X)
for j in range(r):
for i in range(self.actual_n_estimators):
k = self._random_state.poisson()
if k > 0:
for b in range(k):
self.ensemble[i].partial_fit([X[j]], [y[j]], classes, sample_weight)
return self
def __adjust_ensemble_size(self):
if len(self.classes) != len(self.ensemble):
if len(self.classes) > len(self.ensemble):
for i in range(len(self.ensemble), len(self.classes)):
self.ensemble.append(cp.deepcopy(self.base_estimator))
self.actual_n_estimators += 1
def predict(self, X):
""" Predict classes for the passed data.
Parameters
----------
X : numpy.ndarray of shape (n_samples, n_features)
The set of data samples to predict the class labels for.
Returns
-------
A numpy.ndarray with all the predictions for the samples in X.
Notes
-----
The predict function aggregates the class probability estimates from all
ensemble members (via predict_proba) and returns, for each sample in X,
the class label with the highest aggregated probability.
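As a rough sketch of that aggregation (the arrays below are made up for
illustration and are not this class's internals):
>>> import numpy as np
>>> member_proba = np.array([[0.6, 0.4],    # member 1: class probabilities for one sample
...                          [0.3, 0.7],    # member 2
...                          [0.7, 0.3]])   # member 3
>>> int(np.argmax(member_proba.sum(axis=0)))   # summed votes -> predicted class label
0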
"""
r, c = get_dimensions(X)
proba = self.predict_proba(X)
predictions = []
if proba is None:
return None
for i in range(r):
predictions.append(np.argmax(proba[i]))
return np.asarray(predictions)
def predict_proba(self, X):
""" Estimates the probability of each sample in X belonging to each of the class-labels.
Parameters
----------
X : numpy.ndarray of shape (n_samples, n_features)
The matrix of samples one wants to predict the class probabilities for.
Returns
-------
A numpy.ndarray of shape (n_samples, n_labels), in which each row is
associated with the X entry of the same index and contains one entry per
known class-label, representing the probability that the i-th sample of X
belongs to that class-label.
Raises
------
ValueError: A ValueError is raised if the number of classes in the base_estimator
learner is larger than the number of classes known to the ensemble.
"""
proba = []
r, c = get_dimensions(X)
try:
for i in range(self.actual_n_estimators):
partial_proba = self.ensemble[i].predict_proba(X)
if len(partial_proba[0]) > max(self.classes) + 1:
raise ValueError("The number of classes in the base learner "
"is larger than in the ensemble.")
if len(proba) < 1:
for n in range(r):
proba.append([0.0 for _ in partial_proba[n]])
for n in range(r):
for k in range(len(partial_proba[n])):
try:
proba[n][k] += partial_proba[n][k]
except IndexError:
proba[n].append(partial_proba[n][k])
except ValueError:
return np.zeros((r, 1))
except TypeError:
return np.zeros((r, 1))
# normalizing probabilities
sum_proba = []
for k in range(r):
sum_proba.append(np.sum(proba[k]))
aux = []
for i in range(len(proba)):
if sum_proba[i] > 0.:
aux.append([x / sum_proba[i] for x in proba[i]])
else:
aux.append(proba[i])
return np.asarray(aux)