-
Notifications
You must be signed in to change notification settings - Fork 182
/
sea_generator.py
377 lines (301 loc) · 12.9 KB
/
sea_generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
import numpy as np
from skmultiflow.data.base_stream import Stream
from skmultiflow.utils import check_random_state
class SEAGenerator(Stream):
    r""" SEA stream generator.

    This generator is an implementation of the data stream with abrupt
    concept drift, first described in Street and Kim's 'A streaming
    ensemble algorithm (SEA) for large-scale classification' [1]_.

    It generates 3 numerical attributes, that vary from 0 to 10, where
    only 2 of them are relevant to the classification task. A classification
    function is chosen, among four possible ones. These functions compare
    the sum of the two relevant attributes with a threshold value, unique
    for each of the classification functions. Depending on the comparison
    the generator will classify an instance as one of the two possible
    labels.

    The functions are:

    * Function 0: 0 if :math:`(att1 + att2 \leq 8)` else 1
    * Function 1: 0 if :math:`(att1 + att2 \leq 9)` else 1
    * Function 2: 0 if :math:`(att1 + att2 \leq 7)` else 1
    * Function 3: 0 if :math:`(att1 + att2 \leq 9.5)` else 1

    Concept drift can be introduced by changing the classification function.
    This can be done manually or using ``ConceptDriftStream``.

    This data stream has two additional parameters, the first is to balance classes, which
    means the class distribution will tend to a uniform one, and the possibility
    to add noise, which will, according to some probability, change the chosen
    label for an instance.

    Parameters
    ----------
    classification_function: int (Default: 0)
        Which of the four classification functions to use for the generation.
        This value can vary from 0 to 3, and the thresholds are, 8, 9, 7 and 9.5.

    random_state: int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    balance_classes: bool (Default: False)
        Whether to balance classes or not. If balanced, the class
        distribution will converge to a uniform distribution.

    noise_percentage: float (Default: 0.0)
        The probability that noise will happen in the generation. At each
        new sample generated, a random probability is generated, and if that
        probability is higher than the noise_percentage, the chosen label will
        be switched. From 0.0 to 1.0.

    References
    ----------
    .. [1] W. Nick Street and YongSeog Kim. 2001. A streaming ensemble algorithm (SEA)
       for large-scale classification. In Proceedings of the seventh ACM SIGKDD international
       conference on Knowledge discovery and data mining (KDD '01). ACM, New York, NY, USA,
       377-382. DOI=http://dx.doi.org/10.1145/502512.502568

    Examples
    --------
    >>> # Imports
    >>> from skmultiflow.data.sea_generator import SEAGenerator
    >>> # Setting up the stream
    >>> stream = SEAGenerator(classification_function = 2, random_state = 112,
    ...  balance_classes = False, noise_percentage = 0.28)
    >>> # Retrieving one sample
    >>> stream.next_sample()
    (array([[ 3.75057129,  6.4030462 ,  9.50016579]]), array([ 0.]))
    >>> # Retrieving 10 samples
    >>> stream.next_sample(10)
    (array([[ 7.76929659,  8.32745763,  0.5480574 ],
       [ 8.85351458,  7.22346511,  0.02556032],
       [ 3.43419851,  0.94759888,  3.94642589],
       [ 7.3670683 ,  9.55806869,  8.20609371],
       [ 3.78544458,  7.84763615,  0.86231513],
       [ 1.6222602 ,  2.90069726,  0.45008172],
       [ 7.36533216,  8.39211485,  7.09361615],
       [ 9.8566856 ,  3.88003308,  5.03154482],
       [ 6.8373245 ,  7.21957381,  2.14152091],
       [ 0.75216155,  6.10890702,  4.25630425]]),
       array([ 1.,  1.,  1.,  1.,  1.,  0.,  0.,  1.,  1.,  1.]))
    >>> # Generators will have infinite remaining instances, so it returns -1
    >>> stream.n_remaining_samples()
    -1
    >>> stream.has_more_samples()
    True

    """

    def __init__(self, classification_function=0, random_state=None, balance_classes=False,
                 noise_percentage=0.0):
        super().__init__()

        # Classification functions to use, indexed by ``classification_function``.
        self._classification_functions = [self._classification_function_zero,
                                          self._classification_function_one,
                                          self._classification_function_two,
                                          self._classification_function_three]
        self.classification_function = classification_function
        self.random_state = random_state
        self.balance_classes = balance_classes
        self.noise_percentage = noise_percentage
        self.n_num_features = 3
        self.n_features = self.n_num_features
        self.n_classes = 2
        self.n_targets = 1
        self._random_state = None  # This is the actual random_state object used internally
        self.next_class_should_be_zero = False
        self.name = "SEA Generator"

        self.target_names = ["target_0"]
        self.feature_names = ["att_num_" + str(i) for i in range(self.n_features)]
        self.target_values = list(range(self.n_classes))

        self._prepare_for_use()

    @property
    def classification_function(self):
        """ Retrieve the index of the current classification function.

        Returns
        -------
        int
            index of the classification function [0,1,2,3]
        """
        return self._classification_function_idx

    @classification_function.setter
    def classification_function(self, classification_function_idx):
        """ Set the index of the current classification function.

        Parameters
        ----------
        classification_function_idx: int (0,1,2,3)

        Raises
        ------
        ValueError
            If the index is not in {0, 1, 2, 3}.
        """
        if classification_function_idx in range(4):
            self._classification_function_idx = classification_function_idx
        else:
            raise ValueError("classification_function takes only these values: 0, 1, 2, 3, {} was "
                             "passed".format(classification_function_idx))

    @property
    def balance_classes(self):
        """ Retrieve the value of the option: Balance classes.

        Returns
        -------
        Boolean
            True if the classes are balanced
        """
        return self._balance_classes

    @balance_classes.setter
    def balance_classes(self, balance_classes):
        """ Set the value of the option: Balance classes.

        Parameters
        ----------
        balance_classes: Boolean

        Raises
        ------
        ValueError
            If ``balance_classes`` is not a boolean.
        """
        if isinstance(balance_classes, bool):
            self._balance_classes = balance_classes
        else:
            raise ValueError(
                "balance_classes should be boolean, {} was passed".format(balance_classes))

    @property
    def noise_percentage(self):
        """ Retrieve the value of the noise percentage.

        Returns
        -------
        float
            percentage of the noise
        """
        return self._noise_percentage

    @noise_percentage.setter
    def noise_percentage(self, noise_percentage):
        """ Set the value of the noise percentage.

        Parameters
        ----------
        noise_percentage: float (0.0..1.0)

        Raises
        ------
        ValueError
            If ``noise_percentage`` is outside [0.0, 1.0].
        """
        if 0.0 <= noise_percentage <= 1.0:
            self._noise_percentage = noise_percentage
        else:
            raise ValueError(
                "noise percentage should be in [0.0..1.0], {} was passed".format(noise_percentage))

    def _prepare_for_use(self):
        # Materialize the RNG from the user-supplied seed/instance and reset
        # the class-balancing toggle so the stream restarts deterministically.
        self._random_state = check_random_state(self.random_state)
        self.next_class_should_be_zero = False

    def next_sample(self, batch_size=1):
        """ Returns next sample from the stream.

        The sample generation works as follows: The three attributes are
        generated with the random generator, initialized with the seed passed
        by the user. Then, the classification function decides, as a function
        of the two relevant attributes, whether to classify the instance as
        class 0 or class 1. The next step is to verify if the classes should
        be balanced, and if so, balance the classes. The last step is to add
        noise, if the noise percentage is higher than 0.0.

        The generated sample will have 3 features, where only the two first
        are relevant, and 1 label (it has one classification task).

        Parameters
        ----------
        batch_size: int (optional, default=1)
            The number of samples to return.

        Returns
        -------
        tuple or tuple list
            Return a tuple with the features matrix and the labels matrix for
            the batch_size samples that were requested.

        """
        data = np.zeros([batch_size, self.n_features + 1])

        for j in range(batch_size):
            self.sample_idx += 1
            att1 = att2 = att3 = 0.0
            group = 0
            desired_class_found = False
            while not desired_class_found:
                att1 = 10 * self._random_state.rand()
                att2 = 10 * self._random_state.rand()
                att3 = 10 * self._random_state.rand()
                group = self._classification_functions[self.classification_function](att1, att2,
                                                                                     att3)
                if not self.balance_classes:
                    desired_class_found = True
                else:
                    # Alternate labels 0/1 by rejection sampling: only accept a
                    # sample whose label matches the expected parity.
                    if (self.next_class_should_be_zero and (group == 0)) or \
                            ((not self.next_class_should_be_zero) and (group == 1)):
                        desired_class_found = True
                        self.next_class_should_be_zero = not self.next_class_should_be_zero

            # NOTE: the 0.01 offset makes the effective flip probability
            # max(0, noise_percentage - 0.01), so noise_percentage values below
            # 0.01 never flip a label. Kept as-is so streams generated with a
            # fixed random_state remain reproducible.
            if 0.01 + self._random_state.rand() <= self.noise_percentage:
                group = 1 if (group == 0) else 0

            data[j, 0] = att1
            data[j, 1] = att2
            data[j, 2] = att3
            data[j, 3] = group

        self.current_sample_x = data[:, :self.n_features]
        self.current_sample_y = data[:, self.n_features:].flatten().astype(np.int64)
        return self.current_sample_x, self.current_sample_y

    def generate_drift(self):
        """
        Generate drift by switching the classification function randomly.

        """
        # Draw until we get a function index different from the current one.
        new_function = self._random_state.randint(4)
        while new_function == self.classification_function:
            new_function = self._random_state.randint(4)
        self.classification_function = new_function

    @staticmethod
    def _classification_function_zero(att1, att2, att3):
        """ classification_function_zero

        Decides the sample class label as 0 if the sum of att1 and att2 is
        at most the threshold value of 8, and 1 otherwise.

        Parameters
        ----------
        att1: float
            First numeric attribute.

        att2: float
            Second numeric attribute.

        att3: float
            Third numeric attribute (ignored by this function).

        Returns
        -------
        int
            Returns the sample class label, either 0 or 1.

        """
        return 0 if (att1 + att2 <= 8) else 1

    @staticmethod
    def _classification_function_one(att1, att2, att3):
        """ classification_function_one

        Decides the sample class label as 0 if the sum of att1 and att2 is
        at most the threshold value of 9, and 1 otherwise.

        Parameters
        ----------
        att1: float
            First numeric attribute.

        att2: float
            Second numeric attribute.

        att3: float
            Third numeric attribute (ignored by this function).

        Returns
        -------
        int
            Returns the sample class label, either 0 or 1.

        """
        return 0 if (att1 + att2 <= 9) else 1

    @staticmethod
    def _classification_function_two(att1, att2, att3):
        """ classification_function_two

        Decides the sample class label as 0 if the sum of att1 and att2 is
        at most the threshold value of 7, and 1 otherwise.

        Parameters
        ----------
        att1: float
            First numeric attribute.

        att2: float
            Second numeric attribute.

        att3: float
            Third numeric attribute (ignored by this function).

        Returns
        -------
        int
            Returns the sample class label, either 0 or 1.

        """
        return 0 if (att1 + att2 <= 7) else 1

    @staticmethod
    def _classification_function_three(att1, att2, att3):
        """ classification_function_three

        Decides the sample class label as 0 if the sum of att1 and att2 is
        at most the threshold value of 9.5, and 1 otherwise.

        Parameters
        ----------
        att1: float
            First numeric attribute.

        att2: float
            Second numeric attribute.

        att3: float
            Third numeric attribute (ignored by this function).

        Returns
        -------
        int
            Returns the sample class label, either 0 or 1.

        """
        return 0 if (att1 + att2 <= 9.5) else 1