-
Notifications
You must be signed in to change notification settings - Fork 182
/
knn_regressor.py
174 lines (142 loc) · 5.83 KB
/
knn_regressor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
import numpy as np
from skmultiflow.core import RegressorMixin
from skmultiflow.lazy.base_neighbors import BaseNeighbors
from skmultiflow.utils import get_dimensions
class KNNRegressor(BaseNeighbors, RegressorMixin):
    """k-Nearest Neighbors regressor.

    This non-parametric regression method keeps track of the last
    ``max_window_size`` training samples. Predictions are obtained by
    aggregating the values of the closest n_neighbors stored-samples with
    respect to a query sample.

    Parameters
    ----------
    n_neighbors: int (default=5)
        The number of nearest neighbors to search for.
    max_window_size: int (default=1000)
        The maximum size of the window storing the last observed samples.
    leaf_size: int (default=30)
        sklearn.KDTree parameter. The maximum number of samples that can
        be stored in one leaf node, which determines from which point the
        algorithm will switch for a brute-force approach. The bigger this
        number the faster the tree construction time, but the slower the
        query time will be.
    metric: string or sklearn.DistanceMetric object
        sklearn.KDTree parameter. The distance metric to use for the KDTree.
        Default='euclidean'. KNNRegressor.valid_metrics() gives a list of
        the metrics which are valid for KDTree.
    aggregation_method: str (default='mean')
        | The method to aggregate the target values of neighbors.
        | 'mean'
        | 'median'

    Notes
    -----
    This estimator is not optimal for a mixture of categorical and numerical
    features. This implementation treats all features from a given stream as
    numerical.

    Examples
    --------
    >>> # Imports
    >>> from skmultiflow.data import RegressionGenerator
    >>> from skmultiflow.lazy import KNNRegressor
    >>> import numpy as np
    >>>
    >>> # Setup the data stream
    >>> stream = RegressionGenerator(random_state=1)
    >>> # Setup the estimator
    >>> knn = KNNRegressor()
    >>>
    >>> # Auxiliary variables to control loop and track performance
    >>> n_samples = 0
    >>> correct_cnt = 0
    >>> max_samples = 2000
    >>> y_pred = np.zeros(max_samples)
    >>> y_true = np.zeros(max_samples)
    >>>
    >>> # Run test-then-train loop for max_samples or while there is data in the stream
    >>> while n_samples < max_samples and stream.has_more_samples():
    ...     X, y = stream.next_sample()
    ...     y_true[n_samples] = y[0]
    ...     y_pred[n_samples] = knn.predict(X)[0]
    ...     knn.partial_fit(X, y)
    ...     n_samples += 1
    >>>
    >>> # Display results
    >>> print('{} samples analyzed.'.format(n_samples))
    2000 samples analyzed.
    >>> print('KNN regressor mean absolute error: {}'.format(np.mean(np.abs(y_true - y_pred))))
    KNN regressor mean absolute error: 144.5672450178514
    """

    # Valid aggregation strategies for combining neighbor target values.
    _MEAN = 'mean'
    _MEDIAN = 'median'

    def __init__(self,
                 n_neighbors=5,
                 max_window_size=1000,
                 leaf_size=30,
                 metric='euclidean',
                 aggregation_method='mean'):
        super().__init__(n_neighbors=n_neighbors,
                         max_window_size=max_window_size,
                         leaf_size=leaf_size,
                         metric=metric)
        # Validate eagerly so a misconfiguration fails at construction time,
        # not on the first call to predict().
        if aggregation_method not in {self._MEAN, self._MEDIAN}:
            raise ValueError("Invalid aggregation_method: {}.\n"
                             "Valid options are: {}".format(aggregation_method,
                                                            {self._MEAN, self._MEDIAN}))
        self.aggregation_method = aggregation_method

    def partial_fit(self, X, y, sample_weight=None):
        """ Partially (incrementally) fit the model.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            The data upon which the algorithm will create its model.
        y: numpy.ndarray of shape (n_samples)
            An array-like containing the target values for all
            samples in X.
        sample_weight: Not used.

        Returns
        -------
        KNNRegressor
            self

        Notes
        -----
        For the K-Nearest Neighbors regressor, fitting the model is the
        equivalent of inserting the newer samples in the observed window,
        and if the size_limit is reached, removing older results.
        """
        r, c = get_dimensions(X)
        for i in range(r):
            self.data_window.add_sample(X=X[i], y=y[i])
        return self

    def predict(self, X):
        """ Predict the target value for sample X

        Search the KDTree for the n_neighbors nearest neighbors.

        Parameters
        ----------
        X: Numpy.ndarray of shape (n_samples, n_features)
            All the samples we want to predict the target value for.

        Returns
        -------
        np.ndarray
            An array containing the predicted target values for each \
            sample in X.
        """
        r, c = get_dimensions(X)
        predictions = np.zeros(r)
        for i in range(r):
            # BUGFIX: predict one row at a time. Passing the full batch here
            # pooled the neighbors of every row into a single aggregate, so
            # all rows received the same prediction (and, with an under-filled
            # window and r > 1, _predict returned an array that cannot be
            # assigned to a scalar slot).
            predictions[i] = self._predict(np.asarray([X[i]]))
        return predictions

    def _predict(self, X):
        """ Aggregate neighbor targets for a single-row query X.

        Returns 0.0 (the default prediction) while the window holds fewer
        than n_neighbors samples.
        """
        r, c = get_dimensions(X)
        y_pred = np.zeros(r)
        if self.data_window is None or self.data_window.size < self.n_neighbors:
            # Not enough information available, return default predictions (0.0)
            return y_pred
        _, neighbors_idx = self._get_neighbors(X)
        neighbors_val = self.data_window.targets_buffer[neighbors_idx]
        if self.aggregation_method == self._MEAN:
            y_pred = np.mean(neighbors_val)
        else:   # self.aggregation_method == self._MEDIAN
            y_pred = np.median(neighbors_val)
        return y_pred

    def predict_proba(self, X):
        """ Not implemented: probability estimates are undefined for regression. """
        raise NotImplementedError('predict_proba is not implemented for this method.')