/
kneighbors_regressor.pyx
238 lines (193 loc) · 8.86 KB
/
kneighbors_regressor.pyx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
#
# Copyright (c) 2019-2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# distutils: language = c++
from cuml.neighbors.nearest_neighbors import NearestNeighbors
import cuml.internals
from cuml.internals.array import CumlArray
from cuml.common import input_to_cuml_array
from cuml.common.array_descriptor import CumlArrayDescriptor
from cuml.internals.mixins import RegressorMixin
from cuml.common.doc_utils import generate_docstring
from cuml.internals.mixins import FMajorInputTagMixin
from cuml.internals.safe_imports import cpu_only_import
np = cpu_only_import('numpy')
from cython.operator cimport dereference as deref
from libcpp.vector cimport vector
from pylibraft.common.handle cimport handle_t
from cuml.internals.safe_imports import gpu_only_import
rmm = gpu_only_import('rmm')
from libc.stdint cimport uintptr_t, int64_t
from cuml.internals.safe_imports import gpu_only_import_from
cuda = gpu_only_import_from('numba', 'cuda')
import rmm
cimport cuml.common.cuda
# Cython declaration of the C++ routine ML::knn_regress from
# "cuml/neighbors/knn.hpp". It is called from KNeighborsRegressor.predict,
# which hands it raw device pointers taken from CumlArray objects.
#
# Arguments, as supplied by the call site in predict():
#   handle      - the handle_t obtained from self.handle.getHandle()
#   out         - pointer to the float32 result array that is returned
#   knn_indices - pointer to the int64 neighbor indices from kneighbors()
#   y           - one float pointer per target column of the fitted `y`
#   n_rows      - receives self.n_samples_fit_
#   n_samples   - receives the number of rows in the query indices
#   k           - receives self.n_neighbors
#
# NOTE(review): the names `n_rows` / `n_samples` read the opposite way round
# from what predict() passes (fitted-sample count first, query-row count
# second). Confirm against the C++ header before renaming anything.
#
# `except +` makes Cython translate a C++ exception thrown by the call into
# a Python exception instead of terminating the process.
cdef extern from "cuml/neighbors/knn.hpp" namespace "ML":
void knn_regress(
handle_t &handle,
float *out,
int64_t *knn_indices,
vector[float *] &y,
size_t n_rows,
size_t n_samples,
int k,
) except +
class KNeighborsRegressor(RegressorMixin,
                          FMajorInputTagMixin,
                          NearestNeighbors):
    """
    K-Nearest Neighbors Regressor is an instance-based learning technique,
    that keeps training samples around for prediction, rather than trying
    to learn a generalizable set of model parameters.

    The K-Nearest Neighbors Regressor will compute the average of the
    labels for the k closest neighbors and use it as the label.

    Parameters
    ----------
    n_neighbors : int (default=5)
        Default number of neighbors to query
    algorithm : string (default='auto')
        The query algorithm to use. Valid options are:

        - ``'auto'``: to automatically select brute-force or
          random ball cover based on data shape and metric
        - ``'rbc'``: for the random ball algorithm, which partitions
          the data space and uses the triangle inequality to lower the
          number of potential distances. Currently, this algorithm
          supports 2d Euclidean and Haversine.
        - ``'brute'``: for brute-force, slow but produces exact results
        - ``'ivfflat'``: for inverted file, divide the dataset in partitions
          and perform search on relevant partitions only
        - ``'ivfpq'``: for inverted file and product quantization,
          same as inverted list, in addition the vectors are broken
          in n_features/M sub-vectors that will be encoded thanks
          to intermediary k-means clusterings. This encoding provides
          partial information allowing faster distances calculations
    metric : string (default='euclidean').
        Distance metric to use.
    weights : string (default='uniform')
        Sample weights to use. Currently, only the uniform strategy is
        supported.
    handle : cuml.Handle
        Specifies the cuml.handle that holds internal CUDA state for
        computations in this model. Most importantly, this specifies the CUDA
        stream that will be used for the model's computations, so users can
        run different models concurrently in different streams by creating
        handles in several streams.
        If it is None, a new one is created.
    verbose : int or boolean, default=False
        Sets logging level. It must be one of `cuml.common.logger.level_*`.
        See :ref:`verbosity-levels` for more info.
    output_type : {'input', 'array', 'dataframe', 'series', 'df_obj', \
        'numba', 'cupy', 'numpy', 'cudf', 'pandas'}, default=None
        Return results and set estimator attributes to the indicated output
        type. If None, the output type set at the module level
        (`cuml.global_settings.output_type`) will be used. See
        :ref:`output-data-type-configuration` for more info.

    Examples
    --------

    .. code-block:: python

        >>> from cuml.neighbors import KNeighborsRegressor
        >>> from cuml.datasets import make_regression
        >>> from cuml.model_selection import train_test_split

        >>> X, y = make_regression(n_samples=100, n_features=10,
        ...                        random_state=5)
        >>> X_train, X_test, y_train, y_test = train_test_split(
        ...   X, y, train_size=0.80, random_state=5)

        >>> knn = KNeighborsRegressor(n_neighbors=10)
        >>> knn.fit(X_train, y_train)
        KNeighborsRegressor()
        >>> knn.predict(X_test) # doctest: +SKIP
        array([ 14.770798 , 51.8834 , 66.15657 , 46.978275 ,
            21.589611 , -14.519918 , -60.25534 , -20.856869 ,
            29.869623 , -34.83317 , 0.45447388, 120.39675 ,
            109.94834 , 63.57794 , -17.956171 , 78.77663 ,
            30.412262 , 32.575233 , 74.72834 , 122.276855 ],
            dtype=float32)

    Notes
    -----

    For additional docs, see `scikit-learn's KNeighborsRegressor
    <https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsRegressor.html>`_.
    """

    # Training targets, stored by fit() and read back by predict().
    y = CumlArrayDescriptor()

    def __init__(self, *, weights="uniform", handle=None, verbose=False,
                 output_type=None, **kwargs):
        """
        Forward the common estimator arguments (and any extra keyword
        arguments) to the parent initializer and record the weighting
        strategy.

        Raises
        ------
        ValueError
            If `weights` is anything other than ``"uniform"``.
        """
        super().__init__(
            handle=handle,
            verbose=verbose,
            output_type=output_type,
            **kwargs)
        self.y = None
        self.weights = weights

        if weights != "uniform":
            raise ValueError("Only uniform weighting strategy "
                             "is supported currently.")

    @generate_docstring(convert_dtype_cast='np.float32')
    def fit(self, X, y, convert_dtype=True) -> "KNeighborsRegressor":
        """
        Fit a GPU index for k-nearest neighbors regression model.

        """
        # Record the dtype of the targets; predict() reads it back through
        # _get_target_dtype() when convert_dtype is enabled.
        self._set_target_dtype(y)

        # Build the neighbor index over X using the parent implementation.
        super(KNeighborsRegressor, self).fit(X, convert_dtype=convert_dtype)

        # Keep the targets as a float32, column-major array; predict() takes
        # one pointer per target column from it.
        self.y, _, _, _ = \
            input_to_cuml_array(y, order='F', check_dtype=np.float32,
                                convert_to_dtype=(np.float32
                                                  if convert_dtype
                                                  else None))
        return self

    # NOTE(review): the advertised shape below says `n_features`, but the
    # array allocated in predict() is (n_rows,) for a single target column
    # and (n_rows, n_target_columns) otherwise. Left unchanged here; consider
    # correcting the generated documentation.
    @generate_docstring(convert_dtype_cast='np.float32',
                        return_values={'name': 'X_new',
                                       'type': 'dense',
                                       'description': 'Predicted values',
                                       'shape': '(n_samples, n_features)'})
    def predict(self, X, convert_dtype=True) -> CumlArray:
        """
        Use the trained k-nearest neighbors regression model to
        predict the labels for X

        """
        if convert_dtype:
            cuml.internals.set_api_output_dtype(self._get_target_dtype())

        # Only the neighbor indices are needed to look up the stored targets.
        knn_indices = self.kneighbors(X, return_distance=False,
                                      convert_dtype=convert_dtype)

        inds, n_rows, _n_cols, _dtype = \
            input_to_cuml_array(knn_indices, order='C', check_dtype=np.int64,
                                convert_to_dtype=(np.int64
                                                  if convert_dtype
                                                  else None))
        cdef uintptr_t inds_ctype = inds.ptr

        # One output column per target column; a 1-D target yields a 1-D
        # result.
        res_cols = 1 if len(self.y.shape) == 1 else self.y.shape[1]
        res_shape = n_rows if res_cols == 1 else (n_rows, res_cols)
        results = CumlArray.zeros(res_shape, dtype=np.float32,
                                  order="C",
                                  index=inds.index)

        cdef uintptr_t results_ptr = results.ptr
        cdef uintptr_t y_ptr

        # Stack-allocated so the vector is released automatically when the
        # function returns. The previous `new vector[float*]()` was never
        # deleted and leaked on every call.
        cdef vector[float*] y_vec

        # Collect one raw pointer per target column.
        # NOTE(review): assumes self.y[:, col_num] is a view into self.y's
        # buffer, so the pointer stays valid after `col` is rebound -- confirm
        # against CumlArray slicing semantics.
        for col_num in range(res_cols):
            col = self.y if res_cols == 1 else self.y[:, col_num]
            y_ptr = col.ptr
            y_vec.push_back(<float*>y_ptr)

        cdef handle_t* handle_ = <handle_t*><size_t>self.handle.getHandle()

        knn_regress(
            handle_[0],
            <float*>results_ptr,
            <int64_t*>inds_ctype,
            y_vec,
            <size_t>self.n_samples_fit_,
            <size_t>n_rows,
            <int>self.n_neighbors
        )

        # Wait on the handle before handing `results` back to the caller.
        self.handle.sync()
        return results

    def get_param_names(self):
        """Return the parent's parameter names plus ``"weights"``."""
        return super().get_param_names() + ["weights"]