/
_ranking.py
252 lines (203 loc) · 8.05 KB
/
_ranking.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
#
# Copyright (c) 2020-2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import math
from cuml.internals.input_utils import input_to_cupy_array
from cuml.internals.array import CumlArray
import cuml.internals
from cuml.internals.safe_imports import cpu_only_import
import typing
from cuml.internals.safe_imports import gpu_only_import
cp = gpu_only_import("cupy")
np = cpu_only_import("numpy")
@cuml.internals.api_return_generic(get_output_type=True)
def precision_recall_curve(
    y_true, probs_pred
) -> typing.Tuple[CumlArray, CumlArray, CumlArray]:
    """
    Compute precision-recall pairs for different probability thresholds

    .. note:: this implementation is restricted to the binary classification
        task. The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is
        the number of true positives and ``fp`` the number of false
        positives. The precision is intuitively the ability of the
        classifier not to label as positive a sample that is negative.

        The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number
        of true positives and ``fn`` the number of false negatives. The
        recall is intuitively the ability of the classifier to find all the
        positive samples. The last precision and recall values are 1. and 0.
        respectively and do not have a corresponding threshold. This ensures
        that the graph starts on the y axis.

        Read more in the scikit-learn's `User Guide
        <https://scikit-learn.org/stable/modules/model_evaluation.html#precision-recall-f-measure-metrics>`_.

    Parameters
    ----------
    y_true : array, shape = [n_samples]
        True binary labels, {0, 1}.
    probs_pred : array, shape = [n_samples]
        Estimated probabilities or decision function.

    Returns
    -------
    precision : array, shape = [n_thresholds + 1]
        Precision values such that element i is the precision of
        predictions with score >= thresholds[i] and the last element is 1.
    recall : array, shape = [n_thresholds + 1]
        Decreasing recall values such that element i is the recall of
        predictions with score >= thresholds[i] and the last element is 0.
    thresholds : array, shape = [n_thresholds <= len(np.unique(probs_pred))]
        Increasing thresholds on the decision function used to compute
        precision and recall.

    Raises
    ------
    ValueError
        If ``y_true`` contains no positive sample (the curve is undefined).

    Examples
    --------
    .. code-block:: python

        >>> import cupy as cp
        >>> from cuml.metrics import precision_recall_curve
        >>> y_true = cp.array([0, 0, 1, 1])
        >>> y_scores = cp.array([0.1, 0.4, 0.35, 0.8])
        >>> precision, recall, thresholds = precision_recall_curve(
        ...     y_true, y_scores)
        >>> print(precision)
        [0.666... 0.5 1. 1. ]
        >>> print(recall)
        [1. 0.5 0.5 0. ]
        >>> print(thresholds)
        [0.35 0.4 0.8 ]
    """
    y_true, n_rows, n_cols, _ = input_to_cupy_array(
        y_true, check_dtype=[np.int32, np.int64, np.float32, np.float64]
    )
    y_score, _, _, _ = input_to_cupy_array(
        probs_pred,
        check_dtype=[np.int32, np.int64, np.float32, np.float64],
        check_rows=n_rows,
        check_cols=n_cols,
    )

    # Precision/recall are undefined when there is no positive sample.
    if not cp.any(y_true):
        raise ValueError(
            "precision_recall_curve cannot be used when " "y_true is all zero."
        )

    fps, tps, thresholds = _binary_clf_curve(y_true, y_score)

    # tps/fps are cumulative counts in descending-threshold order; flip so
    # they line up with the ascending `thresholds` array.
    precision = cp.flip(tps / (tps + fps), axis=0)
    recall = cp.flip(tps / tps[-1], axis=0)

    # Drop redundant leading points where recall is pinned at 1 (keep one),
    # mirroring scikit-learn's trimming of the flat start of the curve.
    # int() makes the device->host sync explicit before host-side slicing.
    n = int((recall == 1).sum())
    if n > 1:
        precision = precision[n - 1 :]
        recall = recall[n - 1 :]
        thresholds = thresholds[n - 1 :]

    # Append the threshold-less (precision=1, recall=0) end point so the
    # curve starts on the y axis.
    precision = cp.concatenate([precision, cp.ones(1)])
    recall = cp.concatenate([recall, cp.zeros(1)])

    return precision, recall, thresholds
@cuml.internals.api_return_any()
def roc_auc_score(y_true, y_score):
    """
    Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC)
    from prediction scores.

    .. note:: this implementation can only be used with binary
        classification.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True labels. The binary cases
        expect labels with shape (n_samples,)
    y_score : array-like of shape (n_samples,)
        Target scores. In the binary cases, these can be either
        probability estimates or non-thresholded decision values (as returned
        by `decision_function` on some classifiers). The binary
        case expects a shape (n_samples,), and the scores must be the scores
        of the class with the greater label.

    Returns
    -------
    auc : float

    Examples
    --------
    >>> import numpy as np
    >>> from cuml.metrics import roc_auc_score
    >>> y_true = np.array([0, 0, 1, 1])
    >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8])
    >>> print(roc_auc_score(y_true, y_scores))
    0.75
    """
    # Coerce both inputs to device arrays; the score array must match the
    # label array's shape.
    labels_d, rows, cols, _ = input_to_cupy_array(
        y_true, check_dtype=[np.int32, np.int64, np.float32, np.float64]
    )
    scores_d, _, _, _ = input_to_cupy_array(
        y_score,
        check_dtype=[np.int32, np.int64, np.float32, np.float64],
        check_rows=rows,
        check_cols=cols,
    )
    return _binary_roc_auc_score(labels_d, scores_d)
def _binary_clf_curve(y_true, y_score):
    """Per-threshold false/true positive counts for a binary problem.

    Returns ``(fps, tps, thresholds)``: ``fps`` and ``tps`` are cumulative
    counts ordered from the highest score downwards, while ``thresholds``
    is the ascending array of unique scores.
    """
    # Float labels are accepted only if every value is a whole number.
    if y_true.dtype.kind == "f" and np.any(y_true != y_true.astype(int)):
        raise ValueError("Continuous format of y_true is not supported.")

    order = cp.argsort(-y_score)  # indices sorting scores descending
    scores_desc = y_score[order]
    positives = y_true[order].astype("float32")  # 1.0 where label is positive
    negatives = 1 - positives  # 1.0 where label is negative

    # Samples sharing the same score collapse into one threshold group.
    group = _group_same_scores(scores_desc)
    n_groups = int(group[-1])

    # Per-group counts, then cumulative over descending thresholds.
    tps = cp.cumsum(
        _addup_x_in_group(group, positives, cp.zeros(n_groups, dtype="float32"))
    )
    fps = cp.cumsum(
        _addup_x_in_group(group, negatives, cp.zeros(n_groups, dtype="float32"))
    )

    thresholds = cp.unique(y_score)
    return fps, tps, thresholds
def _binary_roc_auc_score(y_true, y_score):
    """Compute binary roc_auc_score using cupy"""
    # AUC is undefined when only one class is present.
    if cp.unique(y_true).shape[0] == 1:
        raise ValueError(
            "roc_auc_score cannot be used when "
            "only one class present in y_true. ROC AUC score "
            "is not defined in that case."
        )

    # A constant score ranks all pairs equally: chance-level AUC.
    if cp.unique(y_score).shape[0] == 1:
        return 0.5

    fps, tps, _ = _binary_clf_curve(y_true, y_score)

    # Normalize cumulative counts into rates in [0, 1].
    tpr = tps / tps[-1]
    fpr = fps / fps[-1]

    return _calculate_area_under_curve(fpr, tpr).item()
def _addup_x_in_group(group, x, result):
    """Scatter-add the entries of ``x`` into ``result`` buckets by group id.

    Parameters
    ----------
    group : cupy array of int32, shape (N,)
        1-based group id per element (as produced by
        ``_group_same_scores``); the kernel subtracts 1 to index ``result``.
    x : cupy array of float32, shape (N,)
        Values to accumulate.
    result : cupy array of float32
        Output buffer, one slot per group; accumulated into in place with
        ``atomicAdd`` so concurrent threads in the same group are safe.

    Returns
    -------
    result : cupy array of float32
        The same buffer passed in, after accumulation.
    """
    # One thread per element; each thread atomically adds its value into
    # the slot of its (0-based) group.
    addup_x_in_group_kernel = cp.RawKernel(
        r"""
        extern "C" __global__
        void addup_x_in_group(const int* group, const float* x,
                              float* result, int N)
        {
            int tid = blockDim.x * blockIdx.x + threadIdx.x;
            if(tid<N){
                atomicAdd(result + group[tid] - 1, x[tid]);
            }
        }
        """,
        "addup_x_in_group",
    )
    N = x.shape[0]
    # 256 threads per block; enough blocks to cover all N elements.
    tpb = 256
    bpg = math.ceil(N / tpb)
    addup_x_in_group_kernel((bpg,), (tpb,), (group, x, result, N))
    return result
def _group_same_scores(sorted_score):
    """Assign a 1-based group id to each element of a sorted score array;
    consecutive equal scores share the same id."""
    starts = cp.empty(sorted_score.shape, dtype=cp.bool_)
    # The first element always opens a group; thereafter a new group starts
    # wherever the score differs from its predecessor.
    starts[0] = True
    starts[1:] = sorted_score[1:] != sorted_score[:-1]
    # Running count of group starts is exactly the group id.
    return cp.cumsum(starts, dtype=cp.int32)
def _calculate_area_under_curve(fpr, tpr):
    """helper function to calculate area under curve given fpr & tpr arrays"""
    # Trapezoid rule over consecutive (fpr, tpr) points...
    widths = fpr[1:] - fpr[:-1]
    height_sums = tpr[1:] + tpr[:-1]
    interior = cp.sum(widths * height_sums) / 2
    # ...plus the leading trapezoid from the implicit origin (0, 0) to the
    # first point.
    leading = tpr[0] * fpr[0] / 2
    return interior + leading