/
entropy.pyx
103 lines (85 loc) · 3.52 KB
/
entropy.pyx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# distutils: language = c++
import math
import typing
from cuml.internals.safe_imports import cpu_only_import
np = cpu_only_import('numpy')
from cuml.internals.safe_imports import gpu_only_import
cp = gpu_only_import('cupy')
from libc.stdint cimport uintptr_t
import cuml.internals
from pylibraft.common.handle cimport handle_t
from cuml.common import CumlArray
from cuml.internals.input_utils import input_to_cupy_array
from pylibraft.common.handle import Handle
cimport cuml.common.cuda
# Declaration of the C++ routine `ML::Metrics::entropy` from cuML's
# "cuml/metrics/metrics.hpp" header so that it can be called from this module.
# Arguments, in order: the RAFT handle, a pointer to the int labels, the
# number of labels, and the lower and upper bounds of the label range
# (the caller in this file passes the minimum and maximum label values).
# `except +` makes Cython translate a C++ exception thrown by the call into
# a Python exception instead of aborting the process.
cdef extern from "cuml/metrics/metrics.hpp" namespace "ML::Metrics":
    double entropy(const handle_t &handle,
                   const int *y,
                   const int n,
                   const int lower_class_range,
                   const int upper_class_range) except +
@cuml.internals.api_return_generic()
def _prepare_cluster_input(cluster) -> typing.Tuple[CumlArray, int, int, int]:
    """
    Convert a label array and determine the range of labels it contains.

    Shared by the clustering metrics so that the input conversion and the
    min/max label lookup are written only once.

    Parameters
    ----------
    cluster : array-like
        Labels to convert. The array is passed to ``input_to_cupy_array``
        with ``check_dtype=np.int32`` and ``check_cols=1``.

    Returns
    -------
    tuple
        ``(labels, n_rows, lower_class_range, upper_class_range)``: the
        converted array, the row count reported by the conversion, and the
        smallest and largest label as Python scalars.
    """
    labels, n_samples, _n_cols, _dtype = input_to_cupy_array(
        cluster, check_dtype=np.int32, check_cols=1)

    # .item() copies each reduction result back to the host as a scalar.
    smallest_label = cp.min(labels).item()
    largest_label = cp.max(labels).item()

    return labels, n_samples, smallest_label, largest_label
@cuml.internals.api_return_any()
def cython_entropy(clustering, base=None, handle=None) -> float:
    """
    Computes the entropy of the label distribution of a clustering.

    Parameters
    ----------
    clustering : array-like (device or host) shape = (n_samples,)
        Clustering of labels. Probabilities are computed based on occurrences
        of labels. For instance, to represent a fair coin (2 equally possible
        outcomes), the clustering could be [0,1]. For a biased coin with 2/3
        probability for tail, the clustering could be [0, 0, 1].
    base: float, optional
        The logarithmic base to use, defaults to e (natural logarithm).
        Must be positive and different from 1.
    handle : cuml.Handle
        Specifies the cuml.handle that holds internal CUDA state for
        computations in this model. Most importantly, this specifies the CUDA
        stream that will be used for the model's computations, so users can
        run different models concurrently in different streams by creating
        handles in several streams.
        If it is None, a new one is created.

    Returns
    -------
    S : float
        The calculated entropy.

    Raises
    ------
    ValueError
        If ``base`` is not positive (raised by ``math.log``).
    ZeroDivisionError
        If ``base`` is 1, since ``log(1) == 0``.
    """
    handle = Handle() if handle is None else handle
    # getHandle() yields the address of the underlying C++ handle_t as an
    # integer; cast it back to a pointer for the native call.
    cdef handle_t *handle_ = <handle_t*> <size_t> handle.getHandle()

    (clustering, n_rows,
     lower_class_range, upper_class_range) = _prepare_cluster_input(clustering)

    cdef uintptr_t clustering_ptr = clustering.ptr

    S = entropy(handle_[0],
                <int*> clustering_ptr,
                <int> n_rows,
                <int> lower_class_range,
                <int> upper_class_range)

    if base is not None:
        # S is expressed in nats (base e). Apply the change-of-base identity
        # log_b(x) = ln(x) / ln(b) directly rather than round-tripping
        # through exp()/log(), which adds rounding error and an intermediate
        # value that can overflow.
        S = S / math.log(base)

    return S