/
blobs.py
218 lines (186 loc) · 8.43 KB
/
blobs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
#
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import nvtx
import numbers
from collections.abc import Iterable
import cupy as cp
import numpy as np
import cuml.internals
from cuml.datasets.utils import _create_rs_generator
def _get_centers(rs, centers, center_box, n_samples, n_features, dtype):
if isinstance(n_samples, numbers.Integral):
# Set n_centers by looking at centers arg
if centers is None:
centers = 3
if isinstance(centers, numbers.Integral):
n_centers = centers
centers = rs.uniform(center_box[0], center_box[1],
size=(n_centers, n_features),
dtype=dtype)
else:
if n_features != centers.shape[1]:
raise ValueError("Expected `n_features` to be equal to"
" the length of axis 1 of centers array")
n_centers = centers.shape[0]
else:
# Set n_centers by looking at [n_samples] arg
n_centers = len(n_samples)
if centers is None:
centers = rs.uniform(center_box[0], center_box[1],
size=(n_centers, n_features),
dtype=dtype)
try:
assert len(centers) == n_centers
except TypeError:
raise ValueError("Parameter `centers` must be array-like. "
"Got {!r} instead".format(centers))
except AssertionError:
raise ValueError("Length of `n_samples` not consistent"
" with number of centers. Got n_samples = {} "
"and centers = {}".format(n_samples, centers))
else:
if n_features != centers.shape[1]:
raise ValueError("Expected `n_features` to be equal to"
" the length of axis 1 of centers array")
return centers, n_centers
@nvtx.annotate(message="datasets.make_blobs", domain="cuml_python")
@cuml.internals.api_return_generic()
def make_blobs(n_samples=100, n_features=2, centers=None, cluster_std=1.0,
center_box=(-10.0, 10.0), shuffle=True, random_state=None,
return_centers=False, order='F', dtype='float32'):
"""Generate isotropic Gaussian blobs for clustering.
Parameters
----------
n_samples : int or array-like, optional (default=100)
If int, it is the total number of points equally divided among
clusters.
If array-like, each element of the sequence indicates
the number of samples per cluster.
n_features : int, optional (default=2)
The number of features for each sample.
centers : int or array of shape [`n_centers`, `n_features`], optional
(default=None)
The number of centers to generate, or the fixed center locations.
If `n_samples` is an int and centers is None, 3 centers are generated.
If `n_samples` is array-like, centers must be
either None or an array of length equal to the length of `n_samples`.
cluster_std : float or sequence of floats, optional (default=1.0)
The standard deviation of the clusters.
center_box : pair of floats (min, max), optional (default=(-10.0, 10.0))
The bounding box for each cluster center when centers are
generated at random.
shuffle : boolean, optional (default=True)
Shuffle the samples.
random_state : int, RandomState instance, default=None
Determines random number generation for dataset creation. Pass an int
for reproducible output across multiple function calls.
return_centers : bool, optional (default=False)
If True, then return the centers of each cluster
order: str, optional (default='F')
The order of the generated samples
dtype : str, optional (default='float32')
Dtype of the generated samples
Returns
-------
X : device array of shape [n_samples, n_features]
The generated samples.
y : device array of shape [n_samples]
The integer labels for cluster membership of each sample.
centers : device array, shape [n_centers, n_features]
The centers of each cluster. Only returned if
``return_centers=True``.
Examples
--------
.. code-block:: python
>>> from sklearn.datasets import make_blobs
>>> X, y = make_blobs(n_samples=10, centers=3, n_features=2,
... random_state=0)
>>> print(X.shape)
(10, 2)
>>> y
array([0, 0, 1, 0, 2, 2, 2, 1, 1, 0])
>>> X, y = make_blobs(n_samples=[3, 3, 4], centers=None, n_features=2,
... random_state=0)
>>> print(X.shape)
(10, 2)
>>> y
array([0, 1, 2, 0, 2, 2, 2, 1, 1, 0])
See also
--------
make_classification: a more intricate variant
"""
# Set the default output type to "cupy". This will be ignored if the user
# has set `cuml.global_settings.output_type`. Only necessary for array
# generation methods that do not take an array as input
cuml.internals.set_api_output_type("cupy")
generator = _create_rs_generator(random_state=random_state)
centers, n_centers = _get_centers(generator, centers, center_box,
n_samples, n_features,
dtype)
# stds: if cluster_std is given as list, it must be consistent
# with the n_centers
if (hasattr(cluster_std, "__len__") and len(cluster_std) != n_centers):
raise ValueError("Length of `clusters_std` not consistent with "
"number of centers. Got centers = {} "
"and cluster_std = {}".format(centers, cluster_std))
if isinstance(cluster_std, numbers.Real):
cluster_std = cp.full(len(centers), cluster_std)
if isinstance(n_samples, Iterable):
n_samples_per_center = n_samples
else:
n_samples_per_center = [int(n_samples // n_centers)] * n_centers
for i in range(n_samples % n_centers):
n_samples_per_center[i] += 1
X = cp.zeros(n_samples * n_features, dtype=dtype)
X = X.reshape((n_samples, n_features), order=order)
y = cp.zeros(n_samples, dtype=dtype)
if shuffle:
proba_samples_per_center = np.array(n_samples_per_center) / np.sum(
n_samples_per_center)
np_seed = int(generator.randint(n_samples, size=1))
np.random.seed(np_seed)
shuffled_sample_indices = cp.array(np.random.choice(
n_centers,
n_samples,
replace=True,
p=proba_samples_per_center
))
for i, (n, std) in enumerate(zip(n_samples_per_center, cluster_std)):
center_indices = cp.where(shuffled_sample_indices == i)
y[center_indices[0]] = i
X_k = generator.normal(scale=std,
size=(len(center_indices[0]), n_features),
dtype=dtype)
# NOTE: Adding the loc explicitly as cupy has a bug
# when calling generator.normal with an array for loc.
# cupy.random.normal, however, works with the same
# arguments
cp.add(X_k, centers[i], out=X_k)
X[center_indices[0], :] = X_k
else:
stop = 0
for i, (n, std) in enumerate(zip(n_samples_per_center, cluster_std)):
start, stop = stop, stop + n_samples_per_center[i]
y[start:stop] = i
X_k = generator.normal(scale=std,
size=(n, n_features),
dtype=dtype)
cp.add(X_k, centers[i], out=X_k)
X[start:stop, :] = X_k
if return_centers:
return X, y, centers
else:
return X, y