Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG + 1] KMeans optimisation for array C/F contiguity #10471

Merged
merged 20 commits into from Feb 8, 2018
Merged
Changes from 1 commit
Commits
File filter...
Filter file types
Jump to…
Jump to file or symbol
Failed to load files and symbols.
+16 −5
Diff settings

Always

Just for now

Copy path View file
@@ -170,6 +170,9 @@ def k_means(X, n_clusters, init='k-means++', precompute_distances='auto',
algorithm="auto", return_n_iter=False):
"""K-means clustering algorithm.
Warning: The data will be converted to C ordering, which might cause a

This comment has been minimized.

Copy link
@glemaitre

glemaitre Jan 14, 2018

Contributor

I would avoid issuing a warning here. You can add a line of explanation under X, similar to the existing explanation regarding sparse CSR matrices.

Also, a conversion will for sure make a memory copy, so you should remove "might" :)

memory copy if the given data is in fortran order.
Read more in the :ref:`User Guide <k_means>`.
Parameters
@@ -299,7 +302,7 @@ def k_means(X, n_clusters, init='k-means++', precompute_distances='auto',

# Validate init array
if hasattr(init, '__array__'):
init = check_array(init, dtype=X.dtype.type, copy=True)
init = check_array(init, dtype=X.dtype.type, copy=True, order='C')
_validate_center_shape(X, n_clusters, init)

if n_init != 1:
@@ -393,7 +396,7 @@ def _kmeans_single_elkan(X, n_clusters, max_iter=300, init='k-means++',
precompute_distances=True):
if sp.issparse(X):
raise ValueError("algorithm='elkan' not supported for sparse input X")
X = check_array(X, order="C")
X = check_array(X, order='C')

This comment has been minimized.

Copy link
@glemaitre

glemaitre Jan 14, 2018

Contributor

unnecessary change

random_state = check_random_state(random_state)
if x_squared_norms is None:
x_squared_norms = row_norms(X, squared=True)
@@ -716,6 +719,9 @@ class KMeans(BaseEstimator, ClusterMixin, TransformerMixin):
Read more in the :ref:`User Guide <k_means>`.
Warning: The data will be converted to C ordering, which might cause a

This comment has been minimized.

Copy link
@glemaitre

glemaitre Jan 14, 2018

Contributor

I would avoid issuing a warning here. You can add a line of explanation under X, similar to the existing explanation regarding sparse CSR matrices.

memory copy if the given data is in fortran order.
Parameters
----------
@@ -862,14 +868,16 @@ def __init__(self, n_clusters=8, init='k-means++', n_init=10,

def _check_fit_data(self, X):
"""Verify that the number of samples given is larger than k"""
X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32])
X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32],
order='C')
if X.shape[0] < self.n_clusters:
raise ValueError("n_samples=%d should be >= n_clusters=%d" % (
X.shape[0], self.n_clusters))
return X

def _check_test_data(self, X):
X = check_array(X, accept_sparse='csr', dtype=FLOAT_DTYPES)
X = check_array(X, accept_sparse='csr', dtype=FLOAT_DTYPES,
order='C')
n_samples, n_features = X.shape
expected_n_features = self.cluster_centers_.shape[1]
if not n_features == expected_n_features:
@@ -1225,6 +1233,9 @@ class MiniBatchKMeans(KMeans):
Read more in the :ref:`User Guide <mini_batch_kmeans>`.
Warning: The data will be converted to C ordering, which might cause a

This comment has been minimized.

Copy link
@glemaitre

glemaitre Jan 14, 2018

Contributor

I would avoid issuing a warning here. You can add a line of explanation under X, similar to the existing explanation regarding sparse CSR matrices.

memory copy if the given data is in fortran order.
Parameters
----------
@@ -1522,7 +1533,7 @@ def partial_fit(self, X, y=None):
"""

X = check_array(X, accept_sparse="csr")
X = check_array(X, accept_sparse="csr", order='C')
n_samples, n_features = X.shape
if hasattr(self.init, '__array__'):
self.init = np.ascontiguousarray(self.init, dtype=X.dtype)
ProTip! Use n and p to navigate between commits in a pull request.
You can’t perform that action at this time.