Commit 279fd60: MAINT: Merge.

tomlof committed Oct 29, 2016 (2 parents: ce17751 + edc9e7f)
Showing 7 changed files with 57 additions and 19 deletions.
4 changes: 4 additions & 0 deletions README.rst
@@ -159,3 +159,7 @@ Communication
- IRC channel: ``#scikit-learn`` at ``irc.freenode.net``
- Stack Overflow: http://stackoverflow.com/questions/tagged/scikit-learn
- Website: http://scikit-learn.org

Citation
~~~~~~~~~~~~~
If you use scikit-learn in a scientific publication, we would appreciate citations: http://scikit-learn.org/stable/about.html#citing-scikit-learn
20 changes: 20 additions & 0 deletions doc/whats_new.rst
@@ -22,6 +22,12 @@ New features
Enhancements
............

- :class:`cluster.MiniBatchKMeans` and :class:`cluster.KMeans`
  now use significantly less memory when assigning data points to their
  nearest cluster center.
  (`#7721 <https://github.com/scikit-learn/scikit-learn/pull/7721>`_)
  by `Jon Crall`_.

- Added ``classes_`` attribute to :class:`model_selection.GridSearchCV`
  that matches the ``classes_`` attribute of ``best_estimator_``. (`#7661
  <https://github.com/scikit-learn/scikit-learn/pull/7661>`_) by `Alyssa
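As a quick illustration of the new attribute (the dataset, estimator, and grid here are hypothetical, chosen only for the sketch), the fitted search object mirrors its best estimator:

.. code-block:: python

    # After fitting, grid.classes_ matches grid.best_estimator_.classes_.
    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GridSearchCV

    X, y = load_iris(return_X_y=True)
    grid = GridSearchCV(LogisticRegression(), param_grid={'C': [0.1, 1.0]})
    grid.fit(X, y)
    print(grid.classes_)                  # [0 1 2]
    print(grid.best_estimator_.classes_)  # identical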
@@ -84,6 +90,20 @@ Bug fixes
Version 0.18.1
==============

Enhancements
............

- Improved ``sample_without_replacement`` speed by using
  ``numpy.random.permutation`` for most cases. As a result,
  samples may differ in this release for a fixed random state.
  Affected estimators:

  - :class:`ensemble.BaggingClassifier`
  - :class:`ensemble.BaggingRegressor`
  - :class:`linear_model.RANSACRegressor`
  - :class:`model_selection.RandomizedSearchCV`
  - :class:`random_projection.SparseRandomProjection`

  This also affects the :func:`datasets.make_classification`
  function.

Bug fixes
.........

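The ``sample_without_replacement`` enhancement above is easiest to see from the public helper. A minimal sketch (the numbers are illustrative only): for a fixed ``random_state`` the samples remain valid draws without replacement, but the concrete values can differ from earlier releases because mid-range ratios now go through ``numpy.random.permutation``.

```python
# ratio = 50 / 100 = 0.5 falls in the new permutation regime.
from sklearn.utils.random import sample_without_replacement

idx = sample_without_replacement(n_population=100, n_samples=50,
                                 random_state=0)
# Still a valid sample: unique indices drawn from range(100).
assert len(set(idx)) == 50 and all(0 <= i < 100 for i in idx)
```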
20 changes: 9 additions & 11 deletions sklearn/cluster/k_means_.py
@@ -18,6 +18,7 @@
 
 from ..base import BaseEstimator, ClusterMixin, TransformerMixin
 from ..metrics.pairwise import euclidean_distances
+from ..metrics.pairwise import pairwise_distances_argmin_min
 from ..utils.extmath import row_norms, squared_norm, stable_cumsum
 from ..utils.sparsefuncs_fast import assign_rows_csr
 from ..utils.sparsefuncs import mean_variance_axis
@@ -552,17 +553,14 @@ def _labels_inertia_precompute_dense(X, x_squared_norms, centers, distances):
     """
     n_samples = X.shape[0]
-    k = centers.shape[0]
-    all_distances = euclidean_distances(centers, X, x_squared_norms,
-                                        squared=True)
-    labels = np.empty(n_samples, dtype=np.int32)
-    labels.fill(-1)
-    mindist = np.empty(n_samples)
-    mindist.fill(np.infty)
-    for center_id in range(k):
-        dist = all_distances[center_id]
-        labels[dist < mindist] = center_id
-        mindist = np.minimum(dist, mindist)
 
+    # Breakup nearest neighbor distance computation into batches to prevent
+    # memory blowup in the case of a large number of samples and clusters.
+    # TODO: Once PR #7383 is merged use check_inputs=False in metric_kwargs.
+    labels, mindist = pairwise_distances_argmin_min(
+        X=X, Y=centers, metric='euclidean', metric_kwargs={'squared': True})
+    # cython k-means code assumes int32 inputs
+    labels = labels.astype(np.int32)
     if n_samples == distances.shape[0]:
         # distances will be changed in-place
         distances[:] = mindist
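For readers unfamiliar with the helper adopted above, here is a small self-contained sketch (random data and illustrative shapes, not from the commit) of the batched nearest-center assignment:

```python
# pairwise_distances_argmin_min chunks the computation, so the full
# (n_samples, n_clusters) distance matrix is never materialized at once.
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances_argmin_min

rng = np.random.RandomState(0)
X = rng.rand(10000, 16)     # samples
centers = rng.rand(8, 16)   # cluster centers

labels, mindist = pairwise_distances_argmin_min(
    X=X, Y=centers, metric='euclidean', metric_kwargs={'squared': True})
labels = labels.astype(np.int32)  # the Cython k-means core expects int32
assert labels.shape == (10000,) and mindist.shape == (10000,)
```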
8 changes: 4 additions & 4 deletions sklearn/pipeline.py
@@ -136,13 +136,13 @@ class Pipeline(_BasePipeline):
     Pipeline(steps=[...])
     >>> prediction = anova_svm.predict(X)
     >>> anova_svm.score(X, y)  # doctest: +ELLIPSIS
-    0.77...
+    0.829...
     >>> # getting the selected features chosen by anova_filter
     >>> anova_svm.named_steps['anova'].get_support()
     ... # doctest: +NORMALIZE_WHITESPACE
-    array([ True, True, True, False, False, True, False, True, True, True,
-           False, False, True, False, True, False, False, False, False,
-           True], dtype=bool)
+    array([False, False, True, True, False, False, True, True, False,
+           True, False, True, True, False, True, False, True, True,
+           False, False], dtype=bool)
     """
 
     # BaseEstimator interface
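For context on why these doctest values moved: ``make_classification`` picks informative features via ``sample_without_replacement``, whose output changed for a fixed seed in this release. The following is a sketch of the surrounding example, written under the assumption that the docstring follows the standard ANOVA-SVM ``Pipeline`` example (the parameter values here are assumptions, not taken from this diff):

```python
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

X, y = make_classification(n_informative=5, n_redundant=0, random_state=42)
anova_svm = Pipeline([('anova', SelectKBest(f_regression, k=5)),
                      ('svc', SVC(kernel='linear'))])
anova_svm.fit(X, y)
print(anova_svm.score(X, y))                        # the score shown above
print(anova_svm.named_steps['anova'].get_support())  # the boolean mask above
```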
4 changes: 3 additions & 1 deletion sklearn/preprocessing/label.py
@@ -732,7 +732,9 @@ def fit_transform(self, y):
         class_mapping = np.empty(len(tmp), dtype=dtype)
         class_mapping[:] = tmp
         self.classes_, inverse = np.unique(class_mapping, return_inverse=True)
-        yt.indices = np.take(inverse, yt.indices)
+        # ensure yt.indices keeps its current dtype
+        yt.indices = np.array(inverse[yt.indices], dtype=yt.indices.dtype,
+                              copy=False)
 
         if not self.sparse_output:
             yt = yt.toarray()
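The behavioral difference is easiest to see outside the class. A minimal numpy-only sketch (the arrays are hypothetical stand-ins for ``yt.indices`` and ``inverse``):

```python
# np.take lets the output dtype follow `inverse` (the platform default
# int, often int64), while fancy indexing plus np.array with an explicit
# dtype keeps the CSR indices at their original int32.
import numpy as np

indices = np.array([0, 2, 1, 2], dtype=np.int32)  # stands in for yt.indices
inverse = np.arange(10, 13)                       # default integer dtype

upcast = np.take(inverse, indices)
kept = np.array(inverse[indices], dtype=indices.dtype, copy=False)
print(upcast.dtype, kept.dtype)  # e.g. int64 int32
```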
4 changes: 4 additions & 0 deletions sklearn/preprocessing/tests/test_label.py
@@ -226,6 +226,8 @@ def test_sparse_output_multilabel_binarizer():
             got = mlb.fit_transform(inp())
             assert_equal(issparse(got), sparse_output)
             if sparse_output:
+                # verify CSR assumption that indices and indptr have same dtype
+                assert_equal(got.indices.dtype, got.indptr.dtype)
                 got = got.toarray()
             assert_array_equal(indicator_mat, got)
             assert_array_equal([1, 2, 3], mlb.classes_)
@@ -236,6 +238,8 @@ def test_sparse_output_multilabel_binarizer():
             got = mlb.fit(inp()).transform(inp())
             assert_equal(issparse(got), sparse_output)
             if sparse_output:
+                # verify CSR assumption that indices and indptr have same dtype
+                assert_equal(got.indices.dtype, got.indptr.dtype)
                 got = got.toarray()
             assert_array_equal(indicator_mat, got)
             assert_array_equal([1, 2, 3], mlb.classes_)
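A standalone version of the invariant these new assertions guard (illustrative input data): with ``sparse_output=True`` the result is a CSR matrix, and scipy expects ``indices`` and ``indptr`` to share a dtype.

```python
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer(sparse_output=True)
got = mlb.fit_transform([(2, 3), (1,), (1, 2)])
# The CSR invariant restored by the label.py change above:
assert got.indices.dtype == got.indptr.dtype
print(got.toarray())
```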
16 changes: 13 additions & 3 deletions sklearn/utils/_random.pyx
@@ -248,7 +248,11 @@ cpdef sample_without_replacement(np.int_t n_population,
         by `np.random`.
 
     method : "auto", "tracking_selection", "reservoir_sampling" or "pool"
-        If method == "auto", an algorithm is automatically selected.
+        If method == "auto", the ratio of n_samples / n_population is used
+        to determine which algorithm to use:
+        If ratio is between 0 and 0.01, tracking selection is used.
+        If ratio is between 0.01 and 0.99, numpy.random.permutation is used.
+        If ratio is greater than 0.99, reservoir sampling is used.
 
         The order of the selected integers is undefined. If a random order is
         desired, the selected subset should be shuffled.
@@ -276,11 +280,17 @@
 
     all_methods = ("auto", "tracking_selection", "reservoir_sampling", "pool")
 
+    ratio = n_samples / n_population if n_population != 0.0 else 1.0
+
+    # Check ratio and use permutation unless ratio < 0.01 or ratio > 0.99
+    if method == "auto" and ratio > 0.01 and ratio < 0.99:
+        rng = check_random_state(random_state)
+        return rng.permutation(n_population)[:n_samples]
+
     if method == "auto" or method == "tracking_selection":
         # TODO the pool based method can also be used.
         # however, it requires special benchmark to take into account
         # the memory requirement of the array vs the set.
-        ratio = n_samples / n_population if n_population != 0.0 else 1.0
 
         # The value 0.2 has been determined through benchmarking.
         if ratio < 0.2:
@@ -296,7 +306,7 @@
 
     elif method == "pool":
         return _sample_without_replacement_with_pool(n_population, n_samples,
                                                      random_state)
     else:
         raise ValueError('Expected a method name in %s, got %s. '
                          % (all_methods, method))

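A pure-Python sketch of the ``auto`` dispatch above. The real implementation is Cython; the function name is hypothetical, and the set-based and reservoir branches are simplified stand-ins for the library's tracking-selection and reservoir-sampling helpers.

```python
import numpy as np

def sample_without_replacement_auto(n_population, n_samples, rng):
    """Sketch of the dispatch; not the library implementation."""
    ratio = n_samples / float(n_population) if n_population != 0 else 1.0
    if 0.01 < ratio < 0.99:
        # mid-range ratios: truncated permutation (the new fast path)
        return rng.permutation(n_population)[:n_samples]
    if ratio <= 0.01:
        # sparse sampling: tracking selection via set-based rejection
        selected = set()
        while len(selected) < n_samples:
            selected.add(rng.randint(0, n_population))
        return np.asarray(list(selected))
    # near-exhaustive sampling: reservoir sampling (Algorithm R)
    out = np.arange(n_samples)
    for i in range(n_samples, n_population):
        j = rng.randint(0, i + 1)
        if j < n_samples:
            out[j] = i
    return out

rng = np.random.RandomState(0)
print(sample_without_replacement_auto(100, 50, rng))  # permutation path
```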