Commit 279fd60: MAINT: Merge.

tomlof committed Oct 29, 2016 (2 parents: ce17751 + edc9e7f)
Showing 7 changed files with 57 additions and 19 deletions.
4 changes: 4 additions & 0 deletions README.rst
@@ -159,3 +159,7 @@ Communication
- IRC channel: ``#scikit-learn`` at ``irc.freenode.net``
- Stack Overflow: http://stackoverflow.com/questions/tagged/scikit-learn
- Website: http://scikit-learn.org

Citation
~~~~~~~~~~~~~
If you use scikit-learn in a scientific publication, we would appreciate citations: http://scikit-learn.org/stable/about.html#citing-scikit-learn
20 changes: 20 additions & 0 deletions doc/whats_new.rst
@@ -22,6 +22,12 @@ New features
Enhancements
............

- :class:`cluster.MiniBatchKMeans` and :class:`cluster.KMeans`
  now use significantly less memory when assigning data points to their
  nearest cluster center.
  (`#7721 <https://github.com/scikit-learn/scikit-learn/pull/7721>`_)
  by `Jon Crall`_.

- Added ``classes_`` attribute to :class:`model_selection.GridSearchCV`
  that matches the ``classes_`` attribute of ``best_estimator_``. (`#7661
  <https://github.com/scikit-learn/scikit-learn/pull/7661>`_) by `Alyssa
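As a quick illustration of the new attribute (the dataset, estimator, and grid here are hypothetical, chosen only for the sketch), the fitted search object mirrors its best estimator:

.. code-block:: python

    # After fitting, grid.classes_ matches grid.best_estimator_.classes_.
    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GridSearchCV

    X, y = load_iris(return_X_y=True)
    grid = GridSearchCV(LogisticRegression(), param_grid={'C': [0.1, 1.0]})
    grid.fit(X, y)
    print(grid.classes_)                  # [0 1 2]
    print(grid.best_estimator_.classes_)  # identical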
@@ -84,6 +90,20 @@ Bug fixes
Version 0.18.1
==============

Enhancements
............

- Improved ``sample_without_replacement`` speed by using
  ``numpy.random.permutation`` for most cases. As a result,
  samples may differ in this release for a fixed random state.
  Affected estimators:

  - :class:`ensemble.BaggingClassifier`
  - :class:`ensemble.BaggingRegressor`
  - :class:`linear_model.RANSACRegressor`
  - :class:`model_selection.RandomizedSearchCV`
  - :class:`random_projection.SparseRandomProjection`

  This also affects the :func:`datasets.make_classification`
  function.

Bug fixes
.........

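The ``sample_without_replacement`` enhancement above is easiest to see from the public helper. A minimal sketch (the numbers are illustrative only): for a fixed ``random_state`` the samples remain valid draws without replacement, but the concrete values can differ from earlier releases because mid-range ratios now go through ``numpy.random.permutation``.

```python
# ratio = 50 / 100 = 0.5 falls in the new permutation regime.
from sklearn.utils.random import sample_without_replacement

idx = sample_without_replacement(n_population=100, n_samples=50,
                                 random_state=0)
# Still a valid sample: unique indices drawn from range(100).
assert len(set(idx)) == 50 and all(0 <= i < 100 for i in idx)
```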
20 changes: 9 additions & 11 deletions sklearn/cluster/k_means_.py
@@ -18,6 +18,7 @@
 
 from ..base import BaseEstimator, ClusterMixin, TransformerMixin
 from ..metrics.pairwise import euclidean_distances
+from ..metrics.pairwise import pairwise_distances_argmin_min
 from ..utils.extmath import row_norms, squared_norm, stable_cumsum
 from ..utils.sparsefuncs_fast import assign_rows_csr
 from ..utils.sparsefuncs import mean_variance_axis
@@ -552,17 +553,14 @@ def _labels_inertia_precompute_dense(X, x_squared_norms, centers, distances):
     """
     n_samples = X.shape[0]
-    k = centers.shape[0]
-    all_distances = euclidean_distances(centers, X, x_squared_norms,
-                                        squared=True)
-    labels = np.empty(n_samples, dtype=np.int32)
-    labels.fill(-1)
-    mindist = np.empty(n_samples)
-    mindist.fill(np.infty)
-    for center_id in range(k):
-        dist = all_distances[center_id]
-        labels[dist < mindist] = center_id
-        mindist = np.minimum(dist, mindist)
 
+    # Breakup nearest neighbor distance computation into batches to prevent
+    # memory blowup in the case of a large number of samples and clusters.
+    # TODO: Once PR #7383 is merged use check_inputs=False in metric_kwargs.
+    labels, mindist = pairwise_distances_argmin_min(
+        X=X, Y=centers, metric='euclidean', metric_kwargs={'squared': True})
+    # cython k-means code assumes int32 inputs
+    labels = labels.astype(np.int32)
     if n_samples == distances.shape[0]:
         # distances will be changed in-place
         distances[:] = mindist
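For readers unfamiliar with the helper adopted above, here is a small self-contained sketch (random data and illustrative shapes, not from the commit) of the batched nearest-center assignment:

```python
# pairwise_distances_argmin_min chunks the computation, so the full
# (n_samples, n_clusters) distance matrix is never materialized at once.
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances_argmin_min

rng = np.random.RandomState(0)
X = rng.rand(10000, 16)     # samples
centers = rng.rand(8, 16)   # cluster centers

labels, mindist = pairwise_distances_argmin_min(
    X=X, Y=centers, metric='euclidean', metric_kwargs={'squared': True})
labels = labels.astype(np.int32)  # the Cython k-means core expects int32
assert labels.shape == (10000,) and mindist.shape == (10000,)
```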
8 changes: 4 additions & 4 deletions sklearn/pipeline.py
@@ -136,13 +136,13 @@ class Pipeline(_BasePipeline):
     Pipeline(steps=[...])
     >>> prediction = anova_svm.predict(X)
     >>> anova_svm.score(X, y)  # doctest: +ELLIPSIS
-    0.77...
+    0.829...
     >>> # getting the selected features chosen by anova_filter
     >>> anova_svm.named_steps['anova'].get_support()
     ... # doctest: +NORMALIZE_WHITESPACE
-    array([ True, True, True, False, False, True, False, True, True, True,
-           False, False, True, False, True, False, False, False, False,
-           True], dtype=bool)
+    array([False, False, True, True, False, False, True, True, False,
+           True, False, True, True, False, True, False, True, True,
+           False, False], dtype=bool)
     """
 
     # BaseEstimator interface
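For context on why these doctest values moved: ``make_classification`` picks informative features via ``sample_without_replacement``, whose output changed for a fixed seed in this release. The following is a sketch of the surrounding example, written under the assumption that the docstring follows the standard ANOVA-SVM ``Pipeline`` example (the parameter values here are assumptions, not taken from this diff):

```python
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

X, y = make_classification(n_informative=5, n_redundant=0, random_state=42)
anova_svm = Pipeline([('anova', SelectKBest(f_regression, k=5)),
                      ('svc', SVC(kernel='linear'))])
anova_svm.fit(X, y)
print(anova_svm.score(X, y))                        # the score shown above
print(anova_svm.named_steps['anova'].get_support())  # the boolean mask above
```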
4 changes: 3 additions & 1 deletion sklearn/preprocessing/label.py
@@ -732,7 +732,9 @@ def fit_transform(self, y):
         class_mapping = np.empty(len(tmp), dtype=dtype)
         class_mapping[:] = tmp
         self.classes_, inverse = np.unique(class_mapping, return_inverse=True)
-        yt.indices = np.take(inverse, yt.indices)
+        # ensure yt.indices keeps its current dtype
+        yt.indices = np.array(inverse[yt.indices], dtype=yt.indices.dtype,
+                              copy=False)
 
         if not self.sparse_output:
             yt = yt.toarray()
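The behavioral difference is easiest to see outside the class. A minimal numpy-only sketch (the arrays are hypothetical stand-ins for ``yt.indices`` and ``inverse``):

```python
# np.take lets the output dtype follow `inverse` (the platform default
# int, often int64), while fancy indexing plus np.array with an explicit
# dtype keeps the CSR indices at their original int32.
import numpy as np

indices = np.array([0, 2, 1, 2], dtype=np.int32)  # stands in for yt.indices
inverse = np.arange(10, 13)                       # default integer dtype

upcast = np.take(inverse, indices)
kept = np.array(inverse[indices], dtype=indices.dtype, copy=False)
print(upcast.dtype, kept.dtype)  # e.g. int64 int32
```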
4 changes: 4 additions & 0 deletions sklearn/preprocessing/tests/test_label.py
@@ -226,6 +226,8 @@ def test_sparse_output_multilabel_binarizer():
             got = mlb.fit_transform(inp())
             assert_equal(issparse(got), sparse_output)
             if sparse_output:
+                # verify CSR assumption that indices and indptr have same dtype
+                assert_equal(got.indices.dtype, got.indptr.dtype)
                 got = got.toarray()
             assert_array_equal(indicator_mat, got)
             assert_array_equal([1, 2, 3], mlb.classes_)
@@ -236,6 +238,8 @@ def test_sparse_output_multilabel_binarizer():
             got = mlb.fit(inp()).transform(inp())
             assert_equal(issparse(got), sparse_output)
             if sparse_output:
+                # verify CSR assumption that indices and indptr have same dtype
+                assert_equal(got.indices.dtype, got.indptr.dtype)
                 got = got.toarray()
             assert_array_equal(indicator_mat, got)
             assert_array_equal([1, 2, 3], mlb.classes_)
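A standalone version of the invariant these new assertions guard (illustrative input data): with ``sparse_output=True`` the result is a CSR matrix, and scipy expects ``indices`` and ``indptr`` to share a dtype.

```python
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer(sparse_output=True)
got = mlb.fit_transform([(2, 3), (1,), (1, 2)])
# The CSR invariant restored by the label.py change above:
assert got.indices.dtype == got.indptr.dtype
print(got.toarray())
```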
16 changes: 13 additions & 3 deletions sklearn/utils/_random.pyx
@@ -248,7 +248,11 @@ cpdef sample_without_replacement(np.int_t n_population,
         by `np.random`.
 
     method : "auto", "tracking_selection", "reservoir_sampling" or "pool"
-        If method == "auto", an algorithm is automatically selected.
+        If method == "auto", the ratio of n_samples / n_population is used
+        to determine which algorithm to use:
+        If ratio is between 0 and 0.01, tracking selection is used.
+        If ratio is between 0.01 and 0.99, numpy.random.permutation is used.
+        If ratio is greater than 0.99, reservoir sampling is used.
 
         The order of the selected integers is undefined. If a random order is
         desired, the selected subset should be shuffled.
@@ -276,11 +280,17 @@
 
     all_methods = ("auto", "tracking_selection", "reservoir_sampling", "pool")
 
+    ratio = n_samples / n_population if n_population != 0.0 else 1.0
+
+    # Check ratio and use permutation unless ratio < 0.01 or ratio > 0.99
+    if method == "auto" and ratio > 0.01 and ratio < 0.99:
+        rng = check_random_state(random_state)
+        return rng.permutation(n_population)[:n_samples]
+
     if method == "auto" or method == "tracking_selection":
         # TODO the pool based method can also be used.
         # however, it requires special benchmark to take into account
         # the memory requirement of the array vs the set.
-        ratio = n_samples / n_population if n_population != 0.0 else 1.0
 
         # The value 0.2 has been determined through benchmarking.
         if ratio < 0.2:
@@ -296,7 +306,7 @@
 
     elif method == "pool":
         return _sample_without_replacement_with_pool(n_population, n_samples,
                                                      random_state)
     else:
         raise ValueError('Expected a method name in %s, got %s. '
                          % (all_methods, method))

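A pure-Python sketch of the ``auto`` dispatch above. The real implementation is Cython; the function name is hypothetical, and the set-based and reservoir branches are simplified stand-ins for the library's tracking-selection and reservoir-sampling helpers.

```python
import numpy as np

def sample_without_replacement_auto(n_population, n_samples, rng):
    """Sketch of the dispatch; not the library implementation."""
    ratio = n_samples / float(n_population) if n_population != 0 else 1.0
    if 0.01 < ratio < 0.99:
        # mid-range ratios: truncated permutation (the new fast path)
        return rng.permutation(n_population)[:n_samples]
    if ratio <= 0.01:
        # sparse sampling: tracking selection via set-based rejection
        selected = set()
        while len(selected) < n_samples:
            selected.add(rng.randint(0, n_population))
        return np.asarray(list(selected))
    # near-exhaustive sampling: reservoir sampling (Algorithm R)
    out = np.arange(n_samples)
    for i in range(n_samples, n_population):
        j = rng.randint(0, i + 1)
        if j < n_samples:
            out[j] = i
    return out

rng = np.random.RandomState(0)
print(sample_without_replacement_auto(100, 50, rng))  # permutation path
```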