Skip to content

Commit

Permalink
FIX Increase mean precision for large float32 arrays (#12338)
Browse files Browse the repository at this point in the history
  • Loading branch information
bauks authored and jnothman committed Nov 14, 2018
1 parent 87077f3 commit c58d322
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 1 deletion.
8 changes: 8 additions & 0 deletions doc/whats_new/v0.20.rst
Expand Up @@ -86,6 +86,14 @@ Changelog
- |Fix| Fixed bug in :class:`preprocessing.OrdinalEncoder` when passing
manually specified categories. :issue:`12365` by `Joris Van den Bossche`_.

:mod:`sklearn.utils`
....................

- |Fix| Use float64 for mean accumulator to avoid floating point
precision issues in :class:`preprocessing.StandardScaler` and
:class:`decomposition.IncrementalPCA` when using float32 datasets.
:issue:`12338` by :user:`bauks <bauks>`.

.. _changes_0_20:

Version 0.20.0
Expand Down
14 changes: 14 additions & 0 deletions sklearn/preprocessing/tests/test_data.py
Expand Up @@ -220,6 +220,20 @@ def test_standard_scaler_1d():
assert_equal(scaler.n_samples_seen_, X.shape[0])


def test_standard_scaler_dtype():
    # Scaling must preserve the input dtype, while the fitted statistics
    # (mean_/scale_) are accumulated in float64 for precision.
    rng = np.random.RandomState(0)
    n_samples, n_features = 10, 3
    for dtype in (np.float16, np.float32, np.float64):
        X = rng.randn(n_samples, n_features).astype(dtype)
        scaler = StandardScaler().fit(X)
        X_scaled = scaler.transform(X)
        assert X_scaled.dtype == X.dtype
        assert scaler.mean_.dtype == np.float64
        assert scaler.scale_.dtype == np.float64


def test_scale_1d():
# 1-d inputs
X_list = [1., 3., 5., 0.]
Expand Down
7 changes: 6 additions & 1 deletion sklearn/utils/extmath.py
Expand Up @@ -763,7 +763,12 @@ def _incremental_mean_and_var(X, last_mean, last_variance, last_sample_count):
# new = the current increment
# updated = the aggregated stats
last_sum = last_mean * last_sample_count
new_sum = np.nansum(X, axis=0)
if np.issubdtype(X.dtype, np.floating) and X.dtype.itemsize < 8:
# Use at least float64 for the accumulator to avoid precision issues;
# see https://github.com/numpy/numpy/issues/9393
new_sum = np.nansum(X, axis=0, dtype=np.float64).astype(X.dtype)
else:
new_sum = np.nansum(X, axis=0)

new_sample_count = np.sum(~np.isnan(X), axis=0)
updated_sample_count = last_sample_count + new_sample_count
Expand Down

0 comments on commit c58d322

Please sign in to comment.