Skip to content

Commit

Permalink
FIX Increase mean precision for large float32 arrays (#12338)
Browse files Browse the repository at this point in the history
  • Loading branch information
bauks authored and jnothman committed Nov 14, 2018
1 parent 87077f3 commit c58d322
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 1 deletion.
8 changes: 8 additions & 0 deletions doc/whats_new/v0.20.rst
Expand Up @@ -86,6 +86,14 @@ Changelog
- |Fix| Fixed bug in :class:`preprocessing.OrdinalEncoder` when passing
manually specified categories. :issue:`12365` by `Joris Van den Bossche`_.

:mod:`sklearn.utils`
....................

- |Fix| Use float64 for mean accumulator to avoid floating point
precision issues in :class:`preprocessing.StandardScaler` and
:class:`decomposition.IncrementalPCA` when using float32 datasets.
:issue:`12338` by :user:`bauks <bauks>`.

.. _changes_0_20:

Version 0.20.0
Expand Down
14 changes: 14 additions & 0 deletions sklearn/preprocessing/tests/test_data.py
Expand Up @@ -220,6 +220,20 @@ def test_standard_scaler_1d():
assert_equal(scaler.n_samples_seen_, X.shape[0])


def test_standard_scaler_dtype():
    # Scaling must preserve the input dtype, while the fitted statistics
    # (mean_/scale_) are accumulated in float64 for precision.
    rng = np.random.RandomState(0)
    n_samples, n_features = 10, 3
    for dtype in (np.float16, np.float32, np.float64):
        X = rng.randn(n_samples, n_features).astype(dtype)
        scaler = StandardScaler().fit(X)
        X_scaled = scaler.transform(X)
        assert X_scaled.dtype == X.dtype
        assert scaler.mean_.dtype == np.float64
        assert scaler.scale_.dtype == np.float64


def test_scale_1d():
# 1-d inputs
X_list = [1., 3., 5., 0.]
Expand Down
7 changes: 6 additions & 1 deletion sklearn/utils/extmath.py
Expand Up @@ -763,7 +763,12 @@ def _incremental_mean_and_var(X, last_mean, last_variance, last_sample_count):
# new = the current increment
# updated = the aggregated stats
last_sum = last_mean * last_sample_count
new_sum = np.nansum(X, axis=0)
if np.issubdtype(X.dtype, np.floating) and X.dtype.itemsize < 8:
# Use at least float64 for the accumulator to avoid precision issues;
# see https://github.com/numpy/numpy/issues/9393
new_sum = np.nansum(X, axis=0, dtype=np.float64).astype(X.dtype)
else:
new_sum = np.nansum(X, axis=0)

new_sample_count = np.sum(~np.isnan(X), axis=0)
updated_sample_count = last_sample_count + new_sample_count
Expand Down

0 comments on commit c58d322

Please sign in to comment.