[MRG] ENH: Ignore NaNs in StandardScaler and scale #11206

Merged: 29 commits, merged on Jun 21, 2018.

Changes shown below are from 10 of the 29 commits.

Commits (29)
5dba57a  EHN ignore NaN when incrementing mean and var (glemaitre, Jun 5, 2018)
591c18f  EHN ignore NaNs in StandardScaler for dense case (glemaitre, Jun 5, 2018)
43fa54b  EHN should handle sparse case (glemaitre, Jun 8, 2018)
faab12a  TST launch common tests (glemaitre, Jun 8, 2018)
fa12fe7  FIX use loop (glemaitre, Jun 8, 2018)
4d60bfe  FIX number of samples first iteration (glemaitre, Jun 8, 2018)
eba3087  FIX use proper sparse constructor (glemaitre, Jun 11, 2018)
1782e69  BUG wrong index and remove nan counter (glemaitre, Jun 11, 2018)
0b074ce  revert (glemaitre, Jun 11, 2018)
a6cb76d  cleanup (glemaitre, Jun 11, 2018)
ab2c465  FIX use sum on the boolean array (glemaitre, Jun 11, 2018)
76691a9  TST equivalance function and class (glemaitre, Jun 11, 2018)
d5ece66  Merge remote-tracking branch 'origin/master' into nan_standardscaler (glemaitre, Jun 12, 2018)
504323c  Merge remote-tracking branch 'origin/master' into nan_standardscaler (glemaitre, Jun 15, 2018)
8d13da1  backward compatibility for n_samples_seen_ (glemaitre, Jun 15, 2018)
124742b  spelling (glemaitre, Jun 15, 2018)
2ffe497  TST revert some test for back compatibility (glemaitre, Jun 15, 2018)
424fdba  TST check NaN are ignored in incremental_mean_and_variance (glemaitre, Jun 15, 2018)
0fe8a3e  TST check NaNs ignore in incr_mean_variance (glemaitre, Jun 15, 2018)
f267a35  OPTIM cython variable typing (glemaitre, Jun 15, 2018)
4785fb2  DOC corrections (glemaitre, Jun 15, 2018)
082633d  DOC mentioned that NaNs are ignored in Notes (glemaitre, Jun 16, 2018)
d79b867  TST shape of n_samples_seen with missing values (glemaitre, Jun 16, 2018)
cb077ea  DOC fix spelling (glemaitre, Jun 16, 2018)
449d24a  DOC whats new entry (glemaitre, Jun 16, 2018)
6090a85  address joris comments (glemaitre, Jun 19, 2018)
7b4a6a3  Update data.py (glemaitre, Jun 21, 2018)
52d833f  Update extmath.py (glemaitre, Jun 21, 2018)
c0b633a  PEP8 (glemaitre, Jun 21, 2018)
7 changes: 4 additions & 3 deletions sklearn/decomposition/incremental_pca.py
@@ -243,9 +243,10 @@ def partial_fit(self, X, y=None, check_input=True):

# Update stats - they are 0 if this is the fisrt step
col_mean, col_var, n_total_samples = \
_incremental_mean_and_var(X, last_mean=self.mean_,
last_variance=self.var_,
last_sample_count=self.n_samples_seen_)
_incremental_mean_and_var(
X, last_mean=self.mean_, last_variance=self.var_,
last_sample_count=np.repeat(self.n_samples_seen_, X.shape[1]))
n_total_samples = n_total_samples[0]

# Whitening
if self.n_samples_seen_ == 0:
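For illustration, a minimal sketch of the calling convention introduced here, using the private helper _incremental_mean_and_var (whose updated signature appears later in this diff). Since the sample count is now tracked per feature, IncrementalPCA broadcasts its scalar n_samples_seen_ before the call and recovers it from the first element of the returned count. This only illustrates the private helper's usage, not a public API.

import numpy as np
from sklearn.utils.extmath import _incremental_mean_and_var

X = np.random.RandomState(0).rand(10, 4)
n_samples_seen = 0  # IncrementalPCA keeps a scalar count

mean, var, n_total = _incremental_mean_and_var(
    X,
    last_mean=np.zeros(X.shape[1]),
    last_variance=np.zeros(X.shape[1]),
    # broadcast the scalar count to one entry per feature, as done above
    last_sample_count=np.repeat(n_samples_seen, X.shape[1]))

# every feature sees the same rows here, so the scalar count can be
# recovered from the first element, exactly as in the diff
n_samples_seen = n_total[0]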
33 changes: 22 additions & 11 deletions sklearn/preprocessing/data.py
@@ -138,7 +138,7 @@ def scale(X, axis=0, with_mean=True, with_std=True, copy=True):
""" # noqa
X = check_array(X, accept_sparse='csc', copy=copy, ensure_2d=False,
warn_on_dtype=True, estimator='the scale function',
dtype=FLOAT_DTYPES)
dtype=FLOAT_DTYPES, force_all_finite='allow-nan')
if sparse.issparse(X):
if with_mean:
raise ValueError(
@@ -154,9 +154,9 @@ def scale(X, axis=0, with_mean=True, with_std=True, copy=True):
else:
X = np.asarray(X)
if with_mean:
mean_ = np.mean(X, axis)
mean_ = np.nanmean(X, axis)
if with_std:
scale_ = np.std(X, axis)
scale_ = np.nanstd(X, axis)
# Xr is a view on the original array that enables easy use of
# broadcasting on the axis in which we are interested in
Xr = np.rollaxis(X, axis)
@@ -179,7 +179,7 @@ def scale(X, axis=0, with_mean=True, with_std=True, copy=True):
scale_ = _handle_zeros_in_scale(scale_, copy=False)
Xr /= scale_
if with_mean:
mean_2 = Xr.mean(axis=0)
mean_2 = np.nanmean(Xr, axis=0)
# If mean_2 is not 'close to zero', it comes from the fact that
# scale_ is very small so that mean_2 = mean_1/scale_ > 0, even
# if mean_1 was close to zero. The problem is thus essentially
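As a quick illustration of the change to scale above (a sketch of the intended behaviour, not a test from this PR): with the statistics computed by np.nanmean and np.nanstd, NaN entries no longer raise and are left in place, while the non-missing values of each column are centred and scaled.

import numpy as np
from sklearn.preprocessing import scale

X = np.array([[1., 2.],
              [3., np.nan],
              [5., 8.]])
Xt = scale(X)  # NaNs are ignored when computing the mean and std

print(np.nanmean(Xt, axis=0))  # approximately [0., 0.]
print(np.nanstd(Xt, axis=0))   # approximately [1., 1.]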
@@ -533,9 +533,10 @@ class StandardScaler(BaseEstimator, TransformerMixin):
The variance for each feature in the training set. Used to compute
`scale_`

n_samples_seen_ : int
The number of samples processed by the estimator. Will be reset on
new calls to fit, but increments across ``partial_fit`` calls.
n_samples_seen_ : array, shape(n_features,)
The number of samples processed by the estimator for each feature.
Will be reset on new calls to fit, but increments across
``partial_fit`` calls.

Examples
--------
@@ -626,7 +627,8 @@ def partial_fit(self, X, y=None):
Ignored
"""
X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy,
warn_on_dtype=True, estimator=self, dtype=FLOAT_DTYPES)
warn_on_dtype=True, estimator=self, dtype=FLOAT_DTYPES,
force_all_finite='allow-nan')

# Even in the case of `with_mean=False`, we update the mean anyway
# This is needed for the incremental computation of the var
@@ -641,7 +643,14 @@ def partial_fit(self, X, y=None):
# First pass
if not hasattr(self, 'n_samples_seen_'):
self.mean_, self.var_ = mean_variance_axis(X, axis=0)
self.n_samples_seen_ = X.shape[0]
self.n_samples_seen_ = (np.ones(X.shape[1], dtype=np.int32)
* X.shape[0])
sparse_constr = (sparse.csr_matrix if X.format == 'csr'
[Review comment, Member] I wish we could just use X._with_data

[Reply, Member Author] Yep, it could be nice to have something like that publicly exposed.
else sparse.csc_matrix)
counts_nan = sparse_constr(
(np.isnan(X.data), X.indices, X.indptr),
shape=X.shape).sum(axis=0).A.ravel()
self.n_samples_seen_ -= counts_nan
# Next passes
else:
self.mean_, self.var_, self.n_samples_seen_ = \
@@ -656,9 +665,10 @@ def partial_fit(self, X, y=None):
# First pass
if not hasattr(self, 'n_samples_seen_'):
self.mean_ = .0
self.n_samples_seen_ = 0
self.n_samples_seen_ = np.zeros(X.shape[1], dtype=np.int32)
[Review comment, Member] Am I being overly cautious if I worry that this changes the public API? We could squeeze this back to a single value if np.ptp(n_samples_seen_) == 0 after each partial_fit.

[Reply, @ogrisel, Jun 6, 2018] I agree it's better to try to preserve backward compat, but maybe instead of using a data-dependent np.ptp check we could do the squeeze only when self.force_all_finite != 'allow-nan'. This way the user is more in control and it's more explicit.

[Reply, Member] In other feature-wise preprocessing, we're allowing NaNs through by default, under the assumption that the extra processing cost is negligible and that downstream estimators will deal with or complain about the presence of NaN. Thus self.force_all_finite does not exist.

[Reply, Member] Indeed, I misread the code snippet. We don't have much choice but to use the np.ptp trick then.

if self.with_std:
self.var_ = .0

else:
self.var_ = None
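
To make the backward-compatibility discussion above concrete, here is a small sketch of the np.ptp "squeeze" idea (an illustration of the proposal only, not code from this diff): n_samples_seen_ would be collapsed back to a scalar whenever every feature has seen the same number of samples, i.e. when no NaN was encountered.

import numpy as np

# hypothetical state after a partial_fit call on data without missing values
n_samples_seen_ = np.array([10, 10, 10], dtype=np.int32)

# squeeze back to an int when all features saw the same number of samples
if np.ptp(n_samples_seen_) == 0:
    n_samples_seen_ = int(n_samples_seen_[0])

print(n_samples_seen_)  # 10, matching the pre-change scalar attribute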

@@ -695,7 +705,8 @@ def transform(self, X, y='deprecated', copy=None):

copy = copy if copy is not None else self.copy
X = check_array(X, accept_sparse='csr', copy=copy, warn_on_dtype=True,
estimator=self, dtype=FLOAT_DTYPES)
estimator=self, dtype=FLOAT_DTYPES,
force_all_finite='allow-nan')

if sparse.issparse(X):
if self.with_mean:
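The per-column NaN count used in partial_fit above can be reproduced on its own. A standalone sketch (np.asarray(...) plays the role of .A in the diff): a matrix with the same sparsity structure is built whose data flags the NaN positions, and a column-wise sum gives the number of missing entries per feature.

import numpy as np
import scipy.sparse as sp

X = sp.csr_matrix(np.array([[1., np.nan],
                            [0., 2.],
                            [np.nan, 3.]]))

sparse_constr = sp.csr_matrix if X.format == 'csr' else sp.csc_matrix
# reuse X's indices/indptr, but store a boolean "is NaN" flag as the data
counts_nan = np.asarray(
    sparse_constr((np.isnan(X.data), X.indices, X.indptr),
                  shape=X.shape).sum(axis=0)).ravel()

print(counts_nan)  # one NaN in each column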
7 changes: 5 additions & 2 deletions sklearn/preprocessing/tests/test_common.py
@@ -8,8 +8,9 @@

from sklearn.base import clone

from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import QuantileTransformer

from sklearn.utils.testing import assert_array_equal
from sklearn.utils.testing import assert_allclose
@@ -25,6 +26,8 @@ def _get_valid_samples_by_column(X, col):
@pytest.mark.parametrize(
"est, support_sparse",
[(MinMaxScaler(), False),
(StandardScaler(), False),
(StandardScaler(with_mean=False), True),
(QuantileTransformer(n_quantiles=10, random_state=42), True)]
)
def test_missing_value_handling(est, support_sparse):
[Review comment, Member] Perhaps this should be extended for the partial_fit case?

@@ -57,7 +60,7 @@ def test_missing_value_handling(est, support_sparse):
est.fit(_get_valid_samples_by_column(X_train, i))
# check transforming with NaN works even when training without NaN
Xt_col = est.transform(X_test[:, [i]])
assert_array_equal(Xt_col, Xt[:, [i]])
assert_allclose(Xt_col, Xt[:, [i]])
# check non-NaN is handled as before - the 1st column is all nan
if not np.isnan(X_test[:, i]).all():
Xt_col_nonan = est.transform(
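Regarding the review comment above about covering partial_fit: a rough sketch of what such an extension could check (an illustration only, not a test added by this PR), namely that fitting in one pass and fitting through several partial_fit batches agree in the presence of NaN.

import numpy as np
from sklearn.preprocessing import StandardScaler

rng = np.random.RandomState(42)
X = rng.randn(20, 3)
X[rng.rand(20, 3) < 0.2] = np.nan  # sprinkle some missing values

est_full = StandardScaler().fit(X)

est_incr = StandardScaler()
for batch in (X[:10], X[10:]):
    est_incr.partial_fit(batch)

assert np.allclose(est_full.mean_, est_incr.mean_)
assert np.allclose(est_full.var_, est_incr.var_)
assert np.array_equal(est_full.n_samples_seen_, est_incr.n_samples_seen_)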
23 changes: 10 additions & 13 deletions sklearn/preprocessing/tests/test_data.py
@@ -203,7 +203,7 @@ def test_standard_scaler_1d():
np.zeros_like(n_features))
assert_array_almost_equal(X_scaled.mean(axis=0), .0)
assert_array_almost_equal(X_scaled.std(axis=0), 1.)
assert_equal(scaler.n_samples_seen_, X.shape[0])
assert_array_equal(scaler.n_samples_seen_, X.shape[0])

# check inverse transform
X_scaled_back = scaler.inverse_transform(X_scaled)
@@ -217,7 +217,7 @@ def test_standard_scaler_1d():
assert_almost_equal(scaler.scale_, 1.)
assert_array_almost_equal(X_scaled.mean(axis=0), .0)
assert_array_almost_equal(X_scaled.std(axis=0), .0)
assert_equal(scaler.n_samples_seen_, X.shape[0])
assert_array_equal(scaler.n_samples_seen_, X.shape[0])


def test_scale_1d():
@@ -283,7 +283,7 @@ def test_scaler_2d_arrays():
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X, copy=True)
assert_false(np.any(np.isnan(X_scaled)))
assert_equal(scaler.n_samples_seen_, n_samples)
assert_array_equal(scaler.n_samples_seen_, n_samples)

assert_array_almost_equal(X_scaled.mean(axis=0), n_features * [0.0])
assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
@@ -399,7 +399,8 @@ def test_standard_scaler_partial_fit():

assert_array_almost_equal(scaler_batch.mean_, scaler_incr.mean_)
assert_equal(scaler_batch.var_, scaler_incr.var_) # Nones
assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_)
assert_array_equal(scaler_batch.n_samples_seen_,
scaler_incr.n_samples_seen_)

# Test std after 1 step
batch0 = slice(0, chunk_size)
@@ -423,10 +424,11 @@
assert_correct_incr(i, batch_start=batch.start,
batch_stop=batch.stop, n=n,
chunk_size=chunk_size,
n_samples_seen=scaler_incr.n_samples_seen_)
n_samples_seen=scaler_incr.n_samples_seen_[0])

assert_array_almost_equal(scaler_batch.var_, scaler_incr.var_)
assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_)
assert_array_equal(scaler_batch.n_samples_seen_,
scaler_incr.n_samples_seen_)


def test_standard_scaler_partial_fit_numerical_stability():
@@ -515,7 +517,7 @@ def test_standard_scaler_trasform_with_partial_fit():
assert_array_less(zero, scaler_incr.var_ + epsilon) # as less or equal
assert_array_less(zero, scaler_incr.scale_ + epsilon)
# (i+1) because the Scaler has been already fitted
assert_equal((i + 1), scaler_incr.n_samples_seen_)
assert_array_equal((i + 1), scaler_incr.n_samples_seen_)


def test_min_max_scaler_iris():
@@ -822,14 +824,9 @@ def test_scale_sparse_with_mean_raise_exception():

def test_scale_input_finiteness_validation():
# Check if non finite inputs raise ValueError
X = [[np.nan, 5, 6, 7, 8]]
assert_raises_regex(ValueError,
"Input contains NaN, infinity or a value too large",
scale, X)

X = [[np.inf, 5, 6, 7, 8]]
assert_raises_regex(ValueError,
"Input contains NaN, infinity or a value too large",
"Input contains infinity or a value too large",
[Review comment, Member] This is actually a weird message: what is a "value too large" if it's not infinity?

[Reply, Member] If the values vary too much and computing the scale is not numerically possible (overflow)? If so, we should add a specific test for this.

[Reply, Member] This is not the reason:

>>> from sklearn.preprocessing import scale
>>> import numpy as np
>>> data = np.array([np.finfo(np.float32).max, np.finfo(np.float32).min])
>>> scale(data)
/home/ogrisel/.virtualenvs/py36/lib/python3.6/site-packages/numpy/core/_methods.py:116: RuntimeWarning: overflow encountered in multiply
  x = um.multiply(x, x, out=x)
array([ 0., -0.], dtype=float32)

So I would just change the message to "Input contains infinity".

[Reply, Member Author] Hmm, this is actually the error message of check_array. Changing it would mean touching quite a lot of unrelated tests. Would it be wiser to do that in a separate PR?

scale, X)


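The test change above reflects the new contract of scale, sketched below as expected behaviour rather than as an exact test from this PR: NaN is now allowed and simply flows through the statistics, whereas infinite values are still rejected by the input validation.

import numpy as np
from sklearn.preprocessing import scale

# NaN no longer raises; the nan-aware statistics just skip it
scale([[np.nan, 5., 6.], [4., 5., 8.]])

# infinity is still rejected by check_array
try:
    scale([[np.inf, 5., 6.]])
except ValueError as exc:
    print(exc)  # "Input contains infinity or a value too large ..."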
2 changes: 1 addition & 1 deletion sklearn/utils/estimator_checks.py
@@ -77,7 +77,7 @@
'RandomForestRegressor', 'Ridge', 'RidgeCV']

ALLOW_NAN = ['Imputer', 'SimpleImputer', 'MICEImputer',
'MinMaxScaler', 'QuantileTransformer']
'MinMaxScaler', 'StandardScaler', 'QuantileTransformer']


def _yield_non_meta_checks(name, estimator):
36 changes: 21 additions & 15 deletions sklearn/utils/extmath.py
@@ -664,7 +664,7 @@ def _incremental_mean_and_var(X, last_mean=.0, last_variance=None,

last_variance : array-like, shape: (n_features,)

last_sample_count : int
last_sample_count : array-like, shape (n_features,)
[Review comment, Member] The default is still 0, which will actually never work. So I would simply remove all the defaults and make the arguments required (they are always passed explicitly anyway, so that shouldn't change anything for the rest of the code).


Returns
-------
@@ -673,7 +673,7 @@ def _incremental_mean_and_var(X, last_mean=.0, last_variance=None,
updated_variance : array, shape (n_features,)
If None, only mean is computed

updated_sample_count : int
updated_sample_count : array, shape (n_features,)

References
----------
@@ -689,27 +689,33 @@ def _incremental_mean_and_var(X, last_mean=.0, last_variance=None,
# new = the current increment
# updated = the aggregated stats
last_sum = last_mean * last_sample_count
new_sum = X.sum(axis=0)
new_sum = np.nansum(X, axis=0)

new_sample_count = X.shape[0]
new_sample_count = np.nansum(~np.isnan(X), axis=0)
[Review comment, Member] nansum of a bool array?

updated_sample_count = last_sample_count + new_sample_count

updated_mean = (last_sum + new_sum) / updated_sample_count

if last_variance is None:
updated_variance = None
else:
new_unnormalized_variance = X.var(axis=0) * new_sample_count
if last_sample_count == 0: # Avoid division by 0
updated_unnormalized_variance = new_unnormalized_variance
else:
last_over_new_count = last_sample_count / new_sample_count
last_unnormalized_variance = last_variance * last_sample_count
updated_unnormalized_variance = (
last_unnormalized_variance +
new_unnormalized_variance +
last_over_new_count / updated_sample_count *
(last_sum / last_over_new_count - new_sum) ** 2)
new_unnormalized_variance = np.nanvar(X, axis=0) * new_sample_count
last_over_new_count = last_sample_count / new_sample_count
[Review comment, Member] This can also already divide by zero, which will give a numpy RuntimeWarning; do we need to silence it with an errstate?

last_unnormalized_variance = last_variance * last_sample_count

updated_unnormalized_variance = np.zeros_like(
new_unnormalized_variance)
# avoid division by 0
non_zero_idx = last_sample_count > 0
updated_unnormalized_variance[~non_zero_idx] =\
new_unnormalized_variance[~non_zero_idx]
[Review comment, Member] I think it would be more readable to just do the calculation on the full array, and then in the end do updated_unnormalized_variance[non_zero_idx] = 0

updated_unnormalized_variance[non_zero_idx] = \
(last_unnormalized_variance[non_zero_idx] +
new_unnormalized_variance[non_zero_idx] +
last_over_new_count[non_zero_idx] /
updated_sample_count[non_zero_idx] *
(last_sum[non_zero_idx] / last_over_new_count[non_zero_idx] -
new_sum[non_zero_idx]) ** 2)
updated_variance = updated_unnormalized_variance / updated_sample_count

return updated_mean, updated_variance, updated_sample_count
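A quick numerical sanity check of the updated helper (a sketch that relies on the private function and the signature documented above): accumulating two batches incrementally should match np.nanmean / np.nanvar computed on the concatenated data, with per-feature sample counts.

import numpy as np
from sklearn.utils.extmath import _incremental_mean_and_var

rng = np.random.RandomState(0)
A, B = rng.randn(6, 3), rng.randn(4, 3)
A[0, 1] = np.nan
B[2, 0] = np.nan

# first batch: start from zeroed statistics and a zero per-feature count
mean, var, n = _incremental_mean_and_var(
    A, last_mean=np.zeros(3), last_variance=np.zeros(3),
    last_sample_count=np.zeros(3, dtype=np.int64))
# second batch: feed the running statistics back in
mean, var, n = _incremental_mean_and_var(
    B, last_mean=mean, last_variance=var, last_sample_count=n)

X = np.vstack([A, B])
assert np.allclose(mean, np.nanmean(X, axis=0))
assert np.allclose(var, np.nanvar(X, axis=0))
assert np.array_equal(n, np.sum(~np.isnan(X), axis=0))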
2 changes: 1 addition & 1 deletion sklearn/utils/sparsefuncs.py
@@ -122,7 +122,7 @@ def incr_mean_variance_axis(X, axis, last_mean, last_var, last_n):
last_var : float array with shape (n_features,)
Array of feature-wise var to update with the new data X.

last_n : int
last_n : unsigned int with shape (n_features,)
Number of samples seen so far, excluded X.

Returns
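The array-valued last_n documented here is what StandardScaler.partial_fit passes in the sparse path. A small sketch of the resulting per-feature bookkeeping (behaviour as intended by this PR, shown through the public estimator rather than the low-level helper):

import numpy as np
import scipy.sparse as sp
from sklearn.preprocessing import StandardScaler

X1 = sp.csr_matrix(np.array([[1., np.nan],
                             [0., 2.],
                             [np.nan, 3.]]))
X2 = sp.csr_matrix(np.array([[4., 5.],
                             [6., np.nan]]))

scaler = StandardScaler(with_mean=False)
scaler.partial_fit(X1)  # first pass: mean_variance_axis plus the NaN counts
scaler.partial_fit(X2)  # next pass: incr_mean_variance_axis with per-feature last_n

print(scaler.n_samples_seen_)  # NaN entries are excluded from each feature's count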