[MRG] ENH: Ignore NaNs in StandardScaler and scale #11206
Changes from 10 commits
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -138,7 +138,7 @@ def scale(X, axis=0, with_mean=True, with_std=True, copy=True):
     """ # noqa
     X = check_array(X, accept_sparse='csc', copy=copy, ensure_2d=False,
                     warn_on_dtype=True, estimator='the scale function',
-                    dtype=FLOAT_DTYPES)
+                    dtype=FLOAT_DTYPES, force_all_finite='allow-nan')
     if sparse.issparse(X):
         if with_mean:
             raise ValueError(
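With `force_all_finite='allow-nan'`, validation lets NaN through so the NaN-aware statistics below can ignore it. A minimal sketch of the intended behavior once the whole patch is applied (example data mine):

>>> import numpy as np
>>> from sklearn.preprocessing import scale
>>> X = np.array([[1., np.nan],
...               [3., 4.],
...               [5., 6.]])
>>> Xt = scale(X)
>>> np.nanmean(Xt, axis=0)  # columns re-centered on the non-NaN entries
array([0., 0.])
>>> np.isnan(Xt[0, 1])      # the NaN is propagated, not imputed
True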
@@ -154,9 +154,9 @@ def scale(X, axis=0, with_mean=True, with_std=True, copy=True):
     else:
         X = np.asarray(X)
         if with_mean:
-            mean_ = np.mean(X, axis)
+            mean_ = np.nanmean(X, axis)
         if with_std:
-            scale_ = np.std(X, axis)
+            scale_ = np.nanstd(X, axis)
         # Xr is a view on the original array that enables easy use of
         # broadcasting on the axis in which we are interested in
         Xr = np.rollaxis(X, axis)
@@ -179,7 +179,7 @@ def scale(X, axis=0, with_mean=True, with_std=True, copy=True):
             scale_ = _handle_zeros_in_scale(scale_, copy=False)
             Xr /= scale_
             if with_mean:
-                mean_2 = Xr.mean(axis=0)
+                mean_2 = np.nanmean(Xr, axis=0)
                 # If mean_2 is not 'close to zero', it comes from the fact that
                 # scale_ is very small so that mean_2 = mean_1/scale_ > 0, even
                 # if mean_1 was close to zero. The problem is thus essentially
@@ -533,9 +533,10 @@ class StandardScaler(BaseEstimator, TransformerMixin):
         The variance for each feature in the training set. Used to compute
         `scale_`

-    n_samples_seen_ : int
-        The number of samples processed by the estimator. Will be reset on
-        new calls to fit, but increments across ``partial_fit`` calls.
+    n_samples_seen_ : array, shape (n_features,)
+        The number of samples processed by the estimator for each feature.
+        Will be reset on new calls to fit, but increments across
+        ``partial_fit`` calls.

     Examples
     --------
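The attribute becomes per-feature because each column can lose a different number of samples to NaN. An illustrative sketch assuming this patch is applied (exact dtype of the result may vary):

>>> import numpy as np
>>> from sklearn.preprocessing import StandardScaler
>>> X = np.array([[1., np.nan], [3., 4.], [5., 6.]])
>>> StandardScaler().fit(X).n_samples_seen_
array([3, 2])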
@@ -626,7 +627,8 @@ def partial_fit(self, X, y=None):
             Ignored
         """
         X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy,
-                        warn_on_dtype=True, estimator=self, dtype=FLOAT_DTYPES)
+                        warn_on_dtype=True, estimator=self, dtype=FLOAT_DTYPES,
+                        force_all_finite='allow-nan')

         # Even in the case of `with_mean=False`, we update the mean anyway
         # This is needed for the incremental computation of the var
@@ -641,7 +643,14 @@ def partial_fit(self, X, y=None):
             # First pass
             if not hasattr(self, 'n_samples_seen_'):
                 self.mean_, self.var_ = mean_variance_axis(X, axis=0)
-                self.n_samples_seen_ = X.shape[0]
+                self.n_samples_seen_ = (np.ones(X.shape[1], dtype=np.int32)
+                                        * X.shape[0])
+                sparse_constr = (sparse.csr_matrix if X.format == 'csr'
+                                 else sparse.csc_matrix)
+                counts_nan = sparse_constr(
+                    (np.isnan(X.data), X.indices, X.indptr),
+                    shape=X.shape).sum(axis=0).A.ravel()
+                self.n_samples_seen_ -= counts_nan
             # Next passes
             else:
                 self.mean_, self.var_, self.n_samples_seen_ = \
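The per-column NaN count works because NaN can only live among a sparse matrix's stored entries: rebuilding a matrix with the same `indices`/`indptr` but the boolean NaN mask as its data, then summing column-wise, counts them. A standalone sketch of the same construction (example matrix mine):

>>> import numpy as np
>>> from scipy import sparse
>>> X = sparse.csr_matrix(np.array([[1., 0., np.nan],
...                                 [np.nan, 0., 3.]]))
>>> sparse.csr_matrix((np.isnan(X.data), X.indices, X.indptr),
...                   shape=X.shape).sum(axis=0).A.ravel()
array([1, 0, 1])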
@@ -656,9 +665,10 @@ def partial_fit(self, X, y=None):
             # First pass
             if not hasattr(self, 'n_samples_seen_'):
                 self.mean_ = .0
-                self.n_samples_seen_ = 0
+                self.n_samples_seen_ = np.zeros(X.shape[1], dtype=np.int32)
                 if self.with_std:
                     self.var_ = .0
                 else:
                     self.var_ = None

> Am I being overly cautious if I worry that this changes public API? We could squeeze this back to a single value if […]

> I agree it's better to try to preserve backward compat but maybe instead of using a data dependent […]

> In other feature-wise preprocessing, we're allowing NaNs through by default, under the assumption that extra processing cost is negligible and that downstream estimators will deal with or complain about the presence of NaN. Thus […]

> Indeed, I misread the code snippet. We don't have much choice but to use the […]
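One way to address the backward-compatibility worry raised in the first comment would be to collapse the per-feature counts back to a scalar whenever they are all equal, i.e. when no NaN was seen. A sketch of such a helper (`_squeeze_sample_count` is a hypothetical name, not part of this PR):

    import numpy as np

    def _squeeze_sample_count(n_samples_seen):
        # Hypothetical helper: restore the pre-patch scalar API when every
        # feature saw the same number of samples (the NaN-free case).
        if np.ptp(n_samples_seen) == 0:
            return int(n_samples_seen[0])
        return n_samples_seen

    print(_squeeze_sample_count(np.array([5, 5, 5])))  # -> 5
    print(_squeeze_sample_count(np.array([5, 4, 5])))  # -> [5 4 5]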
@@ -695,7 +705,8 @@ def transform(self, X, y='deprecated', copy=None):

         copy = copy if copy is not None else self.copy
         X = check_array(X, accept_sparse='csr', copy=copy, warn_on_dtype=True,
-                        estimator=self, dtype=FLOAT_DTYPES)
+                        estimator=self, dtype=FLOAT_DTYPES,
+                        force_all_finite='allow-nan')

         if sparse.issparse(X):
             if self.with_mean:
--- a/sklearn/preprocessing/tests/test_common.py
+++ b/sklearn/preprocessing/tests/test_common.py
@@ -8,8 +8,9 @@

 from sklearn.base import clone

-from sklearn.preprocessing import QuantileTransformer
 from sklearn.preprocessing import MinMaxScaler
+from sklearn.preprocessing import StandardScaler
+from sklearn.preprocessing import QuantileTransformer

 from sklearn.utils.testing import assert_array_equal
 from sklearn.utils.testing import assert_allclose
@@ -25,6 +26,8 @@ def _get_valid_samples_by_column(X, col):
 @pytest.mark.parametrize(
     "est, support_sparse",
     [(MinMaxScaler(), False),
+     (StandardScaler(), False),
+     (StandardScaler(with_mean=False), True),
      (QuantileTransformer(n_quantiles=10, random_state=42), True)]
 )
 def test_missing_value_handling(est, support_sparse):

> Perhaps this should be extended for the […]
@@ -57,7 +60,7 @@ def test_missing_value_handling(est, support_sparse):
             est.fit(_get_valid_samples_by_column(X_train, i))
             # check transforming with NaN works even when training without NaN
             Xt_col = est.transform(X_test[:, [i]])
-            assert_array_equal(Xt_col, Xt[:, [i]])
+            assert_allclose(Xt_col, Xt[:, [i]])
             # check non-NaN is handled as before - the 1st column is all nan
             if not np.isnan(X_test[:, i]).all():
                 Xt_col_nonan = est.transform(
--- a/sklearn/preprocessing/tests/test_data.py
+++ b/sklearn/preprocessing/tests/test_data.py
@@ -203,7 +203,7 @@ def test_standard_scaler_1d():
                                           np.zeros_like(n_features))
         assert_array_almost_equal(X_scaled.mean(axis=0), .0)
         assert_array_almost_equal(X_scaled.std(axis=0), 1.)
-        assert_equal(scaler.n_samples_seen_, X.shape[0])
+        assert_array_equal(scaler.n_samples_seen_, X.shape[0])

         # check inverse transform
         X_scaled_back = scaler.inverse_transform(X_scaled)
@@ -217,7 +217,7 @@ def test_standard_scaler_1d():
     assert_almost_equal(scaler.scale_, 1.)
     assert_array_almost_equal(X_scaled.mean(axis=0), .0)
     assert_array_almost_equal(X_scaled.std(axis=0), .0)
-    assert_equal(scaler.n_samples_seen_, X.shape[0])
+    assert_array_equal(scaler.n_samples_seen_, X.shape[0])


 def test_scale_1d():
@@ -283,7 +283,7 @@ def test_scaler_2d_arrays():
     scaler = StandardScaler()
     X_scaled = scaler.fit(X).transform(X, copy=True)
     assert_false(np.any(np.isnan(X_scaled)))
-    assert_equal(scaler.n_samples_seen_, n_samples)
+    assert_array_equal(scaler.n_samples_seen_, n_samples)

     assert_array_almost_equal(X_scaled.mean(axis=0), n_features * [0.0])
     assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
@@ -399,7 +399,8 @@ def test_standard_scaler_partial_fit():

     assert_array_almost_equal(scaler_batch.mean_, scaler_incr.mean_)
     assert_equal(scaler_batch.var_, scaler_incr.var_)  # Nones
-    assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_)
+    assert_array_equal(scaler_batch.n_samples_seen_,
+                       scaler_incr.n_samples_seen_)

     # Test std after 1 step
     batch0 = slice(0, chunk_size)
@@ -423,10 +424,11 @@ def test_standard_scaler_partial_fit():
         assert_correct_incr(i, batch_start=batch.start,
                             batch_stop=batch.stop, n=n,
                             chunk_size=chunk_size,
-                            n_samples_seen=scaler_incr.n_samples_seen_)
+                            n_samples_seen=scaler_incr.n_samples_seen_[0])

     assert_array_almost_equal(scaler_batch.var_, scaler_incr.var_)
-    assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_)
+    assert_array_equal(scaler_batch.n_samples_seen_,
+                       scaler_incr.n_samples_seen_)


 def test_standard_scaler_partial_fit_numerical_stability():
@@ -515,7 +517,7 @@ def test_standard_scaler_trasform_with_partial_fit():
         assert_array_less(zero, scaler_incr.var_ + epsilon)  # as less or equal
         assert_array_less(zero, scaler_incr.scale_ + epsilon)
         # (i+1) because the Scaler has been already fitted
-        assert_equal((i + 1), scaler_incr.n_samples_seen_)
+        assert_array_equal((i + 1), scaler_incr.n_samples_seen_)


 def test_min_max_scaler_iris():
@@ -822,14 +824,9 @@ def test_scale_sparse_with_mean_raise_exception():

 def test_scale_input_finiteness_validation():
     # Check if non finite inputs raise ValueError
-    X = [[np.nan, 5, 6, 7, 8]]
-    assert_raises_regex(ValueError,
-                        "Input contains NaN, infinity or a value too large",
-                        scale, X)
-
     X = [[np.inf, 5, 6, 7, 8]]
     assert_raises_regex(ValueError,
-                        "Input contains NaN, infinity or a value too large",
+                        "Input contains infinity or a value too large",
                         scale, X)

> This is actually a weird message: what is a "value too large" if it's not infinity?

> If the values vary too much and computing the scale is not numerically possible (overflow)? If so we should add a specific test for this.

> This is not the reason:
>
>     >>> from sklearn.preprocessing import scale
>     >>> import numpy as np
>     >>> data = np.array([np.finfo(np.float32).max, np.finfo(np.float32).min])
>     >>> scale(data)
>     /home/ogrisel/.virtualenvs/py36/lib/python3.6/site-packages/numpy/core/_methods.py:116: RuntimeWarning: overflow encountered in multiply
>       x = um.multiply(x, x, out=x)
>     array([ 0., -0.], dtype=float32)
>
> So I would just change the message to "Input contains infinity".

> Uhm this is the error message of […]
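For context, a sketch of the split behavior this test now encodes, assuming `force_all_finite='allow-nan'` is wired through as above (the message suffix comes from `check_array` and is illustrative):

>>> import numpy as np
>>> from sklearn.preprocessing import scale
>>> _ = scale([[np.nan, 5., 6., 7., 8.], [1., 5., 6., 7., 8.]])  # NaN now accepted
>>> scale([[np.inf, 5., 6., 7., 8.]])
Traceback (most recent call last):
    ...
ValueError: Input contains infinity or a value too large for dtype('float64').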
--- a/sklearn/utils/extmath.py
+++ b/sklearn/utils/extmath.py
@@ -664,7 +664,7 @@ def _incremental_mean_and_var(X, last_mean=.0, last_variance=None,

     last_variance : array-like, shape: (n_features,)

-    last_sample_count : int
+    last_sample_count : array-like, shape (n_features,)

> The default is still 0, which will actually never work.

     Returns
     -------
@@ -673,7 +673,7 @@ def _incremental_mean_and_var(X, last_mean=.0, last_variance=None,
     updated_variance : array, shape (n_features,)
         If None, only mean is computed

-    updated_sample_count : int
+    updated_sample_count : array, shape (n_features,)

     References
     ----------
@@ -689,27 +689,33 @@ def _incremental_mean_and_var(X, last_mean=.0, last_variance=None,
     # new = the current increment
     # updated = the aggregated stats
     last_sum = last_mean * last_sample_count
-    new_sum = X.sum(axis=0)
+    new_sum = np.nansum(X, axis=0)

-    new_sample_count = X.shape[0]
+    new_sample_count = np.nansum(~np.isnan(X), axis=0)

> nansum of a bool array?
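To the reviewer's point: the mask `~np.isnan(X)` is boolean, so it can never contain NaN itself and a plain `np.sum` counts the non-NaN entries just as well. A quick check (example data mine):

>>> import numpy as np
>>> X = np.array([[1., np.nan], [3., 4.]])
>>> np.sum(~np.isnan(X), axis=0)
array([2, 1])
>>> np.nansum(~np.isnan(X), axis=0)  # identical; the NaN handling is redundant
array([2, 1])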
     updated_sample_count = last_sample_count + new_sample_count

     updated_mean = (last_sum + new_sum) / updated_sample_count

     if last_variance is None:
         updated_variance = None
     else:
-        new_unnormalized_variance = X.var(axis=0) * new_sample_count
-        if last_sample_count == 0:  # Avoid division by 0
-            updated_unnormalized_variance = new_unnormalized_variance
-        else:
-            last_over_new_count = last_sample_count / new_sample_count
-            last_unnormalized_variance = last_variance * last_sample_count
-            updated_unnormalized_variance = (
-                last_unnormalized_variance +
-                new_unnormalized_variance +
-                last_over_new_count / updated_sample_count *
-                (last_sum / last_over_new_count - new_sum) ** 2)
+        new_unnormalized_variance = np.nanvar(X, axis=0) * new_sample_count
+        last_over_new_count = last_sample_count / new_sample_count

> this can also already divide by zero, and then will give a RuntimeWarning by numpy, need to ignore this with an errstate?
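A sketch of the suppression the reviewer suggests: when a feature has no new samples, the division yields inf (or nan for 0/0), which `np.errstate` can silence while a later mask fixes the affected entries (illustrative, not the PR's code):

>>> import numpy as np
>>> last_sample_count = np.array([0, 2])
>>> new_sample_count = np.array([2, 0])
>>> with np.errstate(divide='ignore', invalid='ignore'):
...     last_over_new_count = last_sample_count / new_sample_count
>>> last_over_new_count
array([ 0., inf])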
+        last_unnormalized_variance = last_variance * last_sample_count
+
+        updated_unnormalized_variance = np.zeros_like(
+            new_unnormalized_variance)
+        # avoid division by 0
+        non_zero_idx = last_sample_count > 0
+        updated_unnormalized_variance[~non_zero_idx] = \
+            new_unnormalized_variance[~non_zero_idx]

> I think it would be more readable to just do the calculation on the full array, and then in the end do […]
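A sketch of the alternative the reviewer hints at: evaluate the combined-variance expression on the full arrays under `np.errstate`, then overwrite the zero-count entries in a single masked assignment at the end. Names mirror the diff; this is an illustration, not the PR's implementation:

    import numpy as np

    def _combined_unnormalized_variance(last_sum, new_sum, last_variance,
                                        last_sample_count, new_sample_count,
                                        new_unnormalized_variance):
        updated_sample_count = last_sample_count + new_sample_count
        # Compute on full arrays; zero counts produce inf/nan, silenced here.
        with np.errstate(divide='ignore', invalid='ignore'):
            last_over_new_count = last_sample_count / new_sample_count
            updated = (last_variance * last_sample_count +
                       new_unnormalized_variance +
                       last_over_new_count / updated_sample_count *
                       (last_sum / last_over_new_count - new_sum) ** 2)
        # One fix-up at the end instead of indexing every term with a mask.
        zero_idx = last_sample_count == 0
        updated[zero_idx] = new_unnormalized_variance[zero_idx]
        return updated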
+        updated_unnormalized_variance[non_zero_idx] = \
+            (last_unnormalized_variance[non_zero_idx] +
+             new_unnormalized_variance[non_zero_idx] +
+             last_over_new_count[non_zero_idx] /
+             updated_sample_count[non_zero_idx] *
+             (last_sum[non_zero_idx] / last_over_new_count[non_zero_idx] -
+              new_sum[non_zero_idx]) ** 2)
         updated_variance = updated_unnormalized_variance / updated_sample_count

     return updated_mean, updated_variance, updated_sample_count
> I wish we could just use X._with_data

> Yep, it could be nice to have something like that publicly exposed.
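The wish refers to scipy's private `_with_data` method, which returns a matrix with the same sparsity structure but new data; the NaN count above would then shrink to roughly this (private API, hence the wish for a public equivalent):

>>> import numpy as np
>>> from scipy import sparse
>>> X = sparse.csr_matrix(np.array([[1., 0., np.nan],
...                                 [np.nan, 0., 3.]]))
>>> X._with_data(np.isnan(X.data), copy=False).sum(axis=0).A.ravel()
array([1, 0, 1])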