FIX float16 overflow on accumulator operations in StandardScaler (#13010)
baluyotraf authored and rth committed Jan 26, 2019
1 parent e8045a7 commit 1f5bcaeb39698c9a150ef92cdeaeec75c355a274
Showing with 70 additions and 9 deletions.
  1. +4 −0 doc/whats_new/v0.21.rst
  2. +25 −0 sklearn/preprocessing/tests/test_data.py
  3. +35 −7 sklearn/utils/extmath.py
  4. +6 −2 sklearn/utils/validation.py
@@ -222,6 +222,10 @@ Support for Python 3.4 and below has been officially dropped.
in the dense case. Also added a new parameter ``order`` which controls output
order for further speed performances. :issue:`12251` by `Tom Dupre la Tour`_.

- |Fix| Fixed the calculation overflow when using a float16 dtype with
  :class:`preprocessing.StandardScaler`. :issue:`13007` by
  :user:`Raffaello Baluyot <baluyotraf>`

:mod:`sklearn.tree`
...................
- |Feature| Decision Trees can now be plotted with matplotlib using
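
A minimal sketch of the fixed behavior described by the entry above (assuming a scikit-learn build that includes this patch; the array shape and seed are illustrative):

import numpy as np
from sklearn.preprocessing import StandardScaler

# 200000 samples in [5, 10): the column sum is at least 1,000,000,
# far beyond the float16 maximum of 65504.
X = np.random.RandomState(0).uniform(5, 10, (200000, 1)).astype(np.float16)

with np.errstate(over='raise'):
    # Before this fix, fit() accumulated the column sum in float16 and
    # overflowed; with it, accumulation happens in float64.
    X_scaled = StandardScaler().fit_transform(X)

print(np.all(np.isfinite(X_scaled)))  # True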
@@ -450,6 +450,31 @@ def test_scaler_2d_arrays():
    assert X_scaled is not X


def test_scaler_float16_overflow():
    # Test that the scaler does not overflow on float16 numpy arrays
    rng = np.random.RandomState(0)
    # float16 has a maximum of 65504. Even the smallest possible column sum
    # here, 5 * 200000 = 1,000,000, is more than enough to overflow the dtype
    X = rng.uniform(5, 10, [200000, 1]).astype(np.float16)

    with np.errstate(over='raise'):
        scaler = StandardScaler().fit(X)
        X_scaled = scaler.transform(X)

    # Calculate the float64 equivalent to verify the result
    X_scaled_f64 = StandardScaler().fit_transform(X.astype(np.float64))

    # Overflowing calculations may produce -inf, inf, or nan. Since there is
    # no nan in the input, all of the outputs should be finite. This may be
    # redundant since a FloatingPointError exception is already raised on
    # overflow above.
    assert np.all(np.isfinite(X_scaled))

    # Standardized values are very unlikely to exceed 4 in magnitude. Over
    # 4.0-8.0 the float16 spacing is 2**-8, about 0.004, so only 2 decimals
    # are checked to account for precision differences.
    assert_array_almost_equal(X_scaled, X_scaled_f64, decimal=2)


def test_handle_zeros_in_scale():
    s1 = np.array([0, 1, 2, 3])
    s2 = _handle_zeros_in_scale(s1, copy=True)
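
For context on what the new test guards against, the underlying NumPy behavior is easy to reproduce on its own: a float16 accumulator saturates once the running sum passes 65504, while a float64 accumulator (what the `_safe_accumulator_op` helper below requests) stays exact. A standalone sketch:

import numpy as np

x = np.full(200000, 5.0, dtype=np.float16)  # true sum is 1,000,000

with np.errstate(over='raise'):
    try:
        x.sum()  # float16 accumulator overflows past 65504
    except FloatingPointError as exc:
        print('overflow:', exc)
    print(x.sum(dtype=np.float64))  # 1000000.0 with a float64 accumulator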
@@ -658,6 +658,38 @@ def make_nonnegative(X, min_value=0):
    return X


# Use at least float64 for the accumulating functions to avoid precision
# issues; see https://github.com/numpy/numpy/issues/9393. The float64 result
# is also kept as-is, in case the original float dtype would overflow
def _safe_accumulator_op(op, x, *args, **kwargs):
    """
    This function provides numpy accumulator functions with a float64 dtype
    when used on a floating point input. This prevents accumulator overflow
    on smaller floating point dtypes.

    Parameters
    ----------
    op : function
        A numpy accumulator function such as np.mean or np.sum
    x : numpy array
        A numpy array to apply the accumulator function

    *args : positional arguments
        Positional arguments passed to the accumulator function after the
        input x

    **kwargs : keyword arguments
        Keyword arguments passed to the accumulator function

    Returns
    -------
    result : The output of the accumulator function passed to this function
    """
    if np.issubdtype(x.dtype, np.floating) and x.dtype.itemsize < 8:
        result = op(x, *args, **kwargs, dtype=np.float64)
    else:
        result = op(x, *args, **kwargs)
    return result


def _incremental_mean_and_var(X, last_mean, last_variance, last_sample_count):
"""Calculate mean update and a Youngs and Cramer variance update.
@@ -708,12 +740,7 @@ def _incremental_mean_and_var(X, last_mean, last_variance, last_sample_count):
    # new = the current increment
    # updated = the aggregated stats
    last_sum = last_mean * last_sample_count
-    if np.issubdtype(X.dtype, np.floating) and X.dtype.itemsize < 8:
-        # Use at least float64 for the accumulator to avoid precision issues;
-        # see https://github.com/numpy/numpy/issues/9393
-        new_sum = np.nansum(X, axis=0, dtype=np.float64).astype(X.dtype)
-    else:
-        new_sum = np.nansum(X, axis=0)
+    new_sum = _safe_accumulator_op(np.nansum, X, axis=0)

    new_sample_count = np.sum(~np.isnan(X), axis=0)
    updated_sample_count = last_sample_count + new_sample_count
@@ -723,7 +750,8 @@ def _incremental_mean_and_var(X, last_mean, last_variance, last_sample_count):
    if last_variance is None:
        updated_variance = None
    else:
-        new_unnormalized_variance = np.nanvar(X, axis=0) * new_sample_count
+        new_unnormalized_variance = (
+            _safe_accumulator_op(np.nanvar, X, axis=0) * new_sample_count)
        last_unnormalized_variance = last_variance * last_sample_count

    with np.errstate(divide='ignore', invalid='ignore'):
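
To illustrate the helper introduced above (a sketch against the private sklearn.utils.extmath._safe_accumulator_op as it appears in this diff; private helpers can move between releases): accumulation runs in float64, and unlike the previous inline version the result is not cast back to the narrow input dtype, since that cast could itself overflow.

import numpy as np
from sklearn.utils.extmath import _safe_accumulator_op

X = np.full((200000, 1), 5.0, dtype=np.float16)

# Accumulates in float64 and keeps the float64 result; casting
# 1,000,000 back to float16 would itself overflow to inf.
s = _safe_accumulator_op(np.nansum, X, axis=0)
print(s, s.dtype)  # [1000000.] float64

# Non-float (or already 64-bit) inputs take the unmodified code path.
print(_safe_accumulator_op(np.sum, np.arange(5)))  # 10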
@@ -34,14 +34,18 @@

def _assert_all_finite(X, allow_nan=False):
    """Like assert_all_finite, but only for ndarray."""
+    # validation is also imported in extmath
+    from .extmath import _safe_accumulator_op
+
    if _get_config()['assume_finite']:
        return
    X = np.asanyarray(X)
    # First try an O(n) time, O(1) space solution for the common case that
    # everything is finite; fall back to O(n) space np.isfinite to prevent
-    # false positives from overflow in sum method.
+    # false positives from overflow in sum method. The sum is also calculated
+    # safely to reduce dtype-induced overflows.
    is_float = X.dtype.kind in 'fc'
-    if is_float and np.isfinite(X.sum()):
+    if is_float and (np.isfinite(_safe_accumulator_op(np.sum, X))):
        pass
    elif is_float:
        msg_err = "Input contains {} or a value too large for {!r}."

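The effect on this fast path is visible through the public sklearn.utils.assert_all_finite (a sketch reusing the numbers from the test above): every element is finite, but previously the O(1)-space check summed in float16, overflowed to inf, and fell through to the O(n)-space np.isfinite fallback, typically with a NumPy overflow warning along the way. With the safe accumulator the fast path succeeds quietly.

import numpy as np
from sklearn.utils import assert_all_finite

X = np.full((200000, 1), 5.0, dtype=np.float16)

# All values are finite; only their float16 sum would overflow. The safe
# accumulator keeps the fast finiteness check from tripping on that.
assert_all_finite(X)  # passes, and stays on the O(1)-space fast path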