
Commit

API kwonly for utils (#17046)
* kwonly for utils

* More

* fixed some

* some more

* iwannagohomepls

* accept_sparse not kwonly anymore
NicolasHug committed Apr 27, 2020
1 parent f624f4e commit 76ef8b0
Showing 15 changed files with 118 additions and 93 deletions.
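
The change below is mostly mechanical: parameters listed after a bare `*` become keyword-only, and public helpers are wrapped in the private `_deprecate_positional_args` decorator so that positional use keeps working for now but emits a deprecation warning. A minimal caller-side sketch (illustrative only, not part of the diff; the exact warning class and removal version are my assumptions):

    import numpy as np
    from sklearn.utils.class_weight import compute_sample_weight

    y = np.asarray([1, 1, 1, 2, 2, 2])

    # Preferred after this commit: `indices` is spelled out as a keyword.
    w = compute_sample_weight("balanced", y, indices=range(4))

    # Still accepted during the deprecation period, but warns and is slated
    # to become an error in a later release.
    w_positional = compute_sample_weight("balanced", y, range(4))
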
6 changes: 4 additions & 2 deletions sklearn/ensemble/_forest.py
@@ -159,9 +159,11 @@ def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees,
if class_weight == 'subsample':
with catch_warnings():
simplefilter('ignore', DeprecationWarning)
- curr_sample_weight *= compute_sample_weight('auto', y, indices)
+ curr_sample_weight *= compute_sample_weight('auto', y,
+                                             indices=indices)
elif class_weight == 'balanced_subsample':
- curr_sample_weight *= compute_sample_weight('balanced', y, indices)
+ curr_sample_weight *= compute_sample_weight('balanced', y,
+                                             indices=indices)

tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False)
else:
5 changes: 3 additions & 2 deletions sklearn/feature_selection/_univariate_selection.py
@@ -146,7 +146,7 @@ def f_classif(X, y):
chi2: Chi-squared stats of non-negative features for classification tasks.
f_regression: F-value between label/feature for regression tasks.
"""
- X, y = check_X_y(X, y, ['csr', 'csc', 'coo'])
+ X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'])
args = [X[safe_mask(X, y == k)] for k in np.unique(y)]
return f_oneway(*args)

@@ -277,7 +277,8 @@ def f_regression(X, y, center=True):
SelectPercentile: Select features based on percentile of the highest
scores.
"""
- X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], dtype=np.float64)
+ X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
+                  dtype=np.float64)
n_samples = X.shape[0]

# compute centered values
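
For reference, a small sketch (toy data assumed) of the validation pattern touched in f_classif and f_regression above — the sparse formats are now passed via the `accept_sparse` keyword of check_X_y:

    import numpy as np
    from scipy import sparse
    from sklearn.utils import check_X_y

    X = sparse.random(20, 5, density=0.3, format="csr", random_state=0)
    y = np.arange(20) % 2

    # Sparse formats named explicitly, as in the updated f_classif call.
    X_checked, y_checked = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'])
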
2 changes: 1 addition & 1 deletion sklearn/feature_selection/tests/test_base.py
@@ -15,7 +15,7 @@ def __init__(self, step=2):
self.step = step

def fit(self, X, y=None):
- X = check_array(X, 'csc')
+ X = check_array(X, accept_sparse='csc')
self.n_input_feats = X.shape[1]
return self

11 changes: 6 additions & 5 deletions sklearn/linear_model/_coordinate_descent.py
@@ -131,7 +131,7 @@ def _alpha_grid(X, y, Xy=None, l1_ratio=1.0, fit_intercept=True,
if Xy is None:
X_sparse = sparse.isspmatrix(X)
sparse_center = X_sparse and (fit_intercept or normalize)
- X = check_array(X, 'csc',
+ X = check_array(X, accept_sparse='csc',
copy=(copy_X and fit_intercept and not X_sparse))
if not X_sparse:
# X can be touched inplace thanks to the above line
@@ -435,10 +435,10 @@ def enet_path(X, y, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None,
# We expect X and y to be already Fortran ordered when bypassing
# checks
if check_input:
- X = check_array(X, 'csc', dtype=[np.float64, np.float32],
+ X = check_array(X, accept_sparse='csc', dtype=[np.float64, np.float32],
order='F', copy=copy_X)
- y = check_array(y, 'csc', dtype=X.dtype.type, order='F', copy=False,
-                 ensure_2d=False)
+ y = check_array(y, accept_sparse='csc', dtype=X.dtype.type,
+                 order='F', copy=False, ensure_2d=False)
if Xy is not None:
# Xy should be a 1d contiguous array or a 2D C ordered array
Xy = check_array(Xy, dtype=X.dtype.type, order='C', copy=False,
@@ -1095,7 +1095,8 @@ def _path_residuals(X, y, train, test, path, path_params, alphas=None,

# Do the ordering and type casting here, as if it is done in the path,
# X is copied and a reference is kept here
- X_train = check_array(X_train, 'csc', dtype=dtype, order=X_order)
+ X_train = check_array(X_train, accept_sparse='csc', dtype=dtype,
+                       order=X_order)
alphas, coefs, _ = path(X_train, y_train, **path_params)
del X_train, y_train

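
The same idea applies to the check_array calls in enet_path and _path_residuals above; a hedged sketch with made-up data showing the keyword style for sparse format, dtype list, and memory layout:

    import numpy as np
    from sklearn.utils import check_array

    X = np.random.RandomState(0).rand(10, 4)

    # accept_sparse, dtype and order are all given by keyword; dense input is
    # returned as a Fortran-ordered float array.
    X_checked = check_array(X, accept_sparse='csc',
                            dtype=[np.float64, np.float32], order='F')
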
4 changes: 2 additions & 2 deletions sklearn/linear_model/_stochastic_gradient.py
@@ -487,8 +487,8 @@ def _partial_fit(self, X, y, alpha, C,
loss, learning_rate, max_iter,
classes, sample_weight,
coef_init, intercept_init):
- X, y = check_X_y(X, y, 'csr', dtype=np.float64, order="C",
-                  accept_large_sparse=False)
+ X, y = check_X_y(X, y, accept_sparse='csr', dtype=np.float64,
+                  order="C", accept_large_sparse=False)

n_samples, n_features = X.shape

3 changes: 2 additions & 1 deletion sklearn/manifold/_spectral_embedding.py
@@ -301,7 +301,8 @@ def spectral_embedding(adjacency, *, n_components=8, eigen_solver=None,
# matrix to the solver and afterward set it back to the original.
diag_shift = 1e-5 * sparse.eye(laplacian.shape[0])
laplacian += diag_shift
- ml = smoothed_aggregation_solver(check_array(laplacian, 'csr'))
+ ml = smoothed_aggregation_solver(check_array(laplacian,
+                                              accept_sparse='csr'))
laplacian -= diag_shift

M = ml.aspreconditioner()
8 changes: 4 additions & 4 deletions sklearn/neighbors/_nca.py
@@ -308,7 +308,7 @@ def _validate_params(self, X, y):
# Check the preferred dimensionality of the projected space
if self.n_components is not None:
check_scalar(
- self.n_components, 'n_components', numbers.Integral, 1)
+ self.n_components, 'n_components', numbers.Integral, min_val=1)

if self.n_components > X.shape[1]:
raise ValueError('The preferred dimensionality of the '
@@ -327,9 +327,9 @@ def _validate_params(self, X, y):
.format(X.shape[1],
self.components_.shape[1]))

- check_scalar(self.max_iter, 'max_iter', numbers.Integral, 1)
- check_scalar(self.tol, 'tol', numbers.Real, 0.)
- check_scalar(self.verbose, 'verbose', numbers.Integral, 0)
+ check_scalar(self.max_iter, 'max_iter', numbers.Integral, min_val=1)
+ check_scalar(self.tol, 'tol', numbers.Real, min_val=0.)
+ check_scalar(self.verbose, 'verbose', numbers.Integral, min_val=0)

if self.callback is not None:
if not callable(self.callback):
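
check_scalar is now used with its bounds spelled out as keywords above; a short sketch of the public call (the concrete values here are arbitrary examples):

    from numbers import Integral, Real
    from sklearn.utils import check_scalar

    # min_val passed by keyword, mirroring the updated _validate_params.
    check_scalar(50, 'max_iter', Integral, min_val=1)
    check_scalar(1e-5, 'tol', Real, min_val=0.)
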
2 changes: 1 addition & 1 deletion sklearn/neural_network/_rbm.py
@@ -357,7 +357,7 @@ def fit(self, X, y=None):

n_batches = int(np.ceil(float(n_samples) / self.batch_size))
batch_slices = list(gen_even_slices(n_batches * self.batch_size,
-                                    n_batches, n_samples))
+                                    n_batches, n_samples=n_samples))
verbose = self.verbose
begin = time.time()
for iteration in range(1, self.n_iter + 1):
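
gen_even_slices now takes its trailing sample limit as a keyword; a minimal sketch (the expected output is my reading of the generator, not taken from the diff):

    from sklearn.utils import gen_even_slices

    # Three roughly even slices over 6 items, truncated at n_samples=5.
    slices = list(gen_even_slices(6, 3, n_samples=5))
    # -> [slice(0, 2, None), slice(2, 4, None), slice(4, 5, None)]
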
2 changes: 1 addition & 1 deletion sklearn/preprocessing/_data.py
@@ -1707,7 +1707,7 @@ def normalize(X, norm='l2', *, axis=1, copy=True, return_norm=False):
else:
raise ValueError("'%d' is not a supported axis" % axis)

- X = check_array(X, sparse_format, copy=copy,
+ X = check_array(X, accept_sparse=sparse_format, copy=copy,
estimator='the normalize function', dtype=FLOAT_DTYPES)
if axis == 0:
X = X.T
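
The public normalize helper is unchanged for callers; only its internal check_array call now names accept_sparse. A small usage sketch with toy sparse input:

    import numpy as np
    from scipy import sparse
    from sklearn.preprocessing import normalize

    X = sparse.csr_matrix(np.array([[4., 1., 2.], [1., 3., 9.]]))

    # Row-wise L2 normalization; sparse input is accepted.
    X_l2 = normalize(X, norm='l2', axis=1)
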
20 changes: 12 additions & 8 deletions sklearn/utils/__init__.py
@@ -29,7 +29,8 @@
assert_all_finite,
check_random_state, column_or_1d, check_array,
check_consistent_length, check_X_y, indexable,
- check_symmetric, check_scalar)
+ check_symmetric, check_scalar,
+ _deprecate_positional_args)
from .. import get_config


@@ -314,10 +315,10 @@ def safe_indexing(X, indices, axis=0):
CSR, CSC, and LIL sparse matrices are supported. COO sparse matrices are
not supported.
"""
- return _safe_indexing(X, indices, axis)
+ return _safe_indexing(X, indices, axis=axis)


- def _safe_indexing(X, indices, axis=0):
+ def _safe_indexing(X, indices, *, axis=0):
"""Return rows, items or columns of X using indices.
.. warning::
@@ -684,7 +685,8 @@ def shuffle(*arrays, **options):
return resample(*arrays, **options)


- def safe_sqr(X, copy=True):
+ @_deprecate_positional_args
+ def safe_sqr(X, *, copy=True):
"""Element wise squaring of array-likes and sparse matrices.
Parameters
@@ -723,7 +725,8 @@ def _chunk_generator(gen, chunksize):
return


- def gen_batches(n, batch_size, min_batch_size=0):
+ @_deprecate_positional_args
+ def gen_batches(n, batch_size, *, min_batch_size=0):
"""Generator to create slices containing batch_size elements, from 0 to n.
The last slice may contain less than batch_size elements, when batch_size
@@ -772,7 +775,8 @@ def gen_batches(n, batch_size, min_batch_size=0):
yield slice(start, n)


- def gen_even_slices(n, n_packs, n_samples=None):
+ @_deprecate_positional_args
+ def gen_even_slices(n, n_packs, *, n_samples=None):
"""Generator to create n_packs slices going up to n.
Parameters
@@ -957,8 +961,8 @@ def _print_elapsed_time(source, message=None):
timeit.default_timer() - start))


- def get_chunk_n_rows(row_bytes, max_n_rows=None,
-                      working_memory=None):
+ @_deprecate_positional_args
+ def get_chunk_n_rows(row_bytes, *, max_n_rows=None, working_memory=None):
"""Calculates how many rows can be processed within working_memory
Parameters
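
The decorator driving all of these changes lives in sklearn/utils/validation.py and is private; the snippet below is a simplified stand-in written for illustration only (names and warning text are my assumptions), showing the general mechanism: read the keyword-only parameters from the signature, re-route any values still passed positionally into keyword arguments, and warn.

    import functools
    import warnings
    from inspect import Parameter, signature

    def deprecate_positional_args_sketch(f):
        """Simplified stand-in for sklearn's private _deprecate_positional_args."""
        sig = signature(f)
        kwonly = [name for name, p in sig.parameters.items()
                  if p.kind == Parameter.KEYWORD_ONLY]
        n_positional = len(sig.parameters) - len(kwonly)

        @functools.wraps(f)
        def wrapper(*args, **kwargs):
            extra = args[n_positional:]
            if extra:
                names = kwonly[:len(extra)]
                warnings.warn("Pass {} as keyword arguments.".format(
                    ", ".join(names)), FutureWarning)
                # Re-route the positional values so the call still succeeds.
                kwargs.update(zip(names, extra))
                args = args[:n_positional]
            return f(*args, **kwargs)
        return wrapper

    @deprecate_positional_args_sketch
    def gen_batches_like(n, batch_size, *, min_batch_size=0):
        return list(range(0, n, batch_size))

    gen_batches_like(10, 3, 1)   # warns, then behaves like min_batch_size=1
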
5 changes: 4 additions & 1 deletion sklearn/utils/class_weight.py
@@ -4,6 +4,8 @@

import numpy as np

+ from .validation import _deprecate_positional_args


def compute_class_weight(class_weight, classes, y):
"""Estimate class weights for unbalanced datasets.
@@ -69,7 +71,8 @@ def compute_class_weight(class_weight, classes, y):
return weight


- def compute_sample_weight(class_weight, y, indices=None):
+ @_deprecate_positional_args
+ def compute_sample_weight(class_weight, y, *, indices=None):
"""Estimate sample weights by class for unbalanced datasets.
Parameters
7 changes: 5 additions & 2 deletions sklearn/utils/multiclass.py
@@ -27,7 +27,9 @@ def _unique_multiclass(y):


def _unique_indicator(y):
- return np.arange(check_array(y, ['csr', 'csc', 'coo']).shape[1])
+ return np.arange(
+     check_array(y, accept_sparse=['csr', 'csc', 'coo']).shape[1]
+ )


_FN_UNIQUE_LABELS = {
@@ -83,7 +85,8 @@ def unique_labels(*ys):

# Check consistency for the indicator format
if (label_type == "multilabel-indicator" and
- len(set(check_array(y, ['csr', 'csc', 'coo']).shape[1]
+ len(set(check_array(y,
+                     accept_sparse=['csr', 'csc', 'coo']).shape[1]
for y in ys)) > 1):
raise ValueError("Multi-label binary indicator input with "
"different numbers of labels")
22 changes: 12 additions & 10 deletions sklearn/utils/tests/test_class_weight.py
@@ -192,39 +192,41 @@ def test_compute_sample_weight_with_subsample():
# Test compute_sample_weight with subsamples specified.
# Test with balanced classes and all samples present
y = np.asarray([1, 1, 1, 2, 2, 2])
- sample_weight = compute_sample_weight("balanced", y, range(6))
+ sample_weight = compute_sample_weight("balanced", y, indices=range(6))
assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1.])

# Test with column vector of balanced classes and all samples present
y = np.asarray([[1], [1], [1], [2], [2], [2]])
- sample_weight = compute_sample_weight("balanced", y, range(6))
+ sample_weight = compute_sample_weight("balanced", y, indices=range(6))
assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1.])

# Test with a subsample
y = np.asarray([1, 1, 1, 2, 2, 2])
- sample_weight = compute_sample_weight("balanced", y, range(4))
+ sample_weight = compute_sample_weight("balanced", y, indices=range(4))
assert_array_almost_equal(sample_weight, [2. / 3, 2. / 3,
2. / 3, 2., 2., 2.])

# Test with a bootstrap subsample
y = np.asarray([1, 1, 1, 2, 2, 2])
- sample_weight = compute_sample_weight("balanced", y, [0, 1, 1, 2, 2, 3])
+ sample_weight = compute_sample_weight("balanced", y,
+                                       indices=[0, 1, 1, 2, 2, 3])
expected_balanced = np.asarray([0.6, 0.6, 0.6, 3., 3., 3.])
assert_array_almost_equal(sample_weight, expected_balanced)

# Test with a bootstrap subsample for multi-output
y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])
- sample_weight = compute_sample_weight("balanced", y, [0, 1, 1, 2, 2, 3])
+ sample_weight = compute_sample_weight("balanced", y,
+                                       indices=[0, 1, 1, 2, 2, 3])
assert_array_almost_equal(sample_weight, expected_balanced ** 2)

# Test with a missing class
y = np.asarray([1, 1, 1, 2, 2, 2, 3])
- sample_weight = compute_sample_weight("balanced", y, range(6))
+ sample_weight = compute_sample_weight("balanced", y, indices=range(6))
assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1., 0.])

# Test with a missing class for multi-output
y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1], [2, 2]])
- sample_weight = compute_sample_weight("balanced", y, range(6))
+ sample_weight = compute_sample_weight("balanced", y, indices=range(6))
assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1., 0.])


@@ -237,15 +239,15 @@ def test_compute_sample_weight_errors():
with pytest.raises(ValueError):
compute_sample_weight("ni", y)
with pytest.raises(ValueError):
- compute_sample_weight("ni", y, range(4))
+ compute_sample_weight("ni", y, indices=range(4))
with pytest.raises(ValueError):
compute_sample_weight("ni", y_)
with pytest.raises(ValueError):
- compute_sample_weight("ni", y_, range(4))
+ compute_sample_weight("ni", y_, indices=range(4))

# Not "balanced" for subsample
with pytest.raises(ValueError):
- compute_sample_weight({1: 2, 2: 1}, y, range(4))
+ compute_sample_weight({1: 2, 2: 1}, y, indices=range(4))

# Not a list or preset for multi-output
with pytest.raises(ValueError):
5 changes: 3 additions & 2 deletions sklearn/utils/tests/test_validation.py
@@ -63,7 +63,7 @@ def test_as_float_array():
X = X.astype(np.int64)
X2 = as_float_array(X, copy=True)
# Checking that the array wasn't overwritten
- assert as_float_array(X, False) is not X
+ assert as_float_array(X, copy=False) is not X
assert X2.dtype == np.float64
# Test int dtypes <= 32bit
tested_dtypes = [np.bool,
@@ -912,7 +912,8 @@ def test_check_scalar_valid(x, target_type, min_val, max_val):
"""Test that check_scalar returns no error/warning if valid inputs are
provided"""
with pytest.warns(None) as record:
- check_scalar(x, "test_name", target_type, min_val, max_val)
+ check_scalar(x, "test_name", target_type=target_type,
+              min_val=min_val, max_val=max_val)
assert len(record) == 0


