[MRG] API kwonly for utils #17046

Merged · 8 commits merged on Apr 27, 2020
Changes shown from 7 commits
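Summary of the change: utility functions in `sklearn.utils` gain keyword-only parameters (a bare `*` in the signature), enforced softly through the `_deprecate_positional_args` decorator from `sklearn.utils.validation`, so that existing positional callers get a `FutureWarning` for one deprecation cycle instead of an immediate `TypeError`. The diff below also updates internal call sites to pass those arguments by keyword. As a rough sketch of how such a decorator can work (the real implementation differs in details such as the exact warning text and version handling):

```python
import warnings
from functools import wraps
from inspect import signature, Parameter

def deprecate_positional_args(f):
    """Warn when keyword-only arguments are still passed positionally."""
    sig = signature(f)
    # Parameters declared before the bare `*` in the signature.
    positional = [name for name, p in sig.parameters.items()
                  if p.kind == Parameter.POSITIONAL_OR_KEYWORD]
    # Keyword-only parameters (declared after the `*`).
    kwonly = [name for name, p in sig.parameters.items()
              if p.kind == Parameter.KEYWORD_ONLY]

    @wraps(f)
    def inner(*args, **kwargs):
        extra = len(args) - len(positional)
        if extra > 0:
            # Caller passed keyword-only arguments positionally: warn,
            # then forward them under their proper names so the call
            # still succeeds during the deprecation period.
            offenders = ", ".join(
                "{}={!r}".format(name, val)
                for name, val in zip(kwonly[:extra], args[len(positional):]))
            warnings.warn("Pass {} as keyword args; passing them "
                          "positionally is deprecated.".format(offenders),
                          FutureWarning)
            kwargs.update(zip(kwonly, args[len(positional):]))
            args = args[:len(positional)]
        return f(*args, **kwargs)
    return inner

@deprecate_positional_args
def safe_sqr(X, *, copy=True):   # toy stand-in for the decorated utilities
    return X ** 2

safe_sqr(3, False)       # FutureWarning: Pass copy=False as keyword args ...
safe_sqr(3, copy=False)  # no warning
```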
6 changes: 4 additions & 2 deletions sklearn/ensemble/_forest.py

@@ -159,9 +159,11 @@ def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees,
         if class_weight == 'subsample':
             with catch_warnings():
                 simplefilter('ignore', DeprecationWarning)
-                curr_sample_weight *= compute_sample_weight('auto', y, indices)
+                curr_sample_weight *= compute_sample_weight('auto', y,
+                                                            indices=indices)
         elif class_weight == 'balanced_subsample':
-            curr_sample_weight *= compute_sample_weight('balanced', y, indices)
+            curr_sample_weight *= compute_sample_weight('balanced', y,
+                                                        indices=indices)

         tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False)
     else:
5 changes: 3 additions & 2 deletions sklearn/feature_selection/_univariate_selection.py

@@ -146,7 +146,7 @@ def f_classif(X, y):
     chi2: Chi-squared stats of non-negative features for classification tasks.
     f_regression: F-value between label/feature for regression tasks.
     """
-    X, y = check_X_y(X, y, ['csr', 'csc', 'coo'])
+    X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'])
     args = [X[safe_mask(X, y == k)] for k in np.unique(y)]
     return f_oneway(*args)

@@ -277,7 +277,8 @@ def f_regression(X, y, center=True):
     SelectPercentile: Select features based on percentile of the highest
         scores.
     """
-    X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], dtype=np.float64)
+    X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
+                     dtype=np.float64)
     n_samples = X.shape[0]

     # compute centered values
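These `check_X_y`/`check_array` call sites now name `accept_sparse` explicitly, which reads unambiguously next to the other validation keywords. For illustration, what the keyword controls (a standalone snippet, not part of the diff):

```python
from scipy import sparse
from sklearn.utils import check_array

X = sparse.random(5, 3, density=0.5, format='csr')

# Sparse input is rejected unless explicitly allowed:
# check_array(X)  # raises TypeError, dense data is required by default
# Allowed formats are kept; other formats are converted.
check_array(X, accept_sparse=['csr', 'csc', 'coo']).format  # 'csr' (kept)
check_array(X, accept_sparse='csc').format                  # 'csc' (converted)
```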
2 changes: 1 addition & 1 deletion sklearn/feature_selection/tests/test_base.py

@@ -15,7 +15,7 @@ def __init__(self, step=2):
         self.step = step

     def fit(self, X, y=None):
-        X = check_array(X, 'csc')
+        X = check_array(X, accept_sparse='csc')
         self.n_input_feats = X.shape[1]
         return self
11 changes: 6 additions & 5 deletions sklearn/linear_model/_coordinate_descent.py

@@ -131,7 +131,7 @@ def _alpha_grid(X, y, Xy=None, l1_ratio=1.0, fit_intercept=True,
     if Xy is None:
         X_sparse = sparse.isspmatrix(X)
         sparse_center = X_sparse and (fit_intercept or normalize)
-        X = check_array(X, 'csc',
+        X = check_array(X, accept_sparse='csc',
                         copy=(copy_X and fit_intercept and not X_sparse))
         if not X_sparse:
             # X can be touched inplace thanks to the above line

@@ -435,10 +435,10 @@ def enet_path(X, y, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None,
     # We expect X and y to be already Fortran ordered when bypassing
     # checks
     if check_input:
-        X = check_array(X, 'csc', dtype=[np.float64, np.float32],
+        X = check_array(X, accept_sparse='csc', dtype=[np.float64, np.float32],
                         order='F', copy=copy_X)
-        y = check_array(y, 'csc', dtype=X.dtype.type, order='F', copy=False,
-                        ensure_2d=False)
+        y = check_array(y, accept_sparse='csc', dtype=X.dtype.type,
+                        order='F', copy=False, ensure_2d=False)
         if Xy is not None:
             # Xy should be a 1d contiguous array or a 2D C ordered array
             Xy = check_array(Xy, dtype=X.dtype.type, order='C', copy=False,

@@ -1095,7 +1095,8 @@ def _path_residuals(X, y, train, test, path, path_params, alphas=None,

     # Do the ordering and type casting here, as if it is done in the path,
     # X is copied and a reference is kept here
-    X_train = check_array(X_train, 'csc', dtype=dtype, order=X_order)
+    X_train = check_array(X_train, accept_sparse='csc', dtype=dtype,
+                          order=X_order)
     alphas, coefs, _ = path(X_train, y_train, **path_params)
     del X_train, y_train
4 changes: 2 additions & 2 deletions sklearn/linear_model/_stochastic_gradient.py

@@ -487,8 +487,8 @@ def _partial_fit(self, X, y, alpha, C,
                      loss, learning_rate, max_iter,
                      classes, sample_weight,
                      coef_init, intercept_init):
-        X, y = check_X_y(X, y, 'csr', dtype=np.float64, order="C",
-                         accept_large_sparse=False)
+        X, y = check_X_y(X, y, accept_sparse='csr', dtype=np.float64,
+                         order="C", accept_large_sparse=False)

         n_samples, n_features = X.shape
3 changes: 2 additions & 1 deletion sklearn/manifold/_spectral_embedding.py

@@ -301,7 +301,8 @@ def spectral_embedding(adjacency, *, n_components=8, eigen_solver=None,
         # matrix to the solver and afterward set it back to the original.
         diag_shift = 1e-5 * sparse.eye(laplacian.shape[0])
         laplacian += diag_shift
-        ml = smoothed_aggregation_solver(check_array(laplacian, 'csr'))
+        ml = smoothed_aggregation_solver(check_array(laplacian,
+                                                     accept_sparse='csr'))
         laplacian -= diag_shift

         M = ml.aspreconditioner()
8 changes: 4 additions & 4 deletions sklearn/neighbors/_nca.py

@@ -308,7 +308,7 @@ def _validate_params(self, X, y):
         # Check the preferred dimensionality of the projected space
         if self.n_components is not None:
             check_scalar(
-                self.n_components, 'n_components', numbers.Integral, 1)
+                self.n_components, 'n_components', numbers.Integral, min_val=1)

             if self.n_components > X.shape[1]:
                 raise ValueError('The preferred dimensionality of the '

@@ -327,9 +327,9 @@ def _validate_params(self, X, y):
                                  .format(X.shape[1],
                                          self.components_.shape[1]))

-        check_scalar(self.max_iter, 'max_iter', numbers.Integral, 1)
-        check_scalar(self.tol, 'tol', numbers.Real, 0.)
-        check_scalar(self.verbose, 'verbose', numbers.Integral, 0)
+        check_scalar(self.max_iter, 'max_iter', numbers.Integral, min_val=1)
+        check_scalar(self.tol, 'tol', numbers.Real, min_val=0.)
+        check_scalar(self.verbose, 'verbose', numbers.Integral, min_val=0)

         if self.callback is not None:
             if not callable(self.callback):
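`check_scalar` previously took `min_val` as its fourth positional argument; naming it avoids accidentally binding a bound to the wrong parameter. A standalone illustration of the behavior these call sites rely on:

```python
import numbers
from sklearn.utils import check_scalar

check_scalar(5, 'max_iter', numbers.Integral, min_val=1)  # valid: no error

try:
    check_scalar(0, 'max_iter', numbers.Integral, min_val=1)
except ValueError as exc:
    print(exc)  # out-of-range error naming 'max_iter' and the bound
```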
2 changes: 1 addition & 1 deletion sklearn/neural_network/_rbm.py

@@ -357,7 +357,7 @@ def fit(self, X, y=None):

         n_batches = int(np.ceil(float(n_samples) / self.batch_size))
         batch_slices = list(gen_even_slices(n_batches * self.batch_size,
-                                            n_batches, n_samples))
+                                            n_batches, n_samples=n_samples))
         verbose = self.verbose
         begin = time.time()
         for iteration in range(1, self.n_iter + 1):
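For context, `gen_even_slices(n, n_packs, n_samples=...)` yields `n_packs` nearly even contiguous slices over `range(n)`, with `n_samples` capping the final slice; the `fit` call above uses it to batch `n_samples` rows. A standalone illustration:

```python
from sklearn.utils import gen_even_slices

print(list(gen_even_slices(10, 3)))
# [slice(0, 4, None), slice(4, 7, None), slice(7, 10, None)]

# n_samples truncates the tail, as in the fit() call above:
print(list(gen_even_slices(12, 3, n_samples=10)))
# [slice(0, 4, None), slice(4, 8, None), slice(8, 10, None)]
```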
2 changes: 1 addition & 1 deletion sklearn/preprocessing/_data.py

@@ -1707,7 +1707,7 @@ def normalize(X, norm='l2', *, axis=1, copy=True, return_norm=False):
     else:
         raise ValueError("'%d' is not a supported axis" % axis)

-    X = check_array(X, sparse_format, copy=copy,
+    X = check_array(X, accept_sparse=sparse_format, copy=copy,
                     estimator='the normalize function', dtype=FLOAT_DTYPES)
     if axis == 0:
         X = X.T
20 changes: 12 additions & 8 deletions sklearn/utils/__init__.py

@@ -29,7 +29,8 @@
                          assert_all_finite,
                          check_random_state, column_or_1d, check_array,
                          check_consistent_length, check_X_y, indexable,
-                         check_symmetric, check_scalar)
+                         check_symmetric, check_scalar,
+                         _deprecate_positional_args)
 from .. import get_config

@@ -314,10 +315,10 @@ def safe_indexing(X, indices, axis=0):
     CSR, CSC, and LIL sparse matrices are supported. COO sparse matrices are
     not supported.
     """
-    return _safe_indexing(X, indices, axis)
+    return _safe_indexing(X, indices, axis=axis)


-def _safe_indexing(X, indices, axis=0):
+def _safe_indexing(X, indices, *, axis=0):
     """Return rows, items or columns of X using indices.

     .. warning::

@@ -684,7 +685,8 @@ def shuffle(*arrays, **options):
     return resample(*arrays, **options)


-def safe_sqr(X, copy=True):
+@_deprecate_positional_args
+def safe_sqr(X, *, copy=True):
     """Element wise squaring of array-likes and sparse matrices.

     Parameters

@@ -723,7 +725,8 @@ def _chunk_generator(gen, chunksize):
         return


-def gen_batches(n, batch_size, min_batch_size=0):
+@_deprecate_positional_args
+def gen_batches(n, batch_size, *, min_batch_size=0):
     """Generator to create slices containing batch_size elements, from 0 to n.

     The last slice may contain less than batch_size elements, when batch_size

@@ -772,7 +775,8 @@ def gen_batches(n, batch_size, min_batch_size=0):
     yield slice(start, n)


-def gen_even_slices(n, n_packs, n_samples=None):
+@_deprecate_positional_args
+def gen_even_slices(n, n_packs, *, n_samples=None):
     """Generator to create n_packs slices going up to n.

     Parameters

@@ -957,8 +961,8 @@ def _print_elapsed_time(source, message=None):
                                    timeit.default_timer() - start))


-def get_chunk_n_rows(row_bytes, max_n_rows=None,
-                     working_memory=None):
+@_deprecate_positional_args
+def get_chunk_n_rows(row_bytes, *, max_n_rows=None, working_memory=None):
     """Calculates how many rows can be processed within working_memory

     Parameters
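With the decorator applied, positional callers of these utilities keep working for a deprecation cycle but see a warning; a quick behavioral check (assuming a scikit-learn version where this PR is in effect and the warning class is `FutureWarning`):

```python
import warnings
import numpy as np
from sklearn.utils import safe_sqr, gen_batches

X = np.arange(4)

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    safe_sqr(X, False)           # positional copy -> FutureWarning
    safe_sqr(X, copy=False)      # keyword copy    -> silent
    list(gen_batches(7, 3, 2))   # positional min_batch_size -> FutureWarning

print(len(caught))  # 2 warnings, each asking for keyword usage
```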
5 changes: 4 additions & 1 deletion sklearn/utils/class_weight.py

@@ -4,6 +4,8 @@

 import numpy as np

+from .validation import _deprecate_positional_args
+

 def compute_class_weight(class_weight, classes, y):
     """Estimate class weights for unbalanced datasets.

@@ -69,7 +71,8 @@ def compute_class_weight(class_weight, classes, y):
     return weight


-def compute_sample_weight(class_weight, y, indices=None):
+@_deprecate_positional_args
+def compute_sample_weight(class_weight, y, *, indices=None):
    """Estimate sample weights by class for unbalanced datasets.

    Parameters
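`compute_sample_weight("balanced", ...)` derives per-class weights as `n_samples / (n_classes * np.bincount(y))` over the (sub)sample selected by `indices`, then maps them back onto every sample of `y`. A standalone illustration of the now keyword-only `indices`:

```python
import numpy as np
from sklearn.utils.class_weight import compute_sample_weight

y = np.asarray([1, 1, 1, 2, 2, 2])

# Full sample: each class holds 3 of 6 samples -> 6 / (2 * 3) = 1.0 apiece.
print(compute_sample_weight("balanced", y))
# [1. 1. 1. 1. 1. 1.]

# Subsample y[:4] = [1, 1, 1, 2]: class 1 -> 4/(2*3) = 2/3, class 2 -> 4/(2*1) = 2.
print(compute_sample_weight("balanced", y, indices=range(4)))
# approximately [0.667 0.667 0.667 2. 2. 2.]
```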
7 changes: 5 additions & 2 deletions sklearn/utils/multiclass.py

@@ -27,7 +27,9 @@ def _unique_multiclass(y):


 def _unique_indicator(y):
-    return np.arange(check_array(y, ['csr', 'csc', 'coo']).shape[1])
+    return np.arange(
+        check_array(y, accept_sparse=['csr', 'csc', 'coo']).shape[1]
+    )


 _FN_UNIQUE_LABELS = {

@@ -83,7 +85,8 @@ def unique_labels(*ys):

     # Check consistency for the indicator format
     if (label_type == "multilabel-indicator" and
-            len(set(check_array(y, ['csr', 'csc', 'coo']).shape[1]
+            len(set(check_array(y,
+                                accept_sparse=['csr', 'csc', 'coo']).shape[1]
                     for y in ys)) > 1):
         raise ValueError("Multi-label binary indicator input with "
                          "different numbers of labels")
22 changes: 12 additions & 10 deletions sklearn/utils/tests/test_class_weight.py

@@ -192,39 +192,41 @@ def test_compute_sample_weight_with_subsample():
     # Test compute_sample_weight with subsamples specified.
     # Test with balanced classes and all samples present
     y = np.asarray([1, 1, 1, 2, 2, 2])
-    sample_weight = compute_sample_weight("balanced", y, range(6))
+    sample_weight = compute_sample_weight("balanced", y, indices=range(6))
     assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1.])

     # Test with column vector of balanced classes and all samples present
     y = np.asarray([[1], [1], [1], [2], [2], [2]])
-    sample_weight = compute_sample_weight("balanced", y, range(6))
+    sample_weight = compute_sample_weight("balanced", y, indices=range(6))
     assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1.])

     # Test with a subsample
     y = np.asarray([1, 1, 1, 2, 2, 2])
-    sample_weight = compute_sample_weight("balanced", y, range(4))
+    sample_weight = compute_sample_weight("balanced", y, indices=range(4))
     assert_array_almost_equal(sample_weight, [2. / 3, 2. / 3,
                                               2. / 3, 2., 2., 2.])

     # Test with a bootstrap subsample
     y = np.asarray([1, 1, 1, 2, 2, 2])
-    sample_weight = compute_sample_weight("balanced", y, [0, 1, 1, 2, 2, 3])
+    sample_weight = compute_sample_weight("balanced", y,
+                                          indices=[0, 1, 1, 2, 2, 3])
     expected_balanced = np.asarray([0.6, 0.6, 0.6, 3., 3., 3.])
     assert_array_almost_equal(sample_weight, expected_balanced)

     # Test with a bootstrap subsample for multi-output
     y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])
-    sample_weight = compute_sample_weight("balanced", y, [0, 1, 1, 2, 2, 3])
+    sample_weight = compute_sample_weight("balanced", y,
+                                          indices=[0, 1, 1, 2, 2, 3])
     assert_array_almost_equal(sample_weight, expected_balanced ** 2)

     # Test with a missing class
     y = np.asarray([1, 1, 1, 2, 2, 2, 3])
-    sample_weight = compute_sample_weight("balanced", y, range(6))
+    sample_weight = compute_sample_weight("balanced", y, indices=range(6))
     assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1., 0.])

     # Test with a missing class for multi-output
     y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1], [2, 2]])
-    sample_weight = compute_sample_weight("balanced", y, range(6))
+    sample_weight = compute_sample_weight("balanced", y, indices=range(6))
     assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1., 0.])


@@ -237,15 +239,15 @@ def test_compute_sample_weight_errors():
     with pytest.raises(ValueError):
         compute_sample_weight("ni", y)
     with pytest.raises(ValueError):
-        compute_sample_weight("ni", y, range(4))
+        compute_sample_weight("ni", y, indices=range(4))
     with pytest.raises(ValueError):
         compute_sample_weight("ni", y_)
     with pytest.raises(ValueError):
-        compute_sample_weight("ni", y_, range(4))
+        compute_sample_weight("ni", y_, indices=range(4))

     # Not "balanced" for subsample
     with pytest.raises(ValueError):
-        compute_sample_weight({1: 2, 2: 1}, y, range(4))
+        compute_sample_weight({1: 2, 2: 1}, y, indices=range(4))

     # Not a list or preset for multi-output
     with pytest.raises(ValueError):
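The `expected_balanced` values in the bootstrap case above follow from the same balanced formula applied to the resampled labels; a quick arithmetic check (standalone, mirroring the test):

```python
import numpy as np

y = np.asarray([1, 1, 1, 2, 2, 2])
indices = [0, 1, 1, 2, 2, 3]     # bootstrap draw -> y[indices] = [1 1 1 1 1 2]

n, n_classes = len(indices), 2   # 6 resampled labels, 2 classes
w1 = n / (n_classes * 5)         # class 1 seen 5 times: 6 / 10 = 0.6
w2 = n / (n_classes * 1)         # class 2 seen once:    6 / 2  = 3.0

print(np.where(y == 1, w1, w2))  # [0.6 0.6 0.6 3.  3.  3. ] == expected_balanced
```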
5 changes: 3 additions & 2 deletions sklearn/utils/tests/test_validation.py

@@ -63,7 +63,7 @@ def test_as_float_array():
     X = X.astype(np.int64)
     X2 = as_float_array(X, copy=True)
     # Checking that the array wasn't overwritten
-    assert as_float_array(X, False) is not X
+    assert as_float_array(X, copy=False) is not X
     assert X2.dtype == np.float64
     # Test int dtypes <= 32bit
     tested_dtypes = [np.bool,

@@ -912,7 +912,8 @@ def test_check_scalar_valid(x, target_type, min_val, max_val):
     """Test that check_scalar returns no error/warning if valid inputs are
     provided"""
     with pytest.warns(None) as record:
-        check_scalar(x, "test_name", target_type, min_val, max_val)
+        check_scalar(x, "test_name", target_type=target_type,
+                     min_val=min_val, max_val=max_val)
     assert len(record) == 0