Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

RFC _convert_container #28681

Open
wants to merge 19 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 4 additions & 0 deletions doc/whats_new/v1.5.rst
Expand Up @@ -367,6 +367,10 @@ Changelog
`axis=0` and supports indexing polars Series.
:pr:`28521` by :user:`Yao Xiao <Charlie-XIAO>`.

- |Fix| :func:`~utils._safe_indexing` now works correctly for scipy sparse arrays with
scalar index.
:pr:`28681` by :user:`Yao Xiao <Charlie-XIAO>`.

.. rubric:: Code and documentation contributors

Thanks to everyone who has contributed to the maintenance and improvement of
Expand Down
28 changes: 14 additions & 14 deletions sklearn/cluster/_hdbscan/tests/test_reachibility.py
Expand Up @@ -2,61 +2,61 @@
import pytest

from sklearn.cluster._hdbscan._reachability import mutual_reachability_graph
from sklearn.utils._testing import (
_convert_container,
assert_allclose,
)
from sklearn.utils._testing import assert_allclose
from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS


def test_mutual_reachability_graph_error_sparse_format():
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
def test_mutual_reachability_graph_error_sparse_format(csc_container):
"""Check that we raise an error if the sparse format is not CSR."""
rng = np.random.RandomState(0)
X = rng.randn(10, 10)
X = X.T @ X
np.fill_diagonal(X, 0.0)
X = _convert_container(X, "sparse_csc")
X = csc_container(X)

err_msg = "Only sparse CSR matrices are supported"
with pytest.raises(ValueError, match=err_msg):
mutual_reachability_graph(X)


@pytest.mark.parametrize("array_type", ["array", "sparse_csr"])
def test_mutual_reachability_graph_inplace(array_type):
@pytest.mark.parametrize("csr_container", [np.asarray] + CSR_CONTAINERS)
def test_mutual_reachability_graph_inplace(csr_container):
"""Check that the operation is happening inplace."""
rng = np.random.RandomState(0)
X = rng.randn(10, 10)
X = X.T @ X
np.fill_diagonal(X, 0.0)
X = _convert_container(X, array_type)
X = csr_container(X)

mr_graph = mutual_reachability_graph(X)

assert id(mr_graph) == id(X)


def test_mutual_reachability_graph_equivalence_dense_sparse():
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_mutual_reachability_graph_equivalence_dense_sparse(csr_container):
"""Check that we get the same results for dense and sparse implementation."""
rng = np.random.RandomState(0)
X = rng.randn(5, 5)
X_dense = X.T @ X
X_sparse = _convert_container(X_dense, "sparse_csr")
X_sparse = csr_container(X_dense)

mr_graph_dense = mutual_reachability_graph(X_dense, min_samples=3)
mr_graph_sparse = mutual_reachability_graph(X_sparse, min_samples=3)

assert_allclose(mr_graph_dense, mr_graph_sparse.toarray())


@pytest.mark.parametrize("array_type", ["array", "sparse_csr"])
@pytest.mark.parametrize("csr_container", [np.asarray] + CSR_CONTAINERS)
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_mutual_reachability_graph_preserve_dtype(array_type, dtype):
def test_mutual_reachability_graph_preserve_dtype(csr_container, dtype):
"""Check that the computation preserves the dtype thanks to fused types."""
rng = np.random.RandomState(0)
X = rng.randn(10, 10)
X = (X.T @ X).astype(dtype)
np.fill_diagonal(X, 0.0)
X = _convert_container(X, array_type)
X = csr_container(X, dtype=dtype)

assert X.dtype == dtype
mr_graph = mutual_reachability_graph(X)
Expand Down
18 changes: 9 additions & 9 deletions sklearn/compose/tests/test_column_transformer.py
Expand Up @@ -172,16 +172,16 @@ def test_column_transformer_tuple_transformers_parameter():
)


@pytest.mark.parametrize("constructor_name", ["dataframe", "polars"])
def test_column_transformer_dataframe(constructor_name):
if constructor_name == "dataframe":
dataframe_lib = pytest.importorskip("pandas")
else:
dataframe_lib = pytest.importorskip(constructor_name)
@pytest.mark.parametrize("constructor_lib", ["pandas", "polars"])
def test_column_transformer_dataframe(constructor_lib):
dataframe_lib = pytest.importorskip(constructor_lib)

X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
X_df = _convert_container(
X_array, constructor_name, columns_name=["first", "second"]
X_array,
"dataframe",
constructor_lib=constructor_lib,
column_names=["first", "second"],
)

X_res_first = np.array([0, 1, 2]).reshape(-1, 1)
Expand All @@ -206,7 +206,7 @@ def test_column_transformer_dataframe(constructor_name):
(np.array([True, False]), X_res_first),
([True, False], X_res_first),
]
if constructor_name == "dataframe":
if constructor_lib == "pandas":
# Scalars are only supported for pandas dataframes.
cases.extend(
[
Expand Down Expand Up @@ -306,7 +306,7 @@ def transform(self, X, y=None):
)
ct.fit_transform(X_df)

if constructor_name == "dataframe":
if constructor_lib == "pandas":
# DataFrame protocol does not have 1d columns, so we only test on Pandas
# dataframes.
ct = ColumnTransformer(
Expand Down
Expand Up @@ -1473,8 +1473,9 @@ def test_dataframe_categorical_results_same_as_ndarray(
f_cat = [f"cat{c:0>3}" for c in f_cat]
X_df = _convert_container(
np.asarray([f_num, f_cat]).T,
dataframe_lib,
["f_num", "f_cat"],
"dataframe",
constructor_lib=dataframe_lib,
column_names=["f_num", "f_cat"],
categorical_feature_names=["f_cat"],
)

Expand Down Expand Up @@ -1517,7 +1518,11 @@ def test_dataframe_categorical_errors(dataframe_lib, HistGradientBoosting):
rng = np.random.RandomState(42)
f_cat = rng.randint(0, high=100, size=100).astype(str)
X_df = _convert_container(
f_cat[:, None], dataframe_lib, ["f_cat"], categorical_feature_names=["f_cat"]
f_cat[:, None],
"dataframe",
constructor_lib=dataframe_lib,
column_names=["f_cat"],
categorical_feature_names=["f_cat"],
)
y = rng.randint(0, high=2, size=100)

Expand All @@ -1543,14 +1548,16 @@ def test_categorical_different_order_same_model(dataframe_lib):
f_cat_b_a = np.asarray(["B", "A"])[f_ints]
df_a_b = _convert_container(
f_cat_a_b[:, None],
dataframe_lib,
["f_cat"],
"dataframe",
constructor_lib=dataframe_lib,
column_names=["f_cat"],
categorical_feature_names=["f_cat"],
)
df_b_a = _convert_container(
f_cat_b_a[:, None],
dataframe_lib,
["f_cat"],
"dataframe",
constructor_lib=dataframe_lib,
column_names=["f_cat"],
categorical_feature_names=["f_cat"],
)

Expand Down
Expand Up @@ -218,9 +218,11 @@ def test_predictions(global_random_seed, use_feature_names):
f_0 = rng.rand(n_samples) # positive correlation with y
f_1 = rng.rand(n_samples) # negative correlation with y
X = np.c_[f_0, f_1]
columns_name = ["f_0", "f_1"]
constructor_name = "dataframe" if use_feature_names else "array"
X = _convert_container(X, constructor_name, columns_name=columns_name)
convert_container_kwargs = {
"constructor_type": "dataframe" if use_feature_names else "array",
"column_names": ["f_0", "f_1"], # no effect if array
}
X = _convert_container(X, **convert_container_kwargs)

noise = rng.normal(loc=0.0, scale=0.01, size=n_samples)
y = 5 * f_0 + np.sin(10 * np.pi * f_0) - 5 * f_1 - np.cos(10 * np.pi * f_1) + noise
Expand Down Expand Up @@ -250,24 +252,24 @@ def test_predictions(global_random_seed, use_feature_names):
# First feature (POS)
# assert pred is all increasing when f_0 is all increasing
X = np.c_[linspace, constant]
X = _convert_container(X, constructor_name, columns_name=columns_name)
X = _convert_container(X, **convert_container_kwargs)
pred = gbdt.predict(X)
assert is_increasing(pred)
# assert pred actually follows the variations of f_0
X = np.c_[sin, constant]
X = _convert_container(X, constructor_name, columns_name=columns_name)
X = _convert_container(X, **convert_container_kwargs)
pred = gbdt.predict(X)
assert np.all((np.diff(pred) >= 0) == (np.diff(sin) >= 0))

# Second feature (NEG)
# assert pred is all decreasing when f_1 is all increasing
X = np.c_[constant, linspace]
X = _convert_container(X, constructor_name, columns_name=columns_name)
X = _convert_container(X, **convert_container_kwargs)
pred = gbdt.predict(X)
assert is_decreasing(pred)
# assert pred actually follows the inverse variations of f_1
X = np.c_[constant, sin]
X = _convert_container(X, constructor_name, columns_name=columns_name)
X = _convert_container(X, **convert_container_kwargs)
pred = gbdt.predict(X)
assert ((np.diff(pred) <= 0) == (np.diff(sin) >= 0)).all()

Expand Down
19 changes: 12 additions & 7 deletions sklearn/ensemble/tests/test_forest.py
Expand Up @@ -49,7 +49,6 @@
from sklearn.svm import LinearSVC
from sklearn.tree._classes import SPARSE_SPLITTERS
from sklearn.utils._testing import (
_convert_container,
assert_allclose,
assert_almost_equal,
assert_array_almost_equal,
Expand Down Expand Up @@ -461,7 +460,7 @@ def test_unfitted_feature_importances(name):


@pytest.mark.parametrize("ForestClassifier", FOREST_CLASSIFIERS.values())
@pytest.mark.parametrize("X_type", ["array", "sparse_csr", "sparse_csc"])
@pytest.mark.parametrize("sparse_container", [None] + CSC_CONTAINERS + CSR_CONTAINERS)
@pytest.mark.parametrize(
"X, y, lower_bound_accuracy",
[
Expand All @@ -488,10 +487,12 @@ def test_unfitted_feature_importances(name):
)
@pytest.mark.parametrize("oob_score", [True, partial(f1_score, average="micro")])
def test_forest_classifier_oob(
ForestClassifier, X, y, X_type, lower_bound_accuracy, oob_score
ForestClassifier, X, y, sparse_container, lower_bound_accuracy, oob_score
):
"""Check that OOB score is close to score on a test set."""
X = _convert_container(X, constructor_name=X_type)
if sparse_container is not None:
X = sparse_container(X)

X_train, X_test, y_train, y_test = train_test_split(
X,
y,
Expand Down Expand Up @@ -529,7 +530,7 @@ def test_forest_classifier_oob(


@pytest.mark.parametrize("ForestRegressor", FOREST_REGRESSORS.values())
@pytest.mark.parametrize("X_type", ["array", "sparse_csr", "sparse_csc"])
@pytest.mark.parametrize("sparse_container", [None] + CSC_CONTAINERS + CSR_CONTAINERS)
@pytest.mark.parametrize(
"X, y, lower_bound_r2",
[
Expand All @@ -548,10 +549,14 @@ def test_forest_classifier_oob(
],
)
@pytest.mark.parametrize("oob_score", [True, explained_variance_score])
def test_forest_regressor_oob(ForestRegressor, X, y, X_type, lower_bound_r2, oob_score):
def test_forest_regressor_oob(
ForestRegressor, X, y, sparse_container, lower_bound_r2, oob_score
):
"""Check that forest-based regressors provide an OOB score close to the
score on a test set."""
X = _convert_container(X, constructor_name=X_type)
if sparse_container is not None:
X = sparse_container(X)

X_train, X_test, y_train, y_test = train_test_split(
X,
y,
Expand Down
6 changes: 3 additions & 3 deletions sklearn/feature_selection/tests/test_feature_select.py
Expand Up @@ -27,7 +27,6 @@
)
from sklearn.utils import safe_mask
from sklearn.utils._testing import (
_convert_container,
assert_almost_equal,
assert_array_almost_equal,
assert_array_equal,
Expand Down Expand Up @@ -94,7 +93,8 @@ def test_f_classif(csr_container):


@pytest.mark.parametrize("center", [True, False])
def test_r_regression(center):
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_r_regression(center, csr_container):
X, y = make_regression(
n_samples=2000, n_features=20, n_informative=5, shuffle=False, random_state=0
)
Expand All @@ -103,7 +103,7 @@ def test_r_regression(center):
assert (-1 < corr_coeffs).all()
assert (corr_coeffs < 1).all()

sparse_X = _convert_container(X, "sparse")
sparse_X = csr_container(X)

sparse_corr_coeffs = r_regression(sparse_X, y, center=center)
assert_allclose(sparse_corr_coeffs, corr_coeffs)
Expand Down
2 changes: 1 addition & 1 deletion sklearn/impute/tests/test_base.py
Expand Up @@ -96,7 +96,7 @@ def test_assign_where(X1_type):
rng = np.random.RandomState(0)

n_samples, n_features = 10, 5
X1 = _convert_container(rng.randn(n_samples, n_features), constructor_name=X1_type)
X1 = _convert_container(rng.randn(n_samples, n_features), constructor_type=X1_type)
X2 = rng.randn(n_samples, n_features)
mask = rng.randint(0, 2, size=(n_samples, n_features)).astype(bool)

Expand Down
26 changes: 17 additions & 9 deletions sklearn/impute/tests/test_impute.py
Expand Up @@ -22,7 +22,6 @@
from sklearn.pipeline import Pipeline, make_union
from sklearn.random_projection import _sparse_random_matrix
from sklearn.utils._testing import (
_convert_container,
assert_allclose,
assert_allclose_dense_sparse,
assert_array_almost_equal,
Expand Down Expand Up @@ -1667,15 +1666,19 @@ def test_imputer_transform_preserves_numeric_dtype(dtype_test):
assert X_trans.dtype == dtype_test


@pytest.mark.parametrize("array_type", ["array", "sparse"])
@pytest.mark.parametrize("csr_container", [None] + CSR_CONTAINERS)
@pytest.mark.parametrize("keep_empty_features", [True, False])
def test_simple_imputer_constant_keep_empty_features(array_type, keep_empty_features):
def test_simple_imputer_constant_keep_empty_features(
csr_container, keep_empty_features
):
"""Check the behaviour of `keep_empty_features` with `strategy='constant'`.
For backward compatibility, a column full of missing values will always be
filled and never dropped.
"""
X = np.array([[np.nan, 2], [np.nan, 3], [np.nan, 6]])
X = _convert_container(X, array_type)
if csr_container is not None:
X = csr_container(X)

fill_value = 10
imputer = SimpleImputer(
strategy="constant",
Expand All @@ -1687,28 +1690,33 @@ def test_simple_imputer_constant_keep_empty_features(array_type, keep_empty_feat
X_imputed = getattr(imputer, method)(X)
assert X_imputed.shape == X.shape
constant_feature = (
X_imputed[:, 0].toarray() if array_type == "sparse" else X_imputed[:, 0]
X_imputed[:, 0] if csr_container is None else X_imputed[:, [0]].toarray()
)
assert_array_equal(constant_feature, fill_value)


@pytest.mark.parametrize("array_type", ["array", "sparse"])
@pytest.mark.parametrize("strategy", ["mean", "median", "most_frequent"])
@pytest.mark.parametrize("csr_container", [None] + CSR_CONTAINERS)
@pytest.mark.parametrize("keep_empty_features", [True, False])
def test_simple_imputer_keep_empty_features(strategy, array_type, keep_empty_features):
def test_simple_imputer_keep_empty_features(
strategy, csr_container, keep_empty_features
):
"""Check the behaviour of `keep_empty_features` with all strategies but
'constant'.
"""
X = np.array([[np.nan, 2], [np.nan, 3], [np.nan, 6]])
X = _convert_container(X, array_type)
if csr_container is not None:
X = csr_container(X)
imputer = SimpleImputer(strategy=strategy, keep_empty_features=keep_empty_features)

for method in ["fit_transform", "transform"]:
X_imputed = getattr(imputer, method)(X)
if keep_empty_features:
assert X_imputed.shape == X.shape
constant_feature = (
X_imputed[:, 0].toarray() if array_type == "sparse" else X_imputed[:, 0]
X_imputed[:, 0]
if csr_container is None
else X_imputed[:, [0]].toarray()
)
assert_array_equal(constant_feature, 0)
else:
Expand Down