scikit-learn · Charlie-XIAO · Mar 22, 2024 · Mar 22, 2024 · Mar 22, 2024 · Mar 22, 2024
diff --git a/doc/whats_new/v1.5.rst b/doc/whats_new/v1.5.rst
@@ -367,6 +367,10 @@ Changelog
   `axis=0` and supports indexing polars Series.
   :pr:`28521` by :user:`Yao Xiao <Charlie-XIAO>`.
 
+- |Fix| :func:`~utils._safe_indexing` now works correctly for scipy sparse arrays with
+  a scalar index.
+  :pr:`28681` by :user:`Yao Xiao <Charlie-XIAO>`.
+
 .. rubric:: Code and documentation contributors
 
 Thanks to everyone who has contributed to the maintenance and improvement of

diff --git a/sklearn/cluster/_hdbscan/tests/test_reachibility.py b/sklearn/cluster/_hdbscan/tests/test_reachibility.py
@@ -2,61 +2,61 @@
 import pytest
 
 from sklearn.cluster._hdbscan._reachability import mutual_reachability_graph
-from sklearn.utils._testing import (
-    _convert_container,
-    assert_allclose,
-)
+from sklearn.utils._testing import assert_allclose
+from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS
 
 
-def test_mutual_reachability_graph_error_sparse_format():
+@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
+def test_mutual_reachability_graph_error_sparse_format(csc_container):
     """Check that we raise an error if the sparse format is not CSR."""
     rng = np.random.RandomState(0)
     X = rng.randn(10, 10)
     X = X.T @ X
     np.fill_diagonal(X, 0.0)
-    X = _convert_container(X, "sparse_csc")
+    X = csc_container(X)
 
     err_msg = "Only sparse CSR matrices are supported"
     with pytest.raises(ValueError, match=err_msg):
         mutual_reachability_graph(X)
 
 
-@pytest.mark.parametrize("array_type", ["array", "sparse_csr"])
-def test_mutual_reachability_graph_inplace(array_type):
+@pytest.mark.parametrize("csr_container", [np.asarray] + CSR_CONTAINERS)
+def test_mutual_reachability_graph_inplace(csr_container):
     """Check that the operation is happening inplace."""
     rng = np.random.RandomState(0)
     X = rng.randn(10, 10)
     X = X.T @ X
     np.fill_diagonal(X, 0.0)
-    X = _convert_container(X, array_type)
+    X = csr_container(X)
 
     mr_graph = mutual_reachability_graph(X)
 
     assert id(mr_graph) == id(X)
 
 
-def test_mutual_reachability_graph_equivalence_dense_sparse():
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_mutual_reachability_graph_equivalence_dense_sparse(csr_container):
     """Check that we get the same results for dense and sparse implementation."""
     rng = np.random.RandomState(0)
     X = rng.randn(5, 5)
     X_dense = X.T @ X
-    X_sparse = _convert_container(X_dense, "sparse_csr")
+    X_sparse = csr_container(X_dense)
 
     mr_graph_dense = mutual_reachability_graph(X_dense, min_samples=3)
     mr_graph_sparse = mutual_reachability_graph(X_sparse, min_samples=3)
 
     assert_allclose(mr_graph_dense, mr_graph_sparse.toarray())
 
 
-@pytest.mark.parametrize("array_type", ["array", "sparse_csr"])
+@pytest.mark.parametrize("csr_container", [np.asarray] + CSR_CONTAINERS)
 @pytest.mark.parametrize("dtype", [np.float32, np.float64])
-def test_mutual_reachability_graph_preserve_dtype(array_type, dtype):
+def test_mutual_reachability_graph_preserve_dtype(csr_container, dtype):
-    """Check that the computation preserve dtype thanks to fused types."""
+    """Check that the computation preserves dtype thanks to fused types."""
     rng = np.random.RandomState(0)
     X = rng.randn(10, 10)
     X = (X.T @ X).astype(dtype)
     np.fill_diagonal(X, 0.0)
-    X = _convert_container(X, array_type)
+    X = csr_container(X, dtype=dtype)
 
     assert X.dtype == dtype
     mr_graph = mutual_reachability_graph(X)

diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py
@@ -172,16 +172,16 @@ def test_column_transformer_tuple_transformers_parameter():
     )
 
 
-@pytest.mark.parametrize("constructor_name", ["dataframe", "polars"])
-def test_column_transformer_dataframe(constructor_name):
-    if constructor_name == "dataframe":
-        dataframe_lib = pytest.importorskip("pandas")
-    else:
-        dataframe_lib = pytest.importorskip(constructor_name)
+@pytest.mark.parametrize("constructor_lib", ["pandas", "polars"])
+def test_column_transformer_dataframe(constructor_lib):
+    dataframe_lib = pytest.importorskip(constructor_lib)
 
     X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
     X_df = _convert_container(
-        X_array, constructor_name, columns_name=["first", "second"]
+        X_array,
+        "dataframe",
+        constructor_lib=constructor_lib,
+        column_names=["first", "second"],
     )
 
     X_res_first = np.array([0, 1, 2]).reshape(-1, 1)
@@ -206,7 +206,7 @@ def test_column_transformer_dataframe(constructor_name):
         (np.array([True, False]), X_res_first),
         ([True, False], X_res_first),
     ]
-    if constructor_name == "dataframe":
+    if constructor_lib == "pandas":
         # Scalars are only supported for pandas dataframes.
         cases.extend(
             [
@@ -306,7 +306,7 @@ def transform(self, X, y=None):
     )
     ct.fit_transform(X_df)
 
-    if constructor_name == "dataframe":
+    if constructor_lib == "pandas":
         # DataFrame protocol does not have 1d columns, so we only test on Pandas
         # dataframes.
         ct = ColumnTransformer(

diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py
@@ -1473,8 +1473,9 @@ def test_dataframe_categorical_results_same_as_ndarray(
     f_cat = [f"cat{c:0>3}" for c in f_cat]
     X_df = _convert_container(
         np.asarray([f_num, f_cat]).T,
-        dataframe_lib,
-        ["f_num", "f_cat"],
+        "dataframe",
+        constructor_lib=dataframe_lib,
+        column_names=["f_num", "f_cat"],
         categorical_feature_names=["f_cat"],
     )
 
@@ -1517,7 +1518,11 @@ def test_dataframe_categorical_errors(dataframe_lib, HistGradientBoosting):
     rng = np.random.RandomState(42)
     f_cat = rng.randint(0, high=100, size=100).astype(str)
     X_df = _convert_container(
-        f_cat[:, None], dataframe_lib, ["f_cat"], categorical_feature_names=["f_cat"]
+        f_cat[:, None],
+        "dataframe",
+        constructor_lib=dataframe_lib,
+        column_names=["f_cat"],
+        categorical_feature_names=["f_cat"],
     )
     y = rng.randint(0, high=2, size=100)
 
@@ -1543,14 +1548,16 @@ def test_categorical_different_order_same_model(dataframe_lib):
     f_cat_b_a = np.asarray(["B", "A"])[f_ints]
     df_a_b = _convert_container(
         f_cat_a_b[:, None],
-        dataframe_lib,
-        ["f_cat"],
+        "dataframe",
+        constructor_lib=dataframe_lib,
+        column_names=["f_cat"],
         categorical_feature_names=["f_cat"],
     )
     df_b_a = _convert_container(
         f_cat_b_a[:, None],
-        dataframe_lib,
-        ["f_cat"],
+        "dataframe",
+        constructor_lib=dataframe_lib,
+        column_names=["f_cat"],
         categorical_feature_names=["f_cat"],
     )
 

diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_monotonic_contraints.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_monotonic_contraints.py
@@ -218,9 +218,11 @@ def test_predictions(global_random_seed, use_feature_names):
     f_0 = rng.rand(n_samples)  # positive correlation with y
-    f_1 = rng.rand(n_samples)  # negative correslation with y
+    f_1 = rng.rand(n_samples)  # negative correlation with y
     X = np.c_[f_0, f_1]
-    columns_name = ["f_0", "f_1"]
-    constructor_name = "dataframe" if use_feature_names else "array"
-    X = _convert_container(X, constructor_name, columns_name=columns_name)
+    convert_container_kwargs = {
+        "constructor_type": "dataframe" if use_feature_names else "array",
+        "column_names": ["f_0", "f_1"],  # no effect if array
+    }
+    X = _convert_container(X, **convert_container_kwargs)
 
     noise = rng.normal(loc=0.0, scale=0.01, size=n_samples)
     y = 5 * f_0 + np.sin(10 * np.pi * f_0) - 5 * f_1 - np.cos(10 * np.pi * f_1) + noise
@@ -250,24 +252,24 @@ def test_predictions(global_random_seed, use_feature_names):
     # First feature (POS)
     # assert pred is all increasing when f_0 is all increasing
     X = np.c_[linspace, constant]
-    X = _convert_container(X, constructor_name, columns_name=columns_name)
+    X = _convert_container(X, **convert_container_kwargs)
     pred = gbdt.predict(X)
     assert is_increasing(pred)
     # assert pred actually follows the variations of f_0
     X = np.c_[sin, constant]
-    X = _convert_container(X, constructor_name, columns_name=columns_name)
+    X = _convert_container(X, **convert_container_kwargs)
     pred = gbdt.predict(X)
     assert np.all((np.diff(pred) >= 0) == (np.diff(sin) >= 0))
 
     # Second feature (NEG)
     # assert pred is all decreasing when f_1 is all increasing
     X = np.c_[constant, linspace]
-    X = _convert_container(X, constructor_name, columns_name=columns_name)
+    X = _convert_container(X, **convert_container_kwargs)
     pred = gbdt.predict(X)
     assert is_decreasing(pred)
     # assert pred actually follows the inverse variations of f_1
     X = np.c_[constant, sin]
-    X = _convert_container(X, constructor_name, columns_name=columns_name)
+    X = _convert_container(X, **convert_container_kwargs)
     pred = gbdt.predict(X)
     assert ((np.diff(pred) <= 0) == (np.diff(sin) >= 0)).all()
 

diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py
@@ -49,7 +49,6 @@
 from sklearn.svm import LinearSVC
 from sklearn.tree._classes import SPARSE_SPLITTERS
 from sklearn.utils._testing import (
-    _convert_container,
     assert_allclose,
     assert_almost_equal,
     assert_array_almost_equal,
@@ -461,7 +460,7 @@ def test_unfitted_feature_importances(name):
 
 
 @pytest.mark.parametrize("ForestClassifier", FOREST_CLASSIFIERS.values())
-@pytest.mark.parametrize("X_type", ["array", "sparse_csr", "sparse_csc"])
+@pytest.mark.parametrize("sparse_container", [None] + CSC_CONTAINERS + CSR_CONTAINERS)
 @pytest.mark.parametrize(
     "X, y, lower_bound_accuracy",
     [
@@ -488,10 +487,12 @@ def test_unfitted_feature_importances(name):
 )
 @pytest.mark.parametrize("oob_score", [True, partial(f1_score, average="micro")])
 def test_forest_classifier_oob(
-    ForestClassifier, X, y, X_type, lower_bound_accuracy, oob_score
+    ForestClassifier, X, y, sparse_container, lower_bound_accuracy, oob_score
 ):
     """Check that OOB score is close to score on a test set."""
-    X = _convert_container(X, constructor_name=X_type)
+    if sparse_container is not None:
+        X = sparse_container(X)
+
     X_train, X_test, y_train, y_test = train_test_split(
         X,
         y,
@@ -529,7 +530,7 @@ def test_forest_classifier_oob(
 
 
 @pytest.mark.parametrize("ForestRegressor", FOREST_REGRESSORS.values())
-@pytest.mark.parametrize("X_type", ["array", "sparse_csr", "sparse_csc"])
+@pytest.mark.parametrize("sparse_container", [None] + CSC_CONTAINERS + CSR_CONTAINERS)
 @pytest.mark.parametrize(
     "X, y, lower_bound_r2",
     [
@@ -548,10 +549,14 @@ def test_forest_classifier_oob(
     ],
 )
 @pytest.mark.parametrize("oob_score", [True, explained_variance_score])
-def test_forest_regressor_oob(ForestRegressor, X, y, X_type, lower_bound_r2, oob_score):
+def test_forest_regressor_oob(
+    ForestRegressor, X, y, sparse_container, lower_bound_r2, oob_score
+):
     """Check that forest-based regressor provide an OOB score close to the
     score on a test set."""
-    X = _convert_container(X, constructor_name=X_type)
+    if sparse_container is not None:
+        X = sparse_container(X)
+
     X_train, X_test, y_train, y_test = train_test_split(
         X,
         y,

diff --git a/sklearn/feature_selection/tests/test_feature_select.py b/sklearn/feature_selection/tests/test_feature_select.py
@@ -27,7 +27,6 @@
 )
 from sklearn.utils import safe_mask
 from sklearn.utils._testing import (
-    _convert_container,
     assert_almost_equal,
     assert_array_almost_equal,
     assert_array_equal,
@@ -94,7 +93,8 @@ def test_f_classif(csr_container):
 
 
 @pytest.mark.parametrize("center", [True, False])
-def test_r_regression(center):
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_r_regression(center, csr_container):
     X, y = make_regression(
         n_samples=2000, n_features=20, n_informative=5, shuffle=False, random_state=0
     )
@@ -103,7 +103,7 @@ def test_r_regression(center):
     assert (-1 < corr_coeffs).all()
     assert (corr_coeffs < 1).all()
 
-    sparse_X = _convert_container(X, "sparse")
+    sparse_X = csr_container(X)
 
     sparse_corr_coeffs = r_regression(sparse_X, y, center=center)
     assert_allclose(sparse_corr_coeffs, corr_coeffs)

diff --git a/sklearn/impute/tests/test_base.py b/sklearn/impute/tests/test_base.py
@@ -96,7 +96,7 @@ def test_assign_where(X1_type):
     rng = np.random.RandomState(0)
 
     n_samples, n_features = 10, 5
-    X1 = _convert_container(rng.randn(n_samples, n_features), constructor_name=X1_type)
+    X1 = _convert_container(rng.randn(n_samples, n_features), constructor_type=X1_type)
     X2 = rng.randn(n_samples, n_features)
     mask = rng.randint(0, 2, size=(n_samples, n_features)).astype(bool)
 

diff --git a/sklearn/impute/tests/test_impute.py b/sklearn/impute/tests/test_impute.py
@@ -22,7 +22,6 @@
 from sklearn.pipeline import Pipeline, make_union
 from sklearn.random_projection import _sparse_random_matrix
 from sklearn.utils._testing import (
-    _convert_container,
     assert_allclose,
     assert_allclose_dense_sparse,
     assert_array_almost_equal,
@@ -1667,15 +1666,19 @@ def test_imputer_transform_preserves_numeric_dtype(dtype_test):
     assert X_trans.dtype == dtype_test
 
 
-@pytest.mark.parametrize("array_type", ["array", "sparse"])
+@pytest.mark.parametrize("csr_container", [None] + CSR_CONTAINERS)
 @pytest.mark.parametrize("keep_empty_features", [True, False])
-def test_simple_imputer_constant_keep_empty_features(array_type, keep_empty_features):
+def test_simple_imputer_constant_keep_empty_features(
+    csr_container, keep_empty_features
+):
     """Check the behaviour of `keep_empty_features` with `strategy='constant'.
     For backward compatibility, a column full of missing values will always be
-    fill and never dropped.
+    filled and never dropped.
     """
     X = np.array([[np.nan, 2], [np.nan, 3], [np.nan, 6]])
-    X = _convert_container(X, array_type)
+    if csr_container is not None:
+        X = csr_container(X)
+
     fill_value = 10
     imputer = SimpleImputer(
         strategy="constant",
@@ -1687,28 +1690,33 @@ def test_simple_imputer_constant_keep_empty_features(array_type, keep_empty_feat
         X_imputed = getattr(imputer, method)(X)
         assert X_imputed.shape == X.shape
         constant_feature = (
-            X_imputed[:, 0].toarray() if array_type == "sparse" else X_imputed[:, 0]
+            X_imputed[:, 0] if csr_container is None else X_imputed[:, [0]].toarray()
         )
         assert_array_equal(constant_feature, fill_value)
 
 
-@pytest.mark.parametrize("array_type", ["array", "sparse"])
 @pytest.mark.parametrize("strategy", ["mean", "median", "most_frequent"])
+@pytest.mark.parametrize("csr_container", [None] + CSR_CONTAINERS)
 @pytest.mark.parametrize("keep_empty_features", [True, False])
-def test_simple_imputer_keep_empty_features(strategy, array_type, keep_empty_features):
+def test_simple_imputer_keep_empty_features(
+    strategy, csr_container, keep_empty_features
+):
     """Check the behaviour of `keep_empty_features` with all strategies but
     'constant'.
     """
     X = np.array([[np.nan, 2], [np.nan, 3], [np.nan, 6]])
-    X = _convert_container(X, array_type)
+    if csr_container is not None:
+        X = csr_container(X)
     imputer = SimpleImputer(strategy=strategy, keep_empty_features=keep_empty_features)
 
     for method in ["fit_transform", "transform"]:
         X_imputed = getattr(imputer, method)(X)
         if keep_empty_features:
             assert X_imputed.shape == X.shape
             constant_feature = (
-                X_imputed[:, 0].toarray() if array_type == "sparse" else X_imputed[:, 0]
+                X_imputed[:, 0]
+                if csr_container is None
+                else X_imputed[:, [0]].toarray()
             )
             assert_array_equal(constant_feature, 0)
         else: