diff --git a/asv_benchmarks/benchmarks/fastcan.py b/asv_benchmarks/benchmarks/fastcan.py
index c9ddb12..ce9e4e7 100644
--- a/asv_benchmarks/benchmarks/fastcan.py
+++ b/asv_benchmarks/benchmarks/fastcan.py
@@ -37,11 +37,7 @@ def setup_cache(self):
             else:
                 eta = False
                 beam_width = 10
-            estimator = FastCan(
-                n_features_to_select=20,
-                eta=eta,
-                beam_width=beam_width
-            )
+            estimator = FastCan(n_features_to_select=20, eta=eta, beam_width=beam_width)
             estimator.fit(X, y)
 
             est_path = get_estimator_path(self, params)
diff --git a/fastcan/_beam.py b/fastcan/_beam.py
index 752b5c9..732cd64 100644
--- a/fastcan/_beam.py
+++ b/fastcan/_beam.py
@@ -45,18 +45,14 @@ def _beam_search(
 
     for i in range(n_features_to_select - n_inclusions):
         if i == 0:
-            mask, X_selected = _prepare_candidates(
-                X, mask_exclude, indices_include
-            )
+            mask, X_selected = _prepare_candidates(X, mask_exclude, indices_include)
             if X_selected.shape[1] == 0:
                 beams_scores = np.sum((X.T @ V) ** 2, axis=1)
                 beams_scores[mask] = 0
             else:
                 W_selected = orth(X_selected)
                 selected_score = np.sum((W_selected.T @ V) ** 2)
-                beams_scores = _mgs_ssc(
-                    X, V, W_selected, mask, selected_score, tol
-                )
+                beams_scores = _mgs_ssc(X, V, W_selected, mask, selected_score, tol)
             beams_selected_ids = [indices_include for _ in range(beam_width)]
             beams_selected_ids, top_k_scores = _select_top_k(
                 beams_scores[None, :],
diff --git a/fastcan/_minibatch.py b/fastcan/_minibatch.py
index 2aae1e4..53974b9 100644
--- a/fastcan/_minibatch.py
+++ b/fastcan/_minibatch.py
@@ -5,6 +5,7 @@
 # Authors: The fastcan developers
 # SPDX-License-Identifier: MIT
 
+import warnings
 from numbers import Integral, Real
 
 import numpy as np
@@ -12,6 +13,7 @@
 from sklearn.utils._param_validation import Interval, validate_params
 from sklearn.utils.validation import check_X_y
 
+from ._beam import _safe_normalize
 from ._cancorr_fast import _greedy_search  # type: ignore[attr-defined]
 from ._fastcan import _prepare_search
 
@@ -101,11 +103,18 @@ def minibatch(X, y, n_features_to_select=1, batch_size=1, tol=0.01, verbose=1):
         )
     )
     X_transformed_ = X - X.mean(0)
-    y_transformed_ = y - y.mean(0)
+    y_transformed_, const_mask = _safe_normalize(y - y.mean(0))
+    if const_mask.any():
+        warnings.warn(
+            f"Found constant targets, whose indices are {np.where(const_mask)[0]}.",
+            UserWarning,
+        )
 
     indices_include = np.zeros(0, dtype=int)  # just an empty array
     indices_select = np.zeros(0, dtype=int)
     for i in range(n_outputs):
+        if const_mask[i]:
+            continue
         y_i = y_transformed_[:, [i]]
         n_selected_i = 0
         while n_to_select_split[i] > n_selected_i:
@@ -137,7 +146,9 @@
             n_selected_i += batch_size_temp
             if verbose == 1:
                 print(
-                    f"Progress: {indices_select.size}/{n_features_to_select}", end="\r"
+                    f"Progress: {indices_select.size}/{n_features_to_select}, "
+                    f"Batch SSC: {scores.sum():.5f}",
+                    end="\r",
                 )
     if verbose == 1:
         print()
diff --git a/pixi.toml b/pixi.toml
index 22abdee..db96c6b 100644
--- a/pixi.toml
+++ b/pixi.toml
@@ -84,8 +84,8 @@ scikit-learn = ">=1.6.0"
 fastcan = { path = ".", editable = true }
 
 [tasks]
-time-h = "python -m timeit -n 5 -s 'import numpy as np; from fastcan import FastCan; X = np.random.rand(3000, 100); y = np.random.rand(3000, 20)' 's = FastCan(100, verbose=0).fit(X, y)'"
-time-eta = "python -m timeit -n 5 -s 'import numpy as np; from fastcan import FastCan; X = np.random.rand(3000, 100); y = np.random.rand(3000, 20)' 's = FastCan(100, eta=True, verbose=0).fit(X, y)'"
+time-h = "python -m timeit -n 5 -s 'import numpy as np; from fastcan import FastCan; X = np.random.rand(30000, 100); y = np.random.rand(30000, 20)' 's = FastCan(100, verbose=0).fit(X, y)'"
+time-eta = "python -m timeit -n 5 -s 'import numpy as np; from fastcan import FastCan; X = np.random.rand(30000, 100); y = np.random.rand(30000, 20)' 's = FastCan(100, eta=True, verbose=0).fit(X, y)'"
 profile-minibatch = { cmd = '''python -c "import cProfile; import numpy as np; from fastcan import minibatch; X = np.random.rand(100, 3000); y = np.random.rand(100, 20); cProfile.run('minibatch(X, y, 1000, 10, verbose=0)', sort='{{ SORT }}')"''', args = [{ arg = "SORT", default = "cumtime" }] }
 time-narx = '''python -m timeit -n 1 -s "import numpy as np; from fastcan.narx import make_narx; rng = np.random.default_rng(5); X = rng.random((1000, 10)); y = rng.random((1000, 2)); m = make_narx(X, y, 10, max_delay=2, poly_degree=2, verbose=0)" "m.fit(X, y, coef_init='one_step_ahead', verbose=1)"'''
 profile-narx = { cmd = '''python -c "import cProfile; import numpy as np; from fastcan.narx import make_narx; rng = np.random.default_rng(8); X = rng.random((3000, 3)); y = rng.random((3000, 3)); m = make_narx(X, y, 10, max_delay=10, poly_degree=2, verbose=0); cProfile.run('m.fit(X, y, coef_init=[0]*33)', sort='{{ SORT }}')"''', args = [{ arg = "SORT", default = "tottime" }] }
diff --git a/tests/test_minibatch.py b/tests/test_minibatch.py
index b5b3a3b..34f4827 100644
--- a/tests/test_minibatch.py
+++ b/tests/test_minibatch.py
@@ -3,10 +3,11 @@
 import numpy as np
 import pytest
 from sklearn.cluster import KMeans
-from sklearn.datasets import load_iris, make_classification
+from sklearn.datasets import load_iris, make_classification, make_regression
 from sklearn.preprocessing import OneHotEncoder
 
 from fastcan import minibatch
+from fastcan.utils import ssc
 
 
 def test_data_pruning():
@@ -60,7 +61,7 @@ def test_select_minibatch_cls():
     assert indices.size == n_to_select
 
 
-def test_minibatch_error():
+def test_minibatch_error_warning():
     # Test refine raise error.
     n_samples = 200
     n_features = 20
@@ -83,3 +84,37 @@
 
     with pytest.raises(ValueError, match=r"n_features_to_select .*"):
         _ = minibatch(X, y, n_features + 1, batch_size=3)
+
+    Y = OneHotEncoder(sparse_output=False).fit_transform(y.reshape(-1, 1))
+    Y[:, 0] = 1
+    with pytest.warns(
+        UserWarning, match=r"Found constant targets, whose indices are .*"
+    ):
+        _ = minibatch(X, Y, 5, batch_size=3)
+
+
+def test_minibatch_ssc_aligned(capsys):
+    # Test whether the batch SSC printed by minibatch aligns with the true SSC score.
+    n_features = 20
+    n_targets = 5
+    n_to_select = 10
+    X, y = make_regression(
+        n_samples=100,
+        n_features=n_features,
+        n_informative=10,
+        n_targets=n_targets,
+        noise=0.1,
+        random_state=0,
+    )
+
+    # The last batch of features is selected for the last target.
+    # The number of features selected per target is n_to_select // n_targets.
+    n_features_per_target = n_to_select // n_targets
+    indices = minibatch(X, y, n_to_select, batch_size=n_features_per_target + 1)
+    captured = capsys.readouterr()
+
+    gtruth_ssc = ssc(X[:, indices[-n_features_per_target:]], y[:, [-1]])
+    assert (
+        f"Progress: {n_to_select}/{n_to_select}, "
+        f"Batch SSC: {gtruth_ssc:.5f}" in captured.out
+    )
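
For reference, the new constant-target handling in fastcan/_minibatch.py can be
exercised with the short script below. This is a minimal sketch, not part of the
patch: it assumes this branch is installed, and the data shapes, seed, and
variable names are arbitrary choices for illustration.

import warnings

import numpy as np

from fastcan import minibatch

rng = np.random.default_rng(0)
X = rng.random((100, 20))
Y = rng.random((100, 3))
Y[:, 0] = 1.0  # a constant target column: zero variance after centering

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    # Constant targets are skipped, so fewer than 6 indices may come back.
    indices = minibatch(X, Y, n_features_to_select=6, batch_size=2, verbose=0)

print(caught[0].message)  # Found constant targets, whose indices are [0].
print(indices.size)       # 4, since the constant target's share is skipped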