diff --git a/HISTORY.rst b/HISTORY.rst
index 4b499475..dd2691bd 100644
--- a/HISTORY.rst
+++ b/HISTORY.rst
@@ -2,7 +2,12 @@
 History
 =======
 
-0.1.8 (2024-08-29)
+0.1.10 (2024-??-??)
+-------------------
+* Long EM and RPCA operations wrapped with tqdm progress bars
+* README code sample updated and results table made consistent
+
+0.1.9 (2024-08-29)
 ------------------
 * Tutorials reproducibility improved with random_state parameters
 * RPCA now accepts random_state parameters
diff --git a/README.rst b/README.rst
index 1f292ebf..9f54849a 100644
--- a/README.rst
+++ b/README.rst
@@ -70,26 +70,26 @@ With just these few lines of code, you can see how easy it is to
     from qolmat.utils import data
 
     # load and prepare csv data
+
     df_data = data.get_data("Beijing")
     columns = ["TEMP", "PRES", "WSPM"]
     df_data = df_data[columns]
     df_with_nan = data.add_holes(df_data, ratio_masked=0.2, mean_size=120)
 
     # impute and compare
-    imputer_mean = imputers.ImputerSimple(strategy="mean", groups=("station",))
+    imputer_median = imputers.ImputerSimple(groups=("station",))
     imputer_interpol = imputers.ImputerInterpolation(method="linear", groups=("station",))
     imputer_var1 = imputers.ImputerEM(model="VAR", groups=("station",), method="mle", max_iter_em=50, n_iter_ou=15, dt=1e-3, p=1)
     dict_imputers = {
-        "mean": imputer_mean,
+        "median": imputer_median,
         "interpolation": imputer_interpol,
         "VAR(1) process": imputer_var1
     }
     generator_holes = missing_patterns.EmpiricalHoleGenerator(n_splits=4, ratio_masked=0.1)
     comparison = comparator.Comparator(
         dict_imputers,
-        columns,
         generator_holes = generator_holes,
-        metrics = ["mae", "wmape", "kl_columnwise", "ks_test", "energy"],
+        metrics = ["mae", "wmape", "kl_columnwise", "frechet"],
     )
     results = comparison.compare(df_with_nan)
     results.style.highlight_min(color="lightsteelblue", axis=1)
diff --git a/docs/images/readme_tabular_comparison.png b/docs/images/readme_tabular_comparison.png
index f6676807..8dddeda9 100644
Binary files a/docs/images/readme_tabular_comparison.png and b/docs/images/readme_tabular_comparison.png differ
diff --git a/examples/tutorials/plot_tuto_benchmark_TS.py b/examples/tutorials/plot_tuto_benchmark_TS.py
index 3a2c27b5..be9e77ff 100644
--- a/examples/tutorials/plot_tuto_benchmark_TS.py
+++ b/examples/tutorials/plot_tuto_benchmark_TS.py
@@ -128,7 +128,6 @@
 
 comparison = comparator.Comparator(
     dict_imputers,
-    cols_to_impute,
     generator_holes=generator_holes,
     metrics=["mae", "wmape", "kl_columnwise", "wasserstein_columnwise"],
     max_evals=10,
diff --git a/examples/tutorials/plot_tuto_categorical.py b/examples/tutorials/plot_tuto_categorical.py
index f8c9691f..dfabd812 100644
--- a/examples/tutorials/plot_tuto_categorical.py
+++ b/examples/tutorials/plot_tuto_categorical.py
@@ -89,7 +89,6 @@
 
 comparison = comparator.Comparator(
     dict_imputers,
-    cols_to_impute,
     generator_holes=generator_holes,
     metrics=metrics,
     max_evals=2,
diff --git a/examples/tutorials/plot_tuto_diffusion_models.py b/examples/tutorials/plot_tuto_diffusion_models.py
index e18044bb..398712e6 100644
--- a/examples/tutorials/plot_tuto_diffusion_models.py
+++ b/examples/tutorials/plot_tuto_diffusion_models.py
@@ -169,7 +169,6 @@
 
 comparison = comparator.Comparator(
     dict_imputers,
-    selected_columns=df_data.columns,
     generator_holes=missing_patterns.UniformHoleGenerator(n_splits=2, random_state=rng),
     metrics=["mae", "kl_columnwise"],
 )
@@ -224,7 +223,6 @@
 
 comparison = comparator.Comparator(
     dict_imputers,
-    selected_columns=df_data.columns,
     generator_holes=missing_patterns.UniformHoleGenerator(n_splits=2, random_state=rng),
     metrics=["mae", "kl_columnwise"],
 )
diff --git a/examples/tutorials/plot_tuto_mean_median.py b/examples/tutorials/plot_tuto_mean_median.py
index 2508bfdf..021a2a18 100644
--- a/examples/tutorials/plot_tuto_mean_median.py
+++ b/examples/tutorials/plot_tuto_mean_median.py
@@ -123,7 +123,6 @@
 
 comparison = comparator.Comparator(
     dict_imputers,
-    cols_to_impute,
     generator_holes=generator_holes,
     metrics=metrics,
     max_evals=5,
diff --git a/pyproject.toml b/pyproject.toml
index ca6327cb..7f3d6c37 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -45,6 +45,7 @@ statsmodels = ">= 0.14.0"
 typed-ast = { version = "*", optional = true }
 category-encoders = "^2.6.3"
 dcor = ">= 0.6"
+tqdm = "*"
 
 [tool.poetry.group.torch.dependencies]
 torch = "< 2.5"
diff --git a/qolmat/benchmark/comparator.py b/qolmat/benchmark/comparator.py
index afcf2094..46bfdb7a 100644
--- a/qolmat/benchmark/comparator.py
+++ b/qolmat/benchmark/comparator.py
@@ -28,9 +28,6 @@ class Comparator:
     ----------
     dict_models: Dict[str, any]
         dictionary of imputation methods
-    selected_columns: List[str]
-        list of column's names selected (all with at least one null value will
-        be imputed)
     columnwise_evaluation : Optional[bool], optional
         whether the metric should be calculated column-wise or not,
         by default False
@@ -46,7 +43,6 @@
     def __init__(
         self,
         dict_models: Dict[str, Any],
-        selected_columns: List[str],
         generator_holes: _HoleGenerator,
         metrics: List = ["mae", "wmape", "kl_columnwise"],
         dict_config_opti: Optional[Dict[str, Any]] = {},
@@ -55,7 +51,6 @@ def __init__(
         verbose: bool = False,
     ):
         self.dict_imputers = dict_models
-        self.selected_columns = selected_columns
         self.generator_holes = generator_holes
         self.metrics = metrics
         self.dict_config_opti = dict_config_opti
diff --git a/qolmat/benchmark/metrics.py b/qolmat/benchmark/metrics.py
index 7286dbbd..3f826ae7 100644
--- a/qolmat/benchmark/metrics.py
+++ b/qolmat/benchmark/metrics.py
@@ -835,6 +835,7 @@ def sum_pairwise_distances(
 def frechet_distance_base(
     df1: pd.DataFrame,
     df2: pd.DataFrame,
+    df_mask: pd.DataFrame,
 ) -> pd.Series:
     """Compute the Fréchet distance between two dataframes df1 and df2.
 
@@ -853,6 +854,8 @@ def frechet_distance_base(
         true dataframe
     df2 : pd.DataFrame
         predicted dataframe
+    df_mask : pd.DataFrame
+        Elements of the dataframes to compute on
 
     Returns
     -------
@@ -860,9 +863,15 @@
         Frechet distance in a Series object
 
     """
-    if df1.shape != df2.shape:
+    if df1.shape != df2.shape or df1.shape != df_mask.shape:
         raise Exception("inputs have to be of same dimensions.")
 
+    df1 = df1.copy()
+    df2 = df2.copy()
+    # Set to nan the values not in the mask
+    df1[~df_mask] = np.nan
+    df2[~df_mask] = np.nan
+
     std = (np.std(df1) + np.std(df2) + EPS) / 2
     mu = (np.nanmean(df1, axis=0) + np.nanmean(df2, axis=0)) / 2
     df1 = (df1 - mu) / std
@@ -911,7 +920,7 @@ def frechet_distance(
 
     """
     if method == "single":
-        return frechet_distance_base(df1, df2)
+        return frechet_distance_base(df1, df2, df_mask)
     return pattern_based_weighted_mean_metric(
         df1,
         df2,
diff --git a/qolmat/imputations/em_sampler.py b/qolmat/imputations/em_sampler.py
index 449b4f68..d6845079 100644
--- a/qolmat/imputations/em_sampler.py
+++ b/qolmat/imputations/em_sampler.py
@@ -11,6 +11,7 @@
 from scipy import optimize as spo
 from sklearn import utils as sku
 from sklearn.base import BaseEstimator, TransformerMixin
+from tqdm import tqdm
 
 from qolmat.utils import utils
 from qolmat.utils.utils import RandomSetting
@@ -433,7 +434,11 @@ def fit_X(self, X: NDArray) -> None:
 
         X = self._maximize_likelihood(X_imp, mask_na)
 
-        for iter_em in range(self.max_iter_em):
+        for iter_em in tqdm(
+            range(self.max_iter_em),
+            desc="EM parameters estimation",
+            disable=not self.verbose,
+        ):
             X = self._sample_ou(X, mask_na)
 
             self.combine_parameters()
@@ -474,6 +479,8 @@ def fit(self, X: NDArray) -> "EM":
         if hasattr(self, "p_to_fit") and self.p_to_fit:
             aics: List[float] = []
             for p in range(self.max_lagp + 1):
+                if self.verbose:
+                    print(f"p={p}")
                 self.p = p
                 self.fit_X(X)
                 n1, n2 = self.X.shape
diff --git a/qolmat/imputations/imputers_pytorch.py b/qolmat/imputations/imputers_pytorch.py
index 941ceef0..cad9a3ce 100644
--- a/qolmat/imputations/imputers_pytorch.py
+++ b/qolmat/imputations/imputers_pytorch.py
@@ -8,6 +8,7 @@
 import pandas as pd
 from numpy.typing import NDArray
 from sklearn.preprocessing import StandardScaler
+from tqdm import tqdm
 
 # from typing_extensions import Self
 from qolmat.benchmark import metrics
@@ -106,23 +107,29 @@ def _fit_estimator(
         optimizer = optim.Adam(estimator.parameters(), lr=self.learning_rate)
         loss_fn = self.loss_fn
 
-        for epoch in range(self.epochs):
-            estimator.train()
-            optimizer.zero_grad()
-
-            input_data = torch.Tensor(X.values)
-            target_data = torch.Tensor(y.values)
-            target_data = target_data.unsqueeze(1)
-            outputs = estimator(input_data)
-            loss = loss_fn(outputs, target_data)
-
-            loss.backward()
-            optimizer.step()
-            if (epoch + 1) % 10 == 0:
-                logging.info(
-                    f"Epoch [{epoch + 1}/{self.epochs}], "
-                    f"Loss: {loss.item():.4f}"
-                )
+        # if X.shape[1] != estimator[0].in_features:
+        #     raise ValueError(
+        #         "The number of features in X does not match the input "
+        #         "features of the estimator. The estimator expects"
+        #         f" {estimator[0].in_features} features, but X has "
+        #         f"{X.shape[1]} features."
+        #     )
+
+        with tqdm(total=self.epochs, desc="Training", unit="epoch") as pbar:
+            for _ in range(self.epochs):
+                estimator.train()
+                optimizer.zero_grad()
+
+                input_data = torch.Tensor(X.values)
+                target_data = torch.Tensor(y.values)
+                target_data = target_data.unsqueeze(1)
+                outputs = estimator(input_data)
+                loss = loss_fn(outputs, target_data)
+
+                loss.backward()
+                optimizer.step()
+                pbar.set_postfix(loss=f"{loss.item():.4f}")
+                pbar.update(1)
         return estimator
 
     def _predict_estimator(
diff --git a/qolmat/imputations/rpca/rpca_noisy.py b/qolmat/imputations/rpca/rpca_noisy.py
index 59164a3b..7809c87a 100644
--- a/qolmat/imputations/rpca/rpca_noisy.py
+++ b/qolmat/imputations/rpca/rpca_noisy.py
@@ -11,6 +11,7 @@
 from scipy.sparse import dok_matrix, identity
 from scipy.sparse.linalg import spsolve
 from sklearn import utils as sku
+from tqdm import tqdm
 
 from qolmat.imputations.rpca import rpca_utils
 from qolmat.imputations.rpca.rpca import RPCA
@@ -200,6 +201,7 @@ def decompose_with_basis(
             max_iterations=self.max_iterations,
             tolerance=self.tolerance,
             norm=self.norm,
+            verbose=self.verbose,
         )
 
         self._check_cost_function_minimized(D, M, A, Omega, tau, lam)
@@ -219,6 +221,7 @@ def minimise_loss(
         max_iterations: int = 10000,
         tolerance: float = 1e-6,
         norm: str = "L2",
+        verbose: bool = False,
     ) -> Tuple:
         """Compute the noisy RPCA with a L2 time penalisation.
 
@@ -255,6 +258,9 @@ def minimise_loss(
             consecutive iterations. Defaults to 1e-6.
         norm : str, optional
             Error norm, can be "L1" or "L2". Defaults to "L2".
+        verbose : bool, optional
+            Verbosity level, if False the progress bar is silenced.
+            Defaults to False.
 
         Returns
         -------
@@ -311,63 +317,73 @@
         Ir = np.eye(rank)
         In = identity(n_rows)
 
-        for _ in range(max_iterations):
-            M_temp = M.copy()
-            A_temp = A.copy()
-            L_temp = L.copy()
-            Q_temp = Q.copy()
-            if norm == "L1":
-                R_temp = R.copy()
-                sums = np.zeros((n_rows, n_cols))
-                for i_period, _ in enumerate(list_periods):
-                    sums += mu * R[i_period] - list_H[i_period] @ Y
-
-                M = spsolve(
-                    (1 + mu) * In + HtH,
-                    D - A + mu * L @ Q - Y + sums,
-                )
-            else:
-                M = spsolve(
-                    (1 + mu) * In + 2 * HtH,
-                    D - A + mu * L @ Q - Y,
-                )
-            M = M.reshape(D.shape)
-
-            A_Omega = rpca_utils.soft_thresholding(D - M, lam)
-            A_Omega_C = D - M
-            A = np.where(Omega, A_Omega, A_Omega_C)
-
-            Q = scp.linalg.solve(
-                a=tau * Ir + mu * (L.T @ L),
-                b=L.T @ (mu * M + Y),
-            )
-
-            L = scp.linalg.solve(
-                a=tau * Ir + mu * (Q @ Q.T),
-                b=Q @ (mu * M.T + Y.T),
-            ).T
-
-            Y += mu * (M - L @ Q)
-            if norm == "L1":
-                for i_period, _ in enumerate(list_periods):
-                    eta = list_etas[i_period]
-                    R[i_period] = rpca_utils.soft_thresholding(
-                        R[i_period] / mu, eta / mu
-                    )
-
-            mu = min(mu * rho, mu_bar)
-
-            Mc = np.linalg.norm(M - M_temp, np.inf)
-            Ac = np.linalg.norm(A - A_temp, np.inf)
-            Lc = np.linalg.norm(L - L_temp, np.inf)
-            Qc = np.linalg.norm(Q - Q_temp, np.inf)
-            error_max = max([Mc, Ac, Lc, Qc])  # type: ignore # noqa
-            if norm == "L1":
-                for i_period, _ in enumerate(list_periods):
-                    Rc = np.linalg.norm(R[i_period] - R_temp[i_period], np.inf)
-                    error_max = max(error_max, Rc)  # type: ignore # noqa
-
-            if error_max < tolerance:
-                break
+        with tqdm(
+            total=max_iterations,
+            desc="Noisy RPCA loss minimization",
+            unit="iteration",
+            disable=not verbose,
+        ) as pbar:
+            for _ in range(max_iterations):
+                M_temp = M.copy()
+                A_temp = A.copy()
+                L_temp = L.copy()
+                Q_temp = Q.copy()
+                if norm == "L1":
+                    R_temp = R.copy()
+                    sums = np.zeros((n_rows, n_cols))
+                    for i_period, _ in enumerate(list_periods):
+                        sums += mu * R[i_period] - list_H[i_period] @ Y
+
+                    M = spsolve(
+                        (1 + mu) * In + HtH,
+                        D - A + mu * L @ Q - Y + sums,
+                    )
+                else:
+                    M = spsolve(
+                        (1 + mu) * In + 2 * HtH,
+                        D - A + mu * L @ Q - Y,
+                    )
+                M = M.reshape(D.shape)
+
+                A_Omega = rpca_utils.soft_thresholding(D - M, lam)
+                A_Omega_C = D - M
+                A = np.where(Omega, A_Omega, A_Omega_C)
+
+                Q = scp.linalg.solve(
+                    a=tau * Ir + mu * (L.T @ L),
+                    b=L.T @ (mu * M + Y),
+                )
+
+                L = scp.linalg.solve(
+                    a=tau * Ir + mu * (Q @ Q.T),
+                    b=Q @ (mu * M.T + Y.T),
+                ).T
+
+                Y += mu * (M - L @ Q)
+                if norm == "L1":
+                    for i_period, _ in enumerate(list_periods):
+                        eta = list_etas[i_period]
+                        R[i_period] = rpca_utils.soft_thresholding(
+                            R[i_period] / mu, eta / mu
+                        )
+
+                mu = min(mu * rho, mu_bar)
+
+                Mc = np.linalg.norm(M - M_temp, np.inf)
+                Ac = np.linalg.norm(A - A_temp, np.inf)
+                Lc = np.linalg.norm(L - L_temp, np.inf)
+                Qc = np.linalg.norm(Q - Q_temp, np.inf)
+                error_max = max([Mc, Ac, Lc, Qc])  # type: ignore # noqa
+                if norm == "L1":
+                    for i_period, _ in enumerate(list_periods):
+                        Rc = np.linalg.norm(
+                            R[i_period] - R_temp[i_period], np.inf
+                        )
+                        error_max = max(error_max, Rc)  # type: ignore # noqa
+
+                if error_max < tolerance:
+                    break
+                pbar.set_postfix(error=f"{error_max.item():.4f}")
+                pbar.update(1)
 
         M = L @ Q
diff --git a/qolmat/imputations/rpca/rpca_pcp.py b/qolmat/imputations/rpca/rpca_pcp.py
index afb6dea3..e018dbf9 100644
--- a/qolmat/imputations/rpca/rpca_pcp.py
+++ b/qolmat/imputations/rpca/rpca_pcp.py
@@ -8,6 +8,7 @@
 import numpy as np
 from numpy.typing import NDArray
 from sklearn import utils as sku
+from tqdm import tqdm
 
 from qolmat.imputations.rpca import rpca_utils
 from qolmat.imputations.rpca.rpca import RPCA
@@ -125,7 +126,11 @@ def decompose(self, D: NDArray, Omega: NDArray) -> Tuple[NDArray, NDArray]:
         errors: NDArray = np.full((self.max_iterations,), fill_value=np.nan)
 
         M: NDArray = D - A
-        for iteration in range(self.max_iterations):
+        for iteration in tqdm(
+            range(self.max_iterations),
+            desc="RPCA PCP decomposition",
+            disable=not self.verbose,
+        ):
             M = rpca_utils.svd_thresholding(D - A + Y / mu, 1 / mu)
             A = rpca_utils.soft_thresholding(D - M + Y / mu, lam / mu)
             A[~Omega] = (D - M)[~Omega]
diff --git a/qolmat/imputations/softimpute.py b/qolmat/imputations/softimpute.py
index 912a9294..63688812 100644
--- a/qolmat/imputations/softimpute.py
+++ b/qolmat/imputations/softimpute.py
@@ -10,6 +10,7 @@
 from numpy.typing import NDArray
 from sklearn import utils as sku
 from sklearn.base import BaseEstimator, TransformerMixin
+from tqdm import tqdm
 
 from qolmat.imputations.rpca import rpca_utils
 from qolmat.utils import utils
@@ -146,7 +147,11 @@ def decompose(self, X: NDArray, Omega: NDArray) -> Tuple[NDArray, NDArray]:
         B = V * D
         M = A @ B.T
         cost_start = SoftImpute.cost_function(X, M, A, Omega, tau)
-        for iter_ in range(self.max_iterations):
+        for iter_ in tqdm(
+            range(self.max_iterations),
+            desc="Soft Impute decomposition",
+            disable=not self.verbose,
+        ):
             U_old = U
             V_old = V
             D_old = D
diff --git a/tests/benchmark/test_comparator.py b/tests/benchmark/test_comparator.py
index 28c73e6e..ef870c0c 100644
--- a/tests/benchmark/test_comparator.py
+++ b/tests/benchmark/test_comparator.py
@@ -40,7 +40,6 @@ def generator_holes_mock(mocker: MockerFixture) -> _HoleGenerator:
 def comparator(generator_holes_mock: _HoleGenerator) -> Comparator:
     return Comparator(
         dict_models={},
-        selected_columns=["A", "B"],
         generator_holes=generator_holes_mock,
         metrics=["mae", "mse"],
     )
@@ -439,7 +438,6 @@ def test_compare_reproducibility():
     )
     comparator = Comparator(
         dict_models=dict_models,
-        selected_columns=df_data.columns,
         generator_holes=generator_holes,
         metrics=["mae", "mse"],
     )
diff --git a/tests/benchmark/test_metrics.py b/tests/benchmark/test_metrics.py
index 26fa0f7c..001a9267 100644
--- a/tests/benchmark/test_metrics.py
+++ b/tests/benchmark/test_metrics.py
@@ -171,12 +171,16 @@ def test_kl_divergence_gaussian(
 
 @pytest.mark.parametrize("df1", [df_incomplete])
 @pytest.mark.parametrize("df2", [df_imputed])
-def test_frechet_distance_base(df1: pd.DataFrame, df2: pd.DataFrame) -> None:
-    result = metrics.frechet_distance_base(df1, df1)
+@pytest.mark.parametrize("df_mask", [df_mask])
+def test_frechet_distance_base(
+    df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame
+) -> None:
+    result = metrics.frechet_distance_base(df1, df1, df_mask)
     np.testing.assert_allclose(result, 0, atol=1e-3)
 
-    result = metrics.frechet_distance_base(df1, df2)
-    np.testing.assert_allclose(result, 0.134, atol=1e-3)
+    result = metrics.frechet_distance_base(df1, df2, df_mask)
+    assert np.all(0 < result)
+    assert np.all(result < 1)
 
 
 @pytest.mark.parametrize("df1", [df_incomplete])
@@ -360,7 +364,7 @@ def test_exception_raise_different_shapes(
         df1, df2, df_mask
     )
     with pytest.raises(Exception):
-        metrics.frechet_distance_base(df1, df2)
+        metrics.frechet_distance_base(df1, df2, df_mask)
 
 
 @pytest.mark.parametrize("df1", [df_incomplete_cat])
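
Taken together, the hunks above drop the positional `selected_columns` argument from `Comparator` (columns to evaluate are now inferred from the data), add a required `df_mask` argument to `frechet_distance_base`, and put the long EM/RPCA loops behind `verbose`-gated tqdm bars. Below is a minimal sketch of the resulting call pattern, assembled from the README hunk above; the dataset name, columns, imputers, hole generator, and metric list are copied from that hunk, and the `qolmat.benchmark`/`qolmat.imputations` import paths are assumed to match the rest of the README rather than confirmed by this patch.

    from qolmat.benchmark import comparator, missing_patterns
    from qolmat.imputations import imputers
    from qolmat.utils import data

    # Load the Beijing dataset and mask 20% of the values, as in the README.
    df_data = data.get_data("Beijing")[["TEMP", "PRES", "WSPM"]]
    df_with_nan = data.add_holes(df_data, ratio_masked=0.2, mean_size=120)

    dict_imputers = {
        "median": imputers.ImputerSimple(groups=("station",)),
        "interpolation": imputers.ImputerInterpolation(method="linear", groups=("station",)),
    }
    generator_holes = missing_patterns.EmpiricalHoleGenerator(n_splits=4, ratio_masked=0.1)

    # No selected_columns argument: after this change, Comparator derives the
    # evaluated columns from the data and the hole generator.
    comparison = comparator.Comparator(
        dict_imputers,
        generator_holes=generator_holes,
        metrics=["mae", "wmape", "kl_columnwise", "frechet"],
    )
    results = comparison.compare(df_with_nan)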