diff --git a/HISTORY.rst b/HISTORY.rst index 321864b2..dd2691bd 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -5,6 +5,7 @@ History 0.1.10 (2024-??-??) ------------------ * Long EM and RPCA operations wrapped with tqdm progress bars +* Readme code sample updated, and results table made consistant 0.1.9 (2024-08-29) ------------------ diff --git a/README.rst b/README.rst index 83af908f..9f54849a 100644 --- a/README.rst +++ b/README.rst @@ -88,9 +88,8 @@ With just these few lines of code, you can see how easy it is to generator_holes = missing_patterns.EmpiricalHoleGenerator(n_splits=4, ratio_masked=0.1) comparison = comparator.Comparator( dict_imputers, - columns, generator_holes = generator_holes, - metrics = ["mae", "wmape", "kl_columnwise", "ks_test", "energy"], + metrics = ["mae", "wmape", "kl_columnwise", "frechet"], ) results = comparison.compare(df_with_nan) results.style.highlight_min(color="lightsteelblue", axis=1) diff --git a/docs/images/readme_tabular_comparison.png b/docs/images/readme_tabular_comparison.png index f6676807..8dddeda9 100644 Binary files a/docs/images/readme_tabular_comparison.png and b/docs/images/readme_tabular_comparison.png differ diff --git a/examples/tutorials/plot_tuto_benchmark_TS.py b/examples/tutorials/plot_tuto_benchmark_TS.py index 3a2c27b5..be9e77ff 100644 --- a/examples/tutorials/plot_tuto_benchmark_TS.py +++ b/examples/tutorials/plot_tuto_benchmark_TS.py @@ -128,7 +128,6 @@ comparison = comparator.Comparator( dict_imputers, - cols_to_impute, generator_holes=generator_holes, metrics=["mae", "wmape", "kl_columnwise", "wasserstein_columnwise"], max_evals=10, diff --git a/examples/tutorials/plot_tuto_categorical.py b/examples/tutorials/plot_tuto_categorical.py index f8c9691f..dfabd812 100644 --- a/examples/tutorials/plot_tuto_categorical.py +++ b/examples/tutorials/plot_tuto_categorical.py @@ -89,7 +89,6 @@ comparison = comparator.Comparator( dict_imputers, - cols_to_impute, generator_holes=generator_holes, metrics=metrics, max_evals=2, diff --git a/examples/tutorials/plot_tuto_diffusion_models.py b/examples/tutorials/plot_tuto_diffusion_models.py index e18044bb..398712e6 100644 --- a/examples/tutorials/plot_tuto_diffusion_models.py +++ b/examples/tutorials/plot_tuto_diffusion_models.py @@ -169,7 +169,6 @@ comparison = comparator.Comparator( dict_imputers, - selected_columns=df_data.columns, generator_holes=missing_patterns.UniformHoleGenerator(n_splits=2, random_state=rng), metrics=["mae", "kl_columnwise"], ) @@ -224,7 +223,6 @@ comparison = comparator.Comparator( dict_imputers, - selected_columns=df_data.columns, generator_holes=missing_patterns.UniformHoleGenerator(n_splits=2, random_state=rng), metrics=["mae", "kl_columnwise"], ) diff --git a/examples/tutorials/plot_tuto_mean_median.py b/examples/tutorials/plot_tuto_mean_median.py index 2508bfdf..021a2a18 100644 --- a/examples/tutorials/plot_tuto_mean_median.py +++ b/examples/tutorials/plot_tuto_mean_median.py @@ -123,7 +123,6 @@ comparison = comparator.Comparator( dict_imputers, - cols_to_impute, generator_holes=generator_holes, metrics=metrics, max_evals=5, diff --git a/qolmat/benchmark/comparator.py b/qolmat/benchmark/comparator.py index afcf2094..46bfdb7a 100644 --- a/qolmat/benchmark/comparator.py +++ b/qolmat/benchmark/comparator.py @@ -28,9 +28,6 @@ class Comparator: ---------- dict_models: Dict[str, any] dictionary of imputation methods - selected_columns: List[str]Œ - list of column's names selected (all with at least one null value will - be imputed) columnwise_evaluation : Optional[bool], optional whether the metric should be calculated column-wise or not, by default False @@ -46,7 +43,6 @@ class Comparator: def __init__( self, dict_models: Dict[str, Any], - selected_columns: List[str], generator_holes: _HoleGenerator, metrics: List = ["mae", "wmape", "kl_columnwise"], dict_config_opti: Optional[Dict[str, Any]] = {}, @@ -55,7 +51,6 @@ def __init__( verbose: bool = False, ): self.dict_imputers = dict_models - self.selected_columns = selected_columns self.generator_holes = generator_holes self.metrics = metrics self.dict_config_opti = dict_config_opti diff --git a/qolmat/benchmark/metrics.py b/qolmat/benchmark/metrics.py index 7286dbbd..3f826ae7 100644 --- a/qolmat/benchmark/metrics.py +++ b/qolmat/benchmark/metrics.py @@ -835,6 +835,7 @@ def sum_pairwise_distances( def frechet_distance_base( df1: pd.DataFrame, df2: pd.DataFrame, + df_mask: pd.DataFrame, ) -> pd.Series: """Compute the Fréchet distance between two dataframes df1 and df2. @@ -853,6 +854,8 @@ def frechet_distance_base( true dataframe df2 : pd.DataFrame predicted dataframe + df_mask : pd.DataFrame + Elements of the dataframes to compute on Returns ------- @@ -860,9 +863,15 @@ def frechet_distance_base( Frechet distance in a Series object """ - if df1.shape != df2.shape: + if df1.shape != df2.shape or df1.shape != df_mask.shape: raise Exception("inputs have to be of same dimensions.") + df1 = df1.copy() + df2 = df2.copy() + # Set to nan the values not in the mask + df1[~df_mask] = np.nan + df2[~df_mask] = np.nan + std = (np.std(df1) + np.std(df2) + EPS) / 2 mu = (np.nanmean(df1, axis=0) + np.nanmean(df2, axis=0)) / 2 df1 = (df1 - mu) / std @@ -911,7 +920,7 @@ def frechet_distance( """ if method == "single": - return frechet_distance_base(df1, df2) + return frechet_distance_base(df1, df2, df_mask) return pattern_based_weighted_mean_metric( df1, df2, diff --git a/qolmat/imputations/imputers_pytorch.py b/qolmat/imputations/imputers_pytorch.py index 941ceef0..35e4644f 100644 --- a/qolmat/imputations/imputers_pytorch.py +++ b/qolmat/imputations/imputers_pytorch.py @@ -8,6 +8,7 @@ import pandas as pd from numpy.typing import NDArray from sklearn.preprocessing import StandardScaler +from tqdm import tqdm # from typing_extensions import Self from qolmat.benchmark import metrics @@ -106,23 +107,21 @@ def _fit_estimator( optimizer = optim.Adam(estimator.parameters(), lr=self.learning_rate) loss_fn = self.loss_fn - for epoch in range(self.epochs): - estimator.train() - optimizer.zero_grad() - - input_data = torch.Tensor(X.values) - target_data = torch.Tensor(y.values) - target_data = target_data.unsqueeze(1) - outputs = estimator(input_data) - loss = loss_fn(outputs, target_data) - - loss.backward() - optimizer.step() - if (epoch + 1) % 10 == 0: - logging.info( - f"Epoch [{epoch + 1}/{self.epochs}], " - f"Loss: {loss.item():.4f}" - ) + with tqdm(total=self.epochs, desc="Training", unit="epoch") as pbar: + for _ in range(self.epochs): + estimator.train() + optimizer.zero_grad() + + input_data = torch.Tensor(X.values) + target_data = torch.Tensor(y.values) + target_data = target_data.unsqueeze(1) + outputs = estimator(input_data) + loss = loss_fn(outputs, target_data) + + loss.backward() + optimizer.step() + pbar.set_postfix(loss=f"{loss.item():.4f}") + pbar.update(1) return estimator def _predict_estimator( diff --git a/qolmat/imputations/rpca/rpca_noisy.py b/qolmat/imputations/rpca/rpca_noisy.py index 62bd6f08..7809c87a 100644 --- a/qolmat/imputations/rpca/rpca_noisy.py +++ b/qolmat/imputations/rpca/rpca_noisy.py @@ -317,67 +317,73 @@ def minimise_loss( Ir = np.eye(rank) In = identity(n_rows) - for _ in tqdm( - range(max_iterations), + with tqdm( + total=max_iterations, desc="Noisy RPCA loss minimization", + unit="iteration", disable=not verbose, - ): - M_temp = M.copy() - A_temp = A.copy() - L_temp = L.copy() - Q_temp = Q.copy() - if norm == "L1": - R_temp = R.copy() - sums = np.zeros((n_rows, n_cols)) - for i_period, _ in enumerate(list_periods): - sums += mu * R[i_period] - list_H[i_period] @ Y - - M = spsolve( - (1 + mu) * In + HtH, - D - A + mu * L @ Q - Y + sums, - ) - else: - M = spsolve( - (1 + mu) * In + 2 * HtH, - D - A + mu * L @ Q - Y, - ) - M = M.reshape(D.shape) - - A_Omega = rpca_utils.soft_thresholding(D - M, lam) - A_Omega_C = D - M - A = np.where(Omega, A_Omega, A_Omega_C) - Q = scp.linalg.solve( - a=tau * Ir + mu * (L.T @ L), - b=L.T @ (mu * M + Y), - ) - - L = scp.linalg.solve( - a=tau * Ir + mu * (Q @ Q.T), - b=Q @ (mu * M.T + Y.T), - ).T - - Y += mu * (M - L @ Q) - if norm == "L1": - for i_period, _ in enumerate(list_periods): - eta = list_etas[i_period] - R[i_period] = rpca_utils.soft_thresholding( - R[i_period] / mu, eta / mu + ) as pbar: + for _ in range(max_iterations): + M_temp = M.copy() + A_temp = A.copy() + L_temp = L.copy() + Q_temp = Q.copy() + if norm == "L1": + R_temp = R.copy() + sums = np.zeros((n_rows, n_cols)) + for i_period, _ in enumerate(list_periods): + sums += mu * R[i_period] - list_H[i_period] @ Y + + M = spsolve( + (1 + mu) * In + HtH, + D - A + mu * L @ Q - Y + sums, ) + else: + M = spsolve( + (1 + mu) * In + 2 * HtH, + D - A + mu * L @ Q - Y, + ) + M = M.reshape(D.shape) + + A_Omega = rpca_utils.soft_thresholding(D - M, lam) + A_Omega_C = D - M + A = np.where(Omega, A_Omega, A_Omega_C) + Q = scp.linalg.solve( + a=tau * Ir + mu * (L.T @ L), + b=L.T @ (mu * M + Y), + ) - mu = min(mu * rho, mu_bar) - - Mc = np.linalg.norm(M - M_temp, np.inf) - Ac = np.linalg.norm(A - A_temp, np.inf) - Lc = np.linalg.norm(L - L_temp, np.inf) - Qc = np.linalg.norm(Q - Q_temp, np.inf) - error_max = max([Mc, Ac, Lc, Qc]) # type: ignore # noqa - if norm == "L1": - for i_period, _ in enumerate(list_periods): - Rc = np.linalg.norm(R[i_period] - R_temp[i_period], np.inf) - error_max = max(error_max, Rc) # type: ignore # noqa - - if error_max < tolerance: - break + L = scp.linalg.solve( + a=tau * Ir + mu * (Q @ Q.T), + b=Q @ (mu * M.T + Y.T), + ).T + + Y += mu * (M - L @ Q) + if norm == "L1": + for i_period, _ in enumerate(list_periods): + eta = list_etas[i_period] + R[i_period] = rpca_utils.soft_thresholding( + R[i_period] / mu, eta / mu + ) + + mu = min(mu * rho, mu_bar) + + Mc = np.linalg.norm(M - M_temp, np.inf) + Ac = np.linalg.norm(A - A_temp, np.inf) + Lc = np.linalg.norm(L - L_temp, np.inf) + Qc = np.linalg.norm(Q - Q_temp, np.inf) + error_max = max([Mc, Ac, Lc, Qc]) # type: ignore # noqa + if norm == "L1": + for i_period, _ in enumerate(list_periods): + Rc = np.linalg.norm( + R[i_period] - R_temp[i_period], np.inf + ) + error_max = max(error_max, Rc) # type: ignore # noqa + + if error_max < tolerance: + break + pbar.set_postfix(error=f"{error_max.item():.4f}") + pbar.update(1) M = L @ Q diff --git a/tests/benchmark/test_comparator.py b/tests/benchmark/test_comparator.py index 28c73e6e..ef870c0c 100644 --- a/tests/benchmark/test_comparator.py +++ b/tests/benchmark/test_comparator.py @@ -40,7 +40,6 @@ def generator_holes_mock(mocker: MockerFixture) -> _HoleGenerator: def comparator(generator_holes_mock: _HoleGenerator) -> Comparator: return Comparator( dict_models={}, - selected_columns=["A", "B"], generator_holes=generator_holes_mock, metrics=["mae", "mse"], ) @@ -439,7 +438,6 @@ def test_compare_reproducibility(): ) comparator = Comparator( dict_models=dict_models, - selected_columns=df_data.columns, generator_holes=generator_holes, metrics=["mae", "mse"], ) diff --git a/tests/benchmark/test_metrics.py b/tests/benchmark/test_metrics.py index 26fa0f7c..001a9267 100644 --- a/tests/benchmark/test_metrics.py +++ b/tests/benchmark/test_metrics.py @@ -171,12 +171,16 @@ def test_kl_divergence_gaussian( @pytest.mark.parametrize("df1", [df_incomplete]) @pytest.mark.parametrize("df2", [df_imputed]) -def test_frechet_distance_base(df1: pd.DataFrame, df2: pd.DataFrame) -> None: - result = metrics.frechet_distance_base(df1, df1) +@pytest.mark.parametrize("df_mask", [df_mask]) +def test_frechet_distance_base( + df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame +) -> None: + result = metrics.frechet_distance_base(df1, df1, df_mask) np.testing.assert_allclose(result, 0, atol=1e-3) - result = metrics.frechet_distance_base(df1, df2) - np.testing.assert_allclose(result, 0.134, atol=1e-3) + result = metrics.frechet_distance_base(df1, df2, df_mask) + assert np.all(0 < result) + assert np.all(result < 1) @pytest.mark.parametrize("df1", [df_incomplete]) @@ -360,7 +364,7 @@ def test_exception_raise_different_shapes( df1, df2, df_mask ) with pytest.raises(Exception): - metrics.frechet_distance_base(df1, df2) + metrics.frechet_distance_base(df1, df2, df_mask) @pytest.mark.parametrize("df1", [df_incomplete_cat])