diff --git a/sdmetrics/column_pairs/statistical/contingency_similarity.py b/sdmetrics/column_pairs/statistical/contingency_similarity.py
index 7f223ad8..d47e3cf5 100644
--- a/sdmetrics/column_pairs/statistical/contingency_similarity.py
+++ b/sdmetrics/column_pairs/statistical/contingency_similarity.py
@@ -27,7 +27,9 @@ class ContingencySimilarity(ColumnPairsMetric):
     max_value = 1.0
 
     @staticmethod
-    def _validate_inputs(real_data, synthetic_data, continuous_column_names, num_discrete_bins):
+    def _validate_inputs(
+        real_data, synthetic_data, continuous_column_names, num_discrete_bins, num_rows_subsample
+    ):
         for data in [real_data, synthetic_data]:
             if not isinstance(data, pd.DataFrame) or len(data.columns) != 2:
                 raise ValueError('The data must be a pandas DataFrame with two columns.')
@@ -47,31 +49,36 @@ def _validate_inputs(real_data, synthetic_data, continuous_column_names, num_dis
         if not isinstance(num_discrete_bins, int) or num_discrete_bins <= 0:
             raise ValueError('`num_discrete_bins` must be an integer greater than zero.')
 
+        if num_rows_subsample is not None:
+            if not isinstance(num_rows_subsample, int) or num_rows_subsample <= 0:
+                raise ValueError('`num_rows_subsample` must be an integer greater than zero.')
+
     @classmethod
-    def compute(cls, real_data, synthetic_data, continuous_column_names=None, num_discrete_bins=10):
-        """Compare the contingency similarity of two discrete columns.
+    def compute_breakdown(
+        cls,
+        real_data,
+        synthetic_data,
+        continuous_column_names=None,
+        num_discrete_bins=10,
+        num_rows_subsample=None,
+    ):
+        """Compute the breakdown of this metric."""
+        cls._validate_inputs(
+            real_data,
+            synthetic_data,
+            continuous_column_names,
+            num_discrete_bins,
+            num_rows_subsample,
+        )
+        columns = real_data.columns[:2]
 
-        Args:
-            real_data (pd.DataFrame):
-                The values from the real dataset.
-            synthetic_data (pd.DataFrame):
-                The values from the synthetic dataset.
-            continuous_column_names (list[str], optional):
-                The list of columns to discretize before running the metric. The column names in
-                this list should match the column names in the real and synthetic data. Defaults
-                to ``None``.
-            num_discrete_bins (int, optional):
-                The number of bins to create for the continuous columns. Defaults to 10.
+        if num_rows_subsample is not None:
+            real_data = real_data.sample(min(num_rows_subsample, len(real_data)))
+            synthetic_data = synthetic_data.sample(min(num_rows_subsample, len(synthetic_data)))
 
-        Returns:
-            float:
-                The contingency similarity of the two columns.
-        """
-        cls._validate_inputs(real_data, synthetic_data, continuous_column_names, num_discrete_bins)
-        columns = real_data.columns[:2]
         real = real_data[columns]
         synthetic = synthetic_data[columns]
-        if continuous_column_names is not None:
+        if continuous_column_names:
             for column in continuous_column_names:
                 real[column], synthetic[column] = discretize_column(
                     real[column], synthetic[column], num_discrete_bins=num_discrete_bins
@@ -86,7 +93,45 @@ def compute(cls, real_data, synthetic_data, continuous_column_names=None, num_di
         contingency_real = contingency_real.reindex(combined_index, fill_value=0)
         diff = abs(contingency_real - contingency_synthetic).fillna(0)
         variation = diff / 2
-        return 1 - variation.sum()
+        return {'score': 1 - variation.sum()}
+
+    @classmethod
+    def compute(
+        cls,
+        real_data,
+        synthetic_data,
+        continuous_column_names=None,
+        num_discrete_bins=10,
+        num_rows_subsample=None,
+    ):
+        """Compare the contingency similarity of two discrete columns.
+
+        Args:
+            real_data (pd.DataFrame):
+                The values from the real dataset.
+            synthetic_data (pd.DataFrame):
+                The values from the synthetic dataset.
+            continuous_column_names (list[str], optional):
+                The list of columns to discretize before running the metric. The column names in
+                this list should match the column names in the real and synthetic data. Defaults
+                to ``None``.
+            num_discrete_bins (int, optional):
+                The number of bins to create for the continuous columns. Defaults to 10.
+            num_rows_subsample (int, optional):
+                The number of rows to subsample from the real and synthetic data before computing
+                the metric. Defaults to ``None``.
+
+        Returns:
+            float:
+                The contingency similarity of the two columns.
+        """
+        return cls.compute_breakdown(
+            real_data,
+            synthetic_data,
+            continuous_column_names,
+            num_discrete_bins,
+            num_rows_subsample,
+        )['score']
 
     @classmethod
     def normalize(cls, raw_score):
diff --git a/tests/integration/column_pairs/statistical/test_contingency_similarity.py b/tests/integration/column_pairs/statistical/test_contingency_similarity.py
new file mode 100644
index 00000000..6fce63d3
--- /dev/null
+++ b/tests/integration/column_pairs/statistical/test_contingency_similarity.py
@@ -0,0 +1,40 @@
+import numpy as np
+
+from sdmetrics.column_pairs.statistical import ContingencySimilarity
+from sdmetrics.demos import load_demo
+
+
+def test_with_num_rows_subsample():
+    """Test the metric with `num_rows_subsample`.
+
+    Here the `real_data` and `synthetic_data` have 218 rows.
+    """
+    # Setup
+    real_data, synthetic_data, _ = load_demo('single_table')
+    real_data = real_data[['degree_type', 'high_spec']]
+    synthetic_data = synthetic_data[['degree_type', 'high_spec']]
+    num_rows_subsample = 100
+
+    # Run
+    result_1 = ContingencySimilarity.compute(
+        real_data=real_data,
+        synthetic_data=synthetic_data,
+        num_rows_subsample=num_rows_subsample,
+    )
+    result_2 = ContingencySimilarity.compute(
+        real_data=real_data,
+        synthetic_data=synthetic_data,
+        num_rows_subsample=num_rows_subsample,
+    )
+    result_entire_data = ContingencySimilarity.compute(
+        real_data=real_data,
+        synthetic_data=synthetic_data,
+        num_rows_subsample=None,
+    )
+
+    # Assert
+    assert result_1 != result_2
+    assert result_1 != result_entire_data
+    assert result_2 != result_entire_data
+    assert np.isclose(result_1, result_entire_data, atol=0.1)
+    assert np.isclose(result_2, result_entire_data, atol=0.1)
diff --git a/tests/unit/column_pairs/statistical/test_contingency_similarity.py b/tests/unit/column_pairs/statistical/test_contingency_similarity.py
index 51d2c57c..c9c2ca1c 100644
--- a/tests/unit/column_pairs/statistical/test_contingency_similarity.py
+++ b/tests/unit/column_pairs/statistical/test_contingency_similarity.py
@@ -1,6 +1,7 @@
 import re
 from unittest.mock import patch
 
+import numpy as np
 import pandas as pd
 import pytest
 
@@ -17,8 +18,16 @@ def test__validate_inputs(self):
         synthetic_data = pd.DataFrame({'col1': range(5), 'col2': range(5)})
         bad_continous_columns = ['col1', 'missing_col']
         bad_num_discrete_bins = -1
+        bad_num_rows_subsample = -1
 
         # Run and Assert
+        ContingencySimilarity._validate_inputs(
+            real_data=real_data,
+            synthetic_data=synthetic_data,
+            continuous_column_names=None,
+            num_discrete_bins=10,
+            num_rows_subsample=3,
+        )
         expected_bad_data = re.escape('The data must be a pandas DataFrame with two columns.')
         with pytest.raises(ValueError, match=expected_bad_data):
             ContingencySimilarity._validate_inputs(
                 real_data=bad_data,
                 synthetic_data=bad_data,
                 continuous_column_names=None,
                 num_discrete_bins=10,
+                num_rows_subsample=3,
             )
 
         expected_mismatch_columns_error = re.escape(
@@ -37,6 +47,7 @@ def test__validate_inputs(self):
                 synthetic_data=bad_synthetic_data,
                 continuous_column_names=None,
                 num_discrete_bins=10,
+                num_rows_subsample=3,
             )
 
         expected_bad_continous_column_error = re.escape(
@@ -48,6 +59,7 @@ def test__validate_inputs(self):
                 synthetic_data=synthetic_data,
                 continuous_column_names=bad_continous_columns,
                 num_discrete_bins=10,
+                num_rows_subsample=3,
             )
 
         expected_bad_num_discrete_bins_error = re.escape(
@@ -59,9 +71,41 @@ def test__validate_inputs(self):
                 synthetic_data=synthetic_data,
                 continuous_column_names=['col1'],
                 num_discrete_bins=bad_num_discrete_bins,
+                num_rows_subsample=3,
+            )
+        expected_bad_num_rows_subsample_error = re.escape(
+            '`num_rows_subsample` must be an integer greater than zero.'
+        )
+        with pytest.raises(ValueError, match=expected_bad_num_rows_subsample_error):
+            ContingencySimilarity._validate_inputs(
+                real_data=real_data,
+                synthetic_data=synthetic_data,
+                continuous_column_names=['col1'],
+                num_discrete_bins=10,
+                num_rows_subsample=bad_num_rows_subsample,
             )
 
-    def test_compute(self):
+    @patch(
+        'sdmetrics.column_pairs.statistical.contingency_similarity.ContingencySimilarity.compute_breakdown'
+    )
+    def test_compute_mock(self, compute_breakdown_mock):
+        """Test that the ``compute`` method calls the ``compute_breakdown`` method."""
+        # Setup
+        real_data = pd.DataFrame({'col1': [1.0, 2.4, 2.6, 0.8], 'col2': [1, 2, 3, 4]})
+        synthetic_data = pd.DataFrame({'col1': [1.0, 1.8, 2.6, 1.0], 'col2': [2, 3, 7, -10]})
+        compute_breakdown_mock.return_value = {'score': 0.25}
+
+        # Run
+        score = ContingencySimilarity.compute(real_data, synthetic_data)
+
+        # Assert
+        compute_breakdown_mock.assert_called_once_with(real_data, synthetic_data, None, 10, None)
+        assert score == 0.25
+
+    @patch(
+        'sdmetrics.column_pairs.statistical.contingency_similarity.ContingencySimilarity._validate_inputs'
+    )
+    def test_compute_breakdown(self, validate_inputs_mock):
         """Test the ``compute`` method.
         Expect that the total variation distance of the two contingency matrices
@@ -81,9 +125,47 @@ def test_compute(self):
         # Run
         metric = ContingencySimilarity()
-        result = metric.compute(real_data, synthetic_data)
+        result = metric.compute_breakdown(real_data, synthetic_data)
+
+        # Assert
+        validate_inputs_mock.assert_called_once_with(
+            real_data,
+            synthetic_data,
+            None,
+            10,
+            None,
+        )
+        assert result == {'score': expected_score}
+
+    @patch('sdmetrics.column_pairs.statistical.contingency_similarity.discretize_column')
+    def test_compute_with_num_rows_subsample(self, discretize_column_mock):
+        """Test the ``compute`` method with ``num_rows_subsample``."""
+        # Setup
+        np.random.seed(0)
+        real_data = pd.DataFrame({'col1': [1.0, 2.4, 2.6, 0.8], 'col2': [1, 2, 3, 4]})
+        synthetic_data = pd.DataFrame({'col1': [1.0, 1.8], 'col2': [2, 3]})
+        discretize_column_mock.return_value = (
+            pd.DataFrame({'col2': [1, 2, 3]}),
+            pd.DataFrame({'col2': [2, 3]}),
+        )
+        expected_score = 0.0
+
+        # Run
+        metric = ContingencySimilarity()
+        result = metric.compute(
+            real_data,
+            synthetic_data,
+            continuous_column_names=['col2'],
+            num_discrete_bins=4,
+            num_rows_subsample=3,
+        )
 
         # Assert
+        arg_mock = discretize_column_mock.call_args
+        expected_real = pd.Series([3, 4, 2], name='col2', index=[2, 3, 1])
+        expected_synthetic = pd.Series([2, 3], name='col2', index=[0, 1])
+        pd.testing.assert_series_equal(arg_mock[0][0], expected_real)
+        pd.testing.assert_series_equal(arg_mock[0][1], expected_synthetic)
         assert result == expected_score
 
     def test_compute_with_discretization(self):
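
Note for reviewers: for anyone reading this diff without the surrounding module, below is a minimal standalone sketch of the total variation distance computation that `compute_breakdown` performs after any subsampling and discretization. It assumes only pandas; the function name and the use of `groupby` here are illustrative, not the library's exact internals.

    import pandas as pd

    def contingency_similarity_sketch(real, synthetic):
        # Illustrative only: score two categorical columns by
        # 1 - total variation distance between their joint frequency tables.
        columns = list(real.columns[:2])
        freq_real = real.groupby(columns).size() / len(real)
        freq_synthetic = synthetic.groupby(columns).size() / len(synthetic)
        # Align both tables on the union of observed category pairs,
        # treating combinations missing from one side as probability 0.
        combined_index = freq_real.index.union(freq_synthetic.index)
        freq_real = freq_real.reindex(combined_index, fill_value=0)
        freq_synthetic = freq_synthetic.reindex(combined_index, fill_value=0)
        # Total variation distance is half the L1 distance between the two
        # distributions, so the resulting score lands in [0, 1].
        variation = (freq_real - freq_synthetic).abs().sum() / 2
        return 1 - variation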
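And a usage sketch of the new `num_rows_subsample` parameter, mirroring the integration test above. Subsampling trades a little accuracy for speed, which is why the test only asserts that the subsampled scores land within 0.1 of the full-data score; the demo dataset and column names below are taken straight from that test.

    from sdmetrics.column_pairs.statistical import ContingencySimilarity
    from sdmetrics.demos import load_demo

    real_data, synthetic_data, _ = load_demo('single_table')

    # Compute the metric on a random subsample of 100 rows per table;
    # passing num_rows_subsample=None (the default) uses all 218 rows.
    score = ContingencySimilarity.compute(
        real_data=real_data[['degree_type', 'high_spec']],
        synthetic_data=synthetic_data[['degree_type', 'high_spec']],
        num_rows_subsample=100,
    )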