From 6f7919f8dbd95076946fa8c355083316f718bb36 Mon Sep 17 00:00:00 2001 From: Katharine Xiao <2405771+katxiao@users.noreply.github.com> Date: Thu, 15 Sep 2022 12:45:06 -0400 Subject: [PATCH 1/9] Add synthetic uniqueness single table metric and tests --- sdmetrics/multi_table/multi_single_table.py | 6 + sdmetrics/single_table/__init__.py | 2 + .../single_table/synthetic_uniqueness.py | 87 ++++++++++++++ .../single_table/test_synthetic_uniqueness.py | 110 ++++++++++++++++++ 4 files changed, 205 insertions(+) create mode 100644 sdmetrics/single_table/synthetic_uniqueness.py create mode 100644 tests/unit/single_table/test_synthetic_uniqueness.py diff --git a/sdmetrics/multi_table/multi_single_table.py b/sdmetrics/multi_table/multi_single_table.py index c3eff8cb..625a99ea 100644 --- a/sdmetrics/multi_table/multi_single_table.py +++ b/sdmetrics/multi_table/multi_single_table.py @@ -241,6 +241,12 @@ class BNLikelihood(MultiSingleTableMetric): single_table_metric = single_table.bayesian_network.BNLikelihood +class SyntheticUniqueness(MultiSingleTableMetric): + """MultiSingleTableMetric based on SingleTable SyntheticUniqueness.""" + + single_table_metric = single_table.synthetic_uniqueness.SyntheticUniqueness + + class BNLogLikelihood(MultiSingleTableMetric): """MultiSingleTableMetric based on SingleTable BNLogLikelihood.""" diff --git a/sdmetrics/single_table/__init__.py b/sdmetrics/single_table/__init__.py index fc6beebf..690ceefe 100644 --- a/sdmetrics/single_table/__init__.py +++ b/sdmetrics/single_table/__init__.py @@ -31,6 +31,7 @@ from sdmetrics.single_table.privacy.numerical_sklearn import ( NumericalLR, NumericalMLP, NumericalSVR) from sdmetrics.single_table.privacy.radius_nearest_neighbor import NumericalRadiusNearestNeighbor +from sdmetrics.single_table.synthetic_uniqueness import SyntheticUniqueness __all__ = [ 'bayesian_network', @@ -88,4 +89,5 @@ 'StatisticSimilarity', 'TVComplement', 'RangeCoverage', + 'SyntheticUniqueness', ] diff --git a/sdmetrics/single_table/synthetic_uniqueness.py b/sdmetrics/single_table/synthetic_uniqueness.py new file mode 100644 index 00000000..a38456cc --- /dev/null +++ b/sdmetrics/single_table/synthetic_uniqueness.py @@ -0,0 +1,87 @@ +"""Synthetic uniqueness metrics for single table.""" +import pandas as pd + +from sdmetrics.goal import Goal +from sdmetrics.single_table.base import SingleTableMetric + + +class SyntheticUniqueness(SingleTableMetric): + """SyntheticUniqueness Single Table metric. + + This metric measures whether each row in the synthetic data is unique, + or whether it exactly matches a row in the real data. + + Attributes: + name (str): + Name to use when reports about this metric are printed. + goal (sdmetrics.goal.Goal): + The goal of this metric. + min_value (Union[float, tuple[float]]): + Minimum value or values that this metric can take. + max_value (Union[float, tuple[float]]): + Maximum value or values that this metric can take. + """ + + name = 'SyntheticUniqueness' + goal = Goal.MAXIMIZE + min_value = 0 + max_value = 1 + + @classmethod + def compute(cls, real_data, synthetic_data, metadata=None, numerical_match_tolerance=0.01, + synthetic_sample_size=None): + """Compute this metric. + + This metric looks for matches between the real and synthetic data for + the compatible columns. This metric also looks for matches in missing values. + + Args: + real_data (Union[numpy.ndarray, pandas.DataFrame]): + The values from the real dataset. + synthetic_data (Union[numpy.ndarray, pandas.DataFrame]): + The values from the synthetic dataset. + metadata (dict): + Table metadata dict. + numerical_match_tolerance (float): + A float >0.0 representing how close two numerical values have to be + in order to be considered a match. + synthetic_sample_size (int): + The number of synthetic rows to sample before computing this metric. + Use this to speed up the computation time if you have a large amount + of synthetic data. Note that the final score may not be as precise if + your sample size is low. Defaults to ``None``, which does not sample, + and uses all of the provided rows. + + Returns: + float: + The synthetic uniqueness score. + """ + if synthetic_sample_size is not None: + synthetic_data = synthetic_data.sample(n=synthetic_sample_size) + + value_counts = pd.concat([real_data, synthetic_data]).value_counts(dropna=False) + value_counts.name = 'value_counts' + value_counts = value_counts.reset_index() + + columns = real_data.columns.to_list() + synthetic_value_counts = synthetic_data.merge( + value_counts, how='left', left_on=columns, right_on=columns) + num_unique_rows = (synthetic_value_counts['value_counts'] == 1).sum() + + return num_unique_rows / len(synthetic_data) + + @classmethod + def normalize(cls, raw_score): + """Normalize the log-likelihood value. + + Notice that this is not the mean likelihood. + + Args: + raw_score (float): + The value of the metric from `compute`. + + Returns: + float: + The normalized value of the metric + """ + return super().normalize(raw_score) diff --git a/tests/unit/single_table/test_synthetic_uniqueness.py b/tests/unit/single_table/test_synthetic_uniqueness.py new file mode 100644 index 00000000..7aca66ec --- /dev/null +++ b/tests/unit/single_table/test_synthetic_uniqueness.py @@ -0,0 +1,110 @@ +from unittest.mock import patch + +import numpy as np +import pandas as pd + +from sdmetrics.single_table import SyntheticUniqueness + + +class TestSyntheticUniqueness: + + def test_compute(self): + """Test the ``compute`` method. + + Expect that the synthetic uniqueness is returned. + + Input: + - real data + - synthetic data + + Output: + - the evaluated metric + """ + # Setup + real_data = pd.DataFrame({ + 'col1': [1, 2, 1, 3, 4], + 'col2': ['a', 'b', 'c', 'd', 'b'], + 'col3': [1.32, np.nan, 1.43, np.nan, 2.0], + }) + synthetic_data = pd.DataFrame({ + 'col1': [1, 3, 4, 2, 2], + 'col2': ['a', 'b', 'c', 'b', 'e'], + 'col3': [1.32, 1.56, 1.21, np.nan, 1.90], + }) + metadata = { + 'fields': { + 'col1': {'type': 'numerical', 'subtype': 'int'}, + 'col2': {'type': 'categorical'}, + 'col3': {'type': 'numerical', 'subtype': 'float'}, + }, + } + + # Run + metric = SyntheticUniqueness() + score = metric.compute(real_data, synthetic_data, metadata) + + # Assert + assert score == 0.6 + + def test_compute_with_sample_size(self): + """Test the ``compute`` method with a sample size. + + Expect that the synthetic uniqueness is returned. + + Input: + - real data + - synthetic data + + Output: + - the evaluated metric + """ + # Setup + real_data = pd.DataFrame({ + 'col1': [1, 2, 1, 3, 4], + 'col2': ['a', 'b', 'c', 'd', 'b'], + 'col3': [1.32, np.nan, 1.43, np.nan, 2.0], + }) + synthetic_data = pd.DataFrame({ + 'col1': [1, 3, 4, 2, 2], + 'col2': ['a', 'b', 'c', 'd', 'e'], + 'col3': [1.33, 1.56, 1.21, np.nan, 1.92], + }) + metadata = { + 'fields': { + 'col1': {'type': 'numerical', 'subtype': 'int'}, + 'col2': {'type': 'categorical'}, + 'col3': {'type': 'numerical', 'subtype': 'float'}, + }, + } + sample_size = 2 + + # Run + metric = SyntheticUniqueness() + score = metric.compute( + real_data, synthetic_data, metadata, synthetic_sample_size=sample_size) + + # Assert + assert score == 1 + + @patch('sdmetrics.single_table.synthetic_uniqueness.SingleTableMetric.normalize') + def test_normalize(self, normalize_mock): + """Test the ``normalize`` method. + + Expect that the inherited ``normalize`` method is called. + + Input: + - raw score + + Output: + - the output of the inherited ``normalize`` method. + """ + # Setup + metric = SyntheticUniqueness() + raw_score = 0.9 + + # Run + result = metric.normalize(raw_score) + + # Assert + normalize_mock.assert_called_once_with(raw_score) + assert result == normalize_mock.return_value From 2e56f429f618aa3dbacfb1e3584a6cf9297730de Mon Sep 17 00:00:00 2001 From: Katharine Xiao <2405771+katxiao@users.noreply.github.com> Date: Thu, 15 Sep 2022 12:53:52 -0400 Subject: [PATCH 2/9] Add warning for edge case --- .../single_table/synthetic_uniqueness.py | 9 +++- .../single_table/test_synthetic_uniqueness.py | 45 +++++++++++++++++++ 2 files changed, 53 insertions(+), 1 deletion(-) diff --git a/sdmetrics/single_table/synthetic_uniqueness.py b/sdmetrics/single_table/synthetic_uniqueness.py index a38456cc..92adc664 100644 --- a/sdmetrics/single_table/synthetic_uniqueness.py +++ b/sdmetrics/single_table/synthetic_uniqueness.py @@ -1,4 +1,6 @@ """Synthetic uniqueness metrics for single table.""" +import warnings + import pandas as pd from sdmetrics.goal import Goal @@ -57,7 +59,12 @@ def compute(cls, real_data, synthetic_data, metadata=None, numerical_match_toler The synthetic uniqueness score. """ if synthetic_sample_size is not None: - synthetic_data = synthetic_data.sample(n=synthetic_sample_size) + if synthetic_sample_size > len(synthetic_data): + warnings.warn(f'The provided `synthetic_sample_size` of {synthetic_sample_size} ' + 'is larger than the number of synthetic data rows ' + f'({len(synthetic_data)}). Proceeding without sampling.') + else: + synthetic_data = synthetic_data.sample(n=synthetic_sample_size) value_counts = pd.concat([real_data, synthetic_data]).value_counts(dropna=False) value_counts.name = 'value_counts' diff --git a/tests/unit/single_table/test_synthetic_uniqueness.py b/tests/unit/single_table/test_synthetic_uniqueness.py index 7aca66ec..9b73fe5e 100644 --- a/tests/unit/single_table/test_synthetic_uniqueness.py +++ b/tests/unit/single_table/test_synthetic_uniqueness.py @@ -86,6 +86,51 @@ def test_compute_with_sample_size(self): # Assert assert score == 1 + @patch('sdmetrics.single_table.synthetic_uniqueness.warnings') + def test_compute_with_sample_size_too_large(self, warnings_mock): + """Test the ``compute`` method with a sample size larger than the number of rows. + + Expect that the synthetic uniqueness is returned. Expect a warning to be raised. + + Input: + - real data + - synthetic data + + Output: + - the evaluated metric + """ + # Setup + real_data = pd.DataFrame({ + 'col1': [1, 2, 1, 3, 4], + 'col2': ['a', 'b', 'c', 'd', 'b'], + 'col3': [1.32, np.nan, 1.43, np.nan, 2.0], + }) + synthetic_data = pd.DataFrame({ + 'col1': [1, 3, 4, 2, 2], + 'col2': ['a', 'b', 'c', 'd', 'e'], + 'col3': [1.33, 1.56, 1.21, np.nan, 1.92], + }) + metadata = { + 'fields': { + 'col1': {'type': 'numerical', 'subtype': 'int'}, + 'col2': {'type': 'categorical'}, + 'col3': {'type': 'numerical', 'subtype': 'float'}, + }, + } + sample_size = 15 + + # Run + metric = SyntheticUniqueness() + score = metric.compute( + real_data, synthetic_data, metadata, synthetic_sample_size=sample_size) + + # Assert + assert score == 1 + warnings_mock.warn.assert_called_once_with( + 'The provided `synthetic_sample_size` of 15 is larger than the number of ' + 'synthetic data rows (5). Proceeding without sampling.' + ) + @patch('sdmetrics.single_table.synthetic_uniqueness.SingleTableMetric.normalize') def test_normalize(self, normalize_mock): """Test the ``normalize`` method. From 79398d0dc78b3950350cea19e1507de3f768c7ad Mon Sep 17 00:00:00 2001 From: Katharine Xiao <2405771+katxiao@users.noreply.github.com> Date: Wed, 21 Sep 2022 12:05:32 -0400 Subject: [PATCH 3/9] Update metric name --- sdmetrics/multi_table/multi_single_table.py | 6 +-- sdmetrics/single_table/__init__.py | 4 +- ...tic_uniqueness.py => new_row_synthesis.py} | 12 ++--- ...niqueness.py => test_new_row_synthesis.py} | 52 ++++--------------- 4 files changed, 22 insertions(+), 52 deletions(-) rename sdmetrics/single_table/{synthetic_uniqueness.py => new_row_synthesis.py} (93%) rename tests/unit/single_table/{test_synthetic_uniqueness.py => test_new_row_synthesis.py} (75%) diff --git a/sdmetrics/multi_table/multi_single_table.py b/sdmetrics/multi_table/multi_single_table.py index 625a99ea..c041855f 100644 --- a/sdmetrics/multi_table/multi_single_table.py +++ b/sdmetrics/multi_table/multi_single_table.py @@ -241,10 +241,10 @@ class BNLikelihood(MultiSingleTableMetric): single_table_metric = single_table.bayesian_network.BNLikelihood -class SyntheticUniqueness(MultiSingleTableMetric): - """MultiSingleTableMetric based on SingleTable SyntheticUniqueness.""" +class NewRowSynthesis(MultiSingleTableMetric): + """MultiSingleTableMetric based on SingleTable NewRowSynthesis.""" - single_table_metric = single_table.synthetic_uniqueness.SyntheticUniqueness + single_table_metric = single_table.new_row_synthesis.NewRowSynthesis class BNLogLikelihood(MultiSingleTableMetric): diff --git a/sdmetrics/single_table/__init__.py b/sdmetrics/single_table/__init__.py index 690ceefe..35704626 100644 --- a/sdmetrics/single_table/__init__.py +++ b/sdmetrics/single_table/__init__.py @@ -22,6 +22,7 @@ from sdmetrics.single_table.multi_single_column import ( BoundaryAdherence, CategoryCoverage, CSTest, KSComplement, MissingValueSimilarity, MultiSingleColumnMetric, RangeCoverage, StatisticSimilarity, TVComplement) +from sdmetrics.single_table.new_row_synthesis import NewRowSynthesis from sdmetrics.single_table.privacy.base import CategoricalPrivacyMetric, NumericalPrivacyMetric from sdmetrics.single_table.privacy.cap import ( CategoricalCAP, CategoricalGeneralizedCAP, CategoricalZeroCAP) @@ -31,7 +32,6 @@ from sdmetrics.single_table.privacy.numerical_sklearn import ( NumericalLR, NumericalMLP, NumericalSVR) from sdmetrics.single_table.privacy.radius_nearest_neighbor import NumericalRadiusNearestNeighbor -from sdmetrics.single_table.synthetic_uniqueness import SyntheticUniqueness __all__ = [ 'bayesian_network', @@ -89,5 +89,5 @@ 'StatisticSimilarity', 'TVComplement', 'RangeCoverage', - 'SyntheticUniqueness', + 'NewRowSynthesis', ] diff --git a/sdmetrics/single_table/synthetic_uniqueness.py b/sdmetrics/single_table/new_row_synthesis.py similarity index 93% rename from sdmetrics/single_table/synthetic_uniqueness.py rename to sdmetrics/single_table/new_row_synthesis.py index 92adc664..9782d9bb 100644 --- a/sdmetrics/single_table/synthetic_uniqueness.py +++ b/sdmetrics/single_table/new_row_synthesis.py @@ -1,4 +1,4 @@ -"""Synthetic uniqueness metrics for single table.""" +"""New Row Synthesis metric for single table.""" import warnings import pandas as pd @@ -7,10 +7,10 @@ from sdmetrics.single_table.base import SingleTableMetric -class SyntheticUniqueness(SingleTableMetric): - """SyntheticUniqueness Single Table metric. +class NewRowSynthesis(SingleTableMetric): + """NewRowSynthesis Single Table metric. - This metric measures whether each row in the synthetic data is unique, + This metric measures whether each row in the synthetic data is new, or whether it exactly matches a row in the real data. Attributes: @@ -24,7 +24,7 @@ class SyntheticUniqueness(SingleTableMetric): Maximum value or values that this metric can take. """ - name = 'SyntheticUniqueness' + name = 'NewRowSynthesis' goal = Goal.MAXIMIZE min_value = 0 max_value = 1 @@ -56,7 +56,7 @@ def compute(cls, real_data, synthetic_data, metadata=None, numerical_match_toler Returns: float: - The synthetic uniqueness score. + The new row synthesis score. """ if synthetic_sample_size is not None: if synthetic_sample_size > len(synthetic_data): diff --git a/tests/unit/single_table/test_synthetic_uniqueness.py b/tests/unit/single_table/test_new_row_synthesis.py similarity index 75% rename from tests/unit/single_table/test_synthetic_uniqueness.py rename to tests/unit/single_table/test_new_row_synthesis.py index 9b73fe5e..e903df2c 100644 --- a/tests/unit/single_table/test_synthetic_uniqueness.py +++ b/tests/unit/single_table/test_new_row_synthesis.py @@ -3,23 +3,13 @@ import numpy as np import pandas as pd -from sdmetrics.single_table import SyntheticUniqueness +from sdmetrics.single_table import NewRowSynthesis -class TestSyntheticUniqueness: +class TestNewRowSynthesis: def test_compute(self): - """Test the ``compute`` method. - - Expect that the synthetic uniqueness is returned. - - Input: - - real data - - synthetic data - - Output: - - the evaluated metric - """ + """Test the ``compute`` method and expect that the new row synthesis score is returned.""" # Setup real_data = pd.DataFrame({ 'col1': [1, 2, 1, 3, 4], @@ -38,9 +28,9 @@ def test_compute(self): 'col3': {'type': 'numerical', 'subtype': 'float'}, }, } + metric = NewRowSynthesis() # Run - metric = SyntheticUniqueness() score = metric.compute(real_data, synthetic_data, metadata) # Assert @@ -49,14 +39,7 @@ def test_compute(self): def test_compute_with_sample_size(self): """Test the ``compute`` method with a sample size. - Expect that the synthetic uniqueness is returned. - - Input: - - real data - - synthetic data - - Output: - - the evaluated metric + Expect that the new row synthesis score is returned. """ # Setup real_data = pd.DataFrame({ @@ -77,27 +60,20 @@ def test_compute_with_sample_size(self): }, } sample_size = 2 + metric = NewRowSynthesis() # Run - metric = SyntheticUniqueness() score = metric.compute( real_data, synthetic_data, metadata, synthetic_sample_size=sample_size) # Assert assert score == 1 - @patch('sdmetrics.single_table.synthetic_uniqueness.warnings') + @patch('sdmetrics.single_table.new_row_synthesis.warnings') def test_compute_with_sample_size_too_large(self, warnings_mock): """Test the ``compute`` method with a sample size larger than the number of rows. - Expect that the synthetic uniqueness is returned. Expect a warning to be raised. - - Input: - - real data - - synthetic data - - Output: - - the evaluated metric + Expect that the new row synthesis is returned. Expect a warning to be raised. """ # Setup real_data = pd.DataFrame({ @@ -118,9 +94,9 @@ def test_compute_with_sample_size_too_large(self, warnings_mock): }, } sample_size = 15 + metric = NewRowSynthesis() # Run - metric = SyntheticUniqueness() score = metric.compute( real_data, synthetic_data, metadata, synthetic_sample_size=sample_size) @@ -131,20 +107,14 @@ def test_compute_with_sample_size_too_large(self, warnings_mock): 'synthetic data rows (5). Proceeding without sampling.' ) - @patch('sdmetrics.single_table.synthetic_uniqueness.SingleTableMetric.normalize') + @patch('sdmetrics.single_table.new_row_synthesis.SingleTableMetric.normalize') def test_normalize(self, normalize_mock): """Test the ``normalize`` method. Expect that the inherited ``normalize`` method is called. - - Input: - - raw score - - Output: - - the output of the inherited ``normalize`` method. """ # Setup - metric = SyntheticUniqueness() + metric = NewRowSynthesis() raw_score = 0.9 # Run From 28da18f22cfde11c43746622fb82b045c0a388be Mon Sep 17 00:00:00 2001 From: Katharine Xiao <2405771+katxiao@users.noreply.github.com> Date: Wed, 21 Sep 2022 13:08:07 -0400 Subject: [PATCH 4/9] Update implementation --- sdmetrics/single_table/new_row_synthesis.py | 35 ++++++++++++++----- .../single_table/test_new_row_synthesis.py | 2 +- 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/sdmetrics/single_table/new_row_synthesis.py b/sdmetrics/single_table/new_row_synthesis.py index 9782d9bb..c21a5611 100644 --- a/sdmetrics/single_table/new_row_synthesis.py +++ b/sdmetrics/single_table/new_row_synthesis.py @@ -1,6 +1,7 @@ """New Row Synthesis metric for single table.""" import warnings +import numpy as np import pandas as pd from sdmetrics.goal import Goal @@ -66,14 +67,32 @@ def compute(cls, real_data, synthetic_data, metadata=None, numerical_match_toler else: synthetic_data = synthetic_data.sample(n=synthetic_sample_size) - value_counts = pd.concat([real_data, synthetic_data]).value_counts(dropna=False) - value_counts.name = 'value_counts' - value_counts = value_counts.reset_index() - - columns = real_data.columns.to_list() - synthetic_value_counts = synthetic_data.merge( - value_counts, how='left', left_on=columns, right_on=columns) - num_unique_rows = (synthetic_value_counts['value_counts'] == 1).sum() + numerical_fields = [] + discrete_fields = [] + for field, field_meta in metadata['fields'].items(): + if field_meta['type'] == 'datetime': + real_data[field] = pd.to_datetime(real_data[field]) + synthetic_data[field] = pd.to_datetime(synthetic_data[field]) + numerical_fields.append(field) + elif field_meta['type'] == 'numerical': + numerical_fields.append(field) + else: + discrete_fields.append(field) + + num_unique_rows = 0 + for index, row in synthetic_data.iterrows(): + row_filter = [] + for field in real_data.columns: + if field in numerical_fields: + field_filter = f'{field}.isnull()' if np.isnan(row[field]) else ( + f'abs({field} - {row[field]}) < {numerical_match_tolerance * row[field]}') + row_filter.append(field_filter) + else: + row_filter.append(f"{field} == '{row[field]}'") + + matches = real_data.query(' and '.join(row_filter)) + if matches is None or matches.empty: + num_unique_rows += 1 return num_unique_rows / len(synthetic_data) diff --git a/tests/unit/single_table/test_new_row_synthesis.py b/tests/unit/single_table/test_new_row_synthesis.py index e903df2c..1135875b 100644 --- a/tests/unit/single_table/test_new_row_synthesis.py +++ b/tests/unit/single_table/test_new_row_synthesis.py @@ -84,7 +84,7 @@ def test_compute_with_sample_size_too_large(self, warnings_mock): synthetic_data = pd.DataFrame({ 'col1': [1, 3, 4, 2, 2], 'col2': ['a', 'b', 'c', 'd', 'e'], - 'col3': [1.33, 1.56, 1.21, np.nan, 1.92], + 'col3': [1.35, 1.56, 1.21, np.nan, 1.92], }) metadata = { 'fields': { From 1cbf7e829735670c3667e692ba45681a63707117 Mon Sep 17 00:00:00 2001 From: Katharine Xiao <2405771+katxiao@users.noreply.github.com> Date: Wed, 21 Sep 2022 13:53:02 -0400 Subject: [PATCH 5/9] Add input validation --- sdmetrics/single_table/new_row_synthesis.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sdmetrics/single_table/new_row_synthesis.py b/sdmetrics/single_table/new_row_synthesis.py index c21a5611..35033886 100644 --- a/sdmetrics/single_table/new_row_synthesis.py +++ b/sdmetrics/single_table/new_row_synthesis.py @@ -59,6 +59,9 @@ def compute(cls, real_data, synthetic_data, metadata=None, numerical_match_toler float: The new row synthesis score. """ + real_data, synthetic_data, metadata = cls._validate_inputs( + real_data, synthetic_data, metadata) + if synthetic_sample_size is not None: if synthetic_sample_size > len(synthetic_data): warnings.warn(f'The provided `synthetic_sample_size` of {synthetic_sample_size} ' From 376bec499e119436511d15cc6128b6a73bb19f05 Mon Sep 17 00:00:00 2001 From: Katharine Xiao <2405771+katxiao@users.noreply.github.com> Date: Thu, 22 Sep 2022 13:22:48 -0400 Subject: [PATCH 6/9] fix unit test --- sdmetrics/reports/single_table/plot_utils.py | 3 +++ tests/unit/single_table/test_new_row_synthesis.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/sdmetrics/reports/single_table/plot_utils.py b/sdmetrics/reports/single_table/plot_utils.py index 586e1cb7..51ef64ba 100644 --- a/sdmetrics/reports/single_table/plot_utils.py +++ b/sdmetrics/reports/single_table/plot_utils.py @@ -89,6 +89,9 @@ def _get_similarity_correlation_matrix(score_breakdowns, columns): Returns: pandas.DataFrame """ + if isinstance(columns, set): + columns = list(columns) + similarity_correlation = pd.DataFrame( index=columns, columns=columns, diff --git a/tests/unit/single_table/test_new_row_synthesis.py b/tests/unit/single_table/test_new_row_synthesis.py index 1135875b..a8dc240d 100644 --- a/tests/unit/single_table/test_new_row_synthesis.py +++ b/tests/unit/single_table/test_new_row_synthesis.py @@ -50,7 +50,7 @@ def test_compute_with_sample_size(self): synthetic_data = pd.DataFrame({ 'col1': [1, 3, 4, 2, 2], 'col2': ['a', 'b', 'c', 'd', 'e'], - 'col3': [1.33, 1.56, 1.21, np.nan, 1.92], + 'col3': [1.46, 1.56, 1.21, np.nan, 1.92], }) metadata = { 'fields': { From d97831f77fd80baba755b170a8db1157202481a2 Mon Sep 17 00:00:00 2001 From: Katharine Xiao <2405771+katxiao@users.noreply.github.com> Date: Fri, 23 Sep 2022 13:22:57 -0400 Subject: [PATCH 7/9] Fix edge cases in new row synthesis query --- sdmetrics/reports/single_table/plot_utils.py | 2 ++ sdmetrics/single_table/new_row_synthesis.py | 29 +++++++++++++------- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/sdmetrics/reports/single_table/plot_utils.py b/sdmetrics/reports/single_table/plot_utils.py index 51ef64ba..1cfb0732 100644 --- a/sdmetrics/reports/single_table/plot_utils.py +++ b/sdmetrics/reports/single_table/plot_utils.py @@ -85,6 +85,8 @@ def _get_similarity_correlation_matrix(score_breakdowns, columns): Args: score_breakdowns (dict): Mapping of metric to the score breakdown result. + columns (list[string] or set[string]): + A list or set of column names. Returns: pandas.DataFrame diff --git a/sdmetrics/single_table/new_row_synthesis.py b/sdmetrics/single_table/new_row_synthesis.py index 35033886..c4174f20 100644 --- a/sdmetrics/single_table/new_row_synthesis.py +++ b/sdmetrics/single_table/new_row_synthesis.py @@ -1,7 +1,6 @@ """New Row Synthesis metric for single table.""" import warnings -import numpy as np import pandas as pd from sdmetrics.goal import Goal @@ -46,8 +45,8 @@ def compute(cls, real_data, synthetic_data, metadata=None, numerical_match_toler metadata (dict): Table metadata dict. numerical_match_tolerance (float): - A float >0.0 representing how close two numerical values have to be - in order to be considered a match. + A float larger than 0 representing how close two numerical values have to be + in order to be considered a match. Defaults to `0.01`. synthetic_sample_size (int): The number of synthetic rows to sample before computing this metric. Use this to speed up the computation time if you have a large amount @@ -72,13 +71,16 @@ def compute(cls, real_data, synthetic_data, metadata=None, numerical_match_toler numerical_fields = [] discrete_fields = [] + categorical_fields = [] for field, field_meta in metadata['fields'].items(): if field_meta['type'] == 'datetime': - real_data[field] = pd.to_datetime(real_data[field]) - synthetic_data[field] = pd.to_datetime(synthetic_data[field]) + real_data[field] = pd.to_numeric(real_data[field]) + synthetic_data[field] = pd.to_numeric(synthetic_data[field]) numerical_fields.append(field) elif field_meta['type'] == 'numerical': numerical_fields.append(field) + elif field_meta['type'] == 'categorical': + categorical_fields.append(field) else: discrete_fields.append(field) @@ -86,12 +88,19 @@ def compute(cls, real_data, synthetic_data, metadata=None, numerical_match_toler for index, row in synthetic_data.iterrows(): row_filter = [] for field in real_data.columns: - if field in numerical_fields: - field_filter = f'{field}.isnull()' if np.isnan(row[field]) else ( - f'abs({field} - {row[field]}) < {numerical_match_tolerance * row[field]}') - row_filter.append(field_filter) + if pd.isna(row[field]): + field_filter = f'{field}.isnull()' + elif field in numerical_fields: + field_filter = ( + f'abs({field} - {row[field]}) <= ' + f'{abs(numerical_match_tolerance * row[field])}' + ) + elif field in categorical_fields: + field_filter = f"{field} == '{row[field]}'" else: - row_filter.append(f"{field} == '{row[field]}'") + field_filter = f'{field} == {row[field]}' + + row_filter.append(field_filter) matches = real_data.query(' and '.join(row_filter)) if matches is None or matches.empty: From b17f2994240c1f62c8b04ddeee50fce1c5b532a5 Mon Sep 17 00:00:00 2001 From: Katharine Xiao <2405771+katxiao@users.noreply.github.com> Date: Fri, 23 Sep 2022 13:51:12 -0400 Subject: [PATCH 8/9] Update query logic --- sdmetrics/single_table/new_row_synthesis.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/sdmetrics/single_table/new_row_synthesis.py b/sdmetrics/single_table/new_row_synthesis.py index c4174f20..854a7db7 100644 --- a/sdmetrics/single_table/new_row_synthesis.py +++ b/sdmetrics/single_table/new_row_synthesis.py @@ -71,7 +71,6 @@ def compute(cls, real_data, synthetic_data, metadata=None, numerical_match_toler numerical_fields = [] discrete_fields = [] - categorical_fields = [] for field, field_meta in metadata['fields'].items(): if field_meta['type'] == 'datetime': real_data[field] = pd.to_numeric(real_data[field]) @@ -79,8 +78,6 @@ def compute(cls, real_data, synthetic_data, metadata=None, numerical_match_toler numerical_fields.append(field) elif field_meta['type'] == 'numerical': numerical_fields.append(field) - elif field_meta['type'] == 'categorical': - categorical_fields.append(field) else: discrete_fields.append(field) @@ -95,10 +92,11 @@ def compute(cls, real_data, synthetic_data, metadata=None, numerical_match_toler f'abs({field} - {row[field]}) <= ' f'{abs(numerical_match_tolerance * row[field])}' ) - elif field in categorical_fields: - field_filter = f"{field} == '{row[field]}'" else: - field_filter = f'{field} == {row[field]}' + if real_data[field].dtype == 'O': + field_filter = f"{field} == '{row[field]}'" + else: + field_filter = f'{field} == {row[field]}' row_filter.append(field_filter) From f5d43c075679fca0b5d05e3773256ebe840d027f Mon Sep 17 00:00:00 2001 From: Katharine Xiao <2405771+katxiao@users.noreply.github.com> Date: Fri, 23 Sep 2022 13:59:57 -0400 Subject: [PATCH 9/9] Update unit test --- sdmetrics/single_table/base.py | 6 ++++ .../single_table/test_new_row_synthesis.py | 29 ++++++++++++------- 2 files changed, 25 insertions(+), 10 deletions(-) diff --git a/sdmetrics/single_table/base.py b/sdmetrics/single_table/base.py index e9138494..2f1dcd95 100644 --- a/sdmetrics/single_table/base.py +++ b/sdmetrics/single_table/base.py @@ -1,5 +1,6 @@ """Base Single Table metric class.""" +import copy from operator import attrgetter import pandas as pd @@ -103,6 +104,11 @@ def _validate_inputs(cls, real_data, synthetic_data, metadata=None): (pandas.DataFrame, pandas.DataFrame, dict): The validated data and metadata. """ + real_data = real_data.copy() + synthetic_data = synthetic_data.copy() + if metadata is not None: + metadata = copy.deepcopy(metadata) + if set(real_data.columns) != set(synthetic_data.columns): raise ValueError('`real_data` and `synthetic_data` must have the same columns') diff --git a/tests/unit/single_table/test_new_row_synthesis.py b/tests/unit/single_table/test_new_row_synthesis.py index a8dc240d..7bec691c 100644 --- a/tests/unit/single_table/test_new_row_synthesis.py +++ b/tests/unit/single_table/test_new_row_synthesis.py @@ -12,20 +12,29 @@ def test_compute(self): """Test the ``compute`` method and expect that the new row synthesis score is returned.""" # Setup real_data = pd.DataFrame({ - 'col1': [1, 2, 1, 3, 4], - 'col2': ['a', 'b', 'c', 'd', 'b'], - 'col3': [1.32, np.nan, 1.43, np.nan, 2.0], + 'col1': [0, 1, 2, 3, 4], + 'col2': [1, 2, 1, 3, 4], + 'col3': ['a', 'b', 'c', 'd', 'b'], + 'col4': [1.32, np.nan, 1.43, np.nan, 2.0], + 'col5': [51, 52, 53, 54, 55], + 'col6': ['2020-01-02', '2021-01-04', '2021-05-03', '2022-10-11', '2022-11-13'], }) synthetic_data = pd.DataFrame({ - 'col1': [1, 3, 4, 2, 2], - 'col2': ['a', 'b', 'c', 'b', 'e'], - 'col3': [1.32, 1.56, 1.21, np.nan, 1.90], + 'col1': [0, 1, 2, 3, 4], + 'col2': [1, 3, 4, 2, 2], + 'col3': ['a', 'b', 'c', 'b', 'e'], + 'col4': [1.32, 1.56, 1.21, np.nan, 1.90], + 'col5': [51, 51, 54, 55, 53], + 'col6': ['2020-01-02', '2022-11-24', '2022-06-01', '2021-04-12', '2020-12-11'], }) metadata = { 'fields': { - 'col1': {'type': 'numerical', 'subtype': 'int'}, - 'col2': {'type': 'categorical'}, - 'col3': {'type': 'numerical', 'subtype': 'float'}, + 'col1': {'type': 'id', 'subtype': 'int'}, + 'col2': {'type': 'numerical', 'subtype': 'int'}, + 'col3': {'type': 'categorical'}, + 'col4': {'type': 'numerical', 'subtype': 'float'}, + 'col5': {'type': 'categorical'}, + 'col6': {'type': 'datetime', 'format': '%Y-%m-%d'}, }, } metric = NewRowSynthesis() @@ -34,7 +43,7 @@ def test_compute(self): score = metric.compute(real_data, synthetic_data, metadata) # Assert - assert score == 0.6 + assert score == 0.8 def test_compute_with_sample_size(self): """Test the ``compute`` method with a sample size.