diff --git a/sdmetrics/multi_table/multi_single_table.py b/sdmetrics/multi_table/multi_single_table.py
index c3eff8cb..c041855f 100644
--- a/sdmetrics/multi_table/multi_single_table.py
+++ b/sdmetrics/multi_table/multi_single_table.py
@@ -241,6 +241,12 @@ class BNLikelihood(MultiSingleTableMetric):
 
     single_table_metric = single_table.bayesian_network.BNLikelihood
 
 
+class NewRowSynthesis(MultiSingleTableMetric):
+    """MultiSingleTableMetric based on SingleTable NewRowSynthesis."""
+
+    single_table_metric = single_table.new_row_synthesis.NewRowSynthesis
+
+
 class BNLogLikelihood(MultiSingleTableMetric):
     """MultiSingleTableMetric based on SingleTable BNLogLikelihood."""
diff --git a/sdmetrics/reports/single_table/plot_utils.py b/sdmetrics/reports/single_table/plot_utils.py
index 586e1cb7..1cfb0732 100644
--- a/sdmetrics/reports/single_table/plot_utils.py
+++ b/sdmetrics/reports/single_table/plot_utils.py
@@ -85,10 +85,15 @@ def _get_similarity_correlation_matrix(score_breakdowns, columns):
     Args:
         score_breakdowns (dict):
             Mapping of metric to the score breakdown result.
+        columns (list[string] or set[string]):
+            A list or set of column names.
 
     Returns:
         pandas.DataFrame
     """
+    if isinstance(columns, set):
+        columns = list(columns)
+
     similarity_correlation = pd.DataFrame(
         index=columns,
         columns=columns,
diff --git a/sdmetrics/single_table/__init__.py b/sdmetrics/single_table/__init__.py
index fc6beebf..35704626 100644
--- a/sdmetrics/single_table/__init__.py
+++ b/sdmetrics/single_table/__init__.py
@@ -22,6 +22,7 @@ from sdmetrics.single_table.multi_single_column import (
     BoundaryAdherence, CategoryCoverage, CSTest, KSComplement, MissingValueSimilarity,
     MultiSingleColumnMetric, RangeCoverage, StatisticSimilarity, TVComplement)
+from sdmetrics.single_table.new_row_synthesis import NewRowSynthesis
 from sdmetrics.single_table.privacy.base import CategoricalPrivacyMetric, NumericalPrivacyMetric
 from sdmetrics.single_table.privacy.cap import (
     CategoricalCAP, CategoricalGeneralizedCAP, CategoricalZeroCAP)
@@ -88,4 +89,5 @@
     'StatisticSimilarity',
     'TVComplement',
     'RangeCoverage',
+    'NewRowSynthesis',
 ]
diff --git a/sdmetrics/single_table/base.py b/sdmetrics/single_table/base.py
index e9138494..2f1dcd95 100644
--- a/sdmetrics/single_table/base.py
+++ b/sdmetrics/single_table/base.py
@@ -1,5 +1,6 @@
 """Base Single Table metric class."""
 
+import copy
 from operator import attrgetter
 
 import pandas as pd
@@ -103,6 +104,11 @@ def _validate_inputs(cls, real_data, synthetic_data, metadata=None):
             (pandas.DataFrame, pandas.DataFrame, dict):
                 The validated data and metadata.
         """
+        real_data = real_data.copy()
+        synthetic_data = synthetic_data.copy()
+        if metadata is not None:
+            metadata = copy.deepcopy(metadata)
+
         if set(real_data.columns) != set(synthetic_data.columns):
             raise ValueError('`real_data` and `synthetic_data` must have the same columns')
 
diff --git a/sdmetrics/single_table/new_row_synthesis.py b/sdmetrics/single_table/new_row_synthesis.py
new file mode 100644
index 00000000..854a7db7
--- /dev/null
+++ b/sdmetrics/single_table/new_row_synthesis.py
@@ -0,0 +1,123 @@
+"""New Row Synthesis metric for single table."""
+import warnings
+
+import pandas as pd
+
+from sdmetrics.goal import Goal
+from sdmetrics.single_table.base import SingleTableMetric
+
+
+class NewRowSynthesis(SingleTableMetric):
+    """NewRowSynthesis Single Table metric.
+
+    This metric measures whether each row in the synthetic data is new,
+    or whether it exactly matches a row in the real data.
+
+    Attributes:
+        name (str):
+            Name to use when reports about this metric are printed.
+        goal (sdmetrics.goal.Goal):
+            The goal of this metric.
+        min_value (Union[float, tuple[float]]):
+            Minimum value or values that this metric can take.
+        max_value (Union[float, tuple[float]]):
+            Maximum value or values that this metric can take.
+    """
+
+    name = 'NewRowSynthesis'
+    goal = Goal.MAXIMIZE
+    min_value = 0
+    max_value = 1
+
+    @classmethod
+    def compute(cls, real_data, synthetic_data, metadata=None, numerical_match_tolerance=0.01,
+                synthetic_sample_size=None):
+        """Compute this metric.
+
+        This metric looks for matches between the real and synthetic data for
+        the compatible columns. This metric also looks for matches in missing values.
+
+        Args:
+            real_data (Union[numpy.ndarray, pandas.DataFrame]):
+                The values from the real dataset.
+            synthetic_data (Union[numpy.ndarray, pandas.DataFrame]):
+                The values from the synthetic dataset.
+            metadata (dict):
+                Table metadata dict.
+            numerical_match_tolerance (float):
+                A float larger than 0 representing how close two numerical values have to be
+                in order to be considered a match. Defaults to ``0.01``.
+            synthetic_sample_size (int):
+                The number of synthetic rows to sample before computing this metric.
+                Use this to speed up the computation time if you have a large amount
+                of synthetic data. Note that the final score may not be as precise if
+                your sample size is low. Defaults to ``None``, which does not sample,
+                and uses all of the provided rows.
+
+        Returns:
+            float:
+                The new row synthesis score.
+        """
+        real_data, synthetic_data, metadata = cls._validate_inputs(
+            real_data, synthetic_data, metadata)
+
+        if synthetic_sample_size is not None:
+            if synthetic_sample_size > len(synthetic_data):
+                warnings.warn(f'The provided `synthetic_sample_size` of {synthetic_sample_size} '
+                              'is larger than the number of synthetic data rows '
+                              f'({len(synthetic_data)}). Proceeding without sampling.')
+            else:
+                synthetic_data = synthetic_data.sample(n=synthetic_sample_size)
+
+        numerical_fields = []
+        discrete_fields = []
+        for field, field_meta in metadata['fields'].items():
+            if field_meta['type'] == 'datetime':
+                real_data[field] = pd.to_numeric(pd.to_datetime(real_data[field]))
+                synthetic_data[field] = pd.to_numeric(pd.to_datetime(synthetic_data[field]))
+                numerical_fields.append(field)
+            elif field_meta['type'] == 'numerical':
+                numerical_fields.append(field)
+            else:
+                discrete_fields.append(field)
+
+        num_unique_rows = 0
+        for index, row in synthetic_data.iterrows():
+            row_filter = []
+            for field in real_data.columns:
+                if pd.isna(row[field]):
+                    field_filter = f'{field}.isnull()'
+                elif field in numerical_fields:
+                    field_filter = (
+                        f'abs({field} - {row[field]}) <= '
+                        f'{abs(numerical_match_tolerance * row[field])}'
+                    )
+                else:
+                    if real_data[field].dtype == 'O':
+                        field_filter = f"{field} == '{row[field]}'"
+                    else:
+                        field_filter = f'{field} == {row[field]}'
+
+                row_filter.append(field_filter)
+
+            matches = real_data.query(' and '.join(row_filter))
+            if matches is None or matches.empty:
+                num_unique_rows += 1
+
+        return num_unique_rows / len(synthetic_data)
+
+    @classmethod
+    def normalize(cls, raw_score):
+        """Return the ``raw_score`` as is.
+
+        This metric is already bounded between 0 and 1, so no rescaling is needed.
+
+        Args:
+            raw_score (float):
+                The value of the metric from ``compute``.
+
+        Returns:
+            float:
+                The normalized value of the metric.
+        """
+        return super().normalize(raw_score)
diff --git a/tests/unit/single_table/test_new_row_synthesis.py b/tests/unit/single_table/test_new_row_synthesis.py
new file mode 100644
index 00000000..7bec691c
--- /dev/null
+++ b/tests/unit/single_table/test_new_row_synthesis.py
@@ -0,0 +1,134 @@
+from unittest.mock import patch
+
+import numpy as np
+import pandas as pd
+
+from sdmetrics.single_table import NewRowSynthesis
+
+
+class TestNewRowSynthesis:
+
+    def test_compute(self):
+        """Test the ``compute`` method and expect that the new row synthesis score is returned."""
+        # Setup
+        real_data = pd.DataFrame({
+            'col1': [0, 1, 2, 3, 4],
+            'col2': [1, 2, 1, 3, 4],
+            'col3': ['a', 'b', 'c', 'd', 'b'],
+            'col4': [1.32, np.nan, 1.43, np.nan, 2.0],
+            'col5': [51, 52, 53, 54, 55],
+            'col6': ['2020-01-02', '2021-01-04', '2021-05-03', '2022-10-11', '2022-11-13'],
+        })
+        synthetic_data = pd.DataFrame({
+            'col1': [0, 1, 2, 3, 4],
+            'col2': [1, 3, 4, 2, 2],
+            'col3': ['a', 'b', 'c', 'b', 'e'],
+            'col4': [1.32, 1.56, 1.21, np.nan, 1.90],
+            'col5': [51, 51, 54, 55, 53],
+            'col6': ['2020-01-02', '2022-11-24', '2022-06-01', '2021-04-12', '2020-12-11'],
+        })
+        metadata = {
+            'fields': {
+                'col1': {'type': 'id', 'subtype': 'int'},
+                'col2': {'type': 'numerical', 'subtype': 'int'},
+                'col3': {'type': 'categorical'},
+                'col4': {'type': 'numerical', 'subtype': 'float'},
+                'col5': {'type': 'categorical'},
+                'col6': {'type': 'datetime', 'format': '%Y-%m-%d'},
+            },
+        }
+        metric = NewRowSynthesis()
+
+        # Run
+        score = metric.compute(real_data, synthetic_data, metadata)
+
+        # Assert
+        assert score == 0.8
+
+    def test_compute_with_sample_size(self):
+        """Test the ``compute`` method with a sample size.
+
+        Expect that the new row synthesis score is returned.
+        """
+        # Setup
+        real_data = pd.DataFrame({
+            'col1': [1, 2, 1, 3, 4],
+            'col2': ['a', 'b', 'c', 'd', 'b'],
+            'col3': [1.32, np.nan, 1.43, np.nan, 2.0],
+        })
+        synthetic_data = pd.DataFrame({
+            'col1': [1, 3, 4, 2, 2],
+            'col2': ['a', 'b', 'c', 'd', 'e'],
+            'col3': [1.46, 1.56, 1.21, np.nan, 1.92],
+        })
+        metadata = {
+            'fields': {
+                'col1': {'type': 'numerical', 'subtype': 'int'},
+                'col2': {'type': 'categorical'},
+                'col3': {'type': 'numerical', 'subtype': 'float'},
+            },
+        }
+        sample_size = 2
+        metric = NewRowSynthesis()
+
+        # Run
+        score = metric.compute(
+            real_data, synthetic_data, metadata, synthetic_sample_size=sample_size)
+
+        # Assert
+        assert score == 1
+
+    @patch('sdmetrics.single_table.new_row_synthesis.warnings')
+    def test_compute_with_sample_size_too_large(self, warnings_mock):
+        """Test the ``compute`` method with a sample size larger than the number of rows.
+
+        Expect that the new row synthesis score is returned. Expect a warning to be raised.
+ """ + # Setup + real_data = pd.DataFrame({ + 'col1': [1, 2, 1, 3, 4], + 'col2': ['a', 'b', 'c', 'd', 'b'], + 'col3': [1.32, np.nan, 1.43, np.nan, 2.0], + }) + synthetic_data = pd.DataFrame({ + 'col1': [1, 3, 4, 2, 2], + 'col2': ['a', 'b', 'c', 'd', 'e'], + 'col3': [1.35, 1.56, 1.21, np.nan, 1.92], + }) + metadata = { + 'fields': { + 'col1': {'type': 'numerical', 'subtype': 'int'}, + 'col2': {'type': 'categorical'}, + 'col3': {'type': 'numerical', 'subtype': 'float'}, + }, + } + sample_size = 15 + metric = NewRowSynthesis() + + # Run + score = metric.compute( + real_data, synthetic_data, metadata, synthetic_sample_size=sample_size) + + # Assert + assert score == 1 + warnings_mock.warn.assert_called_once_with( + 'The provided `synthetic_sample_size` of 15 is larger than the number of ' + 'synthetic data rows (5). Proceeding without sampling.' + ) + + @patch('sdmetrics.single_table.new_row_synthesis.SingleTableMetric.normalize') + def test_normalize(self, normalize_mock): + """Test the ``normalize`` method. + + Expect that the inherited ``normalize`` method is called. + """ + # Setup + metric = NewRowSynthesis() + raw_score = 0.9 + + # Run + result = metric.normalize(raw_score) + + # Assert + normalize_mock.assert_called_once_with(raw_score) + assert result == normalize_mock.return_value