-
Notifications
You must be signed in to change notification settings - Fork 50
Add new row synthesis single table metric #226
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
6f7919f
2e56f42
79398d0
28da18f
1cbf7e8
376bec4
d97831f
b17f299
f5d43c0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -85,10 +85,15 @@ def _get_similarity_correlation_matrix(score_breakdowns, columns): | |
| Args: | ||
| score_breakdowns (dict): | ||
| Mapping of metric to the score breakdown result. | ||
| columns (list[string] or set[string]): | ||
| A list or set of column names. | ||
|
|
||
| Returns: | ||
| pandas.DataFrame | ||
| """ | ||
| if isinstance(columns, set): | ||
| columns = list(columns) | ||
|
Comment on lines
+94
to
+95
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

Minor, but not sure what

    In [1]: columns_list = ['a', 'b', 'c']
    In [2]: columns_set = {'a', 'b', 'c'}
    In [3]: columns_tuple = ('a', 'b', 'c')
    In [4]: list(columns_list)
    Out[4]: ['a', 'b', 'c']
    In [5]: list(columns_set)
    Out[5]: ['b', 'c', 'a']
    In [6]: list(columns_tuple)
    Out[6]: ['a', 'b', 'c']

If
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It should always be a list or a set. |
||
|
|
||
| similarity_correlation = pd.DataFrame( | ||
| index=columns, | ||
| columns=columns, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,123 @@ | ||
| """New Row Synthesis metric for single table.""" | ||
| import warnings | ||
|
|
||
| import pandas as pd | ||
|
|
||
| from sdmetrics.goal import Goal | ||
| from sdmetrics.single_table.base import SingleTableMetric | ||
|
|
||
|
|
||
class NewRowSynthesis(SingleTableMetric):
    """NewRowSynthesis Single Table metric.

    This metric measures whether each row in the synthetic data is new,
    or whether it matches a row in the real data.

    Attributes:
        name (str):
            Name to use when reports about this metric are printed.
        goal (sdmetrics.goal.Goal):
            The goal of this metric.
        min_value (Union[float, tuple[float]]):
            Minimum value or values that this metric can take.
        max_value (Union[float, tuple[float]]):
            Maximum value or values that this metric can take.
    """

    name = 'NewRowSynthesis'
    goal = Goal.MAXIMIZE
    min_value = 0
    max_value = 1

    @classmethod
    def compute(cls, real_data, synthetic_data, metadata=None, numerical_match_tolerance=0.01,
                synthetic_sample_size=None):
        """Compute this metric.

        This metric looks for matches between the real and synthetic data for
        the compatible columns. This metric also looks for matches in missing values.

        A synthetic row matches a real row when every column matches:

        * a missing synthetic value matches a missing real value,
        * a numerical or datetime value matches when the absolute difference is at
          most ``numerical_match_tolerance`` times the absolute synthetic value,
        * any other value matches on equality.

        Args:
            real_data (Union[numpy.ndarray, pandas.DataFrame]):
                The values from the real dataset.
            synthetic_data (Union[numpy.ndarray, pandas.DataFrame]):
                The values from the synthetic dataset.
            metadata (dict):
                Table metadata dict.
            numerical_match_tolerance (float):
                A float larger than 0 representing how close two numerical values have to be
                in order to be considered a match. The tolerance is relative to the
                synthetic value being compared. Defaults to `0.01`.
            synthetic_sample_size (int):
                The number of synthetic rows to sample before computing this metric.
                Use this to speed up the computation time if you have a large amount
                of synthetic data. Note that the final score may not be as precise if
                your sample size is low. Defaults to ``None``, which does not sample,
                and uses all of the provided rows.

        Returns:
            float:
                The new row synthesis score: the fraction of the (possibly sampled)
                synthetic rows that have no matching row in the real data.
        """
        real_data, synthetic_data, metadata = cls._validate_inputs(
            real_data, synthetic_data, metadata)

        if synthetic_sample_size is not None:
            if synthetic_sample_size > len(synthetic_data):
                warnings.warn(f'The provided `synthetic_sample_size` of {synthetic_sample_size} '
                              'is larger than the number of synthetic data rows '
                              f'({len(synthetic_data)}). Proceeding without sampling.')
            else:
                synthetic_data = synthetic_data.sample(n=synthetic_sample_size)

        # Fields compared with a tolerance. Every other field is compared by equality.
        numerical_fields = set()
        for field, field_meta in metadata['fields'].items():
            if field_meta['type'] == 'datetime':
                # NOTE(review): this writes into the frames returned by `_validate_inputs`;
                # confirm that helper returns copies so the caller's data is untouched.
                real_data[field] = pd.to_numeric(real_data[field])
                synthetic_data[field] = pd.to_numeric(synthetic_data[field])
                numerical_fields.add(field)
            elif field_meta['type'] == 'numerical':
                numerical_fields.add(field)

        num_unique_rows = 0
        for _, row in synthetic_data.iterrows():
            # Compare with boolean masks rather than a generated `query` string, so that
            # values containing quotes and column names that are not valid identifiers
            # are handled correctly.
            matches = pd.Series(True, index=real_data.index)
            for field in real_data.columns:
                value = row[field]
                real_column = real_data[field]
                if pd.isna(value):
                    matches &= real_column.isna()
                elif field in numerical_fields:
                    tolerance = abs(numerical_match_tolerance * value)
                    matches &= (real_column - value).abs() <= tolerance
                else:
                    matches &= real_column == value

            if not matches.any():
                num_unique_rows += 1

        return num_unique_rows / len(synthetic_data)

    @classmethod
    def normalize(cls, raw_score):
        """Normalize the new row synthesis score.

        This delegates to the ``normalize`` implementation of the parent class.

        Args:
            raw_score (float):
                The value of the metric from `compute`.

        Returns:
            float:
                The normalized value of the metric
        """
        return super().normalize(raw_score)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,134 @@ | ||
| from unittest.mock import patch | ||
|
|
||
| import numpy as np | ||
| import pandas as pd | ||
|
|
||
| from sdmetrics.single_table import NewRowSynthesis | ||
|
|
||
|
|
||
class TestNewRowSynthesis:
    """Unit tests for the ``NewRowSynthesis`` single table metric."""

    def test_compute(self):
        """Check ``compute`` on tables that share exactly one row.

        The first synthetic row duplicates the first real row, so four of the
        five synthetic rows are new and the expected score is ``0.8``.
        """
        # Setup
        real_table = pd.DataFrame({
            'col1': [0, 1, 2, 3, 4],
            'col2': [1, 2, 1, 3, 4],
            'col3': ['a', 'b', 'c', 'd', 'b'],
            'col4': [1.32, np.nan, 1.43, np.nan, 2.0],
            'col5': [51, 52, 53, 54, 55],
            'col6': ['2020-01-02', '2021-01-04', '2021-05-03', '2022-10-11', '2022-11-13'],
        })
        synthetic_table = pd.DataFrame({
            'col1': [0, 1, 2, 3, 4],
            'col2': [1, 3, 4, 2, 2],
            'col3': ['a', 'b', 'c', 'b', 'e'],
            'col4': [1.32, 1.56, 1.21, np.nan, 1.90],
            'col5': [51, 51, 54, 55, 53],
            'col6': ['2020-01-02', '2022-11-24', '2022-06-01', '2021-04-12', '2020-12-11'],
        })
        field_metadata = {
            'col1': {'type': 'id', 'subtype': 'int'},
            'col2': {'type': 'numerical', 'subtype': 'int'},
            'col3': {'type': 'categorical'},
            'col4': {'type': 'numerical', 'subtype': 'float'},
            'col5': {'type': 'categorical'},
            'col6': {'type': 'datetime', 'format': '%Y-%m-%d'},
        }
        table_metadata = {'fields': field_metadata}
        instance = NewRowSynthesis()

        # Run
        result = instance.compute(real_table, synthetic_table, table_metadata)

        # Assert
        assert result == 0.8

    def test_compute_with_sample_size(self):
        """Check ``compute`` when only a sample of the synthetic rows is used.

        No synthetic row matches a real row, so any sample of two rows must
        yield a score of ``1``.
        """
        # Setup
        real_table = pd.DataFrame({
            'col1': [1, 2, 1, 3, 4],
            'col2': ['a', 'b', 'c', 'd', 'b'],
            'col3': [1.32, np.nan, 1.43, np.nan, 2.0],
        })
        synthetic_table = pd.DataFrame({
            'col1': [1, 3, 4, 2, 2],
            'col2': ['a', 'b', 'c', 'd', 'e'],
            'col3': [1.46, 1.56, 1.21, np.nan, 1.92],
        })
        field_metadata = {
            'col1': {'type': 'numerical', 'subtype': 'int'},
            'col2': {'type': 'categorical'},
            'col3': {'type': 'numerical', 'subtype': 'float'},
        }
        table_metadata = {'fields': field_metadata}
        requested_rows = 2
        instance = NewRowSynthesis()

        # Run
        result = instance.compute(
            real_table,
            synthetic_table,
            table_metadata,
            synthetic_sample_size=requested_rows,
        )

        # Assert
        assert result == 1

    @patch('sdmetrics.single_table.new_row_synthesis.warnings')
    def test_compute_with_sample_size_too_large(self, warnings_mock):
        """Check ``compute`` when the sample size exceeds the synthetic row count.

        The metric is expected to warn once and then score every synthetic row.
        """
        # Setup
        real_table = pd.DataFrame({
            'col1': [1, 2, 1, 3, 4],
            'col2': ['a', 'b', 'c', 'd', 'b'],
            'col3': [1.32, np.nan, 1.43, np.nan, 2.0],
        })
        synthetic_table = pd.DataFrame({
            'col1': [1, 3, 4, 2, 2],
            'col2': ['a', 'b', 'c', 'd', 'e'],
            'col3': [1.35, 1.56, 1.21, np.nan, 1.92],
        })
        field_metadata = {
            'col1': {'type': 'numerical', 'subtype': 'int'},
            'col2': {'type': 'categorical'},
            'col3': {'type': 'numerical', 'subtype': 'float'},
        }
        table_metadata = {'fields': field_metadata}
        requested_rows = 15
        instance = NewRowSynthesis()

        # Run
        result = instance.compute(
            real_table,
            synthetic_table,
            table_metadata,
            synthetic_sample_size=requested_rows,
        )

        # Assert
        expected_message = (
            'The provided `synthetic_sample_size` of 15 is larger than the number of '
            'synthetic data rows (5). Proceeding without sampling.'
        )
        assert result == 1
        warnings_mock.warn.assert_called_once_with(expected_message)

    @patch('sdmetrics.single_table.new_row_synthesis.SingleTableMetric.normalize')
    def test_normalize(self, normalize_mock):
        """Check that ``normalize`` forwards to the inherited implementation."""
        # Setup
        instance = NewRowSynthesis()
        score_to_normalize = 0.9

        # Run
        normalized = instance.normalize(score_to_normalize)

        # Assert
        normalize_mock.assert_called_once_with(score_to_normalize)
        assert normalized == normalize_mock.return_value
Uh oh!
There was an error while loading. Please reload this page.