Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions sdmetrics/multi_table/multi_single_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,12 @@ class BNLikelihood(MultiSingleTableMetric):
single_table_metric = single_table.bayesian_network.BNLikelihood


class NewRowSynthesis(MultiSingleTableMetric):
    """MultiSingleTableMetric built on the single-table ``NewRowSynthesis`` metric."""

    # Single-table metric class that this multi-table metric wraps.
    single_table_metric = single_table.new_row_synthesis.NewRowSynthesis


class BNLogLikelihood(MultiSingleTableMetric):
"""MultiSingleTableMetric based on SingleTable BNLogLikelihood."""

Expand Down
5 changes: 5 additions & 0 deletions sdmetrics/reports/single_table/plot_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,10 +85,15 @@ def _get_similarity_correlation_matrix(score_breakdowns, columns):
Args:
score_breakdowns (dict):
Mapping of metric to the score breakdown result.
columns (list[string] or set[string]):
A list or set of column names.

Returns:
pandas.DataFrame
"""
if isinstance(columns, set):
columns = list(columns)
Comment on lines +94 to +95
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Minor: I'm not sure what types `columns` can be, but you can always cast it to a list, and the result will be one-dimensional:

In [1]: columns_list = ['a', 'b', 'c']
In [2]: columns_set = {'a', 'b', 'c'}
In [3]: columns_tuple = ('a', 'b', 'c')

In [4]: list(columns_list)
Out[4]: ['a', 'b', 'c']

In [5]: list(columns_set)
Out[5]: ['b', 'c', 'a']

In [6]: list(columns_tuple)
Out[6]: ['a', 'b', 'c']

If `columns` can be a `str`, then don't do this, as it will split the string into individual letters.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It should always be a list or a set.


similarity_correlation = pd.DataFrame(
index=columns,
columns=columns,
Expand Down
2 changes: 2 additions & 0 deletions sdmetrics/single_table/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from sdmetrics.single_table.multi_single_column import (
BoundaryAdherence, CategoryCoverage, CSTest, KSComplement, MissingValueSimilarity,
MultiSingleColumnMetric, RangeCoverage, StatisticSimilarity, TVComplement)
from sdmetrics.single_table.new_row_synthesis import NewRowSynthesis
from sdmetrics.single_table.privacy.base import CategoricalPrivacyMetric, NumericalPrivacyMetric
from sdmetrics.single_table.privacy.cap import (
CategoricalCAP, CategoricalGeneralizedCAP, CategoricalZeroCAP)
Expand Down Expand Up @@ -88,4 +89,5 @@
'StatisticSimilarity',
'TVComplement',
'RangeCoverage',
'NewRowSynthesis',
]
6 changes: 6 additions & 0 deletions sdmetrics/single_table/base.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Base Single Table metric class."""

import copy
from operator import attrgetter

import pandas as pd
Expand Down Expand Up @@ -103,6 +104,11 @@ def _validate_inputs(cls, real_data, synthetic_data, metadata=None):
(pandas.DataFrame, pandas.DataFrame, dict):
The validated data and metadata.
"""
real_data = real_data.copy()
synthetic_data = synthetic_data.copy()
if metadata is not None:
metadata = copy.deepcopy(metadata)

if set(real_data.columns) != set(synthetic_data.columns):
raise ValueError('`real_data` and `synthetic_data` must have the same columns')

Expand Down
123 changes: 123 additions & 0 deletions sdmetrics/single_table/new_row_synthesis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
"""New Row Synthesis metric for single table."""
import warnings

import pandas as pd

from sdmetrics.goal import Goal
from sdmetrics.single_table.base import SingleTableMetric


class NewRowSynthesis(SingleTableMetric):
    """NewRowSynthesis Single Table metric.

    This metric measures whether each row in the synthetic data is new,
    or whether it exactly matches a row in the real data.

    Attributes:
        name (str):
            Name to use when reports about this metric are printed.
        goal (sdmetrics.goal.Goal):
            The goal of this metric.
        min_value (Union[float, tuple[float]]):
            Minimum value or values that this metric can take.
        max_value (Union[float, tuple[float]]):
            Maximum value or values that this metric can take.
    """

    name = 'NewRowSynthesis'
    goal = Goal.MAXIMIZE
    min_value = 0
    max_value = 1

    @staticmethod
    def _has_matching_row(real_data, row, numerical_fields, numerical_match_tolerance):
        """Tell whether any row of ``real_data`` matches the given synthetic row.

        The comparison is done with boolean masks rather than a generated
        ``DataFrame.query`` expression, so it does not depend on column names
        being valid identifiers or on string values being free of quotes.

        Args:
            real_data (pandas.DataFrame):
                The real rows to search.
            row (pandas.Series):
                One synthetic row, indexed by column name.
            numerical_fields (set[str]):
                Columns compared with a relative tolerance instead of equality.
            numerical_match_tolerance (float):
                Relative tolerance applied to the synthetic value.

        Returns:
            bool:
                ``True`` if at least one real row matches on every column.
        """
        matches = pd.Series(True, index=real_data.index)
        for field in real_data.columns:
            value = row[field]
            real_column = real_data[field]
            if pd.isna(value):
                # A missing synthetic value only matches a missing real value.
                field_matches = real_column.isna()
            elif field in numerical_fields:
                # The tolerance is relative to the synthetic value.
                tolerance = abs(numerical_match_tolerance * value)
                field_matches = (real_column - value).abs() <= tolerance
            else:
                field_matches = real_column == value

            matches &= field_matches
            if not matches.any():
                # No candidate rows are left; the remaining columns cannot change that.
                return False

        return bool(matches.any())

    @classmethod
    def compute(cls, real_data, synthetic_data, metadata=None, numerical_match_tolerance=0.01,
                synthetic_sample_size=None):
        """Compute this metric.

        This metric looks for matches between the real and synthetic data for
        the compatible columns. This metric also looks for matches in missing values.

        Fields whose metadata type is ``'numerical'`` or ``'datetime'`` are compared
        with a relative tolerance (datetime columns are first converted with
        ``pandas.to_numeric``). Every other field is compared for equality.

        Args:
            real_data (pandas.DataFrame):
                The values from the real dataset.
            synthetic_data (pandas.DataFrame):
                The values from the synthetic dataset.
            metadata (dict):
                Table metadata dict. The field types are read from
                ``metadata['fields']``.
            numerical_match_tolerance (float):
                A float larger than 0 representing how close two numerical values have to be
                in order to be considered a match. Defaults to `0.01`.
            synthetic_sample_size (int):
                The number of synthetic rows to sample before computing this metric.
                Use this to speed up the computation time if you have a large amount
                of synthetic data. Note that the final score may not be as precise if
                your sample size is low. Defaults to ``None``, which does not sample,
                and uses all of the provided rows.

        Returns:
            float:
                The fraction of the evaluated synthetic rows that have no match
                in the real data.
        """
        real_data, synthetic_data, metadata = cls._validate_inputs(
            real_data, synthetic_data, metadata)

        if synthetic_sample_size is not None:
            if synthetic_sample_size > len(synthetic_data):
                warnings.warn(f'The provided `synthetic_sample_size` of {synthetic_sample_size} '
                              'is larger than the number of synthetic data rows '
                              f'({len(synthetic_data)}). Proceeding without sampling.')
            else:
                synthetic_data = synthetic_data.sample(n=synthetic_sample_size)

        numerical_fields = set()
        for field, field_meta in metadata['fields'].items():
            if field_meta['type'] == 'datetime':
                # Datetimes are matched numerically, with the same relative tolerance.
                real_data[field] = pd.to_numeric(real_data[field])
                synthetic_data[field] = pd.to_numeric(synthetic_data[field])
                numerical_fields.add(field)
            elif field_meta['type'] == 'numerical':
                numerical_fields.add(field)

        num_unique_rows = 0
        for _, row in synthetic_data.iterrows():
            if not cls._has_matching_row(
                    real_data, row, numerical_fields, numerical_match_tolerance):
                num_unique_rows += 1

        return num_unique_rows / len(synthetic_data)

    @classmethod
    def normalize(cls, raw_score):
        """Normalize the new row synthesis score.

        This delegates to the parent class ``normalize`` implementation.

        Args:
            raw_score (float):
                The value of the metric from `compute`.

        Returns:
            float:
                The normalized value of the metric
        """
        return super().normalize(raw_score)
134 changes: 134 additions & 0 deletions tests/unit/single_table/test_new_row_synthesis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
from unittest.mock import patch

import numpy as np
import pandas as pd

from sdmetrics.single_table import NewRowSynthesis


class TestNewRowSynthesis:
    """Unit tests for the single-table ``NewRowSynthesis`` metric."""

    def test_compute(self):
        """Check that ``compute`` returns the expected new row synthesis score."""
        # Setup
        real = pd.DataFrame({
            'col1': [0, 1, 2, 3, 4],
            'col2': [1, 2, 1, 3, 4],
            'col3': ['a', 'b', 'c', 'd', 'b'],
            'col4': [1.32, np.nan, 1.43, np.nan, 2.0],
            'col5': [51, 52, 53, 54, 55],
            'col6': ['2020-01-02', '2021-01-04', '2021-05-03', '2022-10-11', '2022-11-13'],
        })
        synthetic = pd.DataFrame({
            'col1': [0, 1, 2, 3, 4],
            'col2': [1, 3, 4, 2, 2],
            'col3': ['a', 'b', 'c', 'b', 'e'],
            'col4': [1.32, 1.56, 1.21, np.nan, 1.90],
            'col5': [51, 51, 54, 55, 53],
            'col6': ['2020-01-02', '2022-11-24', '2022-06-01', '2021-04-12', '2020-12-11'],
        })
        fields = {
            'col1': {'type': 'id', 'subtype': 'int'},
            'col2': {'type': 'numerical', 'subtype': 'int'},
            'col3': {'type': 'categorical'},
            'col4': {'type': 'numerical', 'subtype': 'float'},
            'col5': {'type': 'categorical'},
            'col6': {'type': 'datetime', 'format': '%Y-%m-%d'},
        }
        table_metadata = {'fields': fields}
        instance = NewRowSynthesis()

        # Run
        result = instance.compute(real, synthetic, table_metadata)

        # Assert
        assert result == 0.8

    def test_compute_with_sample_size(self):
        """Check ``compute`` when a synthetic sample size is given.

        The new row synthesis score is expected to be returned.
        """
        # Setup
        real = pd.DataFrame({
            'col1': [1, 2, 1, 3, 4],
            'col2': ['a', 'b', 'c', 'd', 'b'],
            'col3': [1.32, np.nan, 1.43, np.nan, 2.0],
        })
        synthetic = pd.DataFrame({
            'col1': [1, 3, 4, 2, 2],
            'col2': ['a', 'b', 'c', 'd', 'e'],
            'col3': [1.46, 1.56, 1.21, np.nan, 1.92],
        })
        fields = {
            'col1': {'type': 'numerical', 'subtype': 'int'},
            'col2': {'type': 'categorical'},
            'col3': {'type': 'numerical', 'subtype': 'float'},
        }
        table_metadata = {'fields': fields}
        instance = NewRowSynthesis()

        # Run
        result = instance.compute(
            real, synthetic, table_metadata, synthetic_sample_size=2)

        # Assert
        assert result == 1

    @patch('sdmetrics.single_table.new_row_synthesis.warnings')
    def test_compute_with_sample_size_too_large(self, warnings_mock):
        """Check ``compute`` when the sample size exceeds the number of synthetic rows.

        The new row synthesis score is expected to be returned, and a single
        warning is expected to be emitted.
        """
        # Setup
        real = pd.DataFrame({
            'col1': [1, 2, 1, 3, 4],
            'col2': ['a', 'b', 'c', 'd', 'b'],
            'col3': [1.32, np.nan, 1.43, np.nan, 2.0],
        })
        synthetic = pd.DataFrame({
            'col1': [1, 3, 4, 2, 2],
            'col2': ['a', 'b', 'c', 'd', 'e'],
            'col3': [1.35, 1.56, 1.21, np.nan, 1.92],
        })
        fields = {
            'col1': {'type': 'numerical', 'subtype': 'int'},
            'col2': {'type': 'categorical'},
            'col3': {'type': 'numerical', 'subtype': 'float'},
        }
        table_metadata = {'fields': fields}
        instance = NewRowSynthesis()

        # Run
        result = instance.compute(
            real, synthetic, table_metadata, synthetic_sample_size=15)

        # Assert
        expected_message = (
            'The provided `synthetic_sample_size` of 15 is larger than the number of '
            'synthetic data rows (5). Proceeding without sampling.'
        )
        assert result == 1
        warnings_mock.warn.assert_called_once_with(expected_message)

    @patch('sdmetrics.single_table.new_row_synthesis.SingleTableMetric.normalize')
    def test_normalize(self, normalize_mock):
        """Check that ``normalize`` defers to the inherited ``normalize`` method."""
        # Setup
        instance = NewRowSynthesis()
        score = 0.9

        # Run
        normalized = instance.normalize(score)

        # Assert
        normalize_mock.assert_called_once_with(score)
        assert normalized == normalize_mock.return_value