Skip to content

Commit

Permalink
Create single table synthesis property (#398)
Browse files Browse the repository at this point in the history
* definition

* unit test

* integration test

* docstring

* modify integration test

* typo

* quotes

* address comments

* fix lint

* add test error

* sample size

* blank line

* blank line
  • Loading branch information
R-Palazzo committed Aug 23, 2023
1 parent e6450eb commit 85cdba8
Show file tree
Hide file tree
Showing 4 changed files with 260 additions and 0 deletions.
2 changes: 2 additions & 0 deletions sdmetrics/reports/single_table/_properties/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,13 @@
from sdmetrics.reports.single_table._properties.column_pair_trends import ColumnPairTrends
from sdmetrics.reports.single_table._properties.column_shapes import ColumnShapes
from sdmetrics.reports.single_table._properties.coverage import Coverage
from sdmetrics.reports.single_table._properties.synthesis import Synthesis

# Names re-exported as the public API of this package (what ``import *`` exposes).
__all__ = [
    'BaseSingleTableProperty',
    'ColumnShapes',
    'ColumnPairTrends',
    'Coverage',
    'Boundary',
    'Synthesis',
]
95 changes: 95 additions & 0 deletions sdmetrics/reports/single_table/_properties/synthesis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
import numpy as np
import pandas as pd
import plotly.express as px

from sdmetrics.reports.single_table._properties import BaseSingleTableProperty
from sdmetrics.single_table import NewRowSynthesis


class Synthesis(BaseSingleTableProperty):
    """Synthesis property class for single table.

    This property assesses the novelty of the synthetic data over the real data.
    The ``NewRowSynthesis`` metric is computed over the real and synthetic table to
    score the proportion of new rows in the synthetic data.
    """

    metric = NewRowSynthesis

    def _generate_details(self, real_data, synthetic_data, metadata, progress_bar=None):
        """Generate the _details dataframe for the synthesis property.

        Any exception raised by the metric is caught and reported in the returned
        frame instead of being propagated.

        Args:
            real_data (pandas.DataFrame):
                The real data.
            synthetic_data (pandas.DataFrame):
                The synthetic data.
            metadata (dict):
                The metadata of the table. Not used by this method.
            progress_bar (tqdm.tqdm or None):
                Progress bar to advance by one step once the metric has run.
                Defaults to ``None``, in which case no progress is reported.

        Returns:
            pandas.DataFrame:
                A single-row frame with the columns ``Metric``, ``Score``,
                ``Num Matched Rows`` and ``Num New Rows``. An ``Error`` column is
                present only when the metric raised.
        """
        name = self.metric.__name__
        error_message = np.nan

        # Never ask the metric to sample more than 10000 synthetic rows.
        sample_size = min(len(synthetic_data), 10000)
        try:
            score_breakdown = self.metric.compute_breakdown(
                real_data, synthetic_data, synthetic_sample_size=sample_size
            )
            score = score_breakdown['score']
            num_matched_rows = score_breakdown['num_matched_rows']
            num_new_rows = score_breakdown['num_new_rows']

        except Exception as e:
            # Best effort by design: a failing metric yields NaN values plus a
            # readable message rather than aborting the caller.
            score = np.nan
            num_matched_rows = np.nan
            num_new_rows = np.nan
            error_message = f'Error: {type(e).__name__} {e}'

        finally:
            # Compare against None instead of relying on truthiness: a progress
            # bar object may define its own ``__bool__`` (tqdm derives it from
            # ``total``), which would skip the update or raise.
            if progress_bar is not None:
                progress_bar.update()

        result = pd.DataFrame({
            'Metric': name,
            'Score': score,
            'Num Matched Rows': num_matched_rows,
            'Num New Rows': num_new_rows,
            'Error': error_message,
        }, index=[0])

        # Only keep the ``Error`` column when there is an error to report.
        if pd.isna(result['Error'].iloc[0]):
            result = result.drop('Error', axis=1)

        return result

    def get_visualization(self):
        """Create a plot to show the synthesis property.

        The plot is a donut chart splitting the rows recorded in ``self._details``
        into exact matches and novel rows; the rounded average score is shown in
        the title.

        Returns:
            plotly.graph_objects._figure.Figure.
        """
        labels = ['Exact Matches', 'Novel Rows']
        values = list(self._details[['Num Matched Rows', 'Num New Rows']].iloc[0])

        average_score = round(self._compute_average(), 2)

        fig = px.pie(
            values=values,
            names=labels,
            # Color each slice by its own label so the map below applies.
            color=labels,
            color_discrete_map={
                'Exact Matches': '#F16141',
                'Novel Rows': '#36B37E'
            },
            hole=0.4,
            title=f'Data Diagnostic: Synthesis (Score={average_score})'
        )

        fig.update_traces(hovertemplate='<b>%{label}</b><br>%{value} rows')

        return fig
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import numpy as np
import pandas as pd

from sdmetrics.demos import load_demo
from sdmetrics.reports.single_table._properties import Synthesis


class TestSynthesis:

    def test_get_score(self):
        """Check ``get_score`` on two overlapping slices of the demo table.

        The real slice covers positions 0-19 and the synthetic slice covers
        positions 10-29 of the same table; the expected score is 0.5.
        """
        # Setup
        demo_table, _, metadata = load_demo('single_table')
        real_slice = demo_table.iloc[:20]
        synthetic_slice = demo_table.iloc[10:30]
        prop = Synthesis()

        # Run
        result = prop.get_score(real_slice, synthetic_slice, metadata)

        # Assert
        assert result == 0.5

    def test_get_score_error(self):
        """Check ``get_score`` when the metric fails.

        An empty list is passed as synthetic data so that the metric raises; the
        score must be NaN and the error must be recorded in ``_details``.
        """
        # Setup
        demo_table, _, metadata = load_demo('single_table')
        prop = Synthesis()

        # Run
        result = prop.get_score(demo_table.iloc[:20], [], metadata)

        # Assert
        assert pd.isna(result)

        expected_row = {
            'Metric': 'NewRowSynthesis',
            'Score': np.nan,
            'Num Matched Rows': np.nan,
            'Num New Rows': np.nan,
            'Error': "Error: AttributeError 'list' object has no attribute 'columns'"
        }
        expected_frame = pd.DataFrame(expected_row, index=[0])

        pd.testing.assert_frame_equal(prop._details, expected_frame)
118 changes: 118 additions & 0 deletions tests/unit/reports/single_table/_properties/test_synthesis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
from unittest.mock import Mock, call, patch

import numpy as np
import pandas as pd

from sdmetrics.reports.single_table._properties.synthesis import Synthesis


class TestSynthesis:

    @patch('sdmetrics.reports.single_table._properties.synthesis.'
           'NewRowSynthesis.compute_breakdown')
    def test__generate_details(self, newrowsynthesis_mock):
        """Test the ``_generate_details`` method.

        If the synthetic data is larger than 10000 rows, then the synthetic sample size
        should be 10000. Otherwise, the synthetic sample size should be the size of the
        synthetic data.
        """
        # Setup
        real_data = Mock()
        synthetic_data = [1] * 4
        synthetic_data_20000 = [1] * 20000
        metadata = Mock()

        newrowsynthesis_mock.return_value = {
            'score': 0.25,
            'num_matched_rows': 3,
            'num_new_rows': 1,
        }

        # Run
        synthesis_property = Synthesis()
        details_20000 = synthesis_property._generate_details(
            real_data, synthetic_data_20000, metadata
        )
        details = synthesis_property._generate_details(real_data, synthetic_data, metadata)

        # Assert
        expected_calls = [
            call(real_data, synthetic_data_20000, synthetic_sample_size=10000),
            call(real_data, synthetic_data, synthetic_sample_size=4)
        ]

        newrowsynthesis_mock.assert_has_calls(expected_calls)
        # ``assert_has_calls`` tolerates extra calls, so pin the exact count too.
        assert newrowsynthesis_mock.call_count == 2

        expected__details = pd.DataFrame({
            'Metric': 'NewRowSynthesis',
            'Score': 0.25,
            'Num Matched Rows': 3,
            'Num New Rows': 1,
        }, index=[0])

        # The mock returns the same breakdown for both calls, so both results
        # must match the expected frame.
        pd.testing.assert_frame_equal(details_20000, expected__details)
        pd.testing.assert_frame_equal(details, expected__details)

    @patch('sdmetrics.reports.single_table._properties.synthesis.'
           'NewRowSynthesis.compute_breakdown')
    def test__generate_details_error(self, newrowsynthesis_mock):
        """Test the ``_generate_details`` method when the metric raises an error."""
        # Setup
        newrowsynthesis_mock.side_effect = ValueError('Mock Error')
        real_data = Mock()
        synthetic_data = [1] * 4
        metadata = Mock()

        # Run
        synthesis_property = Synthesis()
        details = synthesis_property._generate_details(real_data, synthetic_data, metadata)

        # Assert
        newrowsynthesis_mock.assert_called_once_with(
            real_data, synthetic_data, synthetic_sample_size=4
        )

        expected_details = pd.DataFrame({
            'Metric': 'NewRowSynthesis',
            'Score': np.nan,
            'Num Matched Rows': np.nan,
            'Num New Rows': np.nan,
            'Error': 'Error: ValueError Mock Error'
        }, index=[0])

        pd.testing.assert_frame_equal(details, expected_details)

    @patch('sdmetrics.reports.single_table._properties.synthesis.px')
    def test_get_visualization(self, mock_px):
        """Test the ``get_visualization`` method."""
        # Setup
        synthesis_property = Synthesis()
        synthesis_property._details = pd.DataFrame({
            'Metric': 'NewRowSynthesis',
            'Score': 0.25,
            'Num Matched Rows': 3,
            'Num New Rows': 1,
        }, index=[0])

        mock_pie = Mock()
        mock_px.pie.return_value = mock_pie

        # Run
        fig = synthesis_property.get_visualization()

        # Assert
        mock_px.pie.assert_called_once_with(
            values=[3, 1],
            names=['Exact Matches', 'Novel Rows'],
            color=['Exact Matches', 'Novel Rows'],
            color_discrete_map={
                'Exact Matches': '#F16141',
                'Novel Rows': '#36B37E'
            },
            hole=0.4,
            title='Data Diagnostic: Synthesis (Score=0.25)'
        )

        mock_pie.update_traces.assert_called_once_with(
            hovertemplate='<b>%{label}</b><br>%{value} rows'
        )

        # The figure built by ``px.pie`` is what the method must hand back.
        assert fig is mock_pie

0 comments on commit 85cdba8

Please sign in to comment.