-
Notifications
You must be signed in to change notification settings - Fork 45
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Create single table synthesis property (#398)
* definition * unit test * integration test * docstring * modify integration test * typo * quotes * address comments * fix lint * add test error * sample size * blank line * blank line
- Loading branch information
Showing
4 changed files
with
260 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
import numpy as np | ||
import pandas as pd | ||
import plotly.express as px | ||
|
||
from sdmetrics.reports.single_table._properties import BaseSingleTableProperty | ||
from sdmetrics.single_table import NewRowSynthesis | ||
|
||
|
||
class Synthesis(BaseSingleTableProperty):
    """Synthesis property class for single table.

    This property assesses the novelty of the synthetic data over the real data.
    The ``NewRowSynthesis`` metric is computed over the real and synthetic table to
    score the proportion of new rows in the synthetic data.
    """

    metric = NewRowSynthesis

    def _generate_details(self, real_data, synthetic_data, metadata, progress_bar=None):
        """Generate the _details dataframe for the synthesis property.

        The metric is computed on at most 10000 synthetic rows. Any exception raised
        by the metric is captured and reported in an ``Error`` column instead of being
        propagated to the caller.

        Args:
            real_data (pandas.DataFrame):
                The real data.
            synthetic_data (pandas.DataFrame):
                The synthetic data.
            metadata (dict):
                The metadata of the table. It is not read by this method.
            progress_bar (tqdm.tqdm or None):
                Progress bar to advance by one step once the metric has been
                attempted, whether it succeeded or failed. Defaults to ``None``.

        Returns:
            pandas.DataFrame:
                A single-row dataframe with the columns ``Metric``, ``Score``,
                ``Num Matched Rows`` and ``Num New Rows``. An ``Error`` column is
                only present when the metric raised an exception, in which case
                the three numeric columns hold ``NaN``.
        """
        # Cap the number of synthetic rows handed to the metric.
        sample_size = min(len(synthetic_data), 10000)

        error_message = None
        try:
            breakdown = self.metric.compute_breakdown(
                real_data, synthetic_data, synthetic_sample_size=sample_size
            )
            score = breakdown['score']
            num_matched_rows = breakdown['num_matched_rows']
            num_new_rows = breakdown['num_new_rows']

        except Exception as e:
            # Deliberate best-effort: a failing metric is reported, not raised.
            score = np.nan
            num_matched_rows = np.nan
            num_new_rows = np.nan
            error_message = f'Error: {type(e).__name__} {e}'

        finally:
            # Compare against None explicitly: the truthiness of a tqdm bar depends
            # on its ``total`` and may be False or even raise for a valid bar.
            if progress_bar is not None:
                progress_bar.update()

        columns = {
            'Metric': self.metric.__name__,
            'Score': score,
            'Num Matched Rows': num_matched_rows,
            'Num New Rows': num_new_rows,
        }
        # The error column is only reported when something actually went wrong.
        if error_message is not None:
            columns['Error'] = error_message

        return pd.DataFrame(columns, index=[0])

    def get_visualization(self):
        """Create a plot to show the synthesis property.

        The plot is a donut chart splitting the synthetic rows between exact
        matches of real rows and novel rows, using the counts stored in the first
        row of ``self._details``. The average score, rounded to two decimals, is
        shown in the title.

        Returns:
            plotly.graph_objects._figure.Figure.
        """
        labels = ['Exact Matches', 'Novel Rows']
        values = list(self._details[['Num Matched Rows', 'Num New Rows']].iloc[0])

        average_score = round(self._compute_average(), 2)

        fig = px.pie(
            values=values,
            names=labels,
            color=['Exact Matches', 'Novel Rows'],
            color_discrete_map={
                'Exact Matches': '#F16141',
                'Novel Rows': '#36B37E'
            },
            hole=0.4,
            title=f'Data Diagnostic: Synthesis (Score={average_score})'
        )

        fig.update_traces(hovertemplate='<b>%{label}</b><br>%{value} rows')

        return fig
45 changes: 45 additions & 0 deletions
45
tests/integration/reports/single_table/_properties/test_synthesis.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
import numpy as np | ||
import pandas as pd | ||
|
||
from sdmetrics.demos import load_demo | ||
from sdmetrics.reports.single_table._properties import Synthesis | ||
|
||
|
||
class TestSynthesis:
    """Integration tests for the single table ``Synthesis`` property."""

    def test_get_score(self):
        """Test the ``get_score`` method.

        The synthetic table is a slice of the real demo table that shares half of
        its rows with the slice used as real data, so the expected score is 0.5.
        """
        # Setup
        demo_table, _, metadata = load_demo('single_table')
        real_slice = demo_table.iloc[:20]
        synthetic_slice = demo_table.iloc[10:30]

        # Run
        synthesis = Synthesis()
        result = synthesis.get_score(real_slice, synthetic_slice, metadata)

        # Assert
        assert result == 0.5

    def test_get_score_error(self):
        """Test the ``get_score`` method with an error.

        An empty list is passed as synthetic data so that the metric fails; the
        failure must be reported in the details instead of being raised.
        """
        # Setup
        demo_table, _, metadata = load_demo('single_table')
        real_slice = demo_table.iloc[:20]
        invalid_synthetic = []

        # Run
        synthesis = Synthesis()
        result = synthesis.get_score(real_slice, invalid_synthetic, metadata)

        # Assert
        assert pd.isna(result)

        expected_row = {
            'Metric': 'NewRowSynthesis',
            'Score': np.nan,
            'Num Matched Rows': np.nan,
            'Num New Rows': np.nan,
            'Error': "Error: AttributeError 'list' object has no attribute 'columns'"
        }
        expected_details = pd.DataFrame(expected_row, index=[0])

        pd.testing.assert_frame_equal(synthesis._details, expected_details)
118 changes: 118 additions & 0 deletions
118
tests/unit/reports/single_table/_properties/test_synthesis.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,118 @@ | ||
from unittest.mock import Mock, call, patch | ||
|
||
import numpy as np | ||
import pandas as pd | ||
|
||
from sdmetrics.reports.single_table._properties.synthesis import Synthesis | ||
|
||
|
||
class TestSynthesis:
    """Unit tests for the single table ``Synthesis`` property."""

    @patch('sdmetrics.reports.single_table._properties.synthesis.'
           'NewRowSynthesis.compute_breakdown')
    def test__generate_details(self, newrowsynthesis_mock):
        """Test the ``_generate_details`` method.

        If the synthetic data is larger than 10000 rows, then the synthetic sample size
        should be 10000. Otherwise, the synthetic sample size should be the size of the
        synthetic data.
        """
        # Setup
        real_data = Mock()
        synthetic_data = [1] * 4
        synthetic_data_20000 = [1] * 20000
        metadata = Mock()

        newrowsynthesis_mock.return_value = {
            'score': 0.25,
            'num_matched_rows': 3,
            'num_new_rows': 1,
        }

        # Run
        synthesis_property = Synthesis()
        details_large = synthesis_property._generate_details(
            real_data, synthetic_data_20000, metadata
        )
        details_small = synthesis_property._generate_details(
            real_data, synthetic_data, metadata
        )

        # Assert
        expected_calls = [
            call(real_data, synthetic_data_20000, synthetic_sample_size=10000),
            call(real_data, synthetic_data, synthetic_sample_size=4)
        ]

        # ``assert_has_calls`` accepts extra calls, so pin the exact count as well.
        assert newrowsynthesis_mock.call_count == 2
        newrowsynthesis_mock.assert_has_calls(expected_calls)

        expected__details = pd.DataFrame({
            'Metric': 'NewRowSynthesis',
            'Score': 0.25,
            'Num Matched Rows': 3,
            'Num New Rows': 1,
        }, index=[0])

        # Both runs share the mocked breakdown, so both must yield the same details.
        pd.testing.assert_frame_equal(details_large, expected__details)
        pd.testing.assert_frame_equal(details_small, expected__details)

    @patch('sdmetrics.reports.single_table._properties.synthesis.'
           'NewRowSynthesis.compute_breakdown')
    def test__generate_details_error(self, newrowsynthesis_mock):
        """Test the ``_generate_details`` method when the metric raises an error."""
        # Setup
        newrowsynthesis_mock.side_effect = ValueError('Mock Error')
        real_data = Mock()
        synthetic_data = [1] * 4
        metadata = Mock()

        # Run
        synthesis_property = Synthesis()
        details = synthesis_property._generate_details(real_data, synthetic_data, metadata)

        # Assert
        expected_calls_synthesis = [
            call(real_data, synthetic_data, synthetic_sample_size=4),
        ]

        assert newrowsynthesis_mock.call_count == 1
        newrowsynthesis_mock.assert_has_calls(expected_calls_synthesis)

        expected_details = pd.DataFrame({
            'Metric': 'NewRowSynthesis',
            'Score': np.nan,
            'Num Matched Rows': np.nan,
            'Num New Rows': np.nan,
            'Error': 'Error: ValueError Mock Error'
        }, index=[0])

        pd.testing.assert_frame_equal(details, expected_details)

    @patch('sdmetrics.reports.single_table._properties.synthesis.px')
    def test_get_visualization(self, mock_px):
        """Test the ``get_visualization`` method.

        The plotly express module is mocked, so this checks the arguments used to
        build the pie chart and the hover template applied to it.
        """
        # Setup
        synthesis_property = Synthesis()
        synthesis_property._details = pd.DataFrame({
            'Metric': 'NewRowSynthesis',
            'Score': 0.25,
            'Num Matched Rows': 3,
            'Num New Rows': 1,
        }, index=[0])

        mock_pie = Mock()
        mock_px.pie.return_value = mock_pie

        # Run
        synthesis_property.get_visualization()

        # Assert
        mock_px.pie.assert_called_once_with(
            values=[3, 1],
            names=['Exact Matches', 'Novel Rows'],
            color=['Exact Matches', 'Novel Rows'],
            color_discrete_map={
                'Exact Matches': '#F16141',
                'Novel Rows': '#36B37E'
            },
            hole=0.4,
            title='Data Diagnostic: Synthesis (Score=0.25)'
        )

        mock_pie.update_traces.assert_called_once_with(
            hovertemplate='<b>%{label}</b><br>%{value} rows'
        )