Create single table synthesis property (#398)

* definition * unit test * integration test * docstring * modify integration test * typo * quotes * address comments * fix lint * add test error * sample size * blank line * blank line
sdv-dev · Aug 23, 2023 · 85cdba8 · 85cdba8
1 parent e6450eb
commit 85cdba8
Show file tree

Hide file tree

Showing 4 changed files with 260 additions and 0 deletions.
diff --git a/sdmetrics/reports/single_table/_properties/__init__.py b/sdmetrics/reports/single_table/_properties/__init__.py
@@ -5,11 +5,13 @@
 from sdmetrics.reports.single_table._properties.column_pair_trends import ColumnPairTrends
 from sdmetrics.reports.single_table._properties.column_shapes import ColumnShapes
 from sdmetrics.reports.single_table._properties.coverage import Coverage
+from sdmetrics.reports.single_table._properties.synthesis import Synthesis
 
 __all__ = [
     'BaseSingleTableProperty',
     'ColumnShapes',
     'ColumnPairTrends',
     'Coverage',
     'Boundary',
+    'Synthesis',
 ]
diff --git a/sdmetrics/reports/single_table/_properties/synthesis.py b/sdmetrics/reports/single_table/_properties/synthesis.py
@@ -0,0 +1,95 @@
+import numpy as np
+import pandas as pd
+import plotly.express as px
+
+from sdmetrics.reports.single_table._properties import BaseSingleTableProperty
+from sdmetrics.single_table import NewRowSynthesis
+
+
+class Synthesis(BaseSingleTableProperty):
+    """Synthesis property class for single table.
+
+    This property assesses the novelty of the synthetic data over the real data.
+    The ``NewRowSynthesis`` metric is computed over the real and synthetic table to
+    score the proportion of new rows in the synthetic data.
+    """
+
+    metric = NewRowSynthesis
+
+    def _generate_details(self, real_data, synthetic_data, metadata, progress_bar=None):
+        """Generate the _details dataframe for the synthesis property.
+
+        Args:
+            real_data (pandas.DataFrame):
+                The real data.
+            synthetic_data (pandas.DataFrame):
+                The synthetic data.
+            metadata (dict):
+                The metadata of the table.
+            progress_bar (tqdm.tqdm or None):
+                The progress bar to use. Defaults to None.
+
+        Returns:
+            pandas.DataFrame.
+        """
+        name = self.metric.__name__
+        error_message = np.nan
+
+        sample_size = len(synthetic_data) if len(synthetic_data) < 10000 else 10000
+        try:
+            score_breakdown = self.metric.compute_breakdown(
+                real_data, synthetic_data, synthetic_sample_size=sample_size
+            )
+            score = score_breakdown['score']
+            num_matched_rows = score_breakdown['num_matched_rows']
+            num_new_rows = score_breakdown['num_new_rows']
+
+        except Exception as e:
+            score = np.nan
+            num_matched_rows = np.nan
+            num_new_rows = np.nan
+            error_message = f'Error: {type(e).__name__} {e}'
+
+        finally:
+            if progress_bar:
+                progress_bar.update()
+
+        result = pd.DataFrame({
+            'Metric': name,
+            'Score': score,
+            'Num Matched Rows': num_matched_rows,
+            'Num New Rows': num_new_rows,
+            'Error': error_message,
+        }, index=[0])
+
+        if pd.isna(result['Error'].iloc[0]):
+            result = result.drop('Error', axis=1)
+
+        return result
+
+    def get_visualization(self):
+        """Create a plot to show the synthesis property.
+
+        Returns:
+            plotly.graph_objects._figure.Figure.
+        """
+        labels = ['Exact Matches', 'Novel Rows']
+        values = list(self._details[['Num Matched Rows', 'Num New Rows']].iloc[0])
+
+        average_score = round(self._compute_average(), 2)
+
+        fig = px.pie(
+            values=values,
+            names=labels,
+            color=['Exact Matches', 'Novel Rows'],
+            color_discrete_map={
+                'Exact Matches': '#F16141',
+                'Novel Rows': '#36B37E'
+            },
+            hole=0.4,
+            title=f'Data Diagnostic: Synthesis (Score={average_score})'
+        )
+
+        fig.update_traces(hovertemplate='<b>%{label}</b><br>%{value} rows')
+
+        return fig
diff --git a/tests/integration/reports/single_table/_properties/test_synthesis.py b/tests/integration/reports/single_table/_properties/test_synthesis.py
@@ -0,0 +1,45 @@
+import numpy as np
+import pandas as pd
+
+from sdmetrics.demos import load_demo
+from sdmetrics.reports.single_table._properties import Synthesis
+
+
+class TestSynthesis:
+
+    def test_get_score(self):
+        """Test the ``get_score`` method."""
+        # Setup
+        real_data, _, metadata = load_demo('single_table')
+
+        # Run
+        synthesis_property = Synthesis()
+        score = synthesis_property.get_score(real_data.iloc[:20], real_data.iloc[10:30], metadata)
+
+        # Assert
+        assert score == 0.5
+
+    def test_get_score_error(self):
+        """Test the ``get_score`` method with an error.
+
+        Pass an empty list as the synthetic data to trigger an error.
+        """
+        # Setup
+        real_data, _, metadata = load_demo('single_table')
+
+        # Run
+        synthesis_property = Synthesis()
+        score = synthesis_property.get_score(real_data.iloc[:20], [], metadata)
+
+        # Assert
+        assert pd.isna(score)
+
+        expected_details = pd.DataFrame({
+            'Metric': 'NewRowSynthesis',
+            'Score': np.nan,
+            'Num Matched Rows': np.nan,
+            'Num New Rows': np.nan,
+            'Error': "Error: AttributeError 'list' object has no attribute 'columns'"
+        }, index=[0])
+
+        pd.testing.assert_frame_equal(synthesis_property._details, expected_details)
diff --git a/tests/unit/reports/single_table/_properties/test_synthesis.py b/tests/unit/reports/single_table/_properties/test_synthesis.py
@@ -0,0 +1,118 @@
+from unittest.mock import Mock, call, patch
+
+import numpy as np
+import pandas as pd
+
+from sdmetrics.reports.single_table._properties.synthesis import Synthesis
+
+
+class TestSynthesis:
+
+    @patch('sdmetrics.reports.single_table._properties.synthesis.'
+           'NewRowSynthesis.compute_breakdown')
+    def test__generate_details(self, newrowsynthesis_mock):
+        """Test the ``_generate_details`` method.
+
+        If the synthetic data is larger than 10000 rows, then the synthetic sample size
+        should be 10000. Otherwise, the synthetic sample size should be the size of the
+        synthetic data.
+        """
+        # Setup
+        real_data = Mock()
+        synthetic_data = [1] * 4
+        synthetic_data_20000 = [1] * 20000
+        metadata = Mock()
+
+        newrowsynthesis_mock.return_value = {
+            'score': 0.25,
+            'num_matched_rows': 3,
+            'num_new_rows': 1,
+        }
+
+        # Run
+        synthesis_property = Synthesis()
+        details = synthesis_property._generate_details(real_data, synthetic_data_20000, metadata)
+        details = synthesis_property._generate_details(real_data, synthetic_data, metadata)
+
+        # Assert
+        expected_calls = [
+            call(real_data, synthetic_data_20000, synthetic_sample_size=10000),
+            call(real_data, synthetic_data, synthetic_sample_size=4)
+        ]
+
+        newrowsynthesis_mock.assert_has_calls(expected_calls)
+
+        expected__details = pd.DataFrame({
+            'Metric': 'NewRowSynthesis',
+            'Score': 0.25,
+            'Num Matched Rows': 3,
+            'Num New Rows': 1,
+        }, index=[0])
+
+        pd.testing.assert_frame_equal(details, expected__details)
+
+    @patch('sdmetrics.reports.single_table._properties.synthesis.'
+           'NewRowSynthesis.compute_breakdown')
+    def test__generate_details_error(self, newrowsynthesis_mock):
+        """Test the ``_generate_details`` method when the metric raises an error."""
+        # Setup
+        newrowsynthesis_mock.side_effect = ValueError('Mock Error')
+        real_data = Mock()
+        synthetic_data = [1] * 4
+        metadata = Mock()
+
+        # Run
+        synthesis_property = Synthesis()
+        details = synthesis_property._generate_details(real_data, synthetic_data, metadata)
+
+        # Assert
+        expected_calls_synthesis = [
+            call(real_data, synthetic_data, synthetic_sample_size=4),
+        ]
+
+        newrowsynthesis_mock.assert_has_calls(expected_calls_synthesis)
+
+        expected_details = pd.DataFrame({
+            'Metric': 'NewRowSynthesis',
+            'Score': np.nan,
+            'Num Matched Rows': np.nan,
+            'Num New Rows': np.nan,
+            'Error': 'Error: ValueError Mock Error'
+        }, index=[0])
+
+        pd.testing.assert_frame_equal(details, expected_details)
+
+    @patch('sdmetrics.reports.single_table._properties.synthesis.px')
+    def test_get_visualization(self, mock_px):
+        """Test the ``get_visualization`` method."""
+        # Setup
+        synthesis_property = Synthesis()
+        synthesis_property._details = pd.DataFrame({
+            'Metric': 'NewRowSynthesis',
+            'Score': 0.25,
+            'Num Matched Rows': 3,
+            'Num New Rows': 1,
+        }, index=[0])
+
+        mock_pie = Mock()
+        mock_px.pie.return_value = mock_pie
+
+        # Run
+        synthesis_property.get_visualization()
+
+        # Assert
+        mock_px.pie.assert_called_once_with(
+            values=[3, 1],
+            names=['Exact Matches', 'Novel Rows'],
+            color=['Exact Matches', 'Novel Rows'],
+            color_discrete_map={
+                'Exact Matches': '#F16141',
+                'Novel Rows': '#36B37E'
+            },
+            hole=0.4,
+            title='Data Diagnostic: Synthesis (Score=0.25)'
+        )
+
+        mock_pie.update_traces.assert_called_once_with(
+            hovertemplate='<b>%{label}</b><br>%{value} rows'
+        )