From 6f7919f8dbd95076946fa8c355083316f718bb36 Mon Sep 17 00:00:00 2001
From: Katharine Xiao <2405771+katxiao@users.noreply.github.com>
Date: Thu, 15 Sep 2022 12:45:06 -0400
Subject: [PATCH 1/9] Add synthetic uniqueness single table metric and tests

---
 sdmetrics/multi_table/multi_single_table.py   |   6 +
 sdmetrics/single_table/__init__.py            |   2 +
 .../single_table/synthetic_uniqueness.py      |  87 ++++++++++++++
 .../single_table/test_synthetic_uniqueness.py | 110 ++++++++++++++++++
 4 files changed, 205 insertions(+)
 create mode 100644 sdmetrics/single_table/synthetic_uniqueness.py
 create mode 100644 tests/unit/single_table/test_synthetic_uniqueness.py

diff --git a/sdmetrics/multi_table/multi_single_table.py b/sdmetrics/multi_table/multi_single_table.py
index c3eff8cb..625a99ea 100644
--- a/sdmetrics/multi_table/multi_single_table.py
+++ b/sdmetrics/multi_table/multi_single_table.py
@@ -241,6 +241,12 @@ class BNLikelihood(MultiSingleTableMetric):
     single_table_metric = single_table.bayesian_network.BNLikelihood
 
 
+class SyntheticUniqueness(MultiSingleTableMetric):
+    """MultiSingleTableMetric based on SingleTable SyntheticUniqueness."""
+
+    single_table_metric = single_table.synthetic_uniqueness.SyntheticUniqueness
+
+
 class BNLogLikelihood(MultiSingleTableMetric):
     """MultiSingleTableMetric based on SingleTable BNLogLikelihood."""
 
diff --git a/sdmetrics/single_table/__init__.py b/sdmetrics/single_table/__init__.py
index fc6beebf..690ceefe 100644
--- a/sdmetrics/single_table/__init__.py
+++ b/sdmetrics/single_table/__init__.py
@@ -31,6 +31,7 @@
 from sdmetrics.single_table.privacy.numerical_sklearn import (
     NumericalLR, NumericalMLP, NumericalSVR)
 from sdmetrics.single_table.privacy.radius_nearest_neighbor import NumericalRadiusNearestNeighbor
+from sdmetrics.single_table.synthetic_uniqueness import SyntheticUniqueness
 
 __all__ = [
     'bayesian_network',
@@ -88,4 +89,5 @@
     'StatisticSimilarity',
     'TVComplement',
     'RangeCoverage',
+    'SyntheticUniqueness',
 ]
diff --git a/sdmetrics/single_table/synthetic_uniqueness.py b/sdmetrics/single_table/synthetic_uniqueness.py
new file mode 100644
index 00000000..a38456cc
--- /dev/null
+++ b/sdmetrics/single_table/synthetic_uniqueness.py
@@ -0,0 +1,87 @@
+"""Synthetic uniqueness metrics for single table."""
+import pandas as pd
+
+from sdmetrics.goal import Goal
+from sdmetrics.single_table.base import SingleTableMetric
+
+
+class SyntheticUniqueness(SingleTableMetric):
+    """SyntheticUniqueness Single Table metric.
+
+    This metric measures whether each row in the synthetic data is unique,
+    or whether it exactly matches a row in the real data.
+
+    Attributes:
+        name (str):
+            Name to use when reports about this metric are printed.
+        goal (sdmetrics.goal.Goal):
+            The goal of this metric.
+        min_value (Union[float, tuple[float]]):
+            Minimum value or values that this metric can take.
+        max_value (Union[float, tuple[float]]):
+            Maximum value or values that this metric can take.
+    """
+
+    name = 'SyntheticUniqueness'
+    goal = Goal.MAXIMIZE
+    min_value = 0
+    max_value = 1
+
+    @classmethod
+    def compute(cls, real_data, synthetic_data, metadata=None, numerical_match_tolerance=0.01,
+                synthetic_sample_size=None):
+        """Compute this metric.
+
+        This metric looks for matches between the real and synthetic data for
+        the compatible columns. This metric also looks for matches in missing values.
+
+        Args:
+            real_data (Union[numpy.ndarray, pandas.DataFrame]):
+                The values from the real dataset.
+            synthetic_data (Union[numpy.ndarray, pandas.DataFrame]):
+                The values from the synthetic dataset.
+            metadata (dict):
+                Table metadata dict.
+            numerical_match_tolerance (float):
+                A float >0.0 representing how close two numerical values have to be
+                in order to be considered a match.
+            synthetic_sample_size (int):
+                The number of synthetic rows to sample before computing this metric.
+                Use this to speed up the computation time if you have a large amount
+                of synthetic data. Note that the final score may not be as precise if
+                your sample size is low. Defaults to ``None``, which does not sample,
+                and uses all of the provided rows.
+
+        Returns:
+            float:
+                The synthetic uniqueness score.
+        """
+        if synthetic_sample_size is not None:
+            synthetic_data = synthetic_data.sample(n=synthetic_sample_size)
+
+        value_counts = pd.concat([real_data, synthetic_data]).value_counts(dropna=False)
+        value_counts.name = 'value_counts'
+        value_counts = value_counts.reset_index()
+
+        columns = real_data.columns.to_list()
+        synthetic_value_counts = synthetic_data.merge(
+            value_counts, how='left', left_on=columns, right_on=columns)
+        num_unique_rows = (synthetic_value_counts['value_counts'] == 1).sum()
+
+        return num_unique_rows / len(synthetic_data)
+
+    @classmethod
+    def normalize(cls, raw_score):
+        """Normalize the raw score of this metric.
+
+        This delegates to the inherited ``normalize`` implementation.
+
+        Args:
+            raw_score (float):
+                The value of the metric from `compute`.
+
+        Returns:
+            float:
+                The normalized value of the metric
+        """
+        return super().normalize(raw_score)
diff --git a/tests/unit/single_table/test_synthetic_uniqueness.py b/tests/unit/single_table/test_synthetic_uniqueness.py
new file mode 100644
index 00000000..7aca66ec
--- /dev/null
+++ b/tests/unit/single_table/test_synthetic_uniqueness.py
@@ -0,0 +1,110 @@
+from unittest.mock import patch
+
+import numpy as np
+import pandas as pd
+
+from sdmetrics.single_table import SyntheticUniqueness
+
+
+class TestSyntheticUniqueness:
+
+    def test_compute(self):
+        """Test the ``compute`` method.
+
+        Expect that the synthetic uniqueness is returned.
+
+        Input:
+        - real data
+        - synthetic data
+
+        Output:
+        - the evaluated metric
+        """
+        # Setup
+        real_data = pd.DataFrame({
+            'col1': [1, 2, 1, 3, 4],
+            'col2': ['a', 'b', 'c', 'd', 'b'],
+            'col3': [1.32, np.nan, 1.43, np.nan, 2.0],
+        })
+        synthetic_data = pd.DataFrame({
+            'col1': [1, 3, 4, 2, 2],
+            'col2': ['a', 'b', 'c', 'b', 'e'],
+            'col3': [1.32, 1.56, 1.21, np.nan, 1.90],
+        })
+        metadata = {
+            'fields': {
+                'col1': {'type': 'numerical', 'subtype': 'int'},
+                'col2': {'type': 'categorical'},
+                'col3': {'type': 'numerical', 'subtype': 'float'},
+            },
+        }
+
+        # Run
+        metric = SyntheticUniqueness()
+        score = metric.compute(real_data, synthetic_data, metadata)
+
+        # Assert
+        assert score == 0.6
+
+    def test_compute_with_sample_size(self):
+        """Test the ``compute`` method with a sample size.
+
+        Expect that the synthetic uniqueness is returned.
+
+        Input:
+        - real data
+        - synthetic data
+
+        Output:
+        - the evaluated metric
+        """
+        # Setup
+        real_data = pd.DataFrame({
+            'col1': [1, 2, 1, 3, 4],
+            'col2': ['a', 'b', 'c', 'd', 'b'],
+            'col3': [1.32, np.nan, 1.43, np.nan, 2.0],
+        })
+        synthetic_data = pd.DataFrame({
+            'col1': [1, 3, 4, 2, 2],
+            'col2': ['a', 'b', 'c', 'd', 'e'],
+            'col3': [1.33, 1.56, 1.21, np.nan, 1.92],
+        })
+        metadata = {
+            'fields': {
+                'col1': {'type': 'numerical', 'subtype': 'int'},
+                'col2': {'type': 'categorical'},
+                'col3': {'type': 'numerical', 'subtype': 'float'},
+            },
+        }
+        sample_size = 2
+
+        # Run
+        metric = SyntheticUniqueness()
+        score = metric.compute(
+            real_data, synthetic_data, metadata, synthetic_sample_size=sample_size)
+
+        # Assert
+        assert score == 1
+
+    @patch('sdmetrics.single_table.synthetic_uniqueness.SingleTableMetric.normalize')
+    def test_normalize(self, normalize_mock):
+        """Test the ``normalize`` method.
+
+        Expect that the inherited ``normalize`` method is called.
+
+        Input:
+        - raw score
+
+        Output:
+        - the output of the inherited ``normalize`` method.
+        """
+        # Setup
+        metric = SyntheticUniqueness()
+        raw_score = 0.9
+
+        # Run
+        result = metric.normalize(raw_score)
+
+        # Assert
+        normalize_mock.assert_called_once_with(raw_score)
+        assert result == normalize_mock.return_value

From 2e56f429f618aa3dbacfb1e3584a6cf9297730de Mon Sep 17 00:00:00 2001
From: Katharine Xiao <2405771+katxiao@users.noreply.github.com>
Date: Thu, 15 Sep 2022 12:53:52 -0400
Subject: [PATCH 2/9] Add warning for edge case

---
 .../single_table/synthetic_uniqueness.py      |  9 +++-
 .../single_table/test_synthetic_uniqueness.py | 45 +++++++++++++++++++
 2 files changed, 53 insertions(+), 1 deletion(-)

diff --git a/sdmetrics/single_table/synthetic_uniqueness.py b/sdmetrics/single_table/synthetic_uniqueness.py
index a38456cc..92adc664 100644
--- a/sdmetrics/single_table/synthetic_uniqueness.py
+++ b/sdmetrics/single_table/synthetic_uniqueness.py
@@ -1,4 +1,6 @@
 """Synthetic uniqueness metrics for single table."""
+import warnings
+
 import pandas as pd
 
 from sdmetrics.goal import Goal
@@ -57,7 +59,12 @@ def compute(cls, real_data, synthetic_data, metadata=None, numerical_match_toler
                 The synthetic uniqueness score.
         """
         if synthetic_sample_size is not None:
-            synthetic_data = synthetic_data.sample(n=synthetic_sample_size)
+            if synthetic_sample_size > len(synthetic_data):
+                warnings.warn(f'The provided `synthetic_sample_size` of {synthetic_sample_size} '
+                              'is larger than the number of synthetic data rows '
+                              f'({len(synthetic_data)}). Proceeding without sampling.')
+            else:
+                synthetic_data = synthetic_data.sample(n=synthetic_sample_size)
 
         value_counts = pd.concat([real_data, synthetic_data]).value_counts(dropna=False)
         value_counts.name = 'value_counts'
diff --git a/tests/unit/single_table/test_synthetic_uniqueness.py b/tests/unit/single_table/test_synthetic_uniqueness.py
index 7aca66ec..9b73fe5e 100644
--- a/tests/unit/single_table/test_synthetic_uniqueness.py
+++ b/tests/unit/single_table/test_synthetic_uniqueness.py
@@ -86,6 +86,51 @@ def test_compute_with_sample_size(self):
         # Assert
         assert score == 1
 
+    @patch('sdmetrics.single_table.synthetic_uniqueness.warnings')
+    def test_compute_with_sample_size_too_large(self, warnings_mock):
+        """Test the ``compute`` method with a sample size larger than the number of rows.
+
+        Expect that the synthetic uniqueness is returned. Expect a warning to be raised.
+
+        Input:
+        - real data
+        - synthetic data
+
+        Output:
+        - the evaluated metric
+        """
+        # Setup
+        real_data = pd.DataFrame({
+            'col1': [1, 2, 1, 3, 4],
+            'col2': ['a', 'b', 'c', 'd', 'b'],
+            'col3': [1.32, np.nan, 1.43, np.nan, 2.0],
+        })
+        synthetic_data = pd.DataFrame({
+            'col1': [1, 3, 4, 2, 2],
+            'col2': ['a', 'b', 'c', 'd', 'e'],
+            'col3': [1.33, 1.56, 1.21, np.nan, 1.92],
+        })
+        metadata = {
+            'fields': {
+                'col1': {'type': 'numerical', 'subtype': 'int'},
+                'col2': {'type': 'categorical'},
+                'col3': {'type': 'numerical', 'subtype': 'float'},
+            },
+        }
+        sample_size = 15
+
+        # Run
+        metric = SyntheticUniqueness()
+        score = metric.compute(
+            real_data, synthetic_data, metadata, synthetic_sample_size=sample_size)
+
+        # Assert
+        assert score == 1
+        warnings_mock.warn.assert_called_once_with(
+            'The provided `synthetic_sample_size` of 15 is larger than the number of '
+            'synthetic data rows (5). Proceeding without sampling.'
+        )
+
     @patch('sdmetrics.single_table.synthetic_uniqueness.SingleTableMetric.normalize')
     def test_normalize(self, normalize_mock):
         """Test the ``normalize`` method.

From 79398d0dc78b3950350cea19e1507de3f768c7ad Mon Sep 17 00:00:00 2001
From: Katharine Xiao <2405771+katxiao@users.noreply.github.com>
Date: Wed, 21 Sep 2022 12:05:32 -0400
Subject: [PATCH 3/9] Update metric name

---
 sdmetrics/multi_table/multi_single_table.py   |  6 +--
 sdmetrics/single_table/__init__.py            |  4 +-
 ...tic_uniqueness.py => new_row_synthesis.py} | 12 ++---
 ...niqueness.py => test_new_row_synthesis.py} | 52 ++++---------------
 4 files changed, 22 insertions(+), 52 deletions(-)
 rename sdmetrics/single_table/{synthetic_uniqueness.py => new_row_synthesis.py} (93%)
 rename tests/unit/single_table/{test_synthetic_uniqueness.py => test_new_row_synthesis.py} (75%)

diff --git a/sdmetrics/multi_table/multi_single_table.py b/sdmetrics/multi_table/multi_single_table.py
index 625a99ea..c041855f 100644
--- a/sdmetrics/multi_table/multi_single_table.py
+++ b/sdmetrics/multi_table/multi_single_table.py
@@ -241,10 +241,10 @@ class BNLikelihood(MultiSingleTableMetric):
     single_table_metric = single_table.bayesian_network.BNLikelihood
 
 
-class SyntheticUniqueness(MultiSingleTableMetric):
-    """MultiSingleTableMetric based on SingleTable SyntheticUniqueness."""
+class NewRowSynthesis(MultiSingleTableMetric):
+    """MultiSingleTableMetric based on SingleTable NewRowSynthesis."""
 
-    single_table_metric = single_table.synthetic_uniqueness.SyntheticUniqueness
+    single_table_metric = single_table.new_row_synthesis.NewRowSynthesis
 
 
 class BNLogLikelihood(MultiSingleTableMetric):
diff --git a/sdmetrics/single_table/__init__.py b/sdmetrics/single_table/__init__.py
index 690ceefe..35704626 100644
--- a/sdmetrics/single_table/__init__.py
+++ b/sdmetrics/single_table/__init__.py
@@ -22,6 +22,7 @@
 from sdmetrics.single_table.multi_single_column import (
     BoundaryAdherence, CategoryCoverage, CSTest, KSComplement, MissingValueSimilarity,
     MultiSingleColumnMetric, RangeCoverage, StatisticSimilarity, TVComplement)
+from sdmetrics.single_table.new_row_synthesis import NewRowSynthesis
 from sdmetrics.single_table.privacy.base import CategoricalPrivacyMetric, NumericalPrivacyMetric
 from sdmetrics.single_table.privacy.cap import (
     CategoricalCAP, CategoricalGeneralizedCAP, CategoricalZeroCAP)
@@ -31,7 +32,6 @@
 from sdmetrics.single_table.privacy.numerical_sklearn import (
     NumericalLR, NumericalMLP, NumericalSVR)
 from sdmetrics.single_table.privacy.radius_nearest_neighbor import NumericalRadiusNearestNeighbor
-from sdmetrics.single_table.synthetic_uniqueness import SyntheticUniqueness
 
 __all__ = [
     'bayesian_network',
@@ -89,5 +89,5 @@
     'StatisticSimilarity',
     'TVComplement',
     'RangeCoverage',
-    'SyntheticUniqueness',
+    'NewRowSynthesis',
 ]
diff --git a/sdmetrics/single_table/synthetic_uniqueness.py b/sdmetrics/single_table/new_row_synthesis.py
similarity index 93%
rename from sdmetrics/single_table/synthetic_uniqueness.py
rename to sdmetrics/single_table/new_row_synthesis.py
index 92adc664..9782d9bb 100644
--- a/sdmetrics/single_table/synthetic_uniqueness.py
+++ b/sdmetrics/single_table/new_row_synthesis.py
@@ -1,4 +1,4 @@
-"""Synthetic uniqueness metrics for single table."""
+"""New Row Synthesis metric for single table."""
 import warnings
 
 import pandas as pd
@@ -7,10 +7,10 @@
 from sdmetrics.single_table.base import SingleTableMetric
 
 
-class SyntheticUniqueness(SingleTableMetric):
-    """SyntheticUniqueness Single Table metric.
+class NewRowSynthesis(SingleTableMetric):
+    """NewRowSynthesis Single Table metric.
 
-    This metric measures whether each row in the synthetic data is unique,
+    This metric measures whether each row in the synthetic data is new,
     or whether it exactly matches a row in the real data.
 
     Attributes:
@@ -24,7 +24,7 @@ class SyntheticUniqueness(SingleTableMetric):
             Maximum value or values that this metric can take.
     """
 
-    name = 'SyntheticUniqueness'
+    name = 'NewRowSynthesis'
     goal = Goal.MAXIMIZE
     min_value = 0
     max_value = 1
@@ -56,7 +56,7 @@ def compute(cls, real_data, synthetic_data, metadata=None, numerical_match_toler
 
         Returns:
             float:
-                The synthetic uniqueness score.
+                The new row synthesis score.
         """
         if synthetic_sample_size is not None:
             if synthetic_sample_size > len(synthetic_data):
diff --git a/tests/unit/single_table/test_synthetic_uniqueness.py b/tests/unit/single_table/test_new_row_synthesis.py
similarity index 75%
rename from tests/unit/single_table/test_synthetic_uniqueness.py
rename to tests/unit/single_table/test_new_row_synthesis.py
index 9b73fe5e..e903df2c 100644
--- a/tests/unit/single_table/test_synthetic_uniqueness.py
+++ b/tests/unit/single_table/test_new_row_synthesis.py
@@ -3,23 +3,13 @@
 import numpy as np
 import pandas as pd
 
-from sdmetrics.single_table import SyntheticUniqueness
+from sdmetrics.single_table import NewRowSynthesis
 
 
-class TestSyntheticUniqueness:
+class TestNewRowSynthesis:
 
     def test_compute(self):
-        """Test the ``compute`` method.
-
-        Expect that the synthetic uniqueness is returned.
-
-        Input:
-        - real data
-        - synthetic data
-
-        Output:
-        - the evaluated metric
-        """
+        """Test the ``compute`` method and expect that the new row synthesis score is returned."""
         # Setup
         real_data = pd.DataFrame({
             'col1': [1, 2, 1, 3, 4],
@@ -38,9 +28,9 @@ def test_compute(self):
                 'col3': {'type': 'numerical', 'subtype': 'float'},
             },
         }
+        metric = NewRowSynthesis()
 
         # Run
-        metric = SyntheticUniqueness()
         score = metric.compute(real_data, synthetic_data, metadata)
 
         # Assert
@@ -49,14 +39,7 @@ def test_compute(self):
     def test_compute_with_sample_size(self):
         """Test the ``compute`` method with a sample size.
 
-        Expect that the synthetic uniqueness is returned.
-
-        Input:
-        - real data
-        - synthetic data
-
-        Output:
-        - the evaluated metric
+        Expect that the new row synthesis score is returned.
         """
         # Setup
         real_data = pd.DataFrame({
@@ -77,27 +60,20 @@ def test_compute_with_sample_size(self):
             },
         }
         sample_size = 2
+        metric = NewRowSynthesis()
 
         # Run
-        metric = SyntheticUniqueness()
         score = metric.compute(
             real_data, synthetic_data, metadata, synthetic_sample_size=sample_size)
 
         # Assert
         assert score == 1
 
-    @patch('sdmetrics.single_table.synthetic_uniqueness.warnings')
+    @patch('sdmetrics.single_table.new_row_synthesis.warnings')
     def test_compute_with_sample_size_too_large(self, warnings_mock):
         """Test the ``compute`` method with a sample size larger than the number of rows.
 
-        Expect that the synthetic uniqueness is returned. Expect a warning to be raised.
-
-        Input:
-        - real data
-        - synthetic data
-
-        Output:
-        - the evaluated metric
+        Expect that the new row synthesis is returned. Expect a warning to be raised.
         """
         # Setup
         real_data = pd.DataFrame({
@@ -118,9 +94,9 @@ def test_compute_with_sample_size_too_large(self, warnings_mock):
             },
         }
         sample_size = 15
+        metric = NewRowSynthesis()
 
         # Run
-        metric = SyntheticUniqueness()
         score = metric.compute(
             real_data, synthetic_data, metadata, synthetic_sample_size=sample_size)
 
@@ -131,20 +107,14 @@ def test_compute_with_sample_size_too_large(self, warnings_mock):
             'synthetic data rows (5). Proceeding without sampling.'
         )
 
-    @patch('sdmetrics.single_table.synthetic_uniqueness.SingleTableMetric.normalize')
+    @patch('sdmetrics.single_table.new_row_synthesis.SingleTableMetric.normalize')
     def test_normalize(self, normalize_mock):
         """Test the ``normalize`` method.
 
         Expect that the inherited ``normalize`` method is called.
-
-        Input:
-        - raw score
-
-        Output:
-        - the output of the inherited ``normalize`` method.
         """
         # Setup
-        metric = SyntheticUniqueness()
+        metric = NewRowSynthesis()
         raw_score = 0.9
 
         # Run

From 28da18f22cfde11c43746622fb82b045c0a388be Mon Sep 17 00:00:00 2001
From: Katharine Xiao <2405771+katxiao@users.noreply.github.com>
Date: Wed, 21 Sep 2022 13:08:07 -0400
Subject: [PATCH 4/9] Update implementation

---
 sdmetrics/single_table/new_row_synthesis.py   | 35 ++++++++++++++-----
 .../single_table/test_new_row_synthesis.py    |  2 +-
 2 files changed, 28 insertions(+), 9 deletions(-)

diff --git a/sdmetrics/single_table/new_row_synthesis.py b/sdmetrics/single_table/new_row_synthesis.py
index 9782d9bb..c21a5611 100644
--- a/sdmetrics/single_table/new_row_synthesis.py
+++ b/sdmetrics/single_table/new_row_synthesis.py
@@ -1,6 +1,7 @@
 """New Row Synthesis metric for single table."""
 import warnings
 
+import numpy as np
 import pandas as pd
 
 from sdmetrics.goal import Goal
@@ -66,14 +67,32 @@ def compute(cls, real_data, synthetic_data, metadata=None, numerical_match_toler
             else:
                 synthetic_data = synthetic_data.sample(n=synthetic_sample_size)
 
-        value_counts = pd.concat([real_data, synthetic_data]).value_counts(dropna=False)
-        value_counts.name = 'value_counts'
-        value_counts = value_counts.reset_index()
-
-        columns = real_data.columns.to_list()
-        synthetic_value_counts = synthetic_data.merge(
-            value_counts, how='left', left_on=columns, right_on=columns)
-        num_unique_rows = (synthetic_value_counts['value_counts'] == 1).sum()
+        numerical_fields = []
+        discrete_fields = []
+        for field, field_meta in metadata['fields'].items():
+            if field_meta['type'] == 'datetime':
+                real_data[field] = pd.to_datetime(real_data[field])
+                synthetic_data[field] = pd.to_datetime(synthetic_data[field])
+                numerical_fields.append(field)
+            elif field_meta['type'] == 'numerical':
+                numerical_fields.append(field)
+            else:
+                discrete_fields.append(field)
+
+        num_unique_rows = 0
+        for index, row in synthetic_data.iterrows():
+            row_filter = []
+            for field in real_data.columns:
+                if field in numerical_fields:
+                    field_filter = f'{field}.isnull()' if np.isnan(row[field]) else (
+                        f'abs({field} - {row[field]}) < {numerical_match_tolerance * row[field]}')
+                    row_filter.append(field_filter)
+                else:
+                    row_filter.append(f"{field} == '{row[field]}'")
+
+            matches = real_data.query(' and '.join(row_filter))
+            if matches is None or matches.empty:
+                num_unique_rows += 1
 
         return num_unique_rows / len(synthetic_data)
 
diff --git a/tests/unit/single_table/test_new_row_synthesis.py b/tests/unit/single_table/test_new_row_synthesis.py
index e903df2c..1135875b 100644
--- a/tests/unit/single_table/test_new_row_synthesis.py
+++ b/tests/unit/single_table/test_new_row_synthesis.py
@@ -84,7 +84,7 @@ def test_compute_with_sample_size_too_large(self, warnings_mock):
         synthetic_data = pd.DataFrame({
             'col1': [1, 3, 4, 2, 2],
             'col2': ['a', 'b', 'c', 'd', 'e'],
-            'col3': [1.33, 1.56, 1.21, np.nan, 1.92],
+            'col3': [1.35, 1.56, 1.21, np.nan, 1.92],
         })
         metadata = {
             'fields': {

From 1cbf7e829735670c3667e692ba45681a63707117 Mon Sep 17 00:00:00 2001
From: Katharine Xiao <2405771+katxiao@users.noreply.github.com>
Date: Wed, 21 Sep 2022 13:53:02 -0400
Subject: [PATCH 5/9] Add input validation

---
 sdmetrics/single_table/new_row_synthesis.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/sdmetrics/single_table/new_row_synthesis.py b/sdmetrics/single_table/new_row_synthesis.py
index c21a5611..35033886 100644
--- a/sdmetrics/single_table/new_row_synthesis.py
+++ b/sdmetrics/single_table/new_row_synthesis.py
@@ -59,6 +59,9 @@ def compute(cls, real_data, synthetic_data, metadata=None, numerical_match_toler
             float:
                 The new row synthesis score.
         """
+        real_data, synthetic_data, metadata = cls._validate_inputs(
+            real_data, synthetic_data, metadata)
+
         if synthetic_sample_size is not None:
             if synthetic_sample_size > len(synthetic_data):
                 warnings.warn(f'The provided `synthetic_sample_size` of {synthetic_sample_size} '

From 376bec499e119436511d15cc6128b6a73bb19f05 Mon Sep 17 00:00:00 2001
From: Katharine Xiao <2405771+katxiao@users.noreply.github.com>
Date: Thu, 22 Sep 2022 13:22:48 -0400
Subject: [PATCH 6/9] fix unit test

---
 sdmetrics/reports/single_table/plot_utils.py      | 3 +++
 tests/unit/single_table/test_new_row_synthesis.py | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/sdmetrics/reports/single_table/plot_utils.py b/sdmetrics/reports/single_table/plot_utils.py
index 586e1cb7..51ef64ba 100644
--- a/sdmetrics/reports/single_table/plot_utils.py
+++ b/sdmetrics/reports/single_table/plot_utils.py
@@ -89,6 +89,9 @@ def _get_similarity_correlation_matrix(score_breakdowns, columns):
     Returns:
         pandas.DataFrame
     """
+    if isinstance(columns, set):
+        columns = list(columns)
+
     similarity_correlation = pd.DataFrame(
         index=columns,
         columns=columns,
diff --git a/tests/unit/single_table/test_new_row_synthesis.py b/tests/unit/single_table/test_new_row_synthesis.py
index 1135875b..a8dc240d 100644
--- a/tests/unit/single_table/test_new_row_synthesis.py
+++ b/tests/unit/single_table/test_new_row_synthesis.py
@@ -50,7 +50,7 @@ def test_compute_with_sample_size(self):
         synthetic_data = pd.DataFrame({
             'col1': [1, 3, 4, 2, 2],
             'col2': ['a', 'b', 'c', 'd', 'e'],
-            'col3': [1.33, 1.56, 1.21, np.nan, 1.92],
+            'col3': [1.46, 1.56, 1.21, np.nan, 1.92],
         })
         metadata = {
             'fields': {

From d97831f77fd80baba755b170a8db1157202481a2 Mon Sep 17 00:00:00 2001
From: Katharine Xiao <2405771+katxiao@users.noreply.github.com>
Date: Fri, 23 Sep 2022 13:22:57 -0400
Subject: [PATCH 7/9] Fix edge cases in new row synthesis query

---
 sdmetrics/reports/single_table/plot_utils.py |  2 ++
 sdmetrics/single_table/new_row_synthesis.py  | 29 +++++++++++++-------
 2 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/sdmetrics/reports/single_table/plot_utils.py b/sdmetrics/reports/single_table/plot_utils.py
index 51ef64ba..1cfb0732 100644
--- a/sdmetrics/reports/single_table/plot_utils.py
+++ b/sdmetrics/reports/single_table/plot_utils.py
@@ -85,6 +85,8 @@ def _get_similarity_correlation_matrix(score_breakdowns, columns):
     Args:
         score_breakdowns (dict):
             Mapping of metric to the score breakdown result.
+        columns (list[str] or set[str]):
+            A list or set of column names.
 
     Returns:
         pandas.DataFrame
diff --git a/sdmetrics/single_table/new_row_synthesis.py b/sdmetrics/single_table/new_row_synthesis.py
index 35033886..c4174f20 100644
--- a/sdmetrics/single_table/new_row_synthesis.py
+++ b/sdmetrics/single_table/new_row_synthesis.py
@@ -1,7 +1,6 @@
 """New Row Synthesis metric for single table."""
 import warnings
 
-import numpy as np
 import pandas as pd
 
 from sdmetrics.goal import Goal
@@ -46,8 +45,8 @@ def compute(cls, real_data, synthetic_data, metadata=None, numerical_match_toler
             metadata (dict):
                 Table metadata dict.
             numerical_match_tolerance (float):
-                A float >0.0 representing how close two numerical values have to be
-                in order to be considered a match.
+                A float larger than 0 representing how close two numerical values have to be
+                in order to be considered a match. Defaults to `0.01`.
             synthetic_sample_size (int):
                 The number of synthetic rows to sample before computing this metric.
                 Use this to speed up the computation time if you have a large amount
@@ -72,13 +71,16 @@ def compute(cls, real_data, synthetic_data, metadata=None, numerical_match_toler
 
         numerical_fields = []
         discrete_fields = []
+        categorical_fields = []
         for field, field_meta in metadata['fields'].items():
             if field_meta['type'] == 'datetime':
-                real_data[field] = pd.to_datetime(real_data[field])
-                synthetic_data[field] = pd.to_datetime(synthetic_data[field])
+                real_data[field] = pd.to_numeric(real_data[field])
+                synthetic_data[field] = pd.to_numeric(synthetic_data[field])
                 numerical_fields.append(field)
             elif field_meta['type'] == 'numerical':
                 numerical_fields.append(field)
+            elif field_meta['type'] == 'categorical':
+                categorical_fields.append(field)
             else:
                 discrete_fields.append(field)
 
@@ -86,12 +88,19 @@ def compute(cls, real_data, synthetic_data, metadata=None, numerical_match_toler
         for index, row in synthetic_data.iterrows():
             row_filter = []
             for field in real_data.columns:
-                if field in numerical_fields:
-                    field_filter = f'{field}.isnull()' if np.isnan(row[field]) else (
-                        f'abs({field} - {row[field]}) < {numerical_match_tolerance * row[field]}')
-                    row_filter.append(field_filter)
+                if pd.isna(row[field]):
+                    field_filter = f'{field}.isnull()'
+                elif field in numerical_fields:
+                    field_filter = (
+                        f'abs({field} - {row[field]}) <= '
+                        f'{abs(numerical_match_tolerance * row[field])}'
+                    )
+                elif field in categorical_fields:
+                    field_filter = f"{field} == '{row[field]}'"
                 else:
-                    row_filter.append(f"{field} == '{row[field]}'")
+                    field_filter = f'{field} == {row[field]}'
+
+                row_filter.append(field_filter)
 
             matches = real_data.query(' and '.join(row_filter))
             if matches is None or matches.empty:

From b17f2994240c1f62c8b04ddeee50fce1c5b532a5 Mon Sep 17 00:00:00 2001
From: Katharine Xiao <2405771+katxiao@users.noreply.github.com>
Date: Fri, 23 Sep 2022 13:51:12 -0400
Subject: [PATCH 8/9] Update query logic

---
 sdmetrics/single_table/new_row_synthesis.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/sdmetrics/single_table/new_row_synthesis.py b/sdmetrics/single_table/new_row_synthesis.py
index c4174f20..854a7db7 100644
--- a/sdmetrics/single_table/new_row_synthesis.py
+++ b/sdmetrics/single_table/new_row_synthesis.py
@@ -71,7 +71,6 @@ def compute(cls, real_data, synthetic_data, metadata=None, numerical_match_toler
 
         numerical_fields = []
         discrete_fields = []
-        categorical_fields = []
         for field, field_meta in metadata['fields'].items():
             if field_meta['type'] == 'datetime':
                 real_data[field] = pd.to_numeric(real_data[field])
@@ -79,8 +78,6 @@ def compute(cls, real_data, synthetic_data, metadata=None, numerical_match_toler
                 numerical_fields.append(field)
             elif field_meta['type'] == 'numerical':
                 numerical_fields.append(field)
-            elif field_meta['type'] == 'categorical':
-                categorical_fields.append(field)
             else:
                 discrete_fields.append(field)
 
@@ -95,10 +92,11 @@ def compute(cls, real_data, synthetic_data, metadata=None, numerical_match_toler
                         f'abs({field} - {row[field]}) <= '
                         f'{abs(numerical_match_tolerance * row[field])}'
                     )
-                elif field in categorical_fields:
-                    field_filter = f"{field} == '{row[field]}'"
                 else:
-                    field_filter = f'{field} == {row[field]}'
+                    if real_data[field].dtype == 'O':
+                        field_filter = f"{field} == '{row[field]}'"
+                    else:
+                        field_filter = f'{field} == {row[field]}'
 
                 row_filter.append(field_filter)
 

From f5d43c075679fca0b5d05e3773256ebe840d027f Mon Sep 17 00:00:00 2001
From: Katharine Xiao <2405771+katxiao@users.noreply.github.com>
Date: Fri, 23 Sep 2022 13:59:57 -0400
Subject: [PATCH 9/9] Update unit test and copy inputs in _validate_inputs

---
 sdmetrics/single_table/base.py                |  6 ++++
 .../single_table/test_new_row_synthesis.py    | 29 ++++++++++++-------
 2 files changed, 25 insertions(+), 10 deletions(-)

diff --git a/sdmetrics/single_table/base.py b/sdmetrics/single_table/base.py
index e9138494..2f1dcd95 100644
--- a/sdmetrics/single_table/base.py
+++ b/sdmetrics/single_table/base.py
@@ -1,5 +1,6 @@
 """Base Single Table metric class."""
 
+import copy
 from operator import attrgetter
 
 import pandas as pd
@@ -103,6 +104,11 @@ def _validate_inputs(cls, real_data, synthetic_data, metadata=None):
             (pandas.DataFrame, pandas.DataFrame, dict):
                 The validated data and metadata.
         """
+        real_data = real_data.copy()
+        synthetic_data = synthetic_data.copy()
+        if metadata is not None:
+            metadata = copy.deepcopy(metadata)
+
         if set(real_data.columns) != set(synthetic_data.columns):
             raise ValueError('`real_data` and `synthetic_data` must have the same columns')
 
diff --git a/tests/unit/single_table/test_new_row_synthesis.py b/tests/unit/single_table/test_new_row_synthesis.py
index a8dc240d..7bec691c 100644
--- a/tests/unit/single_table/test_new_row_synthesis.py
+++ b/tests/unit/single_table/test_new_row_synthesis.py
@@ -12,20 +12,29 @@ def test_compute(self):
         """Test the ``compute`` method and expect that the new row synthesis score is returned."""
         # Setup
         real_data = pd.DataFrame({
-            'col1': [1, 2, 1, 3, 4],
-            'col2': ['a', 'b', 'c', 'd', 'b'],
-            'col3': [1.32, np.nan, 1.43, np.nan, 2.0],
+            'col1': [0, 1, 2, 3, 4],
+            'col2': [1, 2, 1, 3, 4],
+            'col3': ['a', 'b', 'c', 'd', 'b'],
+            'col4': [1.32, np.nan, 1.43, np.nan, 2.0],
+            'col5': [51, 52, 53, 54, 55],
+            'col6': ['2020-01-02', '2021-01-04', '2021-05-03', '2022-10-11', '2022-11-13'],
         })
         synthetic_data = pd.DataFrame({
-            'col1': [1, 3, 4, 2, 2],
-            'col2': ['a', 'b', 'c', 'b', 'e'],
-            'col3': [1.32, 1.56, 1.21, np.nan, 1.90],
+            'col1': [0, 1, 2, 3, 4],
+            'col2': [1, 3, 4, 2, 2],
+            'col3': ['a', 'b', 'c', 'b', 'e'],
+            'col4': [1.32, 1.56, 1.21, np.nan, 1.90],
+            'col5': [51, 51, 54, 55, 53],
+            'col6': ['2020-01-02', '2022-11-24', '2022-06-01', '2021-04-12', '2020-12-11'],
         })
         metadata = {
             'fields': {
-                'col1': {'type': 'numerical', 'subtype': 'int'},
-                'col2': {'type': 'categorical'},
-                'col3': {'type': 'numerical', 'subtype': 'float'},
+                'col1': {'type': 'id', 'subtype': 'int'},
+                'col2': {'type': 'numerical', 'subtype': 'int'},
+                'col3': {'type': 'categorical'},
+                'col4': {'type': 'numerical', 'subtype': 'float'},
+                'col5': {'type': 'categorical'},
+                'col6': {'type': 'datetime', 'format': '%Y-%m-%d'},
             },
         }
         metric = NewRowSynthesis()
@@ -34,7 +43,7 @@ def test_compute(self):
         score = metric.compute(real_data, synthetic_data, metadata)
 
         # Assert
-        assert score == 0.6
+        assert score == 0.8
 
     def test_compute_with_sample_size(self):
         """Test the ``compute`` method with a sample size.