sdv-dev · fealho · Dec 3, 2024 · Dec 2, 2024 · Dec 2, 2024
@@ -5,11 +5,12 @@
 import numpy as np
 import pandas as pd
 
+from sdmetrics.column_pairs.base import ColumnPairsMetric
 from sdmetrics.goal import Goal
 from sdmetrics.single_column.statistical.kscomplement import KSComplement
 
 
-class InterRowMSAS:
+class InterRowMSAS(ColumnPairsMetric):
     """Inter-Row Multi-Sequence Aggregate Similarity (MSAS) metric.
 
     Attributes:
@@ -76,15 +77,17 @@
         num_invalid_groups = len(group_sizes[group_sizes <= n_rows_diff])
         if num_invalid_groups > 0:
             warnings.warn(
-                f"n_rows_diff '{n_rows_diff}' is greater than the "
+                f"n_rows_diff '{n_rows_diff}' is greater or equal to the "
                 f'size of {num_invalid_groups} sequence keys in {data_name}.'
             )
 
         def diff_func(group):
             if len(group) <= n_rows_diff:
                 return np.nan
             group = group.to_numpy()
-            return np.mean(group[n_rows_diff:] - group[:-n_rows_diff])
+            with warnings.catch_warnings():
+                warnings.filterwarnings('ignore', message='Mean of empty slice')
+                return np.nanmean(group[n_rows_diff:] - group[:-n_rows_diff])
 
         with warnings.catch_warnings():
             warnings.filterwarnings('ignore', message='invalid value encountered in.*')

@@ -2,11 +2,12 @@
 
 import pandas as pd
 
+from sdmetrics.column_pairs.base import ColumnPairsMetric
 from sdmetrics.goal import Goal
 from sdmetrics.single_column.statistical.kscomplement import KSComplement
 
 
-class StatisticMSAS:
+class StatisticMSAS(ColumnPairsMetric):
     """Statistic Multi-Sequence Aggregate Similarity (MSAS) metric.
 
     Attributes:

@@ -3,10 +3,11 @@
 import pandas as pd
 
 from sdmetrics.goal import Goal
+from sdmetrics.single_column.base import SingleColumnMetric
 from sdmetrics.single_column.statistical.kscomplement import KSComplement
 
 
-class SequenceLengthSimilarity:
+class SequenceLengthSimilarity(SingleColumnMetric):
     """Sequence Length Similarity metric.
 
     Attributes:

@@ -1,12 +1,31 @@
 from datetime import datetime
 
+import numpy as np
 import pandas as pd
 import pytest
 
 from sdmetrics.column_pairs import InterRowMSAS
 
 
 class TestInterRowMSAS:
+    def test_compute_breakdown(self):
+        """Test `compute_breakdown` returns the score under the `score` key."""
+        # Setup
+        real_keys = pd.Series(['id1', 'id1', 'id1', 'id2', 'id2', 'id2'])
+        real_values = pd.Series([1, 2, 3, 4, 5, 6])
+        synthetic_keys = pd.Series(['id3', 'id3', 'id3', 'id4', 'id4', 'id4'])
+        synthetic_values = pd.Series([1, 10, 3, 7, 5, 1])
+
+        metric = InterRowMSAS()
+
+        # Run
+        result = metric.compute_breakdown(
+            real_data=(real_keys, real_values), synthetic_data=(synthetic_keys, synthetic_values)
+        )
+
+        # Assert
+        assert result == {'score': 0.5}
+
     def test_compute(self):
         """Test it runs."""
         # Setup
@@ -23,6 +42,22 @@ def test_compute(self):
         # Assert
         assert score == 0.5
 
+    def test_compute_nans(self):
+        """Test a score is still computed when the sequence values contain NaNs."""
+        # Setup
+        real_keys = pd.Series(['id1', 'id1', 'id1', 'id2', 'id2', 'id2'])
+        real_values = pd.Series([1, 2, np.nan, 4, 5, 8])
+        synthetic_keys = pd.Series(['id3', 'id3', 'id3', 'id4', 'id4', 'id4'])
+        synthetic_values = pd.Series([1, 10, 4, 7, np.nan, np.nan])
+
+        # Run
+        score = InterRowMSAS.compute(
+            real_data=(real_keys, real_values), synthetic_data=(synthetic_keys, synthetic_values)
+        )
+
+        # Assert
+        assert score == 0.5
+
     def test_compute_identical_sequences(self):
         """Test it returns 1 when real and synthetic data are identical."""
         # Setup
@@ -94,9 +129,10 @@ def test_compute_with_log_warning(self):
             'There are 3 non-positive values in your data, which cannot be used with log. '
             "Consider changing 'apply_log' to False for a better result."
         )
+
         assert len(warning_info) == 1
         assert str(warning_info[0].message) == expected_message
-        assert score == 0
+        assert score == 0.5
 
     def test_compute_with_log_datetime(self):
         """Test it crashes for logs of datetime values."""
@@ -211,7 +247,9 @@ def test_compute_warning(self):
         synthetic_values = pd.Series([1, 10, 3, 7, 5, 1])
 
         # Run and Assert
-        warn_msg = "n_rows_diff '10' is greater than the size of 2 sequence keys in real_data."
+        warn_msg = (
+            "n_rows_diff '10' is greater or equal to the size of 2 sequence keys in real_data."
+        )
         with pytest.warns(UserWarning, match=warn_msg):
             score = InterRowMSAS.compute(
                 real_data=(real_keys, real_values),

@@ -7,6 +7,24 @@
 
 
 class TestStatisticMSAS:
+    def test_compute_breakdown(self):
+        """Test `compute_breakdown` returns the score under the `score` key."""
+        # Setup
+        real_keys = pd.Series(['id1', 'id1', 'id1', 'id2', 'id2', 'id2'])
+        real_values = pd.Series([1, 2, 3, 4, 5, 6])
+        synthetic_keys = pd.Series(['id3', 'id3', 'id3', 'id4', 'id4', 'id4'])
+        synthetic_values = pd.Series([1, 10, 3, 7, 5, 1])
+
+        metric = StatisticMSAS()
+
+        # Run
+        result = metric.compute_breakdown(
+            real_data=(real_keys, real_values), synthetic_data=(synthetic_keys, synthetic_values)
+        )
+
+        # Assert
+        assert result == {'score': 0.5}
+
     def test_compute_identical_sequences(self, recwarn):
         """Test it returns 1 when real and synthetic data are identical."""
         # Setup

@@ -4,6 +4,20 @@
 
 
 class TestSequenceLengthSimilarity:
+    def test_compute_breakdown(self):
+        """Test `compute_breakdown` returns the score under the `score` key."""
+        # Setup
+        real_data = pd.Series([1, 1, 2, 2, 2])
+        synthetic_data = pd.Series([3, 4, 5, 6, 6])
+
+        metric = SequenceLengthSimilarity()
+
+        # Run
+        result = metric.compute_breakdown(real_data, synthetic_data)
+
+        # Assert
+        assert result == {'score': 0.25}
+
     def test_compute(self):
         """Test it runs."""
         # Setup