sdv-dev · fealho · Nov 5, 2024 · Oct 23, 2024 · Oct 28, 2024 · Oct 28, 2024
diff --git a/sdmetrics/timeseries/__init__.py b/sdmetrics/timeseries/__init__.py
@@ -6,6 +6,7 @@
 from sdmetrics.timeseries.efficacy import TimeSeriesEfficacyMetric
 from sdmetrics.timeseries.efficacy.classification import LSTMClassifierEfficacy
 from sdmetrics.timeseries.inter_row_msas import InterRowMSAS
+from sdmetrics.timeseries.sequence_length_similarity import SequenceLengthSimilarity
 
 __all__ = [
     'base',
@@ -18,4 +19,5 @@
     'TimeSeriesEfficacyMetric',
     'LSTMClassifierEfficacy',
     'InterRowMSAS',
+    'SequenceLengthSimilarity',
 ]
diff --git a/sdmetrics/timeseries/sequence_length_similarity.py b/sdmetrics/timeseries/sequence_length_similarity.py
@@ -0,0 +1,53 @@
+"""SequenceLengthSimilarity module."""
+
+import pandas as pd
+
+from sdmetrics.goal import Goal
+from sdmetrics.single_column.statistical.kscomplement import KSComplement
+
+
+class SequenceLengthSimilarity:
+    """Sequence Length Similarity metric.
+
+    Attributes:
+        name (str):
+            Name to use when reports about this metric are printed.
+        goal (sdmetrics.goal.Goal):
+            The goal of this metric.
+        min_value (Union[float, tuple[float]]):
+            Minimum value or values that this metric can take.
+        max_value (Union[float, tuple[float]]):
+            Maximum value or values that this metric can take.
+    """
+
+    name = 'Sequence Length Similarity'
+    goal = Goal.MAXIMIZE
+    min_value = 0.0
+    max_value = 1.0
+
+    @staticmethod
+    def compute(real_data: pd.Series, synthetic_data: pd.Series) -> float:
+        """Compute this metric.
+
+        The length of a sequence is determined by the number of times the same sequence key occurs.
+        For example, if id_09231 appeared 150 times in the sequence key, then the sequence is of
+        length 150. This metric compares the lengths of all sequence keys in the
+        real data vs. the synthetic data.
+
+        It works as follows:
+            - Calculate the length of each sequence in the real data
+            - Calculate the length of each sequence in the synthetic data
+            - Apply the KSComplement metric to compare the two distributions of lengths
+            - Return this score
+
+        Args:
+            real_data (pandas.Series):
+                The sequence key values from the real dataset.
+            synthetic_data (pandas.Series):
+                The sequence key values from the synthetic dataset.
+
+        Returns:
+            float:
+                The KSComplement score computed on the real and synthetic sequence lengths.
+        """
+        return KSComplement.compute(real_data.value_counts(), synthetic_data.value_counts())
diff --git a/tests/unit/timeseries/test_sequence_length_similarity.py b/tests/unit/timeseries/test_sequence_length_similarity.py
@@ -0,0 +1,41 @@
+import pandas as pd
+
+from sdmetrics.timeseries.sequence_length_similarity import SequenceLengthSimilarity
+
+
+class TestSequenceLengthSimilarity:
+    def test_compute(self):
+        """Test the score when the sequence length distributions only partially match."""
+        # Setup: real sequence lengths are {1, 2, 1}; synthetic sequence lengths are {1, 1, 1}
+        real_data = pd.Series(['id1', 'id2', 'id2', 'id3'])
+        synthetic_data = pd.Series(['id4', 'id5', 'id6'])
+
+        # Run
+        score = SequenceLengthSimilarity.compute(real_data, synthetic_data)
+
+        # Assert
+        assert score == 0.6666666666666667
+
+    def test_compute_one(self):
+        """Test it returns 1 when real and synthetic data have the same distribution."""
+        # Setup: both datasets contain sequences of lengths 1, 2 and 3 (under different keys)
+        real_data = pd.Series(['id1', 'id1', 'id2', 'id2', 'id2', 'id3'])
+        synthetic_data = pd.Series(['id4', 'id4', 'id5', 'id6', 'id6', 'id6'])
+
+        # Run
+        score = SequenceLengthSimilarity.compute(real_data, synthetic_data)
+
+        # Assert
+        assert score == 1
+
+    def test_compute_low_score(self):
+        """Test it returns 0 when the sequence length distributions do not overlap."""
+        # Setup: real data has 100 sequences of length 1; synthetic has 1 sequence of length 100
+        real_data = pd.Series([f'id{i}' for i in range(100)])
+        synthetic_data = pd.Series(['id100'] * 100)
+
+        # Run
+        score = SequenceLengthSimilarity.compute(real_data, synthetic_data)
+
+        # Assert
+        assert score == 0