From d5a372582a3273d32bec3eb011706bcb9f18d87f Mon Sep 17 00:00:00 2001
From: Felipe <fealho@gmail.com>
Date: Wed, 23 Oct 2024 14:44:02 -0700
Subject: [PATCH 1/3] Add metric

---
 .../timeseries/sequence_length_similarity.py  | 56 +++++++++++++++++++
 .../test_sequence_length_similarity.py        | 29 ++++++++++
 2 files changed, 85 insertions(+)
 create mode 100644 sdmetrics/timeseries/sequence_length_similarity.py
 create mode 100644 tests/unit/timeseries/test_sequence_length_similarity.py

diff --git a/sdmetrics/timeseries/sequence_length_similarity.py b/sdmetrics/timeseries/sequence_length_similarity.py
new file mode 100644
index 00000000..1260f428
--- /dev/null
+++ b/sdmetrics/timeseries/sequence_length_similarity.py
@@ -0,0 +1,56 @@
+"""SequenceLengthSimilarity module."""
+
+import pandas as pd
+
+from sdmetrics.goal import Goal
+from sdmetrics.single_column.statistical.kscomplement import KSComplement
+
+
+class SequenceLengthSimilarity:
+    """Sequence Length Similarity metric.
+
+    Attributes:
+        name (str):
+            Name to use when reports about this metric are printed.
+        goal (sdmetrics.goal.Goal):
+            The goal of this metric.
+        min_value (Union[float, tuple[float]]):
+            Minimum value or values that this metric can take.
+        max_value (Union[float, tuple[float]]):
+            Maximum value or values that this metric can take.
+    """
+
+    name = 'BayesianNetwork Likelihood'
+    goal = Goal.MAXIMIZE
+    min_value = 0.0
+    max_value = 1.0
+
+    @staticmethod
+    def compute(real_data: pd.Series, synthetic_data: pd.Series) -> float:
+        """Compute this metric.
+
+        The length of a sequence is determined by the number of times the same sequence key occurs.
+        For example if id_09231 appeared 150 times in the sequence key, then the sequence is of
+        length 150. This metric compares the lengths of all sequence keys in the
+        real data vs. the synthetic data.
+
+        It works as follows:
+            - Calculate the length of each sequence in the real data
+            - Calculate the length of each sequence in the synthetic data
+            - Apply the KSComplement metric to compare the similarities of the distributions
+            - Return this score
+
+        Args:
+            real_data (pandas.Series):
+                The sequence key values from the real dataset.
+            synthetic_data (pandas.Series):
+                The sequence key values from the synthetic dataset.
+
+        Returns:
+            float:
+                Mean of the log probabilities returned by the Bayesian Network.
+        """
+        real_lengths = real_data.value_counts().to_numpy()
+        synthetic_lengths = synthetic_data.value_counts().to_numpy()
+
+        return KSComplement.compute(real_lengths, synthetic_lengths)
diff --git a/tests/unit/timeseries/test_sequence_length_similarity.py b/tests/unit/timeseries/test_sequence_length_similarity.py
new file mode 100644
index 00000000..96f59e81
--- /dev/null
+++ b/tests/unit/timeseries/test_sequence_length_similarity.py
@@ -0,0 +1,29 @@
+import pandas as pd
+
+from sdmetrics.timeseries.sequence_length_similarity import SequenceLengthSimilarity
+
+
+class TestSequenceLengthSimilarity:
+    def test_compute_one(self):
+        """Test it returns 1 when real and synthetic data have the same distribution."""
+        # Setup
+        real_data = pd.Series(['id1', 'id1', 'id2', 'id2', 'id2', 'id3'])
+        synthetic_data = pd.Series(['id1', 'id1', 'id2', 'id3', 'id3', 'id3'])
+
+        # Run
+        score = SequenceLengthSimilarity.compute(real_data, synthetic_data)
+
+        # Assert
+        assert score == 1
+
+    def test_compute_low_score(self):
+        """Test it for distinct distributions."""
+        # Setup
+        real_data = pd.Series(['id1', 'id1', 'id2'])
+        synthetic_data = pd.Series(['id1', 'id2', 'id3'])
+
+        # Run
+        score = SequenceLengthSimilarity.compute(real_data, synthetic_data)
+
+        # Assert
+        assert score == 0.5

From 9524b00c0761e4d5fbd0b06b84a5db8e52a5f154 Mon Sep 17 00:00:00 2001
From: Felipe <fealho@gmail.com>
Date: Mon, 28 Oct 2024 09:37:37 -0700
Subject: [PATCH 2/3] Fix ordering of the metric

---
 sdmetrics/timeseries/__init__.py                         | 2 ++
 sdmetrics/timeseries/sequence_length_similarity.py       | 9 +++------
 tests/unit/timeseries/test_sequence_length_similarity.py | 8 ++++----
 3 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/sdmetrics/timeseries/__init__.py b/sdmetrics/timeseries/__init__.py
index 6a09b529..bc012092 100644
--- a/sdmetrics/timeseries/__init__.py
+++ b/sdmetrics/timeseries/__init__.py
@@ -5,6 +5,7 @@
 from sdmetrics.timeseries.detection import LSTMDetection, TimeSeriesDetectionMetric
 from sdmetrics.timeseries.efficacy import TimeSeriesEfficacyMetric
 from sdmetrics.timeseries.efficacy.classification import LSTMClassifierEfficacy
+from sdmetrics.timeseries.sequence_length_similarity import SequenceLengthSimilarity
 
 __all__ = [
     'base',
@@ -16,4 +17,5 @@
     'LSTMDetection',
     'TimeSeriesEfficacyMetric',
     'LSTMClassifierEfficacy',
+    'SequenceLengthSimilarity',
 ]
diff --git a/sdmetrics/timeseries/sequence_length_similarity.py b/sdmetrics/timeseries/sequence_length_similarity.py
index 1260f428..174f0874 100644
--- a/sdmetrics/timeseries/sequence_length_similarity.py
+++ b/sdmetrics/timeseries/sequence_length_similarity.py
@@ -20,7 +20,7 @@ class SequenceLengthSimilarity:
             Maximum value or values that this metric can take.
     """
 
-    name = 'BayesianNetwork Likelihood'
+    name = 'Sequence Length Similarity'
     goal = Goal.MAXIMIZE
     min_value = 0.0
     max_value = 1.0
@@ -48,9 +48,6 @@ def compute(real_data: pd.Series, synthetic_data: pd.Series) -> float:
 
         Returns:
             float:
-                Mean of the log probabilities returned by the Bayesian Network.
+                The score.
         """
-        real_lengths = real_data.value_counts().to_numpy()
-        synthetic_lengths = synthetic_data.value_counts().to_numpy()
-
-        return KSComplement.compute(real_lengths, synthetic_lengths)
+        return KSComplement.compute(real_data.value_counts(), synthetic_data.value_counts())
diff --git a/tests/unit/timeseries/test_sequence_length_similarity.py b/tests/unit/timeseries/test_sequence_length_similarity.py
index 96f59e81..02180780 100644
--- a/tests/unit/timeseries/test_sequence_length_similarity.py
+++ b/tests/unit/timeseries/test_sequence_length_similarity.py
@@ -8,7 +8,7 @@ def test_compute_one(self):
         """Test it returns 1 when real and synthetic data have the same distribution."""
         # Setup
         real_data = pd.Series(['id1', 'id1', 'id2', 'id2', 'id2', 'id3'])
-        synthetic_data = pd.Series(['id1', 'id1', 'id2', 'id3', 'id3', 'id3'])
+        synthetic_data = pd.Series(['id4', 'id4', 'id5', 'id6', 'id6', 'id6'])
 
         # Run
         score = SequenceLengthSimilarity.compute(real_data, synthetic_data)
@@ -19,11 +19,11 @@ def test_compute_one(self):
     def test_compute_low_score(self):
         """Test it for distinct distributions."""
         # Setup
-        real_data = pd.Series(['id1', 'id1', 'id2'])
-        synthetic_data = pd.Series(['id1', 'id2', 'id3'])
+        real_data = pd.Series([f'id{i}' for i in range(100)])
+        synthetic_data = pd.Series(['id100'] * 100)
 
         # Run
         score = SequenceLengthSimilarity.compute(real_data, synthetic_data)
 
         # Assert
-        assert score == 0.5
+        assert score == 0

From ae46e7ef198ee08357133b96702cdfde9a41a7cd Mon Sep 17 00:00:00 2001
From: Felipe <fealho@gmail.com>
Date: Thu, 31 Oct 2024 04:24:36 -0700
Subject: [PATCH 3/3] Add test case

---
 .../timeseries/test_sequence_length_similarity.py    | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/tests/unit/timeseries/test_sequence_length_similarity.py b/tests/unit/timeseries/test_sequence_length_similarity.py
index 02180780..903b3b91 100644
--- a/tests/unit/timeseries/test_sequence_length_similarity.py
+++ b/tests/unit/timeseries/test_sequence_length_similarity.py
@@ -4,6 +4,18 @@
 
 
 class TestSequenceLengthSimilarity:
+    def test_compute(self):
+        """Test it runs."""
+        # Setup
+        real_data = pd.Series(['id1', 'id2', 'id2', 'id3'])
+        synthetic_data = pd.Series(['id4', 'id5', 'id6'])
+
+        # Run
+        score = SequenceLengthSimilarity.compute(real_data, synthetic_data)
+
+        # Assert
+        assert score == 0.6666666666666667
+
     def test_compute_one(self):
         """Test it returns 1 when real and synthetic data have the same distribution."""
         # Setup