From d5a372582a3273d32bec3eb011706bcb9f18d87f Mon Sep 17 00:00:00 2001 From: Felipe Date: Wed, 23 Oct 2024 14:44:02 -0700 Subject: [PATCH 1/3] Add metric --- .../timeseries/sequence_length_similarity.py | 56 +++++++++++++++++++ .../test_sequence_length_similarity.py | 29 ++++++++++ 2 files changed, 85 insertions(+) create mode 100644 sdmetrics/timeseries/sequence_length_similarity.py create mode 100644 tests/unit/timeseries/test_sequence_length_similarity.py diff --git a/sdmetrics/timeseries/sequence_length_similarity.py b/sdmetrics/timeseries/sequence_length_similarity.py new file mode 100644 index 00000000..1260f428 --- /dev/null +++ b/sdmetrics/timeseries/sequence_length_similarity.py @@ -0,0 +1,56 @@ +"""SequenceLengthSimilarity module.""" + +import pandas as pd + +from sdmetrics.goal import Goal +from sdmetrics.single_column.statistical.kscomplement import KSComplement + + +class SequenceLengthSimilarity: + """Sequence Length Similarity metric. + + Attributes: + name (str): + Name to use when reports about this metric are printed. + goal (sdmetrics.goal.Goal): + The goal of this metric. + min_value (Union[float, tuple[float]]): + Minimum value or values that this metric can take. + max_value (Union[float, tuple[float]]): + Maximum value or values that this metric can take. + """ + + name = 'BayesianNetwork Likelihood' + goal = Goal.MAXIMIZE + min_value = 0.0 + max_value = 1.0 + + @staticmethod + def compute(real_data: pd.Series, synthetic_data: pd.Series) -> float: + """Compute this metric. + + The length of a sequence is determined by the number of times the same sequence key occurs. + For example if id_09231 appeared 150 times in the sequence key, then the sequence is of + length 150. This metric compares the lengths of all sequence keys in the + real data vs. the synthetic data. 
+ + It works as follows: + - Calculate the length of each sequence in the real data + - Calculate the length of each sequence in the synthetic data + - Apply the KSComplement metric to compare the similarities of the distributions + - Return this score + + Args: + real_data (Union[numpy.ndarray, pandas.DataFrame]): + The values from the real dataset. + synthetic_data (Union[numpy.ndarray, pandas.DataFrame]): + The values from the synthetic dataset. + + Returns: + float: + Mean of the log probabilities returned by the Bayesian Network. + """ + real_lengths = real_data.value_counts().to_numpy() + synthetic_lengths = synthetic_data.value_counts().to_numpy() + + return KSComplement.compute(real_lengths, synthetic_lengths) diff --git a/tests/unit/timeseries/test_sequence_length_similarity.py b/tests/unit/timeseries/test_sequence_length_similarity.py new file mode 100644 index 00000000..96f59e81 --- /dev/null +++ b/tests/unit/timeseries/test_sequence_length_similarity.py @@ -0,0 +1,29 @@ +import pandas as pd + +from sdmetrics.timeseries.sequence_length_similarity import SequenceLengthSimilarity + + +class TestSequenceLengthSimilarity: + def test_compute_one(self): + """Test it returns 1 when real and synthetic data have the same distribution.""" + # Setup + real_data = pd.Series(['id1', 'id1', 'id2', 'id2', 'id2', 'id3']) + synthetic_data = pd.Series(['id1', 'id1', 'id2', 'id3', 'id3', 'id3']) + + # Run + score = SequenceLengthSimilarity.compute(real_data, synthetic_data) + + # Assert + assert score == 1 + + def test_compute_low_score(self): + """Test it for distinct distributions.""" + # Setup + real_data = pd.Series(['id1', 'id1', 'id2']) + synthetic_data = pd.Series(['id1', 'id2', 'id3']) + + # Run + score = SequenceLengthSimilarity.compute(real_data, synthetic_data) + + # Assert + assert score == 0.5 From 9524b00c0761e4d5fbd0b06b84a5db8e52a5f154 Mon Sep 17 00:00:00 2001 From: Felipe Date: Mon, 28 Oct 2024 09:37:37 -0700 Subject: [PATCH 2/3] Fix ordering of the metric 
--- sdmetrics/timeseries/__init__.py | 2 ++ sdmetrics/timeseries/sequence_length_similarity.py | 9 +++------ tests/unit/timeseries/test_sequence_length_similarity.py | 8 ++++---- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/sdmetrics/timeseries/__init__.py b/sdmetrics/timeseries/__init__.py index 6a09b529..bc012092 100644 --- a/sdmetrics/timeseries/__init__.py +++ b/sdmetrics/timeseries/__init__.py @@ -5,6 +5,7 @@ from sdmetrics.timeseries.detection import LSTMDetection, TimeSeriesDetectionMetric from sdmetrics.timeseries.efficacy import TimeSeriesEfficacyMetric from sdmetrics.timeseries.efficacy.classification import LSTMClassifierEfficacy +from sdmetrics.timeseries.sequence_length_similarity import SequenceLengthSimilarity __all__ = [ 'base', @@ -16,4 +17,5 @@ 'LSTMDetection', 'TimeSeriesEfficacyMetric', 'LSTMClassifierEfficacy', + 'SequenceLengthSimilarity', ] diff --git a/sdmetrics/timeseries/sequence_length_similarity.py b/sdmetrics/timeseries/sequence_length_similarity.py index 1260f428..174f0874 100644 --- a/sdmetrics/timeseries/sequence_length_similarity.py +++ b/sdmetrics/timeseries/sequence_length_similarity.py @@ -20,7 +20,7 @@ class SequenceLengthSimilarity: Maximum value or values that this metric can take. """ - name = 'BayesianNetwork Likelihood' + name = 'Sequence Length Similarity' goal = Goal.MAXIMIZE min_value = 0.0 max_value = 1.0 @@ -48,9 +48,6 @@ def compute(real_data: pd.Series, synthetic_data: pd.Series) -> float: Returns: float: - Mean of the log probabilities returned by the Bayesian Network. + The score. 
""" - real_lengths = real_data.value_counts().to_numpy() - synthetic_lengths = synthetic_data.value_counts().to_numpy() - - return KSComplement.compute(real_lengths, synthetic_lengths) + return KSComplement.compute(real_data.value_counts(), synthetic_data.value_counts()) diff --git a/tests/unit/timeseries/test_sequence_length_similarity.py b/tests/unit/timeseries/test_sequence_length_similarity.py index 96f59e81..02180780 100644 --- a/tests/unit/timeseries/test_sequence_length_similarity.py +++ b/tests/unit/timeseries/test_sequence_length_similarity.py @@ -8,7 +8,7 @@ def test_compute_one(self): """Test it returns 1 when real and synthetic data have the same distribution.""" # Setup real_data = pd.Series(['id1', 'id1', 'id2', 'id2', 'id2', 'id3']) - synthetic_data = pd.Series(['id1', 'id1', 'id2', 'id3', 'id3', 'id3']) + synthetic_data = pd.Series(['id4', 'id4', 'id5', 'id6', 'id6', 'id6']) # Run score = SequenceLengthSimilarity.compute(real_data, synthetic_data) @@ -19,11 +19,11 @@ def test_compute_one(self): def test_compute_low_score(self): """Test it for distinct distributions.""" # Setup - real_data = pd.Series(['id1', 'id1', 'id2']) - synthetic_data = pd.Series(['id1', 'id2', 'id3']) + real_data = pd.Series([f'id{i}' for i in range(100)]) + synthetic_data = pd.Series(['id100'] * 100) # Run score = SequenceLengthSimilarity.compute(real_data, synthetic_data) # Assert - assert score == 0.5 + assert score == 0 From ae46e7ef198ee08357133b96702cdfde9a41a7cd Mon Sep 17 00:00:00 2001 From: Felipe Date: Thu, 31 Oct 2024 04:24:36 -0700 Subject: [PATCH 3/3] Add test case --- .../timeseries/test_sequence_length_similarity.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/unit/timeseries/test_sequence_length_similarity.py b/tests/unit/timeseries/test_sequence_length_similarity.py index 02180780..903b3b91 100644 --- a/tests/unit/timeseries/test_sequence_length_similarity.py +++ b/tests/unit/timeseries/test_sequence_length_similarity.py @@ -4,6 +4,18 
@@ class TestSequenceLengthSimilarity: + def test_compute(self): + """Test it runs.""" + # Setup + real_data = pd.Series(['id1', 'id2', 'id2', 'id3']) + synthetic_data = pd.Series(['id4', 'id5', 'id6']) + + # Run + score = SequenceLengthSimilarity.compute(real_data, synthetic_data) + + # Assert + assert score == 0.6666666666666667 + def test_compute_one(self): """Test it returns 1 when real and synthetic data have the same distribution.""" # Setup