diff --git a/sdmetrics/timeseries/__init__.py b/sdmetrics/timeseries/__init__.py
index 584bcbf9..f5232004 100644
--- a/sdmetrics/timeseries/__init__.py
+++ b/sdmetrics/timeseries/__init__.py
@@ -6,6 +6,7 @@
 from sdmetrics.timeseries.efficacy import TimeSeriesEfficacyMetric
 from sdmetrics.timeseries.efficacy.classification import LSTMClassifierEfficacy
 from sdmetrics.timeseries.inter_row_msas import InterRowMSAS
+from sdmetrics.timeseries.sequence_length_similarity import SequenceLengthSimilarity
 
 __all__ = [
     'base',
@@ -18,4 +19,5 @@
     'TimeSeriesEfficacyMetric',
     'LSTMClassifierEfficacy',
     'InterRowMSAS',
+    'SequenceLengthSimilarity',
 ]
diff --git a/sdmetrics/timeseries/sequence_length_similarity.py b/sdmetrics/timeseries/sequence_length_similarity.py
new file mode 100644
index 00000000..174f0874
--- /dev/null
+++ b/sdmetrics/timeseries/sequence_length_similarity.py
@@ -0,0 +1,53 @@
+"""SequenceLengthSimilarity module."""
+
+import pandas as pd
+
+from sdmetrics.goal import Goal
+from sdmetrics.single_column.statistical.kscomplement import KSComplement
+
+
+class SequenceLengthSimilarity:
+    """Sequence Length Similarity metric.
+
+    Attributes:
+        name (str):
+            Name to use when reports about this metric are printed.
+        goal (sdmetrics.goal.Goal):
+            The goal of this metric.
+        min_value (Union[float, tuple[float]]):
+            Minimum value or values that this metric can take.
+        max_value (Union[float, tuple[float]]):
+            Maximum value or values that this metric can take.
+    """
+
+    name = 'Sequence Length Similarity'
+    goal = Goal.MAXIMIZE
+    min_value = 0.0
+    max_value = 1.0
+
+    @staticmethod
+    def compute(real_data: pd.Series, synthetic_data: pd.Series) -> float:
+        """Compute this metric.
+
+        The length of a sequence is determined by the number of times the same sequence key occurs.
+        For example if id_09231 appeared 150 times in the sequence key, then the sequence is of
+        length 150. This metric compares the lengths of all sequence keys in the
+        real data vs. the synthetic data.
+
+        It works as follows:
+            - Calculate the length of each sequence in the real data
+            - Calculate the length of each sequence in the synthetic data
+            - Apply the KSComplement metric to compare the similarities of the distributions
+            - Return this score
+
+        Args:
+            real_data (pandas.Series):
+                The sequence key column from the real dataset.
+            synthetic_data (pandas.Series):
+                The sequence key column from the synthetic dataset.
+
+        Returns:
+            float:
+                The score.
+        """
+        return KSComplement.compute(real_data.value_counts(), synthetic_data.value_counts())
diff --git a/tests/unit/timeseries/test_sequence_length_similarity.py b/tests/unit/timeseries/test_sequence_length_similarity.py
new file mode 100644
index 00000000..903b3b91
--- /dev/null
+++ b/tests/unit/timeseries/test_sequence_length_similarity.py
@@ -0,0 +1,43 @@
+import pandas as pd
+
+from sdmetrics.timeseries.sequence_length_similarity import SequenceLengthSimilarity
+
+
+class TestSequenceLengthSimilarity:
+    """Unit tests for the SequenceLengthSimilarity metric."""
+
+    def test_compute(self):
+        """Test it runs."""
+        # Setup
+        real_data = pd.Series(['id1', 'id2', 'id2', 'id3'])
+        synthetic_data = pd.Series(['id4', 'id5', 'id6'])
+
+        # Run
+        score = SequenceLengthSimilarity.compute(real_data, synthetic_data)
+
+        # Assert
+        assert score == 0.6666666666666667
+
+    def test_compute_one(self):
+        """Test it returns 1 when real and synthetic data have the same distribution."""
+        # Setup
+        real_data = pd.Series(['id1', 'id1', 'id2', 'id2', 'id2', 'id3'])
+        synthetic_data = pd.Series(['id4', 'id4', 'id5', 'id6', 'id6', 'id6'])
+
+        # Run
+        score = SequenceLengthSimilarity.compute(real_data, synthetic_data)
+
+        # Assert
+        assert score == 1
+
+    def test_compute_low_score(self):
+        """Test it for distinct distributions."""
+        # Setup
+        real_data = pd.Series([f'id{i}' for i in range(100)])
+        synthetic_data = pd.Series(['id100'] * 100)
+
+        # Run
+        score = SequenceLengthSimilarity.compute(real_data, synthetic_data)
+
+        # Assert
+        assert score == 0