From 5a3701662bad3492d7e6f8548b8babb241129141 Mon Sep 17 00:00:00 2001 From: Felipe Alex Hofmann Date: Mon, 4 Nov 2024 18:51:11 -0800 Subject: [PATCH 1/5] Add metric for inter-row MSAS (#647) --- .../single_column/statistical/kscomplement.py | 9 +- sdmetrics/timeseries/__init__.py | 2 + sdmetrics/timeseries/inter_row_msas.py | 106 +++++++++++ tests/unit/timeseries/test_inter_row_msas.py | 176 ++++++++++++++++++ 4 files changed, 292 insertions(+), 1 deletion(-) create mode 100644 sdmetrics/timeseries/inter_row_msas.py create mode 100644 tests/unit/timeseries/test_inter_row_msas.py diff --git a/sdmetrics/single_column/statistical/kscomplement.py b/sdmetrics/single_column/statistical/kscomplement.py index 3be01330..525e85c7 100644 --- a/sdmetrics/single_column/statistical/kscomplement.py +++ b/sdmetrics/single_column/statistical/kscomplement.py @@ -1,5 +1,6 @@ """Kolmogorov-Smirnov test based Metric.""" +import numpy as np import pandas as pd from scipy.stats import ks_2samp @@ -56,7 +57,13 @@ def compute(real_data, synthetic_data): real_data = pd.to_numeric(real_data) synthetic_data = pd.to_numeric(synthetic_data) - statistic, _ = ks_2samp(real_data, synthetic_data) + try: + statistic, _ = ks_2samp(real_data, synthetic_data) + except ValueError as e: + if str(e) == 'Data passed to ks_2samp must not be empty': + return np.nan + else: + raise ValueError(e) return 1 - statistic diff --git a/sdmetrics/timeseries/__init__.py b/sdmetrics/timeseries/__init__.py index 6a09b529..584bcbf9 100644 --- a/sdmetrics/timeseries/__init__.py +++ b/sdmetrics/timeseries/__init__.py @@ -5,6 +5,7 @@ from sdmetrics.timeseries.detection import LSTMDetection, TimeSeriesDetectionMetric from sdmetrics.timeseries.efficacy import TimeSeriesEfficacyMetric from sdmetrics.timeseries.efficacy.classification import LSTMClassifierEfficacy +from sdmetrics.timeseries.inter_row_msas import InterRowMSAS __all__ = [ 'base', @@ -16,4 +17,5 @@ 'LSTMDetection', 'TimeSeriesEfficacyMetric', 'LSTMClassifierEfficacy', + 'InterRowMSAS', ] diff --git a/sdmetrics/timeseries/inter_row_msas.py b/sdmetrics/timeseries/inter_row_msas.py new file mode 100644 index 00000000..eea77f06 --- /dev/null +++ b/sdmetrics/timeseries/inter_row_msas.py @@ -0,0 +1,106 @@ +"""InterRowMSAS module.""" + +import warnings + +import numpy as np +import pandas as pd + +from sdmetrics.goal import Goal +from sdmetrics.single_column.statistical.kscomplement import KSComplement + + +class InterRowMSAS: + """Inter-Row Multi-Sequence Aggregate Similarity (MSAS) metric. + + Attributes: + name (str): + Name to use when reports about this metric are printed. + goal (sdmetrics.goal.Goal): + The goal of this metric. + min_value (Union[float, tuple[float]]): + Minimum value or values that this metric can take. + max_value (Union[float, tuple[float]]): + Maximum value or values that this metric can take. + """ + + name = 'Inter-Row Multi-Sequence Aggregate Similarity' + goal = Goal.MAXIMIZE + min_value = 0.0 + max_value = 1.0 + + @staticmethod + def compute(real_data, synthetic_data, n_rows_diff=1, apply_log=False): + """Compute this metric. + + This metric compares the inter-row differences of sequences in the real data + vs. the synthetic data. + + It works as follows: + - Calculate the difference between row r and row r+x for each row in the real data + - Take the average over each sequence to form a distribution D_r + - Do the same for the synthetic data to form a new distribution D_s + - Apply the KSComplement metric to compare the similarities of (D_r, D_s) + - Return this score + + Args: + real_data (tuple[pd.Series, pd.Series]): + A tuple of 2 pandas.Series objects. The first represents the sequence key + of the real data and the second represents a continuous column of data. + synthetic_data (tuple[pd.Series, pd.Series]): + A tuple of 2 pandas.Series objects. The first represents the sequence key + of the synthetic data and the second represents a continuous column of data. + n_rows_diff (int): + An integer representing the number of rows to consider when taking the difference. + apply_log (bool): + Whether to apply a natural log before taking the difference. + + Returns: + float: + The similarity score between the real and synthetic data distributions. + """ + for data in [real_data, synthetic_data]: + if ( + not isinstance(data, tuple) + or len(data) != 2 + or (not (isinstance(data[0], pd.Series) and isinstance(data[1], pd.Series))) + ): + raise ValueError('The data must be a tuple of two pandas series.') + + if not isinstance(n_rows_diff, int) or n_rows_diff < 1: + raise ValueError("'n_rows_diff' must be an integer greater than zero.") + + if not isinstance(apply_log, bool): + raise ValueError("'apply_log' must be a boolean.") + + real_keys, real_values = real_data + synthetic_keys, synthetic_values = synthetic_data + + if apply_log: + real_values = np.log(real_values) + synthetic_values = np.log(synthetic_values) + + def calculate_differences(keys, values, n_rows_diff, data_name): + group_sizes = values.groupby(keys).size() + num_invalid_groups = group_sizes[group_sizes <= n_rows_diff].count() + if num_invalid_groups > 0: + warnings.warn( + f"n_rows_diff '{n_rows_diff}' is greater than the " + f'size of {num_invalid_groups} sequence keys in {data_name}.' + ) + + differences = values.groupby(keys).apply( + lambda group: np.mean( + group.to_numpy()[n_rows_diff:] - group.to_numpy()[:-n_rows_diff] + ) + if len(group) > n_rows_diff + else np.nan + ) + + return pd.Series(differences) + + real_diff = calculate_differences(real_keys, real_values, n_rows_diff, 'real_data') + synthetic_diff = calculate_differences( + synthetic_keys, synthetic_values, n_rows_diff, 'synthetic_data' + ) + + return KSComplement.compute(real_diff, synthetic_diff) diff --git a/tests/unit/timeseries/test_inter_row_msas.py b/tests/unit/timeseries/test_inter_row_msas.py new file mode 100644 index 00000000..14101079 --- /dev/null +++ b/tests/unit/timeseries/test_inter_row_msas.py @@ -0,0 +1,176 @@ +import pandas as pd +import pytest + +from sdmetrics.timeseries.inter_row_msas import InterRowMSAS + + +class TestInterRowMSAS: + def test_compute(self): + """Test it runs.""" + # Setup + real_keys = pd.Series(['id1', 'id1', 'id1', 'id2', 'id2', 'id2']) + real_values = pd.Series([1, 2, 3, 4, 5, 6]) + synthetic_keys = pd.Series(['id3', 'id3', 'id3', 'id4', 'id4', 'id4']) + synthetic_values = pd.Series([1, 10, 3, 7, 5, 1]) + + # Run + score = InterRowMSAS.compute( + real_data=(real_keys, real_values), synthetic_data=(synthetic_keys, synthetic_values) + ) + + # Assert + assert score == 0.5 + + def test_compute_identical_sequences(self): + """Test it returns 1 when real and synthetic data are identical.""" + # Setup + real_keys = pd.Series(['id1', 'id1', 'id1', 'id2', 'id2', 'id2']) + real_values = pd.Series([1, 2, 3, 4, 5, 6]) + synthetic_keys = pd.Series(['id1', 'id1', 'id1', 'id2', 'id2', 'id2']) + synthetic_values = pd.Series([1, 2, 3, 4, 5, 6]) + + # Run + score = InterRowMSAS.compute( + real_data=(real_keys, real_values), synthetic_data=(synthetic_keys, synthetic_values) + ) + + # Assert + assert score == 1 + + def test_compute_different_sequences(self): + """Test it for distinct distributions.""" + # Setup + real_keys = pd.Series(['id1', 'id1', 'id1', 'id2', 'id2', 'id2']) + real_values = pd.Series([1, 2, 3, 4, 5, 6]) + synthetic_keys = pd.Series(['id1', 'id1', 'id1', 'id2', 'id2', 'id2']) + synthetic_values = pd.Series([1, 3, 5, 2, 4, 6]) + + # Run + score = InterRowMSAS.compute( + real_data=(real_keys, real_values), synthetic_data=(synthetic_keys, synthetic_values) + ) + + # Assert + assert score == 0 + + def test_compute_with_log(self): + """Test it with logarithmic transformation.""" + # Setup + real_keys = pd.Series(['id1', 'id1', 'id1', 'id2', 'id2', 'id2']) + real_values = pd.Series([1, 2, 4, 8, 16, 32]) + synthetic_keys = pd.Series(['id1', 'id1', 'id1', 'id2', 'id2', 'id2']) + synthetic_values = pd.Series([1, 2, 4, 8, 16, 32]) + + # Run + score = InterRowMSAS.compute( + real_data=(real_keys, real_values), + synthetic_data=(synthetic_keys, synthetic_values), + apply_log=True, + ) + + # Assert + assert score == 1 + + def test_compute_different_n_rows_diff(self): + """Test it with different n_rows_diff.""" + # Setup + real_keys = pd.Series(['id1'] * 10 + ['id2'] * 10) + real_values = pd.Series(list(range(10)) + list(range(10))) + synthetic_keys = pd.Series(['id1'] * 10 + ['id2'] * 10) + synthetic_values = pd.Series(list(range(10)) + list(range(10))) + + # Run + score = InterRowMSAS.compute( + real_data=(real_keys, real_values), + synthetic_data=(synthetic_keys, synthetic_values), + n_rows_diff=3, + ) + + # Assert + assert score == 1 + + def test_compute_invalid_real_data(self): + """Test that it raises ValueError when real_data is invalid.""" + # Setup + real_data = [[1, 2, 3], [4, 5, 6]] # Not a tuple of pandas Series + synthetic_keys = pd.Series(['id1', 'id1', 'id2', 'id2']) + synthetic_values = pd.Series([1, 2, 3, 4]) + + # Run and Assert + with pytest.raises(ValueError, match='The data must be a tuple of two pandas series.'): + InterRowMSAS.compute( + real_data=real_data, + synthetic_data=(synthetic_keys, synthetic_values), + n_rows_diff=1, + apply_log=False, + ) + + def test_compute_invalid_synthetic_data(self): + """Test that it raises ValueError when synthetic_data is invalid.""" + # Setup + real_keys = pd.Series(['id1', 'id1', 'id2', 'id2']) + real_values = pd.Series([1, 2, 3, 4]) + synthetic_data = [[1, 2, 3], [4, 5, 6]] # Not a tuple of pandas Series + + # Run and Assert + with pytest.raises(ValueError, match='The data must be a tuple of two pandas series.'): + InterRowMSAS.compute( + real_data=(real_keys, real_values), + synthetic_data=synthetic_data, + n_rows_diff=1, + apply_log=False, + ) + + def test_compute_invalid_n_rows_diff(self): + """Test that it raises ValueError when n_rows_diff is invalid.""" + # Setup + real_keys = pd.Series(['id1', 'id1', 'id2', 'id2']) + real_values = pd.Series([1, 2, 3, 4]) + synthetic_keys = pd.Series(['id3', 'id3', 'id4', 'id4']) + synthetic_values = pd.Series([1, 2, 3, 4]) + + # Run and Assert + with pytest.raises(ValueError, match="'n_rows_diff' must be an integer greater than zero."): + InterRowMSAS.compute( + real_data=(real_keys, real_values), + synthetic_data=(synthetic_keys, synthetic_values), + n_rows_diff=0, + apply_log=False, + ) + + def test_compute_invalid_apply_log(self): + """Test that it raises ValueError when apply_log is invalid.""" + # Setup + real_keys = pd.Series(['id1', 'id1', 'id2', 'id2']) + real_values = pd.Series([1, 2, 3, 4]) + synthetic_keys = pd.Series(['id1', 'id1', 'id2', 'id2']) + synthetic_values = pd.Series([1, 2, 3, 4]) + + # Run and Assert + with pytest.raises(ValueError, match="'apply_log' must be a boolean."): + InterRowMSAS.compute( + real_data=(real_keys, real_values), + synthetic_data=(synthetic_keys, synthetic_values), + n_rows_diff=1, + apply_log='True', # Should be a boolean, not a string + ) + + def test_compute_warning(self): + """Test a warning is raised when n_rows_diff is greater than sequence values size.""" + # Setup + real_keys = pd.Series(['id1', 'id1', 'id1', 'id2', 'id2', 'id2']) + real_values = pd.Series([1, 2, 3, 4, 5, 6]) + synthetic_keys = pd.Series(['id3', 'id3', 'id3', 'id4', 'id4', 'id4']) + synthetic_values = pd.Series([1, 10, 3, 7, 5, 1]) + + # Run and Assert + warn_msg = "n_rows_diff '10' is greater than the size of 2 sequence keys in real_data." + with pytest.warns(UserWarning, match=warn_msg): + score = InterRowMSAS.compute( + real_data=(real_keys, real_values), + synthetic_data=(synthetic_keys, synthetic_values), + n_rows_diff=10, + ) + + # Assert + assert pd.isna(score) From b6771252a42c4f5e2259b40eadbcd13629850718 Mon Sep 17 00:00:00 2001 From: Felipe Alex Hofmann Date: Mon, 4 Nov 2024 19:41:01 -0800 Subject: [PATCH 2/5] Add metric for sequence length similarity (#643) --- sdmetrics/timeseries/__init__.py | 2 + .../timeseries/sequence_length_similarity.py | 53 +++++++++++++++++++ .../test_sequence_length_similarity.py | 41 ++++++++++++++ 3 files changed, 96 insertions(+) create mode 100644 sdmetrics/timeseries/sequence_length_similarity.py create mode 100644 tests/unit/timeseries/test_sequence_length_similarity.py diff --git a/sdmetrics/timeseries/__init__.py b/sdmetrics/timeseries/__init__.py index 584bcbf9..f5232004 100644 --- a/sdmetrics/timeseries/__init__.py +++ b/sdmetrics/timeseries/__init__.py @@ -6,6 +6,7 @@ from sdmetrics.timeseries.efficacy import TimeSeriesEfficacyMetric from sdmetrics.timeseries.efficacy.classification import LSTMClassifierEfficacy from sdmetrics.timeseries.inter_row_msas import InterRowMSAS +from sdmetrics.timeseries.sequence_length_similarity import SequenceLengthSimilarity __all__ = [ 'base', @@ -18,4 +19,5 @@ 'TimeSeriesEfficacyMetric', 'LSTMClassifierEfficacy', 'InterRowMSAS', + 'SequenceLengthSimilarity', ] diff --git a/sdmetrics/timeseries/sequence_length_similarity.py b/sdmetrics/timeseries/sequence_length_similarity.py new file mode 100644 index 00000000..174f0874 --- /dev/null +++ b/sdmetrics/timeseries/sequence_length_similarity.py @@ -0,0 +1,53 @@ +"""SequenceLengthSimilarity module.""" + +import pandas as pd + +from sdmetrics.goal import Goal +from sdmetrics.single_column.statistical.kscomplement import KSComplement + + +class SequenceLengthSimilarity: + """Sequence Length Similarity metric. + + Attributes: + name (str): + Name to use when reports about this metric are printed. + goal (sdmetrics.goal.Goal): + The goal of this metric. + min_value (Union[float, tuple[float]]): + Minimum value or values that this metric can take. + max_value (Union[float, tuple[float]]): + Maximum value or values that this metric can take. + """ + + name = 'Sequence Length Similarity' + goal = Goal.MAXIMIZE + min_value = 0.0 + max_value = 1.0 + + @staticmethod + def compute(real_data: pd.Series, synthetic_data: pd.Series) -> float: + """Compute this metric. + + The length of a sequence is determined by the number of times the same sequence key occurs. + For example if id_09231 appeared 150 times in the sequence key, then the sequence is of + length 150. This metric compares the lengths of all sequence keys in the + real data vs. the synthetic data. + + It works as follows: + - Calculate the length of each sequence in the real data + - Calculate the length of each sequence in the synthetic data + - Apply the KSComplement metric to compare the similarities of the distributions + - Return this score + + Args: + real_data (Union[numpy.ndarray, pandas.DataFrame]): + The values from the real dataset. + synthetic_data (Union[numpy.ndarray, pandas.DataFrame]): + The values from the synthetic dataset. + + Returns: + float: + The score. + """ + return KSComplement.compute(real_data.value_counts(), synthetic_data.value_counts()) diff --git a/tests/unit/timeseries/test_sequence_length_similarity.py b/tests/unit/timeseries/test_sequence_length_similarity.py new file mode 100644 index 00000000..903b3b91 --- /dev/null +++ b/tests/unit/timeseries/test_sequence_length_similarity.py @@ -0,0 +1,41 @@ +import pandas as pd + +from sdmetrics.timeseries.sequence_length_similarity import SequenceLengthSimilarity + + +class TestSequenceLengthSimilarity: + def test_compute(self): + """Test it runs.""" + # Setup + real_data = pd.Series(['id1', 'id2', 'id2', 'id3']) + synthetic_data = pd.Series(['id4', 'id5', 'id6']) + + # Run + score = SequenceLengthSimilarity.compute(real_data, synthetic_data) + + # Assert + assert score == 0.6666666666666667 + + def test_compute_one(self): + """Test it returns 1 when real and synthetic data have the same distribution.""" + # Setup + real_data = pd.Series(['id1', 'id1', 'id2', 'id2', 'id2', 'id3']) + synthetic_data = pd.Series(['id4', 'id4', 'id5', 'id6', 'id6', 'id6']) + + # Run + score = SequenceLengthSimilarity.compute(real_data, synthetic_data) + + # Assert + assert score == 1 + + def test_compute_low_score(self): + """Test it for distinct distributions.""" + # Setup + real_data = pd.Series([f'id{i}' for i in range(100)]) + synthetic_data = pd.Series(['id100'] * 100) + + # Run + score = SequenceLengthSimilarity.compute(real_data, synthetic_data) + + # Assert + assert score == 0 From a395568dabac6dba7d11df203843d5d9648fbe0f Mon Sep 17 00:00:00 2001 From: Felipe Alex Hofmann Date: Mon, 4 Nov 2024 20:17:34 -0800 Subject: [PATCH 3/5] Add metric for general MSAS statistics (#649) --- sdmetrics/timeseries/__init__.py | 2 + sdmetrics/timeseries/statistic_msas.py | 96 ++++++++++++++ tests/unit/timeseries/test_statistic_msas.py | 125 +++++++++++++++++++ 3 files changed, 223 insertions(+) create mode 100644 sdmetrics/timeseries/statistic_msas.py create mode 100644 tests/unit/timeseries/test_statistic_msas.py diff --git a/sdmetrics/timeseries/__init__.py b/sdmetrics/timeseries/__init__.py index f5232004..06f5e4c8 100644 --- a/sdmetrics/timeseries/__init__.py +++ b/sdmetrics/timeseries/__init__.py @@ -7,6 +7,7 @@ from sdmetrics.timeseries.efficacy.classification import LSTMClassifierEfficacy from sdmetrics.timeseries.inter_row_msas import InterRowMSAS from sdmetrics.timeseries.sequence_length_similarity import SequenceLengthSimilarity +from sdmetrics.timeseries.statistic_msas import StatisticMSAS __all__ = [ 'base', @@ -20,4 +21,5 @@ 'LSTMClassifierEfficacy', 'InterRowMSAS', 'SequenceLengthSimilarity', + 'StatisticMSAS', ] diff --git a/sdmetrics/timeseries/statistic_msas.py b/sdmetrics/timeseries/statistic_msas.py new file mode 100644 index 00000000..8afab764 --- /dev/null +++ b/sdmetrics/timeseries/statistic_msas.py @@ -0,0 +1,96 @@ +"""StatisticMSAS module.""" + +import numpy as np +import pandas as pd + +from sdmetrics.goal import Goal +from sdmetrics.single_column.statistical.kscomplement import KSComplement + + +class StatisticMSAS: + """Statistic Multi-Sequence Aggregate Similarity (MSAS) metric. + + Attributes: + name (str): + Name to use when reports about this metric are printed. + goal (sdmetrics.goal.Goal): + The goal of this metric. + min_value (Union[float, tuple[float]]): + Minimum value or values that this metric can take. + max_value (Union[float, tuple[float]]): + Maximum value or values that this metric can take. + """ + + name = 'Statistic Multi-Sequence Aggregate Similarity' + goal = Goal.MAXIMIZE + min_value = 0.0 + max_value = 1.0 + + @staticmethod + def compute(real_data, synthetic_data, statistic='mean'): + """Compute this metric. + + This metric compares the distribution of a given statistic across sequences + in the real data vs. the synthetic data. + + It works as follows: + - Calculate the specified statistic for each sequence in the real data + - Form a distribution D_r from these statistics + - Do the same for the synthetic data to form a new distribution D_s + - Apply the KSComplement metric to compare the similarities of (D_r, D_s) + - Return this score + + Args: + real_data (tuple[pd.Series, pd.Series]): + A tuple of 2 pandas.Series objects. The first represents the sequence key + of the real data and the second represents a continuous column of data. + synthetic_data (tuple[pd.Series, pd.Series]): + A tuple of 2 pandas.Series objects. The first represents the sequence key + of the synthetic data and the second represents a continuous column of data. + statistic (str): + A string representing the statistic function to use when computing MSAS. + + Available options are: + - 'mean': The arithmetic mean of the sequence + - 'median': The median value of the sequence + - 'std': The standard deviation of the sequence + - 'min': The minimum value in the sequence + - 'max': The maximum value in the sequence + + Returns: + float: + The similarity score between the real and synthetic data distributions. + """ + statistic_functions = { + 'mean': np.mean, + 'median': np.median, + 'std': np.std, + 'min': np.min, + 'max': np.max, + } + if statistic not in statistic_functions: + raise ValueError( + f'Invalid statistic: {statistic}.' + f' Choose from [{", ".join(statistic_functions.keys())}].' + ) + + for data in [real_data, synthetic_data]: + if ( + not isinstance(data, tuple) + or len(data) != 2 + or (not (isinstance(data[0], pd.Series) and isinstance(data[1], pd.Series))) + ): + raise ValueError('The data must be a tuple of two pandas series.') + + real_keys, real_values = real_data + synthetic_keys, synthetic_values = synthetic_data + stat_func = statistic_functions[statistic] + + def calculate_statistics(keys, values): + df = pd.DataFrame({'keys': keys, 'values': values}) + return df.groupby('keys')['values'].agg(stat_func) + + real_stats = calculate_statistics(real_keys, real_values) + synthetic_stats = calculate_statistics(synthetic_keys, synthetic_values) + + return KSComplement.compute(real_stats, synthetic_stats) diff --git a/tests/unit/timeseries/test_statistic_msas.py b/tests/unit/timeseries/test_statistic_msas.py new file mode 100644 index 00000000..f44812ec --- /dev/null +++ b/tests/unit/timeseries/test_statistic_msas.py @@ -0,0 +1,125 @@ +import re + +import pandas as pd +import pytest + +from sdmetrics.timeseries import StatisticMSAS + + +class TestStatisticMSAS: + def test_compute_identical_sequences(self): + """Test it returns 1 when real and synthetic data are identical.""" + # Setup + real_keys = pd.Series(['id1', 'id1', 'id1', 'id2', 'id2', 'id2']) + real_values = pd.Series([1, 2, 3, 4, 5, 6]) + synthetic_keys = pd.Series(['id3', 'id3', 'id3', 'id4', 'id4', 'id4']) + synthetic_values = pd.Series([1, 2, 3, 4, 5, 6]) + + # Run and Assert + for statistic in ['mean', 'median', 'std', 'min', 'max']: + score = StatisticMSAS.compute( + real_data=(real_keys, real_values), + synthetic_data=(synthetic_keys, synthetic_values), + statistic=statistic, + ) + assert score == 1 + + def test_compute_different_sequences(self): + """Test it for distinct distributions.""" + # Setup + real_keys = pd.Series(['id1', 'id1', 'id1', 'id2', 'id2', 'id2']) + real_values = pd.Series([1, 2, 3, 4, 5, 6]) + synthetic_keys = pd.Series(['id3', 'id3', 'id3', 'id4', 'id4', 'id4']) + synthetic_values = pd.Series([10, 20, 30, 40, 50, 60]) + + # Run and Assert + for statistic in ['mean', 'median', 'std', 'min', 'max']: + score = StatisticMSAS.compute( + real_data=(real_keys, real_values), + synthetic_data=(synthetic_keys, synthetic_values), + statistic=statistic, + ) + assert score == 0 + + def test_compute_with_single_sequence(self): + """Test it with a single sequence.""" + # Setup + real_keys = pd.Series(['id1', 'id1', 'id1']) + real_values = pd.Series([1, 2, 3]) + synthetic_keys = pd.Series(['id2', 'id2', 'id2']) + synthetic_values = pd.Series([1, 2, 3]) + + # Run + score = StatisticMSAS.compute( + real_data=(real_keys, real_values), + synthetic_data=(synthetic_keys, synthetic_values), + statistic='mean', + ) + + # Assert + assert score == 1 + + def test_compute_with_different_sequence_lengths(self): + """Test it with different sequence lengths.""" + # Setup + real_keys = pd.Series(['id1', 'id1', 'id1', 'id2', 'id2']) + real_values = pd.Series([1, 2, 3, 4, 5]) + synthetic_keys = pd.Series(['id2', 'id2', 'id3', 'id4', 'id5']) + synthetic_values = pd.Series([1, 2, 3, 4, 5]) + + # Run + score = StatisticMSAS.compute( + real_data=(real_keys, real_values), + synthetic_data=(synthetic_keys, synthetic_values), + statistic='mean', + ) + + # Assert + assert score == 0.75 + + def test_compute_with_invalid_statistic(self): + """Test it raises ValueError for invalid statistic.""" + # Setup + real_keys = pd.Series(['id1', 'id1', 'id1']) + real_values = pd.Series([1, 2, 3]) + synthetic_keys = pd.Series(['id2', 'id2', 'id2']) + synthetic_values = pd.Series([1, 2, 3]) + + # Run and Assert + err_msg = re.escape( + 'Invalid statistic: invalid. Choose from [mean, median, std, min, max].' + ) + with pytest.raises(ValueError, match=err_msg): + StatisticMSAS.compute( + real_data=(real_keys, real_values), + synthetic_data=(synthetic_keys, synthetic_values), + statistic='invalid', + ) + + def test_compute_invalid_real_data(self): + """Test that it raises ValueError when real_data is invalid.""" + # Setup + real_data = [[1, 2, 3], [4, 5, 6]] # Not a tuple of pandas Series + synthetic_keys = pd.Series(['id1', 'id1', 'id2', 'id2']) + synthetic_values = pd.Series([1, 2, 3, 4]) + + # Run and Assert + with pytest.raises(ValueError, match='The data must be a tuple of two pandas series.'): + StatisticMSAS.compute( + real_data=real_data, + synthetic_data=(synthetic_keys, synthetic_values), + ) + + def test_compute_invalid_synthetic_data(self): + """Test that it raises ValueError when synthetic_data is invalid.""" + # Setup + real_keys = pd.Series(['id1', 'id1', 'id2', 'id2']) + real_values = pd.Series([1, 2, 3, 4]) + synthetic_data = [[1, 2, 3], [4, 5, 6]] # Not a tuple of pandas Series + + # Run and Assert + with pytest.raises(ValueError, match='The data must be a tuple of two pandas series.'): + StatisticMSAS.compute( + real_data=(real_keys, real_values), + synthetic_data=synthetic_data, + ) From 343066b75b009fbe7fad27e23ab787f591e33dc1 Mon Sep 17 00:00:00 2001 From: Felipe Alex Hofmann Date: Thu, 14 Nov 2024 11:23:40 -0800 Subject: [PATCH 4/5] Relocate timeseries metrics modules (#658) --- sdmetrics/column_pairs/__init__.py | 4 ++++ sdmetrics/column_pairs/statistical/__init__.py | 4 ++++ .../statistical}/inter_row_msas.py | 0 .../statistical}/statistic_msas.py | 0 sdmetrics/single_column/__init__.py | 2 ++ sdmetrics/single_column/statistical/__init__.py | 2 ++ .../statistical}/sequence_length_similarity.py | 0 sdmetrics/timeseries/__init__.py | 6 ------ .../statistical}/test_inter_row_msas.py | 2 +- .../statistical}/test_statistic_msas.py | 2 +- .../statistical}/test_sequence_length_similarity.py | 2 +- 11 files changed, 15 insertions(+), 9 deletions(-) rename sdmetrics/{timeseries => column_pairs/statistical}/inter_row_msas.py (100%) rename sdmetrics/{timeseries => column_pairs/statistical}/statistic_msas.py (100%) rename sdmetrics/{timeseries => single_column/statistical}/sequence_length_similarity.py (100%) rename tests/unit/{timeseries => column_pairs/statistical}/test_inter_row_msas.py (99%) rename tests/unit/{timeseries => column_pairs/statistical}/test_statistic_msas.py (98%) rename tests/unit/{timeseries => single_column/statistical}/test_sequence_length_similarity.py (93%) diff --git a/sdmetrics/column_pairs/__init__.py b/sdmetrics/column_pairs/__init__.py index e44e35de..38f1aebe 100644 --- a/sdmetrics/column_pairs/__init__.py +++ b/sdmetrics/column_pairs/__init__.py @@ -11,6 +11,8 @@ DiscreteKLDivergence, ) from sdmetrics.column_pairs.statistical.referential_integrity import ReferentialIntegrity +from sdmetrics.column_pairs.statistical.inter_row_msas import InterRowMSAS +from sdmetrics.column_pairs.statistical.statistic_msas import StatisticMSAS __all__ = [ 'CardinalityBoundaryAdherence', @@ -20,4 +22,6 @@ 'CorrelationSimilarity', 'DiscreteKLDivergence', 'ReferentialIntegrity', + 'InterRowMSAS', + 'StatisticMSAS', ] diff --git a/sdmetrics/column_pairs/statistical/__init__.py b/sdmetrics/column_pairs/statistical/__init__.py index 7f921df6..7198944e 100644 --- a/sdmetrics/column_pairs/statistical/__init__.py +++ b/sdmetrics/column_pairs/statistical/__init__.py @@ -10,6 +10,8 @@ DiscreteKLDivergence, ) from sdmetrics.column_pairs.statistical.referential_integrity import ReferentialIntegrity +from sdmetrics.column_pairs.statistical.inter_row_msas import InterRowMSAS +from sdmetrics.column_pairs.statistical.statistic_msas import StatisticMSAS __all__ = [ 'CardinalityBoundaryAdherence', @@ -18,4 +20,6 @@ 'CorrelationSimilarity', 'DiscreteKLDivergence', 'ReferentialIntegrity', + 'InterRowMSAS', + 'StatisticMSAS', ] diff --git a/sdmetrics/timeseries/inter_row_msas.py b/sdmetrics/column_pairs/statistical/inter_row_msas.py similarity index 100% rename from sdmetrics/timeseries/inter_row_msas.py rename to sdmetrics/column_pairs/statistical/inter_row_msas.py diff --git a/sdmetrics/timeseries/statistic_msas.py b/sdmetrics/column_pairs/statistical/statistic_msas.py similarity index 100% rename from sdmetrics/timeseries/statistic_msas.py rename to sdmetrics/column_pairs/statistical/statistic_msas.py diff --git a/sdmetrics/single_column/__init__.py b/sdmetrics/single_column/__init__.py index 563ea574..fdd9d9f1 100644 --- a/sdmetrics/single_column/__init__.py +++ b/sdmetrics/single_column/__init__.py @@ -12,6 +12,7 @@ from sdmetrics.single_column.statistical.range_coverage import RangeCoverage from sdmetrics.single_column.statistical.statistic_similarity import StatisticSimilarity from sdmetrics.single_column.statistical.tv_complement import TVComplement +from sdmetrics.single_column.statistical.sequence_length_similarity import SequenceLengthSimilarity __all__ = [ 'base', @@ -26,4 +27,5 @@ 'RangeCoverage', 'StatisticSimilarity', 'TVComplement', + 'SequenceLengthSimilarity', ] diff --git a/sdmetrics/single_column/statistical/__init__.py b/sdmetrics/single_column/statistical/__init__.py index 252cd6ac..228a456b 100644 --- a/sdmetrics/single_column/statistical/__init__.py +++ b/sdmetrics/single_column/statistical/__init__.py @@ -10,6 +10,7 @@ from sdmetrics.single_column.statistical.range_coverage import RangeCoverage from sdmetrics.single_column.statistical.statistic_similarity import StatisticSimilarity from sdmetrics.single_column.statistical.tv_complement import TVComplement +from sdmetrics.single_column.statistical.sequence_length_similarity import SequenceLengthSimilarity __all__ = [ 'BoundaryAdherence', @@ -22,4 +23,5 @@ 'RangeCoverage', 'StatisticSimilarity', 'TVComplement', + 'SequenceLengthSimilarity', ] diff --git a/sdmetrics/timeseries/sequence_length_similarity.py b/sdmetrics/single_column/statistical/sequence_length_similarity.py similarity index 100% rename from sdmetrics/timeseries/sequence_length_similarity.py rename to sdmetrics/single_column/statistical/sequence_length_similarity.py diff --git a/sdmetrics/timeseries/__init__.py b/sdmetrics/timeseries/__init__.py index 06f5e4c8..6a09b529 100644 --- a/sdmetrics/timeseries/__init__.py +++ b/sdmetrics/timeseries/__init__.py @@ -5,9 +5,6 @@ from sdmetrics.timeseries.detection import LSTMDetection, TimeSeriesDetectionMetric from sdmetrics.timeseries.efficacy import TimeSeriesEfficacyMetric from sdmetrics.timeseries.efficacy.classification import LSTMClassifierEfficacy -from sdmetrics.timeseries.inter_row_msas import InterRowMSAS -from sdmetrics.timeseries.sequence_length_similarity import SequenceLengthSimilarity -from sdmetrics.timeseries.statistic_msas import StatisticMSAS __all__ = [ 'base', @@ -19,7 +16,4 @@ 'LSTMDetection', 'TimeSeriesEfficacyMetric', 'LSTMClassifierEfficacy', - 'InterRowMSAS', - 'SequenceLengthSimilarity', - 'StatisticMSAS', ] diff --git a/tests/unit/timeseries/test_inter_row_msas.py b/tests/unit/column_pairs/statistical/test_inter_row_msas.py similarity index 99% rename from tests/unit/timeseries/test_inter_row_msas.py rename to tests/unit/column_pairs/statistical/test_inter_row_msas.py index 14101079..9a3552db 100644 --- a/tests/unit/timeseries/test_inter_row_msas.py +++ b/tests/unit/column_pairs/statistical/test_inter_row_msas.py @@ -1,7 +1,7 @@ import pandas as pd import pytest -from sdmetrics.timeseries.inter_row_msas import InterRowMSAS +from sdmetrics.column_pairs import InterRowMSAS class TestInterRowMSAS: diff --git a/tests/unit/timeseries/test_statistic_msas.py b/tests/unit/column_pairs/statistical/test_statistic_msas.py similarity index 98% rename from tests/unit/timeseries/test_statistic_msas.py rename to tests/unit/column_pairs/statistical/test_statistic_msas.py index f44812ec..9e8813eb 100644 --- a/tests/unit/timeseries/test_statistic_msas.py +++ b/tests/unit/column_pairs/statistical/test_statistic_msas.py @@ -3,7 +3,7 @@ import pandas as pd import pytest -from sdmetrics.timeseries import StatisticMSAS +from sdmetrics.column_pairs import StatisticMSAS class TestStatisticMSAS: diff --git a/tests/unit/timeseries/test_sequence_length_similarity.py b/tests/unit/single_column/statistical/test_sequence_length_similarity.py similarity index 93% rename from tests/unit/timeseries/test_sequence_length_similarity.py rename to tests/unit/single_column/statistical/test_sequence_length_similarity.py index 903b3b91..4e27ab98 100644 --- a/tests/unit/timeseries/test_sequence_length_similarity.py +++ b/tests/unit/single_column/statistical/test_sequence_length_similarity.py @@ -1,6 +1,6 @@ import pandas as pd -from sdmetrics.timeseries.sequence_length_similarity import SequenceLengthSimilarity +from sdmetrics.single_column import SequenceLengthSimilarity class TestSequenceLengthSimilarity: From 83871dfefe657090ca69857d445af43a29f0da51 Mon Sep 17 00:00:00 2001 From: Felipe Alex Hofmann Date: Thu, 14 Nov 2024 12:30:49 -0800 Subject: [PATCH 5/5] Fix `SequenceLengthSimilarity` docstring (#659) --- .../single_column/statistical/sequence_length_similarity.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sdmetrics/single_column/statistical/sequence_length_similarity.py b/sdmetrics/single_column/statistical/sequence_length_similarity.py index 174f0874..105f159b 100644 --- a/sdmetrics/single_column/statistical/sequence_length_similarity.py +++ b/sdmetrics/single_column/statistical/sequence_length_similarity.py @@ -41,9 +41,9 @@ def compute(real_data: pd.Series, synthetic_data: pd.Series) -> float: - Return this score Args: - real_data (Union[numpy.ndarray, pandas.DataFrame]): + real_data (pd.Series): The values from the real dataset. - synthetic_data (Union[numpy.ndarray, pandas.DataFrame]): + synthetic_data (pd.Series): The values from the synthetic dataset. Returns: