Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions sdmetrics/timeseries/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from sdmetrics.timeseries.efficacy import TimeSeriesEfficacyMetric
from sdmetrics.timeseries.efficacy.classification import LSTMClassifierEfficacy
from sdmetrics.timeseries.inter_row_msas import InterRowMSAS
from sdmetrics.timeseries.sequence_length_similarity import SequenceLengthSimilarity

__all__ = [
'base',
Expand All @@ -18,4 +19,5 @@
'TimeSeriesEfficacyMetric',
'LSTMClassifierEfficacy',
'InterRowMSAS',
'SequenceLengthSimilarity',
]
53 changes: 53 additions & 0 deletions sdmetrics/timeseries/sequence_length_similarity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
"""SequenceLengthSimilarity module."""

import pandas as pd

from sdmetrics.goal import Goal
from sdmetrics.single_column.statistical.kscomplement import KSComplement


class SequenceLengthSimilarity:
    """Metric comparing the sequence-length distributions of real and synthetic data.

    Attributes:
        name (str):
            Human-readable name used when this metric is reported.
        goal (sdmetrics.goal.Goal):
            Optimization direction of the score; set to ``Goal.MAXIMIZE``.
        min_value (float):
            Lowest score declared for this metric.
        max_value (float):
            Highest score declared for this metric.
    """

    name = 'Sequence Length Similarity'
    goal = Goal.MAXIMIZE
    min_value = 0.0
    max_value = 1.0

    @staticmethod
    def compute(real_data: pd.Series, synthetic_data: pd.Series) -> float:
        """Score how closely the synthetic sequence lengths match the real ones.

        A sequence's length is the number of times its sequence key occurs. For
        instance, a key ``id_09231`` that shows up 150 times describes a sequence
        of length 150.

        Steps:
            - Count the occurrences of every key in the real data.
            - Count the occurrences of every key in the synthetic data.
            - Hand both collections of counts to ``KSComplement.compute`` and
              return its result unchanged.

        Args:
            real_data (pandas.Series):
                Sequence key values from the real dataset.
            synthetic_data (pandas.Series):
                Sequence key values from the synthetic dataset.

        Returns:
            float:
                The score produced by ``KSComplement.compute`` for the two
                collections of sequence lengths.
        """
        # ``value_counts`` yields one entry per distinct key: that key's sequence length.
        real_lengths = real_data.value_counts()
        synthetic_lengths = synthetic_data.value_counts()
        return KSComplement.compute(real_lengths, synthetic_lengths)
41 changes: 41 additions & 0 deletions tests/unit/timeseries/test_sequence_length_similarity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import pandas as pd

from sdmetrics.timeseries.sequence_length_similarity import SequenceLengthSimilarity


class TestSequenceLengthSimilarity:
    def test_compute(self):
        """Check the score for a small pair of differing length distributions."""
        # Setup
        real_keys = pd.Series(['id1', 'id2', 'id2', 'id3'])
        synthetic_keys = pd.Series(['id4', 'id5', 'id6'])

        # Run
        result = SequenceLengthSimilarity.compute(real_keys, synthetic_keys)

        # Assert
        assert result == 0.6666666666666667

    def test_compute_one(self):
        """Check the score is 1 when both length distributions are identical."""
        # Setup
        real_keys = pd.Series(['id1', 'id1', 'id2', 'id2', 'id2', 'id3'])
        synthetic_keys = pd.Series(['id4', 'id4', 'id5', 'id6', 'id6', 'id6'])

        # Run
        result = SequenceLengthSimilarity.compute(real_keys, synthetic_keys)

        # Assert
        assert result == 1

    def test_compute_low_score(self):
        """Check the score is 0 when the length distributions do not overlap."""
        # Setup
        # Real data: 100 keys seen once each; synthetic data: one key seen 100 times.
        distinct_ids = [f'id{i}' for i in range(100)]
        repeated_ids = ['id100'] * 100
        real_keys = pd.Series(distinct_ids)
        synthetic_keys = pd.Series(repeated_ids)

        # Run
        result = SequenceLengthSimilarity.compute(real_keys, synthetic_keys)

        # Assert
        assert result == 0
Loading