Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions sdmetrics/column_pairs/statistical/inter_row_msas.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,12 @@
import numpy as np
import pandas as pd

from sdmetrics.column_pairs.base import ColumnPairsMetric
from sdmetrics.goal import Goal
from sdmetrics.single_column.statistical.kscomplement import KSComplement


class InterRowMSAS:
class InterRowMSAS(ColumnPairsMetric):
"""Inter-Row Multi-Sequence Aggregate Similarity (MSAS) metric.

Attributes:
Expand Down Expand Up @@ -76,15 +77,17 @@
num_invalid_groups = len(group_sizes[group_sizes <= n_rows_diff])
if num_invalid_groups > 0:
warnings.warn(
f"n_rows_diff '{n_rows_diff}' is greater than the "
f"n_rows_diff '{n_rows_diff}' is greater or equal to the "
f'size of {num_invalid_groups} sequence keys in {data_name}.'
)

def diff_func(group):
if len(group) <= n_rows_diff:
return np.nan
group = group.to_numpy()
return np.mean(group[n_rows_diff:] - group[:-n_rows_diff])
with warnings.catch_warnings():
warnings.filterwarnings('ignore', message='Mean of empty slice')
return np.nanmean(group[n_rows_diff:] - group[:-n_rows_diff])

Check warning on line 90 in sdmetrics/column_pairs/statistical/inter_row_msas.py

View check run for this annotation

Codecov / codecov/patch

sdmetrics/column_pairs/statistical/inter_row_msas.py#L88-L90

Added lines #L88 - L90 were not covered by tests

with warnings.catch_warnings():
warnings.filterwarnings('ignore', message='invalid value encountered in.*')
Expand Down
3 changes: 2 additions & 1 deletion sdmetrics/column_pairs/statistical/statistic_msas.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@

import pandas as pd

from sdmetrics.column_pairs.base import ColumnPairsMetric
from sdmetrics.goal import Goal
from sdmetrics.single_column.statistical.kscomplement import KSComplement


class StatisticMSAS:
class StatisticMSAS(ColumnPairsMetric):
"""Statistic Multi-Sequence Aggregate Similarity (MSAS) metric.

Attributes:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,11 @@
import pandas as pd

from sdmetrics.goal import Goal
from sdmetrics.single_column.base import SingleColumnMetric
from sdmetrics.single_column.statistical.kscomplement import KSComplement


class SequenceLengthSimilarity:
class SequenceLengthSimilarity(SingleColumnMetric):
"""Sequence Length Similarity metric.

Attributes:
Expand Down
42 changes: 40 additions & 2 deletions tests/unit/column_pairs/statistical/test_inter_row_msas.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,31 @@
from datetime import datetime

import numpy as np
import pandas as pd
import pytest

from sdmetrics.column_pairs import InterRowMSAS


class TestInterRowMSAS:
def test_compute_breakdown(self):
    """Check that `compute_breakdown` returns the score wrapped in a dict."""
    # Setup: each side is a (sequence keys, values) pair of Series.
    real_data = (
        pd.Series(['id1', 'id1', 'id1', 'id2', 'id2', 'id2']),
        pd.Series([1, 2, 3, 4, 5, 6]),
    )
    synthetic_data = (
        pd.Series(['id3', 'id3', 'id3', 'id4', 'id4', 'id4']),
        pd.Series([1, 10, 3, 7, 5, 1]),
    )

    # Run: called on an instance, unlike the `compute` tests in this class.
    breakdown = InterRowMSAS().compute_breakdown(
        real_data=real_data,
        synthetic_data=synthetic_data,
    )

    # Assert
    expected_breakdown = {'score': 0.5}
    assert breakdown == expected_breakdown

def test_compute(self):
"""Test it runs."""
# Setup
Expand All @@ -23,6 +42,22 @@ def test_compute(self):
# Assert
assert score == 0.5

def test_compute_nans(self):
    """Check that `compute` yields the expected score when values contain NaNs."""
    # Setup: NaNs appear in both the real and the synthetic values.
    real_data = (
        pd.Series(['id1', 'id1', 'id1', 'id2', 'id2', 'id2']),
        pd.Series([1, 2, np.nan, 4, 5, 8]),
    )
    synthetic_data = (
        pd.Series(['id3', 'id3', 'id3', 'id4', 'id4', 'id4']),
        pd.Series([1, 10, 4, 7, np.nan, np.nan]),
    )

    # Run
    result = InterRowMSAS.compute(real_data=real_data, synthetic_data=synthetic_data)

    # Assert
    expected_score = 0.5
    assert result == expected_score

def test_compute_identical_sequences(self):
"""Test it returns 1 when real and synthetic data are identical."""
# Setup
Expand Down Expand Up @@ -94,9 +129,10 @@ def test_compute_with_log_warning(self):
'There are 3 non-positive values in your data, which cannot be used with log. '
"Consider changing 'apply_log' to False for a better result."
)

assert len(warning_info) == 1
assert str(warning_info[0].message) == expected_message
assert score == 0
assert score == 0.5

def test_compute_with_log_datetime(self):
"""Test it crashes for logs of datetime values."""
Expand Down Expand Up @@ -211,7 +247,9 @@ def test_compute_warning(self):
synthetic_values = pd.Series([1, 10, 3, 7, 5, 1])

# Run and Assert
warn_msg = "n_rows_diff '10' is greater than the size of 2 sequence keys in real_data."
warn_msg = (
"n_rows_diff '10' is greater or equal to the size of 2 sequence keys in real_data."
)
with pytest.warns(UserWarning, match=warn_msg):
score = InterRowMSAS.compute(
real_data=(real_keys, real_values),
Expand Down
18 changes: 18 additions & 0 deletions tests/unit/column_pairs/statistical/test_statistic_msas.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,24 @@


class TestStatisticMSAS:
def test_compute_breakdown(self):
    """Check that `compute_breakdown` returns the score wrapped in a dict."""
    # Setup: each side is a (sequence keys, values) pair of Series.
    real_data = (
        pd.Series(['id1', 'id1', 'id1', 'id2', 'id2', 'id2']),
        pd.Series([1, 2, 3, 4, 5, 6]),
    )
    synthetic_data = (
        pd.Series(['id3', 'id3', 'id3', 'id4', 'id4', 'id4']),
        pd.Series([1, 10, 3, 7, 5, 1]),
    )

    # Run: the breakdown is requested from an instance of the metric.
    breakdown = StatisticMSAS().compute_breakdown(
        real_data=real_data,
        synthetic_data=synthetic_data,
    )

    # Assert
    expected_breakdown = {'score': 0.5}
    assert breakdown == expected_breakdown

def test_compute_identical_sequences(self, recwarn):
"""Test it returns 1 when real and synthetic data are identical."""
# Setup
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,20 @@


class TestSequenceLengthSimilarity:
def test_compute_breakdown(self):
    """Check that `compute_breakdown` returns the score wrapped in a dict."""
    # Setup: one Series per side, passed positionally below.
    real_column = pd.Series([1, 1, 2, 2, 2])
    synthetic_column = pd.Series([3, 4, 5, 6, 6])

    # Run
    breakdown = SequenceLengthSimilarity().compute_breakdown(real_column, synthetic_column)

    # Assert
    expected_breakdown = {'score': 0.25}
    assert breakdown == expected_breakdown

def test_compute(self):
"""Test it runs."""
# Setup
Expand Down
Loading