89 changes: 67 additions & 22 deletions sdmetrics/column_pairs/statistical/contingency_similarity.py
@@ -27,7 +27,9 @@ class ContingencySimilarity(ColumnPairsMetric):
     max_value = 1.0

     @staticmethod
-    def _validate_inputs(real_data, synthetic_data, continuous_column_names, num_discrete_bins):
+    def _validate_inputs(
+        real_data, synthetic_data, continuous_column_names, num_discrete_bins, num_rows_subsample
+    ):
         for data in [real_data, synthetic_data]:
             if not isinstance(data, pd.DataFrame) or len(data.columns) != 2:
                 raise ValueError('The data must be a pandas DataFrame with two columns.')
@@ -47,31 +49,36 @@ def _validate_inputs(real_data, synthetic_data, continuous_column_names, num_dis
         if not isinstance(num_discrete_bins, int) or num_discrete_bins <= 0:
             raise ValueError('`num_discrete_bins` must be an integer greater than zero.')

+        if num_rows_subsample is not None:
+            if not isinstance(num_rows_subsample, int) or num_rows_subsample <= 0:
+                raise ValueError('`num_rows_subsample` must be an integer greater than zero.')
+
     @classmethod
-    def compute(cls, real_data, synthetic_data, continuous_column_names=None, num_discrete_bins=10):
-        """Compare the contingency similarity of two discrete columns.
-
-        Args:
-            real_data (pd.DataFrame):
-                The values from the real dataset.
-            synthetic_data (pd.DataFrame):
-                The values from the synthetic dataset.
-            continuous_column_names (list[str], optional):
-                The list of columns to discretize before running the metric. The column names in
-                this list should match the column names in the real and synthetic data. Defaults
-                to ``None``.
-            num_discrete_bins (int, optional):
-                The number of bins to create for the continuous columns. Defaults to 10.
-
-        Returns:
-            float:
-                The contingency similarity of the two columns.
-        """
-        cls._validate_inputs(real_data, synthetic_data, continuous_column_names, num_discrete_bins)
-        columns = real_data.columns[:2]
+    def compute_breakdown(
+        cls,
+        real_data,
+        synthetic_data,
+        continuous_column_names=None,
+        num_discrete_bins=10,
+        num_rows_subsample=None,
+    ):
+        """Compute the breakdown of this metric."""
+        cls._validate_inputs(
+            real_data,
+            synthetic_data,
+            continuous_column_names,
+            num_discrete_bins,
+            num_rows_subsample,
+        )
+        columns = real_data.columns[:2]
+
+        if num_rows_subsample is not None:
+            real_data = real_data.sample(min(num_rows_subsample, len(real_data)))
+            synthetic_data = synthetic_data.sample(min(num_rows_subsample, len(synthetic_data)))
+
         real = real_data[columns]
         synthetic = synthetic_data[columns]
-        if continuous_column_names is not None:
+        if continuous_column_names:
             for column in continuous_column_names:
                 real[column], synthetic[column] = discretize_column(
                     real[column], synthetic[column], num_discrete_bins=num_discrete_bins
@@ -86,7 +93,45 @@ def compute(cls, real_data, synthetic_data, continuous_column_names=None, num_di
         contingency_real = contingency_real.reindex(combined_index, fill_value=0)
         diff = abs(contingency_real - contingency_synthetic).fillna(0)
         variation = diff / 2
-        return 1 - variation.sum()
+        return {'score': 1 - variation.sum()}
+
+    @classmethod
+    def compute(
+        cls,
+        real_data,
+        synthetic_data,
+        continuous_column_names=None,
+        num_discrete_bins=10,
+        num_rows_subsample=None,
+    ):
+        """Compare the contingency similarity of two discrete columns.
+
+        Args:
+            real_data (pd.DataFrame):
+                The values from the real dataset.
+            synthetic_data (pd.DataFrame):
+                The values from the synthetic dataset.
+            continuous_column_names (list[str], optional):
+                The list of columns to discretize before running the metric. The column names in
+                this list should match the column names in the real and synthetic data. Defaults
+                to ``None``.
+            num_discrete_bins (int, optional):
+                The number of bins to create for the continuous columns. Defaults to 10.
+            num_rows_subsample (int, optional):
+                The number of rows to subsample from the real and synthetic data before computing
+                the metric. Defaults to ``None``.
+
+        Returns:
+            float:
+                The contingency similarity of the two columns.
+        """
+        return cls.compute_breakdown(
+            real_data,
+            synthetic_data,
+            continuous_column_names,
+            num_discrete_bins,
+            num_rows_subsample,
+        )['score']

     @classmethod
     def normalize(cls, raw_score):
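The new `compute` is now a thin wrapper that returns the `'score'` entry of `compute_breakdown`. A minimal usage sketch of the `num_rows_subsample` parameter (assuming the `single_table` demo and the two categorical columns used by the tests below; exact scores vary with the random subsample):

import numpy as np

from sdmetrics.column_pairs.statistical import ContingencySimilarity
from sdmetrics.demos import load_demo

# Two categorical columns from the demo data, as in the integration test.
real_data, synthetic_data, _ = load_demo('single_table')
columns = ['degree_type', 'high_spec']

# Exact score over all rows: 1 minus the total variation distance
# between the two contingency tables, so it lies in [0, 1].
full_score = ContingencySimilarity.compute(real_data[columns], synthetic_data[columns])

# Approximate score computed on a random subsample of 100 rows per table.
sampled_score = ContingencySimilarity.compute(
    real_data[columns],
    synthetic_data[columns],
    num_rows_subsample=100,
)

# The subsampled estimate is expected to land near the exact score
# (the integration test below checks this with atol=0.1).
print(full_score, sampled_score)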
@@ -0,0 +1,40 @@
+import numpy as np
+
+from sdmetrics.column_pairs.statistical import ContingencySimilarity
+from sdmetrics.demos import load_demo
+
+
+def test_with_num_rows_subsample():
+    """Test the metric with `num_rows_subsample`.
+
+    Here the `real_data` and `synthetic_data` have 218 rows.
+    """
+    # Setup
+    real_data, synthetic_data, _ = load_demo('single_table')
+    real_data = real_data[['degree_type', 'high_spec']]
+    synthetic_data = synthetic_data[['degree_type', 'high_spec']]
+    num_rows_subsample = 100
+
+    # Run
+    result_1 = ContingencySimilarity.compute(
+        real_data=real_data,
+        synthetic_data=synthetic_data,
+        num_rows_subsample=num_rows_subsample,
+    )
+    result_2 = ContingencySimilarity.compute(
+        real_data=real_data,
+        synthetic_data=synthetic_data,
+        num_rows_subsample=num_rows_subsample,
+    )
+    result_entire_data = ContingencySimilarity.compute(
+        real_data=real_data,
+        synthetic_data=synthetic_data,
+        num_rows_subsample=None,
+    )
+
+    # Assert
+    assert result_1 != result_2
+    assert result_1 != result_entire_data
+    assert result_2 != result_entire_data
+    assert np.isclose(result_1, result_entire_data, atol=0.1)
+    assert np.isclose(result_2, result_entire_data, atol=0.1)
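Because no `random_state` is passed to `DataFrame.sample`, the subsample is drawn from NumPy's global random state, which is why `result_1` and `result_2` above differ. A minimal sketch of pinning that state for reproducible scores, under the same setup as this test (the seeding is the caller's responsibility, not part of the metric's API):

np.random.seed(0)  # pin the global RNG so the subsample is repeatable
result_a = ContingencySimilarity.compute(
    real_data=real_data,
    synthetic_data=synthetic_data,
    num_rows_subsample=100,
)

np.random.seed(0)  # same seed, same subsample, same score
result_b = ContingencySimilarity.compute(
    real_data=real_data,
    synthetic_data=synthetic_data,
    num_rows_subsample=100,
)

assert result_a == result_b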
@@ -1,6 +1,7 @@
 import re
 from unittest.mock import patch

+import numpy as np
 import pandas as pd
 import pytest

@@ -17,15 +18,24 @@ def test__validate_inputs(self):
         synthetic_data = pd.DataFrame({'col1': range(5), 'col2': range(5)})
         bad_continous_columns = ['col1', 'missing_col']
         bad_num_discrete_bins = -1
+        bad_num_rows_subsample = -1

         # Run and Assert
         ContingencySimilarity._validate_inputs(
             real_data=real_data,
             synthetic_data=synthetic_data,
             continuous_column_names=None,
             num_discrete_bins=10,
+            num_rows_subsample=3,
         )
         expected_bad_data = re.escape('The data must be a pandas DataFrame with two columns.')
         with pytest.raises(ValueError, match=expected_bad_data):
             ContingencySimilarity._validate_inputs(
                 real_data=bad_data,
                 synthetic_data=bad_data,
                 continuous_column_names=None,
                 num_discrete_bins=10,
+                num_rows_subsample=3,
             )

         expected_mismatch_columns_error = re.escape(
@@ -37,6 +47,7 @@ def test__validate_inputs(self):
                 synthetic_data=bad_synthetic_data,
                 continuous_column_names=None,
                 num_discrete_bins=10,
+                num_rows_subsample=3,
             )

         expected_bad_continous_column_error = re.escape(
@@ -48,6 +59,7 @@ def test__validate_inputs(self):
                 synthetic_data=synthetic_data,
                 continuous_column_names=bad_continous_columns,
                 num_discrete_bins=10,
+                num_rows_subsample=3,
             )

         expected_bad_num_discrete_bins_error = re.escape(
@@ -59,9 +71,41 @@ def test__validate_inputs(self):
                 synthetic_data=synthetic_data,
                 continuous_column_names=['col1'],
                 num_discrete_bins=bad_num_discrete_bins,
+                num_rows_subsample=3,
             )
+        expected_bad_num_rows_subsample_error = re.escape(
+            '`num_rows_subsample` must be an integer greater than zero.'
+        )
+        with pytest.raises(ValueError, match=expected_bad_num_rows_subsample_error):
+            ContingencySimilarity._validate_inputs(
+                real_data=real_data,
+                synthetic_data=synthetic_data,
+                continuous_column_names=['col1'],
+                num_discrete_bins=10,
+                num_rows_subsample=bad_num_rows_subsample,
+            )

-    def test_compute(self):
+    @patch(
+        'sdmetrics.column_pairs.statistical.contingency_similarity.ContingencySimilarity.compute_breakdown'
+    )
+    def test_compute_mock(self, compute_breakdown_mock):
+        """Test that the ``compute`` method calls the ``compute_breakdown`` method."""
+        # Setup
+        real_data = pd.DataFrame({'col1': [1.0, 2.4, 2.6, 0.8], 'col2': [1, 2, 3, 4]})
+        synthetic_data = pd.DataFrame({'col1': [1.0, 1.8, 2.6, 1.0], 'col2': [2, 3, 7, -10]})
+        compute_breakdown_mock.return_value = {'score': 0.25}
+
+        # Run
+        score = ContingencySimilarity.compute(real_data, synthetic_data)
+
+        # Assert
+        compute_breakdown_mock.assert_called_once_with(real_data, synthetic_data, None, 10, None)
+        assert score == 0.25
+
+    @patch(
+        'sdmetrics.column_pairs.statistical.contingency_similarity.ContingencySimilarity._validate_inputs'
+    )
+    def test_compute_breakdown(self, validate_inputs_mock):
         """Test the ``compute`` method.

         Expect that the total variation distance of the two contingency matrices
@@ -81,9 +125,47 @@ def test_compute(self):

         # Run
         metric = ContingencySimilarity()
-        result = metric.compute(real_data, synthetic_data)
+        result = metric.compute_breakdown(real_data, synthetic_data)

         # Assert
-        assert result == expected_score
+        validate_inputs_mock.assert_called_once_with(
+            real_data,
+            synthetic_data,
+            None,
+            10,
+            None,
+        )
+        assert result == {'score': expected_score}
+
+    @patch('sdmetrics.column_pairs.statistical.contingency_similarity.discretize_column')
+    def test_compute_with_num_rows_subsample(self, discretize_column_mock):
+        """Test the ``compute`` method with ``num_rows_subsample``."""
+        # Setup
+        np.random.seed(0)
+        real_data = pd.DataFrame({'col1': [1.0, 2.4, 2.6, 0.8], 'col2': [1, 2, 3, 4]})
+        synthetic_data = pd.DataFrame({'col1': [1.0, 1.8], 'col2': [2, 3]})
+        discretize_column_mock.return_value = (
+            pd.DataFrame({'col2': [1, 2, 3]}),
+            pd.DataFrame({'col2': [2, 3]}),
+        )
+        expected_score = 0.0
+
+        # Run
+        metric = ContingencySimilarity()
+        result = metric.compute(
+            real_data,
+            synthetic_data,
+            continuous_column_names=['col2'],
+            num_discrete_bins=4,
+            num_rows_subsample=3,
+        )
+
+        # Assert
+        arg_mock = discretize_column_mock.call_args
+        expected_real = pd.Series([3, 4, 2], name='col2', index=[2, 3, 1])
+        expected_synthetic = pd.Series([2, 3], name='col2', index=[0, 1])
+        pd.testing.assert_series_equal(arg_mock[0][0], expected_real)
+        pd.testing.assert_series_equal(arg_mock[0][1], expected_synthetic)
+        assert result == expected_score

     def test_compute_with_discretization(self):
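For reference, a hand-worked sketch of the score these tests assert, using invented data; the metric itself goes through its own discretization and contingency-table code, so this is only illustrative:

import pandas as pd

# Invented column pairs for illustration.
real = pd.DataFrame({'col1': ['A', 'A', 'B', 'B'], 'col2': ['x', 'x', 'y', 'y']})
synthetic = pd.DataFrame({'col1': ['A', 'B', 'B', 'B'], 'col2': ['x', 'x', 'y', 'y']})

# Joint frequency tables of the column pair, normalized to sum to 1.
freq_real = real.groupby(['col1', 'col2']).size() / len(real)
freq_synthetic = synthetic.groupby(['col1', 'col2']).size() / len(synthetic)

# Score = 1 - total variation distance between the two tables.
# Here |0.50-0.25| + |0.00-0.25| + |0.50-0.50| = 0.5, so TVD = 0.25 and score = 0.75.
tvd = (freq_real.subtract(freq_synthetic, fill_value=0).abs() / 2).sum()
print(1 - tvd)  # 0.75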