Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: stats.crosstab: convert output tuple to bunch #16958

Merged
merged 4 commits into from Sep 5, 2022
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
61 changes: 35 additions & 26 deletions scipy/stats/_crosstab.py
@@ -1,7 +1,12 @@
import numpy as np
from scipy.sparse import coo_matrix
from scipy._lib._bunch import _make_tuple_bunch


# Result type returned by ``crosstab``; built with ``_make_tuple_bunch`` so
# the two fields are available by name (``elements``, ``count``).
CrosstabResult = _make_tuple_bunch("CrosstabResult", ["elements", "count"])

def crosstab(*args, levels=None, sparse=False):
"""
Return table of counts for each possible unique combination in ``*args``.
Expand Down Expand Up @@ -35,15 +40,18 @@ def crosstab(*args, levels=None, sparse=False):

Returns
-------
elements : tuple of numpy.ndarrays.
Tuple of length ``len(args)`` containing the arrays of elements that
are counted in `count`. These can be interpreted as the labels of
the corresponding dimensions of `count`.
If `levels` was given, then if ``levels[i]`` is not None,
``elements[i]`` will hold the values given in ``levels[i]``.
count : numpy.ndarray or scipy.sparse.coo_matrix
Counts of the unique elements in ``zip(*args)``, stored in an array.
Also known as a *contingency table* when ``len(args) > 1``.
res : CrosstabResult
An object containing the following attributes:

elements : tuple of numpy.ndarrays.
Tuple of length ``len(args)`` containing the arrays of elements
that are counted in `count`. These can be interpreted as the
labels of the corresponding dimensions of `count`. If `levels` was
given, then if ``levels[i]`` is not None, ``elements[i]`` will
hold the values given in ``levels[i]``.
count : numpy.ndarray or scipy.sparse.coo_matrix
Counts of the unique elements in ``zip(*args)``, stored in an
array. Also known as a *contingency table* when ``len(args) > 1``.

See Also
--------
Expand All @@ -66,12 +74,13 @@ def crosstab(*args, levels=None, sparse=False):

>>> a = ['A', 'B', 'A', 'A', 'B', 'B', 'A', 'A', 'B', 'B']
>>> x = ['X', 'X', 'X', 'Y', 'Z', 'Z', 'Y', 'Y', 'Z', 'Z']
>>> (avals, xvals), count = crosstab(a, x)
>>> res = crosstab(a, x)
>>> avals, xvals = res.elements
>>> avals
array(['A', 'B'], dtype='<U1')
>>> xvals
array(['X', 'Y', 'Z'], dtype='<U1')
>>> count
>>> res.count
array([[2, 3, 0],
[1, 0, 4]])

Expand All @@ -80,15 +89,15 @@ def crosstab(*args, levels=None, sparse=False):
Higher dimensional contingency tables can be created.

>>> p = [0, 0, 0, 0, 1, 1, 1, 0, 0, 1]
>>> (avals, xvals, pvals), count = crosstab(a, x, p)
>>> count
>>> res = crosstab(a, x, p)
>>> res.count
array([[[2, 0],
[2, 1],
[0, 0]],
[[1, 0],
[0, 0],
[1, 3]]])
>>> count.shape
>>> res.count.shape
(2, 3, 2)

The values to be counted can be set by using the `levels` argument.
Expand All @@ -102,8 +111,8 @@ def crosstab(*args, levels=None, sparse=False):
>>> q1 = [2, 3, 3, 2, 4, 4, 2, 3, 4, 4, 4, 3, 3, 3, 4] # 1 does not occur.
>>> q2 = [4, 4, 2, 2, 2, 4, 1, 1, 2, 2, 4, 2, 2, 2, 4] # 3 does not occur.
>>> options = [1, 2, 3, 4]
>>> vals, count = crosstab(q1, q2, levels=(options, options))
>>> count
>>> res = crosstab(q1, q2, levels=(options, options))
>>> res.count
array([[0, 0, 0, 0],
[1, 1, 0, 1],
[1, 4, 0, 1],
Expand All @@ -112,32 +121,32 @@ def crosstab(*args, levels=None, sparse=False):
If `levels` is given, but an element of `levels` is None, the unique values
of the corresponding argument are used. For example,

>>> vals, count = crosstab(q1, q2, levels=(None, options))
>>> vals
>>> res = crosstab(q1, q2, levels=(None, options))
>>> res.elements
[array([2, 3, 4]), [1, 2, 3, 4]]
>>> count
>>> res.count
array([[1, 1, 0, 1],
[1, 4, 0, 1],
[0, 3, 0, 3]])

If we want to ignore the pairs where 4 occurs in ``q2``, we can
give just the values [1, 2] to `levels`, and the 4 will be ignored:

>>> vals, count = crosstab(q1, q2, levels=(None, [1, 2]))
>>> vals
>>> res = crosstab(q1, q2, levels=(None, [1, 2]))
>>> res.elements
[array([2, 3, 4]), [1, 2]]
>>> count
>>> res.count
array([[1, 1],
[1, 4],
[0, 3]])

Finally, let's repeat the first example, but return a sparse matrix:

>>> (avals, xvals), count = crosstab(a, x, sparse=True)
>>> count
>>> res = crosstab(a, x, sparse=True)
>>> res.count
<2x3 sparse matrix of type '<class 'numpy.int64'>'
with 4 stored elements in COOrdinate format>
>>> count.A
>>> res.count.A
array([[2, 3, 0],
[1, 0, 4]])

Expand Down Expand Up @@ -191,4 +200,4 @@ def crosstab(*args, levels=None, sparse=False):
count = np.zeros(shape, dtype=int)
np.add.at(count, indices, 1)

return actual_levels, count
return CrosstabResult(actual_levels, count)
7 changes: 6 additions & 1 deletion scipy/stats/tests/test_crosstab.py
@@ -1,6 +1,6 @@
import pytest
import numpy as np
from numpy.testing import assert_array_equal
from numpy.testing import assert_array_equal, assert_equal
from scipy.stats.contingency import crosstab


Expand Down Expand Up @@ -108,3 +108,8 @@ def test_validation_sparse_only_two_args():
def test_validation_len_levels_matches_args():
with pytest.raises(ValueError, match='number of input sequences'):
crosstab([0, 1, 1], [8, 8, 9], levels=([0, 1, 2, 3],))


def test_result():
    """Check the ``crosstab`` result as a tuple and via its attributes."""
    res = crosstab([0, 1], [1, 2])

    # The result must compare equal to the ``(elements, count)`` pair so
    # code that treats the return value as a plain tuple keeps working.
    assert_equal((res.elements, res.count), res)

    # Tuple unpacking must agree with attribute access.
    elements, count = res
    assert_equal(elements, res.elements)
    assert_equal(count, res.count)

    # Pin the actual values: each input has two distinct elements and only
    # the pairs (0, 1) and (1, 2) occur, once each.
    assert_equal(elements[0], [0, 1])
    assert_equal(elements[1], [1, 2])
    assert_equal(count, [[1, 0], [0, 1]])