Skip to content

Commit

Permalink
ENH: add crosstab function and test
Browse files Browse the repository at this point in the history
  • Loading branch information
wesm committed Jan 16, 2012
1 parent 908cae5 commit 4bea867
Show file tree
Hide file tree
Showing 7 changed files with 143 additions and 32 deletions.
1 change: 1 addition & 0 deletions RELEASE.rst
Expand Up @@ -72,6 +72,7 @@ pandas 0.7.0
multiple rows (GH #464)
- Add ``level`` argument to ``DataFrame.xs`` for selecting data from other
MultiIndex levels (GH #371, GH #629)
- New ``crosstab`` function for easily computing frequency tables (GH #170)

**API Changes**

Expand Down
7 changes: 6 additions & 1 deletion pandas/core/common.py
Expand Up @@ -751,6 +751,11 @@ def _asarray_tuplesafe(values, dtype=None):

return result

def _maybe_make_list(obj):
if obj is not None and not isinstance(obj, (tuple, list)):
return [obj]
return obj

try:
    # Python 2 has a separate arbitrary-precision ``long`` type
    _INTEGER_TYPES = (int, long, np.integer)
except NameError:
    # Python 3 merged ``long`` into ``int``, so the name no longer exists
    _INTEGER_TYPES = (int, np.integer)


def is_integer(obj):
    """
    Check whether an object is an integer scalar.

    Parameters
    ----------
    obj : object

    Returns
    -------
    result : bool
        True for built-in integers (including ``long`` where that type
        exists) and NumPy integer scalars, False otherwise

    Notes
    -----
    ``bool`` is a subclass of ``int``, so True and False are reported as
    integers, exactly as with the plain ``isinstance`` check.
    """
    return isinstance(obj, _INTEGER_TYPES)

Expand Down Expand Up @@ -795,7 +800,7 @@ def load(path):
Parameters
----------
path : string
File path
Returns
Expand Down
5 changes: 3 additions & 2 deletions pandas/core/groupby.py
Expand Up @@ -314,9 +314,10 @@ def size(self):
keys, values = zip(*result)

if len(self.groupings) > 1:
index = MultiIndex.from_tuples(keys)
names = [ping.name for ping in self.groupings]
index = MultiIndex.from_tuples(keys, names=names)
else:
index = Index(keys)
index = Index(keys, name=self.groupings[0].name)

return Series(values, index=index)

Expand Down
12 changes: 9 additions & 3 deletions pandas/core/index.py
Expand Up @@ -92,11 +92,17 @@ def dtype(self):
def nlevels(self):
return 1

# for compat with MultiIndex code: expose the single ``name`` through the
# list-valued ``names`` attribute

def _get_names(self):
    """Return the name of this index wrapped in a one-element list."""
    return [self.name]

def _set_names(self, values):
    """
    Set the name of this index from a one-element sequence.

    Raises
    ------
    AssertionError
        If ``values`` does not contain exactly one element
    """
    # Raised explicitly (not via ``assert``) so the check is still
    # enforced when Python runs with -O
    if len(values) != 1:
        raise AssertionError('Length of names must be 1, got %d'
                             % len(values))
    self.name = values[0]

names = property(fset=_set_names, fget=_get_names)

# Read-only property that returns the ``Index`` class itself (the type, not
# an instance).
# NOTE(review): presumably consulted by shared code that needs to build a new
# object of the same kind -- confirm against the callers.
@property
def _constructor(self):
return Index
Expand Down
11 changes: 3 additions & 8 deletions pandas/tools/merge.py
Expand Up @@ -53,9 +53,9 @@ def __init__(self, left, right, how='inner', on=None,
self.how = how
self.axis = axis

self.on = _maybe_make_list(on)
self.left_on = _maybe_make_list(left_on)
self.right_on = _maybe_make_list(right_on)
self.on = com._maybe_make_list(on)
self.left_on = com._maybe_make_list(left_on)
self.right_on = com._maybe_make_list(right_on)

self.drop_keys = False # set this later...kludge

Expand Down Expand Up @@ -333,11 +333,6 @@ def _get_multiindex_indexer(join_keys, index, sort=True):
# NOW! reorder
#right_indexer.take(left_indexer.argsort())

def _maybe_make_list(obj):
if obj is not None and not isinstance(obj, (tuple, list)):
return [obj]
return obj

def _right_outer_join(x, y, max_groups):
    """
    Right outer join expressed through ``lib.left_outer_join``.

    The two inputs are passed to ``lib.left_outer_join`` in swapped order
    (``y`` first, then ``x``), and the pair of indexers it returns is swapped
    back before being returned.

    Returns
    -------
    (left_indexer, right_indexer) : tuple
    """
    # The first returned indexer belongs to ``y`` (the right side here)
    indexer_for_y, indexer_for_x = lib.left_outer_join(y, x, max_groups)
    return indexer_for_x, indexer_for_y
Expand Down
84 changes: 67 additions & 17 deletions pandas/tools/pivot.py
Expand Up @@ -2,6 +2,7 @@

from pandas import Series, DataFrame
from pandas.tools.merge import concat
import pandas.core.common as com
import numpy as np

def pivot_table(data, values=None, rows=None, cols=None, aggfunc=np.mean,
Expand Down Expand Up @@ -165,31 +166,80 @@ def _convert_by(by):
by = list(by)
return by

def crosstab(rows, cols, rownames=None, colnames=None):
    """
    Compute a simple cross-tabulation of two (or more) factors

    Parameters
    ----------
    rows : array-like, Series, or list of arrays/Series
        Values to group by in the rows
    cols : array-like, Series, or list of arrays/Series
        Values to group by in the columns
    rownames : sequence, default None
        If passed, must match number of row arrays passed
    colnames : sequence, default None
        If passed, must match number of column arrays passed

    Returns
    -------
    crosstab : DataFrame
        Integer counts of each row/column combination; combinations that
        never occur are filled with 0

    Notes
    -----
    Any Series passed will have their name attributes used unless row or
    column names for the cross-tabulation are specified

    Examples
    --------
    >>> a
    array([foo, foo, foo, foo, bar, bar,
           bar, bar, foo, foo, foo], dtype=object)
    >>> b
    array([one, one, one, two, one, one,
           one, two, two, two, one], dtype=object)
    >>> c
    array([dull, dull, shiny, dull, dull, shiny,
           shiny, dull, shiny, shiny, shiny], dtype=object)

    >>> crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c'])
    b    one          two
    c    dull  shiny  dull  shiny
    a
    bar  1     2      1     0
    foo  2     2      1     2
    """
    # A single array/Series is treated as a one-element list of factors
    rows = com._maybe_make_list(rows)
    cols = com._maybe_make_list(cols)

    rownames = _get_names(rows, rownames, prefix='row')
    colnames = _get_names(cols, colnames, prefix='col')

    # One DataFrame column per factor, keyed by its resolved name
    data = dict(zip(rownames, rows))
    data.update(zip(colnames, cols))

    df = DataFrame(data)
    table = df.groupby(rownames + colnames).size()

    # Move each column factor out of the index, one level at a time
    for cname in colnames:
        table = table.unstack(cname)

    table.columns.names = colnames
    table.index.names = rownames

    # Unstacking leaves NaN for unseen combinations; report them as 0 counts
    return table.fillna(0).astype(np.int64)

def _get_names(arrs, names, prefix='row'):
if names is None:
names = []
for i, arr in enumerate(arrs):
if isinstance(arr, Series) and arr.name is not None:
names.append(arr.name)
else:
names.append('%s_%d' % (prefix, i))
else:
assert(len(names) == len(arrs))
if not isinstance(names, list):
names = list(names)

return result
return names
55 changes: 54 additions & 1 deletion pandas/tools/tests/test_pivot.py
Expand Up @@ -3,7 +3,7 @@
import numpy as np

from pandas import DataFrame, concat
from pandas.tools.pivot import pivot_table
from pandas.tools.pivot import pivot_table, crosstab
import pandas.util.testing as tm

class TestPivotTable(unittest.TestCase):
Expand Down Expand Up @@ -124,6 +124,59 @@ def _check_output(res, col, rows=['A', 'B'], cols=['C']):
# gmarg = table[valcol]['All', '']
# self.assertEqual(gmarg, self.data[valcol].mean())


class TestCrosstab(unittest.TestCase):
    """Compare ``crosstab`` output with the equivalent groupby pipeline."""

    def setUp(self):
        # Three categorical columns plus three random numeric ones (11 rows),
        # then the frame is appended to itself to double every count
        base = DataFrame({'A' : ['foo', 'foo', 'foo', 'foo',
                                 'bar', 'bar', 'bar', 'bar',
                                 'foo', 'foo', 'foo'],
                          'B' : ['one', 'one', 'one', 'two',
                                 'one', 'one', 'one', 'two',
                                 'two', 'two', 'one'],
                          'C' : ['dull', 'dull', 'shiny', 'dull',
                                 'dull', 'shiny', 'shiny', 'dull',
                                 'shiny', 'shiny', 'shiny'],
                          'D' : np.random.randn(11),
                          'E' : np.random.randn(11),
                          'F' : np.random.randn(11)})

        self.df = base.append(base, ignore_index=True)

    def test_crosstab_single(self):
        """One row factor against one column factor, both named Series."""
        frame = self.df
        result = crosstab(frame['A'], frame['C'])

        counts = frame.groupby(['A', 'C']).size().unstack()
        expected = counts.fillna(0).astype(np.int64)
        tm.assert_frame_equal(result, expected)

    def test_crosstab_multiple(self):
        """Multiple factors on the column side, then on the row side."""
        frame = self.df

        # two column factors
        result = crosstab(frame['A'], [frame['B'], frame['C']])
        sizes = frame.groupby(['A', 'B', 'C']).size()
        wide = sizes.unstack('B').unstack('C')
        expected = wide.fillna(0).astype(np.int64)
        tm.assert_frame_equal(result, expected)

        # two row factors
        result = crosstab([frame['B'], frame['C']], frame['A'])
        sizes = frame.groupby(['B', 'C', 'A']).size()
        expected = sizes.unstack('A').fillna(0).astype(np.int64)
        tm.assert_frame_equal(result, expected)

    def test_crosstab_ndarray(self):
        """Raw ndarrays with explicit names match the named-Series result."""
        a = np.random.randint(0, 5, size=100)
        b = np.random.randint(0, 3, size=100)
        c = np.random.randint(0, 10, size=100)

        frame = DataFrame({'a': a, 'b': b, 'c': c})

        from_arrays = crosstab(a, [b, c], rownames=['a'],
                               colnames=('b', 'c'))
        from_series = crosstab(frame['a'], [frame['b'], frame['c']])
        tm.assert_frame_equal(from_arrays, from_series)

        from_arrays = crosstab([b, c], a, colnames=['a'],
                               rownames=('b', 'c'))
        from_series = crosstab([frame['b'], frame['c']], frame['a'])
        tm.assert_frame_equal(from_arrays, from_series)

if __name__ == '__main__':
import nose
nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'],
Expand Down

0 comments on commit 4bea867

Please sign in to comment.