ENH: add crosstab function and test

pandas-dev · Jan 16, 2012 · 4bea867 · 4bea867
1 parent 908cae5
commit 4bea867
Show file tree

Hide file tree

Showing 7 changed files with 143 additions and 32 deletions.
diff --git a/RELEASE.rst b/RELEASE.rst
@@ -72,6 +72,7 @@ pandas 0.7.0
     multiple rows (GH #464)
   - Add ``level`` argument to ``DataFrame.xs`` for selecting data from other
     MultiIndex levels (GH #371, GH #629)
+  - New ``crosstab`` function for easily computing frequency tables (GH #170)
 
 **API Changes**
 

diff --git a/pandas/core/common.py b/pandas/core/common.py
@@ -751,6 +751,11 @@ def _asarray_tuplesafe(values, dtype=None):
 
     return result
 
+def _maybe_make_list(obj):
+    if obj is not None and not isinstance(obj, (tuple, list)):
+        return [obj]
+    return obj
+
 def is_integer(obj):
     return isinstance(obj, (int, long, np.integer))
 
@@ -795,7 +800,7 @@ def load(path):
 
     Parameters
     ----------
-    path : string
+    path : string
         File path
 
     Returns

diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -314,9 +314,10 @@ def size(self):
         keys, values = zip(*result)
 
         if len(self.groupings) > 1:
-            index = MultiIndex.from_tuples(keys)
+            names = [ping.name for ping in self.groupings]
+            index = MultiIndex.from_tuples(keys, names=names)
         else:
-            index = Index(keys)
+            index = Index(keys, name=self.groupings[0].name)
 
         return Series(values, index=index)
 

diff --git a/pandas/core/index.py b/pandas/core/index.py
@@ -92,11 +92,17 @@ def dtype(self):
     def nlevels(self):
         return 1
 
-    @property
-    def names(self):
-        # for compat with multindex code
+    # for compat with MultiIndex code
+
+    def _get_names(self):
         return [self.name]
 
+    def _set_names(self, values):
+        assert(len(values) == 1)
+        self.name = values[0]
+
+    names = property(fset=_set_names, fget=_get_names)
+
     @property
     def _constructor(self):
         return Index

diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py
@@ -53,9 +53,9 @@ def __init__(self, left, right, how='inner', on=None,
         self.how = how
         self.axis = axis
 
-        self.on = _maybe_make_list(on)
-        self.left_on = _maybe_make_list(left_on)
-        self.right_on = _maybe_make_list(right_on)
+        self.on = com._maybe_make_list(on)
+        self.left_on = com._maybe_make_list(left_on)
+        self.right_on = com._maybe_make_list(right_on)
 
         self.drop_keys = False # set this later...kludge
 
@@ -333,11 +333,6 @@ def _get_multiindex_indexer(join_keys, index, sort=True):
     # NOW! reorder
     #right_indexer.take(left_indexer.argsort())
 
-def _maybe_make_list(obj):
-    if obj is not None and not isinstance(obj, (tuple, list)):
-        return [obj]
-    return obj
-
 def _right_outer_join(x, y, max_groups):
     right_indexer, left_indexer = lib.left_outer_join(y, x, max_groups)
     return left_indexer, right_indexer

diff --git a/pandas/tools/pivot.py b/pandas/tools/pivot.py
@@ -2,6 +2,7 @@
 
 from pandas import Series, DataFrame
 from pandas.tools.merge import concat
+import pandas.core.common as com
 import numpy as np
 
 def pivot_table(data, values=None, rows=None, cols=None, aggfunc=np.mean,
@@ -165,31 +166,80 @@ def _convert_by(by):
         by = list(by)
     return by
 
-def crosstab(rows, columns):
+def crosstab(rows, cols, rownames=None, colnames=None):
     """
     Compute a simple cross-tabulation of two (or more) factors
 
     Parameters
     ----------
-    rows :
-    columns :
+    rows : array-like, Series, or list of arrays/Series
+        Values to group by in the rows
+    cols : array-like, Series, or list of arrays/Series
+        Values to group by in the columns
+    rownames : sequence, default None
+        If passed, must match number of row arrays passed
+    colnames : sequence, default None
+        If passed, must match number of column arrays passed
+
+    Notes
+    -----
+    Any Series passed will have their name attributes used unless row or column
+    names for the cross-tabulation are specified
+
+    Examples
+    --------
+    >>> a
+    array([foo, foo, foo, foo, bar, bar,
+           bar, bar, foo, foo, foo], dtype=object)
+    >>> b
+    array([one, one, one, two, one, one,
+           one, two, two, two, one], dtype=object)
+    >>> c
+    array([dull, dull, shiny, dull, dull, shiny,
+           shiny, dull, shiny, shiny, shiny], dtype=object)
+
+    >>> crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c'])
+    b    one          two
+    c    dull  shiny  dull  shiny
+    a
+    bar  1     2      1     0
+    foo  2     2      1     2
 
     Returns
     -------
     crosstab : DataFrame
     """
-    rname = cname = None
-    if isinstance(rows, Series):
-        rname = rows.name
-
-    if isinstance(columns, Series):
-        cname = columns.name
-
-    df = DataFrame({'rows' : rows, 'columns' : columns})
-    table = df.groupby(['rows', 'columns']).size()
-
-    result = table.unstack()
-    result.columns.name = cname
-    result.index.name = rname
+    rows = com._maybe_make_list(rows)
+    cols = com._maybe_make_list(cols)
+
+    rownames = _get_names(rows, rownames, prefix='row')
+    colnames = _get_names(cols, colnames, prefix='col')
+
+    data = {}
+    data.update(zip(rownames, rows))
+    data.update(zip(colnames, cols))
+
+    df = DataFrame(data)
+    table = df.groupby(rownames + colnames).size()
+
+    for cname in colnames:
+        table = table.unstack(cname)
+
+    table.columns.names = colnames
+    table.index.names = rownames
+    return table.fillna(0).astype(np.int64)
+
+def _get_names(arrs, names, prefix='row'):
+    if names is None:
+        names = []
+        for i, arr in enumerate(arrs):
+            if isinstance(arr, Series) and arr.name is not None:
+                names.append(arr.name)
+            else:
+                names.append('%s_%d' % (prefix, i))
+    else:
+        assert(len(names) == len(arrs))
+        if not isinstance(names, list):
+            names = list(names)
 
-    return result
+    return names
diff --git a/pandas/tools/tests/test_pivot.py b/pandas/tools/tests/test_pivot.py
@@ -3,7 +3,7 @@
 import numpy as np
 
 from pandas import DataFrame, concat
-from pandas.tools.pivot import pivot_table
+from pandas.tools.pivot import pivot_table, crosstab
 import pandas.util.testing as tm
 
 class TestPivotTable(unittest.TestCase):
@@ -124,6 +124,59 @@ def _check_output(res, col, rows=['A', 'B'], cols=['C']):
         #     gmarg = table[valcol]['All', '']
         #     self.assertEqual(gmarg, self.data[valcol].mean())
 
+
+class TestCrosstab(unittest.TestCase):
+
+    def setUp(self):
+        df = DataFrame({'A' : ['foo', 'foo', 'foo', 'foo',
+                               'bar', 'bar', 'bar', 'bar',
+                               'foo', 'foo', 'foo'],
+                        'B' : ['one', 'one', 'one', 'two',
+                               'one', 'one', 'one', 'two',
+                               'two', 'two', 'one'],
+                        'C' : ['dull', 'dull', 'shiny', 'dull',
+                               'dull', 'shiny', 'shiny', 'dull',
+                               'shiny', 'shiny', 'shiny'],
+                        'D' : np.random.randn(11),
+                        'E' : np.random.randn(11),
+                        'F' : np.random.randn(11)})
+
+        self.df = df.append(df, ignore_index=True)
+
+    def test_crosstab_single(self):
+        df = self.df
+        result = crosstab(df['A'], df['C'])
+        expected = df.groupby(['A', 'C']).size().unstack()
+        tm.assert_frame_equal(result, expected.fillna(0).astype(np.int64))
+
+    def test_crosstab_multiple(self):
+        df = self.df
+
+        result = crosstab(df['A'], [df['B'], df['C']])
+        expected = df.groupby(['A', 'B', 'C']).size()
+        expected = expected.unstack('B').unstack('C').fillna(0).astype(np.int64)
+        tm.assert_frame_equal(result, expected)
+
+        result = crosstab([df['B'], df['C']], df['A'])
+        expected = df.groupby(['B', 'C', 'A']).size()
+        expected = expected.unstack('A').fillna(0).astype(np.int64)
+        tm.assert_frame_equal(result, expected)
+
+    def test_crosstab_ndarray(self):
+        a = np.random.randint(0, 5, size=100)
+        b = np.random.randint(0, 3, size=100)
+        c = np.random.randint(0, 10, size=100)
+
+        df = DataFrame({'a': a, 'b': b, 'c': c})
+
+        result = crosstab(a, [b, c], rownames=['a'], colnames=('b', 'c'))
+        expected = crosstab(df['a'], [df['b'], df['c']])
+        tm.assert_frame_equal(result, expected)
+
+        result = crosstab([b, c], a, colnames=['a'], rownames=('b', 'c'))
+        expected = crosstab([df['b'], df['c']], df['a'])
+        tm.assert_frame_equal(result, expected)
+
 if __name__ == '__main__':
     import nose
     nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'],