add pandas_groupby_nan #6

sdpython · May 17, 2018 · 3d92302 · 3d92302
1 parent 738a0f9
commit 3d92302
Show file tree

Hide file tree

Showing 5 changed files with 353 additions and 2 deletions.
diff --git a/_doc/sphinxdoc/source/api/rdf.rst b/_doc/sphinxdoc/source/api/rdf.rst
@@ -31,6 +31,8 @@ Data Manipulation
 
 .. autosignature:: pandas_streaming.df.dataframe_helpers.dataframe_unfold
 
+.. autosignature:: pandas_streaming.df.dataframe_helpers.pandas_groupby_nan
+
 Complex splits
 ++++++++++++++
 

diff --git a/_unittests/ut_df/test_pandas_groupbynan.py b/_unittests/ut_df/test_pandas_groupbynan.py
@@ -0,0 +1,112 @@
+"""
+@brief      test log(time=1s)
+
+You should indicate a time in seconds. The program ``run_unittests.py``
+will sort all test files by increasing time and run them.
+"""
+
+
+import sys
+import os
+import unittest
+import pandas
+import numpy
+from pyquickhelper.pycode import ExtTestCase
+
+
+try:
+    import src
+except ImportError:
+    path = os.path.normpath(
+        os.path.abspath(
+            os.path.join(
+                os.path.split(__file__)[0],
+                "..",
+                "..")))
+    if path not in sys.path:
+        sys.path.append(path)
+    import src
+
+from src.pandas_streaming.df import pandas_groupby_nan, numpy_types
+
+
+class TestPandasHelper(ExtTestCase):
+    """
+    Unit tests for ``pandas_groupby_nan``.
+    """
+
+    @staticmethod
+    def _frame_with_missing(ty):
+        """
+        Builds a three-row dataframe whose last row has no ``value`` key,
+        column ``value`` then holds two identical values and one missing value.
+
+        @param      ty      couple *(type, sample value of that type)*
+        @return             dataframe
+        """
+        data = [{"this": "cst", "type": "tt1=" + str(ty[0]), "value": ty[1]},
+                {"this": "cst", "type": "tt2=" + str(ty[0]), "value": ty[1]},
+                {"this": "cst", "type": "row_for_nan"}]
+        return pandas.DataFrame(data)
+
+    def test_pandas_groupbynan(self):
+        types = [(int, -10), (float, -20.2), (str, "e"),
+                 (bytes, bytes("a", "ascii"))]
+        skip = (numpy.bool_, numpy.complex64, numpy.complex128)
+        types += [(_, _(5)) for _ in numpy_types() if _ not in skip]
+
+        # Grouping by a single column: the last group must be the NaN group.
+        for ty in types:
+            df = self._frame_with_missing(ty)
+            co = pandas_groupby_nan(df, "value").sum()
+            li = list(co["value"])
+            self.assertIsInstance(li[-1], float, "Issue with {0}".format(ty))
+            self.assertTrue(
+                numpy.isnan(li[-1]),
+                "Issue with value {0}\n--df--\n{1}\n--co--\n{2}".format(li, df, co))
+
+        for ty in types:
+            df = self._frame_with_missing(ty)
+
+            # A tuple is rejected: only a column name or a list is accepted.
+            with self.assertRaises(TypeError):
+                pandas_groupby_nan(df, ("value", "this"))
+
+            # Grouping by several columns may not be supported,
+            # the result is only checked when the call succeeds.
+            try:
+                gr = pandas_groupby_nan(df, ["value", "this"])
+            except (TypeError, NotImplementedError):
+                continue
+            co = gr.sum()
+            li = list(co["value"])
+            self.assertEqual(len(li), 2)
+
+    def test_pandas_groupbynan_tuple(self):
+        data = [dict(a="a", b="b", c="c", n=1), dict(
+            b="b", n=2), dict(a="a", n=3), dict(c="c", n=4)]
+        df = pandas.DataFrame(data)
+
+        # Reference: the regular groupby only returns one group here.
+        gr = df.groupby(["a", "b", "c"]).sum()
+        self.assertEqual(gr.shape, (1, 1))
+
+        for nanback in [True, False]:
+            try:
+                gr2_ = pandas_groupby_nan(
+                    df, ["a", "b", "c"], nanback=nanback, suffix="NAN")
+            except NotImplementedError:
+                # this combination of parameters is not implemented
+                continue
+            gr2 = gr2_.sum().sort_values("n")
+            self.assertEqual(gr2.shape, (4, 4))
+            d = gr2.to_dict("records")
+            self.assertEqual(d[0]["a"], "a")
+            self.assertEqual(d[0]["b"], "b")
+            self.assertEqual(d[0]["c"], "c")
+            self.assertEqual(d[0]["n"], 1)
+            # the missing key was replaced by the suffix
+            self.assertEqual(d[1]["a"], "NAN")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/_unittests/ut_module/test_code_style.py b/_unittests/ut_module/test_code_style.py
@@ -33,6 +33,8 @@ def test_style_src(self):
         thi = os.path.abspath(os.path.dirname(__file__))
         src_ = os.path.normpath(os.path.join(thi, "..", "..", "src"))
         check_pep8(src_, fLOG=fLOG,
+                   pylint_ignore=('C0103', 'C1801', 'R0201', 'R1705', 'W0108', 'W0613',
+                                  'W0212'),
                    skip=["Too many nested blocks",
                          "Module 'numpy.random' has no 'RandomState' member",
                          "Value 'sch' is unsubscriptable",

diff --git a/src/pandas_streaming/df/__init__.py b/src/pandas_streaming/df/__init__.py
@@ -6,4 +6,5 @@
 from .connex_split import train_test_split_weights, train_test_connex_split, train_test_apart_stratify
 from .dataframe import StreamingDataFrame
 from .dataframe_helpers import dataframe_hash_columns, dataframe_unfold, dataframe_shuffle
+from .dataframe_helpers import pandas_groupby_nan, numpy_types
 from .dataframe_io import to_zip, read_zip
diff --git a/src/pandas_streaming/df/dataframe_helpers.py b/src/pandas_streaming/df/dataframe_helpers.py
@@ -5,8 +5,9 @@
 """
 import hashlib
 import struct
-import pandas
+import warnings
 import numpy
+from pandas import DataFrame, Index
 
 
 def hash_str(c, hash_length):
@@ -199,7 +200,7 @@ def dataframe_unfold(df, col, new_col=None, sep=","):
             rows.append({col: v, col_name: v, temp_col: i})
     df = df.copy()
     df[temp_col] = list(range(df.shape[0]))
-    dfj = pandas.DataFrame(rows)
+    dfj = DataFrame(rows)
     res = df.merge(dfj, on=[col, temp_col])
     return res.drop(temp_col, axis=1).copy()
 
@@ -249,3 +250,236 @@ def dataframe_shuffle(df, random_state=None):
     res = shuffled.set_index(keep_cols)[ori_cols]
     res.index.names = df.index.names
     return res
+
+
+def pandas_fillna(df, by, hasna=None, suffix=None):
+    """
+    Replaces the :epkg:`nan` values for something not :epkg:`nan`.
+    Mostly used by @see fn pandas_groupby_nan.
+
+    @param      df      dataframe
+    @param      by      list of columns for which we need to replace nan
+    @param      hasna   None or collection of columns for which we need to replace NaN,
+                        columns of *by* which do not belong to it are skipped
+    @param      suffix  string used to build the replacement value for
+                        columns of strings, ``"²"`` if None or empty
+    @return             dictionary ``{column: replacement value}``,
+                        new dataframe (new copy)
+
+    The replacement value is chosen to be different from every
+    non missing value of the column. The function raises
+    *TypeError* if a column of objects contains something
+    else than *str* or *bytes*, *ValueError* if no replacement
+    value can be found for a numerical column.
+    """
+    suffix = suffix if suffix else "²"
+    df = df.copy()
+    rep = {}
+    for c in by:
+        if hasna is not None and c not in hasna:
+            continue
+        if df[c].dtype in (str, bytes, object):
+            se = set(df[c].dropna())
+            # The sample is only read, never removed from the set:
+            # the collision check below must see every existing value.
+            # A column with no value at all is handled as a column of strings.
+            sample = next(iter(se), suffix)
+            if isinstance(sample, str):
+                cst = suffix
+                val = suffix
+            elif isinstance(sample, bytes):
+                cst = b"_"
+                val = sample + cst
+            else:
+                raise TypeError(
+                    "Unable to determine a constant for type='{0}' dtype='{1}'".format(type(sample), df[c].dtype))
+            # cst has the same type as val (str or bytes)
+            while val in se:
+                val += cst
+        else:
+            dr = df[c].dropna()
+            mi = abs(dr.min())
+            ma = abs(dr.max())
+            val = ma + mi
+            if val <= ma:
+                # The minimum is 0: |min| + |max| is the maximum itself,
+                # we pick a value strictly above it instead.
+                val = ma * 2 + 1
+            if val <= ma:
+                raise ValueError(
+                    "Unable to find a different value for column '{0}': min={1} max={2}".format(c, mi, ma))
+        # Assigning the result back is reliable, an inplace fillna
+        # on df[c] may operate on a temporary object.
+        df[c] = df[c].fillna(val)
+        rep[c] = val
+    return rep, df
+
+
+def pandas_groupby_nan(df, by, axis=0, as_index=False, suffix=None, nanback=True, **kwargs):
+    """
+    Does a *groupby* including keeping missing values (:epkg:`nan`).
+
+    @param      df          dataframe
+    @param      by          column or list of columns
+    @param      axis        only 0 is allowed
+    @param      as_index    should be False
+    @param      suffix      None or a string
+    @param      nanback     put :epkg:`nan` back in the index,
+                            otherwise it leaves a replacement for :epkg:`nan`.
+                            (does not work when grouping by multiple columns)
+    @param      kwargs      other parameters sent to
+                            `groupby <http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.groupby.html>`_
+    @return                 groupby results
+
+    The function raises *NotImplementedError* if *axis* is not 0,
+    if *as_index* is True or if *nanback* is True while missing values
+    were found and *by* contains more than one column. It raises
+    *TypeError* if *by* is a tuple.
+
+    See `groupby and missing values <http://pandas-docs.github.io/pandas-docs-travis/groupby.html#na-and-nat-group-handling>`_.
+    If no :epkg:`nan` is detected, the function falls back in regular
+    :epkg:`pandas:DataFrame:groupby` which has the following
+    behavior.
+
+    .. exref::
+        :title: Group a dataframe by one column including nan values
+        :tag: dataframe
+
+        The regular :epkg:`pandas:dataframe:GroupBy` of a
+        :epkg:`pandas:DataFrame` removes every :epkg:`nan`
+        values from the index.
+
+        .. runpython::
+            :showcode:
+
+            from pandas import DataFrame
+
+            data = [dict(a=2, ind="a", n=1), dict(a=2, ind="a"), dict(a=3, ind="b"), dict(a=30)]
+            df = DataFrame(data)
+            print(df)
+            gr = df.groupby(["ind"]).sum()
+            print(gr)
+
+        Function @see fn pandas_groupby_nan keeps them.
+
+        .. runpython::
+            :showcode:
+
+            from pandas import DataFrame
+            from pandas_streaming.df import pandas_groupby_nan
+
+            data = [dict(a=2, ind="a", n=1), dict(a=2, ind="a"), dict(a=3, ind="b"), dict(a=30)]
+            df = DataFrame(data)
+            gr2 = pandas_groupby_nan(df, ["ind"]).sum()
+            print(gr2)
+    """
+    if axis != 0:
+        raise NotImplementedError("axis should be 0")
+    if as_index:
+        raise NotImplementedError("as_index must be False")
+    if isinstance(by, tuple):
+        raise TypeError("by should be of list not tuple")
+    if not isinstance(by, list):
+        by = [by]
+
+    # Columns of by which contain at least one missing value.
+    hasna = {b: True for b in by if df[b].isnull().values.any()}
+
+    # axis is necessarily 0 at this point, which is the default value
+    # of DataFrame.groupby, it does not need to be forwarded.
+    if not hasna:
+        # as_index is forwarded so that the result has the same layout
+        # whether or not the data contains missing values.
+        return df.groupby(by, as_index=as_index, **kwargs)
+
+    rep, df_copy = pandas_fillna(df, by, hasna, suffix=suffix)
+    res = df_copy.groupby(by, as_index=as_index, **kwargs)
+
+    if not nanback:
+        # The replacement values stay in the result. A warning tells the user
+        # which ones when a grouping column does not have the dtype pandas
+        # gives to a column of strings.
+        dummy = DataFrame([{"a": "a"}])
+        do = dummy.dtypes.iloc[0]
+        typ = dict(zip(df.columns, df.dtypes))
+        if len(by) == 1:
+            if typ[by[0]] != do:
+                warnings.warn(
+                    "[pandas_groupby_nan] NaN value: {0}".format(rep))
+        elif any(typ[b] != do for b in by):
+            warnings.warn(
+                "[pandas_groupby_nan] NaN values: {0}".format(rep))
+        return res
+
+    if len(by) != 1:
+        # An attempt based on renaming the keys of res.grouper.groups
+        # and rebuilding the result index did not work: result_index
+        # cannot be set for multiple columns.
+        raise NotImplementedError(
+            "Not yet implemented. Replacing pseudo nan values by real nan values is not as easy as it looks. Use nanback=False")
+
+    # Single column: the replacement value is switched back to NaN in the
+    # groups, in the grouping index and in the grouped data.
+    # NOTE(review): this relies on private attributes of the pandas
+    # groupby objects (_group_index, _cache), to be checked against
+    # the installed version of pandas.
+    b = by[0]
+    fnan = rep[b]
+    if fnan in res.grouper.groups:
+        res.grouper.groups[numpy.nan] = res.grouper.groups[fnan]
+        del res.grouper.groups[fnan]
+    new_val = [numpy.nan if v == fnan else v
+               for v in res.grouper.result_index]
+    res.grouper.groupings[0]._group_index = Index(new_val)
+    res.grouper.groupings[0].obj[b].replace(
+        fnan, numpy.nan, inplace=True)
+    if isinstance(res.grouper.groupings[0].grouper, numpy.ndarray):
+        arr = numpy.array(new_val)
+        res.grouper.groupings[0].grouper = arr
+        # drops the cached index computed with the replacement value
+        if hasattr(res.grouper.groupings[0], '_cache') and 'result_index' in res.grouper.groupings[0]._cache:
+            del res.grouper.groupings[0]._cache['result_index']
+    else:
+        raise NotImplementedError("Not implemented for type: {0}".format(
+            type(res.grouper.groupings[0].grouper)))
+    res.grouper._cache['result_index'] = res.grouper.groupings[0]._group_index
+    return res
+
+
+def numpy_types():
+    """
+    Returns the list of :epkg:`numpy` available types.
+
+    @return     list of types
+
+    The list contains 19 entries. Entries 12 and 16 are spelled
+    ``numpy.float64`` and ``numpy.complex128`` rather than
+    ``numpy.float_`` and ``numpy.complex_``: the latter are aliases
+    of the former and no longer exist in recent versions of
+    :epkg:`numpy`. Some types therefore appear twice in the list.
+    """
+
+    return [numpy.bool_,
+            numpy.int_,
+            numpy.intc,
+            numpy.intp,
+            numpy.int8,
+            numpy.int16,
+            numpy.int32,
+            numpy.int64,
+            numpy.uint8,
+            numpy.uint16,
+            numpy.uint32,
+            numpy.uint64,
+            numpy.float64,  # formerly numpy.float_
+            numpy.float16,
+            numpy.float32,
+            numpy.float64,
+            numpy.complex128,  # formerly numpy.complex_
+            numpy.complex64,
+            numpy.complex128]