Skip to content

Commit

Permalink
add pandas_groupby_nan #6
Browse files Browse the repository at this point in the history
  • Loading branch information
sdpython committed May 17, 2018
1 parent 738a0f9 commit 3d92302
Show file tree
Hide file tree
Showing 5 changed files with 353 additions and 2 deletions.
2 changes: 2 additions & 0 deletions _doc/sphinxdoc/source/api/rdf.rst
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ Data Manipulation

.. autosignature:: pandas_streaming.df.dataframe_helpers.dataframe_unfold

.. autosignature:: pandas_streaming.df.dataframe_helpers.pandas_groupby_nan

Complex splits
++++++++++++++

Expand Down
112 changes: 112 additions & 0 deletions _unittests/ut_df/test_pandas_groupbynan.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
"""
@brief test log(time=1s)
You should indicate a time in seconds. The program ``run_unittests.py``
will sort all test files by increasing time and run them.
"""


import sys
import os
import unittest
import pandas
import numpy
from pyquickhelper.pycode import ExtTestCase


# Import ``src`` directly when it is importable; otherwise add the directory
# two levels above this file to ``sys.path`` and retry the import.
try:
    import src
except ImportError:
    path = os.path.split(__file__)[0]
    path = os.path.join(path, "..", "..")
    path = os.path.normpath(os.path.abspath(path))
    if path not in sys.path:
        sys.path.append(path)
    import src

from src.pandas_streaming.df import pandas_groupby_nan, numpy_types


class TestPandasHelper(ExtTestCase):
    """
    Unit tests for ``pandas_groupby_nan``.
    """

    @staticmethod
    def _frame_with_nan(ty):
        """
        Builds the three-row dataframe shared by the tests below.

        @param      ty      pair *(type, value)*, the value is stored in
                            column ``value`` of the first two rows
        @return             dataframe whose last row has no ``value`` key
        """
        kind, value = ty
        data = [{"this": "cst", "type": "tt1=" + str(kind), "value": value},
                {"this": "cst", "type": "tt2=" + str(kind), "value": value},
                {"this": "cst", "type": "row_for_nan"}]
        return pandas.DataFrame(data)

    def test_pandas_groupbynan(self):
        """
        Groups by a column holding a missing value, for python and
        numpy scalar types, and checks the missing value is kept as a group.
        """
        types = [(int, -10), (float, -20.2), (str, "e"),
                 (bytes, bytes("a", "ascii"))]
        skip = (numpy.bool_, numpy.complex64, numpy.complex128)
        types += [(_, _(5)) for _ in numpy_types() if _ not in skip]

        # Single column: the last group key must be a float NaN.
        for ty in types:
            df = self._frame_with_nan(ty)
            gr = pandas_groupby_nan(df, "value")
            co = gr.sum()
            li = list(co["value"])
            self.assertIsInstance(
                li[-1], float, "Issue with {0}".format(ty))
            self.assertTrue(
                numpy.isnan(li[-1]),
                "Issue with value {0}\n--df--\n{1}\n--co--\n{2}".format(li, df, co))

        for ty in types:
            df = self._frame_with_nan(ty)

            # A tuple is not accepted for ``by``, it must raise TypeError.
            with self.assertRaises(TypeError):
                pandas_groupby_nan(df, ("value", "this"))

            # Several columns: the function is allowed to give up.
            try:
                gr = pandas_groupby_nan(df, ["value", "this"])
            except (TypeError, NotImplementedError):
                continue
            co = gr.sum()
            li = list(co["value"])
            self.assertEqual(len(li), 2)

    def test_pandas_groupbynan_tuple(self):
        """
        Groups by three columns which all contain missing values,
        with and without putting NaN back in the keys.
        """
        data = [dict(a="a", b="b", c="c", n=1), dict(b="b", n=2),
                dict(a="a", n=3), dict(c="c", n=4)]
        df = pandas.DataFrame(data)

        # Reference: the regular groupby only keeps the complete row.
        gr = df.groupby(["a", "b", "c"]).sum()
        self.assertEqual(gr.shape, (1, 1))

        for nanback in [True, False]:
            try:
                gr2_ = pandas_groupby_nan(
                    df, ["a", "b", "c"], nanback=nanback, suffix="NAN")
            except NotImplementedError:
                # this combination of parameters is not supported
                continue
            gr2 = gr2_.sum().sort_values("n")
            self.assertEqual(gr2.shape, (4, 4))
            d = gr2.to_dict("records")
            self.assertEqual(d[0]["a"], "a")
            self.assertEqual(d[0]["b"], "b")
            self.assertEqual(d[0]["c"], "c")
            self.assertEqual(d[0]["n"], 1)
            self.assertEqual(d[1]["a"], "NAN")


# Runs the tests of this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()
2 changes: 2 additions & 0 deletions _unittests/ut_module/test_code_style.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ def test_style_src(self):
thi = os.path.abspath(os.path.dirname(__file__))
src_ = os.path.normpath(os.path.join(thi, "..", "..", "src"))
check_pep8(src_, fLOG=fLOG,
pylint_ignore=('C0103', 'C1801', 'R0201', 'R1705', 'W0108', 'W0613',
'W0212'),
skip=["Too many nested blocks",
"Module 'numpy.random' has no 'RandomState' member",
"Value 'sch' is unsubscriptable",
Expand Down
1 change: 1 addition & 0 deletions src/pandas_streaming/df/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,5 @@
from .connex_split import train_test_split_weights, train_test_connex_split, train_test_apart_stratify
from .dataframe import StreamingDataFrame
from .dataframe_helpers import dataframe_hash_columns, dataframe_unfold, dataframe_shuffle
from .dataframe_helpers import pandas_groupby_nan, numpy_types
from .dataframe_io import to_zip, read_zip
238 changes: 236 additions & 2 deletions src/pandas_streaming/df/dataframe_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,9 @@
"""
import hashlib
import struct
import pandas
import warnings
import numpy
from pandas import DataFrame, Index


def hash_str(c, hash_length):
Expand Down Expand Up @@ -199,7 +200,7 @@ def dataframe_unfold(df, col, new_col=None, sep=","):
rows.append({col: v, col_name: v, temp_col: i})
df = df.copy()
df[temp_col] = list(range(df.shape[0]))
dfj = pandas.DataFrame(rows)
dfj = DataFrame(rows)
res = df.merge(dfj, on=[col, temp_col])
return res.drop(temp_col, axis=1).copy()

Expand Down Expand Up @@ -249,3 +250,236 @@ def dataframe_shuffle(df, random_state=None):
res = shuffled.set_index(keep_cols)[ori_cols]
res.index.names = df.index.names
return res


def pandas_fillna(df, by, hasna=None, suffix=None):
    """
    Replaces the :epkg:`nan` values for something not :epkg:`nan`.
    Mostly used by @see fn pandas_groupby_nan.

    @param      df      dataframe
    @param      by      list of columns for which we need to replace nan
    @param      hasna   None or container of columns for which we need to
                        replace NaN, columns of *by* not in *hasna* are skipped
    @param      suffix  string used to build the replacement value of
                        text columns (``"²"`` if None or empty)
    @return             dictionary ``{column: replacement value}``,
                        new dataframe (new copy, *df* is not modified)

    The replacement is a value which does not appear in the column:
    *suffix* repeated as needed for strings, an existing value followed
    by ``b"_"`` for bytes, ``abs(min) + abs(max)`` for the other columns
    (``abs(max) + 1`` if that sum is not greater than ``abs(max)``).
    It raises *TypeError* if a text column holds something else than
    strings or bytes, *ValueError* if no distinct value can be found
    for a numerical column.
    """
    suffix = suffix if suffix else "²"
    df = df.copy()
    rep = {}
    for c in by:
        if hasna is not None and c not in hasna:
            continue
        col_dtype = df[c].dtype
        present = df[c].dropna()
        # kind == "O" also catches extension dtypes used for strings.
        if (col_dtype in (str, bytes, object) or
                getattr(col_dtype, "kind", None) == "O"):
            values = set(present)
            # An all-NaN column has no sample: treat it as a string column.
            sample = next(iter(values), suffix)
            if isinstance(sample, str):
                cst = suffix
                val = ""
            elif isinstance(sample, bytes):
                cst = b"_"
                val = sample
            else:
                raise TypeError(
                    "Unable to determine a constant for type='{0}' dtype='{1}'".format(
                        type(sample), col_dtype))
            val += cst
            # The candidate is compared to every existing value and
            # extended with a constant of its own type (str or bytes).
            while val in values:
                val += cst
        elif present.empty:
            # Nothing to collide with, any value is a valid replacement.
            val = 0
        else:
            mi = abs(present.min())
            ma = abs(present.max())
            val = ma + mi
            if val <= ma:
                # Happens when the minimum is 0: every value is <= ma.
                val = ma + 1
            if not val > ma:
                raise ValueError(
                    "Unable to find a different value for column '{0}': min={1} max={2}".format(
                        c, mi, ma))
        # Assign the filled column back: an inplace fillna on ``df[c]``
        # may operate on a temporary object and leave ``df`` unchanged.
        df[c] = df[c].fillna(val)
        rep[c] = val
    return rep, df


def pandas_groupby_nan(df, by, axis=0, as_index=False, suffix=None, nanback=True, **kwargs):
    """
    Does a *groupby* including keeping missing values (:epkg:`nan`).

    @param      df          dataframe
    @param      by          column or list of columns
    @param      axis        only 0 is allowed
    @param      as_index    should be False
    @param      suffix      None or a string
    @param      nanback     put :epkg:`nan` back in the index,
                            otherwise it leaves a replacement for :epkg:`nan`.
                            (does not work when grouping by multiple columns)
    @param      kwargs      other parameters sent to
                            `groupby <http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.groupby.html>`_
    @return                 groupby results

    See `groupby and missing values <http://pandas-docs.github.io/pandas-docs-travis/groupby.html#na-and-nat-group-handling>`_.
    If no :epkg:`nan` is detected, the function falls back in regular
    :epkg:`pandas:DataFrame:groupby` which has the following
    behavior.

    The function raises *NotImplementedError* if *axis* is not 0,
    if *as_index* is true, or if *nanback* is true while grouping by
    several columns containing missing values. It raises *TypeError*
    if *by* is a tuple.

    .. exref::
        :title: Group a dataframe by one column including nan values
        :tag: dataframe

        The regular :epkg:`pandas:dataframe:GroupBy` of a
        :epkg:`pandas:DataFrame` removes every :epkg:`nan`
        values from the index.

        .. runpython::
            :showcode:

            from pandas import DataFrame
            data = [dict(a=2, ind="a", n=1), dict(a=2, ind="a"), dict(a=3, ind="b"), dict(a=30)]
            df = DataFrame(data)
            print(df)
            gr = df.groupby(["ind"]).sum()
            print(gr)

        Function @see fn pandas_groupby_nan keeps them.

        .. runpython::
            :showcode:

            from pandas import DataFrame
            from pandas_streaming.df import pandas_groupby_nan
            data = [dict(a=2, ind="a", n=1), dict(a=2, ind="a"), dict(a=3, ind="b"), dict(a=30)]
            df = DataFrame(data)
            gr2 = pandas_groupby_nan(df, ["ind"]).sum()
            print(gr2)
    """
    if axis != 0:
        raise NotImplementedError("axis should be 0")
    if as_index:
        raise NotImplementedError("as_index must be False")
    if isinstance(by, tuple):
        raise TypeError("by should be of list not tuple")
    if not isinstance(by, list):
        by = [by]

    # Columns of *by* holding at least one missing value.
    hasna = {b: True for b in by if df[b].isnull().values.any()}

    # *axis* is 0 at this point, which is the default of groupby:
    # it is not forwarded because recent versions of pandas deprecate
    # this parameter. *as_index* is forwarded on both paths so that the
    # result has the same shape with or without missing values.
    if not hasna:
        return df.groupby(by, as_index=as_index, **kwargs)

    rep, df_copy = pandas_fillna(df, by, hasna, suffix=suffix)
    res = df_copy.groupby(by, as_index=as_index, **kwargs)

    if not nanback:
        # The replacement values stay in the keys. A warning tells which
        # ones as soon as a grouping column does not have the dtype
        # pandas gives to a column of strings.
        text_dtype = DataFrame([{"a": "a"}]).dtypes.iloc[0]
        typ = dict(zip(df.columns, df.dtypes))
        if any(typ[b] != text_dtype for b in by):
            label = "NaN value" if len(by) == 1 else "NaN values"
            warnings.warn(
                "[pandas_groupby_nan] {0}: {1}".format(label, rep))
        return res

    if len(by) > 1:
        raise NotImplementedError(
            "Not yet implemented. Replacing pseudo nan values by real nan values is not as easy as it looks. Use nanback=False")

    # Single column: puts NaN back in place of the replacement value.
    # NOTE(review): this rewrites private attributes of the pandas groupby
    # object (``_group_index``, ``_cache``), it presumably depends on the
    # pandas version - confirm before upgrading pandas.
    b = by[0]
    fnan = rep[b]
    if fnan in res.grouper.groups:
        res.grouper.groups[numpy.nan] = res.grouper.groups[fnan]
        del res.grouper.groups[fnan]
    new_val = [(numpy.nan if v == fnan else v)
               for v in res.grouper.result_index]
    grouping = res.grouper.groupings[0]
    grouping._group_index = Index(new_val)
    grouping.obj[b].replace(fnan, numpy.nan, inplace=True)
    if isinstance(grouping.grouper, numpy.ndarray):
        grouping.grouper = numpy.array(new_val)
        # drops the cached index computed with the replacement value
        if hasattr(grouping, '_cache') and 'result_index' in grouping._cache:
            del grouping._cache['result_index']
    else:
        raise NotImplementedError("Not implemented for type: {0}".format(
            type(grouping.grouper)))
    res.grouper._cache['result_index'] = grouping._group_index
    return res


def numpy_types():
    """
    Returns the list of :epkg:`numpy` available types.

    @return     list of types

    The list contains 19 entries, some of them refer to the same type
    (aliases). ``numpy.float_`` and ``numpy.complex_`` are not
    available anymore in recent versions of :epkg:`numpy`, the list
    uses the types they were aliases of, ``numpy.float64`` and
    ``numpy.complex128``, at the same positions.
    """
    return [
        numpy.bool_,
        numpy.int_,
        numpy.intc,
        numpy.intp,
        numpy.int8,
        numpy.int16,
        numpy.int32,
        numpy.int64,
        numpy.uint8,
        numpy.uint16,
        numpy.uint32,
        numpy.uint64,
        numpy.float64,  # formerly numpy.float_
        numpy.float16,
        numpy.float32,
        numpy.float64,
        numpy.complex128,  # formerly numpy.complex_
        numpy.complex64,
        numpy.complex128,
    ]

0 comments on commit 3d92302

Please sign in to comment.