Skip to content

Commit

Permalink
add function to unfold a dataframe
Browse files Browse the repository at this point in the history
  • Loading branch information
sdpython committed Nov 22, 2017
1 parent cc4673a commit c1752d3
Show file tree
Hide file tree
Showing 4 changed files with 99 additions and 1 deletion.
2 changes: 2 additions & 0 deletions _doc/sphinxdoc/source/api/rdf.rst
Expand Up @@ -29,6 +29,8 @@ Data Manipulation

.. autosignature:: pandas_streaming.df.connex_split.dataframe_shuffle

.. autosignature:: pandas_streaming.df.dataframe_helpers.dataframe_unfold

Complex splits
++++++++++++++

Expand Down
68 changes: 68 additions & 0 deletions _unittests/ut_df/test_dataframe_helpers_simple.py
@@ -0,0 +1,68 @@
#-*- coding: utf-8 -*-
"""
@brief test log(time=4s)
"""

import sys
import os
import unittest
import pandas


try:
import pyquickhelper as skip_
except ImportError:
path = os.path.normpath(
os.path.abspath(
os.path.join(
os.path.split(__file__)[0],
"..",
"..",
"..",
"pyquickhelper",
"src")))
if path not in sys.path:
sys.path.append(path)
import pyquickhelper as skip_


try:
import src
except ImportError:
path = os.path.normpath(
os.path.abspath(
os.path.join(
os.path.split(__file__)[0],
"..",
"..")))
if path not in sys.path:
sys.path.append(path)
import src

from pyquickhelper.loghelper import fLOG
from pyquickhelper.pycode import ExtTestCase
from src.pandas_streaming.df import dataframe_unfold


class TestDataFrameHelpersSimple(ExtTestCase):
    """
    Unit tests for the simple dataframe helpers.
    """

    def test_unfold(self):
        """
        Unfolds a column of comma separated strings and compares
        the result with the expected dataframe.
        """
        fLOG(__file__, self._testMethodName,
             OutputPrint=__name__ == "__main__")

        # The last row has no value for column b.
        source = pandas.DataFrame([
            {"a": 1, "b": "e,f"},
            {"a": 2, "b": "g"},
            {"a": 3},
        ])
        expected = pandas.DataFrame([
            {"a": 1, "b": "e,f", "b_unfold": "e"},
            {"a": 1, "b": "e,f", "b_unfold": "f"},
            {"a": 2, "b": "g", "b_unfold": "g"},
            {"a": 3},
        ])

        unfolded = dataframe_unfold(source, "b")
        self.assertEqualDataFrame(unfolded, expected)


if __name__ == "__main__":
unittest.main()
2 changes: 1 addition & 1 deletion src/pandas_streaming/df/__init__.py
Expand Up @@ -5,4 +5,4 @@

from .connex_split import dataframe_shuffle, train_test_split_weights, train_test_connex_split
from .dataframe import StreamingDataFrame
from .dataframe_helpers import dataframe_hash_columns
from .dataframe_helpers import dataframe_hash_columns, dataframe_unfold
28 changes: 28 additions & 0 deletions src/pandas_streaming/df/dataframe_helpers.py
Expand Up @@ -5,6 +5,7 @@
"""
import hashlib
import struct
import pandas
import numpy


Expand Down Expand Up @@ -120,3 +121,30 @@ def hash_floatl(c):
"Conversion of type {0} in column '{1}' is not implemented".format(t, c))

return df


def dataframe_unfold(df, col, new_col=None, sep=","):
    """
    One column may contain concatenated values.
    This function splits these values and repeats the
    row once for every split value.

    @param      df          dataframe
    @param      col         column with the concatenated values (strings)
    @param      new_col     name of the column receiving the split values,
                            if None, the name is ``col + "_unfold"``
    @param      sep         separator
    @return                 a new dataframe, the columns of *df* followed
                            by the new column, indexed from 0 to n-1

    A value which is not a string (a missing value for example)
    is not split: the row is kept once and the value is copied
    into the new column unchanged.
    """
    # An explicit name always takes precedence over the default one.
    col_name = col + "_unfold" if new_col is None else new_col

    # Rows are expanded by position rather than by merging on *col*:
    # a merge on the value multiplies the rows as soon as the same
    # value appears in more than one row of *df*.
    positions = []
    values = []
    for pos, v in enumerate(df[col]):
        if isinstance(v, str):
            parts = v.split(sep)
            positions.extend([pos] * len(parts))
            values.extend(parts)
        else:
            positions.append(pos)
            values.append(v)

    # reset_index returns a new object, adding a column does not
    # modify the dataframe received as input.
    res = df.iloc[positions].reset_index(drop=True)
    res[col_name] = values
    return res

0 comments on commit c1752d3

Please sign in to comment.