add function to unfold a dataframe

sdpython · Nov 22, 2017 · c1752d3 · c1752d3
1 parent cc4673a
commit c1752d3
Show file tree

Hide file tree

Showing 4 changed files with 99 additions and 1 deletion.
diff --git a/_doc/sphinxdoc/source/api/rdf.rst b/_doc/sphinxdoc/source/api/rdf.rst
@@ -29,6 +29,8 @@ Data Manipulation
 
 .. autosignature:: pandas_streaming.df.connex_split.dataframe_shuffle
 
+.. autosignature:: pandas_streaming.df.dataframe_helpers.dataframe_unfold
+
 Complex splits
 ++++++++++++++
 

diff --git a/_unittests/ut_df/test_dataframe_helpers_simple.py b/_unittests/ut_df/test_dataframe_helpers_simple.py
@@ -0,0 +1,68 @@
+#-*- coding: utf-8 -*-
+"""
+@brief      test log(time=4s)
+"""
+
+import sys
+import os
+import unittest
+import pandas
+
+
+# pyquickhelper is imported directly when available; otherwise the folder
+# ``pyquickhelper/src`` located three levels above this file is appended
+# to ``sys.path`` and the import is attempted again.
+try:
+    import pyquickhelper as skip_
+except ImportError:
+    path = os.path.normpath(
+        os.path.abspath(
+            os.path.join(
+                os.path.split(__file__)[0],
+                "..",
+                "..",
+                "..",
+                "pyquickhelper",
+                "src")))
+    if path not in sys.path:
+        sys.path.append(path)
+    import pyquickhelper as skip_
+
+
+# Same fallback for the package ``src``: the folder two levels above this
+# file is appended to ``sys.path`` when the first import fails.
+try:
+    import src
+except ImportError:
+    path = os.path.normpath(
+        os.path.abspath(
+            os.path.join(
+                os.path.split(__file__)[0],
+                "..",
+                "..")))
+    if path not in sys.path:
+        sys.path.append(path)
+    import src
+
+from pyquickhelper.loghelper import fLOG
+from pyquickhelper.pycode import ExtTestCase
+from src.pandas_streaming.df import dataframe_unfold
+
+
+class TestDataFrameHelpersSimple(ExtTestCase):
+    """
+    Unit tests for function *dataframe_unfold*.
+    """
+
+    def test_unfold(self):
+        """
+        Unfolds column *b* of a small dataframe and compares the result
+        to the expected dataframe: one row per split value, the row
+        without any value for *b* being kept.
+        """
+        fLOG(__file__, self._testMethodName,
+             OutputPrint=__name__ == "__main__")
+
+        source_rows = [dict(a=1, b="e,f"),
+                       dict(a=2, b="g"),
+                       dict(a=3)]
+        expected_rows = [dict(a=1, b="e,f", b_unfold="e"),
+                         dict(a=1, b="e,f", b_unfold="f"),
+                         dict(a=2, b="g", b_unfold="g"),
+                         dict(a=3)]
+        expected = pandas.DataFrame(expected_rows)
+
+        unfolded = dataframe_unfold(pandas.DataFrame(source_rows), "b")
+        self.assertEqualDataFrame(unfolded, expected)
+
+
+# Runs the tests of this module when the file is executed as a script.
+if __name__ == "__main__":
+    unittest.main()
diff --git a/src/pandas_streaming/df/__init__.py b/src/pandas_streaming/df/__init__.py
@@ -5,4 +5,4 @@
 
 from .connex_split import dataframe_shuffle, train_test_split_weights, train_test_connex_split
 from .dataframe import StreamingDataFrame
-from .dataframe_helpers import dataframe_hash_columns
+from .dataframe_helpers import dataframe_hash_columns, dataframe_unfold
diff --git a/src/pandas_streaming/df/dataframe_helpers.py b/src/pandas_streaming/df/dataframe_helpers.py
@@ -5,6 +5,7 @@
 """
 import hashlib
 import struct
+import pandas
 import numpy
 
 
@@ -120,3 +121,30 @@ def hash_floatl(c):
                 "Conversion of type {0} in column '{1}' is not implemented".format(t, c))
 
     return df
+
+
+def dataframe_unfold(df, col, new_col=None, sep=","):
+    """
+    One column may contain concatenated values.
+    This function splits these values and multiplies the
+    rows for each split value.
+
+    @param      df      dataframe
+    @param      col     column with the concatenated values (strings)
+    @param      new_col name of the column receiving the split values,
+                        if None, the name is ``col + "_unfold"``
+    @param      sep     separator
+    @return             a new dataframe: the columns of *df* followed by
+                        the new column, with a fresh integer index
+
+    Values which are not strings (missing values for example) are not
+    split: the row is kept once and the new column receives the same value.
+    """
+    col_name = col + "_unfold" if new_col is None else new_col
+
+    # One list of pieces per row, non-string values are left untouched.
+    pieces = [v.split(sep) if isinstance(v, str) else [v] for v in df[col]]
+
+    # Every row position is repeated once per piece. Working with positions
+    # instead of merging on *col* keeps the result correct when the same
+    # concatenated value appears in several rows and when *df* is empty.
+    positions = [i for i, spl in enumerate(pieces) for _ in spl]
+    res = df.iloc[positions].reset_index(drop=True)
+    res[col_name] = [vs for spl in pieces for vs in spl]
+    return res