fix(eda): fix column name and index related issues

sfu-db · Apr 7, 2021 · 40a89b9 · 40a89b9
1 parent e83f95b
commit 40a89b9
Show file tree

Hide file tree

Showing 7 changed files with 85 additions and 13 deletions.
diff --git a/dataprep/eda/correlation/__init__.py b/dataprep/eda/correlation/__init__.py
@@ -72,6 +72,7 @@ def plot_correlation(
     This function only supports numerical or categorical data,
     and it is better to drop None, Nan and Null value before using it
     """
+
     cfg = Config.from_dict(display, config)
 
     with ProgressBar(minimum=1, disable=not progress):

diff --git a/dataprep/eda/correlation/compute/__init__.py b/dataprep/eda/correlation/compute/__init__.py
@@ -7,6 +7,7 @@
 from ...configs import Config
 from ...data_array import DataArray, DataFrame
 from ...intermediate import Intermediate
+from ...utils import preprocess_dataframe
 from .bivariate import _calc_bivariate
 from .overview import _calc_overview
 from .univariate import _calc_univariate
@@ -55,6 +56,7 @@ def compute_correlation(
     elif not cfg:
         cfg = Config()
 
+    df = preprocess_dataframe(df)
     if x is None and y is None:  # pylint: disable=no-else-return
         with catch_warnings():
             filterwarnings(

diff --git a/dataprep/eda/create_report/formatter.py b/dataprep/eda/create_report/formatter.py
@@ -33,13 +33,12 @@
     Nominal,
     detect_dtype,
     is_dtype,
-    string_dtype_to_object,
 )
 from ..intermediate import Intermediate
 from ..missing import render_missing
 from ..missing.compute.nullivariate import compute_missing_nullivariate
 from ..progress_bar import ProgressBar
-from ..utils import to_dask
+from ..utils import preprocess_dataframe
 
 
 def format_report(
@@ -70,8 +69,7 @@ def format_report(
         This variable acts like an API in passing data to the template engine.
     """
     with ProgressBar(minimum=1, disable=not progress):
-        df = to_dask(df)
-        df = string_dtype_to_object(df)
+        df = preprocess_dataframe(df)
         if mode == "basic":
             comps = format_basic(df, cfg)
         # elif mode == "full":

diff --git a/dataprep/eda/distribution/compute/__init__.py b/dataprep/eda/distribution/compute/__init__.py
@@ -11,7 +11,7 @@
 from ...configs import Config
 from ...dtypes import DTypeDef, string_dtype_to_object
 from ...intermediate import Intermediate
-from ...utils import to_dask
+from ...utils import preprocess_dataframe
 from .bivariate import compute_bivariate
 from .overview import compute_overview
 from .trivariate import compute_trivariate
@@ -59,9 +59,7 @@ def compute(
     """
     # pylint: disable=too-many-arguments
 
-    df = to_dask(df)
-    df.columns = df.columns.astype(str)
-    df = string_dtype_to_object(df)
+    df = preprocess_dataframe(df)
 
     if isinstance(cfg, dict):
         cfg = Config.from_dict(display, cfg)

diff --git a/dataprep/eda/missing/compute/__init__.py b/dataprep/eda/missing/compute/__init__.py
@@ -6,7 +6,8 @@
 
 from ...configs import Config
 from ...data_array import DataArray, DataFrame
-from ...dtypes import DTypeDef, string_dtype_to_object
+from ...dtypes import DTypeDef
+from ...utils import preprocess_dataframe
 from ...intermediate import Intermediate
 from .bivariate import compute_missing_bivariate
 from .nullivariate import compute_missing_nullivariate
@@ -61,7 +62,7 @@ def compute_missing(
     >>> plot_missing(df, "HDI_for_year")
     >>> plot_missing(df, "HDI_for_year", "population")
     """
-    df = string_dtype_to_object(df)
+    df = preprocess_dataframe(df)
     df = DataArray(df)
 
     # pylint: disable=no-else-raise

diff --git a/dataprep/eda/utils.py b/dataprep/eda/utils.py
@@ -3,11 +3,14 @@
 import logging
 from math import ceil
 from typing import Any, Dict, List, Optional, Tuple, Union, cast
+from collections import Counter
+
 
 import dask
 import dask.dataframe as dd
 import numpy as np
 import pandas as pd
+from pandas.api.types import is_object_dtype
 from bokeh.models import Legend, FuncTickFormatter
 from bokeh.plotting import Figure
 from scipy.stats import gaussian_kde as gaussian_kde_
@@ -34,6 +37,58 @@ def to_dask(df: Union[pd.DataFrame, dd.DataFrame]) -> dd.DataFrame:
     return dd.from_pandas(df, npartitions=npartitions)
 
 
+def preprocess_dataframe(
+    org_df: Union[pd.DataFrame, dd.DataFrame],
+    used_columns: Optional[Union[List[str], List[object]]] = None,
+) -> dd.DataFrame:
+    """
+    Make a dask dataframe with only used_columns.
+
+    This function will do the following:
+        1. keep only used_columns (every column when used_columns is None),
+        in the column order of org_df.
+        2. transform column names to strings (avoid object column names) and
+        rename names that are duplicated after the conversion to {col}_{id}.
+        3. reset index
+        4. transform object columns to string columns (note that an object
+        column can contain cells of different types).
+        5. pass the result through to_dask.
+
+    Parameters
+    ----------
+    org_df: Union[pd.DataFrame, dd.DataFrame]
+        The input dataframe. The work is done on a copy / column selection,
+        not on org_df itself.
+    used_columns: Optional[Union[List[str], List[object]]]
+        Columns to keep, given either as the original column labels or as
+        their string form. None keeps all columns.
+
+    Returns
+    -------
+    dd.DataFrame
+        The value returned by to_dask for the preprocessed dataframe.
+    """
+    if used_columns is None:
+        df = org_df.copy()
+    else:
+        # used_columns may hold string names while the labels of org_df are
+        # arbitrary objects, so match on both the label and its str() form.
+        wanted = set(used_columns)
+        # Index with an ordered, de-duplicated list instead of a set: a set
+        # has no stable order and recent pandas releases reject set indexers.
+        selected = list(
+            dict.fromkeys(col for col in org_df.columns if str(col) in wanted or col in wanted)
+        )
+        df = org_df[selected]
+
+    # Stringify first and count afterwards, so labels that only collide once
+    # converted (e.g. 1 and "1") are detected as duplicates as well.
+    names = [f"{col}" for col in df.columns]
+    name_count = Counter(names)
+
+    # Names that are already unique keep their spelling; generated names must
+    # not clash with them or with each other.
+    taken = {name for name, count in name_count.items() if count == 1}
+    current_id: Dict[str, int] = dict()
+    columns: List[str] = []
+    for name in names:
+        if name_count[name] > 1:
+            # Duplicate names are renamed as {col}_{id}; skip ids whose
+            # resulting name is already in use.
+            next_id = current_id.get(name, 0) + 1
+            new_col_name = f"{name}_{next_id}"
+            while new_col_name in taken:
+                next_id += 1
+                new_col_name = f"{name}_{next_id}"
+            current_id[name] = next_id
+            taken.add(new_col_name)
+        else:
+            new_col_name = name
+        columns.append(new_col_name)
+
+    df.columns = columns
+    df = df.reset_index(drop=True)
+
+    # Since an object column could contain multiple types
+    # in different cells, transform object columns to string.
+    # NOTE(review): astype(str) may also stringify missing values
+    # (e.g. NaN -> "nan") -- confirm this is intended for the callers that
+    # analyse missing data.
+    for col in df.columns:
+        if is_object_dtype(df[col].dtype):
+            df[col] = df[col].astype(str)
+    return to_dask(df)
+
+
 def sample_n(arr: np.ndarray, n: int) -> np.ndarray:  # pylint: disable=C0103
     """Sample n values uniformly from the range of the `arr`,
     not from the distribution of `arr`'s elems."""

diff --git a/dataprep/tests/eda/random_data_generator.py b/dataprep/tests/eda/random_data_generator.py
@@ -83,6 +83,7 @@ def gen_random_series(
     size: int,
     dtype: str = "object",
     na_ratio: float = 0.0,
+    str_max_len: int = 100,
     random_state: Union[int, np.random.RandomState] = 0,
 ) -> pd.Series:
     """
@@ -97,6 +98,8 @@ def gen_random_series(
         Chosen from 'int', 'float', 'boolean', 'datetime', 'string' and 'object'.
     na_ratio: float
         The ratio of NA values in the series. Should be in [0.0, 1.0]
+    str_max_len: int
+        The max len of random string
     seed: int
         generator seed
     """
@@ -118,7 +121,10 @@ def gen_random_series(
     population_list = []
     for curr_type in gen_func:
         if dtype in [curr_type, "object"]:
-            rand_series = gen_func[curr_type](size, random_state=rand)
+            if curr_type != "string":
+                rand_series = gen_func[curr_type](size, random_state=rand)
+            else:
+                rand_series = gen_func[curr_type](size, max_len=str_max_len, random_state=rand)
             population_list.append(rand_series)
     object_population = pd.concat(population_list, ignore_index=True)
     object_series = pd.Series(rand.choice(object_population, size=size))
@@ -134,6 +140,7 @@ def gen_random_dataframe(
     nrows: int = 30,
     ncols: int = 30,
     na_ratio: float = 0.0,
+    str_col_name_max_len: int = 100,
     random_state: Union[int, np.random.RandomState] = 0,
 ) -> pd.DataFrame:
     """
@@ -148,6 +155,8 @@ def gen_random_dataframe(
         Number of rows of the generated dataframe.
     na_ratio:
         Ratio of NA values.
+    str_col_name_max_len:
+        max length of string column name
     ncols: int
         Number of columns of the generated dataframe.
     seed: int
@@ -166,7 +175,15 @@ def gen_random_dataframe(
     df = pd.DataFrame(series_list)
 
     # Generate random column names and index.
-    col_names = gen_random_series(size=ncols, dtype="object", na_ratio=0.1, random_state=rand)
+    col_names = gen_random_series(
+        size=ncols,
+        dtype="object",
+        na_ratio=0.1,
+        str_max_len=str_col_name_max_len,
+        random_state=rand,
+    )
     df.columns = col_names
-    df.index = gen_random_series(df.index.shape[0], na_ratio=0.1, random_state=rand)
+    df.index = gen_random_series(
+        df.index.shape[0], na_ratio=0.1, str_max_len=str_col_name_max_len, random_state=rand
+    )
     return df