Skip to content

Commit

Permalink
fix(eda): keep na when preprocess df
Browse files Browse the repository at this point in the history
  • Loading branch information
jinglinpeng committed Apr 26, 2021
1 parent 180e6ad commit 17d8219
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 4 deletions.
2 changes: 1 addition & 1 deletion .pylintrc
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# A comma-separated list of package or module names from where C extensions may
# be loaded. Extensions are loaded into the active Python interpreter and may
# run arbitrary code.
extension-pkg-whitelist=lxml
extension-pkg-whitelist=lxml, pandas._libs.missing

# Add files or directories to the blacklist. They should be base names, not
# paths.
Expand Down
19 changes: 16 additions & 3 deletions dataprep/eda/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import numpy as np
import pandas as pd
from pandas.api.types import is_object_dtype
import pandas._libs.missing as libmissing
from bokeh.models import Legend, FuncTickFormatter
from bokeh.plotting import Figure
from scipy.stats import gaussian_kde as gaussian_kde_
Expand All @@ -19,6 +20,7 @@
from scipy.stats import skewtest as skewtest_
from .dtypes import drop_null


LOGGER = logging.getLogger(__name__)


Expand Down Expand Up @@ -83,12 +85,23 @@ def preprocess_dataframe(
df = df.reset_index(drop=True)

# Since an object column could contain multiple types
# in different cells. transform object column to string.
# in different cells. transform non-na values in object column to string.

# Function `_notna2str` transforms an obj to str if it is not NA.
# The check for NA is similar to pd.isna, but will treat a list obj as
# a scalar and return a single boolean, rather than a list of booleans.
# Otherwise, when a cell is a tuple or list, it will throw an error.
_notna2str = lambda obj: obj if libmissing.checknull(obj) else str(obj)
for col in df.columns:
if is_object_dtype(df[col].dtype) and (
excluded_columns is not None and col not in excluded_columns
(excluded_columns is None) or (col not in excluded_columns)
):
df[col] = df[col].astype(str)
if isinstance(df, pd.DataFrame):
df[col] = df[col].apply(_notna2str)
elif isinstance(df, dd.DataFrame):
df[col] = df[col].apply(_notna2str, meta=("object"))
else:
raise RuntimeError(f"Unknown dataframe type: {type(df)}")
return to_dask(df)


Expand Down

0 comments on commit 17d8219

Please sign in to comment.