Skip to content

Commit

Permalink
fix(eda):fix str transform with small distinct as categorical
Browse files Browse the repository at this point in the history
  • Loading branch information
jinglinpeng committed Apr 26, 2021
1 parent 1ce5775 commit 65e7f90
Showing 1 changed file with 11 additions and 10 deletions.
21 changes: 11 additions & 10 deletions dataprep/eda/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,19 @@
import dask.dataframe as dd
import numpy as np
import pandas as pd
from pandas.api.types import is_object_dtype
import pandas._libs.missing as libmissing
from bokeh.models import Legend, FuncTickFormatter
from bokeh.plotting import Figure
from scipy.stats import gaussian_kde as gaussian_kde_
from scipy.stats import ks_2samp as ks_2samp_
from scipy.stats import normaltest as normaltest_
from scipy.stats import skewtest as skewtest_
from .dtypes import drop_null
from .dtypes import (
drop_null,
Nominal,
detect_dtype,
is_dtype,
)


LOGGER = logging.getLogger(__name__)
Expand Down Expand Up @@ -83,6 +87,7 @@ def preprocess_dataframe(

df.columns = columns
df = df.reset_index(drop=True)
df = to_dask(df)

# Since an object column could contains multiple types
# in different cells. transform non-na values in object column to string.
Expand All @@ -93,16 +98,12 @@ def preprocess_dataframe(
# Otherwise when a cell is tuple or list it will throw an error.
_notna2str = lambda obj: obj if libmissing.checknull(obj) else str(obj)
for col in df.columns:
if is_object_dtype(df[col].dtype) and (
col_dtype = detect_dtype(df[col])
if (is_dtype(col_dtype, Nominal())) and (
(excluded_columns is None) or (col not in excluded_columns)
):
if isinstance(df, pd.DataFrame):
df[col] = df[col].apply(_notna2str)
elif isinstance(df, dd.DataFrame):
df[col] = df[col].apply(_notna2str, meta=("object"))
else:
raise RuntimeError(f"Unknown dataframe type: {type(df)}")
return to_dask(df)
df[col] = df[col].apply(_notna2str, meta=("object"))
return df


def sample_n(arr: np.ndarray, n: int) -> np.ndarray: # pylint: disable=C0103
Expand Down

0 comments on commit 65e7f90

Please sign in to comment.