Skip to content

Commit

Permalink
fix(eda):report for pandas extension type
Browse files Browse the repository at this point in the history
  • Loading branch information
jinglinpeng committed Nov 23, 2021
1 parent 57d3f37 commit 2cbb387
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 1 deletion.
14 changes: 13 additions & 1 deletion dataprep/eda/eda_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,14 @@
import pandas as pd
import pandas._libs.missing as libmissing

from .dtypes_v2 import NUMERICAL_DTYPES, DType, DTypeDef, detect_dtype, Nominal, GeoGraphy
from .dtypes_v2 import (
NUMERICAL_DTYPES,
DType,
DTypeDef,
detect_dtype,
Nominal,
GeoGraphy,
)

DataFrame = Union[pd.DataFrame, dd.DataFrame, "EDAFrame"]

Expand Down Expand Up @@ -109,6 +116,11 @@ def __init__(
if isinstance(self._eda_dtypes[col], (Nominal, GeoGraphy)):
ddf[col] = ddf[col].apply(_to_str_if_not_na, meta=(col, "object"))

# Convert pandas extension dtypes to the corresponding numpy type
# to avoid computation issues with pandas extension types, e.g., #733.
elif issubclass(type(ddf[col].dtype), pd.api.extensions.ExtensionDtype):
ddf[col] = ddf[col].astype(ddf[col].dtype.type)

self._ddf = ddf.persist()
self._columns = self._ddf.columns
self._values = self._ddf.to_dask_array(lengths=True)
Expand Down
1 change: 1 addition & 0 deletions dataprep/tests/eda/test_create_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ def simpledf() -> pd.DataFrame:
# test when column is object but some cells are numerical
df["h"] = pd.Series([0, "x"] * 500)
df["i"] = pd.Series(["str"] * 1000).astype("string")
df["j"] = pd.Series(list(range(1000)), dtype=pd.Int64Dtype())

idx = np.arange(1000)
np.random.shuffle(idx)
Expand Down

1 comment on commit 2cbb387

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

DataPrep.EDA Benchmarks

Benchmark suite Current: 2cbb387 Previous: 57d3f37 Ratio
dataprep/tests/benchmarks/eda.py::test_create_report 0.17782181075745707 iter/sec (stddev: 0.14340648156070304) 0.1580572351344921 iter/sec (stddev: 0.06557930995648795) 0.89

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.