Skip to content

Commit

Permalink
fix(eda):fix string type
Browse files (browse the repository at this point in the history)
  • Loading branch information
jinglinpeng committed Oct 19, 2021
1 parent a64e356 commit b7e3321
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 10 deletions.
16 changes: 6 additions & 10 deletions dataprep/eda/dtypes_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import numpy as np
import pandas as pd
from ..clean import validate_country, validate_lat_long
from ..errors import UnreachableError

STRING_PANDAS_DTYPES = [pd.StringDtype]
STRING_DTYPES = STRING_PANDAS_DTYPES
Expand Down Expand Up @@ -202,15 +201,8 @@ def detect_without_known(col: Union[dd.Series, pd.Series], head: pd.Series) -> D
"""
This function detects dtypes of column when users didn't specify.
"""
if is_nominal(col.dtype):
if is_geography(head):
return GeoGraphy()
if is_geopoint(head):
return GeoPoint()
else:
return Nominal()

elif is_continuous(col.dtype):
if is_continuous(col.dtype):
# detect as categorical if distinct value is small
if isinstance(col, dd.Series):
nuniques = col.nunique_approx().compute()
Expand All @@ -225,8 +217,12 @@ def detect_without_known(col: Union[dd.Series, pd.Series], head: pd.Series) -> D

elif is_datetime(col.dtype):
return DateTime()
elif is_geography(head):
return GeoGraphy()
elif is_geopoint(head):
return GeoPoint()
else:
raise UnreachableError
return Nominal()


def is_dtype(dtype1: Any, dtype2: DType) -> bool:
Expand Down
1 change: 1 addition & 0 deletions dataprep/tests/eda/test_create_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def simpledf() -> pd.DataFrame:
df["g"] = pd.to_datetime(df["f"])
# test when column is object but some cells are numerical
df["h"] = pd.Series([0, "x"] * 500)
df["i"] = pd.Series(["str"] * 1000).astype("string")

idx = np.arange(1000)
np.random.shuffle(idx)
Expand Down

1 comment on commit b7e3321

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

DataPrep.EDA Benchmarks

| Benchmark suite | Current: b7e3321 | Previous: a64e356 | Ratio |
|---|---|---|---|
| dataprep/tests/benchmarks/eda.py::test_create_report | 0.15207782298047312 iter/sec (stddev: 0.1695308936570434) | 0.204863280161837 iter/sec (stddev: 0.02400732527459022) | 1.35 |

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.