Skip to content

Commit

Permalink
feat(type): detect column as categorical for small unique values
Browse files Browse the repository at this point in the history
  • Loading branch information
jinglinpeng committed Apr 20, 2021
1 parent 46f724b commit 4696e59
Show file tree
Hide file tree
Showing 4 changed files with 13 additions and 10 deletions.
4 changes: 2 additions & 2 deletions dataprep/eda/create_report/formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@
_insight_pagination,
)
from ..dtypes import (
CATEGORICAL_DTYPES,
Continuous,
DateTime,
Nominal,
Expand Down Expand Up @@ -263,7 +262,8 @@ def basic_computations(

df_num = df.select_num_columns()
data["num_cols"] = df_num.columns
first_rows = df.select_dtypes(CATEGORICAL_DTYPES).head
first_rows = df.head

# variables
if cfg.variables.enable:
for col in df.columns:
Expand Down
7 changes: 6 additions & 1 deletion dataprep/eda/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,12 @@ def detect_without_known(col: dd.Series) -> DType:
return Nominal()

elif is_continuous(col.dtype):
return Continuous()
# detect as categorical if the number of distinct values is small
nuniques = col.nunique_approx().compute()
if nuniques < 10:
return Nominal()
else:
return Continuous()

elif is_datetime(col.dtype):
return DateTime()
Expand Down
10 changes: 4 additions & 6 deletions dataprep/eda/missing/compute/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,9 @@ def histogram(
"""Calculate "histogram" for both numerical and categorical."""
if len(arr.shape) != 1:
raise ValueError("Histogram only supports 1-d array.")

if is_dtype(detect_dtype(arr, dtype), Continuous()):
srs = dd.from_dask_array(arr)
detected_type = detect_dtype(srs, dtype)
if is_dtype(detected_type, Continuous()):
if range is not None:
minimum, maximum = range
else:
Expand All @@ -67,14 +68,11 @@ def histogram(
if not return_edges:
return counts, centers
return counts, centers, edges
elif is_dtype(detect_dtype(arr, dtype), Nominal()) or is_dtype(
detect_dtype(arr, dtype), GeoGraphy()
):
elif is_dtype(detected_type, Nominal()) or is_dtype(detected_type, GeoGraphy()):
# Dask array's unique is way slower than value_counts on a Series
# See https://github.com/dask/dask/issues/2851
# centers, counts = da.unique(arr, return_counts=True)

srs = dd.from_dask_array(arr)
value_counts = srs.value_counts()

counts = value_counts.to_dask_array()
Expand Down
2 changes: 1 addition & 1 deletion dataprep/tests/eda/random_data_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def _gen_random_int_series(
) -> pd.Series:
"""Return a randomly generated int Series, where the value is in [low, high]"""
rand = _resolve_random_state(random_state)
arr = rand.random_integers(low=low, high=high, size=size)
arr = rand.randint(low=low, high=high, size=size)
return pd.Series(arr)


Expand Down

0 comments on commit 4696e59

Please sign in to comment.