feat(type): detect column as categorical for small unique values

sfu-db · Apr 20, 2021 · 4696e59 · 4696e59
1 parent 46f724b
commit 4696e59
Show file tree

Hide file tree

Showing 4 changed files with 13 additions and 10 deletions.
diff --git a/dataprep/eda/create_report/formatter.py b/dataprep/eda/create_report/formatter.py
@@ -27,7 +27,6 @@
     _insight_pagination,
 )
 from ..dtypes import (
-    CATEGORICAL_DTYPES,
     Continuous,
     DateTime,
     Nominal,
@@ -263,7 +262,8 @@ def basic_computations(
 
     df_num = df.select_num_columns()
     data["num_cols"] = df_num.columns
-    first_rows = df.select_dtypes(CATEGORICAL_DTYPES).head
+    first_rows = df.head
+
     # variables
     if cfg.variables.enable:
         for col in df.columns:

diff --git a/dataprep/eda/dtypes.py b/dataprep/eda/dtypes.py
@@ -199,7 +199,12 @@ def detect_without_known(col: dd.Series) -> DType:
             return Nominal()
 
     elif is_continuous(col.dtype):
-        return Continuous()
+        # detect as categorical if the number of distinct values is small
+        nuniques = col.nunique_approx().compute()
+        if nuniques < 10:
+            return Nominal()
+        else:
+            return Continuous()
 
     elif is_datetime(col.dtype):
         return DateTime()

diff --git a/dataprep/eda/missing/compute/common.py b/dataprep/eda/missing/compute/common.py
@@ -51,8 +51,9 @@ def histogram(
     """Calculate "histogram" for both numerical and categorical."""
     if len(arr.shape) != 1:
         raise ValueError("Histogram only supports 1-d array.")
-
-    if is_dtype(detect_dtype(arr, dtype), Continuous()):
+    srs = dd.from_dask_array(arr)
+    detected_type = detect_dtype(srs, dtype)
+    if is_dtype(detected_type, Continuous()):
         if range is not None:
             minimum, maximum = range
         else:
@@ -67,14 +68,11 @@ def histogram(
         if not return_edges:
             return counts, centers
         return counts, centers, edges
-    elif is_dtype(detect_dtype(arr, dtype), Nominal()) or is_dtype(
-        detect_dtype(arr, dtype), GeoGraphy()
-    ):
+    elif is_dtype(detected_type, Nominal()) or is_dtype(detected_type, GeoGraphy()):
         # Dask array's unique is way slower than the values_counts on Series
         # See https://github.com/dask/dask/issues/2851
         # centers, counts = da.unique(arr, return_counts=True)
 
-        srs = dd.from_dask_array(arr)
         value_counts = srs.value_counts()
 
         counts = value_counts.to_dask_array()

diff --git a/dataprep/tests/eda/random_data_generator.py b/dataprep/tests/eda/random_data_generator.py
@@ -23,7 +23,7 @@ def _gen_random_int_series(
 ) -> pd.Series:
     """Return a randonly generated int Series, where the value is in [low, high]"""
     rand = _resolve_random_state(random_state)
-    arr = rand.random_integers(low=low, high=high, size=size)
+    arr = rand.randint(low=low, high=high, size=size)
     return pd.Series(arr)