fix(eda): fix plot_missing for SmallCardNum and DateTime column types

sfu-db · May 16, 2021 · 201e487 · 201e487
1 parent fe515d6
commit 201e487
Show file tree

Hide file tree

Showing 8 changed files with 121 additions and 63 deletions.
diff --git a/dataprep/eda/eda_frame.py b/dataprep/eda/eda_frame.py
@@ -13,13 +13,7 @@
 import pandas as pd
 import pandas._libs.missing as libmissing
 
-from .dtypes_v2 import (
-    NUMERICAL_DTYPES,
-    DType,
-    DTypeDef,
-    detect_dtype,
-    Nominal,
-)
+from .dtypes_v2 import NUMERICAL_DTYPES, DType, DTypeDef, detect_dtype, Nominal, GeoGraphy
 
 DataFrame = Union[pd.DataFrame, dd.DataFrame, "EDAFrame"]
 
@@ -113,7 +107,7 @@ def __init__(
 
         # Transform categorical column to string for non-na values.
         for col in ddf.columns:
-            if isinstance(self._eda_dtypes[col], Nominal):
+            if isinstance(self._eda_dtypes[col], (Nominal, GeoGraphy)):
                 ddf[col] = ddf[col].apply(_to_str_if_not_na, meta=(col, "object"))
 
         self._ddf = ddf.persist()

diff --git a/dataprep/eda/missing/compute/__init__.py b/dataprep/eda/missing/compute/__init__.py
@@ -74,9 +74,9 @@ def compute_missing(
     if x is None and y is not None:
         raise ValueError("x cannot be None while y has value")
     elif x is not None and y is None:
-        ret = compute_missing_univariate(eda_frame, x, cfg, dtype)
+        ret = compute_missing_univariate(eda_frame, x, cfg)
     elif x is not None and y is not None:
-        ret = compute_missing_bivariate(eda_frame, x, y, cfg, dtype)
+        ret = compute_missing_bivariate(eda_frame, x, y, cfg)
     else:
         ret = compute_missing_nullivariate(eda_frame, cfg)
 

diff --git a/dataprep/eda/missing/compute/bivariate.py b/dataprep/eda/missing/compute/bivariate.py
@@ -1,7 +1,7 @@
 """This module implements the plot_missing(df) function's
 calculating intermediate part."""
 
-from typing import Any, Generator, List, Optional
+from typing import Any, Generator, List
 
 import dask.dataframe as dd
 import numpy as np
@@ -10,32 +10,45 @@
 
 from ...configs import Config
 from ...eda_frame import EDAFrame
-from ...dtypes_v2 import Continuous, DTypeDef, Nominal, GeoGraphy
+from ...dtypes_v2 import Continuous, Nominal, GeoGraphy, SmallCardNum, DateTime
 from ...intermediate import ColumnsMetadata, Intermediate
 from ...staged import staged
 from .common import LABELS, histogram
 
 
-def _compute_missing_bivariate(  # pylint: disable=too-many-locals,too-many-statements
+def _compute_missing_bivariate(  # pylint: disable=too-many-locals,too-many-statements, too-many-branches
     df: EDAFrame,
     x: str,
     y: str,
     cfg: Config,
-    dtype: Optional[DTypeDef] = None,
 ) -> Generator[Any, Any, Intermediate]:
     """Calculate the distribution change on another column y when
     the missing values in x is dropped."""
 
-    xloc, yloc = df.columns.get_loc(x), df.columns.get_loc(y)
-
-    col0 = df.values[~df.nulls[:, yloc], yloc].astype(df.dtypes[y])
-    col1 = df.values[~(df.nulls[:, xloc] | df.nulls[:, yloc]), yloc].astype(df.dtypes[y])
-
-    minimum, maximum = col0.min(), col0.max()
     y_dtype = df.get_eda_dtype(y)
-    bins = cfg.bar.bars if isinstance(y_dtype, (Nominal, GeoGraphy)) else cfg.hist.bins
+    # dataframe with all rows where column x is null removed
+    ddf = df.frame[~df.frame[x].isna()]
+    if isinstance(y_dtype, (SmallCardNum, DateTime)):
+        col0 = df.frame[y].dropna().astype(str).values  # series from original dataframe
+        col1 = ddf[y].dropna().astype(str).values  # series with null rows from col x removed
+    elif isinstance(y_dtype, (GeoGraphy, Nominal, Continuous)):
+        # GeoGraphy and Nominal columns are already converted to str when the
+        # EDAFrame is constructed, so they do not need to be converted again here.
+        col0 = df.frame[y].dropna().values
+        col1 = ddf[y].dropna().values
+    else:
+        raise ValueError(f"unprocessed type:{y_dtype}")
 
-    hists = [histogram(col, bins, return_edges=True, dtype=dtype) for col in [col0, col1]]
+    minimum, maximum = col0.min(), col0.max()
+    bins = (
+        cfg.bar.bars
+        if isinstance(y_dtype, (Nominal, GeoGraphy, SmallCardNum, DateTime))
+        else cfg.hist.bins
+    )
+
+    hists = [
+        histogram(col, eda_dtype=y_dtype, bins=bins, return_edges=True) for col in [col0, col1]
+    ]
 
     quantiles = None
     if isinstance(y_dtype, Continuous) and cfg.box.enable:

diff --git a/dataprep/eda/missing/compute/common.py b/dataprep/eda/missing/compute/common.py
@@ -5,28 +5,26 @@
 import dask.dataframe as dd
 
 from ...configs import Config
-from ...dtypes_v2 import Continuous, DTypeDef, Nominal, detect_dtype, GeoGraphy
+from ...dtypes_v2 import Continuous, Nominal, GeoGraphy, SmallCardNum, DateTime, DType
 
 LABELS = ["Orignal data", "After drop missing values"]
 
 
 def uni_histogram(
     srs: dd.Series,
+    srs_dtype: DType,
     cfg: Config,
-    dtype: Optional[DTypeDef] = None,
 ) -> Tuple[da.Array, ...]:
     """Calculate "histogram" for both numerical and categorical."""
 
-    srs_type = detect_dtype(srs, srs.head(), dtype)
-
-    if isinstance(srs_type, Continuous):
+    if isinstance(srs_dtype, Continuous):
 
         counts, edges = da.histogram(srs, cfg.hist.bins, (srs.min(), srs.max()))
         centers = (edges[:-1] + edges[1:]) / 2
 
         return counts, centers, edges
 
-    elif isinstance(srs_type, (Nominal, GeoGraphy)):
+    elif isinstance(srs_dtype, (Nominal, GeoGraphy, SmallCardNum, DateTime)):
         # Dask array's unique is way slower than the values_counts on Series
         # See https://github.com/dask/dask/issues/2851
         # centers, counts = da.unique(arr, return_counts=True)
@@ -43,17 +41,16 @@ def uni_histogram(
 
 def histogram(
     arr: da.Array,
+    eda_dtype: DType,
     bins: Optional[int] = None,
     return_edges: bool = True,
     range: Optional[Tuple[int, int]] = None,  # pylint: disable=redefined-builtin
-    dtype: Optional[DTypeDef] = None,
 ) -> Tuple[da.Array, ...]:
     """Calculate "histogram" for both numerical and categorical."""
     if len(arr.shape) != 1:
         raise ValueError("Histogram only supports 1-d array.")
     srs = dd.from_dask_array(arr)
-    detected_type = detect_dtype(srs, srs.head(), dtype)
-    if isinstance(detected_type, Continuous):
+    if isinstance(eda_dtype, Continuous):
         if range is not None:
             minimum, maximum = range
         else:
@@ -68,7 +65,7 @@ def histogram(
         if not return_edges:
             return counts, centers
         return counts, centers, edges
-    elif isinstance(detected_type, (Nominal, GeoGraphy)):
+    elif isinstance(eda_dtype, (Nominal, GeoGraphy, SmallCardNum, DateTime)):
         # Dask array's unique is way slower than the values_counts on Series
         # See https://github.com/dask/dask/issues/2851
         # centers, counts = da.unique(arr, return_counts=True)
@@ -80,4 +77,4 @@ def histogram(
 
         return (counts, centers)
     else:
-        raise ValueError(f"Unsupported dtype {arr.dtype}")
+        raise ValueError(f"Unsupported dtype {eda_dtype}")
diff --git a/dataprep/eda/missing/compute/univariate.py b/dataprep/eda/missing/compute/univariate.py
@@ -1,14 +1,14 @@
 """This module implements the plot_missing(df, x) function's
 calculating intermediate part
 """
-from typing import Any, Generator, List, Optional
+from typing import Any, Generator, List
 
 import numpy as np
 import pandas as pd
 
 from ...configs import Config
 from ...eda_frame import EDAFrame
-from ...dtypes_v2 import DTypeDef, Continuous, Nominal, GeoGraphy
+from ...dtypes_v2 import Continuous, Nominal, GeoGraphy, SmallCardNum, DateTime
 from ...intermediate import ColumnsMetadata, Intermediate
 from ...staged import staged
 from .common import LABELS, uni_histogram
@@ -18,7 +18,6 @@ def _compute_missing_univariate(  # pylint: disable=too-many-locals
     df: EDAFrame,
     x: str,
     cfg: Config,
-    dtype: Optional[DTypeDef] = None,
 ) -> Generator[Any, Any, Intermediate]:
     """Calculate the distribution change on other columns when
     the missing values in x is dropped."""
@@ -33,17 +32,26 @@ def _compute_missing_univariate(  # pylint: disable=too-many-locals
         col_dtype = df.get_eda_dtype(col)
         if (
             col == x
-            or isinstance(col_dtype, (Nominal, GeoGraphy))
-            and not cfg.bar.enable
-            or isinstance(col_dtype, Continuous)
-            and not cfg.hist.enable
+            or (
+                isinstance(col_dtype, (Nominal, GeoGraphy, SmallCardNum, DateTime))
+                and not cfg.bar.enable
+            )
+            or (isinstance(col_dtype, Continuous) and not cfg.hist.enable)
         ):
             continue
 
-        srs0 = df.frame[col].dropna()  # series from original dataframe
-        srs1 = ddf[col].dropna()  # series with null rows from col x removed
+        if isinstance(col_dtype, (SmallCardNum, DateTime)):
+            srs0 = df.frame[col].dropna().astype(str)  # series from original dataframe
+            srs1 = ddf[col].dropna().astype(str)  # series with null rows from col x removed
+        elif isinstance(col_dtype, (GeoGraphy, Nominal, Continuous)):
+            # GeoGraphy and Nominal columns are already converted to str when the
+            # EDAFrame is constructed, so they do not need to be converted again here.
+            srs0 = df.frame[col].dropna()
+            srs1 = ddf[col].dropna()
+        else:
+            raise ValueError(f"unprocessed type:{col_dtype}")
 
-        hists[col] = [uni_histogram(srs, cfg, dtype) for srs in [srs0, srs1]]
+        hists[col] = [uni_histogram(srs, col_dtype, cfg) for srs in [srs0, srs1]]
 
     ### Lazy Region End
     hists = yield hists
@@ -80,7 +88,9 @@ def _compute_missing_univariate(  # pylint: disable=too-many-locals
         # If the cardinality of a categorical column is too large,
         # we show the top `num_bins` values, sorted by their count before drop
         col_dtype = df.get_eda_dtype(col_name)
-        if len(counts[0]) > cfg.bar.bars and (isinstance(col_dtype, (Nominal, GeoGraphy))):
+        if len(counts[0]) > cfg.bar.bars and (
+            isinstance(col_dtype, (Nominal, GeoGraphy, SmallCardNum, DateTime))
+        ):
             sortidx = np.argsort(-counts[0])
             selected_xs = xs[0][sortidx[: cfg.bar.bars]]
             ret_df = ret_df[ret_df["x"].isin(selected_xs)]

diff --git a/dataprep/eda/missing/render.py b/dataprep/eda/missing/render.py
@@ -25,7 +25,7 @@
 
 from ...errors import UnreachableError
 from ..configs import Config
-from ..dtypes_v2 import Continuous, Nominal, GeoGraphy, drop_null, is_dtype
+from ..dtypes_v2 import Continuous, Nominal, GeoGraphy, SmallCardNum, drop_null, DateTime
 from ..intermediate import ColumnMetadata, Intermediate
 from ..palette import CATEGORY10, CATEGORY20, GREYS256, RDBU
 from ..utils import cut_long_name, fuse_missing_perc, relocate_legend
@@ -109,7 +109,7 @@ def render_dist(
     return fig
 
 
-def render_hist(  # pylint: disable=too-many-arguments
+def render_hist(
     df: pd.DataFrame,
     x: str,
     meta: ColumnMetadata,
@@ -120,13 +120,16 @@ def render_hist(  # pylint: disable=too-many-arguments
     """
     Render a histogram
     """
-    if is_dtype(meta["dtype"], Nominal()) or is_dtype(meta["dtype"], GeoGraphy()):
+    # pylint: disable=too-many-arguments
+    # pylint: disable=too-many-locals
+
+    if isinstance(meta["dtype"], (Nominal, GeoGraphy, SmallCardNum, DateTime)):
         tooltips = [
             (x, "@x"),
             ("Count", "@count"),
             ("Label", "@label"),
         ]
-    else:
+    elif isinstance(meta["dtype"], Continuous):
         df = df.copy()
         df["repr"] = [f"[{row.lower_bound:.0f}~{row.upper_bound:.0f})" for row in df.itertuples()]
 
@@ -135,21 +138,28 @@ def render_hist(  # pylint: disable=too-many-arguments
             ("Frequency", "@count"),
             ("Label", "@label"),
         ]
+    else:
+        mtype = type(meta["dtype"])
+        raise ValueError(f"unprocessed data type:{mtype}, col:{x}")
+
     cols = [f"{col[:12]}..." if isinstance(col, str) and len(col) > 18 else col for col in df["x"]]
     df["x"] = cols
     cmapper = CategoricalColorMapper(palette=CATEGORY10, factors=LABELS)
 
-    if is_dtype(meta["dtype"], Nominal()) or is_dtype(meta["dtype"], GeoGraphy()):
+    if isinstance(meta["dtype"], (Nominal, GeoGraphy, SmallCardNum, DateTime)):
         radius = 0.99
 
         # Inputs of FactorRange() have to be sequence of strings,
         # object only contains numbers can cause errors.(Issue#98).
         df["x"] = df["x"].astype("str")
         x_range = FactorRange(*df["x"].unique())
-    else:
+    elif isinstance(meta["dtype"], Continuous):
 
         radius = df["x"][1] - df["x"][0]
         x_range = Range1d(df["x"].min() - radius, df["x"].max() + radius)
+    else:
+        mtype = type(meta["dtype"])
+        raise ValueError(f"unprocessed data type:{mtype}, col:{x}")
 
     y_range = Range1d(0, df["count"].max() * 1.05)
 
@@ -608,6 +618,7 @@ def render_missing_impact_1vn(itmdt: Intermediate, cfg: Config) -> Dict[str, Any
     """
     Render the plot from `plot_missing(df, "x")`
     """
+    # pylint: disable = too-many-locals
     plot_width = cfg.plot.width if cfg.plot.width is not None else 300
     plot_height = cfg.plot.height if cfg.plot.height is not None else 300
 
@@ -623,10 +634,13 @@ def render_missing_impact_1vn(itmdt: Intermediate, cfg: Config) -> Dict[str, Any
         fig.frame_height = plot_height
         panels.append(Panel(child=fig, title=col))
 
-        if is_dtype(meta[col]["dtype"], Nominal()) or is_dtype(meta[col]["dtype"], GeoGraphy()):
+        if isinstance(meta[col]["dtype"], (Nominal, GeoGraphy, SmallCardNum, DateTime)):
             htgs[title] = cfg.bar.grid_how_to_guide()
-        else:
+        elif isinstance(meta[col]["dtype"], Continuous):
             htgs[title] = cfg.hist.grid_how_to_guide()
+        else:
+            mtype = type(meta[col]["dtype"])
+            raise ValueError(f"unprocessed type:{mtype}")
         titles.append(title)
     legend_colors = [CATEGORY10[count] for count in range(len(LABELS))]
     return {
@@ -650,7 +664,7 @@ def render_missing_impact_1v1(itmdt: Intermediate, cfg: Config) -> Dict[str, Any
     x, y, meta = itmdt["x"], itmdt["y"], itmdt["meta"]
     htgs: Dict[str, List[Tuple[str, str]]] = {}
 
-    if is_dtype(meta["dtype"], Continuous()):
+    if isinstance(meta["dtype"], Continuous):
         panels = []
 
         if cfg.hist.enable:
@@ -679,7 +693,7 @@ def render_missing_impact_1v1(itmdt: Intermediate, cfg: Config) -> Dict[str, Any
             "container_width": max([panel.child.plot_width for panel in panels]),
             "how_to_guide": htgs,
         }
-    else:
+    elif isinstance(meta["dtype"], (Nominal, SmallCardNum, GeoGraphy, DateTime)):
         fig = render_hist(itmdt["hist"], y, meta, plot_width, plot_height, True)
         shown, total = meta["shown"], meta["total"]
         if shown != total:
@@ -693,3 +707,6 @@ def render_missing_impact_1v1(itmdt: Intermediate, cfg: Config) -> Dict[str, Any
             "container_width": fig.plot_width,
             "how_to_guide": htgs,
         }
+    else:
+        mtype = type(meta["dtype"])
+        raise ValueError(f"unsupported type:{mtype}")
diff --git a/dataprep/tests/eda/random_data_generator.py b/dataprep/tests/eda/random_data_generator.py
@@ -190,14 +190,34 @@ def gen_random_dataframe(
     return df
 
 
-@pytest.fixture(scope="module")  # type: ignore
-def random_df() -> pd.DataFrame:
-    df1 = gen_random_dataframe(nrows=30, ncols=10, random_state=0).reset_index(drop=True)
-    df2 = gen_random_dataframe(nrows=30, ncols=10, na_ratio=0.1, random_state=1).reset_index(
-        drop=True
+def gen_test_df() -> pd.DataFrame:
+    rand = np.random.RandomState(0)
+    nrows = 30
+    data = {}
+    data[0] = gen_random_dataframe(nrows=nrows, ncols=10, random_state=rand).reset_index(drop=True)
+    data[1] = gen_random_dataframe(
+        nrows=nrows, ncols=10, na_ratio=0.1, random_state=rand
+    ).reset_index(drop=True)
+    data[2] = pd.Series([np.nan] * nrows, name="const_na")
+    data[3] = pd.Series(["s"] * nrows, name="const_str")
+    data[4] = pd.Series([0] * nrows, name="const_zero")
+    data[5] = pd.Series([-1] * nrows, name="const_neg")
+    data[6] = pd.Series([1] * nrows, name="const_pos")
+    data[7] = pd.Series([0, 1, np.nan] * (nrows // 3), name="small_distinct_miss")
+    data[8] = gen_random_series(size=nrows, dtype="string", random_state=rand).rename("str_no_miss")
+    data[9] = gen_random_series(size=nrows, dtype="string", na_ratio=0.1, random_state=rand).rename(
+        "str_miss"
     )
-    df3 = gen_constant_series(30, np.nan).to_frame().reset_index(drop=True)
-    df4 = gen_constant_series(30, "s").to_frame().reset_index(drop=True)
-    df = pd.concat([df1, df2, df3, df4], axis=1)
+    data[10] = gen_random_series(size=nrows, dtype="float", random_state=rand).rename("num_no_miss")
+    data[11] = gen_random_series(size=nrows, dtype="float", na_ratio=0.1, random_state=rand).rename(
+        "num_miss"
+    )
+
+    df = pd.concat(data.values(), axis=1)
     df.index = gen_random_series(df.index.shape[0], na_ratio=0.1, str_max_len=100, random_state=2)
     return df
+
+
+@pytest.fixture(scope="module")  # type: ignore
+def random_df() -> pd.DataFrame:
+    return gen_test_df()