perf(eda.plot): changed drop_null to dropna

sfu-db · Sep 15, 2020 · 0a7fe56 · 0a7fe56
1 parent 1dbf297
commit 0a7fe56
Show file tree

Hide file tree

Showing 5 changed files with 35 additions and 27 deletions.
diff --git a/dataprep/eda/distribution/compute/bivariate.py b/dataprep/eda/distribution/compute/bivariate.py
@@ -14,7 +14,6 @@
     DTypeDef,
     Nominal,
     detect_dtype,
-    drop_null,
     is_dtype,
 )
 from .common import (
@@ -98,7 +97,7 @@ def compute_bivariate(
         except TypeError:
             df[x] = df[x].astype(str)
 
-        (comps,) = dask.compute(nom_cont_comps(drop_null(df), bins, ngroups, largest))
+        (comps,) = dask.compute(nom_cont_comps(df.dropna(), bins, ngroups, largest))
 
         return Intermediate(
             x=x, y=y, data=comps, ngroups=ngroups, visual_type="cat_and_num_cols"
@@ -110,7 +109,7 @@ def compute_bivariate(
         and is_dtype(ytype, DateTime())
     ):
         x, y = (x, y) if is_dtype(xtype, DateTime()) else (y, x)
-        df = drop_null(df[[x, y]])
+        df = df[[x, y]].dropna()
         dtnum: List[Any] = []
         # line chart
         dtnum.append(dask.delayed(_calc_line_dt)(df, timeunit, agg))
@@ -131,7 +130,7 @@ def compute_bivariate(
         and is_dtype(ytype, DateTime())
     ):
         x, y = (x, y) if is_dtype(xtype, DateTime()) else (y, x)
-        df = drop_null(df[[x, y]])
+        df = df[[x, y]].dropna()
         df[y] = df[y].apply(str, meta=(y, str))
         dtcat: List[Any] = []
         # line chart
@@ -160,7 +159,7 @@ def compute_bivariate(
         except TypeError:
             df[y] = df[y].astype(str)
 
-        (comps,) = dask.compute(drop_null(df).groupby([x, y]).size())
+        (comps,) = dask.compute(df.dropna().groupby([x, y]).size())
 
         return Intermediate(
             x=x,
@@ -171,7 +170,7 @@ def compute_bivariate(
             visual_type="two_cat_cols",
         )
     elif is_dtype(xtype, Continuous()) and is_dtype(ytype, Continuous()):
-        df = drop_null(df[[x, y]])
+        df = df[[x, y]].dropna()
 
         data: Dict[str, Any] = {}
         # scatter plot data

diff --git a/dataprep/eda/distribution/compute/common.py b/dataprep/eda/distribution/compute/common.py
@@ -9,6 +9,7 @@
 from scipy.stats import gaussian_kde as gaussian_kde_
 from scipy.stats import ks_2samp as ks_2samp_
 from scipy.stats import normaltest as normaltest_
+from scipy.stats import skewtest as skewtest_
 
 from ...dtypes import drop_null
 
@@ -233,5 +234,13 @@ def ks_2samp(data1: np.ndarray, data2: np.ndarray) -> Tuple[float, float]:
     name="scipy-gaussian_kde", pure=True, nout=2
 )
 def gaussian_kde(arr: np.ndarray) -> Tuple[float, float]:
-    """Delayed version of scipy ks_2samp."""
+    """Delayed version of scipy gaussian_kde."""
     return cast(Tuple[np.ndarray, np.ndarray], gaussian_kde_(arr))
+
+
+@dask.delayed(  # pylint: disable=no-value-for-parameter
+    name="scipy-skewtest", pure=True, nout=2
+)
+def skewtest(arr: np.ndarray) -> Tuple[float, float]:
+    """Delayed version of scipy skewtest."""
+    return cast(Tuple[float, float], skewtest_(arr))
diff --git a/dataprep/eda/distribution/compute/overview.py b/dataprep/eda/distribution/compute/overview.py
@@ -8,7 +8,7 @@
 import dask.dataframe as dd
 import numpy as np
 import pandas as pd
-from dask.array.stats import chisquare, skew
+from dask.array.stats import chisquare
 
 from ....errors import UnreachableError
 from ...dtypes import (
@@ -18,12 +18,11 @@
     DTypeDef,
     Nominal,
     detect_dtype,
-    drop_null,
     get_dtype_cnts_and_num_cols,
     is_dtype,
 )
 from ...intermediate import Intermediate
-from .common import _calc_line_dt, ks_2samp, normaltest
+from .common import _calc_line_dt, ks_2samp, normaltest, skewtest
 
 
 def compute_overview(
@@ -80,11 +79,11 @@ def compute_overview(
                 first_rows[col].apply(hash)
             except TypeError:
                 srs = df[col] = srs.astype(str)
-            datas.append(calc_nom_col(drop_null(srs), ngroups, largest))
+            datas.append(calc_nom_col(srs.dropna(), first_rows[col], ngroups, largest))
             col_names_dtypes.append((col, Nominal()))
         elif is_dtype(col_dtype, Continuous()):
             ## if cfg.hist_enable or cfg.any_insights("hist"):
-            datas.append(calc_cont_col(drop_null(srs), bins))
+            datas.append(calc_cont_col(srs.dropna(), bins))
             col_names_dtypes.append((col, Continuous()))
         elif is_dtype(col_dtype, DateTime()):
             datas.append(dask.delayed(_calc_line_dt)(df[[col]], timeunit))
@@ -145,10 +144,11 @@ def calc_cont_col(srs: dd.Series, bins: int) -> Dict[str, Any]:
     data["npres"] = srs.shape[0]
 
     ## if cfg.insight.infinity_enable:
-    data["ninf"] = srs.isin({np.inf, -np.inf}).sum()
+    is_inf_srs = srs.isin({np.inf, -np.inf})
+    data["ninf"] = is_inf_srs.sum()
 
     # remove infinite values
-    srs = srs[~srs.isin({np.inf, -np.inf})]
+    srs = srs[~is_inf_srs]
 
     ## if cfg.hist_enable or cfg.insight.uniform_enable or cfg.insight.normal_enable:
     ## bins = cfg.hist_bins
@@ -164,7 +164,7 @@ def calc_cont_col(srs: dd.Series, bins: int) -> Dict[str, Any]:
     data["nneg"] = (srs < 0).sum()
 
     ## if cfg.insight.skew_enabled:
-    data["skew"] = skew(srs)
+    data["skew"] = skewtest(data["hist"][0])
 
     ## if cfg.insight.unique_enabled:
     data["nuniq"] = srs.nunique()
@@ -176,7 +176,9 @@ def calc_cont_col(srs: dd.Series, bins: int) -> Dict[str, Any]:
 
 
 ## def calc_nom_col(srs: dd.Series, first_rows: pd.Series, cfg: Config)
-def calc_nom_col(srs: dd.Series, ngroups: int, largest: bool) -> Dict[str, Any]:
+def calc_nom_col(
+    srs: dd.Series, first_rows: pd.Series, ngroups: int, largest: bool
+) -> Dict[str, Any]:
     """
     Computations for a categorical column in plot(df)
 
@@ -222,8 +224,10 @@ def calc_nom_col(srs: dd.Series, ngroups: int, largest: bool) -> Dict[str, Any]:
     ## data["npresent"] = srs.shape[0]
 
     ## if cfg.insight.constant_length_enable:
-    length = srs.apply(lambda v: len(str(v)), meta=(srs.name, np.int64))
-    data["min_len"], data["max_len"] = length.min(), length.max()
+    if not first_rows.apply(lambda x: isinstance(x, str)).all():
+        srs = srs.astype(str)  # srs must be a string to compute the value lengths
+    lengths = srs.str.len()
+    data["min_len"], data["max_len"] = lengths.min(), lengths.max()
 
     return data
 
@@ -247,7 +251,6 @@ def calc_stats(df: dd.DataFrame, dtype: Optional[DTypeDef]) -> Dict[str, Any]:
 
     ## if cfg.stats_enable
     dtype_cnts, num_cols = get_dtype_cnts_and_num_cols(df, dtype)
-    stats["nrows"] = df.shape[0]
     stats["ncols"] = df.shape[1]
     stats["npresent_cells"] = df.count().sum()
     stats["nrows_wo_dups"] = df.drop_duplicates().shape[0]
@@ -327,9 +330,8 @@ def format_cont(col: str, data: Dict[str, Any], nrows: int) -> Any:
         ins.append({"Missing": f"{col} has {nmiss} ({pmiss}%) missing values"})
 
     ## if cfg.insight.skewed_enable:
-    if data["skew"] >= 20:  ## cfg.insight.skewed_threshold
-        skew_val = np.round(data["skew"], 4)
-        ins.append({"Skewed": f"{col} is skewed (\u03B31 = {skew_val})"})
+    if data["skew"][1] < 1e-5:  ## cfg.insight.skewed_threshold
+        ins.append({"Skewed": f"{col} is skewed"})
 
     ## if cfg.insight.infinity_enable:
     pinf = round(data["ninf"] / nrows * 100, 2)

diff --git a/dataprep/eda/distribution/compute/univariate.py b/dataprep/eda/distribution/compute/univariate.py
@@ -18,7 +18,6 @@
     DTypeDef,
     Nominal,
     detect_dtype,
-    drop_null,
     is_dtype,
 )
 from ...intermediate import Intermediate
@@ -177,7 +176,7 @@ def nom_comps(
     except TypeError:
         srs = srs.astype(str)
     # drop null values
-    srs = drop_null(srs)
+    srs = srs.dropna()
 
     ## if cfg.bar_enable or cfg.pie_enable
     # counts of unique values in the series
@@ -223,7 +222,7 @@ def cont_comps(srs: dd.Series, bins: int) -> Dict[str, Any]:
     ## if cfg.stats_enable or cfg.hist_enable or
     # calculate the total number of rows then drop the missing values
     data["nrows"] = srs.shape[0]
-    srs = drop_null(srs)
+    srs = srs.dropna()
     ## if cfg.stats_enable
     # number of not null (present) values
     data["npres"] = srs.shape[0]
@@ -236,7 +235,6 @@ def cont_comps(srs: dd.Series, bins: int) -> Dict[str, Any]:
     ## if cfg.hist_enable or cfg.qqplot_enable and cfg.insights_enable:
     data["hist"] = da.histogram(srs, bins=bins, range=[data["min"], data["max"]])
     ## if cfg.insights_enable and (cfg.qqplot_enable or cfg.hist_enable):
-    # NOTE normal test does a .compute() and I cannot fix it with delayed
     data["norm"] = normaltest(data["hist"][0])
     ## if cfg.qqplot_enable
     data["qntls"] = srs.quantile(np.linspace(0.01, 0.99, 99))

diff --git a/dataprep/eda/distribution/render.py b/dataprep/eda/distribution/render.py
@@ -1636,7 +1636,7 @@ def cont_insights(data: Dict[str, Any], col: str) -> Dict[str, List[str]]:
 
     ## if cfg.insight.normal_enable:
     if data["norm"][1] > 0.99:
-        ins["hist"].append(f"{col} is normally distributed")
+        ins["Histogram"].append(f"{col} is normally distributed")
 
     ## if cfg.insight.uniform_enable:
     if data["chisq"][1] > 0.999:  ## cfg.insight.uniform_threshold