fix(eda.create_report): fix constant column error

Fix the errors raised when a column has a constant value or when all of its entries are NaN. Closes #565.
sfu-db · Apr 21, 2021 · 160844a · 160844a
1 parent 8b17302
commit 160844a
Show file tree

Hide file tree

Showing 4 changed files with 63 additions and 23 deletions.
diff --git a/dataprep/eda/create_report/formatter.py b/dataprep/eda/create_report/formatter.py
@@ -178,8 +178,6 @@ def format_basic(df: dd.DataFrame, cfg: Config) -> Dict[str, Any]:
                 "plots_tab": zip(comp[1][1:], rndrd["meta"][1:], insight_keys),
                 "insights_tab": rndrd["insights"],
             }
-            # for div, tab, key in res["variables"][col]['plots_tab']:
-            #     print(div)
     else:
         res["has_variables"] = False
 
@@ -267,13 +265,19 @@ def basic_computations(
     # variables
     if cfg.variables.enable:
         for col in df.columns:
-            if is_dtype(detect_dtype(df.frame[col]), Continuous()):
-                data[col] = cont_comps(df.frame[col], cfg)
+            npres = dask.compute(df.frame[col].dropna().shape[0])
+            # An all-NaN numerical column would throw an error,
+            # so we convert it to a categorical column instead.
+            if npres[0] == 0:
+                df.frame[col] = df.frame[col].astype(str)
+                data[col] = nom_comps(df.frame[col], df.frame[col].head(), cfg)
             elif is_dtype(detect_dtype(df.frame[col]), Nominal()):
                 # Since it will throw error if column is object while some cells are
                 # numerical, we transform column to string first.
                 df.frame[col] = df.frame[col].astype(str)
                 data[col] = nom_comps(df.frame[col], first_rows[col], cfg)
+            elif is_dtype(detect_dtype(df.frame[col]), Continuous()):
+                data[col] = cont_comps(df.frame[col], cfg)
             elif is_dtype(detect_dtype(df.frame[col]), DateTime()):
                 data[col] = {}
                 data[col]["stats"] = calc_stats_dt(df.frame[col])
@@ -323,3 +327,26 @@ def basic_computations(
 
     else:
         return data
+
+
+# def cont_comps_brief(srs: dd.Series, cfg: Config) -> Dict[str, Any]:
+#
+#     """
+#     Computations required for constant column and all-nan column
+#     """
+#     # pylint: disable=too-many-branches
+#     data: Dict[str, Any] = {}
+#
+#     data["nrows"] = srs.shape[0]  # total rows
+#     srs = srs.dropna()
+#     data["npres"] = srs.shape[0]  # number of present (not null) values
+#     data["mean"] = srs.mean()
+#     data["min"] = srs.min()
+#     data["max"] = srs.max()
+#     data["nreals"] = srs.shape[0]
+#     data["nzero"] = (srs == 0).sum()
+#     data["nneg"] = (srs < 0).sum()
+#     data["mem_use"] = srs.memory_usage(deep=True)
+#     data["nuniq"] = srs.nunique_approx()
+#
+#     return data
diff --git a/dataprep/eda/distribution/compute/univariate.py b/dataprep/eda/distribution/compute/univariate.py
@@ -4,6 +4,7 @@
 
 from typing import Any, Dict, List, Optional
 
+import math
 import dask
 import dask.array as da
 import dask.dataframe as dd
@@ -207,37 +208,28 @@ def cont_comps(srs: dd.Series, cfg: Config) -> Dict[str, Any]:
 
     if cfg.stats.enable or cfg.hist.enable:
         data["nrows"] = srs.shape[0]  # total rows
-
     srs = srs.dropna()
-
     if cfg.stats.enable:
         data["npres"] = srs.shape[0]  # number of present (not null) values
-
     srs = srs[~srs.isin({np.inf, -np.inf})]  # remove infinite values
-
     if cfg.hist.enable or cfg.qqnorm.enable and cfg.insight.enable:
         data["hist"] = da.histogram(srs, cfg.hist.bins, (srs.min(), srs.max()))
         if cfg.insight.enable:
             data["norm"] = normaltest(data["hist"][0])
-
     if cfg.hist.enable and cfg.insight.enable:
         data["chisq"] = chisquare(data["hist"][0])
-
     # compute only the required amount of quantiles
     if cfg.qqnorm.enable:
         data["qntls"] = srs.quantile(np.linspace(0.01, 0.99, 99))
     elif cfg.stats.enable:
         data["qntls"] = srs.quantile([0.05, 0.25, 0.5, 0.75, 0.95])
     elif cfg.box.enable:
         data["qntls"] = srs.quantile([0.25, 0.5, 0.75])
-
     if cfg.stats.enable or cfg.hist.enable and cfg.insight.enable:
         data["skew"] = skew(srs)
-
     if cfg.stats.enable or cfg.qqnorm.enable:
         data["mean"] = srs.mean()
         data["std"] = srs.std()
-
     if cfg.stats.enable:
         data["min"] = srs.min()
         data["max"] = srs.max()
@@ -246,18 +238,19 @@ def cont_comps(srs: dd.Series, cfg: Config) -> Dict[str, Any]:
         data["nneg"] = (srs < 0).sum()
         data["kurt"] = kurtosis(srs)
         data["mem_use"] = srs.memory_usage(deep=True)
-
     # compute the density histogram
     if cfg.kde.enable:
-        data["dens"] = da.histogram(srs, cfg.kde.bins, (srs.min(), srs.max()), density=True)
-        # gaussian kernel density estimate
-        data["kde"] = gaussian_kde(
-            srs.map_partitions(lambda x: x.sample(min(1000, x.shape[0])), meta=srs)
-        )
-
+        # To avoid the singular matrix problem, gaussian_kde needs a non-zero std.
+        if not math.isclose(dask.compute(data["min"])[0], dask.compute(data["max"])[0]):
+            data["dens"] = da.histogram(srs, cfg.kde.bins, (srs.min(), srs.max()), density=True)
+            # gaussian kernel density estimate
+            data["kde"] = gaussian_kde(
+                srs.map_partitions(lambda x: x.sample(min(1000, x.shape[0])), meta=srs)
+            )
+        else:
+            data["kde"] = None
     if cfg.box.enable:
         data.update(_calc_box(srs, data["qntls"], cfg))
-
     if cfg.value_table.enable:
         value_counts = srs.value_counts(sort=False)
         if cfg.stats.enable:

diff --git a/dataprep/eda/distribution/render.py b/dataprep/eda/distribution/render.py
@@ -5,6 +5,7 @@
 from typing import Any, Dict, List, Optional, Tuple, Union
 import json
 import os
+import math
 
 import numpy as np
 import pandas as pd
@@ -1875,11 +1876,13 @@ def render_num(itmdt: Intermediate, cfg: Config) -> Dict[str, Any]:
         tabs.append(Panel(child=row(fig), title="Histogram"))
         htgs["Histogram"] = cfg.hist.how_to_guide(plot_height, plot_width)
     if cfg.kde.enable:
-        if data["kde"] is not None:
+        # When the column is constant, we won't display the KDE plot.
+        if data["kde"] is not None and (not math.isclose(data["min"], data["max"])):
             dens, kde = data["dens"], data["kde"]
             tabs.append(kde_viz(dens, kde, col, plot_width, plot_height, cfg.kde))
             htgs["KDE Plot"] = cfg.kde.how_to_guide(plot_height, plot_width)
-    if cfg.qqnorm.enable:
+    if cfg.qqnorm.enable and (not math.isclose(data["min"], data["max"])):
+        # When the column is constant, we won't display the Q-Q plot.
         if data["qntls"].any():
             qntls, mean, std = data["qntls"], data["mean"], data["std"]
             tabs.append(qqnorm_viz(qntls, mean, std, col, plot_width, plot_height, cfg.qqnorm))

diff --git a/dataprep/tests/eda/test_create_report.py b/dataprep/tests/eda/test_create_report.py
@@ -37,6 +37,13 @@ def simpledf() -> pd.DataFrame:
     return df
 
 
+@pytest.fixture(scope="module")  # type: ignore
+def constantdf() -> pd.DataFrame:
+    df = pd.DataFrame({"a": [0] * 10, "b": [1] * 10, "c": [np.nan] * 10})
+
+    return df
+
+
 def test_report(simpledf: pd.DataFrame) -> None:
     from sys import platform
 
@@ -56,3 +63,13 @@ def test_report_show(simpledf: pd.DataFrame) -> None:
         matplotlib.use("PS")
     report = create_report(simpledf, mode="basic")
     report.show()
+
+
+def test_report_constant(constantdf: pd.DataFrame) -> None:
+    from sys import platform
+
+    if platform == "darwin":
+        import matplotlib
+
+        matplotlib.use("PS")
+    create_report(constantdf, mode="basic")