Skip to content

Commit

Permalink
fix(eda.create_report): fix constant column error
Browse files Browse the repository at this point in the history
Fix the errors raised when a column has a constant value or all of its entries are
NaN.

close #565
  • Loading branch information
dylanzxc authored and jinglinpeng committed Apr 21, 2021
1 parent 8b17302 commit 160844a
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 23 deletions.
35 changes: 31 additions & 4 deletions dataprep/eda/create_report/formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,8 +178,6 @@ def format_basic(df: dd.DataFrame, cfg: Config) -> Dict[str, Any]:
"plots_tab": zip(comp[1][1:], rndrd["meta"][1:], insight_keys),
"insights_tab": rndrd["insights"],
}
# for div, tab, key in res["variables"][col]['plots_tab']:
# print(div)
else:
res["has_variables"] = False

Expand Down Expand Up @@ -267,13 +265,19 @@ def basic_computations(
# variables
if cfg.variables.enable:
for col in df.columns:
if is_dtype(detect_dtype(df.frame[col]), Continuous()):
data[col] = cont_comps(df.frame[col], cfg)
npres = dask.compute(df.frame[col].dropna().shape[0])
# A numerical column that is all-NaN would throw an error,
# so we transform it into a categorical column.
if npres[0] == 0:
df.frame[col] = df.frame[col].astype(str)
data[col] = nom_comps(df.frame[col], df.frame[col].head(), cfg)
elif is_dtype(detect_dtype(df.frame[col]), Nominal()):
# An object column in which some cells are numerical would throw an error,
# so we transform the column to string first.
df.frame[col] = df.frame[col].astype(str)
data[col] = nom_comps(df.frame[col], first_rows[col], cfg)
elif is_dtype(detect_dtype(df.frame[col]), Continuous()):
data[col] = cont_comps(df.frame[col], cfg)
elif is_dtype(detect_dtype(df.frame[col]), DateTime()):
data[col] = {}
data[col]["stats"] = calc_stats_dt(df.frame[col])
Expand Down Expand Up @@ -323,3 +327,26 @@ def basic_computations(

else:
return data


# def cont_comps_brief(srs: dd.Series, cfg: Config) -> Dict[str, Any]:
#
# """
# Computations required for constant column and all-nan column
# """
# # pylint: disable=too-many-branches
# data: Dict[str, Any] = {}
#
# data["nrows"] = srs.shape[0] # total rows
# srs = srs.dropna()
# data["npres"] = srs.shape[0] # number of present (not null) values
# data["mean"] = srs.mean()
# data["min"] = srs.min()
# data["max"] = srs.max()
# data["nreals"] = srs.shape[0]
# data["nzero"] = (srs == 0).sum()
# data["nneg"] = (srs < 0).sum()
# data["mem_use"] = srs.memory_usage(deep=True)
# data["nuniq"] = srs.nunique_approx()
#
# return data
27 changes: 10 additions & 17 deletions dataprep/eda/distribution/compute/univariate.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from typing import Any, Dict, List, Optional

import math
import dask
import dask.array as da
import dask.dataframe as dd
Expand Down Expand Up @@ -207,37 +208,28 @@ def cont_comps(srs: dd.Series, cfg: Config) -> Dict[str, Any]:

if cfg.stats.enable or cfg.hist.enable:
data["nrows"] = srs.shape[0] # total rows

srs = srs.dropna()

if cfg.stats.enable:
data["npres"] = srs.shape[0] # number of present (not null) values

srs = srs[~srs.isin({np.inf, -np.inf})] # remove infinite values

if cfg.hist.enable or cfg.qqnorm.enable and cfg.insight.enable:
data["hist"] = da.histogram(srs, cfg.hist.bins, (srs.min(), srs.max()))
if cfg.insight.enable:
data["norm"] = normaltest(data["hist"][0])

if cfg.hist.enable and cfg.insight.enable:
data["chisq"] = chisquare(data["hist"][0])

# compute only the required amount of quantiles
if cfg.qqnorm.enable:
data["qntls"] = srs.quantile(np.linspace(0.01, 0.99, 99))
elif cfg.stats.enable:
data["qntls"] = srs.quantile([0.05, 0.25, 0.5, 0.75, 0.95])
elif cfg.box.enable:
data["qntls"] = srs.quantile([0.25, 0.5, 0.75])

if cfg.stats.enable or cfg.hist.enable and cfg.insight.enable:
data["skew"] = skew(srs)

if cfg.stats.enable or cfg.qqnorm.enable:
data["mean"] = srs.mean()
data["std"] = srs.std()

if cfg.stats.enable:
data["min"] = srs.min()
data["max"] = srs.max()
Expand All @@ -246,18 +238,19 @@ def cont_comps(srs: dd.Series, cfg: Config) -> Dict[str, Any]:
data["nneg"] = (srs < 0).sum()
data["kurt"] = kurtosis(srs)
data["mem_use"] = srs.memory_usage(deep=True)

# compute the density histogram
if cfg.kde.enable:
data["dens"] = da.histogram(srs, cfg.kde.bins, (srs.min(), srs.max()), density=True)
# gaussian kernel density estimate
data["kde"] = gaussian_kde(
srs.map_partitions(lambda x: x.sample(min(1000, x.shape[0])), meta=srs)
)

# To avoid the singular matrix problem, gaussian_kde needs a non-zero std.
if not math.isclose(dask.compute(data["min"])[0], dask.compute(data["max"])[0]):
data["dens"] = da.histogram(srs, cfg.kde.bins, (srs.min(), srs.max()), density=True)
# gaussian kernel density estimate
data["kde"] = gaussian_kde(
srs.map_partitions(lambda x: x.sample(min(1000, x.shape[0])), meta=srs)
)
else:
data["kde"] = None
if cfg.box.enable:
data.update(_calc_box(srs, data["qntls"], cfg))

if cfg.value_table.enable:
value_counts = srs.value_counts(sort=False)
if cfg.stats.enable:
Expand Down
7 changes: 5 additions & 2 deletions dataprep/eda/distribution/render.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from typing import Any, Dict, List, Optional, Tuple, Union
import json
import os
import math

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -1875,11 +1876,13 @@ def render_num(itmdt: Intermediate, cfg: Config) -> Dict[str, Any]:
tabs.append(Panel(child=row(fig), title="Histogram"))
htgs["Histogram"] = cfg.hist.how_to_guide(plot_height, plot_width)
if cfg.kde.enable:
if data["kde"] is not None:
# When the column is constant, we won't display the KDE plot.
if data["kde"] is not None and (not math.isclose(data["min"], data["max"])):
dens, kde = data["dens"], data["kde"]
tabs.append(kde_viz(dens, kde, col, plot_width, plot_height, cfg.kde))
htgs["KDE Plot"] = cfg.kde.how_to_guide(plot_height, plot_width)
if cfg.qqnorm.enable:
if cfg.qqnorm.enable and (not math.isclose(data["min"], data["max"])):
# When the column is constant, we won't display the Q-Q plot.
if data["qntls"].any():
qntls, mean, std = data["qntls"], data["mean"], data["std"]
tabs.append(qqnorm_viz(qntls, mean, std, col, plot_width, plot_height, cfg.qqnorm))
Expand Down
17 changes: 17 additions & 0 deletions dataprep/tests/eda/test_create_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,13 @@ def simpledf() -> pd.DataFrame:
return df


@pytest.fixture(scope="module")  # type: ignore
def constantdf() -> pd.DataFrame:
    """Return a 10-row frame with two constant columns and one all-NaN column.

    Column ``a`` is all zeros, ``b`` is all ones and ``c`` is all ``np.nan``.
    """
    n_rows = 10
    columns = {
        "a": [0] * n_rows,
        "b": [1] * n_rows,
        "c": [np.nan] * n_rows,
    }
    return pd.DataFrame(columns)


def test_report(simpledf: pd.DataFrame) -> None:
from sys import platform

Expand All @@ -56,3 +63,13 @@ def test_report_show(simpledf: pd.DataFrame) -> None:
matplotlib.use("PS")
report = create_report(simpledf, mode="basic")
report.show()


def test_report_constant(constantdf: pd.DataFrame) -> None:
    """Build a basic report from the constant / all-NaN fixture frame.

    There is no explicit assertion: the test passes as long as
    ``create_report`` does not raise.
    """
    import sys

    # On macOS, select matplotlib's "PS" backend before building the report.
    if sys.platform == "darwin":
        import matplotlib

        matplotlib.use("PS")

    create_report(constantdf, mode="basic")

0 comments on commit 160844a

Please sign in to comment.