Skip to content

Commit

Permalink
perf(eda.plot): changed drop_null to dropna
Browse files Browse the repository at this point in the history
  • Loading branch information
Brandon Lockhart committed Sep 15, 2020
1 parent 1dbf297 commit 0a7fe56
Show file tree
Hide file tree
Showing 5 changed files with 35 additions and 27 deletions.
11 changes: 5 additions & 6 deletions dataprep/eda/distribution/compute/bivariate.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
DTypeDef,
Nominal,
detect_dtype,
drop_null,
is_dtype,
)
from .common import (
Expand Down Expand Up @@ -98,7 +97,7 @@ def compute_bivariate(
except TypeError:
df[x] = df[x].astype(str)

(comps,) = dask.compute(nom_cont_comps(drop_null(df), bins, ngroups, largest))
(comps,) = dask.compute(nom_cont_comps(df.dropna(), bins, ngroups, largest))

return Intermediate(
x=x, y=y, data=comps, ngroups=ngroups, visual_type="cat_and_num_cols"
Expand All @@ -110,7 +109,7 @@ def compute_bivariate(
and is_dtype(ytype, DateTime())
):
x, y = (x, y) if is_dtype(xtype, DateTime()) else (y, x)
df = drop_null(df[[x, y]])
df = df[[x, y]].dropna()
dtnum: List[Any] = []
# line chart
dtnum.append(dask.delayed(_calc_line_dt)(df, timeunit, agg))
Expand All @@ -131,7 +130,7 @@ def compute_bivariate(
and is_dtype(ytype, DateTime())
):
x, y = (x, y) if is_dtype(xtype, DateTime()) else (y, x)
df = drop_null(df[[x, y]])
df = df[[x, y]].dropna()
df[y] = df[y].apply(str, meta=(y, str))
dtcat: List[Any] = []
# line chart
Expand Down Expand Up @@ -160,7 +159,7 @@ def compute_bivariate(
except TypeError:
df[y] = df[y].astype(str)

(comps,) = dask.compute(drop_null(df).groupby([x, y]).size())
(comps,) = dask.compute(df.dropna().groupby([x, y]).size())

return Intermediate(
x=x,
Expand All @@ -171,7 +170,7 @@ def compute_bivariate(
visual_type="two_cat_cols",
)
elif is_dtype(xtype, Continuous()) and is_dtype(ytype, Continuous()):
df = drop_null(df[[x, y]])
df = df[[x, y]].dropna()

data: Dict[str, Any] = {}
# scatter plot data
Expand Down
11 changes: 10 additions & 1 deletion dataprep/eda/distribution/compute/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from scipy.stats import gaussian_kde as gaussian_kde_
from scipy.stats import ks_2samp as ks_2samp_
from scipy.stats import normaltest as normaltest_
from scipy.stats import skewtest as skewtest_

from ...dtypes import drop_null

Expand Down Expand Up @@ -233,5 +234,13 @@ def ks_2samp(data1: np.ndarray, data2: np.ndarray) -> Tuple[float, float]:
name="scipy-gaussian_kde", pure=True, nout=2
)
def gaussian_kde(arr: np.ndarray) -> Tuple[float, float]:
"""Delayed version of scipy ks_2samp."""
"""Delayed version of scipy gaussian_kde."""
return cast(Tuple[np.ndarray, np.ndarray], gaussian_kde_(arr))


@dask.delayed(  # pylint: disable=no-value-for-parameter
    name="scipy-skewtest", pure=True, nout=2
)
def skewtest(arr: np.ndarray) -> Tuple[float, float]:
    """
    Delayed wrapper around scipy's skewtest.

    The dask.delayed decorator (declared with nout=2) turns a call into a
    lazy task whose result is treated as a two-element output.

    Parameters
    ----------
    arr
        Array handed unchanged to scipy.stats.skewtest.

    Returns
    -------
    The scipy skewtest result, typed as a pair of floats for type checkers.
    """
    result = skewtest_(arr)
    return cast(Tuple[float, float], result)
32 changes: 17 additions & 15 deletions dataprep/eda/distribution/compute/overview.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import dask.dataframe as dd
import numpy as np
import pandas as pd
from dask.array.stats import chisquare, skew
from dask.array.stats import chisquare

from ....errors import UnreachableError
from ...dtypes import (
Expand All @@ -18,12 +18,11 @@
DTypeDef,
Nominal,
detect_dtype,
drop_null,
get_dtype_cnts_and_num_cols,
is_dtype,
)
from ...intermediate import Intermediate
from .common import _calc_line_dt, ks_2samp, normaltest
from .common import _calc_line_dt, ks_2samp, normaltest, skewtest


def compute_overview(
Expand Down Expand Up @@ -80,11 +79,11 @@ def compute_overview(
first_rows[col].apply(hash)
except TypeError:
srs = df[col] = srs.astype(str)
datas.append(calc_nom_col(drop_null(srs), ngroups, largest))
datas.append(calc_nom_col(srs.dropna(), first_rows[col], ngroups, largest))
col_names_dtypes.append((col, Nominal()))
elif is_dtype(col_dtype, Continuous()):
## if cfg.hist_enable or cfg.any_insights("hist"):
datas.append(calc_cont_col(drop_null(srs), bins))
datas.append(calc_cont_col(srs.dropna(), bins))
col_names_dtypes.append((col, Continuous()))
elif is_dtype(col_dtype, DateTime()):
datas.append(dask.delayed(_calc_line_dt)(df[[col]], timeunit))
Expand Down Expand Up @@ -145,10 +144,11 @@ def calc_cont_col(srs: dd.Series, bins: int) -> Dict[str, Any]:
data["npres"] = srs.shape[0]

## if cfg.insight.infinity_enable:
data["ninf"] = srs.isin({np.inf, -np.inf}).sum()
is_inf_srs = srs.isin({np.inf, -np.inf})
data["ninf"] = is_inf_srs.sum()

# remove infinite values
srs = srs[~srs.isin({np.inf, -np.inf})]
srs = srs[~is_inf_srs]

## if cfg.hist_enable or cfg.insight.uniform_enable or cfg.insight.normal_enable:
## bins = cfg.hist_bins
Expand All @@ -164,7 +164,7 @@ def calc_cont_col(srs: dd.Series, bins: int) -> Dict[str, Any]:
data["nneg"] = (srs < 0).sum()

## if cfg.insight.skew_enabled:
data["skew"] = skew(srs)
data["skew"] = skewtest(data["hist"][0])

## if cfg.insight.unique_enabled:
data["nuniq"] = srs.nunique()
Expand All @@ -176,7 +176,9 @@ def calc_cont_col(srs: dd.Series, bins: int) -> Dict[str, Any]:


## def calc_nom_col(srs: dd.Series, first_rows: pd.Series, cfg: Config)
def calc_nom_col(srs: dd.Series, ngroups: int, largest: bool) -> Dict[str, Any]:
def calc_nom_col(
srs: dd.Series, first_rows: pd.Series, ngroups: int, largest: bool
) -> Dict[str, Any]:
"""
Computations for a categorical column in plot(df)
Expand Down Expand Up @@ -222,8 +224,10 @@ def calc_nom_col(srs: dd.Series, ngroups: int, largest: bool) -> Dict[str, Any]:
## data["npresent"] = srs.shape[0]

## if cfg.insight.constant_length_enable:
length = srs.apply(lambda v: len(str(v)), meta=(srs.name, np.int64))
data["min_len"], data["max_len"] = length.min(), length.max()
if not first_rows.apply(lambda x: isinstance(x, str)).all():
srs = srs.astype(str) # srs must be a string to compute the value lengths
lengths = srs.str.len()
data["min_len"], data["max_len"] = lengths.min(), lengths.max()

return data

Expand All @@ -247,7 +251,6 @@ def calc_stats(df: dd.DataFrame, dtype: Optional[DTypeDef]) -> Dict[str, Any]:

## if cfg.stats_enable
dtype_cnts, num_cols = get_dtype_cnts_and_num_cols(df, dtype)
stats["nrows"] = df.shape[0]
stats["ncols"] = df.shape[1]
stats["npresent_cells"] = df.count().sum()
stats["nrows_wo_dups"] = df.drop_duplicates().shape[0]
Expand Down Expand Up @@ -327,9 +330,8 @@ def format_cont(col: str, data: Dict[str, Any], nrows: int) -> Any:
ins.append({"Missing": f"{col} has {nmiss} ({pmiss}%) missing values"})

## if cfg.insight.skewed_enable:
if data["skew"] >= 20: ## cfg.insight.skewed_threshold
skew_val = np.round(data["skew"], 4)
ins.append({"Skewed": f"{col} is skewed (\u03B31 = {skew_val})"})
if data["skew"][1] < 1e-5: ## cfg.insight.skewed_threshold
ins.append({"Skewed": f"{col} is skewed"})

## if cfg.insight.infinity_enable:
pinf = round(data["ninf"] / nrows * 100, 2)
Expand Down
6 changes: 2 additions & 4 deletions dataprep/eda/distribution/compute/univariate.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
DTypeDef,
Nominal,
detect_dtype,
drop_null,
is_dtype,
)
from ...intermediate import Intermediate
Expand Down Expand Up @@ -177,7 +176,7 @@ def nom_comps(
except TypeError:
srs = srs.astype(str)
# drop null values
srs = drop_null(srs)
srs = srs.dropna()

## if cfg.bar_enable or cfg.pie_enable
# counts of unique values in the series
Expand Down Expand Up @@ -223,7 +222,7 @@ def cont_comps(srs: dd.Series, bins: int) -> Dict[str, Any]:
## if cfg.stats_enable or cfg.hist_enable or
# calculate the total number of rows then drop the missing values
data["nrows"] = srs.shape[0]
srs = drop_null(srs)
srs = srs.dropna()
## if cfg.stats_enable
# number of not null (present) values
data["npres"] = srs.shape[0]
Expand All @@ -236,7 +235,6 @@ def cont_comps(srs: dd.Series, bins: int) -> Dict[str, Any]:
## if cfg.hist_enable or cfg.qqplot_enable and cfg.insights_enable:
data["hist"] = da.histogram(srs, bins=bins, range=[data["min"], data["max"]])
## if cfg.insights_enable and (cfg.qqplot_enable or cfg.hist_enable):
# NOTE normal test does a .compute() and I cannot fix it with delayed
data["norm"] = normaltest(data["hist"][0])
## if cfg.qqplot_enable
data["qntls"] = srs.quantile(np.linspace(0.01, 0.99, 99))
Expand Down
2 changes: 1 addition & 1 deletion dataprep/eda/distribution/render.py
Original file line number Diff line number Diff line change
Expand Up @@ -1636,7 +1636,7 @@ def cont_insights(data: Dict[str, Any], col: str) -> Dict[str, List[str]]:

## if cfg.insight.normal_enable:
if data["norm"][1] > 0.99:
ins["hist"].append(f"{col} is normally distributed")
ins["Histogram"].append(f"{col} is normally distributed")

## if cfg.insight.uniform_enable:
if data["chisq"][1] > 0.999: ## cfg.insight.uniform_threshold
Expand Down

0 comments on commit 0a7fe56

Please sign in to comment.