Commit
perf(eda): optimize plot_missing and plot_corr
Brandon Lockhart committed Oct 4, 2020
1 parent ecc0d6c commit b46036d
Showing 6 changed files with 80 additions and 77 deletions.
dataprep/eda/correlation/compute/__init__.py (8 changes: 6 additions & 2 deletions)
@@ -8,6 +8,8 @@
 from .bivariate import _calc_bivariate
 from .nullivariate import _calc_nullivariate
 from .univariate import _calc_univariate
+from ...dtypes import NUMERICAL_DTYPES
+from ...utils import to_dask
 
 __all__ = ["compute_correlation"]
 
@@ -34,8 +36,10 @@ def compute_correlation(
     k
         Choose top-k element
     """
-
-    df = DataArray(df).select_num_columns()
+    if x is not None and y is not None:
+        df = to_dask(df.select_dtypes(NUMERICAL_DTYPES))
+    else:
+        df = DataArray(df).select_num_columns()
 
     if x is None and y is None:  # pylint: disable=no-else-return
         return _calc_nullivariate(df, value_range=value_range, k=k)
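The branch added above only fires for the scatter-plot case (both x and y supplied): compute_correlation then hands _calc_bivariate a plain dask DataFrame restricted to numerical columns instead of building a DataArray. A minimal sketch of that selection step, using dd.from_pandas in place of dataprep's to_dask helper and a stand-in for its NUMERICAL_DTYPES constant (both of those names are assumptions here):

import dask.dataframe as dd
import numpy as np
import pandas as pd

# Stand-in for dataprep's NUMERICAL_DTYPES constant (assumed to cover numpy numeric dtypes).
NUMERICAL_DTYPES = [np.number]

def to_dask_numerical(df: pd.DataFrame, npartitions: int = 2) -> dd.DataFrame:
    """Keep only numerical columns and wrap the result in a dask DataFrame."""
    return dd.from_pandas(df.select_dtypes(include=NUMERICAL_DTYPES), npartitions=npartitions)

pdf = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [4, 5, 6], "c": ["x", "y", "z"]})
print(to_dask_numerical(pdf).columns.tolist())  # ['a', 'b'] -- the object column is dropped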
dataprep/eda/correlation/compute/bivariate.py (86 changes: 31 additions & 55 deletions)
@@ -5,45 +5,27 @@
 from typing import Optional, Tuple
 
 import dask
+import dask.dataframe as dd
 import dask.array as da
 import numpy as np
-import pandas as pd
 
-
-from ...data_array import DataArray
 from ...intermediate import Intermediate
 
 
 def _calc_bivariate(
-    df: DataArray,
-    x: Optional[str] = None,
-    y: Optional[str] = None,
-    *,
-    k: Optional[int] = None,
+    df: dd.DataFrame, x: str, y: str, *, k: Optional[int] = None,
 ) -> Intermediate:
     if x not in df.columns:
         raise ValueError(f"{x} not in columns names")
     if y not in df.columns:
         raise ValueError(f"{y} not in columns names")
 
-    xname, yname = x, y
-
-    df.compute()
-
-    xloc = df.columns.get_loc(x)
-    yloc = df.columns.get_loc(y)
+    df = df[[x, y]].dropna()
+    coeffs, df_smp, influences = scatter_with_regression(df, sample_size=1000, k=k)
 
-    x = df.values[:, xloc]
-    y = df.values[:, yloc]
-    coeffs, (x, y), influences = scatter_with_regression(x, y, k=k, sample_size=1000,)
+    coeffs, df_smp, influences = dask.compute(coeffs, df_smp, influences)
 
-    coeffs, (x, y), influences = dask.compute(coeffs, (x, y), influences)
-
     # lazy/eager border line
-    result = {
-        "coeffs": coeffs,
-        "data": pd.DataFrame({xname: x, yname: y}),
-    }
+    result = {"coeffs": coeffs, "data": df_smp}
 
     if (influences is None) != (k is None):
         raise RuntimeError("Not possible")
@@ -55,51 +37,45 @@ def _calc_bivariate(
         labels[infidx[-k:]] = "-"  # type: ignore
         # pylint: enable=invalid-unary-operand-type
         labels[infidx[:k]] = "+"
-        result["data"]["influence"] = labels
+        result["data"]["influence"] = labels  # type: ignore
 
     return Intermediate(**result, visual_type="correlation_scatter")
 
 
 def scatter_with_regression(
-    x: da.Array, y: da.Array, sample_size: int, k: Optional[int] = None
+    df: dd.DataFrame, sample_size: int, k: Optional[int] = None
 ) -> Tuple[Tuple[da.Array, da.Array], Tuple[da.Array, da.Array], Optional[da.Array]]:
     """Calculate pearson correlation on 2 given arrays.
     Parameters
     ----------
-    xarr : da.Array
-    yarr : da.Array
-    sample_size : int
+    df
+        dataframe
+    sample_size
         Number of points to show in the scatter plot
     k : Optional[int] = None
         Highlight k points which influence pearson correlation most
     """
     if k == 0:
         raise ValueError("k should be larger than 0")
 
-    xp1 = da.vstack([x, da.ones_like(x)]).T
-    xp1 = xp1.rechunk((xp1.chunks[0], -1))
-
-    mask = ~(da.isnan(x) | da.isnan(y))
-    # if chunk size in the first dimension is 1, lstsq will use sfqr instead of tsqr,
-    # where the former does not support nan in shape.
-
-    if len(xp1.chunks[0]) == 1:
-        xp1 = xp1.rechunk((2, -1))
-        y = y.rechunk((2, -1))
-        mask = mask.rechunk((2, -1))
-
-    (coeffa, coeffb), _, _, _ = da.linalg.lstsq(xp1[mask], y[mask])
-
-    if sample_size < x.shape[0]:
-        samplesel = da.random.choice(x.shape[0], int(sample_size), chunks=x.chunksize)
-        x = x[samplesel]
-        y = y[samplesel]
-
-    if k is None:
-        return (coeffa, coeffb), (x, y), None
+    df["ones"] = 1
+    arr = df.to_dask_array(lengths=True)
+
+    (coeffa, coeffb), _, _, _ = da.linalg.lstsq(arr[:, [0, 2]], arr[:, 1])
+
+    df = df.drop(columns=["ones"])
+    df_smp = df.map_partitions(
+        lambda x: x.sample(min(sample_size, x.shape[0])), meta=df
+    )
+    # TODO influences should not be computed on a sample
+    influences = (
+        pearson_influence(
+            df_smp[df.columns[0]].to_dask_array(lengths=True),
+            df_smp[df.columns[1]].to_dask_array(lengths=True),
+        )
+        if k
+        else None
+    )
 
-    influences = pearson_influence(x, y)
-    return (coeffa, coeffb), (x, y), influences
+    return (coeffa, coeffb), df_smp, influences
 
 
 def pearson_influence(xarr: da.Array, yarr: da.Array) -> da.Array:
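The rewritten scatter_with_regression stays lazy end to end: a constant column is appended so da.linalg.lstsq solves for slope and intercept in one shot on the dask-array view of the frame, and the scatter points come from a per-partition sample rather than the full data. A self-contained sketch of the same pattern on toy data (the column names, noise model, and sample size are illustrative, not taken from the library):

import dask
import dask.array as da
import dask.dataframe as dd
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
pdf = pd.DataFrame({"x": rng.normal(size=1000)})
pdf["y"] = 2 * pdf["x"] + 1 + rng.normal(scale=0.1, size=1000)
df = dd.from_pandas(pdf, npartitions=4)

# Append a column of ones so lstsq fits slope and intercept together.
df["ones"] = 1
arr = df.to_dask_array(lengths=True)  # lengths=True materializes chunk sizes so slicing works
(slope, intercept), _, _, _ = da.linalg.lstsq(arr[:, [0, 2]], arr[:, 1])

# Sample each partition for plotting instead of pulling every row.
sample_size = 100
df = df.drop(columns=["ones"])
df_smp = df.map_partitions(lambda part: part.sample(min(sample_size, len(part))), meta=df)

slope, intercept, df_smp = dask.compute(slope, intercept, df_smp)
print(round(float(slope), 2), round(float(intercept), 2), len(df_smp))  # ~2.0, ~1.0, 400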
dataprep/eda/distribution/compute/bivariate.py (3 changes: 2 additions & 1 deletion)
@@ -170,7 +170,8 @@ def compute_bivariate(
             visual_type="two_cat_cols",
         )
     elif is_dtype(xtype, Continuous()) and is_dtype(ytype, Continuous()):
-        df = df[[x, y]].dropna()
+        # one partition required for apply(pd.cut) in calc_box_num
+        df = df[[x, y]].dropna().repartition(npartitions=1)
 
         data: Dict[str, Any] = {}
         # scatter plot data
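The repartition added above matters because pd.cut applied partition-wise derives its bin edges from whatever rows each partition happens to hold; collapsing to a single partition makes every row share one set of edges. calc_box_num itself is not part of this diff, so the helper below is only an illustration of that effect:

import dask.dataframe as dd
import pandas as pd

pdf = pd.DataFrame({"x": range(100), "y": range(100)})
df = dd.from_pandas(pdf, npartitions=4)

def bin_labels(part: pd.DataFrame) -> pd.Series:
    # Edges come from the partition's own min/max.
    return pd.cut(part["x"], bins=3).astype(str)

# Four partitions are binned independently: 12 distinct intervals.
print(df.map_partitions(bin_labels, meta=("x", "object")).nunique().compute())

# One partition shares a single set of edges: 3 intervals.
print(df.repartition(npartitions=1).map_partitions(bin_labels, meta=("x", "object")).nunique().compute())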
dataprep/eda/distribution/render.py (6 changes: 4 additions & 2 deletions)
@@ -572,7 +572,7 @@ def box_viz(
         plot_height=plot_height,
         title=title,
         toolbar_location=None,
-        x_range=list(df["grp"]),
+        x_range=list(df["grp"].astype(str)),
     )
     low = fig.segment(x0="x0", y0="lw", x1="x1", y1="lw", line_color="black", source=df)
     ltail = fig.segment(
@@ -807,6 +807,7 @@ def line_viz(
     # pylint: disable=too-many-arguments,too-many-locals
     palette = CATEGORY20 * (len(df) // len(CATEGORY20) + 1)
     title = _make_title({f"{x}_ttl": ttl_grps, f"{x}_shw": len(df)}, x, y)
+    df.index = df.index.astype(str)
 
     fig = figure(
         plot_height=plot_height,
@@ -1525,7 +1526,7 @@ def nom_insights(data: Dict[str, Any], col: str) -> Dict[str, List[str]]:
 
     ## if cfg.insight.attribution_enable
     if data["pie"][:2].sum() / data["nrows"] > 0.5 and len(data["pie"]) >= 2:
-        vals = ", ".join(data["pie"].index[i] for i in range(2))
+        vals = ", ".join(str(data["pie"].index[i]) for i in range(2))
         ins["Pie Chart"].append(f"The top 2 categories ({vals}) take over 50%")
 
     ## if cfg.insight.high_word_cardinlaity_enable
@@ -1768,6 +1769,7 @@ def render_two_cat(itmdt: Intermediate, plot_width: int, plot_height: int,) -> T
     y_lrgst = ygrps.nlargest(itmdt["nsubgroups"])
     df = df[df[y].isin(y_lrgst.index)]
     stats.update(zip((f"{y}_ttl", f"{y}_shw"), (len(ygrps), len(y_lrgst))))
+    df[[x, y]] = df[[x, y]].astype(str)
 
     # final format
     df = df.pivot_table(index=y, columns=x, values="cnt", fill_value=0, aggfunc="sum")
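All four render.py edits are the same fix: Bokeh treats a categorical range as a list of string factors, so numeric group labels have to be cast with astype(str) before they reach x_range, a pivot index, or a joined insight string. A minimal sketch of the pattern with made-up data (bokeh 2.x style, matching the plot_height keyword used in the code above):

import pandas as pd
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure, show

# Group labels are often numeric (years, bin ids); categorical factors must be str.
df = pd.DataFrame({"grp": [2018, 2019, 2020], "cnt": [5, 9, 4]})
df["grp"] = df["grp"].astype(str)

source = ColumnDataSource(df)
fig = figure(x_range=list(df["grp"]), plot_height=300, title="counts per group")
fig.vbar(x="grp", top="cnt", width=0.9, source=source)
show(fig)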
dataprep/eda/missing/compute/common.py (27 changes: 27 additions & 0 deletions)
@@ -15,6 +15,33 @@
 LABELS = ["With Missing", "Missing Dropped"]
 
 
+def uni_histogram(
+    srs: dd.Series, bins: int, dtype: Optional[DTypeDef] = None,
+) -> Tuple[da.Array, ...]:
+    """Calculate "histogram" for both numerical and categorical."""
+
+    if is_dtype(detect_dtype(srs, dtype), Continuous()):
+
+        counts, edges = da.histogram(srs, bins, range=[srs.min(), srs.max()])
+        centers = (edges[:-1] + edges[1:]) / 2
+
+        return counts, centers, edges
+
+    elif is_dtype(detect_dtype(srs, dtype), Nominal()):
+        # Dask array's unique is way slower than the values_counts on Series
+        # See https://github.com/dask/dask/issues/2851
+        # centers, counts = da.unique(arr, return_counts=True)
+
+        value_counts = srs.value_counts()
+
+        counts = value_counts.to_dask_array()
+        centers = value_counts.index.to_dask_array()
+
+        return (counts, centers)
+    else:
+        raise ValueError(f"Unsupported dtype {srs.dtype}")
+
+
 def histogram(
     arr: da.Array,
     bins: Optional[int] = None,
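uni_histogram gives the missing-value plots one entry point for both column kinds: continuous columns go through da.histogram over [min, max], categorical columns through value_counts (da.unique being much slower, per the linked dask issue). A rough usage sketch against plain dask objects, with the dataprep dtype dispatch (detect_dtype / is_dtype) replaced by picking the branch by hand:

import dask
import dask.array as da
import dask.dataframe as dd
import pandas as pd

# Continuous branch: fixed-width histogram over the column's [min, max].
num = dd.from_pandas(pd.Series([1.0, 2.0, 2.5, 4.0, 7.5]), npartitions=2)
arr = num.to_dask_array(lengths=True)
lo, hi = dask.compute(arr.min(), arr.max())
counts, edges = da.histogram(arr, bins=3, range=[lo, hi])
centers = (edges[:-1] + edges[1:]) / 2
print(counts.compute(), centers)

# Nominal branch: value_counts on the Series, then convert to dask arrays.
cat = dd.from_pandas(pd.Series(["a", "b", "a", "c", "a"]), npartitions=2)
vc = cat.value_counts()
print(dask.compute(vc.index.to_dask_array(), vc.to_dask_array()))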
dataprep/eda/missing/compute/univariate.py (27 changes: 10 additions & 17 deletions)
@@ -8,43 +8,36 @@
 
 from ...data_array import DataArray
 from ...dtypes import (
     Continuous,
     DTypeDef,
     Nominal,
     detect_dtype,
     is_dtype,
 )
 from ...intermediate import ColumnsMetadata, Intermediate
 from ...staged import staged
-from .common import LABELS, histogram
+from .common import LABELS, uni_histogram
 
 
 def _compute_missing_univariate(  # pylint: disable=too-many-locals
     df: DataArray, x: str, bins: int, dtype: Optional[DTypeDef] = None,
 ) -> Generator[Any, Any, Intermediate]:
     """Calculate the distribution change on other columns when
     the missing values in x is dropped."""
-    j = df.columns.get_loc(x)
 
+    # dataframe with all rows where column x is null removed
+    ddf = df.frame[~df.frame[x].isna()]
+
     hists = {}
 
-    for i in range(len(df.columns)):
-        if i == j:
+    for col in df.columns:
+        if col == x:
             continue
-        col_name = df.columns[i]
-
-        col0 = df.values[~df.nulls[:, i], i].astype(df.dtypes[col_name])
-        col1 = df.values[~(df.nulls[:, j] | df.nulls[:, i]), i].astype(
-            df.dtypes[col_name]
-        )
-
-        hist_range = None  # pylint: disable=redefined-builtin
-        if is_dtype(detect_dtype(col0, dtype), Continuous()):
-            hist_range = (col0.min(axis=0), col0.max(axis=0))
+        srs0 = df.frame[col].dropna()  # series from original dataframe
+        srs1 = ddf[col].dropna()  # series with null rows from col x removed
 
-        hists[col_name] = [
-            histogram(col, dtype=dtype, bins=bins, return_edges=True, range=hist_range)
-            for col in [col0, col1]
+        hists[col] = [
+            uni_histogram(srs, bins=bins, dtype=dtype) for srs in [srs0, srs1]
         ]
 
     ### Lazy Region End
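The loop now works on dask Series instead of positional masks over df.values: the frame with x's null rows dropped is built once, and each remaining column is paired as (original column, column after the drop) before both go through uni_histogram. A toy sketch of that pairing (the column names are invented and the histogram step is left out):

import dask.dataframe as dd
import pandas as pd

pdf = pd.DataFrame(
    {"x": [1, None, 3, None, 5], "a": [10, 20, 30, 40, 50], "b": list("vwxyz")}
)
frame = dd.from_pandas(pdf, npartitions=2)
x = "x"

# Drop the rows where x is missing once, up front.
ddf = frame[~frame[x].isna()]

pairs = {}
for col in frame.columns:
    if col == x:
        continue
    srs0 = frame[col].dropna()  # distribution over the original data
    srs1 = ddf[col].dropna()    # distribution after dropping rows where x is null
    pairs[col] = (srs0, srs1)

print(len(pairs["a"][0]), len(pairs["a"][1]))  # 5 vs 3 rows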
