feat(eda): enrich plot_correlation
yuzhenmao committed Nov 19, 2020
1 parent d5cc7bd commit 29c444e
Showing 8 changed files with 3,828 additions and 69 deletions.
6 changes: 5 additions & 1 deletion dataprep/eda/container.py
@@ -48,6 +48,7 @@ def __init__(
elif "_column" in visual_type or visual_type in (
"missing_impact",
"missing_impact_1v1",
"correlation_impact",
):
# todo: param management
if to_render.get("tabledata"):
@@ -61,7 +62,10 @@ def __init__(
"title": "DataPrep.EDA Report",
"rnd": random.randint(100, 999), # for multiple cells running in the same notebook
}
self.template_base = ENV_LOADER.get_template("tab_base.html")
if visual_type == "correlation_impact":
self.template_base = ENV_LOADER.get_template("tab_base_corr.html")
else:
self.template_base = ENV_LOADER.get_template("tab_base.html")
else:
raise TypeError(f"Unsupported Visual Type: {visual_type}.")

12 changes: 8 additions & 4 deletions dataprep/eda/correlation/__init__.py
@@ -11,6 +11,7 @@
from ..report import Report
from .compute import compute_correlation
from .render import render_correlation
from ..container import Container

__all__ = ["render_correlation", "compute_correlation", "plot_correlation"]

@@ -23,7 +24,7 @@ def plot_correlation(
value_range: Optional[Tuple[float, float]] = None,
k: Optional[int] = None,
progress: bool = True,
) -> Report:
) -> Union[Report, Container]:
"""
This function is designed to calculate the correlation between columns.
There are three functions: plot_correlation(df), plot_correlation(df, x)
@@ -65,7 +66,10 @@
and it is better to drop None, NaN and Null values before using it
"""
with ProgressBar(minimum=1, disable=not progress):
intermediate = compute_correlation(df, x=x, y=y, value_range=value_range, k=k)
figure = render_correlation(intermediate)
itmdt = compute_correlation(df, x=x, y=y, value_range=value_range, k=k)
fig = render_correlation(itmdt)

return Report(figure)
if itmdt.visual_type == "correlation_impact" or "_column" in itmdt.visual_type:
return Container(fig, itmdt.visual_type)
else:
return Report(fig)
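
With this change, plot_correlation(df) without x or y produces the new "correlation_impact" intermediate and therefore returns a Container (tabbed heatmaps plus a stats table and insights) rather than a plain Report. A minimal usage sketch, assuming a dataprep build that includes this commit, a hypothetical numerical DataFrame, and that the Container renders itself through the usual notebook repr:

import pandas as pd
from dataprep.eda import plot_correlation

# Hypothetical data; any numerical DataFrame works.
df = pd.DataFrame({
    "a": [1, 2, 3, 4, 5],
    "b": [5, 4, 3, 2, 1],
    "c": [2, 1, 4, 3, 5],
})

# No x/y: compute_correlation yields the "correlation_impact" intermediate,
# so a Container comes back instead of a Report.
out = plot_correlation(df)
out  # as the last expression of a notebook cell, this displays the tabbed report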
154 changes: 152 additions & 2 deletions dataprep/eda/correlation/compute/nullivariate.py
@@ -3,7 +3,7 @@
Currently this boils down to pandas' implementation."""

from functools import partial
from typing import Dict, Optional, Tuple
from typing import Dict, Optional, Tuple, List, Any

import dask
import dask.array as da
@@ -13,6 +13,7 @@
from ...data_array import DataArray
from ...intermediate import Intermediate
from .common import CorrelationMethod
from ...utils import cut_long_name


def _calc_nullivariate(
@@ -21,6 +22,10 @@ def _calc_nullivariate(
value_range: Optional[Tuple[float, float]] = None,
k: Optional[int] = None,
) -> Intermediate:
# pylint: disable=too-many-statements,too-many-locals,too-many-branches

most_show = 6 # the most number of column/row to show in "insight"
# longest = 5 # the longest length of word to show in "insight"

if value_range is not None and k is not None:
raise ValueError("value_range and k cannot be present in both")
@@ -31,6 +36,38 @@
# So we do them in pandas

(corrs,) = dask.compute(corrs)
pearson_corr, spearman_corr, kendalltau_corr = corrs.values()

pearson_pos_max, pearson_neg_max, pearson_mean, pearson_pos_cols, pearson_neg_cols = most_corr(
pearson_corr
)
(
spearman_pos_max,
spearman_neg_max,
spearman_mean,
spearman_pos_cols,
spearman_neg_cols,
) = most_corr(spearman_corr)
(
kendalltau_pos_max,
kendalltau_neg_max,
kendalltau_mean,
kendalltau_pos_cols,
kendalltau_neg_cols,
) = most_corr(kendalltau_corr)
pearson_min, pearson_cols = least_corr(pearson_corr)
spearman_min, spearman_cols = least_corr(spearman_corr)
kendalltau_min, kendalltau_cols = least_corr(kendalltau_corr)

p_p_corr = create_string("positive", pearson_pos_cols, most_show, df)
s_p_corr = create_string("positive", spearman_pos_cols, most_show, df)
k_p_corr = create_string("positive", kendalltau_pos_cols, most_show, df)
p_n_corr = create_string("negative", pearson_neg_cols, most_show, df)
s_n_corr = create_string("negative", spearman_neg_cols, most_show, df)
k_n_corr = create_string("negative", kendalltau_neg_cols, most_show, df)
p_corr = create_string("least", pearson_cols, most_show, df)
s_corr = create_string("least", spearman_cols, most_show, df)
k_corr = create_string("least", kendalltau_cols, most_show, df)

dfs = {}
for method, corr in corrs.items():
Expand All @@ -55,7 +92,34 @@ def _calc_nullivariate(
return Intermediate(
data=dfs,
axis_range=list(df.columns.unique()),
visual_type="correlation_heatmaps",
visual_type="correlation_impact",
tabledata={
"Highest Positive Correlation": {
"Pearson": pearson_pos_max,
"Spearman": spearman_pos_max,
"KendallTau": kendalltau_pos_max,
},
"Highest Negative Correlation": {
"Pearson": pearson_neg_max,
"Spearman": spearman_neg_max,
"KendallTau": kendalltau_neg_max,
},
"Lowest Correlation": {
"Pearson": pearson_min,
"Spearman": spearman_min,
"KendallTau": kendalltau_min,
},
"Mean Correlation": {
"Pearson": pearson_mean,
"Spearman": spearman_mean,
"KendallTau": kendalltau_mean,
},
},
insights={
"Pearson": [p_p_corr, p_n_corr, p_corr],
"Spearman": [s_p_corr, s_n_corr, s_corr],
"KendallTau": [k_p_corr, k_n_corr, k_corr],
},
)


@@ -110,6 +174,92 @@ def _kendall_tau_nxn(df: DataArray) -> da.Array:
)


def most_corr(corrs: np.ndarray) -> Tuple[float, float, float, List[Any], List[Any]]:
"""Find the most correlated columns."""
positive_col_set = set()
negative_col_set = set()
corrs_copy = corrs
for i in range(corrs_copy.shape[0]):
corrs_copy[i, i] = 0
mean = corrs_copy.mean()
p_maximum = corrs_copy.max()
n_maximum = (-corrs_copy).max()

if p_maximum != 0:
p_col1, p_col2 = np.where(corrs_copy == p_maximum)
else:
p_col1, p_col2 = [], []
if n_maximum != 0:
n_col1, n_col2 = np.where(corrs_copy == -n_maximum)
else:
n_col1, n_col2 = [], []

for i, _ in enumerate(p_col1):
if p_col1[i] < p_col2[i]:
positive_col_set.add((p_col1[i], p_col2[i]))
elif p_col1[i] > p_col2[i]:
positive_col_set.add((p_col2[i], p_col1[i]))
for i, _ in enumerate(n_col1):
if n_col1[i] < n_col2[i]:
negative_col_set.add((n_col1[i], n_col2[i]))
elif n_col1[i] > n_col2[i]:
negative_col_set.add((n_col2[i], n_col1[i]))

return (
round(p_maximum, 3),
round(-n_maximum, 3),
round(mean, 3),
list(positive_col_set),
list(negative_col_set),
)


def least_corr(corrs: np.ndarray) -> Tuple[float, List[Any]]:
"""Find the least correlated columns."""
col_set = set()
corrs_copy = corrs
for i in range(corrs_copy.shape[0]):
corrs_copy[i, i] = 2
minimum = abs(corrs_copy).min()
col1, col2 = np.where(corrs_copy == minimum)

for i, _ in enumerate(col1):
if col1[i] < col2[i]:
col_set.add((col1[i], col2[i]))
elif col1[i] > col2[i]:
col_set.add((col2[i], col1[i]))

return round(minimum, 3), list(col_set)


def create_string(flag: str, source: List[Any], most_show: int, df: DataArray) -> str:
"""Create the output string"""
suffix = "" if len(source) <= most_show else ", ..."
if flag == "positive":
prefix = "Most positive correlated: "
temp = "Most positive correlated: None"
elif flag == "negative":
prefix = "Most negative correlated: "
temp = "Most negative correlated: None"
elif flag == "least":
prefix = "Least correlated: "
temp = "Least correlated: None"

if source != []:
out = (
prefix
+ ", ".join(
"(" + cut_long_name(df.columns[e[0]]) + ", " + cut_long_name(df.columns[e[1]]) + ")"
for e in source[:most_show]
)
+ suffix
)
else:
out = temp

return out


## The code below implements the correlation algorithms for arrays. Since we don't have
## block-wise algorithms for spearman and kendalltau, it might be more suitable
## to just use the pandas version of correlation.
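
A minimal sketch of what the new insight helpers compute, using a toy symmetric correlation matrix. The matrix and the expected values in the comments are illustrative assumptions, not taken from the commit, and the import assumes a dataprep build that includes this commit; note that both helpers overwrite the diagonal of the array they are given.

import numpy as np
from dataprep.eda.correlation.compute.nullivariate import most_corr, least_corr

# Toy 3x3 correlation matrix with unit diagonal (illustrative only).
corr = np.array([
    [1.0, 0.9, -0.2],
    [0.9, 1.0, 0.1],
    [-0.2, 0.1, 1.0],
])

# most_corr zeroes the diagonal, then returns the strongest positive and
# negative off-diagonal values, the mean, and the column-index pairs
# attaining the extremes: here roughly (0.9, -0.2, ..., [(0, 1)], [(0, 2)]).
pos_max, neg_max, mean, pos_cols, neg_cols = most_corr(corr)

# least_corr looks for the smallest absolute off-diagonal value:
# here roughly (0.1, [(1, 2)]).
minimum, least_cols = least_corr(corr)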
59 changes: 58 additions & 1 deletion dataprep/eda/correlation/render.py
@@ -2,7 +2,7 @@
This module implements the visualization for
plot_correlation(df) function
"""
from typing import List, Optional, Sequence, Tuple
from typing import List, Optional, Sequence, Tuple, Any, Dict

import numpy as np
from bokeh.layouts import column, row
@@ -58,6 +58,8 @@ def render_correlation(
"""
if itmdt.visual_type is None:
visual_elem = Figure()
elif itmdt.visual_type == "correlation_impact":
visual_elem = render_correlation_impact(itmdt, plot_width, plot_height, palette or RDBU)
elif itmdt.visual_type == "correlation_heatmaps":
visual_elem = render_correlation_heatmaps(itmdt, plot_width, plot_height, palette or RDBU)
elif itmdt.visual_type == "correlation_single_heatmaps":
@@ -123,6 +125,61 @@ def tweak_figure(fig: Figure) -> None:
fig.yaxis.formatter = FuncTickFormatter(code=format_js)


def render_correlation_impact(
itmdt: Intermediate, plot_width: int, plot_height: int, palette: Sequence[str]
) -> Dict[str, Any]:
"""
Render correlation heatmaps into tabs
"""
tabs: List[Panel] = []
tooltips = [("x", "@x"), ("y", "@y"), ("correlation", "@correlation{1.11}")]
axis_range = itmdt["axis_range"]

for method, df in itmdt["data"].items():
# in case of numerical column names
df = df.copy()
df["x"] = df["x"].apply(str)
df["y"] = df["y"].apply(str)

mapper, color_bar = create_color_mapper(palette)
x_range = FactorRange(*axis_range)
y_range = FactorRange(*reversed(axis_range))
fig = Figure(
x_range=x_range,
y_range=y_range,
plot_width=plot_width,
plot_height=plot_height,
x_axis_location="below",
tools="hover",
toolbar_location=None,
tooltips=tooltips,
background_fill_color="#fafafa",
)

tweak_figure(fig)

fig.rect(
x="x",
y="y",
width=1,
height=1,
source=df,
fill_color={"field": "correlation", "transform": mapper},
line_color=None,
)

fig.add_layout(color_bar, "right")
tab = Panel(child=fig, title=method)
tabs.append(tab)

return {
"insights": itmdt["insights"],
"tabledata": itmdt["tabledata"],
"layout": [panel.child for panel in tabs],
"meta": [panel.title for panel in tabs],
}


def render_correlation_heatmaps(
itmdt: Intermediate, plot_width: int, plot_height: int, palette: Sequence[str]
) -> Tabs:
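
Unlike the other render branches, render_correlation_impact returns a plain dict ("layout", "meta", "tabledata", "insights") rather than a Bokeh layout, and the Container assembles the tabs from it. A hedged sketch of the lower-level pipeline plot_correlation now follows internally, assuming a dataprep build with this commit and a hypothetical DataFrame:

import pandas as pd
from dataprep.eda.correlation import compute_correlation, render_correlation
from dataprep.eda.container import Container

df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [4, 3, 2, 1], "c": [2, 4, 1, 3]})

itmdt = compute_correlation(df)        # itmdt.visual_type == "correlation_impact"
to_render = render_correlation(itmdt)  # dict with "layout", "meta", "tabledata", "insights"
report = Container(to_render, itmdt.visual_type)
report  # displays the tabbed correlation report in a notebook cell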
17 changes: 5 additions & 12 deletions dataprep/eda/missing/compute/nullivariate.py
@@ -9,6 +9,7 @@
import pandas as pd
from dask import delayed
from scipy.cluster import hierarchy
from ...utils import cut_long_name

from ...data_array import DataArray
from ...intermediate import Intermediate
@@ -21,7 +22,7 @@ def _compute_missing_nullivariate(df: DataArray, bins: int) -> Generator[Any, An
# pylint: disable=too-many-locals

most_show = 5 # the most number of column/row to show in "insight"
longest = 5 # the longest length of word to show in "insight"
# longest = 5 # the longest length of word to show in "insight"

df.compute()

@@ -78,18 +79,18 @@

top_miss_col = (
str(most_col[0])
+ "-col(s) "
+ " col(s): "
+ str(
"("
+ ", ".join(abbr(df.columns[e], longest) for e in most_col[2][:most_show])
+ ", ".join(cut_long_name(df.columns[e]) for e in most_col[2][:most_show])
+ suffix_col
+ ")"
)
)

top_miss_row = (
str(most_row[0])
+ "-row(s) "
+ " row(s): "
+ str("(" + ", ".join(str(e) for e in most_row[2][:most_show]) + suffix_row + ")")
)

@@ -299,11 +300,3 @@ def missing_most_row(df: DataArray) -> Tuple[int, float, List[Any]]:
rst = da.where(row_sum == maximum)[0]

return cnt, rate, rst


def abbr(name: str, longest: int) -> str:
"""Cut the name if it is too long."""
if len(name) > longest:
return str(name[0:longest] + "...")
else:
return name
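
For context on this replacement: the removed local abbr helper truncated names at a hard, caller-chosen cutoff, which could produce strings like "colum..." in the old insights; the shared cut_long_name from dataprep.eda.utils takes its place here. A minimal sketch of the removed helper's behavior (example names are made up):

def abbr(name: str, longest: int) -> str:
    """The removed helper: cut the name if it is too long."""
    if len(name) > longest:
        return str(name[0:longest] + "...")
    return name

print(abbr("age", 5))          # age
print(abbr("column_name", 5))  # colum...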
8 changes: 4 additions & 4 deletions dataprep/eda/templates/tab_base.html
@@ -42,13 +42,13 @@ <h3>{{ title }}</h3>
<input type='checkbox' style='display: none' id="ib-{{ context.rnd }}-{{ loop.index0 }}" class="insight-check-{{ context.rnd }}">
<label class="insight-btn-{{ context.rnd }}" for="ib-{{ context.rnd }}-{{ loop.index0 }}"></label>
<div class="insight-panel-{{ context.rnd }}">
<ol>
<ul>
{% for insight in context.insights[context.meta[loop.index]] %}
<li class="entry-{{ context.rnd }}"><span
class="col-name-{{ context.rnd }}">{{ insight.split(' ')[0] }}</span>{{ insight.replace(insight.split(' ')[0], '') }}
<li class="entry-{{ context.rnd }}">
{{ insight }}
</li>
{% endfor %}
</ol>
</ul>
</div>
</div>
{% endif %}
