feat(eda): add stat. in plot_missing

sfu-db · Oct 22, 2020 · 0f44f15 · 0f44f15
1 parent b59faa7
commit 0f44f15
Show file tree

Hide file tree

Showing 3 changed files with 159 additions and 7 deletions.
diff --git a/dataprep/eda/missing/compute/nullivariate.py b/dataprep/eda/missing/compute/nullivariate.py
@@ -1,7 +1,7 @@
 """This module implements the plot_missing(df) function's
 calculating intermediate part
 """
-from typing import Any, Callable, Dict, Generator, Optional, Tuple
+from typing import Any, Callable, Dict, Generator, Optional, Tuple, List
 
 import dask.array as da
 import dask.dataframe as dd
@@ -18,38 +18,108 @@
 def _compute_missing_nullivariate(df: DataArray, bins: int) -> Generator[Any, Any, Intermediate]:
     """Calculate the data for visualizing the plot_missing(df).
     This contains the missing spectrum, missing bar chart and missing heatmap."""
+    # pylint: disable=too-many-locals
+
+    most_show = 5  # maximum number of columns/rows listed in an "insight"
+    longest = 5  # maximum number of characters of a name shown in an "insight"
 
     df.compute()
 
     nullity = df.nulls
     null_cnts = nullity.sum(axis=0)
     nrows = df.shape[0]
+    ncols = df.shape[1]
     null_perc = null_cnts / nrows
+    miss_perc = nullity.sum() / (nrows * ncols)
+    avg_row = nullity.sum() / nrows
+    avg_col = nullity.sum() / ncols
 
     tasks = (
         missing_spectrum(df, bins=bins),
         null_perc,
         missing_bars(null_cnts, df.columns.values, nrows),
         missing_heatmap(df),
         missing_dendrogram(df),
+        nullity.sum(),
+        missing_col_cnt(df),
+        missing_row_cnt(df),
+        missing_most_col(df),
+        missing_most_row(df),
+        miss_perc,
+        avg_row,
+        avg_col,
     )
 
     ### Lazy Region End
-    spectrum, null_perc, bars, heatmap, dendrogram = yield tasks
+    (
+        spectrum,
+        null_perc,
+        bars,
+        heatmap,
+        dendrogram,
+        cnt,
+        col_cnt,
+        row_cnt,
+        most_col,
+        most_row,
+        miss_perc,
+        avg_row,
+        avg_col,
+    ) = yield tasks
     ### Eager Region Begin
 
     sel = ~((null_perc == 0) | (null_perc == 1))
     heatmap = pd.DataFrame(
         data=heatmap[:, sel][sel, :], columns=df.columns[sel], index=df.columns[sel]
     )
 
+    suffix_col = "" if most_col[0] <= most_show else ", ..."
+    suffix_row = "" if most_row[0] <= most_show else ", ..."
+
+    top_miss_col = (
+        str(most_col[0])
+        + "-col(s) "
+        + str(
+            "("
+            + ", ".join(abbr(df.columns[e], longest) for e in most_col[2][:most_show])
+            + suffix_col
+            + ")"
+        )
+    )
+
+    top_miss_row = (
+        str(most_row[0])
+        + "-row(s) "
+        + str("(" + ", ".join(str(e) for e in most_row[2][:most_show]) + suffix_row + ")")
+    )
+
     return Intermediate(
         data_total_missing={col: null_perc[idx] for idx, col in enumerate(df.columns)},
         data_spectrum=pd.DataFrame(spectrum),
         data_bars=bars,
         data_heatmap=heatmap,
         data_dendrogram=dendrogram,
         visual_type="missing_impact",
+        missing_stat={
+            "Missing Cells": cnt,
+            "Missing Cells (%)": str(round(miss_perc * 100, 1)) + "%",
+            "Missing Columns": col_cnt,
+            "Missing Rows": row_cnt,
+            "Avg Missing Cells per Column": round(avg_col, 2),
+            "Avg Missing Cells per Row": round(avg_row, 2),
+        },
+        insights={
+            "Bar Chart": [
+                top_miss_col
+                + " contain the most missing values with rate "
+                + str(round(most_col[1] * 100, 1))
+                + "%",
+                top_miss_row
+                + " contain the most missing columns with rate "
+                + str(round(most_row[1] * 100, 1))
+                + "%",
+            ]
+        },
     )
 
 
@@ -157,3 +227,83 @@ def missing_dendrogram(df: DataArray) -> Any:
     )
 
     return dendrogram
+
+
+def missing_col_cnt(df: DataArray) -> Any:
+    """Calculate how many columns contain missing values."""
+    nulls = df.nulls
+    rst = nulls.sum(0)
+    rst = rst[rst > 0]
+
+    return (rst > 0).sum()
+
+
+def missing_row_cnt(df: DataArray) -> Any:
+    """Calculate how many rows contain missing values."""
+    nulls = df.nulls
+    rst = nulls.sum(1)
+    rst = rst[rst > 0]
+
+    return (rst > 0).sum()
+
+
+def missing_most_col(df: DataArray) -> Tuple[int, float, List[Any]]:
+    """Find which column has the most number of missing values.
+
+    Parameters
+    ----------
+    df
+        the DataArray data_frame
+
+    Outputs
+    -------
+    cnt
+        the count of columns having the most missing values
+    rate
+        the highest rate of missing values in one column
+    rst
+        a list of column indices with highest missing rate
+    """
+    nulls = df.nulls
+    col_sum = nulls.sum(axis=0)
+    maximum = col_sum.max()
+    rate = maximum / df.shape[0]
+    cnt = (col_sum == maximum).sum()
+    rst = da.where(col_sum == maximum)[0]
+
+    return cnt, rate, rst
+
+
+def missing_most_row(df: DataArray) -> Tuple[int, float, List[Any]]:
+    """Find which row has the most number of missing values.
+
+    Parameters
+    ----------
+    df
+        the DataArray data_frame
+
+    Outputs
+    -------
+    cnt
+        the count of rows having the most missing values
+    rate
+        the highest rate of missing values in one row
+    rst
+        a list of row indices with highest missing rate
+    """
+    nulls = df.nulls
+    row_sum = nulls.sum(axis=1)
+    maximum = row_sum.max()
+    rate = maximum / df.shape[1]
+    cnt = (row_sum == maximum).sum()
+    rst = da.where(row_sum == maximum)[0]
+
+    return cnt, rate, rst
+
+
+def abbr(name: str, longest: int) -> str:
+    """Cut the name if it is too long."""
+    if len(name) > longest:
+        return str(name[0:longest] + "...")
+    else:
+        return name
diff --git a/dataprep/eda/missing/render.py b/dataprep/eda/missing/render.py
@@ -310,9 +310,7 @@ def create_color_mapper_heatmap(
     return mapper, colorbar
 
 
-def render_missing_impact(
-    itmdt: Intermediate, plot_width: int, plot_height: int
-) -> Dict[str, List[Any]]:
+def render_missing_impact(itmdt: Intermediate, plot_width: int, plot_height: int) -> Dict[str, Any]:
     """
     Render correlation heatmaps in to tabs
     """
@@ -331,7 +329,11 @@ def render_missing_impact(
     fig_dendrogram = render_dendrogram(itmdt["data_dendrogram"], plot_width, plot_height)
     tabs.append(Panel(child=row(fig_dendrogram), title="Dendrogram"))
 
+    stat_dict = {name: itmdt["missing_stat"][name] for name in itmdt["missing_stat"]}
+
     return {
+        "insights": itmdt["insights"],
+        "tabledata": {"Missing Statistics": stat_dict},
         "layout": [panel.child.children[0] for panel in tabs],
         "meta": [panel.title for panel in tabs],
     }

diff --git a/dataprep/eda/templates/tab_base.html b/dataprep/eda/templates/tab_base.html
@@ -44,8 +44,8 @@ <h3>{{ title }}</h3>
                     <div class="insight-panel-{{ context.rnd }}">
                         <ol>
                             {% for insight in context.insights[context.meta[loop.index]] %}
-                            <li class="entry-{{ context.rnd }}">
-                                {{ insight }}
+                            {# Highlight the first word; strip only its leading occurrence so repeats later in the sentence are kept. #}
+                            <li class="entry-{{ context.rnd }}"><span
+                                class="col-name-{{ context.rnd }}">{{ insight.split(' ')[0] }}</span>{{ insight.replace(insight.split(' ')[0], '', 1) }}
                             </li>
                             {% endfor %}
                         </ol>