Skip to content

Commit

Permalink
feat(eda): add stat. in plot_missing
Browse files Browse the repository at this point in the history
  • Loading branch information
yuzhenmao committed Oct 22, 2020
1 parent b59faa7 commit 0f44f15
Show file tree
Hide file tree
Showing 3 changed files with 159 additions and 7 deletions.
154 changes: 152 additions & 2 deletions dataprep/eda/missing/compute/nullivariate.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""This module implements the plot_missing(df) function's
intermediate computation stage
"""
from typing import Any, Callable, Dict, Generator, Optional, Tuple
from typing import Any, Callable, Dict, Generator, Optional, Tuple, List

import dask.array as da
import dask.dataframe as dd
Expand All @@ -18,38 +18,108 @@
def _compute_missing_nullivariate(df: DataArray, bins: int) -> Generator[Any, Any, Intermediate]:
    """Calculate the data for visualizing the plot_missing(df).

    This contains the missing spectrum, missing bar chart, missing heatmap and
    missing dendrogram, plus the summary statistics and the textual insights.

    The function is a generator with two regions. Everything before the single
    ``yield`` only assembles the tuple of tasks; everything after it works on
    the values that the caller sends back in, in the same order as the tasks.

    Parameters
    ----------
    df
        the DataArray data_frame
    bins
        forwarded unchanged to ``missing_spectrum``

    Returns
    -------
    Intermediate
        built with the keys ``data_total_missing``, ``data_spectrum``,
        ``data_bars``, ``data_heatmap``, ``data_dendrogram``, ``visual_type``,
        ``missing_stat`` and ``insights``.
    """
    # pylint: disable=too-many-locals

    most_show = 5  # the most number of column/row to show in "insight"
    longest = 5  # the longest length of word to show in "insight"

    # NOTE(review): the return value is discarded; presumably this materializes
    # the DataArray in place -- confirm against the DataArray implementation.
    df.compute()

    nullity = df.nulls
    nrows = df.shape[0]
    ncols = df.shape[1]
    null_cnts = nullity.sum(axis=0)
    null_perc = null_cnts / nrows

    # Build the grand total once and derive every ratio from it instead of
    # repeating the same reduction for each statistic.
    total_missing = nullity.sum()
    miss_perc = total_missing / (nrows * ncols)
    avg_row = total_missing / nrows
    avg_col = total_missing / ncols

    tasks = (
        missing_spectrum(df, bins=bins),
        null_perc,
        missing_bars(null_cnts, df.columns.values, nrows),
        missing_heatmap(df),
        missing_dendrogram(df),
        total_missing,
        missing_col_cnt(df),
        missing_row_cnt(df),
        missing_most_col(df),
        missing_most_row(df),
        miss_perc,
        avg_row,
        avg_col,
    )

    ### Lazy Region End
    # The names below must stay in the same order as the entries of `tasks`.
    (
        spectrum,
        null_perc,
        bars,
        heatmap,
        dendrogram,
        cnt,
        col_cnt,
        row_cnt,
        most_col,
        most_row,
        miss_perc,
        avg_row,
        avg_col,
    ) = yield tasks
    ### Eager Region Begin

    # Keep only the columns whose missing ratio is neither 0 nor 1.
    sel = ~((null_perc == 0) | (null_perc == 1))
    heatmap = pd.DataFrame(
        data=heatmap[:, sel][sel, :], columns=df.columns[sel], index=df.columns[sel]
    )

    # most_col / most_row are (count, rate, indices) triples; when more entries
    # tie for the maximum than are listed, the list is marked as truncated.
    suffix_col = "" if most_col[0] <= most_show else ", ..."
    suffix_row = "" if most_row[0] <= most_show else ", ..."

    # Column labels are not guaranteed to be strings, so coerce them before
    # abbreviating and joining.
    col_names = ", ".join(abbr(str(df.columns[e]), longest) for e in most_col[2][:most_show])
    row_names = ", ".join(str(e) for e in most_row[2][:most_show])

    top_miss_col = str(most_col[0]) + "-col(s) " + "(" + col_names + suffix_col + ")"
    top_miss_row = str(most_row[0]) + "-row(s) " + "(" + row_names + suffix_row + ")"

    return Intermediate(
        data_total_missing={col: null_perc[idx] for idx, col in enumerate(df.columns)},
        data_spectrum=pd.DataFrame(spectrum),
        data_bars=bars,
        data_heatmap=heatmap,
        data_dendrogram=dendrogram,
        visual_type="missing_impact",
        missing_stat={
            "Missing Cells": cnt,
            "Missing Cells (%)": str(round(miss_perc * 100, 1)) + "%",
            "Missing Columns": col_cnt,
            "Missing Rows": row_cnt,
            "Avg Missing Cells per Column": round(avg_col, 2),
            "Avg Missing Cells per Row": round(avg_row, 2),
        },
        insights={
            "Bar Chart": [
                top_miss_col
                + " contain the most missing values with rate "
                + str(round(most_col[1] * 100, 1))
                + "%",
                top_miss_row
                + " contain the most missing columns with rate "
                + str(round(most_row[1] * 100, 1))
                + "%",
            ]
        },
    )


Expand Down Expand Up @@ -157,3 +227,83 @@ def missing_dendrogram(df: DataArray) -> Any:
)

return dendrogram


def missing_col_cnt(df: DataArray) -> Any:
    """Calculate how many columns contain missing values.

    Parameters
    ----------
    df
        the DataArray data_frame; only its ``nulls`` attribute is read

    Returns
    -------
    Any
        the number of columns whose null count is greater than zero
    """
    nulls_per_col = df.nulls.sum(0)

    # Counting the positive entries directly gives the same answer as masking
    # first and counting afterwards, without the extra boolean-indexing step.
    return (nulls_per_col > 0).sum()


def missing_row_cnt(df: DataArray) -> Any:
    """Calculate how many rows contain missing values.

    Parameters
    ----------
    df
        the DataArray data_frame; only its ``nulls`` attribute is read

    Returns
    -------
    Any
        the number of rows whose null count is greater than zero
    """
    nulls_per_row = df.nulls.sum(1)

    # Counting the positive entries directly gives the same answer as masking
    # first and counting afterwards, without the extra boolean-indexing step.
    return (nulls_per_row > 0).sum()


def missing_most_col(df: DataArray) -> Tuple[int, float, List[Any]]:
    """Locate the column(s) holding the largest number of missing values.

    Parameters
    ----------
    df
        the DataArray data_frame

    Returns
    -------
    cnt
        how many columns share the largest missing count
    rate
        that largest count divided by ``df.shape[0]``
    rst
        the positions of the columns that reach the largest count
    """
    per_col = df.nulls.sum(axis=0)
    peak = per_col.max()
    at_peak = per_col == peak

    return at_peak.sum(), peak / df.shape[0], da.where(at_peak)[0]


def missing_most_row(df: DataArray) -> Tuple[int, float, List[Any]]:
    """Locate the row(s) holding the largest number of missing values.

    Parameters
    ----------
    df
        the DataArray data_frame

    Returns
    -------
    cnt
        how many rows share the largest missing count
    rate
        that largest count divided by ``df.shape[1]``
    rst
        the positions of the rows that reach the largest count
    """
    per_row = df.nulls.sum(axis=1)
    peak = per_row.max()
    at_peak = per_row == peak

    return at_peak.sum(), peak / df.shape[1], da.where(at_peak)[0]


def abbr(name: str, longest: int) -> str:
    """Cut the name if it is too long.

    Parameters
    ----------
    name
        the label to shorten; non-string labels (for example integer column
        names) are converted with ``str`` first
    longest
        the maximum number of characters kept before ``"..."`` is appended

    Returns
    -------
    str
        the label unchanged when it has at most ``longest`` characters,
        otherwise its first ``longest`` characters followed by ``"..."``
    """
    # Column labels are not always strings; len() and slicing need text.
    text = str(name)
    if len(text) > longest:
        return text[:longest] + "..."
    return text
8 changes: 5 additions & 3 deletions dataprep/eda/missing/render.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,9 +310,7 @@ def create_color_mapper_heatmap(
return mapper, colorbar


def render_missing_impact(
itmdt: Intermediate, plot_width: int, plot_height: int
) -> Dict[str, List[Any]]:
def render_missing_impact(itmdt: Intermediate, plot_width: int, plot_height: int) -> Dict[str, Any]:
"""
Render correlation heatmaps in to tabs
"""
Expand All @@ -331,7 +329,11 @@ def render_missing_impact(
fig_dendrogram = render_dendrogram(itmdt["data_dendrogram"], plot_width, plot_height)
tabs.append(Panel(child=row(fig_dendrogram), title="Dendrogram"))

stat_dict = {name: itmdt["missing_stat"][name] for name in itmdt["missing_stat"]}

return {
"insights": itmdt["insights"],
"tabledata": {"Missing Statistics": stat_dict},
"layout": [panel.child.children[0] for panel in tabs],
"meta": [panel.title for panel in tabs],
}
Expand Down
4 changes: 2 additions & 2 deletions dataprep/eda/templates/tab_base.html
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@ <h3>{{ title }}</h3>
<div class="insight-panel-{{ context.rnd }}">
<ol>
{% for insight in context.insights[context.meta[loop.index]] %}
<li class="entry-{{ context.rnd }}">
{{ insight }}
<li class="entry-{{ context.rnd }}"><span
class="col-name-{{ context.rnd }}">{{ insight.split(' ')[0] }}</span>{{ insight.replace(insight.split(' ')[0], '') }}
</li>
{% endfor %}
</ol>
Expand Down

0 comments on commit 0f44f15

Please sign in to comment.