Skip to content

Commit

Permalink
feat(eda): add stat. in plot_missing
Browse files Browse the repository at this point in the history
  • Loading branch information
yuzhenmao committed Oct 20, 2020
1 parent b3ea118 commit 67fc594
Show file tree
Hide file tree
Showing 3 changed files with 144 additions and 6 deletions.
138 changes: 137 additions & 1 deletion dataprep/eda/missing/compute/nullivariate.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@
from ...intermediate import Intermediate
from ...staged import staged

# Upper bound on how many top-missing column/row indices are collected and
# listed by name in the generated insight text; longer lists end with "...".
__most__ = 5
# Maximum number of leading characters abbr() keeps before appending "...".
__longest__ = 5


def _compute_missing_nullivariate(df: DataArray, bins: int) -> Generator[Any, Any, Intermediate]:
"""Calculate the data for visualizing the plot_missing(df).
Expand All @@ -24,32 +27,101 @@ def _compute_missing_nullivariate(df: DataArray, bins: int) -> Generator[Any, An
nullity = df.nulls
null_cnts = nullity.sum(axis=0)
nrows = df.shape[0]
ncols = df.shape[1]
null_perc = null_cnts / nrows
miss_perc = nullity.sum() / (nrows * ncols)
avg_row = nullity.sum() / nrows
avg_col = nullity.sum() / ncols

tasks = (
missing_spectrum(df, bins=bins),
null_perc,
missing_bars(null_cnts, df.columns.values, nrows),
missing_heatmap(df),
missing_dendrogram(df),
nullity.sum(),
missing_col_cnt(df),
missing_row_cnt(df),
missing_most_col(df),
missing_most_row(df),
miss_perc,
avg_row,
avg_col,
)

### Lazy Region End
spectrum, null_perc, bars, heatmap, dendrogram = yield tasks
(
spectrum,
null_perc,
bars,
heatmap,
dendrogram,
cnt,
col_cnt,
row_cnt,
most_col,
most_row,
miss_perc,
avg_row,
avg_col,
) = yield tasks
### Eager Region Begin

sel = ~((null_perc == 0) | (null_perc == 1))
heatmap = pd.DataFrame(
data=heatmap[:, sel][sel, :], columns=df.columns[sel], index=df.columns[sel]
)

if most_col[0] <= __most__:
top_miss_col = (
str(most_col[0])
+ "-col(s) "
+ str("(" + ", ".join(abbr(df.columns[e]) for e in most_col[2]) + ")")
)
else:
top_miss_col = (
str(most_col[0])
+ "-col(s) "
+ str("(" + ", ".join(abbr(df.columns[e]) for e in most_col[2]) + ", ..." + ")")
)
if most_row[0] <= __most__:
top_miss_row = (
str(most_row[0]) + "-row(s) " + str("(" + ", ".join(str(e) for e in most_row[2]) + ")")
)
else:
top_miss_row = (
str(most_row[0])
+ "-row(s) "
+ str("(" + ", ".join(str(e) for e in most_row[2]) + ", ..." + ")")
)

return Intermediate(
data_total_missing={col: null_perc[idx] for idx, col in enumerate(df.columns)},
data_spectrum=pd.DataFrame(spectrum),
data_bars=bars,
data_heatmap=heatmap,
data_dendrogram=dendrogram,
visual_type="missing_impact",
missing_stat={
"Missing Cells": cnt,
"Missing Cells (%)": str(round(miss_perc * 100, 1)) + "%",
"Missing Columns": col_cnt,
"Missing Rows": row_cnt,
"Avg Missing Cells per Column": round(avg_col, 2),
"Avg Missing Cells per Row": round(avg_row, 2),
},
insights={
"Bar Chart": [
top_miss_col
+ " contain the most missing values with rate "
+ str(round(most_col[1] * 100, 1))
+ "%",
top_miss_row
+ " contain the most missing columns with rate "
+ str(round(most_row[1] * 100, 1))
+ "%",
]
},
)


Expand Down Expand Up @@ -157,3 +229,67 @@ def missing_dendrogram(df: DataArray) -> Any:
)

return dendrogram


def missing_col_cnt(df: DataArray) -> int:
    """Count the columns that contain at least one missing value.

    Parameters
    ----------
    df
        Data object whose ``nulls`` attribute is a 2-D mask (rows x columns)
        that is truthy wherever a cell is missing.

    Returns
    -------
    int
        Number of columns holding at least one missing cell.
    """
    # One array reduction along the row axis replaces the former Python-level
    # loop that ran ``True in nulls[:, col]`` once per column.
    # ``int(...)`` keeps the plain-int return type of the original.
    return int(df.nulls.any(axis=0).sum())


def missing_row_cnt(df: DataArray) -> int:
    """Count the rows that contain at least one missing value.

    Parameters
    ----------
    df
        Data object whose ``nulls`` attribute is a 2-D mask (rows x columns)
        that is truthy wherever a cell is missing.

    Returns
    -------
    int
        Number of rows holding at least one missing cell.
    """
    # One array reduction along the column axis replaces the former
    # Python-level loop that ran ``True in nulls[row, :]`` once per row.
    # ``int(...)`` keeps the plain-int return type of the original.
    return int(df.nulls.any(axis=1).sum())


def missing_most_col(df: DataArray) -> Tuple[int, float, list]:
    """Locate the column(s) holding the largest number of missing values.

    Returns a tuple ``(cnt, rate, rst)``:

    * ``cnt``  - how many columns tie for the largest per-column null count,
    * ``rate`` - that largest count divided by the number of rows,
    * ``rst``  - positional indices of the tied columns in column order,
      collection stops once ``__most__`` indices have been gathered.
    """
    per_col = df.nulls.sum(axis=0)
    peak = max(per_col)
    share = peak / df.shape[0]
    tied = sum(per_col == peak)

    leaders = []
    for position, total in enumerate(per_col):
        # Stop as soon as the cap is reached; later ties are only counted.
        if len(leaders) >= __most__:
            break
        if total == peak:
            leaders.append(position)

    return tied, share, leaders


def missing_most_row(df: DataArray) -> Tuple[int, float, list]:
    """Locate the row(s) holding the largest number of missing values.

    Returns a tuple ``(cnt, rate, rst)``:

    * ``cnt``  - how many rows tie for the largest per-row null count,
    * ``rate`` - that largest count divided by the number of columns,
    * ``rst``  - positional indices of the tied rows in row order,
      collection stops once ``__most__`` indices have been gathered.
    """
    per_row = df.nulls.sum(axis=1)
    peak = max(per_row)
    share = peak / df.shape[1]
    tied = sum(per_row == peak)

    leaders = []
    for position, total in enumerate(per_row):
        # Stop as soon as the cap is reached; later ties are only counted.
        if len(leaders) >= __most__:
            break
        if total == peak:
            leaders.append(position)

    return tied, share, leaders


def abbr(name: Any) -> str:
    """Return a display form of ``name``, truncated if it is too long.

    Parameters
    ----------
    name
        Label to shorten. Column labels are not guaranteed to be strings
        (integer labels are common), so the value is converted with ``str``
        first instead of failing on ``len(name)``.

    Returns
    -------
    str
        The text unchanged when it has at most ``__longest__`` characters,
        otherwise its first ``__longest__`` characters followed by ``"..."``.
    """
    text = str(name)
    if len(text) > __longest__:
        return text[:__longest__] + "..."
    return text
8 changes: 5 additions & 3 deletions dataprep/eda/missing/render.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,9 +310,7 @@ def create_color_mapper_heatmap(
return mapper, colorbar


def render_missing_impact(
itmdt: Intermediate, plot_width: int, plot_height: int
) -> Dict[str, List[Any]]:
def render_missing_impact(itmdt: Intermediate, plot_width: int, plot_height: int) -> Dict[str, Any]:
"""
Render correlation heatmaps in to tabs
"""
Expand All @@ -331,7 +329,11 @@ def render_missing_impact(
fig_dendrogram = render_dendrogram(itmdt["data_dendrogram"], plot_width, plot_height)
tabs.append(Panel(child=row(fig_dendrogram), title="Dendrogram"))

stat_dict = {name: itmdt["missing_stat"][name] for name in itmdt["missing_stat"]}

return {
"insights": itmdt["insights"],
"tabledata": {"Missing Statistics": stat_dict},
"layout": [panel.child.children[0] for panel in tabs],
"meta": [panel.title for panel in tabs],
}
Expand Down
4 changes: 2 additions & 2 deletions dataprep/eda/templates/tab_base.html
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@ <h3>{{ title }}</h3>
<div class="insight-panel-{{ context.rnd }}">
<ol>
{% for insight in context.insights[context.meta[loop.index]] %}
<li class="entry-{{ context.rnd }}">
{{ insight }}
<li class="entry-{{ context.rnd }}"><span
class="col-name-{{ context.rnd }}">{{ insight.split(' ')[0] }}</span>{{ insight.replace(insight.split(' ')[0], '') }}
</li>
{% endfor %}
</ol>
Expand Down

0 comments on commit 67fc594

Please sign in to comment.