Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(eda): add stat. in plot_missing #385

Merged
merged 1 commit into from
Oct 22, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
154 changes: 152 additions & 2 deletions dataprep/eda/missing/compute/nullivariate.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""This module implements the plot_missing(df) function's
intermediate computation step.
"""
from typing import Any, Callable, Dict, Generator, Optional, Tuple
from typing import Any, Callable, Dict, Generator, Optional, Tuple, List

import dask.array as da
import dask.dataframe as dd
Expand All @@ -18,38 +18,108 @@
def _compute_missing_nullivariate(df: DataArray, bins: int) -> Generator[Any, Any, Intermediate]:
    """Calculate the data for visualizing the plot_missing(df).

    This contains the missing spectrum, missing bar chart, missing heatmap and
    missing dendrogram, together with summary statistics and textual insights
    about the missing values.

    The function is a generator. It yields one tuple of tasks, unpacks the
    values sent back positionally (same order as the yielded tuple), and
    then returns the Intermediate built from those values.

    Parameters
    ----------
    df
        the DataArray data_frame
    bins
        forwarded to missing_spectrum as its ``bins`` argument
    """
    # pylint: disable=too-many-locals

    most_show = 5  # maximum number of columns/rows listed in an "insight"
    longest = 5  # maximum length of a column name shown in an "insight"

    # NOTE(review): presumably materializes df's internal state before
    # df.nulls / df.shape / df.columns are read -- confirm against DataArray.
    df.compute()

    nullity = df.nulls
    null_cnts = nullity.sum(axis=0)
    nrows = df.shape[0]
    ncols = df.shape[1]
    null_perc = null_cnts / nrows
    # Total number of missing cells; built once and shared by the derived
    # statistics below instead of repeating nullity.sum() for each of them.
    total_missing = nullity.sum()
    miss_perc = total_missing / (nrows * ncols)
    avg_row = total_missing / nrows
    avg_col = total_missing / ncols

    tasks = (
        missing_spectrum(df, bins=bins),
        null_perc,
        missing_bars(null_cnts, df.columns.values, nrows),
        missing_heatmap(df),
        missing_dendrogram(df),
        total_missing,
        missing_col_cnt(df),
        missing_row_cnt(df),
        missing_most_col(df),
        missing_most_row(df),
        miss_perc,
        avg_row,
        avg_col,
    )

    ### Lazy Region End
    # The names below must stay in the same order as the entries of `tasks`.
    (
        spectrum,
        null_perc,
        bars,
        heatmap,
        dendrogram,
        cnt,
        col_cnt,
        row_cnt,
        most_col,
        most_row,
        miss_perc,
        avg_row,
        avg_col,
    ) = yield tasks
    ### Eager Region Begin

    # Keep only columns that are neither fully present nor fully missing.
    sel = ~((null_perc == 0) | (null_perc == 1))
    heatmap = pd.DataFrame(
        data=heatmap[:, sel][sel, :], columns=df.columns[sel], index=df.columns[sel]
    )

    # most_col / most_row are (count, rate, indices) triples; when more than
    # `most_show` entries tie for the maximum, the listing is truncated.
    suffix_col = "" if most_col[0] <= most_show else ", ..."
    suffix_row = "" if most_row[0] <= most_show else ", ..."

    col_names = ", ".join(abbr(df.columns[e], longest) for e in most_col[2][:most_show])
    top_miss_col = str(most_col[0]) + "-col(s) " + "(" + col_names + suffix_col + ")"

    row_ids = ", ".join(str(e) for e in most_row[2][:most_show])
    top_miss_row = str(most_row[0]) + "-row(s) " + "(" + row_ids + suffix_row + ")"

    return Intermediate(
        data_total_missing={col: null_perc[idx] for idx, col in enumerate(df.columns)},
        data_spectrum=pd.DataFrame(spectrum),
        data_bars=bars,
        data_heatmap=heatmap,
        data_dendrogram=dendrogram,
        visual_type="missing_impact",
        missing_stat={
            "Missing Cells": cnt,
            "Missing Cells (%)": str(round(miss_perc * 100, 1)) + "%",
            "Missing Columns": col_cnt,
            "Missing Rows": row_cnt,
            "Avg Missing Cells per Column": round(avg_col, 2),
            "Avg Missing Cells per Row": round(avg_row, 2),
        },
        insights={
            "Bar Chart": [
                top_miss_col
                + " contain the most missing values with rate "
                + str(round(most_col[1] * 100, 1))
                + "%",
                top_miss_row
                + " contain the most missing columns with rate "
                + str(round(most_row[1] * 100, 1))
                + "%",
            ]
        },
    )


Expand Down Expand Up @@ -157,3 +227,83 @@ def missing_dendrogram(df: DataArray) -> Any:
)

return dendrogram


def missing_col_cnt(df: DataArray) -> Any:
    """Calculate how many columns contain missing values.

    Parameters
    ----------
    df
        the DataArray data_frame; only its ``nulls`` attribute is read

    Returns
    -------
    the number of columns whose missing-value count is greater than zero,
    as produced by ``.sum()`` on a boolean mask
    (presumably lazy when ``df.nulls`` is a dask array -- confirm)
    """
    per_col = df.nulls.sum(0)

    # Counting the True entries of the mask is enough; filtering the array by
    # the same mask first does not change the result.
    return (per_col > 0).sum()


def missing_row_cnt(df: DataArray) -> Any:
    """Calculate how many rows contain missing values.

    Parameters
    ----------
    df
        the DataArray data_frame; only its ``nulls`` attribute is read

    Returns
    -------
    the number of rows whose missing-value count is greater than zero,
    as produced by ``.sum()`` on a boolean mask
    (presumably lazy when ``df.nulls`` is a dask array -- confirm)
    """
    per_row = df.nulls.sum(1)

    # Counting the True entries of the mask is enough; filtering the array by
    # the same mask first does not change the result.
    return (per_row > 0).sum()


def missing_most_col(df: DataArray) -> Tuple[int, float, List[Any]]:
    """Find which column has the most number of missing values.

    Parameters
    ----------
    df
        the DataArray data_frame

    Returns
    -------
    cnt
        the count of columns having the most missing values
    rate
        the highest rate of missing values in one column
        (the maximum per-column missing count divided by ``df.shape[0]``)
    rst
        the indices of the columns with the highest missing rate
    """
    col_sum = df.nulls.sum(axis=0)
    maximum = col_sum.max()
    rate = maximum / df.shape[0]

    # Several columns can tie for the maximum; build the tie mask once and
    # reuse it for both the count and the index lookup.
    is_top = col_sum == maximum
    cnt = is_top.sum()
    rst = da.where(is_top)[0]

    return cnt, rate, rst


def missing_most_row(df: DataArray) -> Tuple[int, float, List[Any]]:
    """Find which row has the most number of missing values.

    Parameters
    ----------
    df
        the DataArray data_frame

    Returns
    -------
    cnt
        the count of rows having the most missing values
    rate
        the highest rate of missing values in one row
        (the maximum per-row missing count divided by ``df.shape[1]``)
    rst
        the indices of the rows with the highest missing rate
    """
    row_sum = df.nulls.sum(axis=1)
    maximum = row_sum.max()
    rate = maximum / df.shape[1]

    # Several rows can tie for the maximum; build the tie mask once and
    # reuse it for both the count and the index lookup.
    is_top = row_sum == maximum
    cnt = is_top.sum()
    rst = da.where(is_top)[0]

    return cnt, rate, rst


def abbr(name: str, longest: int) -> str:
    """Cut the name if it is too long.

    Parameters
    ----------
    name
        the label to shorten; a non-string label (e.g. an integer column
        name) is converted with ``str`` first
    longest
        the maximum number of characters kept before "..." is appended

    Returns
    -------
    the label unchanged if it has at most ``longest`` characters, otherwise
    its first ``longest`` characters followed by "..."
    """
    # Labels are not guaranteed to be strings; len() and slicing on e.g. an
    # int label would raise TypeError, so normalize to text first.
    text = str(name)
    if len(text) > longest:
        return text[:longest] + "..."
    return text
8 changes: 5 additions & 3 deletions dataprep/eda/missing/render.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,9 +310,7 @@ def create_color_mapper_heatmap(
return mapper, colorbar


def render_missing_impact(
itmdt: Intermediate, plot_width: int, plot_height: int
) -> Dict[str, List[Any]]:
def render_missing_impact(itmdt: Intermediate, plot_width: int, plot_height: int) -> Dict[str, Any]:
"""
Render correlation heatmaps in to tabs
"""
Expand All @@ -331,7 +329,11 @@ def render_missing_impact(
fig_dendrogram = render_dendrogram(itmdt["data_dendrogram"], plot_width, plot_height)
tabs.append(Panel(child=row(fig_dendrogram), title="Dendrogram"))

stat_dict = {name: itmdt["missing_stat"][name] for name in itmdt["missing_stat"]}

return {
"insights": itmdt["insights"],
"tabledata": {"Missing Statistics": stat_dict},
"layout": [panel.child.children[0] for panel in tabs],
"meta": [panel.title for panel in tabs],
}
Expand Down
4 changes: 2 additions & 2 deletions dataprep/eda/templates/tab_base.html
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@ <h3>{{ title }}</h3>
<div class="insight-panel-{{ context.rnd }}">
<ol>
{% for insight in context.insights[context.meta[loop.index]] %}
<li class="entry-{{ context.rnd }}">
{{ insight }}
<li class="entry-{{ context.rnd }}"><span
class="col-name-{{ context.rnd }}">{{ insight.split(' ')[0] }}</span>{{ insight.replace(insight.split(' ')[0], '') }}
</li>
{% endfor %}
</ol>
Expand Down