Skip to content

Commit

Permalink
fix(plot_diff):fix ci issue
Browse files (browse the repository at this point in the history)
  • Loading branch information
samplertechreport authored and jinglinpeng committed Aug 3, 2021
1 parent 3bfb4f5 commit 44ce81c
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 14 deletions.
3 changes: 2 additions & 1 deletion dataprep/eda/diff/compute/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ def compute_diff(
dtype = {"a": Continuous(), "b": "nominal"}
or dtype = Continuous() or dtype = "Continuous" or dtype = Continuous()
"""
# pylint:disable = too-many-branches
if isinstance(cfg, dict):
cfg = Config.from_dict(display, cfg)
elif not cfg:
Expand Down Expand Up @@ -73,7 +74,7 @@ def compute_diff(
if [col for dfs in df for col in dfs.columns].count(x) < 2:
raise DataprepError("x must exist in at least two DataFrames")
# return compare_multiple_on_column(df_list, x)
return compare_multiple_col(df_list, x, cfg)
return compare_multiple_col(df_list, x, cfg) # type: ignore
else:
return compare_multiple_df(df_list, cfg, dtype) # type: ignore

Expand Down
6 changes: 4 additions & 2 deletions dataprep/eda/diff/compute/multiple_column.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@
from collections import UserList
from typing import Any, Callable, Dict, List, Union, Optional

import sys
import math
import pandas as pd
import numpy as np
import dask
import math
import dask.array as da
import dask.dataframe as dd
import sys

from dask.array.stats import kurtosis, skew
from ...utils import gaussian_kde
Expand Down Expand Up @@ -158,6 +158,8 @@ def _cont_calcs(srs: Srs, cfg: Config, df_list: List[dd.DataFrame], x: str) -> D
Computations for a continuous column in plot_diff([df...],x)
"""

# pylint:disable = too-many-branches, too-many-locals

data: Dict[str, List[Any]] = {}

# drop infinite values
Expand Down
32 changes: 21 additions & 11 deletions dataprep/eda/diff/render.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
""" # pylint: disable=too-many-lines
from typing import Any, Dict, List, Tuple, Optional

import math
import numpy as np
import pandas as pd
import math
from bokeh.models import (
HoverTool,
Panel,
Expand Down Expand Up @@ -246,7 +246,7 @@ def kde_viz(
toolbar_location=None,
y_axis_type=cfg.kde.yscale,
)
for i, (data, kde) in enumerate(zip(hist, kde)):
for i, (data, kde2) in enumerate(zip(hist, kde)):
dens, bins = data
intvls = _format_bin_intervals(bins)
df = pd.DataFrame(
Expand Down Expand Up @@ -274,7 +274,7 @@ def kde_viz(
mode="vline",
)
pts_rng = np.linspace(df.loc[0, "left"], df.loc[len(df) - 1, "right"], 1000)
pdf = kde(pts_rng)
pdf = kde2(pts_rng)
line = fig.line(x=pts_rng, y=pdf, line_color=CATEGORY10[i], line_width=2, alpha=0.5)
hover_dist = HoverTool(renderers=[line], tooltips=[("x", "@x"), ("y", "@y")])
fig.add_tools(hover_hist)
Expand Down Expand Up @@ -344,6 +344,7 @@ def dt_line_viz(
return fig


# pylint:disable = unused-argument
def box_viz(
df_list: List[pd.DataFrame],
x: str,
Expand Down Expand Up @@ -432,27 +433,30 @@ def box_viz(
fig.xaxis.axis_label = None
fig.yaxis.axis_label = x

# pylint:disable = undefined-loop-variable
minw = min(otlrs) if otlrs else np.nan
maxw = max(otlrs) if otlrs else np.nan
_format_axis(fig, min(df["lw"].min(), minw), max(df["uw"].max(), maxw), "y")

return Panel(child=row(fig), title="Box Plot")


# pylint:disable = unused-argument
def render_correlation_single_heatmaps(
df_list: List[Dict[str, pd.DataFrame]], col: str, plot_width: int, plot_height: int, cfg: Config
) -> List[Panel]:
"""
Render correlation heatmaps, but with single column
"""
# pylint:disable = too-many-locals
corr: Dict[str, List[Any]] = {}
group_all_x = [col + "_" + str(i + 1) for i in range(len(df_list))]
group_all_y = df_list[0]["Pearson"]["y"].unique()
for meth in ["Pearson", "Spearman", "KendallTau"]:
corr[meth] = []
for i in range(len(df_list)):
df_list[i][meth]["x"] = df_list[i][meth]["x"] + "_" + str(i + 1)
corr[meth].append(df_list[i][meth])
for i, df in enumerate(df_list):
df[meth]["x"] = df[meth]["x"] + "_" + str(i + 1)
corr[meth].append(df[meth])
tabs: List[Panel] = []
tooltips = [("y", "@y"), ("correlation", "@correlation{1.11}")]
for method, dfs in corr.items():
Expand Down Expand Up @@ -655,10 +659,14 @@ def render_comparison_grid(itmdt: Intermediate, cfg: Config) -> Dict[str, Any]:


def render_comparison_continous(itmdt: Intermediate, cfg: Config) -> Dict[str, Any]:
"""
Render for continuous variable comparison
"""
# pylint:disable = too-many-locals
plot_width = cfg.plot.width if cfg.plot.width is not None else 450
plot_height = cfg.plot.height if cfg.plot.height is not None else 400
df_labels: List[str] = cfg.diff.label # type: ignore
baseline: int = cfg.diff.baseline
# baseline: int = cfg.diff.baseline
tabs: List[Panel] = []
htgs: Dict[str, List[Tuple[str, str]]] = {}
col, data = itmdt["col"], itmdt["data"][0]
Expand Down Expand Up @@ -700,6 +708,11 @@ def render_comparison_continous(itmdt: Intermediate, cfg: Config) -> Dict[str, A
render_correlation_single_heatmaps(data["corr"], col, plot_width, plot_height, cfg)
)

# pylint:disable=line-too-long
legend_lables = [
{"label": label, "color": color}
for label, color in zip(cfg.diff.label, CATEGORY10[: len(cfg.diff.label)]) # type: ignore
]
return {
"comparison_stats": format_num_stats(itmdt["stats"]) if cfg.stats.enable else [],
"value_table": [],
Expand All @@ -709,10 +722,7 @@ def render_comparison_continous(itmdt: Intermediate, cfg: Config) -> Dict[str, A
"container_width": plot_width + 110,
"how_to_guide": htgs,
"df_labels": cfg.diff.label,
"legend_labels": [
{"label": label, "color": color}
for label, color in zip(cfg.diff.label, CATEGORY10[: len(cfg.diff.label)]) # type: ignore
],
"legend_labels": legend_lables,
}


Expand Down

1 comment on commit 44ce81c

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

DataPrep.EDA Benchmarks

| Benchmark suite | Current: 44ce81c | Previous: 19077c6 | Ratio |
|---|---|---|---|
| `dataprep/tests/benchmarks/eda.py::test_create_report` | `0.19925862718524068` iter/sec (stddev: `0.035122191646225084`) | `0.180197762286889` iter/sec (stddev: `0.10882099934591108`) | `0.90` |

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.