Skip to content

Commit

Permalink
feat(eda): added overview and variables section for create_diff_report
Browse files Browse the repository at this point in the history
  • Loading branch information
devinllu authored and jinglinpeng committed Nov 25, 2021
1 parent 3a0653e commit dc4cf7d
Show file tree
Hide file tree
Showing 9 changed files with 1,602 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -115,3 +115,6 @@ profiling
.coverage
report.xml
.vim

# personal tests
test.ipynb
2 changes: 2 additions & 0 deletions dataprep/eda/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from ..utils import is_notebook
from .correlation import compute_correlation, plot_correlation, render_correlation
from .create_report import create_report
from .create_diff_report import create_diff_report
from .distribution import compute, plot, render
from .dtypes import (
Categorical,
Expand Down Expand Up @@ -44,6 +45,7 @@
"DateTime",
"Text",
"create_report",
"create_diff_report",
"plot_diff",
"compute_diff",
"render_diff",
Expand Down
99 changes: 99 additions & 0 deletions dataprep/eda/create_diff_report/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
"""
This module implements the create_diff_report([df1, df2]) function.
"""
import warnings
from typing import Any, Dict, List, Optional, Union

import pandas as pd
from bokeh.resources import INLINE
from jinja2 import Environment, PackageLoader

from .diff_formatter import format_diff_report
from ..configs import Config
from ..create_report.report import Report
from collections import defaultdict

# Public API of this sub-package.
__all__ = ["create_diff_report"]

# Shared Jinja2 environment; templates are looked up inside the installed
# ``dataprep`` package under ``eda/create_diff_report/templates``.
ENV_LOADER = Environment(
    loader=PackageLoader(
        package_name="dataprep",
        package_path="eda/create_diff_report/templates",
    ),
)


def create_diff_report(
    df_list: Union[List[pd.DataFrame], Dict[str, pd.DataFrame]],
    config: Optional[Dict[str, Any]] = None,
    display: Optional[List[str]] = None,
    title: Optional[str] = "DataPrep Report",
    mode: Optional[str] = "basic",
    progress: bool = True,
) -> Report:
    """
    Build a comparison report for several DataFrames and wrap it in a Report.

    Parameters
    ----------
    df_list
        The DataFrames to compare, given either as a list or as a dict
        mapping a name to each DataFrame.
    config
        A dictionary for configuring the visualizations,
        e.g. ``config={"hist.bins": 20}``.
    display
        Names of the plots the user wants to display,
        e.g. ``display=["bar", "hist"]``.
        Without user's specifications, the default is "auto".
    title
        The title of the report, which will be shown on the navigation bar.
    mode
        Controls what type of report is generated.
        Currently only 'basic' is fully implemented.
    progress
        Whether to show the progress bar.

    Returns
    -------
    Report
        The rendered ``base.html`` template wrapped in a ``Report`` object.

    Examples
    --------
    >>> from dataprep.datasets import load_dataset
    >>> from dataprep.eda import create_diff_report
    >>> df_train = load_dataset('house_prices_train')
    >>> df_test = load_dataset('house_prices_test')
    >>> create_diff_report([df_train, df_test])  # show in browser on jupyter notebook
    """
    _suppress_warnings()

    report_cfg = Config.from_dict(display, config)
    report_parts = format_diff_report(df_list, report_cfg, mode, progress)

    # Collect each overview statistic across all DataFrames into one list per
    # statistic name; entries whose value is None are left out.
    overview_stats = defaultdict(list)
    for df_part in report_parts["dfs"]:
        first_overview = df_part["overview"][0]
        for stat_name in first_overview:
            stat_value = first_overview[stat_name]
            if stat_value is None:
                continue
            overview_stats[stat_name].append(stat_value)

    # NOTE(review): the formatter result is read under the key "legend_lables"
    # (sic) -- presumably that matches what diff_formatter emits; confirm
    # before renaming it.
    context = {
        "resources": INLINE.render(),
        "title": title,
        "stats": overview_stats,
        "components": report_parts,
        "is_diff_report": True,
        "df_labels": report_cfg.diff.label,
        "legend_labels": report_parts["legend_lables"],
    }

    rendered_html = ENV_LOADER.get_template("base.html").render(context=context)
    return Report(rendered_html)


def _suppress_warnings() -> None:
"""
suppress warnings in create_diff_report
"""
warnings.filterwarnings(
"ignore",
"The default value of regex will change from True to False in a future version",
category=FutureWarning,
)

0 comments on commit dc4cf7d

Please sign in to comment.