Jinja report (#168)

* implements a new jinja2 html report output * added jinja2 and bleach libraries to Pipfile * removed markdown report. * moved rule outcome definition to Result * Reports are now callable; Created base template for reports * fixed plots on full report * links open in a new tab * fix single rule output * fix pipeline imports * fix formatting * fix formatting * fix import order * transform method into static method * removed line on arche.py, now method compare_metadata is equal to master * removed template * fixed test_arche_dataframe * fixed test_report_all * fixed formatting * added comment and removed unused code * fix code style * fixed all test_arche tests * renamed test_write_details to test_display and fixed it * fixed test_write_rule_details and removed test_write_none_rule_details * fixed code style * fixed test result * added urls list macro * finished fixing all tests * added test__order_rules to test_report * code style fix * minor fix on base template macros * moved call to .figures to template * updated setup.cfg * fixed docs tests * added tests for Outcome eq and ne * improves single_rule template * Apply suggestions from code review to setup.cfg Co-Authored-By: Valeriy Mukhtarulin <manycoding@users.noreply.github.com> * pr fix - removed _order_rules function and changed Outcome values * removed unused imports * added keys_limit parameter on macros and report * code review fixes for test_report * pr fix for test_arche * reports now rendering in an iframe * optimizations * fixed iframe closing tag * renamed templates * improved readability * simplified code * fixed empty spaces in report * fixed tests * Add _outcome field and setter, return SKIPPED outcome from rules (#173) * PASSED if only info messages * change if not rule to if rule on arche report * added two lines on CHANGES.MD file * Update CHANGES.md
scrapinghub · Oct 24, 2019 · f46c40e · f46c40e
1 parent 6be3d7a
commit f46c40e
Show file tree

Hide file tree

Showing 26 changed files with 440 additions and 247 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -16,6 +16,9 @@ Note that the top-most release is changes in the unreleased master branch on Git
 - Support to **Bitbucket API**, in order to access files from private repositories, #71
 - Extend inferred schema with `additionalProperties: False and uniqueItems: True`, #21
 - **Fields Difference** rule to find the difference between field values of two jobs. Supports normalization, nested fields, full access to the data, #167
+- Added `outcome` property on Result, in order to define a rule outcome based on message levels. #173
+### Changed
+- Reports rendering. Reports are being generated as HTML with a jinja2 template. `Arche.report_all()` displays the rules results grouped by outcome. The plots are displayed on the "plots" tab. #168
 
 
 ## [0.3.6] (2019-07-12)

diff --git a/Pipfile b/Pipfile
@@ -14,6 +14,8 @@ fastjsonschema = "*"
 perfect-jsonschema = "*"
 tqdm = "*"
 ipywidgets = "*"
+Jinja2 = "*"
+bleach = "*"
 
 [dev-packages]
 jupyterlab = "*"

diff --git a/docs/source/nbs/Rules.ipynb b/docs/source/nbs/Rules.ipynb
@@ -277,7 +277,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.2"
+   "version": "3.7.3"
   },
   "widgets": {
    "application/vnd.jupyter.widget-state+json": {

diff --git a/setup.cfg b/setup.cfg
@@ -14,6 +14,9 @@ classifiers =
  	Intended Audience :: Developers
 	Programming Language :: Python :: 3.7
 
+[files]
+package-data = "arche/templates/*.html"
+
 [options]
 package_dir = 
 	=src
@@ -30,7 +33,10 @@ install_requires =
     perfect-jsonschema
     tqdm
     ipywidgets
+    Jinja2
+    bleach
 zip_safe = True
+include_package_data = True
 
 [options.extras_require]
 tests =
@@ -52,3 +58,4 @@ docs =
 
 [options.packages.find]
 where = src
+
diff --git a/src/arche/arche.py b/src/arche/arche.py
@@ -127,9 +127,7 @@ def save_result(self, rule_result):
     def report_all(self, short: bool = False) -> None:
         self.run_all_rules()
         IPython.display.clear_output()
-        self.report.write_summaries()
-        self.report.write("\n" * 2)
-        self.report.write_details(short)
+        self.report(keys_limit=10 if short else None)
 
     def run_all_rules(self):
         if isinstance(self.source_items, JobItems):
@@ -167,7 +165,7 @@ def validate_with_json_schema(self) -> None:
             self.schema.raw, self.source_items.raw, self.source_items.df.index
         )
         self.save_result(res)
-        res.show()
+        self.report(res)
 
     def glance(self) -> None:
         """Run JSON schema check and output results. In most cases it will return

diff --git a/src/arche/report.py b/src/arche/report.py
@@ -1,105 +1,51 @@
-from functools import partial
-from typing import Dict, Union
+from typing import Dict
+
 
 from arche import SH_URL
-from arche.rules.result import Level, Outcome, Result
-from IPython.display import display_markdown
+from arche.rules.result import Result
+from bleach import linkify, callbacks
+from IPython.display import display_html
+from jinja2 import Environment, PackageLoader, select_autoescape
 import numpy as np
 import pandas as pd
 
-display_markdown = partial(display_markdown, raw=True)
-
 
 class Report:
     def __init__(self):
         self.results: Dict[str, Result] = {}
 
+        self.env = Environment(
+            loader=PackageLoader("arche", "templates"),
+            autoescape=select_autoescape(["html"]),
+            extensions=["jinja2.ext.loopcontrols"],
+        )
+        self.env.filters["linkify"] = linkify
+
     def save(self, result: Result) -> None:
         self.results[result.name] = result
 
-    @staticmethod
-    def write_color_text(text: str, color: str = "#8A0808") -> None:
-        display_markdown(f"<font style='color:{color};'>{text}</font>")
-
-    @staticmethod
-    def write_rule_name(rule_name: str) -> None:
-        display_markdown(f"<h4>{rule_name}</h4>")
-
-    @classmethod
-    def write(cls, text: str) -> None:
-        display_markdown(text)
-
-    def write_summaries(self) -> None:
-        display_markdown(f"<h2>Executed {len(self.results)} rules</h2>")
-        for result in self.results.values():
-            self.write_summary(result)
-
-    @classmethod
-    def write_summary(cls, result: Result) -> None:
-        cls.write_rule_name(result.name)
-        if not result.messages:
-            cls.write_rule_outcome(Outcome.PASSED, Level.INFO)
-        for level, rule_msgs in result.messages.items():
-            for rule_msg in rule_msgs:
-                cls.write_rule_outcome(rule_msg.summary, level)
-
-    @classmethod
-    def write_rule_outcome(
-        cls, outcome: Union[str, Outcome], level: Level = Level.INFO
-    ) -> None:
-        if isinstance(outcome, Outcome):
-            outcome = outcome.name
-        msg = outcome
-        if level == Level.ERROR:
-            cls.write_color_text(msg)
-        elif level == Level.WARNING:
-            cls.write_color_text(msg, color="#CCCC00")
-        elif outcome == Outcome.PASSED.name:
-            cls.write_color_text(msg, color="#0B6121")
+    def __call__(self, rule: Result = None, keys_limit: int = None) -> None:
+        if rule:
+            template = self.env.get_template("single-rule.html")
+            resultHTML = template.render(
+                rule=rule,
+                pd=pd,
+                linkfy_callbacks=[callbacks.target_blank],
+                keys_limit=keys_limit,
+            )
         else:
-            cls.write(msg)
-
-    def write_details(self, short: bool = False, keys_limit: int = 10) -> None:
-        display_markdown("<h2>Details</h2>")
-        for result in self.results.values():
-            if result.detailed_messages_count:
-                display_markdown(
-                    f"{result.name} ({result.detailed_messages_count} message(s)):"
-                )
-                self.write_rule_details(result, short, keys_limit)
-                display_markdown("<br>")
-        display_markdown("<h2>Plots</h2>")
-        for result in self.results.values():
-            for f in result.figures:
-                f.show()
-
-    @classmethod
-    def write_rule_details(
-        cls, result: Result, short: bool = False, keys_limit: int = 10
-    ) -> None:
-        for rule_msgs in result.messages.values():
-            for rule_msg in rule_msgs:
-                if rule_msg.errors:
-                    cls.write_detailed_errors(rule_msg.errors, short, keys_limit)
-                if rule_msg.detailed:
-                    cls.write(rule_msg.detailed)
-
-    @classmethod
-    def write_detailed_errors(cls, errors: Dict, short: bool, keys_limit: int) -> None:
-        error_messages = sorted(errors.items(), key=lambda i: len(i[1]), reverse=True)
-
-        if short:
-            keys_limit = 5
-            error_messages = error_messages[:5]
-
-        for attribute, keys in error_messages:
-            if isinstance(keys, list):
-                keys = pd.Series(keys)
-            if isinstance(keys, set):
-                keys = pd.Series(list(keys))
-
-            sample = Report.sample_keys(keys, keys_limit)
-            display_markdown(f"{len(keys)} items affected - {attribute}: {sample}")
+            template = self.env.get_template("full-report.html")
+            resultHTML = template.render(
+                rules=sorted(self.results.values(), key=lambda x: x.outcome.value),
+                pd=pd,
+                linkfy_callbacks=[callbacks.target_blank],
+                keys_limit=keys_limit,
+            )
+        # this renders the report as an iframe
+        # the option was added for generating the docs
+        template = self.env.get_template("iframe.html")
+        resultHTML = template.render(html_str=resultHTML)
+        display_html(resultHTML, raw=True)
 
     @staticmethod
     def sample_keys(keys: pd.Series, limit: int) -> str:

diff --git a/src/arche/rules/category.py b/src/arche/rules/category.py
@@ -55,7 +55,7 @@ def get_difference(
                 f"The difference is greater than {err_thr:.0%} for {len(errs)} value(s) of {c}"
             )
     if not category_names:
-        result.add_info(Outcome.SKIPPED)
+        result.outcome = Outcome.SKIPPED
     return result
 
 
@@ -76,7 +76,7 @@ def get_coverage_per_category(df: pd.DataFrame, category_names: List[str]) -> Re
         result.add_info(f"{len(value_counts)} categories in '{c}'")
         result.stats.append(value_counts)
     if not category_names:
-        result.add_info(Outcome.SKIPPED)
+        result.outcome = Outcome.SKIPPED
     return result
 
 
@@ -108,6 +108,7 @@ def get_categories(df: pd.DataFrame, max_uniques: int = 10) -> Result:
         result.add_info("Categories were not found")
         return result
     result.add_info(f"{len(result.stats)} category field(s)")
+    result.outcome = Outcome.INFO
     return result
 
 

diff --git a/src/arche/rules/compare.py b/src/arche/rules/compare.py
@@ -91,7 +91,7 @@ def tagged_fields(
         if tag_fields:
             fields_names.extend(tag_fields)
     if not fields_names:
-        result.add_info(Outcome.SKIPPED)
+        result.outcome = Outcome.SKIPPED
         return result
     result = fields(source_df, target_df, fields_names)
     result.name = name

diff --git a/src/arche/rules/coverage.py b/src/arche/rules/coverage.py
@@ -1,6 +1,6 @@
 from typing import List
 
-from arche.rules.result import Result, Outcome
+from arche.rules.result import Result
 import arche.tools.api as api
 import pandas as pd
 from scrapinghub.client.jobs import Job
@@ -130,9 +130,10 @@ def anomalies(target: str, sample: List[str]) -> Result:
     stats["target deviation"] = stats["target"] - stats["mean"]
     devs = stats[(stats["target deviation"].abs() > 2 * stats["std"])]
     devs.name = "Anomalies"
-    errors = f"{len(devs.index)} field(s) with significant coverage deviation"
     if not devs.empty:
-        result.add_error(Outcome.FAILED, detailed=errors)
+        result.add_error(
+            f"{len(devs.index)} field(s) with significant coverage deviation"
+        )
         result.stats = [devs]
 
     return result
diff --git a/src/arche/rules/duplicates.py b/src/arche/rules/duplicates.py
@@ -15,7 +15,7 @@ def find_by_unique(df: pd.DataFrame, tagged_fields: TaggedFields) -> Result:
     result = Result("Duplicates By **unique** Tag")
 
     if not unique_fields:
-        result.add_info(Outcome.SKIPPED)
+        result.outcome = Outcome.SKIPPED
         return result
 
     err_keys: Set = set()
@@ -69,7 +69,7 @@ def find_by_name_url(df: pd.DataFrame, tagged_fields: TaggedFields) -> Result:
     name = "Duplicates By **name_field, product_url_field** Tags"
     result = Result(name)
     if not name_fields or not url_fields:
-        result.add_info(Outcome.SKIPPED)
+        result.outcome = Outcome.SKIPPED
         return result
     name_field = name_fields[0]
     url_field = url_fields[0]

diff --git a/src/arche/rules/others.py b/src/arche/rules/others.py
@@ -26,7 +26,7 @@ def compare_boolean_fields(
 
     result = Result("Boolean Fields")
     if not fields_to_compare(source_bool, target_bool):
-        result.add_info(Outcome.SKIPPED)
+        result.outcome = Outcome.SKIPPED
         return result
 
     dummy = pd.DataFrame(columns=[True, False])

diff --git a/src/arche/rules/price.py b/src/arche/rules/price.py
@@ -16,7 +16,7 @@ def compare_was_now(df: pd.DataFrame, tagged_fields: TaggedFields):
     result = Result("Compare Price Was And Now")
 
     if not price_was_fields or not price_fields:
-        result.add_info(Outcome.SKIPPED)
+        result.outcome = Outcome.SKIPPED
         return result
 
     price_field = price_fields[0]
@@ -78,7 +78,7 @@ def compare_prices_for_same_urls(
     result = Result("Compare Prices For Same Urls")
     url_field_list: Optional[List[str]] = tagged_fields.get("product_url_field")
     if not url_field_list:
-        result.add_info(Outcome.SKIPPED)
+        result.outcome = Outcome.SKIPPED
         return result
 
     url_field = url_field_list[0]
@@ -137,7 +137,7 @@ def compare_names_for_same_urls(
     url_field_list: Optional[List[str]] = tagged_fields.get("product_url_field")
     name_field_list: Optional[List[str]] = tagged_fields.get("name_field")
     if not url_field_list or not name_field_list:
-        result.add_info(Outcome.SKIPPED)
+        result.outcome = Outcome.SKIPPED
         return result
 
     name_field: str = name_field_list[0]
@@ -183,7 +183,7 @@ def compare_prices_for_same_names(
     result = Result("Compare Prices For Same Names")
     name_field_tag = tagged_fields.get("name_field")
     if not name_field_tag:
-        result.add_info(Outcome.SKIPPED)
+        result.outcome = Outcome.SKIPPED
         return result
 
     name_field = name_field_tag[0]