Fix duplicated garbage #130

Merged
merged 7 commits on Jul 9, 2019
Changes from 3 commits

1 change: 1 addition & 0 deletions CHANGES.md
@@ -22,6 +22,7 @@ Note that the top-most release is changes in the unreleased master branch on GitHub
 - `basic_json_schema()` works with `deleted` jobs
 - `start` is supported for Collections, #112
 - `enum` is counted as a `category` tag, #18
+- `Garbage Symbols` searches in str representation of nested fields instead of expanded df, #130
 ### Fixed
 - `Arche.glance()`, #88
 - Item links in Schema validation errors, #89

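For context, a minimal sketch (not part of the diff) of what this changelog entry means; the column name and data below are made up, and only pandas is assumed. Previously nested fields were expanded into flat columns before scanning; now the rule scans the str representation of each original column, so garbage inside lists and dicts is still caught:

import pandas as pd

# A hypothetical nested field: a list value that a flat per-column scan
# would only see after expansion into "address_0", "address_1", ...
df = pd.DataFrame({"address": [["<br> ", "&amp;"]]})

# str() of the cell exposes nested garbage to the regex-based rule:
print(df["address"].apply(str).str.contains("&amp;"))  # 0    True
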
2 changes: 1 addition & 1 deletion src/arche/arche.py
@@ -143,7 +143,7 @@ def data_quality_report(self, bucket: Optional[str] = None):
 
     @lru_cache(maxsize=32)
     def run_general_rules(self):
-        self.save_result(garbage_symbols(self.source_items))
+        self.save_result(garbage_symbols(self.source_items.df))
         df = self.source_items.df
         self.save_result(
             coverage_rules.check_fields_coverage(

2 changes: 1 addition & 1 deletion src/arche/data_quality_report.py
@@ -73,7 +73,7 @@ def create_figures(self, items: CloudItems):
         ).get_errors_count()
 
         garbage_symbols_result = self.report.results.get(
-            "Garbage Symbols", garbage_symbols(items)
+            "Garbage Symbols", garbage_symbols(items.df)
         )
 
         quality_estimation, field_accuracy = generate_quality_estimation(

19 changes: 11 additions & 8 deletions src/arche/report.py
@@ -1,13 +1,16 @@
+from functools import partial
 from typing import Dict
 
 from arche import SH_URL
 from arche.rules.result import Level, Outcome, Result
 from colorama import Fore, Style
-from IPython.display import display, Markdown
+from IPython.display import display_markdown
 import numpy as np
 import pandas as pd
 import plotly.io as pio
 
+display_markdown = partial(display_markdown, raw=True)
+
 
 class Report:
     def __init__(self):
@@ -22,7 +25,7 @@ def write_color_text(text: str, color: Fore = Fore.RED) -> None:
 
     @staticmethod
     def write_rule_name(rule_name: str) -> None:
-        display(Markdown(f"{rule_name}:"))
+        display_markdown(f"{rule_name}:")
 
     @classmethod
     def write(cls, text: str) -> None:
@@ -58,15 +61,13 @@ def write_rule_outcome(cls, outcome: str, level: Level = Level.INFO) -> None:
     def write_details(self, short: bool = False, keys_limit: int = 10) -> None:
         for result in self.results.values():
             if result.detailed_messages_count:
-                display(
-                    Markdown(
-                        f"{result.name} ({result.detailed_messages_count} message(s)):"
-                    )
+                display_markdown(
+                    f"{result.name} ({result.detailed_messages_count} message(s)):"
                 )
                 self.write_rule_details(result, short, keys_limit)
             for f in result.figures:
                 pio.show(f)
-        display(Markdown("<br>"))
+        display_markdown("<br>")
 
     @classmethod
     def write_rule_details(
@@ -93,7 +94,9 @@ def write_detailed_errors(cls, errors: Dict, short: bool, keys_limit: int) -> None:
         keys = pd.Series(list(keys))
 
         sample = Report.sample_keys(keys, keys_limit)
-        display(Markdown(f"{len(keys)} items affected - {attribute}: {sample}"))
+        display_markdown(
+            f"{len(keys)} items affected - {attribute}: {sample}", raw=True
+        )
 
     @staticmethod
     def sample_keys(keys: pd.Series, limit: int) -> str:

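As an aside, a minimal sketch (not part of the diff; assumes IPython is installed) of the partial-application trick the new imports rely on — rebinding display_markdown so every call publishes the raw markdown string directly instead of wrapping it in a Markdown object:

from functools import partial
from IPython.display import display_markdown

# Freeze raw=True so callers don't have to pass it on every call.
display_markdown = partial(display_markdown, raw=True)

display_markdown("**bold**")  # same as display_markdown("**bold**", raw=True)

Passing raw=True again at a call site, as write_detailed_errors does above, is harmless: keyword arguments supplied at call time override those frozen by partial.
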
36 changes: 21 additions & 15 deletions src/arche/rules/others.py
@@ -1,9 +1,10 @@
+import codecs
 import re
 
-from arche.readers.items import Items
 from arche.rules.result import Outcome, Result
 import numpy as np
 import pandas as pd
+from tqdm import tqdm_notebook
 
 
 def compare_boolean_fields(
@@ -74,41 +75,46 @@ def fields_to_compare(source_df: pd.DataFrame, target_df: pd.DataFrame) -> bool:
     return False
 
 
-def garbage_symbols(items: Items) -> Result:
+def garbage_symbols(df: pd.DataFrame) -> Result:
     """Find unwanted symbols in `np.object` columns.
 
     Returns:
         A result containing item keys per field which contained any trash symbol
     """
     garbage = (
         r"(?P<spaces>^\s|\s$)"
-        r"|(?P<html_entities>&amp|&reg)"
-        r"|(?P<css>(?:(?:\.|#)[^#. ]+\s*){.+})"
-        r"|(?P<html_tags></?(h\d|b|u|i|div|ul|ol|li|table|tbody|th|tr|td|p|a|br|img|sup|SUP|"
+        r"|(?P<html_entities>&#?(?:\w*;|\d*;))"
+        r"|(?P<css>(?:\.|#|@)[^0-9{}#. ][^{}#.]+{(?:[^:;{}]+:[^:;{}]+;)+[\\n \n\t]*})"
+        r"|(?P<html_tags></?(?:h\d|b|u|i|div|ul|ol|li|table|tbody|th|tr|td|p|a|br|img|sup|SUP|"
         r"blockquote)\s*/?>|<!--|-->)"
     )
 
     errors = {}
     row_keys = set()
-    rule_result = Result("Garbage Symbols", items_count=len(items))
+    rule_result = Result("Garbage Symbols", items_count=len(df))
 
-    for column in items.flat_df.select_dtypes([np.object]):
-        matches = items.flat_df[column].str.extractall(garbage, flags=re.IGNORECASE)
-        matches = matches[["spaces", "html_entities", "css", "html_tags"]]
+    for column in tqdm_notebook(
+        df.select_dtypes([np.object]).columns, desc="Garbage Symbols"
+    ):
+        matches = df[column].apply(str).str.extractall(garbage, flags=re.IGNORECASE)
        if not matches.empty:
-            error_keys = items.flat_df.loc[matches.unstack().index.values].index
-            original_column = items.origin_column_name(column)
+            error_keys = df.loc[matches.unstack().index.values].index
             bad_texts = matches.stack().value_counts().index.sort_values().tolist()
+            # escape backslashes for markdown repr, `\n > \\n`
+            bad_texts = [
+                f"'{codecs.encode(bx, 'unicode_escape').decode()[:20]}'"
+                for bx in bad_texts
+            ]
             error = (
-                f"{len(error_keys)/len(items)*100:.1f}% of '{original_column}' "
-                f"values contain `{', '.join([t[:20] for t in bad_texts])}`"
+                f"{len(error_keys)/len(df)*100:.1f}% of '{column}' "
+                f"values contain `{', '.join(bad_texts)}`"
             )
 
             errors[error] = list(error_keys)
             row_keys = row_keys.union(error_keys)
 
     if errors:
         rule_result.add_error(
-            f"{len(row_keys)/len(items) * 100:.1f}% ({len(row_keys)}) items affected",
+            f"{len(row_keys)/len(df) * 100:.1f}% ({len(row_keys)}) items affected",
             errors=errors,
         )
     rule_result.err_items_count = len(row_keys)

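A rough sketch (not part of the diff; pandas assumed, data made up) of the two behavioural changes above: values are stringified before matching, so garbage inside nested containers is caught, and the stricter html_entities pattern now requires a trailing ";":

import re
import pandas as pd

garbage = r"(?P<html_entities>&#?(?:\w*;|\d*;))"
df = pd.DataFrame({"phone": [{"a": "11"}, "call &amp; ask", "plain &AMP"]})

# .apply(str) turns the dict in row 0 into "{'a': '11'}" before the scan;
# row 1 matches on "&amp;"; row 2 no longer matches since ";" is required.
matches = df["phone"].apply(str).str.extractall(garbage, flags=re.IGNORECASE)
print(matches)
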
38 changes: 19 additions & 19 deletions tests/rules/test_others.py
@@ -1,6 +1,5 @@
 from functools import partial
 
-from arche.readers.items import Items
 from arche.rules.others import compare_boolean_fields, garbage_symbols
 from arche.rules.result import Level, Outcome
 from conftest import create_named_df, create_result
@@ -72,31 +71,32 @@ def test_compare_boolean_fields(
 
 dirty_inputs = [
     (
-        [
-            {
-                "name": " Blacky Robeburned",
-                "address": "here goes &AMP",
-                "phone": "<h1>144</h1>.sx-prime-pricing-long-row { float: left; }",
-                "rank": 14441,
-            },
-            {
-                "name": "<!--Leprous Jim-->",
-                "address": "Some street",
-                "phone": "1144",
-                "rank": 2_039_857,
-            },
-        ],
+        {
+            "name": [" Blacky Robeburned", "\t<!--Leprous Jim-->"],
+            "address": [["<br> ", {"v", "&amp;"}], "\xa0house"],
+            "phone": [
+                "<h1>144</h1>.sx-prime-pricing-long-row { float: left; }",
+                {"a": "11"},
+            ],
+            "rank": [141, 2_039_857],
+        },
         {
             Level.ERROR: [
                 (
                     "100.0% (2) items affected",
                     None,
                     {
-                        "100.0% of 'name' values contain ` , -->, <!--`": [0, 1],
-                        "50.0% of 'address' values contain `&AMP`": [0],
+                        "100.0% of 'address' values contain `'&amp;', '<br>', '\\xa0'`": [
+                            0,
+                            1,
+                        ],
+                        "100.0% of 'name' values contain `'\\t', ' ', '-->', '<!--'`": [
+                            0,
+                            1,
+                        ],
                         (
                             "50.0% of 'phone' values contain "
-                            "`.sx-prime-pricing-lo, </h1>, <h1>`"
+                            "`'.sx-prime-pricing-lo', '</h1>', '<h1>'`"
                         ): [0],
                     },
                 )
@@ -116,7 +116,7 @@ def test_compare_boolean_fields(
 def test_garbage_symbols(
     raw_items, expected_messages, expected_items_count, expected_err_items_count
 ):
-    assert garbage_symbols(Items.from_array(raw_items)) == create_result(
+    assert garbage_symbols(pd.DataFrame(raw_items)) == create_result(
         "Garbage Symbols",
         expected_messages,
         items_count=expected_items_count,

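A small sketch (standard library only; not part of the diff) of the unicode_escape quoting these updated fixtures expect — invisible garbage like "\t" or "\xa0" becomes a readable token in the error message:

import codecs

for bad in ["\t", "\xa0", "-->"]:
    # mirrors the bad_texts formatting in garbage_symbols
    print(f"'{codecs.encode(bad, 'unicode_escape').decode()[:20]}'")
# prints: '\t'  '\xa0'  '-->'
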
2 changes: 1 addition & 1 deletion tests/rules/test_result.py
@@ -77,7 +77,7 @@ def test_tensors_not_equal(source, target):
 )
 def test_show(mocker, capsys, message, stats, exp_md_output, exp_txt_outputs):
     mock_pio_show = mocker.patch("plotly.io.show", autospec=True)
-    mocked_md = mocker.patch("arche.report.Markdown", autospec=True)
+    mocked_md = mocker.patch("arche.report.display_markdown", autospec=True)
     mocked_print = mocker.patch("builtins.print", autospec=True)
     res = create_result("rule name here", message, stats=stats)
     res.show()

4 changes: 2 additions & 2 deletions tests/test_arche.py
@@ -251,7 +251,7 @@ def test_validate_with_json_schema(mocker, get_job_items, get_schema):
 
 
 def test_validate_with_json_schema_fails(mocker, get_job_items, get_schema):
-    mocked_md = mocker.patch("arche.report.Markdown", autospec=True)
+    mocked_md = mocker.patch("arche.report.display_markdown", autospec=True)
     url = f"{SH_URL}/112358/13/21/item/1"
     res = create_result(
         "JSON Schema Validation",
@@ -273,7 +273,7 @@ def test_validate_with_json_schema_fails(mocker, get_job_items, get_schema):
     assert len(a.report.results) == 1
     assert a.report.results.get("JSON Schema Validation") == res
     mocked_md.assert_any_call(
-        f"1 items affected - 'price' is a required property: [1]({url})"
+        f"1 items affected - 'price' is a required property: [1]({url})", raw=True
     )

6 changes: 3 additions & 3 deletions tests/test_report.py
@@ -35,7 +35,7 @@
 )
 def test_write_details(mocker, get_df, capsys, messages, expected_details):
     mock_pio_show = mocker.patch("plotly.io.show", autospec=True)
-    md_mock = mocker.patch("arche.report.Markdown", autospec=True)
+    md_mock = mocker.patch("arche.report.display_markdown", autospec=True)
 
     r = Report()
     for m in messages:
@@ -121,9 +121,9 @@ def test_write_rule_details(capsys, message, expected_details):
 )
 def test_write_detailed_errors(mocker, errors, short, keys_limit, expected_messages):
     mocker.patch("pandas.Series.sample", return_value=pd.Series("5"), autospec=True)
-    md_mock = mocker.patch("arche.report.Markdown", autospec=True)
+    md_mock = mocker.patch("arche.report.display_markdown", autospec=True)
     Report.write_detailed_errors(errors, short, keys_limit)
-    calls = [mocker.call(m) for m in expected_messages]
+    calls = [mocker.call(m, raw=True) for m in expected_messages]
     md_mock.assert_has_calls(calls, any_order=True)