From 75b63269a7f73bbca6055349c8ad3fae6a1713b7 Mon Sep 17 00:00:00 2001
From: manycoding <manycoding@users.noreply.github.com>
Date: Fri, 28 Jun 2019 19:56:48 -0400
Subject: [PATCH 1/6] Do not use flat_df in garbage_symbols

---
 src/arche/arche.py               |  2 +-
 src/arche/data_quality_report.py |  2 +-
 src/arche/report.py              | 19 ++++++++++--------
 src/arche/rules/others.py        | 33 +++++++++++++++++---------------
 tests/rules/test_others.py       |  7 +++----
 tests/rules/test_result.py       |  2 +-
 tests/test_arche.py              |  4 ++--
 tests/test_report.py             |  6 +++---
 8 files changed, 40 insertions(+), 35 deletions(-)
diff --git a/src/arche/arche.py b/src/arche/arche.py
index ec6e58a..8e5dd32 100755
--- a/src/arche/arche.py
+++ b/src/arche/arche.py
@@ -143,7 +143,7 @@ def data_quality_report(self, bucket: Optional[str] = None):
 
     @lru_cache(maxsize=32)
     def run_general_rules(self):
-        self.save_result(garbage_symbols(self.source_items))
+        self.save_result(garbage_symbols(self.source_items.df))
         df = self.source_items.df
         self.save_result(
             coverage_rules.check_fields_coverage(
diff --git a/src/arche/data_quality_report.py b/src/arche/data_quality_report.py
index 7ba0ad8..90f0924 100755
--- a/src/arche/data_quality_report.py
+++ b/src/arche/data_quality_report.py
@@ -73,7 +73,7 @@ def create_figures(self, items: CloudItems):
         ).get_errors_count()
 
         garbage_symbols_result = self.report.results.get(
-            "Garbage Symbols", garbage_symbols(items)
+            "Garbage Symbols", garbage_symbols(items.df)
         )
 
         quality_estimation, field_accuracy = generate_quality_estimation(
diff --git a/src/arche/report.py b/src/arche/report.py
index 22baaf7..2ba7521 100755
--- a/src/arche/report.py
+++ b/src/arche/report.py
@@ -1,13 +1,16 @@
+from functools import partial
 from typing import Dict
 
 from arche import SH_URL
 from arche.rules.result import Level, Outcome, Result
 from colorama import Fore, Style
-from IPython.display import display, Markdown
+from IPython.display import display_markdown
 import numpy as np
 import pandas as pd
 import plotly.io as pio
 
+display_markdown = partial(display_markdown, raw=True)
+
 
 class Report:
     def __init__(self):
@@ -22,7 +25,7 @@ def write_color_text(text: str, color: Fore = Fore.RED) -> None:
 
     @staticmethod
     def write_rule_name(rule_name: str) -> None:
-        display(Markdown(f"{rule_name}:"))
+        display_markdown(f"{rule_name}:")
 
     @classmethod
     def write(cls, text: str) -> None:
@@ -58,15 +61,13 @@ def write_rule_outcome(cls, outcome: str, level: Level = Level.INFO) -> None:
     def write_details(self, short: bool = False, keys_limit: int = 10) -> None:
         for result in self.results.values():
             if result.detailed_messages_count:
-                display(
-                    Markdown(
-                        f"{result.name} ({result.detailed_messages_count} message(s)):"
-                    )
+                display_markdown(
+                    f"{result.name} ({result.detailed_messages_count} message(s)):"
                 )
                 self.write_rule_details(result, short, keys_limit)
             for f in result.figures:
                 pio.show(f)
-            display(Markdown("<br>"))
+            display_markdown("<br>")
 
     @classmethod
     def write_rule_details(
@@ -93,7 +94,9 @@ def write_detailed_errors(cls, errors: Dict, short: bool, keys_limit: int) -> No
                 keys = pd.Series(list(keys))
 
             sample = Report.sample_keys(keys, keys_limit)
-            display(Markdown(f"{len(keys)} items affected - {attribute}: {sample}"))
+            display_markdown(
+                f"{len(keys)} items affected - {attribute}: {sample}", raw=True
+            )
 
     @staticmethod
     def sample_keys(keys: pd.Series, limit: int) -> str:
diff --git a/src/arche/rules/others.py b/src/arche/rules/others.py
index b579a15..aba9ca6 100755
--- a/src/arche/rules/others.py
+++ b/src/arche/rules/others.py
@@ -1,9 +1,9 @@
 import re
 
-from arche.readers.items import Items
 from arche.rules.result import Outcome, Result
 import numpy as np
 import pandas as pd
+from tqdm import tqdm_notebook
 
 
 def compare_boolean_fields(
@@ -74,7 +74,7 @@ def fields_to_compare(source_df: pd.DataFrame, target_df: pd.DataFrame) -> bool:
     return False
 
 
-def garbage_symbols(items: Items) -> Result:
+def garbage_symbols(df: pd.DataFrame) -> Result:
     """Find unwanted symbols in `np.object` columns.
 
     Returns:
@@ -82,33 +82,36 @@ def garbage_symbols(items: Items) -> Result:
     """
     garbage = (
         r"(?P<spaces>^\s|\s$)"
-        r"|(?P<html_entities>&amp|&reg)"
-        r"|(?P<css>(?:(?:\.|#)[^#. ]+\s*){.+})"
-        r"|(?P<html_tags></?(h\d|b|u|i|div|ul|ol|li|table|tbody|th|tr|td|p|a|br|img|sup|SUP|"
+        r"|(?P<html_entities>&#?(?:\w*;|\d*;))"
+        r"|(?P<css>(?:\.|#|@)[^0-9{}#. ][^{}#.]+{(?:[^:;{}]+:[^:;{}]+;)+[\\n \n\t]*})"
+        r"|(?P<html_tags></?(?:h\d|b|u|i|div|ul|ol|li|table|tbody|th|tr|td|p|a|br|img|sup|SUP|"
         r"blockquote)\s*/?>|<!--|-->)"
     )
 
     errors = {}
     row_keys = set()
-    rule_result = Result("Garbage Symbols", items_count=len(items))
+    rule_result = Result("Garbage Symbols", items_count=len(df))
 
-    for column in items.flat_df.select_dtypes([np.object]):
-        matches = items.flat_df[column].str.extractall(garbage, flags=re.IGNORECASE)
-        matches = matches[["spaces", "html_entities", "css", "html_tags"]]
+    for column in tqdm_notebook(
+        df.select_dtypes([np.object]).columns, desc="Garbage Symbols"
+    ):
+        matches = df[column].apply(str).str.extractall(garbage, flags=re.IGNORECASE)
         if not matches.empty:
-            error_keys = items.flat_df.loc[matches.unstack().index.values].index
-            original_column = items.origin_column_name(column)
+            error_keys = df.loc[matches.unstack().index.values].index
             bad_texts = matches.stack().value_counts().index.sort_values().tolist()
+            bad_texts = [
+                bx.replace("\n", "\\n").replace("\t", "\\t")[:20] for bx in bad_texts
+            ]
             error = (
-                f"{len(error_keys)/len(items)*100:.1f}% of '{original_column}' "
-                f"values contain `{', '.join([t[:20] for t in bad_texts])}`"
+                f"{len(error_keys)/len(df)*100:.1f}% of '{column}' "
+                f"values contain `{', '.join(bad_texts)}`"
             )
+
             errors[error] = list(error_keys)
             row_keys = row_keys.union(error_keys)
-
     if errors:
         rule_result.add_error(
-            f"{len(row_keys)/len(items) * 100:.1f}% ({len(row_keys)}) items affected",
+            f"{len(row_keys)/len(df) * 100:.1f}% ({len(row_keys)}) items affected",
             errors=errors,
         )
         rule_result.err_items_count = len(row_keys)
diff --git a/tests/rules/test_others.py b/tests/rules/test_others.py
index 7a423a5..452ee8f 100755
--- a/tests/rules/test_others.py
+++ b/tests/rules/test_others.py
@@ -1,6 +1,5 @@
 from functools import partial
 
-from arche.readers.items import Items
 from arche.rules.others import compare_boolean_fields, garbage_symbols
 from arche.rules.result import Level, Outcome
 from conftest import create_named_df, create_result
@@ -75,7 +74,7 @@ def test_compare_boolean_fields(
         [
             {
                 "name": " Blacky Robeburned",
-                "address": "here goes &AMP",
+                "address": "here goes &amp;",
                 "phone": "<h1>144</h1>.sx-prime-pricing-long-row { float: left; }",
                 "rank": 14441,
             },
@@ -92,8 +91,8 @@ def test_compare_boolean_fields(
                     "100.0% (2) items affected",
                     None,
                     {
+                        "50.0% of 'address' values contain `&amp;`": [0],
                         "100.0% of 'name' values contain ` , -->, <!--`": [0, 1],
-                        "50.0% of 'address' values contain `&AMP`": [0],
                         (
                             "50.0% of 'phone' values contain "
                             "`.sx-prime-pricing-lo, </h1>, <h1>`"
@@ -116,7 +115,7 @@ def test_compare_boolean_fields(
 def test_garbage_symbols(
     raw_items, expected_messages, expected_items_count, expected_err_items_count
 ):
-    assert garbage_symbols(Items.from_array(raw_items)) == create_result(
+    assert garbage_symbols(pd.DataFrame(raw_items)) == create_result(
         "Garbage Symbols",
         expected_messages,
         items_count=expected_items_count,
diff --git a/tests/rules/test_result.py b/tests/rules/test_result.py
index e36fa4c..f248df0 100755
--- a/tests/rules/test_result.py
+++ b/tests/rules/test_result.py
@@ -77,7 +77,7 @@ def test_tensors_not_equal(source, target):
 )
 def test_show(mocker, capsys, message, stats, exp_md_output, exp_txt_outputs):
     mock_pio_show = mocker.patch("plotly.io.show", autospec=True)
-    mocked_md = mocker.patch("arche.report.Markdown", autospec=True)
+    mocked_md = mocker.patch("arche.report.display_markdown", autospec=True)
     mocked_print = mocker.patch("builtins.print", autospec=True)
     res = create_result("rule name here", message, stats=stats)
     res.show()
diff --git a/tests/test_arche.py b/tests/test_arche.py
index 4d108cb..3c634e0 100755
--- a/tests/test_arche.py
+++ b/tests/test_arche.py
@@ -251,7 +251,7 @@ def test_validate_with_json_schema(mocker, get_job_items, get_schema):
 
 
 def test_validate_with_json_schema_fails(mocker, get_job_items, get_schema):
-    mocked_md = mocker.patch("arche.report.Markdown", autospec=True)
+    mocked_md = mocker.patch("arche.report.display_markdown", autospec=True)
     url = f"{SH_URL}/112358/13/21/item/1"
     res = create_result(
         "JSON Schema Validation",
@@ -273,7 +273,7 @@ def test_validate_with_json_schema_fails(mocker, get_job_items, get_schema):
     assert len(a.report.results) == 1
     assert a.report.results.get("JSON Schema Validation") == res
     mocked_md.assert_any_call(
-        f"1 items affected - 'price' is a required property: [1]({url})"
+        f"1 items affected - 'price' is a required property: [1]({url})", raw=True
     )
 
 
diff --git a/tests/test_report.py b/tests/test_report.py
index 3403cc7..63beb02 100755
--- a/tests/test_report.py
+++ b/tests/test_report.py
@@ -35,7 +35,7 @@
 )
 def test_write_details(mocker, get_df, capsys, messages, expected_details):
     mock_pio_show = mocker.patch("plotly.io.show", autospec=True)
-    md_mock = mocker.patch("arche.report.Markdown", autospec=True)
+    md_mock = mocker.patch("arche.report.display_markdown", autospec=True)
 
     r = Report()
     for m in messages:
@@ -121,9 +121,9 @@ def test_write_rule_details(capsys, message, expected_details):
 )
 def test_write_detailed_errors(mocker, errors, short, keys_limit, expected_messages):
     mocker.patch("pandas.Series.sample", return_value=pd.Series("5"), autospec=True)
-    md_mock = mocker.patch("arche.report.Markdown", autospec=True)
+    md_mock = mocker.patch("arche.report.display_markdown", autospec=True)
     Report.write_detailed_errors(errors, short, keys_limit)
-    calls = [mocker.call(m) for m in expected_messages]
+    calls = [mocker.call(m, raw=True) for m in expected_messages]
     md_mock.assert_has_calls(calls, any_order=True)
 
 

From 6f3f9615ff57812d92209c2425c5a655bd792eda Mon Sep 17 00:00:00 2001
From: manycoding <manycoding@users.noreply.github.com>
Date: Mon, 1 Jul 2019 13:54:48 -0400
Subject: [PATCH 2/6] Escape backslash with codecs

---
 CHANGES.md                 |  1 +
 src/arche/rules/others.py  |  5 ++++-
 tests/rules/test_others.py | 32 +++++++++++++++-----------------
 3 files changed, 20 insertions(+), 18 deletions(-)

diff --git a/CHANGES.md b/CHANGES.md
index 3c25504..5b3d35f 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -22,6 +22,7 @@ Note that the top-most release is changes in the unreleased master branch on Git
 - `basic_json_schema()` works with `deleted` jobs
 - `start` is supported for Collections, #112
 - `enum` is counted as a `category` tag, #18
+- `Garbage Symbols` searches the `str` representation of nested fields instead of the expanded df, #130
 ### Fixed
 - `Arche.glance()`, #88
 - Item links in Schema validation errors, #89
diff --git a/src/arche/rules/others.py b/src/arche/rules/others.py
index aba9ca6..90e5ae7 100755
--- a/src/arche/rules/others.py
+++ b/src/arche/rules/others.py
@@ -1,3 +1,4 @@
+import codecs
 import re
 
 from arche.rules.result import Outcome, Result
@@ -99,8 +100,10 @@ def garbage_symbols(df: pd.DataFrame) -> Result:
         if not matches.empty:
             error_keys = df.loc[matches.unstack().index.values].index
             bad_texts = matches.stack().value_counts().index.sort_values().tolist()
+            # escape backslashes for markdown repr, `\n > \\n`
             bad_texts = [
-                bx.replace("\n", "\\n").replace("\t", "\\t")[:20] for bx in bad_texts
+                f"'{codecs.encode(bx, 'unicode_escape').decode()[:20]}'"
+                for bx in bad_texts
             ]
             error = (
                 f"{len(error_keys)/len(df)*100:.1f}% of '{column}' "
diff --git a/tests/rules/test_others.py b/tests/rules/test_others.py
index 452ee8f..1234fb8 100755
--- a/tests/rules/test_others.py
+++ b/tests/rules/test_others.py
@@ -71,31 +71,29 @@ def test_compare_boolean_fields(
 
 dirty_inputs = [
     (
-        [
-            {
-                "name": " Blacky Robeburned",
-                "address": "here goes &amp;",
-                "phone": "<h1>144</h1>.sx-prime-pricing-long-row { float: left; }",
-                "rank": 14441,
-            },
-            {
-                "name": "<!--Leprous Jim-->",
-                "address": "Some street",
-                "phone": "1144",
-                "rank": 2_039_857,
-            },
-        ],
+        {
+            "name": [" Blacky Robeburned", "\t<!--Leprous Jim-->"],
+            "address": ["<br> &amp;", "\xa0house"],
+            "phone": ["<h1>144</h1>.sx-prime-pricing-long-row { float: left; }", "11"],
+            "rank": [141, 2_039_857],
+        },
         {
             Level.ERROR: [
                 (
                     "100.0% (2) items affected",
                     None,
                     {
-                        "50.0% of 'address' values contain `&amp;`": [0],
-                        "100.0% of 'name' values contain ` , -->, <!--`": [0, 1],
+                        "100.0% of 'address' values contain `'&amp;', '<br>', '\\xa0'`": [
+                            0,
+                            1,
+                        ],
+                        "100.0% of 'name' values contain `'\\t', ' ', '-->', '<!--'`": [
+                            0,
+                            1,
+                        ],
                         (
                             "50.0% of 'phone' values contain "
-                            "`.sx-prime-pricing-lo, </h1>, <h1>`"
+                            "`'.sx-prime-pricing-lo', '</h1>', '<h1>'`"
                         ): [0],
                     },
                 )

From 837c0abcbe7054d71fa7e07cbe7df18ad1a6fb1d Mon Sep 17 00:00:00 2001
From: manycoding <manycoding@users.noreply.github.com>
Date: Mon, 1 Jul 2019 18:49:17 -0400
Subject: [PATCH 3/6] Add nested data to tests

---
 tests/rules/test_others.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tests/rules/test_others.py b/tests/rules/test_others.py
index 1234fb8..213da6d 100755
--- a/tests/rules/test_others.py
+++ b/tests/rules/test_others.py
@@ -73,8 +73,11 @@ def test_compare_boolean_fields(
     (
         {
             "name": [" Blacky Robeburned", "\t<!--Leprous Jim-->"],
-            "address": ["<br> &amp;", "\xa0house"],
-            "phone": ["<h1>144</h1>.sx-prime-pricing-long-row { float: left; }", "11"],
+            "address": [["<br> ", {"v", "&amp;"}], "\xa0house"],
+            "phone": [
+                "<h1>144</h1>.sx-prime-pricing-long-row { float: left; }",
+                {"a": "11"},
+            ],
             "rank": [141, 2_039_857],
         },
         {

From 7f6a94ec5d0db54cf0036d5105a6f2cc44964e14 Mon Sep 17 00:00:00 2001
From: Valery M <manycoding@users.noreply.github.com>
Date: Mon, 8 Jul 2019 12:52:12 -0400
Subject: [PATCH 4/6] More precise regex
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-Authored-By: Adrián Chaves <adrian@chaves.io>
---
 src/arche/rules/others.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/arche/rules/others.py b/src/arche/rules/others.py
index 90e5ae7..bcdf340 100755
--- a/src/arche/rules/others.py
+++ b/src/arche/rules/others.py
@@ -83,8 +83,8 @@ def garbage_symbols(df: pd.DataFrame) -> Result:
     """
     garbage = (
         r"(?P<spaces>^\s|\s$)"
-        r"|(?P<html_entities>&#?(?:\w*;|\d*;))"
-        r"|(?P<css>(?:\.|#|@)[^0-9{}#. ][^{}#.]+{(?:[^:;{}]+:[^:;{}]+;)+[\\n \n\t]*})"
+        r"|(?P<html_entities>&#?\w*;)"
+        r"|(?P<css>[.#@][^\d{}#.\s][^{}#.]+?{(?:[^:;{}]+?:[^:;{}]+?;)+?[\\n \n\t]*?})"
         r"|(?P<html_tags></?(?:h\d|b|u|i|div|ul|ol|li|table|tbody|th|tr|td|p|a|br|img|sup|SUP|"
         r"blockquote)\s*/?>|<!--|-->)"
     )

From 74488c8053ad53199f17232866c0e686804061f0 Mon Sep 17 00:00:00 2001
From: manycoding <manycoding@users.noreply.github.com>
Date: Mon, 8 Jul 2019 13:15:42 -0400
Subject: [PATCH 5/6] Non-greedy quantifiers, exact html_entities

---
 src/arche/rules/others.py  | 6 +++---
 tests/rules/test_others.py | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/arche/rules/others.py b/src/arche/rules/others.py
index bcdf340..4b23d38 100755
--- a/src/arche/rules/others.py
+++ b/src/arche/rules/others.py
@@ -83,10 +83,10 @@ def garbage_symbols(df: pd.DataFrame) -> Result:
     """
     garbage = (
         r"(?P<spaces>^\s|\s$)"
-        r"|(?P<html_entities>&#?\w*;)"
+        r"|(?P<html_entities>&[a-zA-Z]{2,}?;|&#\d*?;)"
         r"|(?P<css>[.#@][^\d{}#.\s][^{}#.]+?{(?:[^:;{}]+?:[^:;{}]+?;)+?[\\n \n\t]*?})"
-        r"|(?P<html_tags></?(?:h\d|b|u|i|div|ul|ol|li|table|tbody|th|tr|td|p|a|br|img|sup|SUP|"
-        r"blockquote)\s*/?>|<!--|-->)"
+        r"|(?P<html_tags></??(?:h\d|b|u|i|div|ul|ol|li|table|tbody|th|tr|td|p|a|br|img|sup|SUP|"
+        r"blockquote)\s*?/??>|<!--|-->)"
     )
 
     errors = {}
diff --git a/tests/rules/test_others.py b/tests/rules/test_others.py
index 213da6d..b7c1423 100755
--- a/tests/rules/test_others.py
+++ b/tests/rules/test_others.py
@@ -72,7 +72,7 @@ def test_compare_boolean_fields(
 dirty_inputs = [
     (
         {
-            "name": [" Blacky Robeburned", "\t<!--Leprous Jim-->"],
+            "name": [" Blacky Robeburned", "\t<!--Leprous &#9; Jim-->"],
             "address": [["<br> ", {"v", "&amp;"}], "\xa0house"],
             "phone": [
                 "<h1>144</h1>.sx-prime-pricing-long-row { float: left; }",
@@ -90,7 +90,7 @@ def test_compare_boolean_fields(
                             0,
                             1,
                         ],
-                        "100.0% of 'name' values contain `'\\t', ' ', '-->', '<!--'`": [
+                        "100.0% of 'name' values contain `'\\t', ' ', '&#9;', '-->', '<!--'`": [
                             0,
                             1,
                         ],

From 0fc1ed0ed96a51bcce397c7771e5ae29fe247453 Mon Sep 17 00:00:00 2001
From: Valery M <manycoding@users.noreply.github.com>
Date: Tue, 9 Jul 2019 13:01:46 -0400
Subject: [PATCH 6/6] Match trailing whitespace in css pattern with \s
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-Authored-By: Adrián Chaves <adrian@chaves.io>
---
 src/arche/rules/others.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/arche/rules/others.py b/src/arche/rules/others.py
index 4b23d38..6cd0d87 100755
--- a/src/arche/rules/others.py
+++ b/src/arche/rules/others.py
@@ -84,7 +84,7 @@ def garbage_symbols(df: pd.DataFrame) -> Result:
     garbage = (
         r"(?P<spaces>^\s|\s$)"
         r"|(?P<html_entities>&[a-zA-Z]{2,}?;|&#\d*?;)"
-        r"|(?P<css>[.#@][^\d{}#.\s][^{}#.]+?{(?:[^:;{}]+?:[^:;{}]+?;)+?[\\n \n\t]*?})"
+        r"|(?P<css>[.#@][^\d{}#.\s][^{}#.]+?{(?:[^:;{}]+?:[^:;{}]+?;)+?\s*?})"
         r"|(?P<html_tags></??(?:h\d|b|u|i|div|ul|ol|li|table|tbody|th|tr|td|p|a|br|img|sup|SUP|"
         r"blockquote)\s*?/??>|<!--|-->)"
     )