Fix duplicated garbage #130

Merged
merged 7 commits on Jul 9, 2019
Changes from 3 commits

1 change: 1 addition & 0 deletions CHANGES.md
@@ -22,6 +22,7 @@ Note that the top-most release is changes in the unreleased master branch on GitHub
 - `basic_json_schema()` works with `deleted` jobs
 - `start` is supported for Collections, #112
 - `enum` is counted as a `category` tag, #18
+- `Garbage Symbols` searches in str representation of nested fields instead of expanded df, #130
 ### Fixed
 - `Arche.glance()`, #88
 - Item links in Schema validation errors, #89

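For context, a minimal sketch (not part of the diff) of what this changelog entry means; the column name and data below are made up, and only pandas is assumed. Previously nested fields were expanded into flat columns before scanning; now the rule scans the str representation of each original column, so garbage inside lists and dicts is still caught:

import pandas as pd

# A hypothetical nested field: a list value that a flat per-column scan
# would only see after expansion into "address_0", "address_1", ...
df = pd.DataFrame({"address": [["<br> ", "&amp;"]]})

# str() of the cell exposes nested garbage to the regex-based rule:
print(df["address"].apply(str).str.contains("&amp;"))  # 0    True
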
2 changes: 1 addition & 1 deletion src/arche/arche.py
@@ -143,7 +143,7 @@ def data_quality_report(self, bucket: Optional[str] = None):
 
     @lru_cache(maxsize=32)
     def run_general_rules(self):
-        self.save_result(garbage_symbols(self.source_items))
+        self.save_result(garbage_symbols(self.source_items.df))
         df = self.source_items.df
         self.save_result(
             coverage_rules.check_fields_coverage(

2 changes: 1 addition & 1 deletion src/arche/data_quality_report.py
@@ -73,7 +73,7 @@ def create_figures(self, items: CloudItems):
         ).get_errors_count()
 
         garbage_symbols_result = self.report.results.get(
-            "Garbage Symbols", garbage_symbols(items)
+            "Garbage Symbols", garbage_symbols(items.df)
         )
 
         quality_estimation, field_accuracy = generate_quality_estimation(

19 changes: 11 additions & 8 deletions src/arche/report.py
@@ -1,13 +1,16 @@
+from functools import partial
 from typing import Dict
 
 from arche import SH_URL
 from arche.rules.result import Level, Outcome, Result
 from colorama import Fore, Style
-from IPython.display import display, Markdown
+from IPython.display import display_markdown
 import numpy as np
 import pandas as pd
 import plotly.io as pio
 
+display_markdown = partial(display_markdown, raw=True)
+
 
 class Report:
     def __init__(self):
@@ -22,7 +25,7 @@ def write_color_text(text: str, color: Fore = Fore.RED) -> None:
 
     @staticmethod
     def write_rule_name(rule_name: str) -> None:
-        display(Markdown(f"{rule_name}:"))
+        display_markdown(f"{rule_name}:")
 
     @classmethod
     def write(cls, text: str) -> None:
@@ -58,15 +61,13 @@ def write_rule_outcome(cls, outcome: str, level: Level = Level.INFO) -> None:
     def write_details(self, short: bool = False, keys_limit: int = 10) -> None:
         for result in self.results.values():
             if result.detailed_messages_count:
-                display(
-                    Markdown(
-                        f"{result.name} ({result.detailed_messages_count} message(s)):"
-                    )
+                display_markdown(
+                    f"{result.name} ({result.detailed_messages_count} message(s)):"
                 )
                 self.write_rule_details(result, short, keys_limit)
             for f in result.figures:
                 pio.show(f)
-        display(Markdown("<br>"))
+        display_markdown("<br>")
 
     @classmethod
     def write_rule_details(
@@ -93,7 +94,9 @@ def write_detailed_errors(cls, errors: Dict, short: bool, keys_limit: int) -> None:
         keys = pd.Series(list(keys))
 
         sample = Report.sample_keys(keys, keys_limit)
-        display(Markdown(f"{len(keys)} items affected - {attribute}: {sample}"))
+        display_markdown(
+            f"{len(keys)} items affected - {attribute}: {sample}", raw=True
+        )
 
     @staticmethod
     def sample_keys(keys: pd.Series, limit: int) -> str:

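As an aside, a minimal sketch (not part of the diff; assumes IPython is installed) of the partial-application trick the new imports rely on — rebinding display_markdown so every call publishes the raw markdown string directly instead of wrapping it in a Markdown object:

from functools import partial
from IPython.display import display_markdown

# Freeze raw=True so callers don't have to pass it on every call.
display_markdown = partial(display_markdown, raw=True)

display_markdown("**bold**")  # same as display_markdown("**bold**", raw=True)

Passing raw=True again at a call site, as write_detailed_errors does above, is harmless: keyword arguments supplied at call time override those frozen by partial.
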
36 changes: 21 additions & 15 deletions src/arche/rules/others.py
@@ -1,9 +1,10 @@
+import codecs
 import re
 
-from arche.readers.items import Items
 from arche.rules.result import Outcome, Result
 import numpy as np
 import pandas as pd
+from tqdm import tqdm_notebook
 
 
 def compare_boolean_fields(
@@ -74,41 +75,46 @@ def fields_to_compare(source_df: pd.DataFrame, target_df: pd.DataFrame) -> bool:
     return False
 
 
-def garbage_symbols(items: Items) -> Result:
+def garbage_symbols(df: pd.DataFrame) -> Result:
     """Find unwanted symbols in `np.object` columns.
 
     Returns:
         A result containing item keys per field which contained any trash symbol
     """
     garbage = (
         r"(?P<spaces>^\s|\s$)"
-        r"|(?P<html_entities>&amp|&reg)"
-        r"|(?P<css>(?:(?:\.|#)[^#. ]+\s*){.+})"
-        r"|(?P<html_tags></?(h\d|b|u|i|div|ul|ol|li|table|tbody|th|tr|td|p|a|br|img|sup|SUP|"
+        r"|(?P<html_entities>&#?(?:\w*;|\d*;))"
+        r"|(?P<css>(?:\.|#|@)[^0-9{}#. ][^{}#.]+{(?:[^:;{}]+:[^:;{}]+;)+[\\n \n\t]*})"
+        r"|(?P<html_tags></?(?:h\d|b|u|i|div|ul|ol|li|table|tbody|th|tr|td|p|a|br|img|sup|SUP|"
         r"blockquote)\s*/?>|<!--|-->)"
     )
 
     errors = {}
     row_keys = set()
-    rule_result = Result("Garbage Symbols", items_count=len(items))
+    rule_result = Result("Garbage Symbols", items_count=len(df))
 
-    for column in items.flat_df.select_dtypes([np.object]):
-        matches = items.flat_df[column].str.extractall(garbage, flags=re.IGNORECASE)
-        matches = matches[["spaces", "html_entities", "css", "html_tags"]]
+    for column in tqdm_notebook(
+        df.select_dtypes([np.object]).columns, desc="Garbage Symbols"
+    ):
+        matches = df[column].apply(str).str.extractall(garbage, flags=re.IGNORECASE)
        if not matches.empty:
-            error_keys = items.flat_df.loc[matches.unstack().index.values].index
-            original_column = items.origin_column_name(column)
+            error_keys = df.loc[matches.unstack().index.values].index
             bad_texts = matches.stack().value_counts().index.sort_values().tolist()
+            # escape backslashes for markdown repr, `\n > \\n`
+            bad_texts = [
+                f"'{codecs.encode(bx, 'unicode_escape').decode()[:20]}'"
+                for bx in bad_texts
+            ]
             error = (
-                f"{len(error_keys)/len(items)*100:.1f}% of '{original_column}' "
-                f"values contain `{', '.join([t[:20] for t in bad_texts])}`"
+                f"{len(error_keys)/len(df)*100:.1f}% of '{column}' "
+                f"values contain `{', '.join(bad_texts)}`"
             )
 
             errors[error] = list(error_keys)
             row_keys = row_keys.union(error_keys)
 
     if errors:
         rule_result.add_error(
-            f"{len(row_keys)/len(items) * 100:.1f}% ({len(row_keys)}) items affected",
+            f"{len(row_keys)/len(df) * 100:.1f}% ({len(row_keys)}) items affected",
             errors=errors,
         )
     rule_result.err_items_count = len(row_keys)

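A rough sketch (not part of the diff; pandas assumed, data made up) of the two behavioural changes above: values are stringified before matching, so garbage inside nested containers is caught, and the stricter html_entities pattern now requires a trailing ";":

import re
import pandas as pd

garbage = r"(?P<html_entities>&#?(?:\w*;|\d*;))"
df = pd.DataFrame({"phone": [{"a": "11"}, "call &amp; ask", "plain &AMP"]})

# .apply(str) turns the dict in row 0 into "{'a': '11'}" before the scan;
# row 1 matches on "&amp;"; row 2 no longer matches since ";" is required.
matches = df["phone"].apply(str).str.extractall(garbage, flags=re.IGNORECASE)
print(matches)
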
38 changes: 19 additions & 19 deletions tests/rules/test_others.py
@@ -1,6 +1,5 @@
 from functools import partial
 
-from arche.readers.items import Items
 from arche.rules.others import compare_boolean_fields, garbage_symbols
 from arche.rules.result import Level, Outcome
 from conftest import create_named_df, create_result
@@ -72,31 +71,32 @@ def test_compare_boolean_fields(
 
 dirty_inputs = [
     (
-        [
-            {
-                "name": " Blacky Robeburned",
-                "address": "here goes &AMP",
-                "phone": "<h1>144</h1>.sx-prime-pricing-long-row { float: left; }",
-                "rank": 14441,
-            },
-            {
-                "name": "<!--Leprous Jim-->",
-                "address": "Some street",
-                "phone": "1144",
-                "rank": 2_039_857,
-            },
-        ],
+        {
+            "name": [" Blacky Robeburned", "\t<!--Leprous Jim-->"],
+            "address": [["<br> ", {"v", "&amp;"}], "\xa0house"],
+            "phone": [
+                "<h1>144</h1>.sx-prime-pricing-long-row { float: left; }",
+                {"a": "11"},
+            ],
+            "rank": [141, 2_039_857],
+        },
         {
             Level.ERROR: [
                 (
                     "100.0% (2) items affected",
                     None,
                     {
-                        "100.0% of 'name' values contain ` , -->, <!--`": [0, 1],
-                        "50.0% of 'address' values contain `&AMP`": [0],
+                        "100.0% of 'address' values contain `'&amp;', '<br>', '\\xa0'`": [
+                            0,
+                            1,
+                        ],
+                        "100.0% of 'name' values contain `'\\t', ' ', '-->', '<!--'`": [
+                            0,
+                            1,
+                        ],
                         (
                             "50.0% of 'phone' values contain "
-                            "`.sx-prime-pricing-lo, </h1>, <h1>`"
+                            "`'.sx-prime-pricing-lo', '</h1>', '<h1>'`"
                         ): [0],
                     },
                 )
@@ -116,7 +116,7 @@ def test_compare_boolean_fields(
 def test_garbage_symbols(
     raw_items, expected_messages, expected_items_count, expected_err_items_count
 ):
-    assert garbage_symbols(Items.from_array(raw_items)) == create_result(
+    assert garbage_symbols(pd.DataFrame(raw_items)) == create_result(
         "Garbage Symbols",
         expected_messages,
         items_count=expected_items_count,

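A small sketch (standard library only; not part of the diff) of the unicode_escape quoting these updated fixtures expect — invisible garbage like "\t" or "\xa0" becomes a readable token in the error message:

import codecs

for bad in ["\t", "\xa0", "-->"]:
    # mirrors the bad_texts formatting in garbage_symbols
    print(f"'{codecs.encode(bad, 'unicode_escape').decode()[:20]}'")
# prints: '\t'  '\xa0'  '-->'
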
2 changes: 1 addition & 1 deletion tests/rules/test_result.py
@@ -77,7 +77,7 @@ def test_tensors_not_equal(source, target):
 )
 def test_show(mocker, capsys, message, stats, exp_md_output, exp_txt_outputs):
     mock_pio_show = mocker.patch("plotly.io.show", autospec=True)
-    mocked_md = mocker.patch("arche.report.Markdown", autospec=True)
+    mocked_md = mocker.patch("arche.report.display_markdown", autospec=True)
     mocked_print = mocker.patch("builtins.print", autospec=True)
     res = create_result("rule name here", message, stats=stats)
     res.show()

4 changes: 2 additions & 2 deletions tests/test_arche.py
@@ -251,7 +251,7 @@ def test_validate_with_json_schema(mocker, get_job_items, get_schema):
 
 
 def test_validate_with_json_schema_fails(mocker, get_job_items, get_schema):
-    mocked_md = mocker.patch("arche.report.Markdown", autospec=True)
+    mocked_md = mocker.patch("arche.report.display_markdown", autospec=True)
     url = f"{SH_URL}/112358/13/21/item/1"
     res = create_result(
         "JSON Schema Validation",
@@ -273,7 +273,7 @@ def test_validate_with_json_schema_fails(mocker, get_job_items, get_schema):
     assert len(a.report.results) == 1
     assert a.report.results.get("JSON Schema Validation") == res
     mocked_md.assert_any_call(
-        f"1 items affected - 'price' is a required property: [1]({url})"
+        f"1 items affected - 'price' is a required property: [1]({url})", raw=True
     )

6 changes: 3 additions & 3 deletions tests/test_report.py
@@ -35,7 +35,7 @@
 )
 def test_write_details(mocker, get_df, capsys, messages, expected_details):
     mock_pio_show = mocker.patch("plotly.io.show", autospec=True)
-    md_mock = mocker.patch("arche.report.Markdown", autospec=True)
+    md_mock = mocker.patch("arche.report.display_markdown", autospec=True)
 
     r = Report()
     for m in messages:
@@ -121,9 +121,9 @@ def test_write_rule_details(capsys, message, expected_details):
 )
 def test_write_detailed_errors(mocker, errors, short, keys_limit, expected_messages):
     mocker.patch("pandas.Series.sample", return_value=pd.Series("5"), autospec=True)
-    md_mock = mocker.patch("arche.report.Markdown", autospec=True)
+    md_mock = mocker.patch("arche.report.display_markdown", autospec=True)
     Report.write_detailed_errors(errors, short, keys_limit)
-    calls = [mocker.call(m) for m in expected_messages]
+    calls = [mocker.call(m, raw=True) for m in expected_messages]
     md_mock.assert_has_calls(calls, any_order=True)