From 75b63269a7f73bbca6055349c8ad3fae6a1713b7 Mon Sep 17 00:00:00 2001 From: manycoding Date: Fri, 28 Jun 2019 19:56:48 -0400 Subject: [PATCH 1/6] Do not use flat_df in garbage_symbols --- src/arche/arche.py | 2 +- src/arche/data_quality_report.py | 2 +- src/arche/report.py | 19 ++++++++++-------- src/arche/rules/others.py | 33 +++++++++++++++++--------------- tests/rules/test_others.py | 7 +++---- tests/rules/test_result.py | 2 +- tests/test_arche.py | 4 ++-- tests/test_report.py | 6 +++--- 8 files changed, 40 insertions(+), 35 deletions(-) diff --git a/src/arche/arche.py b/src/arche/arche.py index ec6e58a..8e5dd32 100755 --- a/src/arche/arche.py +++ b/src/arche/arche.py @@ -143,7 +143,7 @@ def data_quality_report(self, bucket: Optional[str] = None): @lru_cache(maxsize=32) def run_general_rules(self): - self.save_result(garbage_symbols(self.source_items)) + self.save_result(garbage_symbols(self.source_items.df)) df = self.source_items.df self.save_result( coverage_rules.check_fields_coverage( diff --git a/src/arche/data_quality_report.py b/src/arche/data_quality_report.py index 7ba0ad8..90f0924 100755 --- a/src/arche/data_quality_report.py +++ b/src/arche/data_quality_report.py @@ -73,7 +73,7 @@ def create_figures(self, items: CloudItems): ).get_errors_count() garbage_symbols_result = self.report.results.get( - "Garbage Symbols", garbage_symbols(items) + "Garbage Symbols", garbage_symbols(items.df) ) quality_estimation, field_accuracy = generate_quality_estimation( diff --git a/src/arche/report.py b/src/arche/report.py index 22baaf7..2ba7521 100755 --- a/src/arche/report.py +++ b/src/arche/report.py @@ -1,13 +1,16 @@ +from functools import partial from typing import Dict from arche import SH_URL from arche.rules.result import Level, Outcome, Result from colorama import Fore, Style -from IPython.display import display, Markdown +from IPython.display import display_markdown import numpy as np import pandas as pd import plotly.io as pio +display_markdown = partial(display_markdown, raw=True) + class Report: def __init__(self): @@ -22,7 +25,7 @@ def write_color_text(text: str, color: Fore = Fore.RED) -> None: @staticmethod def write_rule_name(rule_name: str) -> None: - display(Markdown(f"{rule_name}:")) + display_markdown(f"{rule_name}:") @classmethod def write(cls, text: str) -> None: @@ -58,15 +61,13 @@ def write_rule_outcome(cls, outcome: str, level: Level = Level.INFO) -> None: def write_details(self, short: bool = False, keys_limit: int = 10) -> None: for result in self.results.values(): if result.detailed_messages_count: - display( - Markdown( - f"{result.name} ({result.detailed_messages_count} message(s)):" - ) + display_markdown( + f"{result.name} ({result.detailed_messages_count} message(s)):" ) self.write_rule_details(result, short, keys_limit) for f in result.figures: pio.show(f) - display(Markdown("
")) + display_markdown("
") @classmethod def write_rule_details( @@ -93,7 +94,9 @@ def write_detailed_errors(cls, errors: Dict, short: bool, keys_limit: int) -> No keys = pd.Series(list(keys)) sample = Report.sample_keys(keys, keys_limit) - display(Markdown(f"{len(keys)} items affected - {attribute}: {sample}")) + display_markdown( + f"{len(keys)} items affected - {attribute}: {sample}", raw=True + ) @staticmethod def sample_keys(keys: pd.Series, limit: int) -> str: diff --git a/src/arche/rules/others.py b/src/arche/rules/others.py index b579a15..aba9ca6 100755 --- a/src/arche/rules/others.py +++ b/src/arche/rules/others.py @@ -1,9 +1,9 @@ import re -from arche.readers.items import Items from arche.rules.result import Outcome, Result import numpy as np import pandas as pd +from tqdm import tqdm_notebook def compare_boolean_fields( @@ -74,7 +74,7 @@ def fields_to_compare(source_df: pd.DataFrame, target_df: pd.DataFrame) -> bool: return False -def garbage_symbols(items: Items) -> Result: +def garbage_symbols(df: pd.DataFrame) -> Result: """Find unwanted symbols in `np.object` columns. Returns: @@ -82,33 +82,36 @@ def garbage_symbols(items: Items) -> Result: """ garbage = ( r"(?P^\s|\s$)" - r"|(?P&|®)" - r"|(?P(?:(?:\.|#)[^#. ]+\s*){.+})" - r"|(?P&#?(?:\w*;|\d*;))" + r"|(?P(?:\.|#|@)[^0-9{}#. ][^{}#.]+{(?:[^:;{}]+:[^:;{}]+;)+[\\n \n\t]*})" + r"|(?P|)" ) errors = {} row_keys = set() - rule_result = Result("Garbage Symbols", items_count=len(items)) + rule_result = Result("Garbage Symbols", items_count=len(df)) - for column in items.flat_df.select_dtypes([np.object]): - matches = items.flat_df[column].str.extractall(garbage, flags=re.IGNORECASE) - matches = matches[["spaces", "html_entities", "css", "html_tags"]] + for column in tqdm_notebook( + df.select_dtypes([np.object]).columns, desc="Garbage Symbols" + ): + matches = df[column].apply(str).str.extractall(garbage, flags=re.IGNORECASE) if not matches.empty: - error_keys = items.flat_df.loc[matches.unstack().index.values].index - original_column = items.origin_column_name(column) + error_keys = df.loc[matches.unstack().index.values].index bad_texts = matches.stack().value_counts().index.sort_values().tolist() + bad_texts = [ + bx.replace("\n", "\\n").replace("\t", "\\t")[:20] for bx in bad_texts + ] error = ( - f"{len(error_keys)/len(items)*100:.1f}% of '{original_column}' " - f"values contain `{', '.join([t[:20] for t in bad_texts])}`" + f"{len(error_keys)/len(df)*100:.1f}% of '{column}' " + f"values contain `{', '.join(bad_texts)}`" ) + errors[error] = list(error_keys) row_keys = row_keys.union(error_keys) - if errors: rule_result.add_error( - f"{len(row_keys)/len(items) * 100:.1f}% ({len(row_keys)}) items affected", + f"{len(row_keys)/len(df) * 100:.1f}% ({len(row_keys)}) items affected", errors=errors, ) rule_result.err_items_count = len(row_keys) diff --git a/tests/rules/test_others.py b/tests/rules/test_others.py index 7a423a5..452ee8f 100755 --- a/tests/rules/test_others.py +++ b/tests/rules/test_others.py @@ -1,6 +1,5 @@ from functools import partial -from arche.readers.items import Items from arche.rules.others import compare_boolean_fields, garbage_symbols from arche.rules.result import Level, Outcome from conftest import create_named_df, create_result @@ -75,7 +74,7 @@ def test_compare_boolean_fields( [ { "name": " Blacky Robeburned", - "address": "here goes &", + "address": "here goes &", "phone": "

144

.sx-prime-pricing-long-row { float: left; }", "rank": 14441, }, @@ -92,8 +91,8 @@ def test_compare_boolean_fields( "100.0% (2) items affected", None, { + "50.0% of 'address' values contain `&`": [0], "100.0% of 'name' values contain ` , -->, ", - "address": "Some street", - "phone": "1144", - "rank": 2_039_857, - }, - ], + { + "name": [" Blacky Robeburned", "\t"], + "address": ["
&", "\xa0house"], + "phone": ["

144

.sx-prime-pricing-long-row { float: left; }", "11"], + "rank": [141, 2_039_857], + }, { Level.ERROR: [ ( "100.0% (2) items affected", None, { - "50.0% of 'address' values contain `&`": [0], - "100.0% of 'name' values contain ` , -->, ', '"], - "address": ["
&", "\xa0house"], - "phone": ["

144

.sx-prime-pricing-long-row { float: left; }", "11"], + "address": [["
", {"v", "&"}], "\xa0house"], + "phone": [ + "

144

.sx-prime-pricing-long-row { float: left; }", + {"a": "11"}, + ], "rank": [141, 2_039_857], }, { From 7f6a94ec5d0db54cf0036d5105a6f2cc44964e14 Mon Sep 17 00:00:00 2001 From: Valery M Date: Mon, 8 Jul 2019 12:52:12 -0400 Subject: [PATCH 4/6] More precise regex MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Adrián Chaves --- src/arche/rules/others.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/arche/rules/others.py b/src/arche/rules/others.py index 90e5ae7..bcdf340 100755 --- a/src/arche/rules/others.py +++ b/src/arche/rules/others.py @@ -83,8 +83,8 @@ def garbage_symbols(df: pd.DataFrame) -> Result: """ garbage = ( r"(?P^\s|\s$)" - r"|(?P&#?(?:\w*;|\d*;))" - r"|(?P(?:\.|#|@)[^0-9{}#. ][^{}#.]+{(?:[^:;{}]+:[^:;{}]+;)+[\\n \n\t]*})" + r"|(?P&#?\w*;)" + r"|(?P[.#@][^\d{}#.\s][^{}#.]+?{(?:[^:;{}]+?:[^:;{}]+?;)+?[\\n \n\t]*?})" r"|(?P|)" ) From 74488c8053ad53199f17232866c0e686804061f0 Mon Sep 17 00:00:00 2001 From: manycoding Date: Mon, 8 Jul 2019 13:15:42 -0400 Subject: [PATCH 5/6] Non-greedy quantifiers, exact html_entities --- src/arche/rules/others.py | 6 +++--- tests/rules/test_others.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/arche/rules/others.py b/src/arche/rules/others.py index bcdf340..4b23d38 100755 --- a/src/arche/rules/others.py +++ b/src/arche/rules/others.py @@ -83,10 +83,10 @@ def garbage_symbols(df: pd.DataFrame) -> Result: """ garbage = ( r"(?P^\s|\s$)" - r"|(?P&#?\w*;)" + r"|(?P&[a-zA-Z]{2,}?;|&#\d*?;)" r"|(?P[.#@][^\d{}#.\s][^{}#.]+?{(?:[^:;{}]+?:[^:;{}]+?;)+?[\\n \n\t]*?})" - r"|(?P|)" + r"|(?P|)" ) errors = {} diff --git a/tests/rules/test_others.py b/tests/rules/test_others.py index 213da6d..b7c1423 100755 --- a/tests/rules/test_others.py +++ b/tests/rules/test_others.py @@ -72,7 +72,7 @@ def test_compare_boolean_fields( dirty_inputs = [ ( { - "name": [" Blacky Robeburned", "\t"], + "name": [" Blacky Robeburned", "\t"], "address": [["
", {"v", "&"}], "\xa0house"], "phone": [ "

144

.sx-prime-pricing-long-row { float: left; }", @@ -90,7 +90,7 @@ def test_compare_boolean_fields( 0, 1, ], - "100.0% of 'name' values contain `'\\t', ' ', '-->', '', ')" )