diff --git a/hespi/hespi.py b/hespi/hespi.py index b08da02..731b51d 100644 --- a/hespi/hespi.py +++ b/hespi/hespi.py @@ -231,8 +231,8 @@ def institutional_label_detect(self, component, stub, output_dir) -> Dict: else: detection_results[key] = [detection_results[key], value] - # Determining Recognised Text - + + # Determining Recognised Text results = {} for key, value in detection_results.items(): if 'ocr_results' in key: @@ -283,8 +283,16 @@ def institutional_label_detect(self, component, stub, output_dir) -> Dict: if field_name in self.reference.keys(): results[f"{field_name}_match_score"] = match_score_TrOCR + # splitting multiple image files into two columns + elif 'image' in key: + if len(value) >1: + for i in value: + if value.index(i) != 0: + results[f"{key}_{value.index(i)}"] = i + detection_results[key] = value[0] + detection_results.update(results) - + return detection_results diff --git a/hespi/templates/report-template.html b/hespi/templates/report-template.html index 2849cd8..85d1f55 100644 --- a/hespi/templates/report-template.html +++ b/hespi/templates/report-template.html @@ -281,8 +281,8 @@

Institutional Labels

{% endif %} {{ row[field] }} - {{ row[field+"_Tesseract"] }} - {{ row[field+"_TrOCR"] }} + {{ row[field+"_Tesseract_original"] }} + {{ row[field+"_TrOCR_original"] }} {% endif %} {% endfor %} diff --git a/hespi/util.py b/hespi/util.py index 19cc0bb..0574f5e 100644 --- a/hespi/util.py +++ b/hespi/util.py @@ -1,6 +1,7 @@ from pathlib import Path from typing import Dict import pandas as pd +import numpy as np from rich.console import Console from difflib import get_close_matches, SequenceMatcher @@ -67,7 +68,44 @@ def label_sort_key(s): return 11 else: return 12 - + + +def process_dicts(row, field_name): + _TrOCR_original = [] + _TrOCR_adjusted = [] + _TrOCR_match_score = [] + Tesseract_original = [] + Tesseract_adjusted = [] + Tesseract_match_score = [] + + for d in row: + if d['ocr'] == '_TrOCR': + _TrOCR_original.append(d['original_text_detected']) + _TrOCR_adjusted.append(d['adjusted_text']) + _TrOCR_match_score.append(d['match_score']) + elif d['ocr'] == '_Tesseract': + Tesseract_original.append(d['original_text_detected']) + Tesseract_adjusted.append(d['adjusted_text']) + Tesseract_match_score.append(d['match_score']) + + return { + f"{field_name}_TrOCR_original": _TrOCR_original, + f"{field_name}_TrOCR_adjusted": _TrOCR_adjusted, + f"{field_name}_TrOCR_match_score": _TrOCR_match_score, + f"{field_name}_Tesseract_original": Tesseract_original, + f"{field_name}_Tesseract_adjusted": Tesseract_adjusted, + f"{field_name}_Tesseract_match_score": Tesseract_match_score + } + + +def flatten_single_item_lists(lst): + if isinstance(lst, list): + if len(lst) == 1: + return lst[0] + elif len(lst) == 0: + return '' + return lst + def ocr_data_df(data: dict, output_path: Path=None) -> pd.DataFrame: """ @@ -85,6 +123,17 @@ def ocr_data_df(data: dict, output_path: Path=None) -> pd.DataFrame: df = pd.DataFrame.from_dict(data, orient="index") df = df.reset_index().rename(columns={"index": "institutional label"}) + # Splitting the ocr_results columns into separate original text, 
adjusted, and score + # Enables the html report to pull the data + + for col in df.columns: + if 'ocr_results' in col: + field_name = col.replace('_ocr_results', '') + new_columns = df[f"{field_name}_ocr_results"].apply(process_dicts, field_name=field_name).apply(pd.Series) + + df = pd.concat([df, new_columns], axis=1) + + # insert columns not included in dataframe, and re-order # including any columns not included in col_options to account for any updates col_options = [ "institutional label", "id" ] + label_fields @@ -92,11 +141,18 @@ def ocr_data_df(data: dict, output_path: Path=None) -> pd.DataFrame: missing_cols = [col for col in col_options if col not in df.columns] df[missing_cols] = "" - score_cols = sorted([col for col in df.columns if '_match_score' in col], key=label_sort_key) - ocr_cols = sorted([col for col in df.columns if '_ocr_results' in col], key=label_sort_key) - image_files_cols = sorted([col for col in df.columns if '_image' in col or 'predictions' in col], key=label_sort_key) + # creating break columns + df['<--results|ocr_details-->'] = ' | ' + df['image_links-->'] = ' | ' + df['ocr_results_split-->'] = ' | ' - cols = col_options + score_cols + ['label_classification'] + ocr_cols + image_files_cols + # grouping other columns + score_cols = sorted([col for col in df.columns if '_match_score' in col and 'Tesseract' not in col and 'TrOCR' not in col], key=label_sort_key) + ocr_cols = ['<--results|ocr_details-->'] + sorted([col for col in df.columns if '_ocr_results' in col], key=label_sort_key) + image_files_cols = ['image_links-->'] + sorted([col for col in df.columns if '_image' in col or 'predictions' in col], key=label_sort_key) + result_cols = ['ocr_results_split-->'] + sorted([col for col in df.columns if 'Tesseract' in col or 'TrOCR' in col], key=label_sort_key) + + cols = col_options + score_cols + ['label_classification'] + ocr_cols + image_files_cols + result_cols extra_cols = [col for col in df.columns if col not in cols] @@ -104,7 
+160,10 @@ def ocr_data_df(data: dict, output_path: Path=None) -> pd.DataFrame: df = df[cols] df = df.fillna('') - + # flattening all the lists so that if only one item, the list is removed, if no items, list is replaced with an empty string + for col in df.columns: + df[col] = df[col].apply(flatten_single_item_lists) + # CSV output if output_path: output_path = Path(output_path)