Skip to content

Commit

Permalink
🐛 returned html table output
Browse files Browse the repository at this point in the history
  • Loading branch information
EmilyFitzgerald committed Mar 2, 2024
1 parent 2dd6a16 commit 1ca9cb0
Show file tree
Hide file tree
Showing 3 changed files with 78 additions and 11 deletions.
14 changes: 11 additions & 3 deletions hespi/hespi.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,8 +231,8 @@ def institutional_label_detect(self, component, stub, output_dir) -> Dict:
else:
detection_results[key] = [detection_results[key], value]

# Determining Recognised Text


# Determining Recognised Text
results = {}
for key, value in detection_results.items():
if 'ocr_results' in key:
Expand Down Expand Up @@ -283,8 +283,16 @@ def institutional_label_detect(self, component, stub, output_dir) -> Dict:
if field_name in self.reference.keys():
results[f"{field_name}_match_score"] = match_score_TrOCR

# splitting multiple image files into two columns
elif 'image' in key:
if len(value) >1:
for i in value:
if value.index(i) != 0:
results[f"{key}_{value.index(i)}"] = i
detection_results[key] = value[0]

detection_results.update(results)

return detection_results


Expand Down
4 changes: 2 additions & 2 deletions hespi/templates/report-template.html
Original file line number Diff line number Diff line change
Expand Up @@ -281,8 +281,8 @@ <h2>Institutional Labels</h2>
{% endif %}
</td>
<td>{{ row[field] }}</td>
<td>{{ row[field+"_Tesseract"] }}</td>
<td>{{ row[field+"_TrOCR"] }}</td>
<td>{{ row[field+"_Tesseract_original"] }}</td>
<td>{{ row[field+"_TrOCR_original"] }}</td>
</div>
{% endif %}
{% endfor %}
Expand Down
71 changes: 65 additions & 6 deletions hespi/util.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from pathlib import Path
from typing import Dict
import pandas as pd
import numpy as np
from rich.console import Console
from difflib import get_close_matches, SequenceMatcher

Expand Down Expand Up @@ -67,7 +68,44 @@ def label_sort_key(s):
return 11
else:
return 12



def process_dicts(row, field_name):
    """Split one cell of OCR result entries into per-engine lists.

    Each entry is a mapping with an ``'ocr'`` tag (``'_TrOCR'`` or
    ``'_Tesseract'``) plus ``'original_text_detected'``, ``'adjusted_text'``
    and ``'match_score'`` values. Entries carrying any other tag are ignored.

    Args:
        row: The OCR result entries for one field. Normally an iterable of
            dicts. A single dict is treated as a one-entry list, and a
            missing cell (``None`` or a float such as NaN) is treated as
            having no entries instead of raising ``TypeError``.
        field_name: Prefix used for every key of the returned dict.

    Returns:
        A dict with six keys, in this order:
        ``{field_name}_TrOCR_original``, ``{field_name}_TrOCR_adjusted``,
        ``{field_name}_TrOCR_match_score``, ``{field_name}_Tesseract_original``,
        ``{field_name}_Tesseract_adjusted``, ``{field_name}_Tesseract_match_score``.
        Each value is a list holding the matching values in entry order.

    Raises:
        KeyError: If an entry lacks ``'ocr'``, or an entry for a recognised
            engine lacks one of the three value keys.
    """
    # (tag stored under entry['ocr'], label used in the output key)
    engines = (("_TrOCR", "TrOCR"), ("_Tesseract", "Tesseract"))
    # (key read from the entry, suffix used in the output key)
    parts = (
        ("original_text_detected", "original"),
        ("adjusted_text", "adjusted"),
        ("match_score", "match_score"),
    )

    # Normalise the cell: missing values (None / NaN float) have nothing to
    # iterate, and a lone dict would otherwise be iterated key-by-key.
    if row is None or isinstance(row, float):
        entries = ()
    elif isinstance(row, dict):
        entries = (row,)
    else:
        entries = row

    # Pre-create every key so the result always has the same six keys in the
    # same order, even when no entry matches.
    columns = {
        f"{field_name}_{label}_{suffix}": []
        for _, label in engines
        for _, suffix in parts
    }

    for entry in entries:
        tag = entry['ocr']
        for engine_tag, label in engines:
            if tag == engine_tag:
                for source_key, suffix in parts:
                    columns[f"{field_name}_{label}_{suffix}"].append(entry[source_key])
                break

    return columns


def flatten_single_item_lists(lst):
    """Collapse trivial lists to a scalar.

    Args:
        lst: Any value.

    Returns:
        ``''`` if ``lst`` is an empty list, the sole element if ``lst`` is a
        one-element list, and ``lst`` itself otherwise (longer lists and
        non-list values pass through untouched).
    """
    # Only genuine lists are collapsed; tuples, strings, etc. are left alone.
    if not isinstance(lst, list):
        return lst

    size = len(lst)
    if size == 0:
        return ''
    if size == 1:
        return lst[0]
    return lst


def ocr_data_df(data: dict, output_path: Path=None) -> pd.DataFrame:
"""
Expand All @@ -85,26 +123,47 @@ def ocr_data_df(data: dict, output_path: Path=None) -> pd.DataFrame:
df = pd.DataFrame.from_dict(data, orient="index")
df = df.reset_index().rename(columns={"index": "institutional label"})

# Splitting each ocr_results column into separate original-text, adjusted-text, and match-score columns
# Enables the html report to pull the data

for col in df.columns:
if 'ocr_results' in col:
field_name = col.replace('_ocr_results', '')
new_columns = df[f"{field_name}_ocr_results"].apply(process_dicts, field_name=field_name).apply(pd.Series)

df = pd.concat([df, new_columns], axis=1)


# insert columns not included in dataframe, and re-order
# including any columns not included in col_options to account for any updates
col_options = [ "institutional label", "id" ] + label_fields

missing_cols = [col for col in col_options if col not in df.columns]
df[missing_cols] = ""

score_cols = sorted([col for col in df.columns if '_match_score' in col], key=label_sort_key)
ocr_cols = sorted([col for col in df.columns if '_ocr_results' in col], key=label_sort_key)
image_files_cols = sorted([col for col in df.columns if '_image' in col or 'predictions' in col], key=label_sort_key)
# creating break columns
df['<--results|ocr_details-->'] = ' | '
df['image_links-->'] = ' | '
df['ocr_results_split-->'] = ' | '

cols = col_options + score_cols + ['label_classification'] + ocr_cols + image_files_cols
# grouping other columns
score_cols = sorted([col for col in df.columns if '_match_score' in col and 'Tesseract' not in col and 'TrOCR' not in col], key=label_sort_key)
ocr_cols = ['<--results|ocr_details-->'] + sorted([col for col in df.columns if '_ocr_results' in col], key=label_sort_key)
image_files_cols = ['image_links-->'] + sorted([col for col in df.columns if '_image' in col or 'predictions' in col], key=label_sort_key)
result_cols = ['ocr_results_split-->'] + sorted([col for col in df.columns if 'Tesseract' in col or 'TrOCR' in col], key=label_sort_key)

cols = col_options + score_cols + ['label_classification'] + ocr_cols + image_files_cols + result_cols

extra_cols = [col for col in df.columns if col not in cols]

cols = cols + extra_cols
df = df[cols]
df = df.fillna('')


# flattening all the lists so that if only one item, the list is removed, if no items, list is replaced with an empty string
for col in df.columns:
df[col] = df[col].apply(flatten_single_item_lists)

# CSV output
if output_path:
output_path = Path(output_path)
Expand Down

0 comments on commit 1ca9cb0

Please sign in to comment.