🐛 returned html table output

rbturnbull · Mar 2, 2024 · 1ca9cb0 · 1ca9cb0
1 parent 2dd6a16
commit 1ca9cb0
Show file tree

Hide file tree

Showing 3 changed files with 78 additions and 11 deletions.
diff --git a/hespi/hespi.py b/hespi/hespi.py
@@ -231,8 +231,8 @@ def institutional_label_detect(self, component, stub, output_dir) -> Dict:
                         else:
                             detection_results[key] = [detection_results[key], value]
 
-        # Determining Recognised Text
-
+
+        # Determining Recognised Text                    
         results = {}
         for key, value in detection_results.items():
             if 'ocr_results' in key:
@@ -283,8 +283,16 @@ def institutional_label_detect(self, component, stub, output_dir) -> Dict:
                             if field_name in self.reference.keys():
                                 results[f"{field_name}_match_score"] = match_score_TrOCR
 
+        # splitting multiple image files into separate columns (one per extra image)
+            elif 'image' in key:
+                if len(value) >1:
+                    for i in value:
+                        if value.index(i) != 0:
+                            results[f"{key}_{value.index(i)}"] = i
+                    detection_results[key] = value[0]
+
         detection_results.update(results)
-
+                
         return detection_results
 
 

diff --git a/hespi/templates/report-template.html b/hespi/templates/report-template.html
@@ -281,8 +281,8 @@ <h2>Institutional Labels</h2>
                               {% endif %}
                             </td>
                             <td>{{ row[field] }}</td>
-                            <td>{{ row[field+"_Tesseract"] }}</td>
-                            <td>{{ row[field+"_TrOCR"] }}</td>
+                            <td>{{ row[field+"_Tesseract_original"] }}</td>
+                            <td>{{ row[field+"_TrOCR_original"] }}</td>
                           </div>
                         {% endif %}
                       {% endfor %}

diff --git a/hespi/util.py b/hespi/util.py
@@ -1,6 +1,7 @@
 from pathlib import Path
 from typing import Dict
 import pandas as pd
+import numpy as np
 from rich.console import Console
 from difflib import get_close_matches, SequenceMatcher
 
@@ -67,7 +68,44 @@ def label_sort_key(s):
         return 11
     else:
         return 12
-
+
+
+def process_dicts(row, field_name):
+    """Split one cell of OCR result dicts into per-engine lists.
+
+    Each dict in ``row`` is routed by its 'ocr' tag ('_TrOCR' or '_Tesseract');
+    its 'original_text_detected', 'adjusted_text' and 'match_score' values are
+    appended to that engine's lists, and dicts with any other tag are ignored.
+    Returns a dict of six lists keyed
+    ``{field_name}_{TrOCR|Tesseract}_{original|adjusted|match_score}``.
+
+    A lone dict counts as a one-item list; any other non-list cell (e.g. the NaN
+    pandas uses for a missing value) gives empty lists instead of a TypeError.
+    """
+    if isinstance(row, dict):
+        row = [row]
+    elif not isinstance(row, (list, tuple)):
+        row = []
+
+    engines = ('_TrOCR', '_Tesseract')
+    # (key read from each result dict, suffix of the output column name)
+    parts = (('original_text_detected', 'original'), ('adjusted_text', 'adjusted'), ('match_score', 'match_score'))
+    output = {f"{field_name}{ocr}_{suffix}": [] for ocr in engines for _, suffix in parts}
+    for d in row:
+        if d['ocr'] in engines:
+            for source_key, suffix in parts:
+                output[f"{field_name}{d['ocr']}_{suffix}"].append(d[source_key])
+    return output
+
+
+def flatten_single_item_lists(lst):
+    """Return '' for an empty list, the only item of a one-item list, else ``lst`` itself."""
+    # Non-lists and lists holding several items pass through untouched.
+    if not isinstance(lst, list) or len(lst) > 1:
+        return lst
+    # Only lists of length 0 or 1 reach this point.
+    return lst[0] if lst else ''
+
 
 def ocr_data_df(data: dict, output_path: Path=None) -> pd.DataFrame:
     """    
@@ -85,26 +123,47 @@ def ocr_data_df(data: dict, output_path: Path=None) -> pd.DataFrame:
     df = pd.DataFrame.from_dict(data, orient="index")
     df = df.reset_index().rename(columns={"index": "institutional label"})
 
+    # Splitting the ocr_results columns into separate original text, adjusted text, and score columns
+    # Enables the html report to pull the data 
+
+    for col in df.columns:
+        if 'ocr_results' in col:
+            field_name = col.replace('_ocr_results', '')
+            new_columns = df[f"{field_name}_ocr_results"].apply(process_dicts, field_name=field_name).apply(pd.Series)
+
+            df = pd.concat([df, new_columns], axis=1)
+
+
     # insert columns not included in dataframe, and re-order
     # including any columns not included in col_options to account for any updates
     col_options = [ "institutional label", "id" ] + label_fields
 
     missing_cols = [col for col in col_options if col not in df.columns]
     df[missing_cols] = ""
 
-    score_cols = sorted([col for col in df.columns if '_match_score' in col], key=label_sort_key)
-    ocr_cols = sorted([col for col in df.columns if '_ocr_results' in col], key=label_sort_key)
-    image_files_cols = sorted([col for col in df.columns if '_image' in col or 'predictions' in col], key=label_sort_key)
+    # creating break columns
+    df['<--results|ocr_details-->'] = '    |    '
+    df['image_links-->'] = '    |    '
+    df['ocr_results_split-->'] = '    |    '
 
-    cols = col_options + score_cols + ['label_classification'] + ocr_cols + image_files_cols
+    # grouping other columns 
+    score_cols = sorted([col for col in df.columns if '_match_score' in col and 'Tesseract' not in col and 'TrOCR' not in col], key=label_sort_key)
+    ocr_cols = ['<--results|ocr_details-->'] + sorted([col for col in df.columns if '_ocr_results' in col], key=label_sort_key)
+    image_files_cols = ['image_links-->'] + sorted([col for col in df.columns if '_image' in col or 'predictions' in col], key=label_sort_key)
+    result_cols = ['ocr_results_split-->'] + sorted([col for col in df.columns if 'Tesseract' in col or 'TrOCR' in col], key=label_sort_key)
+
+    cols = col_options + score_cols + ['label_classification'] + ocr_cols + image_files_cols + result_cols
 
     extra_cols = [col for col in df.columns if col not in cols]
 
     cols = cols + extra_cols
     df = df[cols]
     df = df.fillna('')
 
-
+    # flattening all the lists so that if only one item, the list is removed, if no items, list is replaced with an empty string
+    for col in df.columns:
+        df[col] = df[col].apply(flatten_single_item_lists)
+
     # CSV output
     if output_path:
         output_path = Path(output_path)