From 25ef5837753ab0d4c1281808a50d41800a849597 Mon Sep 17 00:00:00 2001
From: rturnbull <robert.turnbull@unimelb.edu.au>
Date: Fri, 8 Mar 2024 13:51:26 +1100
Subject: [PATCH] :zap: making TrOCR large the default model. fixing issue with
 multiple institutional labels

---
 hespi/main.py | 2 +-
 hespi/util.py | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/hespi/main.py b/hespi/main.py
index 90d9718..c64d1f5 100644
--- a/hespi/main.py
+++ b/hespi/main.py
@@ -32,7 +32,7 @@ def detect(
         help="Whether or not to do handwritten text recognition using Microsoft's TrOCR.",
     ),
     trocr_size: TrOCRSize = typer.Option(
-        TrOCRSize.BASE.value,
+        TrOCRSize.LARGE.value,
         help="The size of the TrOCR model to use for handwritten text recognition.",
         case_sensitive=False,
     ),
diff --git a/hespi/util.py b/hespi/util.py
index 207d8d4..33cfbc1 100644
--- a/hespi/util.py
+++ b/hespi/util.py
@@ -104,6 +104,7 @@ def ocr_data_df(data: dict, output_path: Path=None) -> pd.DataFrame:
         pd.DataFrame: The text recognition data as a Pandas dataframe
     """
     df = pd.DataFrame.from_dict(data, orient="index")
+    df = df.fillna(value="")
     df = df.reset_index().rename(columns={"index": "institutional label"})
     
     # Splitting the ocr_results columns into seperate original text, adjusted, and score