From 25ef5837753ab0d4c1281808a50d41800a849597 Mon Sep 17 00:00:00 2001 From: rturnbull Date: Fri, 8 Mar 2024 13:51:26 +1100 Subject: [PATCH] :zap: making TrOCR large the default model. fixing issue with multiple institutional labels --- hespi/main.py | 2 +- hespi/util.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/hespi/main.py b/hespi/main.py index 90d9718..c64d1f5 100644 --- a/hespi/main.py +++ b/hespi/main.py @@ -32,7 +32,7 @@ def detect( help="Whether or not to do handwritten text recognition using Microsoft's TrOCR.", ), trocr_size: TrOCRSize = typer.Option( - TrOCRSize.BASE.value, + TrOCRSize.LARGE.value, help="The size of the TrOCR model to use for handwritten text recognition.", case_sensitive=False, ), diff --git a/hespi/util.py b/hespi/util.py index 207d8d4..33cfbc1 100644 --- a/hespi/util.py +++ b/hespi/util.py @@ -104,6 +104,7 @@ def ocr_data_df(data: dict, output_path: Path=None) -> pd.DataFrame: pd.DataFrame: The text recognition data as a Pandas dataframe """ df = pd.DataFrame.from_dict(data, orient="index") + df = df.fillna(value="") df = df.reset_index().rename(columns={"index": "institutional label"}) # Splitting the ocr_results columns into seperate original text, adjusted, and score