diff --git a/hespi/main.py b/hespi/main.py index 1e12677..960373a 100644 --- a/hespi/main.py +++ b/hespi/main.py @@ -32,7 +32,7 @@ def detect( help="Whether or not to do handwritten text recognition using Microsoft's TrOCR.", ), trocr_size: TrOCRSize = typer.Option( - TrOCRSize.BASE.value, + TrOCRSize.LARGE.value, help="The size of the TrOCR model to use for handwritten text recognition.", case_sensitive=False, ), diff --git a/hespi/util.py b/hespi/util.py index 207d8d4..33cfbc1 100644 --- a/hespi/util.py +++ b/hespi/util.py @@ -104,6 +104,7 @@ def ocr_data_df(data: dict, output_path: Path=None) -> pd.DataFrame: pd.DataFrame: The text recognition data as a Pandas dataframe """ df = pd.DataFrame.from_dict(data, orient="index") + df = df.fillna(value="") df = df.reset_index().rename(columns={"index": "institutional label"}) # Splitting the ocr_results columns into seperate original text, adjusted, and score