In [None]:
import os
import re
import jellyfish
import pandas as pd
from jiwer import wer, cer, mer
from datetime import datetime

In [None]:
def read_dataset(dataset_csv, ann_columns, ots_columns, **kwargs):
    """Load a semicolon-separated CSV and normalise the two text column names.

    Parameters
    ----------
    dataset_csv : str or path-like
        Path to a CSV file, or to a directory that contains one. For a
        directory, the alphabetically first regular file whose name ends
        in ``.csv`` (case-insensitive) is loaded.
    ann_columns : str
        Name of the column that is renamed to ``text_ann``.
    ots_columns : str
        Name of the column that is renamed to ``text_ots``.
    **kwargs
        export : str, optional
            Directory (created if missing) that receives a copy of the
            renamed frame as ``compiled_csv_<YYYYmmdd-HHMMSS>.csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with the two columns renamed.

    Raises
    ------
    FileNotFoundError
        If ``dataset_csv`` is a directory without any ``.csv`` file.
    KeyError
        If ``ann_columns`` or ``ots_columns`` is not a column of the CSV.
    """
    if os.path.isdir(dataset_csv):
        # Match on the extension only (a substring test would also accept
        # names such as "data.csv.bak") and sort so that the selected file
        # does not depend on the order in which the OS lists the directory.
        candidates = sorted(
            entry for entry in os.listdir(dataset_csv)
            if entry.lower().endswith(".csv")
            and os.path.isfile(os.path.join(dataset_csv, entry))
        )
        if not candidates:
            raise FileNotFoundError(f"No .csv file found in directory: {dataset_csv}")
        dataset_csv = os.path.join(dataset_csv, candidates[0])

    df = pd.read_csv(dataset_csv, sep=";")

    # Fail here with a clear message: rename() silently ignores unknown names.
    missing = [name for name in (ann_columns, ots_columns) if name not in df.columns]
    if missing:
        raise KeyError(
            f"Column(s) {missing} not found in {dataset_csv}; "
            f"available columns: {list(df.columns)}"
        )
    df = df.rename(columns={ann_columns: "text_ann", ots_columns: "text_ots"})

    export_dir = kwargs.get('export')
    if export_dir:
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(export_dir, exist_ok=True)

        filename = f"compiled_csv_{datetime.now().strftime('%Y%m%d-%H%M%S')}.csv"
        df.to_csv(os.path.join(export_dir, filename), sep=";")

    return df

def _score_from_error_rate(metric, reference, hypothesis):
    """Return ``1 - metric(reference, hypothesis)``, guarding blank references."""
    if not reference.strip():
        # An error rate relative to an empty reference is undefined (and
        # jiwer may reject such input), so fall back to the same convention
        # as levd_score: two blank texts match fully, otherwise nothing does.
        return 1.0 if not hypothesis.strip() else 0.0
    return 1 - metric(reference, hypothesis)


def evaluation_metrics(df, **kwargs):
    """Score OCR text (``text_ots``) against annotated text (``text_ann``).

    The input frame is left untouched; a copy with these extra columns is
    returned:

    - ``max_len``: length of the longer of the two texts.
    - ``levd``: Levenshtein distance between the two texts.
    - ``levd_score``: ``1 - levd / max_len`` (``1.0`` when both are empty).
    - ``wer``, ``cer``, ``mer``: ``1 - <jiwer metric>`` with ``text_ann`` as
      the reference, i.e. higher is better. Note that these columns hold
      the complement of the error rate, not the error rate itself.

    Missing cells (NaN) are scored as empty strings and non-string cells
    are scored through their ``str()`` form; the ``text_ann``/``text_ots``
    columns themselves are returned unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``text_ann`` and ``text_ots`` columns.
    **kwargs
        export : str, optional
            Directory (created if missing) that receives the scored frame
            as ``eval_csv_<YYYYmmdd-HHMMSS>.csv``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the metric columns added.
    """
    # Work on a copy so the caller's frame does not gain columns as a side effect.
    df = df.copy()

    # Empty CSV fields are read as NaN (a float), which has no len().
    ann_texts = df['text_ann'].fillna('').astype(str).tolist()
    ots_texts = df['text_ots'].fillna('').astype(str).tolist()
    pairs = list(zip(ann_texts, ots_texts))

    max_len = [max(len(ots), len(ann)) for ann, ots in pairs]
    levd = [jellyfish.levenshtein_distance(ots, ann) for ann, ots in pairs]

    df['max_len'] = max_len
    df['levd'] = levd
    # max_len == 0 means both texts are empty: identical, so the score is 1.0.
    df['levd_score'] = [1 - dist / longest if longest else 1.0
                        for dist, longest in zip(levd, max_len)]

    df['wer'] = [_score_from_error_rate(wer, ann, ots) for ann, ots in pairs]
    df['cer'] = [_score_from_error_rate(cer, ann, ots) for ann, ots in pairs]
    df['mer'] = [_score_from_error_rate(mer, ann, ots) for ann, ots in pairs]

    export_dir = kwargs.get('export')
    if export_dir:
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(export_dir, exist_ok=True)

        filename = f"eval_csv_{datetime.now().strftime('%Y%m%d-%H%M%S')}.csv"
        df.to_csv(os.path.join(export_dir, filename), sep=";")

    return df

## How to use:

This script evaluates OCR-extracted text against human-annotated text, which serves as the ground truth. It takes a comma-separated values (CSV) file as input; please follow the steps below to use the evaluation script. Thanks.

_*Note: the CSV files here use a semicolon (;) as the column separator._

#### 3-easy-Steps:  

- **First**, make sure you know where your dataset is stored and that the file is already in .csv format.  
- **Second**, set `ann_columns` to the name of the human-annotated column and `ots_columns` to the name of the column holding the OCR-extracted text. This also covers the case where you want to evaluate OCR output that went through LLM-based error correction.  
- **Third (optional)**, you can export your results by passing the `export` argument to each of the function calls shown below.  

In [None]:
# Dataset location: change these two values to point at your own data.
# read_dataset() accepts either a CSV file or a directory containing one.
language_name = "balinese"
dataset_dirname = f"dataset/csv/{language_name}"

# Once the dataset location is set, the two things you must define are the name of
# the human-annotated column (ann_columns) and the name of the OCR-extracted text
# column (ots_columns); the rest of the script can be left as it is.
# The `export` arguments name the directories that receive the timestamped CSV outputs.
compiled = read_dataset(dataset_csv=dataset_dirname, ann_columns="nganu", ots_columns="iki", export="compiled")
evaluate = evaluation_metrics(df=compiled, export="result")