In [None]:
import os
import re
import jellyfish
import pandas as pd
from jiwer import wer, cer, mer
from datetime import datetime

In [None]:
def _read_text_files(directory, text_column, encoding):
    """Read every regular file directly inside ``directory`` into a list of records.

    Each record is a dict with the keys ``entry_name`` (the file's base name up
    to its first dot), ``filename`` (the joined path) and ``text_column`` (the
    decoded file content).
    """
    records = []
    for name in sorted(os.listdir(directory)):
        path = os.path.join(directory, name)
        # os.listdir also returns sub-directories (e.g. Jupyter's
        # ``.ipynb_checkpoints``); opening one of those would raise.
        if not os.path.isfile(path):
            continue
        with open(path, "r", encoding=encoding) as f:
            text = f.read()
        records.append({"entry_name": name.split(".")[0], "filename": path, text_column: text})
    return records


def compile_dataset(gt_dir, engine_dir, **kwargs):
    """Pair ground-truth files with engine-output files and return them as one DataFrame.

    Files from the two directories are matched on ``entry_name``, i.e. the part
    of the file name before its first dot. Only entries present in both
    directories are kept (inner join).

    Parameters
    ----------
    gt_dir : str
        Directory containing the ground-truth text files.
    engine_dir : str
        Directory containing the engine (post-OCR) text files.
    **kwargs
        export : str, optional
            Directory in which to write a timestamped, ``;``-separated
            ``compiled_data_<YYYYmmdd-HHMMSS>.csv``. Created if missing.
        encoding : str, optional
            Text encoding used to read the files. Defaults to ``"utf-8"`` so
            that results do not depend on the platform's locale.

    Returns
    -------
    pandas.DataFrame
        Columns (alphabetically ordered): ``entry_name``, ``filename_engine``,
        ``filename_gt``, ``text_engine``, ``text_gt``; rows ordered by the
        ground-truth ``entry_name``.
    """
    encoding = kwargs.get("encoding", "utf-8")

    # Passing explicit columns keeps the frames well-formed (and sortable /
    # mergeable) even when a directory contains no readable files.
    gtdf = pd.DataFrame(
        _read_text_files(gt_dir, "text_gt", encoding),
        columns=["entry_name", "filename", "text_gt"],
    )
    gtdf = gtdf.sort_values(by=['entry_name'])

    enginedf = pd.DataFrame(
        _read_text_files(engine_dir, "text_engine", encoding),
        columns=["entry_name", "filename", "text_engine"],
    )
    enginedf = enginedf.sort_values(by=['entry_name'])

    # Only the shared "filename" column receives the suffixes.
    df = pd.merge(gtdf, enginedf, on=['entry_name'], suffixes=('_gt', '_engine'))
    df = df.sort_index(axis=1)

    export_dir = kwargs.get('export')
    if export_dir:
        os.makedirs(export_dir, exist_ok=True)

        filename = f"compiled_data_{datetime.strftime(datetime.now(), '%Y%m%d-%H%M%S')}.csv"
        filename = os.path.join(export_dir, filename)
        df.to_csv(filename, sep=";")

    return df

def evaluation_metrics(df, **kwargs):
    """Add per-row similarity scores between ``text_engine`` and ``text_gt``.

    The columns are added to ``df`` itself (the frame is modified in place) and
    the same frame is returned.

    Added columns
    -------------
    max_len : length of the longer of the two texts, in characters.
    levd : Levenshtein distance between the two texts (``jellyfish``).
    levd_score : ``1 - levd / max_len``; defined as 1.0 when both texts are
        empty, since two empty strings are identical.
    wer, cer, mer : ``1 - <jiwer metric>`` with ``text_gt`` as the reference
        and ``text_engine`` as the hypothesis. Despite the column names these
        are complements of the error rates, so higher is better.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the string columns ``text_engine`` and ``text_gt``.
    **kwargs
        export : str, optional
            Directory in which to write a timestamped, ``;``-separated
            ``eval_data_<YYYYmmdd-HHMMSS>.csv``. Created if missing.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the metric columns added.
    """
    # Plain lists instead of row-wise ``df.apply``: on an empty frame
    # ``apply(axis=1)`` yields a DataFrame that cannot be assigned to a single
    # column, whereas an empty list can.
    pairs = list(zip(df['text_engine'].tolist(), df['text_gt'].tolist()))

    max_lens = [max(len(engine), len(gt)) for engine, gt in pairs]
    distances = [jellyfish.levenshtein_distance(engine, gt) for engine, gt in pairs]

    df['max_len'] = max_lens
    df['levd'] = distances
    # Guard the division: max_len is 0 only when both texts are empty.
    df['levd_score'] = [
        1 - dist / longest if longest else 1.0
        for dist, longest in zip(distances, max_lens)
    ]

    # NOTE(review): jiwer may reject an empty ground-truth string; confirm
    # against the installed jiwer version if empty references can occur.
    df['wer'] = [1 - wer(gt, engine) for engine, gt in pairs]
    df['cer'] = [1 - cer(gt, engine) for engine, gt in pairs]
    df['mer'] = [1 - mer(gt, engine) for engine, gt in pairs]

    export_dir = kwargs.get('export')
    if export_dir:
        os.makedirs(export_dir, exist_ok=True)

        filename = f"eval_data_{datetime.strftime(datetime.now(), '%Y%m%d-%H%M%S')}.csv"
        filename = os.path.join(export_dir, filename)
        df.to_csv(filename, sep=";")

    return df

## How to use:

This script evaluates OCR post-correction text against human-annotated text, which serves as the (adjusted) ground truth. 

#### 3 easy steps:  

- **First**, make sure you know where your dataset is stored and that every file is in the `.txt` format.
- **Second**, set `groundtruth_dir` and `postocr_dir` to the ground-truth and post-OCR directories, respectively.
- **Third (optional)**, export your results by passing an `export` argument to either of the function calls below; its value is the output directory, which you can name freely.  

In [None]:
# Folders holding the human-annotated ground truth and the post-OCR output (.txt files)
groundtruth_dir = "testset/Balinese/raw/Balinese_texts_ann"
postocr_dir = "testset/Balinese/raw/Balinese_texts_ots"

# Nothing below needs editing: pair the files by name, then score every pair
# and write the scores as a CSV into the "balinese_res" directory.
compiled = compile_dataset(groundtruth_dir, postocr_dir)
evaluate = evaluation_metrics(compiled, export="balinese_res")

evaluate