In [4]:
import os
import re
import jellyfish
import pandas as pd
from jiwer import wer, cer, mer
from datetime import datetime

In [5]:
def _load_text_records(dataset_dir, filenames, suffix, text_column, encoding):
    """Read every regular file in `filenames` whose name contains ".<suffix>".

    Returns a DataFrame with the columns `entry_name`, `filename` and
    `text_column`, sorted by `entry_name`. The columns are always present,
    even when no file matched, so callers can sort and merge safely.
    """
    # Same normalisation as always: drop every non-alphanumeric character
    # (including "_") from the suffix before building the ".<suffix>" marker.
    marker = ".{0}".format(re.sub(r'[\W_]+', '', suffix))

    records = []
    for name in filenames:
        path = os.path.join(dataset_dir, name)
        # Skip sub-directories that merely look like a match; open() on a
        # directory would raise.
        if marker not in name or not os.path.isfile(path):
            continue
        with open(path, "r", encoding=encoding) as f:
            records.append({
                # Everything before the first dot identifies the entry.
                "entry_name": name.split(".")[0],
                "filename": name,
                text_column: f.read(),
            })

    frame = pd.DataFrame(records, columns=["entry_name", "filename", text_column])
    return frame.sort_values(by=['entry_name'])


def compile_dataset(dataset_dir, ann_suffix="ann", ots_suffix="ots", **kwargs):
    """Pair human-annotated and machine-extracted text files in one DataFrame.

    A file in `dataset_dir` counts as an annotation when its name contains
    ".<ann_suffix>" and as machine output when it contains ".<ots_suffix>".
    The two groups are inner-joined on `entry_name` (the part of the file
    name before the first dot), so entries missing either side are dropped.

    Parameters
    ----------
    dataset_dir : str
        Directory holding the text files.
    ann_suffix, ots_suffix : str
        Markers identifying annotated / machine-extracted files.
    **kwargs
        export : str, optional
            Directory in which to write a timestamped, ";"-separated
            ``compiled_data_<YYYYmmdd-HHMMSS>.csv``. Created if missing.
        encoding : str, optional
            Encoding used to read the text files (default ``"utf-8"``).

    Returns
    -------
    pandas.DataFrame
        Columns (alphabetical): ``entry_name``, ``filename_ann``,
        ``filename_ots``, ``text_ann``, ``text_ots``. Empty, but with the
        same columns, when nothing matches.
    """
    encoding = kwargs.get('encoding', 'utf-8')
    # Sort the listing so the result does not depend on filesystem order.
    filenames = sorted(os.listdir(dataset_dir))

    anndf = _load_text_records(dataset_dir, filenames, ann_suffix, "text_ann", encoding)
    otsdf = _load_text_records(dataset_dir, filenames, ots_suffix, "text_ots", encoding)

    df = pd.merge(anndf, otsdf, on=['entry_name'], suffixes=('_ann', '_ots'))
    df = df.sort_index(axis=1)

    export_dir = kwargs.get('export')
    if export_dir:
        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs(export_dir, exist_ok=True)

        filename = f"compiled_data_{datetime.now().strftime('%Y%m%d-%H%M%S')}.csv"
        df.to_csv(os.path.join(export_dir, filename), sep=";")

    return df

def evaluation_metrics(df, **kwargs):
    """Score machine-extracted text against the human annotation, row by row.

    `df` must provide the string columns ``text_ann`` (reference) and
    ``text_ots`` (hypothesis). The following columns are added to `df`
    IN PLACE, and the same object is returned:

    - ``max_len``: length of the longer of the two texts.
    - ``levd``: ``jellyfish.levenshtein_distance(text_ots, text_ann)``.
    - ``levd_score``: ``1 - levd / max_len``; defined as 1.0 when both
      texts are empty (distance 0 over length 0).
    - ``wer`` / ``cer`` / ``mer``: ONE MINUS the corresponding jiwer value,
      so despite the names, higher means better.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``text_ann`` and ``text_ots`` columns.
    **kwargs
        export : str, optional
            Directory in which to write a timestamped, ";"-separated
            ``eval_data_<YYYYmmdd-HHMMSS>.csv``. Created if missing.

    Returns
    -------
    pandas.DataFrame
        The input frame with the metric columns added.

    NOTE(review): jiwer may reject empty reference strings; confirm how
    rows with an empty ``text_ann`` should be scored.
    """
    # Plain lists instead of df.apply(axis=1): apply returns a DataFrame
    # (not a Series) for an empty frame, which cannot be assigned to a column.
    pairs = list(zip(df['text_ann'].tolist(), df['text_ots'].tolist()))

    max_lens = [max(len(hyp), len(ref)) for ref, hyp in pairs]
    distances = [jellyfish.levenshtein_distance(hyp, ref) for ref, hyp in pairs]

    df['max_len'] = max_lens
    df['levd'] = distances
    # Two empty texts are identical; score them 1.0 rather than dividing by 0.
    df['levd_score'] = [
        1.0 if longest == 0 else 1 - dist / longest
        for dist, longest in zip(distances, max_lens)
    ]

    df['wer'] = [1 - wer(ref, hyp) for ref, hyp in pairs]
    df['cer'] = [1 - cer(ref, hyp) for ref, hyp in pairs]
    df['mer'] = [1 - mer(ref, hyp) for ref, hyp in pairs]

    export_dir = kwargs.get('export')
    if export_dir:
        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs(export_dir, exist_ok=True)

        filename = f"eval_data_{datetime.now().strftime('%Y%m%d-%H%M%S')}.csv"
        df.to_csv(os.path.join(export_dir, filename), sep=";")

    return df

## How to use:

This is a script for evaluating OCR-extracted text against human-annotated text, which serves as the ground truth. It takes raw files as input (please see the `dataset/raw` folder within this repository). The script works out which files are human annotations (`.ann`) and which are machine-extracted content (`.ots`).

_*ots means off-the-shelf, but you can use this script to evaluate post-processed OCR extracted text_

#### 3-easy-Steps:  

- **First**, make sure you know where your dataset is stored, and that all files sit in the same folder/directory as `.txt` files whose names contain `.ann` for human-annotated text and `.ots` for machine-extracted text (see the `dataset/raw` folder for examples).  
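- **Second (optional)**, you can export your results by passing the `export` argument (an output directory) to each function call you'll see below.  
- **Third**, set `language_name` / `dataset_dirname` in the cell below and run it.  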

In [6]:
# Point `dataset_dirname` at any folder you like, as long as the .ann and
# .ots files live side by side in it, the way they do under 'dataset/raw'
# in this repository.
language_name = "balinese"
dataset_dirname = f"dataset/raw/{language_name}"

# Nothing below needs editing: pair up the texts, then score every pair.
compiled = compile_dataset(dataset_dirname, export="compiled")
evaluate = evaluation_metrics(compiled, export="result")