## Script to evaluate the E2E and two-step NER from speech as mentioned in the paper

In [None]:
import numpy as np
from tqdm .auto import tqdm
import glob, os

In [None]:
def rea(file, start_symbols=None, stop_symbols=None):
    '''
    Read a text file and return the list of named entities found in it.

    An entity begins at a start symbol and runs up to and including the
    next stop symbol; both delimiters are kept in the returned string.
    Characters outside an entity are ignored. A start symbol met while an
    entity is already open is simply appended to that entity.

    file: path to the input text file (ground truth or predicted).
    start_symbols: characters that open an entity. When omitted, the
        module-level ``start`` list is used (looked up at call time).
    stop_symbols: characters that close an entity. When omitted, the
        module-level ``stop`` list is used (looked up at call time).

    Returns the entities in order of appearance; an empty list when the
    file contains none. An entity still open at end of file is returned
    as-is, without a stop symbol.
    '''
    if start_symbols is None:
        start_symbols = start
    if stop_symbols is None:
        stop_symbols = stop

    with open(file, 'r', encoding='utf-8') as f:
        text = f.read()

    # Entities are collected in a list rather than a comma-joined string so
    # that an entity containing a comma is not split into fragments.
    entities = []
    current = []
    inside = False
    for ch in text:
        if ch in start_symbols:
            inside = True
            current.append(ch)
        elif inside:
            current.append(ch)
        if inside and ch in stop_symbols:
            entities.append(''.join(current))
            current = []
            inside = False

    # Keep an unterminated trailing entity intact instead of truncating it.
    if current:
        entities.append(''.join(current))
    return entities

In [None]:
def _balanced_entities(groups, start_symbols, stop_symbols):
    '''
    Flatten a list of entity lists, dropping empty strings and every entity
    whose number of start symbols differs from its number of stop symbols.
    '''
    kept = []
    for group in groups:
        for entity in group:
            if not entity:
                continue
            n_start = sum(1 for ch in entity if ch in start_symbols)
            n_stop = sum(1 for ch in entity if ch in stop_symbols)
            if n_start == n_stop:
                kept.append(entity)
    return kept


def main(true, pred, start_symbols=None, stop_symbols=None):
    '''
    Keep only the named entities that are well formed, i.e. that contain as
    many start symbols as stop symbols.

    true: list of lists of true named entities.
    pred: list of lists of predicted named entities.
    start_symbols: characters that open an entity. When omitted, the
        module-level ``start`` list is used (looked up at call time).
    stop_symbols: characters that close an entity. When omitted, the
        module-level ``stop`` list is used (looked up at call time).

    Returns a pair (t, p) of flat lists holding the retained true and
    predicted entities, in their original order.
    '''
    if start_symbols is None:
        start_symbols = start
    if stop_symbols is None:
        stop_symbols = stop

    t = _balanced_entities(true, start_symbols, stop_symbols)
    p = _balanced_entities(pred, start_symbols, stop_symbols)
    return t, p

In [None]:
def score(tp, fp, fn):
    '''
    Calculate precision, recall and F1 score from raw counts.

    tp: true positives.
    fp: false positives.
    fn: false negatives.

    Returns (precision, recall, f1), each rounded to 3 decimals. All three
    are 0.0 when there are no true positives.
    '''
    if tp == 0:
        return 0.0, 0.0, 0.0

    pre = np.round(tp / (tp + fp), decimals=3)
    rec = np.round(tp / (tp + fn), decimals=3)
    # F1 is derived from the raw counts, not from the rounded precision and
    # recall: rounding first loses accuracy and yields NaN (0/0) whenever
    # both rounded values are 0.0 although tp > 0.
    f1 = np.round(2 * tp / (2 * tp + fp + fn), decimals=3)
    return pre, rec, f1

In [None]:
# Folder containing all the ground-truth files (keep the trailing slash:
# the glob pattern below is appended directly to it).
true_path =  '/home/hemant/asr_wm/data/ner/txt/without_space/dev/'
# Folder containing all the predicted files.
pred_path = '/home/hemant/asr_wm/data/dev/'

# Symbols that open a named entity, one for each of the 3 entity types.
start = ["|", "{", "$"]
# Symbol that closes a named entity.
stop = [']']

# Base names of every ground-truth .txt file.
files = list(map(os.path.basename, glob.glob(f"{true_path}*.txt")))

# Results collected by the scoring cells:
prf = []   # precision, recall and F1 score, one entry per scoring run.
tpfp = []  # true positives, false positives and false negatives, one entry per scoring run.

In [None]:
# Calculates the score for each individual named-entity type, i.e. person,
# location and organization (one start symbol per type).

for symbol in ["|", "{", "$"]:
    # rea() and main() read the module-level `start`, so narrow it to the
    # entity type being scored.
    start = [symbol]
    print(start)
    tp, fp, fn = 0, 0, 0

    for fname in tqdm(files):
        true = [rea(os.path.join(true_path, fname))]
        pred = [rea(os.path.join(pred_path, fname))]

        t, p = main(true, pred)
        true_set, pred_set = set(t), set(p)

        # Entities are compared as sets, so a repeated entity counts once per file.
        tp += len(pred_set & true_set)
        fp += len(pred_set - true_set)
        fn += len(true_set - pred_set)

    pre, rec, f1 = score(tp, fp, fn)
    prf.append([pre, rec, f1])
    tpfp.append([tp, fp, fn])
    print(f"prec is {pre}\n recall is {rec}\n f1 score is {f1}")

In [None]:
# macro score: unweighted mean of the three per-entity precision, recall and
# F1 values stored in prf (the precision term previously added prf[0][0]
# twice and never used prf[2][0]).
(
    np.round((prf[0][0] + prf[1][0] + prf[2][0]) / 3, 3),
    np.round((prf[0][1] + prf[1][1] + prf[2][1]) / 3, 3),
    np.round((prf[0][2] + prf[1][2] + prf[2][2]) / 3, 3),
)

In [None]:
# micro score: pool the true positives, false positives and false negatives
# of the three per-entity runs, then score the pooled counts once.
score(*(tpfp[0][k] + tpfp[1][k] + tpfp[2][k] for k in range(3)))

In [None]:
# Calculates the F1 score considering all the named entities as one.

start = ["|", "{", "$"]
# Start from zero: without this reset the counts left over from the last
# per-entity run would be added to the overall totals.
tp, fp, fn = 0, 0, 0
for fname in tqdm(files):
    true = [rea(os.path.join(true_path, fname))]
    pred = [rea(os.path.join(pred_path, fname))]

    t, p = main(true, pred)
    true_set, pred_set = set(t), set(p)

    # Entities are compared as sets, so a repeated entity counts once per file.
    tp += len(pred_set & true_set)
    fp += len(pred_set - true_set)
    fn += len(true_set - pred_set)

pre, rec, f1 = score(tp, fp, fn)
print(f"prec is {pre}\n recall is {rec}\n f1 score is {f1}")