In [1]:
import csv
import os
import numpy as np
import pandas as pd

def find_files(root_dir, binary=None, iob=None):
    file_paths = {}
    for dir_name, subdir_list, file_list in os.walk(root_dir):
        for file_name in file_list:
            if binary:
                if 'parl_CR_binary' in file_name:
                    file_path = os.path.join(dir_name, file_name)
                    key = file_name.replace('parl_CR_binary', '')
                    file_paths[key] = file_path
            if iob:
                if "NSPC_SAMPLES_CR_all_labels_" in file_name:
                    file_path = os.path.join(dir_name, file_name)
                    key = file_name.replace('NSPC_SAMPLES_CR_IOB2_', '')
                    file_paths[key] = file_path
    return file_paths





In [2]:
root_directory = r'C:\Users\Aarne\OneDrive - University of Bergen\Dokumenter\GitHub\MA_Peter-R-ysland-Aarnes\Results'
files = find_files(root_directory, binary=True)
files_iob = find_files(root_directory, iob=True)

In [3]:
files_iob

{}

In [4]:
files

{'_bert-base-multilingual-cased.csv': 'C:\\Users\\Aarne\\OneDrive - University of Bergen\\Dokumenter\\GitHub\\MA_Peter-R-ysland-Aarnes\\Results\\bert-base-multilingual-cased\\parl_CR_binary_bert-base-multilingual-cased.csv',
 '_nb-bert-base.csv': 'C:\\Users\\Aarne\\OneDrive - University of Bergen\\Dokumenter\\GitHub\\MA_Peter-R-ysland-Aarnes\\Results\\nb-bert-base\\parl_CR_binary_nb-bert-base.csv',
 '_nbailab-base-ner-scandi.csv': 'C:\\Users\\Aarne\\OneDrive - University of Bergen\\Dokumenter\\GitHub\\MA_Peter-R-ysland-Aarnes\\Results\\nbailab-base-ner-scandi\\parl_CR_binary_nbailab-base-ner-scandi.csv',
 '_norbert.csv': 'C:\\Users\\Aarne\\OneDrive - University of Bergen\\Dokumenter\\GitHub\\MA_Peter-R-ysland-Aarnes\\Results\\norbert\\parl_CR_binary_norbert.csv',
 '_norbert2.csv': 'C:\\Users\\Aarne\\OneDrive - University of Bergen\\Dokumenter\\GitHub\\MA_Peter-R-ysland-Aarnes\\Results\\norbert2\\parl_CR_binary_norbert2.csv'}

In [7]:
def extract_rows(file_path, row_type, binary=None, IOB2=None): #row type "macro_avg" etc
    with open(file_path, 'r') as file:
        reader = csv.reader(file)
        r, p, f1, s = [],[],[], []
        for row in reader:
            d = {}
            if len(row) > 0 and row[0] == row_type and binary:
                r.append(float(row[1])*100)
                p.append(float(row[2])*100)
                f1.append(float(row[3])*100)
            elif len(row) > 0 and row[0] == row_type and IOB2:
                r.append(float(row[1])*100)
                p.append(float(row[2])*100)
                f1.append(float(row[3])*100)
                s.append(int(float(row[4])))
        d[f"precision"]=p
        d["recall"]=r
        d[f"f1"]=f1
        d[f"support"]=s
    return d


In [8]:
def get_sd_from_extracted_rows_keys(iob2_data, key):
    texts = []
    latex = []
    model_name=key.split(".")[0]

    print(model_name)
    for key in iob2_data:
        values = [float(x) for x in iob2_data[key]]
        avg = np.mean(values)
        std = np.std(values)
        text = (f"{key}: Average = {avg:.2f}, Standard Deviation = {std:.2f}")
        latex_text = (f"{avg:.2f} (\pm{{{std:.2f}}})")
        texts.append(text)
        latex.append(latex_text)
    for i in texts:
        print(i)
    for t in latex:
        if t!=latex[-1]:
            print(f"{t} & ", end="")
        else:
            print(f"{t} \\\\ [2pt]")
    print()

    return latex

In [9]:
for key, v in files_iob.items():
    iob2_data = extract_rows(v, "micro avg", IOB2=True)
    get_sd_from_extracted_rows_keys(iob2_data, key)

In [10]:
for key, v in files_iob.items():
    if key=="nbailab-base-ner-scandi.csv":
        iob2_data = extract_rows(v, "micro avg", IOB2=True)
        get_sd_from_extracted_rows_keys(iob2_data, key)

In [11]:
lst = ['B-GPE_LOC', 'I-DRV', 'I-LOC', 'B-PER', 'I-PER', 'B-PROD', 
                'I-GPE_ORG', 'B-GPE_ORG', 'B-EVT', 'B-DRV', 'I-PROD', 'B-ORG', 'B-MISC',
                'I-MISC', 'I-GPE_LOC', 'B-LOC', 'I-ORG', 'I-EVT']
d = {}
for name in lst:
    for key, v in files_iob.items():
        if key=="NSPC_SAMPLES_CR_all_labels_nb-bert-base.csv":
            d[name]=extract_rows(v, name, IOB2=True)

all_latex = []
for k, v in d.items():
    latex = get_sd_from_extracted_rows_keys(v, k)
    all_latex.append(latex)

for latex, name in zip(all_latex, lst):
    for i1 in latex:
        if i1==latex[0]:
            print(f"{name} & {i1} &", end=" ")
        elif i1!=latex[-1] and i1!=[0]:
            print(f"{i1}", end=" ")
        else:
            print(f"& {i1[:-15]} \\\\ [2pt]")

In [12]:
for k, v in files_iob.items():
    model_name=k.split(".")[0]
    iob2_data = extract_rows(v, "micro avg", IOB2=True)
    print()
    print(model_name)
    for key in iob2_data:
        values = [float(x) for x in iob2_data[key]]
        avg = np.mean(values)
        std = np.std(values)
        print(f"{key}: Average = {avg:.2f}, Standard Deviation = {std:.2f}")

In [42]:
len("(\pm{0.00})")

11

In [21]:
for k, v in files.items():
    model_name=k.split(".")[0]
    data = extract_rows(v, row_type="macro avg", binary=True)
    print()
    print(model_name)
    for key in data:
        values = [float(x) for x in data[key]]
        avg = np.mean(values)
        std = np.std(values)
        #print(f"{key}: Average = {avg:.2f}, Standard Deviation = {std:.2f}")
        print(f"{key}    {avg:.2f} (\pm{{{std:.2f}}})")


_bert-base-multilingual-cased
precision    95.25 (\pm{0.32})
recall    92.91 (\pm{0.58})
f1    94.04 (\pm{0.20})
support    nan (\pm{nan})

_nb-bert-base
precision    97.78 (\pm{0.23})
recall    92.76 (\pm{0.39})
f1    95.11 (\pm{0.23})
support    nan (\pm{nan})

_nbailab-base-ner-scandi
precision    97.56 (\pm{0.18})
recall    93.43 (\pm{0.48})
f1    95.39 (\pm{0.23})
support    nan (\pm{nan})

_norbert
precision    96.55 (\pm{0.47})
recall    93.19 (\pm{0.97})
f1    94.79 (\pm{0.35})
support    nan (\pm{nan})

_norbert2
precision    97.10 (\pm{0.22})
recall    93.10 (\pm{0.44})
f1    95.00 (\pm{0.19})
support    nan (\pm{nan})
