In [1]:
import json
import time
import os
import re
import pandas as pd

from estnltk import Text
from estnltk.taggers import NerTagger
from estnltk.taggers import WordLevelNerTagger
from estnltk.converters import text_to_json
from estnltk.converters import json_to_text
from estnltk.layer_operations import flatten
from estnltk.taggers import Retagger
from estnltk.taggers import CompoundTokenTagger

from nervaluate import Evaluator
from modules.preprocessing_protocols import preprocess_text

nertagger = NerTagger()
word_level_ner = WordLevelNerTagger()

### Read files from the distributed corpus:

In [3]:
files = {}

with open(os.path.join('..', 'data', 'divided_corpus.txt'), 'r', encoding = 'UTF-8') as f:
    txt = f.readlines()

for filename in txt:
    file, subdistribution = filename.split(':')
    files[file] = subdistribution.rstrip('\n')

### Make `ner` and `wordner` layers from goldstandard files

In [6]:
# Remove layers to keep file sizes low
removed_layers = ['sentences', 'morph_analysis', 'compound_tokens', 'ner', 'words', 'tokens']

In [7]:
for file in files:
    with open(os.path.join('..', 'data', 'vallakohtufailid-json-flattened', file), 'r', encoding='UTF-8') as f:
        text = json_to_text(f.read()).text
        
        if file == "Tartu_V6nnu_Ahja_id3502_1882a.json":
            text.replace('..', '. .')
            
        text = Text(text)
        text = preprocess_text(text)
        
        nertagger.tag(text)
        text.add_layer(flatten(text['ner'], 'flat_ner'))
        
        word_level_ner.tag(text)
        text.add_layer(flatten(text['wordner'], 'flat_wordner'))
        
        for x in removed_layers:
            text.pop_layer(x)
        
        text_to_json(text, file=os.path.join('..', 'data', 'vallakohtufailid_nertagger_baseline', file))

### Calculate the scores

In [8]:
files_not_working = [
                    'J2rva_Tyri_V22tsa_id22177_1911a.json', \
                     'J2rva_Tyri_V22tsa_id18538_1894a.json', \
                     'J2rva_Tyri_V22tsa_id22155_1911a.json', \
                     'Saare_Kihelkonna_Kotlandi_id18845_1865a.json', \
                     'P2rnu_Halliste_Abja_id257_1844a.json', \
                     'Saare_Kaarma_Loona_id7575_1899a.json', \
                     'J2rva_Tyri_V22tsa_id22178_1912a.json', \
                     'J2rva_Tyri_V22tsa_id22266_1913a.json'
                    ]
# These files don't work because the protocols are written in a different language, which the goldstandard didn't
# recognise, hence have no goldstandard tags.

In [47]:
%%time
gold_ner_loc = []
gold_ner_org = []
gold_ner = []
test_ner = []

for file in sorted(os.listdir(os.path.join('..', 'data', 'vallakohtufailid_nertagger_baseline'))):
    appendable_gold_ner = []
    appendable_test_ner = []
    appendable_gold_ner_loc = []
    appendable_gold_ner_org = []
    
    if file.endswith(".json"):
        if file in files_not_working:
            continue
        else:
            with open(os.path.join('..', 'data', 'vallakohtufailid_nertagger_baseline', file), 'r', encoding='UTF-8') as f_test, \
                open(os.path.join('..', 'data', 'vallakohtufailid-json-flattened', file), 'r', encoding='UTF-8') as f_gold:
                    test_import = json_to_text(f_test.read())
                    gold_import = json_to_text(f_gold.read())
                    
            '''
            for i in range(len(gold_import['gold_wordner'])):
                tag = gold_import['gold_wordner'][i].nertag
                gold.append(tag)
            for i in range(len(test_import['flat_wordner'])):
                tag = test_import['flat_wordner'][i].nertag
                test.append(tag)
            '''
            
            for i in range(len(gold_import['gold_ner'])):
                ner = gold_import['gold_ner'][i]
                label = ner.nertag
                start = int(ner.start)
                end = int(ner.end)
                appendable_gold_ner.append({"label": label, "start": start, "end": end})
                appendable_gold_ner_loc.append({"label": 'LOC' if label == 'LOC_ORG' else label, "start": start, "end": end})
                appendable_gold_ner_org.append({"label": 'ORG' if label == 'LOC_ORG' else label, "start": start, "end": end})

            for i in range(len(test_import['flat_ner'])):
                ner = test_import['flat_ner'][i]
                label = ner.nertag[0]
                start = int(ner.start)
                end = int(ner.end)
                appendable_test_ner.append({"label": label, "start": start, "end": end})

    gold_ner.append(appendable_gold_ner)
    gold_ner_loc.append(appendable_gold_ner_loc)
    gold_ner_org.append(appendable_gold_ner_org)
    test_ner.append(appendable_test_ner)

Wall time: 20.4 s


### Results for LOC_ORG:

In [49]:
evaluator = Evaluator(gold_ner, test_ner, tags=['ORG', 'PER', 'MISC', 'LOC', 'LOC_ORG'])
results, results_per_tag = evaluator.evaluate()

In [50]:
display(pd.DataFrame(results))

Unnamed: 0,ent_type,partial,strict,exact
correct,18331.0,16817.0,15332.0,16817.0
incorrect,4327.0,0.0,7326.0,5841.0
partial,0.0,5841.0,0.0,0.0
missed,4886.0,4886.0,4886.0,4886.0
spurious,4027.0,4027.0,4027.0,4027.0
possible,27544.0,27544.0,27544.0,27544.0
actual,26685.0,26685.0,26685.0,26685.0
precision,0.68694,0.739648,0.574555,0.630204
recall,0.665517,0.716581,0.556637,0.61055
f1,0.676059,0.727932,0.565454,0.620222


In [51]:
display(pd.DataFrame(results_per_tag))

Unnamed: 0,ORG,PER,MISC,LOC,LOC_ORG
ent_type,"{'correct': 145, 'incorrect': 175, 'partial': 0, 'missed': 99, 'spurious': 1312, 'possible': 419, 'actual': 1632, 'precision': 0.08884803921568628, 'recall': 0.3460620525059666, 'f1': 0.14139444173573865}","{'correct': 17911, 'incorrect': 1901, 'partial': 0, 'missed': 3318, 'spurious': 2139, 'possible': 23130, 'actual': 21951, 'precision': 0.8159537150927065, 'recall': 0.7743623000432339, 'f1': 0.7946141389942548}","{'correct': 0, 'incorrect': 143, 'partial': 0, 'missed': 111, 'spurious': 0, 'possible': 254, 'actual': 143, 'precision': 0.0, 'recall': 0.0, 'f1': 0}","{'correct': 275, 'incorrect': 312, 'partial': 0, 'missed': 421, 'spurious': 576, 'possible': 1008, 'actual': 1163, 'precision': 0.236457437661221, 'recall': 0.2728174603174603, 'f1': 0.2533394748963611}","{'correct': 0, 'incorrect': 1796, 'partial': 0, 'missed': 937, 'spurious': 0, 'possible': 2733, 'actual': 1796, 'precision': 0.0, 'recall': 0.0, 'f1': 0}"
partial,"{'correct': 67, 'incorrect': 0, 'partial': 253, 'missed': 99, 'spurious': 1312, 'possible': 419, 'actual': 1632, 'precision': 0.11856617647058823, 'recall': 0.4618138424821002, 'f1': 0.1886884446611409}","{'correct': 15810, 'incorrect': 0, 'partial': 4002, 'missed': 3318, 'spurious': 2139, 'possible': 23130, 'actual': 21951, 'precision': 0.8113981139811398, 'recall': 0.7700389105058366, 'f1': 0.7901776801756837}","{'correct': 15, 'incorrect': 0, 'partial': 128, 'missed': 111, 'spurious': 0, 'possible': 254, 'actual': 143, 'precision': 0.5524475524475524, 'recall': 0.3110236220472441, 'f1': 0.3979848866498741}","{'correct': 371, 'incorrect': 0, 'partial': 216, 'missed': 421, 'spurious': 576, 'possible': 1008, 'actual': 1163, 'precision': 0.411865864144454, 'recall': 0.4751984126984127, 'f1': 0.4412713035467526}","{'correct': 554, 'incorrect': 0, 'partial': 1242, 'missed': 937, 'spurious': 0, 'possible': 2733, 'actual': 1796, 'precision': 0.6542316258351893, 'recall': 0.42993047932674716, 'f1': 0.5188783395893133}"
strict,"{'correct': 64, 'incorrect': 256, 'partial': 0, 'missed': 99, 'spurious': 1312, 'possible': 419, 'actual': 1632, 'precision': 0.0392156862745098, 'recall': 0.15274463007159905, 'f1': 0.062408581179912236}","{'correct': 15039, 'incorrect': 4773, 'partial': 0, 'missed': 3318, 'spurious': 2139, 'possible': 23130, 'actual': 21951, 'precision': 0.6851168511685117, 'recall': 0.6501945525291829, 'f1': 0.6671990417248952}","{'correct': 0, 'incorrect': 143, 'partial': 0, 'missed': 111, 'spurious': 0, 'possible': 254, 'actual': 143, 'precision': 0.0, 'recall': 0.0, 'f1': 0}","{'correct': 229, 'incorrect': 358, 'partial': 0, 'missed': 421, 'spurious': 576, 'possible': 1008, 'actual': 1163, 'precision': 0.19690455717970765, 'recall': 0.22718253968253968, 'f1': 0.21096269000460616}","{'correct': 0, 'incorrect': 1796, 'partial': 0, 'missed': 937, 'spurious': 0, 'possible': 2733, 'actual': 1796, 'precision': 0.0, 'recall': 0.0, 'f1': 0}"
exact,"{'correct': 67, 'incorrect': 253, 'partial': 0, 'missed': 99, 'spurious': 1312, 'possible': 419, 'actual': 1632, 'precision': 0.04105392156862745, 'recall': 0.15990453460620524, 'f1': 0.06533398342272062}","{'correct': 15810, 'incorrect': 4002, 'partial': 0, 'missed': 3318, 'spurious': 2139, 'possible': 23130, 'actual': 21951, 'precision': 0.7202405357386907, 'recall': 0.6835278858625162, 'f1': 0.7014041392160777}","{'correct': 15, 'incorrect': 128, 'partial': 0, 'missed': 111, 'spurious': 0, 'possible': 254, 'actual': 143, 'precision': 0.1048951048951049, 'recall': 0.05905511811023622, 'f1': 0.07556675062972291}","{'correct': 371, 'incorrect': 216, 'partial': 0, 'missed': 421, 'spurious': 576, 'possible': 1008, 'actual': 1163, 'precision': 0.31900257953568356, 'recall': 0.3680555555555556, 'f1': 0.3417779824965454}","{'correct': 554, 'incorrect': 1242, 'partial': 0, 'missed': 937, 'spurious': 0, 'possible': 2733, 'actual': 1796, 'precision': 0.30846325167037864, 'recall': 0.20270764727405782, 'f1': 0.2446456171340252}"


### Results for LOC:

In [52]:
evaluator = Evaluator(gold_ner_loc, test_ner, tags=['ORG', 'PER', 'MISC', 'LOC'])
results_loc, results_per_tag_loc = evaluator.evaluate()

In [53]:
display(pd.DataFrame(results_loc))

Unnamed: 0,ent_type,partial,strict,exact
correct,18887.0,16817.0,15560.0,16817.0
incorrect,3771.0,0.0,7098.0,5841.0
partial,0.0,5841.0,0.0,0.0
missed,4886.0,4886.0,4886.0,4886.0
spurious,4027.0,4027.0,4027.0,4027.0
possible,27544.0,27544.0,27544.0,27544.0
actual,26685.0,26685.0,26685.0,26685.0
precision,0.707776,0.739648,0.583099,0.630204
recall,0.685703,0.716581,0.564914,0.61055
f1,0.696565,0.727932,0.573863,0.620222


In [33]:
display(pd.DataFrame(results_per_tag_loc))

Unnamed: 0,ORG,PER,MISC,LOC
ent_type,"{'correct': 145, 'incorrect': 175, 'partial': 0, 'missed': 99, 'spurious': 1524, 'possible': 419, 'actual': 1844, 'precision': 0.07863340563991324, 'recall': 0.3460620525059666, 'f1': 0.12814847547503314}","{'correct': 18328, 'incorrect': 2049, 'partial': 0, 'missed': 2755, 'spurious': 2603, 'possible': 23132, 'actual': 22980, 'precision': 0.7975630983463882, 'recall': 0.7923223240532595, 'f1': 0.7949340735600278}","{'correct': 0, 'incorrect': 144, 'partial': 0, 'missed': 110, 'spurious': 0, 'possible': 254, 'actual': 144, 'precision': 0.0, 'recall': 0.0, 'f1': 0}","{'correct': 275, 'incorrect': 312, 'partial': 0, 'missed': 421, 'spurious': 1130, 'possible': 1008, 'actual': 1717, 'precision': 0.16016307513104253, 'recall': 0.2728174603174603, 'f1': 0.2018348623853211}"
partial,"{'correct': 67, 'incorrect': 0, 'partial': 253, 'missed': 99, 'spurious': 1524, 'possible': 419, 'actual': 1844, 'precision': 0.1049349240780911, 'recall': 0.4618138424821002, 'f1': 0.171011931064958}","{'correct': 15810, 'incorrect': 0, 'partial': 4567, 'missed': 2755, 'spurious': 2603, 'possible': 23132, 'actual': 22980, 'precision': 0.7873585726718886, 'recall': 0.7821848521528618, 'f1': 0.7847631852879945}","{'correct': 15, 'incorrect': 0, 'partial': 129, 'missed': 110, 'spurious': 0, 'possible': 254, 'actual': 144, 'precision': 0.5520833333333334, 'recall': 0.31299212598425197, 'f1': 0.39949748743718594}","{'correct': 371, 'incorrect': 0, 'partial': 216, 'missed': 421, 'spurious': 1130, 'possible': 1008, 'actual': 1717, 'precision': 0.2789749563191613, 'recall': 0.4751984126984127, 'f1': 0.3515596330275229}"
strict,"{'correct': 64, 'incorrect': 256, 'partial': 0, 'missed': 99, 'spurious': 1524, 'possible': 419, 'actual': 1844, 'precision': 0.03470715835140998, 'recall': 0.15274463007159905, 'f1': 0.05656208572691118}","{'correct': 15039, 'incorrect': 5338, 'partial': 0, 'missed': 2755, 'spurious': 2603, 'possible': 23132, 'actual': 22980, 'precision': 0.6544386422976501, 'recall': 0.6501383365035449, 'f1': 0.6522814018043025}","{'correct': 0, 'incorrect': 144, 'partial': 0, 'missed': 110, 'spurious': 0, 'possible': 254, 'actual': 144, 'precision': 0.0, 'recall': 0.0, 'f1': 0}","{'correct': 229, 'incorrect': 358, 'partial': 0, 'missed': 421, 'spurious': 1130, 'possible': 1008, 'actual': 1717, 'precision': 0.1333721607454863, 'recall': 0.22718253968253968, 'f1': 0.16807339449541286}"
exact,"{'correct': 67, 'incorrect': 253, 'partial': 0, 'missed': 99, 'spurious': 1524, 'possible': 419, 'actual': 1844, 'precision': 0.03633405639913232, 'recall': 0.15990453460620524, 'f1': 0.059213433495360145}","{'correct': 15810, 'incorrect': 4567, 'partial': 0, 'missed': 2755, 'spurious': 2603, 'possible': 23132, 'actual': 22980, 'precision': 0.6879895561357703, 'recall': 0.6834687878263876, 'f1': 0.6857217210270645}","{'correct': 15, 'incorrect': 129, 'partial': 0, 'missed': 110, 'spurious': 0, 'possible': 254, 'actual': 144, 'precision': 0.10416666666666667, 'recall': 0.05905511811023622, 'f1': 0.07537688442211056}","{'correct': 371, 'incorrect': 216, 'partial': 0, 'missed': 421, 'spurious': 1130, 'possible': 1008, 'actual': 1717, 'precision': 0.2160745486313337, 'recall': 0.3680555555555556, 'f1': 0.2722935779816514}"


### Results for ORG:

In [54]:
evaluator = Evaluator(gold_ner_org, test_ner, tags=['ORG', 'PER', 'MISC', 'LOC'])
results_org, results_per_tag_org = evaluator.evaluate()

In [55]:
display(pd.DataFrame(results_org))

Unnamed: 0,ent_type,partial,strict,exact
correct,18690.0,16817.0,15430.0,16817.0
incorrect,3968.0,0.0,7228.0,5841.0
partial,0.0,5841.0,0.0,0.0
missed,4886.0,4886.0,4886.0,4886.0
spurious,4027.0,4027.0,4027.0,4027.0
possible,27544.0,27544.0,27544.0,27544.0
actual,26685.0,26685.0,26685.0,26685.0
precision,0.700393,0.739648,0.578227,0.630204
recall,0.678551,0.716581,0.560195,0.61055
f1,0.689299,0.727932,0.569068,0.620222


In [56]:
display(pd.DataFrame(results_per_tag_org))

Unnamed: 0,ORG,PER,MISC,LOC
ent_type,"{'correct': 504, 'incorrect': 1612, 'partial': 0, 'missed': 1036, 'spurious': 1312, 'possible': 3152, 'actual': 3428, 'precision': 0.147024504084014, 'recall': 0.1598984771573604, 'f1': 0.15319148936170213}","{'correct': 17911, 'incorrect': 1901, 'partial': 0, 'missed': 3318, 'spurious': 2139, 'possible': 23130, 'actual': 21951, 'precision': 0.8159537150927065, 'recall': 0.7743623000432339, 'f1': 0.7946141389942548}","{'correct': 0, 'incorrect': 143, 'partial': 0, 'missed': 111, 'spurious': 0, 'possible': 254, 'actual': 143, 'precision': 0.0, 'recall': 0.0, 'f1': 0}","{'correct': 275, 'incorrect': 312, 'partial': 0, 'missed': 421, 'spurious': 576, 'possible': 1008, 'actual': 1163, 'precision': 0.236457437661221, 'recall': 0.2728174603174603, 'f1': 0.2533394748963611}"
partial,"{'correct': 621, 'incorrect': 0, 'partial': 1495, 'missed': 1036, 'spurious': 1312, 'possible': 3152, 'actual': 3428, 'precision': 0.3992123687281214, 'recall': 0.4341687817258883, 'f1': 0.41595744680851066}","{'correct': 15810, 'incorrect': 0, 'partial': 4002, 'missed': 3318, 'spurious': 2139, 'possible': 23130, 'actual': 21951, 'precision': 0.8113981139811398, 'recall': 0.7700389105058366, 'f1': 0.7901776801756837}","{'correct': 15, 'incorrect': 0, 'partial': 128, 'missed': 111, 'spurious': 0, 'possible': 254, 'actual': 143, 'precision': 0.5524475524475524, 'recall': 0.3110236220472441, 'f1': 0.3979848866498741}","{'correct': 371, 'incorrect': 0, 'partial': 216, 'missed': 421, 'spurious': 576, 'possible': 1008, 'actual': 1163, 'precision': 0.411865864144454, 'recall': 0.4751984126984127, 'f1': 0.4412713035467526}"
strict,"{'correct': 162, 'incorrect': 1954, 'partial': 0, 'missed': 1036, 'spurious': 1312, 'possible': 3152, 'actual': 3428, 'precision': 0.047257876312718786, 'recall': 0.05139593908629442, 'f1': 0.04924012158054711}","{'correct': 15039, 'incorrect': 4773, 'partial': 0, 'missed': 3318, 'spurious': 2139, 'possible': 23130, 'actual': 21951, 'precision': 0.6851168511685117, 'recall': 0.6501945525291829, 'f1': 0.6671990417248952}","{'correct': 0, 'incorrect': 143, 'partial': 0, 'missed': 111, 'spurious': 0, 'possible': 254, 'actual': 143, 'precision': 0.0, 'recall': 0.0, 'f1': 0}","{'correct': 229, 'incorrect': 358, 'partial': 0, 'missed': 421, 'spurious': 576, 'possible': 1008, 'actual': 1163, 'precision': 0.19690455717970765, 'recall': 0.22718253968253968, 'f1': 0.21096269000460616}"
exact,"{'correct': 621, 'incorrect': 1495, 'partial': 0, 'missed': 1036, 'spurious': 1312, 'possible': 3152, 'actual': 3428, 'precision': 0.18115519253208867, 'recall': 0.19701776649746192, 'f1': 0.18875379939209722}","{'correct': 15810, 'incorrect': 4002, 'partial': 0, 'missed': 3318, 'spurious': 2139, 'possible': 23130, 'actual': 21951, 'precision': 0.7202405357386907, 'recall': 0.6835278858625162, 'f1': 0.7014041392160777}","{'correct': 15, 'incorrect': 128, 'partial': 0, 'missed': 111, 'spurious': 0, 'possible': 254, 'actual': 143, 'precision': 0.1048951048951049, 'recall': 0.05905511811023622, 'f1': 0.07556675062972291}","{'correct': 371, 'incorrect': 216, 'partial': 0, 'missed': 421, 'spurious': 576, 'possible': 1008, 'actual': 1163, 'precision': 0.31900257953568356, 'recall': 0.3680555555555556, 'f1': 0.3417779824965454}"
