# How well does the current NER model for written Estonian perform on the court protocols?

### Imports:

In [1]:
import json
import os
import re
import pandas as pd

from estnltk import Text
from estnltk.taggers import NerTagger
from estnltk.converters import text_to_json
from estnltk.converters import json_to_text
from estnltk.layer_operations import flatten

from nervaluate import Evaluator
from modules.preprocessing_protocols import preprocess_text
from modules.results_extraction import extract_results, results_by_subdistribution, results_by_named_entity

### Re-used variables:

In [2]:
# Every input and output of this notebook lives under ../data, which is also
# where the evaluation results are written.
results_location = os.path.join('..', 'data')

# Index file read below as "<file name>:<subdistribution>" lines.
divided_corpus = os.path.join(results_location, 'divided_corpus.txt')
# Directory holding the JSON documents that are read for tagging.
json_files_location = os.path.join(results_location, 'vallakohtufailid-json-flattened')
# Directory that receives the NerTagger output for each document.
baseline_files_location = os.path.join(results_location, 'vallakohtufailid_nertagger_baseline')
# List of file names that have no goldstandard annotations.
no_goldstandard_tags_location = os.path.join(results_location, 'files_without_goldstandard_annotations.txt')

# Layers popped, in this order, from every tagged text before it is saved.
removed_layers = [
    'sentences',
    'morph_analysis',
    'compound_tokens',
    'ner',
    'words',
    'tokens',
]

# One tagger instance with default settings, reused for every file.
nertagger = NerTagger()

Read the file names and their subdistributions from the divided corpus:

In [3]:
# Maps each corpus file name to the subdistribution it was assigned to.
files = {}

with open(divided_corpus, 'r', encoding='UTF-8') as f:
    txt = f.readlines()

# Each non-empty line has the form "<file name>:<subdistribution>".
for line in txt:
    # Skip blank lines (e.g. a trailing empty line) instead of failing on
    # the tuple unpacking below.
    if not line.strip():
        continue
    # Split on the last colon only, so the final field is always taken as the
    # subdistribution; a line without any colon still raises ValueError.
    file, subdistribution = line.rsplit(':', 1)
    files[file.strip()] = subdistribution.strip()

Get the names of the files that have no goldstandard annotations:

In [4]:
# The listing is read line by line; surrounding whitespace (including the
# newline) is stripped from every entry.
with open(no_goldstandard_tags_location, encoding='UTF-8') as in_f:
    lines = list(in_f)

no_goldstandard_annotations = list(map(str.strip, lines))

### Tag the goldstandard files with `NerTagger` and save the resulting `flat_ner` layer

In [9]:
# Make sure the output directory exists so that writing the first file does
# not fail on a fresh checkout.
os.makedirs(baseline_files_location, exist_ok=True)

for file in files:
    # Only the raw text of the stored document is reused; a fresh Text object
    # is built from it below. The input file is closed before tagging starts.
    with open(os.path.join(json_files_location, file), 'r', encoding='UTF-8') as in_f:
        raw_text = json_to_text(in_f.read()).text

    text = preprocess_text(Text(raw_text))

    # Tag named entities and keep a flattened copy of the 'ner' layer under
    # the name 'flat_ner'.
    nertagger.tag(text)
    text.add_layer(flatten(text['ner'], 'flat_ner'))

    # Drop the listed layers (including the original 'ner') before saving;
    # 'flat_ner' is not in the list and is therefore kept.
    for layer in removed_layers:
        text.pop_layer(layer)

    # Store the result under the same file name in the baseline directory.
    text_to_json(text, file=os.path.join(baseline_files_location, file))

### Calculate the scores

In [7]:
# Score the baseline output against the goldstandard files; judging by the
# cell output, the results are saved to results.txt under results_location.
extract_results(
    files,
    no_goldstandard_annotations,
    baseline_files_location,
    json_files_location,
    results_location,
)

Results have been saved to ../data/results.txt


### Results:

In [5]:
# Parse the saved results file (JSON content despite the .txt extension).
with open(os.path.join(results_location, 'results.txt'), encoding='UTF-8') as in_f:
    json_text = json.load(in_f)

# Aggregate the parsed results in two ways: per subdistribution and per
# named-entity label.
results_subdist = results_by_subdistribution(json_text, files)
results_ne = results_by_named_entity(json_text, files)

In [6]:
# Render the per-subdistribution scores as a table.
display(pd.DataFrame(data=results_subdist))

Alamhulk,1,2,3,4,5,Total
Precision,0.591655,0.584328,0.542553,0.543807,0.606783,0.573695
Recall,0.574115,0.574496,0.54321,0.501393,0.583789,0.553877
F1,0.582753,0.57937,0.542881,0.521739,0.595064,0.563612


In [7]:
# Render the per-label scores as a table.
display(pd.DataFrame(data=results_ne))

Unnamed: 0,1,2,3,4,5,Total
ORG_precision,0.051643,0.02974,0.04023,0.039867,0.015209,0.035338
ORG_recall,0.186441,0.121212,0.2,0.164384,0.051282,0.144664
ORG_f1score,0.080882,0.047761,0.066986,0.064171,0.02346,0.056652
PER_precision,0.695688,0.695866,0.662948,0.655266,0.721677,0.686289
PER_recall,0.66658,0.665445,0.634815,0.592559,0.687737,0.649427
PER_f1score,0.680823,0.680316,0.648577,0.622337,0.704298,0.66727
MISC_precision,0.0,0.0,0.0,0.0,0.0,0.0
MISC_recall,0.0,0.0,0.0,0.0,0.0,0.0
MISC_f1score,0.0,0.0,0.0,0.0,0.0,0.0
LOC_precision,0.103825,0.21327,0.161458,0.240437,0.236715,0.191141


Disclaimer: The results for the `LOC_ORG` and `MISC` tags are `0` because the current NER tagger model for written Estonian does not use these labels. `LOC_ORG` could be defined as either `LOC` or `ORG`.