In [1]:
import json
import time
import os
import sklearn_crfsuite
import re
import nereval
import pandas as pd
import CompoundTokenTaggerModule

from estnltk import Text
from estnltk.taggers import NerTagger
from estnltk.taggers import WordLevelNerTagger
from estnltk.converters import text_to_json
from estnltk.converters import json_to_text
from estnltk.layer_operations import flatten
from sklearn.metrics import classification_report
from sklearn_crfsuite import metrics

from nervaluate import Evaluator

from estnltk.taggers.estner.ner_trainer import NerTrainer
from estnltk.taggers.estner.model_storage_util import ModelStorageUtil
from estnltk.core import DEFAULT_PY3_NER_MODEL_DIR

In [2]:
def find(name, path):
    """Return the full path of the first file called ``name`` under ``path``.

    The directory tree rooted at ``path`` is traversed with ``os.walk``; the
    first visited directory whose file list contains ``name`` determines the
    result. ``None`` is returned when no visited directory contains the file.
    """
    for current_dir, _subdirs, filenames in os.walk(path):
        if name not in filenames:
            continue
        return os.path.join(current_dir, name)
    return None

The files listed below are excluded: their protocols are written in a different language, which the gold standard did not recognise, so they have no gold-standard tags.

In [3]:
# Names of the corpus files that are skipped when building the training
# texts and when collecting evaluation results.
files_not_working = [
    'J2rva_Tyri_V22tsa_id22177_1911a.json',
    'J2rva_Tyri_V22tsa_id18538_1894a.json',
    'J2rva_Tyri_V22tsa_id22155_1911a.json',
    'Saare_Kihelkonna_Kotlandi_id18845_1865a.json',
    'P2rnu_Halliste_Abja_id257_1844a.json',
    'Saare_Kaarma_Loona_id7575_1899a.json',
    'J2rva_Tyri_V22tsa_id22266_1913a.json',
    'J2rva_Tyri_V22tsa_id22178_1912a.json',
]

In [4]:
# Maps every corpus file name to the subdistribution it belongs to. The
# subdistribution is kept as a string; later cells convert it with int().
# Each non-empty line of 'divided_corpus.txt' has the form
# "<file name>:<subdistribution>".
files = {}

with open('divided_corpus.txt', 'r', encoding='UTF-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            # A blank line (e.g. an empty line at the end of the file)
            # carries no entry and would otherwise break the unpacking below.
            continue
        # Split on the last colon only, so the part before it is taken as the
        # file name in full. A line without any colon still raises ValueError.
        file_name, subdistribution = line.rsplit(":", 1)
        files[file_name.strip()] = subdistribution.strip()

In [8]:
# Cross-validation over the five subdistributions listed in `files`:
# for every subdistribution, a NerTagger is trained on the other four,
# the held-out files are tagged and written to ./vallakohtufailid_nertagger/,
# and the tagged entities are compared with the gold annotations.
# Results are collected in `all_results`, keyed by the held-out subdistribution.
all_results = {}

all_subdistributions = [1, 2, 3, 4, 5]
# Layers popped from every tagged text before it is written to JSON.
removed_layers = ['sentences', 'morph_analysis', 'compound_tokens', 'ner', 'words', 'tokens']
# Entity labels handed to the evaluator.
evaluated_tags = ['ORG', 'PER', 'MISC', 'LOC', 'LOC_ORG']
# The raw text of this file gets '..' rewritten as '. .' before tagging.
double_period_txt = "Tartu_V6nnu_Ahja_id3502_1882a.txt"


def _layer_to_entities(layer):
    """Convert the spans of ``layer`` into the dicts expected by the evaluator.

    Every span contributes ``{"label", "start", "end"}``, where the label is
    the first value of the span's ``nertag`` attribute.
    """
    return [
        {"label": span.nertag[0], "start": int(span.start), "end": int(span.end)}
        for span in layer
    ]


for subdistribution in all_subdistributions:
    subdistribution_for_testing = subdistribution
    training_subdistributions = [y for y in all_subdistributions if y != subdistribution]

    # File names (keys of `files`) of the training folds and of the held-out fold.
    training_files = [key for key, value in files.items() if int(value) in training_subdistributions]
    test_files = [key for key, value in files.items() if int(value) == subdistribution_for_testing]

    # Creating training_texts from the training file names.
    print("Valmistan ette treenimistekste.")
    start = time.time()
    training_texts = []
    for filename in training_files:
        # Check the exclusion list first so excluded files are never opened.
        if filename in files_not_working:
            continue
        with open('./vallakohtufailid_json_flat/' + str(filename), 'r', encoding='UTF-8') as json_file:
            training_texts.append(CompoundTokenTaggerModule.tag_text(json_to_text(json_file.read())))
    print(f"Treenimistekstid defineeritud {time.time() - start} sekundiga.")

    # Setting up the trainer and training.
    print("\n\nAlustan nertaggeri treenimist.")
    start = time.time()
    model_dir = DEFAULT_PY3_NER_MODEL_DIR
    modelUtil = ModelStorageUtil(model_dir)
    nersettings = modelUtil.load_settings()
    trainer = NerTrainer(nersettings)
    # NOTE(review): the saved output of this cell stops with
    # "TypeError: expected bytes, ImmutableList found" after the training
    # banner above and before the timing line below. Presumably the
    # 'gold_wordner' layer yields list-valued labels -- TODO confirm.
    trainer.train(training_texts, layer='gold_wordner', model_dir='test')
    print(f"NerTagger treenitud {time.time() - start} sekundiga.")

    # Setting up the newly trained nertagger.
    nertagger = NerTagger(model_dir='test')

    # Tagging the held-out files using the new nertagger.
    print("\n\nAlustan failide taggimist.")
    start = time.time()
    for file in test_files:
        txt_name = file.replace(".json", ".txt")
        txt_path = find(txt_name, "./vallakohtufailid/")
        if txt_path is None:
            # Fail with a clear message instead of an opaque error from open(None).
            raise FileNotFoundError(f"{txt_name} was not found under ./vallakohtufailid/")
        with open(txt_path, 'r', encoding='UTF-8') as f:
            text = f.read()
        # Compare the .txt name: `file` itself is the key from `files`, which
        # the rest of this cell treats as a .json name, so comparing `file`
        # against a .txt name would never match.
        if txt_name == double_period_txt:
            text = text.replace('..', '. .')
        text = CompoundTokenTaggerModule.tag_text(Text(text))
        nertagger.tag(text)
        text.add_layer(flatten(text['ner'], 'flat_ner'))

        for x in removed_layers:
            text.pop_layer(x)
        text_to_json(text, file=os.getcwd() + "/vallakohtufailid_nertagger/" + file)
        print(f'Täägitud fail {file}')
    print(f"Failid taggitud {time.time() - start} sekundiga.")

    # Changing the tags into a format readable by the evaluator.
    print("\n\nAlustan tulemuste ammutamist.")

    gold_ner = []
    test_ner = []

    for file in test_files:
        # Excluded files contribute nothing to either list.
        if file in files_not_working:
            continue

        appendable_gold_ner = []
        appendable_test_ner = []

        if file.endswith(".json"):
            with open("./vallakohtufailid_nertagger/" + str(file), 'r', encoding='UTF-8') as f_test, \
                    open("./vallakohtufailid_json_flat/" + str(file), 'r', encoding='UTF-8') as f_gold:
                test_import = json_to_text(f_test.read())
                gold_import = json_to_text(f_gold.read())

            # Word-level NER evaluation would instead collect one tag per word:
            #   gold.extend(span.nertag[0] for span in gold_import['flat_gold_wordner'])
            #   test.extend(span.nertag[0] for span in test_import['flat_wordner'])

            appendable_gold_ner = _layer_to_entities(gold_import['gold_ner'])
            appendable_test_ner = _layer_to_entities(test_import['flat_ner'])

        # One entry per document, in the same order in both lists.
        gold_ner.append(appendable_gold_ner)
        test_ner.append(appendable_test_ner)

    evaluator = Evaluator(gold_ner, test_ner, tags=evaluated_tags)
    results, results_per_tag = evaluator.evaluate()
    all_results[subdistribution_for_testing] = (results, results_per_tag)
print("Programm on lõpetanud oma töö.")

Valmistan ette treenimistekste.
Treenimistekstid defineeritud 133.85761189460754 sekundiga.


Alustan nertaggeri treenimist.


TypeError: expected bytes, ImmutableList found

In [None]:
# Re-import the module inside this cell: a later cell rebinds the name `json`
# to the loaded results, after which re-running this cell would otherwise fail.
import json

# Persist the cross-validation results as JSON text. Serialisation turns the
# integer keys of `all_results` into strings and the (results, results_per_tag)
# tuples into two-element lists.
with open("results_new.txt", "w") as results_file:
    json.dump(all_results, results_file)

In [None]:
# Re-import the module first: the assignment below rebinds the name `json`
# from the module to the loaded results, so without this import a second run
# of the cell would try to call .load on the previously loaded dict.
import json

# NOTE(review): the name `json` is kept for the loaded data because the
# following cells index it (json['1'] ... json['5']); renaming it would be
# cleaner but has to be done together with those cells.
with open("results_new.txt", "r") as f:
    json = json.load(f)

### Tulemused alamhulkade kaupa:

In [None]:
# Precision / recall / F1 of the 'ent_type' evaluation for every held-out
# subdistribution, followed by the scores computed from the counts summed
# over all subdistributions. `json` holds the results loaded from
# results_new.txt; element [0] of each entry is the overall result dict.
correct_all = 0
actual_all = 0
possible_all = 0

fold_keys = ['1', '2', '3', '4', '5']

for i in fold_keys:
    subdistribution_for_testing = i
    train = [j for j in fold_keys if j != i]
    print(f'Testitav alamhulk oli {subdistribution_for_testing} ning treenitavad alamhulgad {train}:')

    counts = json[i][0]['ent_type']
    correct = counts['correct']
    actual = counts['actual']
    possible = counts['possible']
    correct_all += correct
    actual_all += actual
    possible_all += possible

    # A zero denominator (no predicted or no gold entities) gives a score of
    # 0.0 instead of raising ZeroDivisionError.
    precision = (correct / actual) if actual else 0.0
    recall = (correct / possible) if possible else 0.0
    f1 = 2 * ((precision * recall) / (precision + recall)) if (precision + recall) else 0.0
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F1-score: {f1}\n')


print('Tulemused üle alamhulkade:')
precision = (correct_all / actual_all) if actual_all else 0.0
recall = (correct_all / possible_all) if possible_all else 0.0
f1 = 2 * ((precision * recall) / (precision + recall)) if (precision + recall) else 0.0
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-score: {f1}')

### Tulemused nimeüksuste liigi kaupa:

In [None]:
# Precision / recall / F1 of the 'ent_type' evaluation per entity tag, for
# every held-out subdistribution. `json` holds the results loaded from
# results_new.txt; element [1] of each entry maps a tag to its result dict.
fold_keys = ['1', '2', '3', '4', '5']

for i in fold_keys:
    subdistribution_for_testing = i
    train = [j for j in fold_keys if j != i]
    print(f'Testitav alamhulk oli {subdistribution_for_testing} ning treenitavad alamhulgad {train}:')

    for key, tag_results in json[i][1].items():
        counts = tag_results['ent_type']
        correct = counts['correct']
        actual = counts['actual']
        possible = counts['possible']

        # A tag with no predicted or no gold entities in this fold has a zero
        # denominator; report 0.0 instead of raising ZeroDivisionError.
        precision = (correct / actual) if actual else 0.0
        recall = (correct / possible) if possible else 0.0
        f1 = 2 * ((precision * recall) / (precision + recall)) if (precision + recall) else 0.0
        print(f'Key: {key}')
        print(f'Precision: {precision}')
        print(f'Recall: {recall}')
        print(f'F1-score: {f1}\n')