In [1]:
import json
import time
import os
import re

from modules.preprocessing_protocols import preprocess_text
from modules.results_extraction import extract_results, results_by_subdistribution,\
                                    results_by_named_entity, confusion_matrix
from modules.tools import find

from estnltk import Text
from estnltk.taggers import NerTagger
from estnltk.converters import text_to_json
from estnltk.converters import json_to_text
from estnltk.layer_operations import flatten

from estnltk.taggers.estner.ner_trainer import NerTrainer
from estnltk.taggers.estner.model_storage_util import ModelStorageUtil

from estnltk.taggers import VabamorfCorpusTagger
vm_corpus_tagger = VabamorfCorpusTagger()

import pandas as pd

from nervaluate import Evaluator

In [2]:
# All corpus inputs live under ../data, relative to the notebook's working directory.
data_dir = os.path.join('..', 'data')

# Text files listing "<file name>:<subdistribution>" pairs.
divided_corpus = os.path.join(data_dir, 'divided_corpus.txt')
testing_files_names = os.path.join(data_dir, 'corpus_subdistribution_without_hand_tagged.txt')

# List of file names that have no gold-standard annotations.
no_goldstandard_tags_location = os.path.join(data_dir, 'files_without_goldstandard_annotations.txt')

# Directories with the JSON-serialised texts and the raw source texts.
json_files_location = os.path.join(data_dir, 'vallakohtufailid-json-flattened')
vallakohtufailid_location = os.path.join(data_dir, 'vallakohtufailid')

# The testing files are read from the same directory as the training JSON files.
testing_files_location = json_files_location

# Layers dropped from a tagged text before it is written out as JSON.
removed_layers = [
    'sentences',
    'morph_analysis',
    'compound_tokens',
    'ner',
    'words',
    'tokens',
]

Get files without gold-standard annotations:

In [3]:
# Read the list of file names (one per line) that lack gold-standard annotations.
with open(no_goldstandard_tags_location, 'r', encoding='UTF-8') as in_f:
    lines = list(in_f)

# Drop the trailing newline and any surrounding whitespace from every entry.
no_goldstandard_annotations = []
for line in lines:
    no_goldstandard_annotations.append(line.strip())

Get all files (subdistributions 1–5):

In [None]:
# Maps file name -> subdistribution label, parsed from "<file name>:<subdistribution>" lines.
files = {}

with open(divided_corpus, 'r', encoding='UTF-8') as f:
    for line in f:
        entry = line.strip()
        # A blank (e.g. trailing) line has no ':' and would break the unpacking below.
        if not entry:
            continue
        file, subdistribution = entry.split(":")
        # strip() rather than rstrip("\n"), matching how testing_files is parsed.
        files[file] = subdistribution.strip()

Get files for testing (6th subdistribution):

In [4]:
# Maps testing file name -> subdistribution label, parsed from
# "<file name>:<subdistribution>" lines.
testing_files = {}

with open(testing_files_names, 'r', encoding='UTF-8') as in_f:
    for line in in_f:
        entry = line.strip()
        # A blank (e.g. trailing) line has no ':' and would break the unpacking below.
        if not entry:
            continue
        file, subdistribution = entry.split(":")
        testing_files[file] = subdistribution.strip()

In [None]:
def create_training_texts(filenames):
    """Load and preprocess the training texts for the given file names.

    Files listed in the module-level ``no_goldstandard_annotations`` are
    skipped. Every remaining file is read from ``json_files_location``,
    converted with ``json_to_text`` and passed through ``preprocess_text``.

    Args:
        filenames: Iterable of JSON file names located in ``json_files_location``.

    Returns:
        List of preprocessed texts, in the order of ``filenames``.
    """
    print("(!) Preparing training texts")

    # Build the set once: membership tests on the original list would rescan
    # it for every file name.
    skipped = set(no_goldstandard_annotations)
    training_texts = []

    for file in filenames:
        if file in skipped:
            continue
        with open(os.path.join(json_files_location, file), 'r', encoding='UTF-8') as in_f:
            raw_json = in_f.read()
        training_texts.append(preprocess_text(json_to_text(raw_json)))

    print("(!) Training texts prepared")
    return training_texts

In [None]:
def train_nertagger(training_texts, new_model_dir):
    """Train a NER model on ``training_texts`` and store it in ``new_model_dir``.

    The trainer settings are loaded from ``new_model_dir`` through
    ``ModelStorageUtil``; training uses the ``gold_wordner`` layer of the texts.

    Args:
        training_texts: Texts to train on; each must carry a ``gold_wordner`` layer.
        new_model_dir: Directory holding the model settings; also passed to the
            trainer as the model output directory.
    """
    print("(!) Training NerTagger")
    settings = ModelStorageUtil(new_model_dir).load_settings()
    NerTrainer(settings).train(
        training_texts,
        layer='gold_wordner',
        model_dir=new_model_dir,
    )
    print("(!) NerTagger trained")

In [None]:
def tag_files(model_dir, testing_files):
    """Tag the testing files with the model in ``model_dir`` and save them as JSON.

    For each entry of ``testing_files`` the matching ``.txt`` source is looked
    up under ``vallakohtufailid_location``, preprocessed, tagged with
    ``NerTagger`` and given a flattened ``flat_ner`` layer. The layers named in
    ``removed_layers`` are dropped and the result is written to
    ``<model_dir>/vallakohtufailid-trained-nertagger/<file name>``.

    Args:
        model_dir: Directory of the trained NerTagger model; also the parent
            of the output directory.
        testing_files: Iterable of ``.json`` file names to tag.
    """
    print("(!) Tagging files")

    nertagger = NerTagger(model_dir)

    # The output directory is the same for every file, so create it once up
    # front instead of re-checking it on each iteration.
    output_dir = os.path.join(model_dir, 'vallakohtufailid-trained-nertagger')
    os.makedirs(output_dir, exist_ok=True)

    for file_number, test_file in enumerate(testing_files, start=1):
        source_path = find(test_file.replace('.json', '.txt'), vallakohtufailid_location)
        with open(source_path, 'r', encoding='UTF-8') as in_f:
            raw_text = in_f.read()

        # NOTE(review): this one file gets '..' rewritten to '. .' before
        # preprocessing; the reason is not visible here - presumably a
        # tokenization workaround, confirm before removing.
        if test_file == "Tartu_V6nnu_Ahja_id3502_1882a.json":
            raw_text = raw_text.replace('..', '. .')
        text = preprocess_text(Text(raw_text))

        nertagger.tag(text)
        text.add_layer(flatten(text['ner'], 'flat_ner'))

        for layer_name in removed_layers:
            text.pop_layer(layer_name)

        text_to_json(text, file=os.path.join(output_dir, test_file))

        print(f'{file_number}. Tagged file {test_file}')

    print("(!) All files tagged")


In [None]:
def train_model(model_directory):
    """Train a NerTagger model, tag the testing files with it and extract results.

    Training uses every file name in the module-level ``files`` dict (minus
    those skipped by ``create_training_texts``); tagging and evaluation use the
    module-level ``testing_files``.

    Args:
        model_directory: Model path relative to the ``models`` directory. The
            model settings are read from there, and the tagged files and
            results are stored there.
    """
    # Iterating a dict yields its keys, i.e. the training file names.
    filenames = list(files)

    # Create training_texts from the aforementioned filenames
    training_texts = create_training_texts(filenames)

    # Set up the trainer and training
    new_model_dir = os.path.join('models', model_directory)
    train_nertagger(training_texts, new_model_dir)

    # Tag the files using the new nertagger (tag_files builds its own NerTagger,
    # so no tagger instance is needed here)
    tag_files(new_model_dir, testing_files)

    # Get results of model
    tagged_files_location = os.path.join(new_model_dir, 'vallakohtufailid-trained-nertagger')
    extract_results(model_directory,  # model directory path
                    testing_files,
                    no_goldstandard_annotations,
                    tagged_files_location,  # files tagged by trained nertagger location
                    testing_files_location,
                    new_model_dir)  # results.txt location

    print(f"(!) Model {model_directory} trained & tested")

In [6]:
# Model to train and evaluate, given as a path relative to the 'models' directory.
model_directory = os.path.join(
    'model_morph_with_lemmas_and_sentences_and_gazetteer_and_global_features',
    'model_gaz_loc_variants_best',
)

In [None]:
# Train on the file names in `files`, tag `testing_files` with the new model and
# extract the results; models/<model_directory> is passed on as the results location.
train_model(model_directory)

# Results

Overall results:

In [5]:
# Load the stored results of the selected model and print the overall
# precision, recall and F1 over the testing files.
results_path = os.path.join('models', model_directory, 'results.txt')
with open(results_path, 'r', encoding='UTF-8') as in_f:
    results = json.load(in_f)
print(results_by_subdistribution(results, testing_files))

   Precision    Recall        F1
0   0.924694  0.892192  0.908152


In [5]:
# Load the stored results of the selected model and print precision, recall
# and F1 separately for every named-entity type.
results_path = os.path.join('models', model_directory, 'results.txt')
with open(results_path, 'r', encoding='UTF-8') as in_f:
    results = json.load(in_f)
print(results_by_named_entity(results, testing_files))

                          6     Total
ORG_precision      0.838235  0.838235
ORG_recall         0.780822  0.780822
ORG_f1score        0.808511  0.808511
PER_precision      0.953627  0.953627
PER_recall         0.936574  0.936574
PER_f1score        0.945024  0.945024
MISC_precision     0.684211  0.684211
MISC_recall        0.577778  0.577778
MISC_f1score       0.626506  0.626506
LOC_precision      0.601695  0.601695
LOC_recall         0.489655  0.489655
LOC_f1score        0.539924  0.539924
LOC_ORG_precision  0.786325  0.786325
LOC_ORG_recall     0.695214  0.695214
LOC_ORG_f1score    0.737968  0.737968


In [9]:
# Compare the gold-standard labels with the labels predicted by the trained
# tagger and print them as a cross-tabulation (rows: actual, columns: predicted).
tagged_files_location = os.path.join('models', model_directory, 'vallakohtufailid-trained-nertagger')
y_true, y_pred = confusion_matrix(
    model_directory,
    testing_files,
    no_goldstandard_annotations,
    tagged_files_location,
    testing_files_location,
)

print(pd.crosstab(y_true, y_pred))

Predicted  LOC  LOC_ORG  MISC  ORG   PER
Actual                                  
LOC         71       10     0    1    12
LOC_ORG      5      276     0    0    11
MISC         0        0    26    0     5
ORG          0        1     0   57     1
PER          2        3     0    0  3352
