In [1]:
import json
import time
import os
import re

from modules.preprocessing_protocols import preprocess_text
from modules.results_extraction import extract_results, results_by_subdistribution,\
                                    results_by_named_entity, confusion_matrix
from contemporary_ner_training.conll_ner_importer import conll_to_ner_labelling  
from estnltk import Text
from estnltk.taggers import NerTagger
from estnltk.taggers import WordLevelNerTagger
from estnltk.converters import text_to_json
from estnltk.converters import json_to_text
from estnltk.layer_operations import flatten

from estnltk.taggers.estner.ner_trainer import NerTrainer
from estnltk.taggers.estner.model_storage_util import ModelStorageUtil

from estnltk.taggers import VabamorfCorpusTagger
# Module-level Vabamorf corpus tagger instance. create_training_texts and tag_files
# call its .tag() on a list of texts after dropping their 'morph_analysis' layer.
vm_corpus_tagger = VabamorfCorpusTagger()

import sklearn_crfsuite
import pandas as pd

from nervaluate import Evaluator
from sklearn.metrics import classification_report
from sklearn_crfsuite import metrics

# File names that are skipped both when training texts are built and when
# evaluation results are extracted.
files_not_working = [
    'J2rva_Tyri_V22tsa_id22177_1911a.json',
    'J2rva_Tyri_V22tsa_id18538_1894a.json',
    'J2rva_Tyri_V22tsa_id22155_1911a.json',
    'Saare_Kihelkonna_Kotlandi_id18845_1865a.json',
    'P2rnu_Halliste_Abja_id257_1844a.json',
    'Saare_Kaarma_Loona_id7575_1899a.json',
    'J2rva_Tyri_V22tsa_id22266_1913a.json',
    'J2rva_Tyri_V22tsa_id22178_1912a.json',
]

In [36]:
def extract_results(model_dir, files):
    """Compare predicted NER spans with gold spans and store the evaluation.

    NOTE: this definition shadows the ``extract_results`` imported from
    ``modules.results_extraction`` at the top of the notebook.

    For every key of ``files`` that ends in ``.json`` and is not listed in
    ``files_not_working``, two JSON-serialised texts are loaded:

    * the prediction from
      ``models/<model_dir>/vallakohtufailid-trained-nertagger/<file>``
      (layer ``flat_ner``), and
    * the gold annotation from
      ``../data/vallakohtufailid-json-flattened/<file>`` (layer ``gold_ner``).

    Each span becomes a ``{"label", "start", "end"}`` dict; one list of such
    dicts is collected per document and handed to ``nervaluate.Evaluator``
    with the tags ORG, PER, MISC, LOC and LOC_ORG.

    Parameters
    ----------
    model_dir : str
        Model directory relative to ``models``.
    files : dict
        Mapping whose keys are the file names to evaluate; the values are not
        used here.

    Returns
    -------
    tuple
        ``(results, results_per_tag)`` as returned by ``Evaluator.evaluate()``.
        The same pair is also written as JSON to
        ``models/<model_dir>/results.txt``.
    """
    gold_ner = []
    test_ner = []

    # Iterating a dict yields its keys, which is all that is needed here.
    for file in files:
        if not file.endswith(".json") or file in files_not_working:
            continue

        test_path = os.path.join('models', model_dir, 'vallakohtufailid-trained-nertagger', file)
        gold_path = os.path.join('..', 'data', 'vallakohtufailid-json-flattened', file)
        with open(test_path, 'r', encoding='UTF-8') as f_test, \
             open(gold_path, 'r', encoding='UTF-8') as f_gold:
            test_import = json_to_text(f_test.read())
            gold_import = json_to_text(f_gold.read())

        # Gold labels are taken from `nertag` as-is.
        gold_ner.append([
            {"label": ner.nertag, "start": int(ner.start), "end": int(ner.end)}
            for ner in gold_import['gold_ner']
        ])
        # Predicted labels use the first element of `nertag`.
        # NOTE(review): presumably the flattened layer stores a sequence of
        # values per span -- confirm against the layer definition.
        test_ner.append([
            {"label": ner.nertag[0], "start": int(ner.start), "end": int(ner.end)}
            for ner in test_import['flat_ner']
        ])

    evaluator = Evaluator(gold_ner, test_ner, tags=['ORG', 'PER', 'MISC', 'LOC', 'LOC_ORG'])
    results, results_per_tag = evaluator.evaluate()
    all_results = (results, results_per_tag)

    print("Tulemuste ammutamine on lõpetatud.")

    # Written with an explicit encoding because the notebook reads this file
    # back as UTF-8; plain 'w' is enough since the handle is never read.
    with open(os.path.join('models', model_dir, 'results.txt'), 'w', encoding='UTF-8') as results_file:
        results_file.write(json.dumps(all_results))

    return (results, results_per_tag)

In [15]:
def results_by_named_entity(results_json):
    """Compute strict precision, recall and F1 for every entity type.

    NOTE: this definition shadows the ``results_by_named_entity`` imported
    from ``modules.results_extraction`` at the top of the notebook.

    Parameters
    ----------
    results_json : sequence
        Indexable object whose element ``[1]`` maps an entity tag to a dict
        containing a ``'strict'`` entry with the counts ``'correct'``,
        ``'actual'`` and ``'possible'``.

    Returns
    -------
    dict
        ``{'Total': scores}`` where ``scores`` holds, for each tag in input
        order, the keys ``'<tag>_precision'``, ``'<tag>_recall'`` and
        ``'<tag>_f1score'``. A ratio whose denominator is zero is reported
        as ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    def _ratio(numerator, denominator):
        # A tag that was never predicted (actual == 0), never occurs in the
        # gold data (possible == 0) or has no correct hits at all
        # (precision + recall == 0) must not abort the whole table.
        return numerator / denominator if denominator else 0.0

    by_kind = {}
    for tag, schemas in results_json[1].items():
        strict = schemas['strict']
        correct = strict['correct']

        precision = _ratio(correct, strict['actual'])
        recall = _ratio(correct, strict['possible'])
        f1 = 2 * _ratio(precision * recall, precision + recall)

        by_kind[f"{tag}_precision"] = precision
        by_kind[f"{tag}_recall"] = recall
        by_kind[f"{tag}_f1score"] = f1

    return {'Total': by_kind}

In [3]:
def find(name, path):
    """Return the path of the first file named `name` found below `path`.

    Directories are visited in `os.walk` order and only file entries are
    matched. Returns None when no visited directory contains such a file.
    """
    matches = (
        os.path.join(directory, name)
        for directory, _subdirs, filenames in os.walk(path)
        if name in filenames
    )
    return next(matches, None)

In [10]:
# Switch read by create_training_texts and passed on to tag_files by train_model:
# when True, each text's 'morph_analysis' layer is dropped and vm_corpus_tagger
# is run on the text instead.
use_vabamorfcorpustagger = False

In [4]:
# Training corpus listing: file name -> subdistribution, parsed from lines of
# the form "<file>:<subdistribution>" in ../data/divided_corpus.txt.
files = {}

with open(os.path.join('..', 'data', 'divided_corpus.txt'), 'r', encoding = 'UTF-8') as f:
    txt = f.readlines()

for fileName in txt:
    # A blank (e.g. trailing) line carries no "<file>:<subdistribution>" pair
    # and would make the unpacking below raise ValueError.
    if not fileName.strip():
        continue
    file, subdistribution = fileName.split(":")
    files[file] = subdistribution.rstrip("\n")

In [12]:
def create_training_texts(filenames):
    """Load and preprocess the texts used for training.

    Each file name is resolved under ``../data/vallakohtufailid-json-flattened``,
    converted with ``json_to_text`` and passed through ``preprocess_text``.
    Names listed in ``files_not_working`` are skipped without being opened.
    When the module-level ``use_vabamorfcorpustagger`` flag is true, the
    ``morph_analysis`` layer is dropped and ``vm_corpus_tagger`` is run on the
    text instead.

    Parameters
    ----------
    filenames : iterable of str
        File names of the JSON-serialised texts to load.

    Returns
    -------
    list
        The preprocessed texts, in input order (skipped files omitted).
    """
    print("(!) Valmistan ette treenimistekste")

    start = time.time()
    training_texts = []
    for filename in filenames:
        # Check the skip list first so known-bad files are never opened.
        if filename in files_not_working:
            continue

        source_path = os.path.join('..', 'data', 'vallakohtufailid-json-flattened', filename)
        with open(source_path, 'r', encoding='UTF-8') as file:
            tagged_text = preprocess_text(json_to_text(file.read()))

        if use_vabamorfcorpustagger:
            tagged_text.pop_layer('morph_analysis')
            vm_corpus_tagger.tag([tagged_text])
        training_texts.append(tagged_text)
    print(f"(!) Treenimistekstid ette valmistatud {time.time() - start} sekundiga")
    return training_texts

In [13]:
def train_nertagger(training_texts, new_model_dir):
    """Train a NER model on `training_texts` and store it in `new_model_dir`.

    The trainer settings are loaded from `new_model_dir` via ModelStorageUtil,
    and training uses the 'gold_wordner' layer of the texts. Progress and the
    elapsed wall-clock time are printed.
    """
    print("(!) Treenin NerTaggerit")
    started_at = time.time()

    settings = ModelStorageUtil(new_model_dir).load_settings()
    NerTrainer(settings).train(training_texts, layer='gold_wordner', model_dir=new_model_dir)

    elapsed = time.time() - started_at
    print(f"(!) NerTagger treenitud {elapsed} sekundiga")

In [14]:
def tag_files(model_dir, testing_files, use_vabamorfcorpustagger):
    """Tag raw test texts with the model in `model_dir` and save them as JSON.

    For each name in `testing_files` the matching ``.txt`` file is located
    anywhere below ``../data/vallakohtufailid`` with ``find``, preprocessed,
    tagged with ``NerTagger(model_dir)``, given a flattened ``flat_ner`` layer
    and stripped of the intermediate layers. The result is written to
    ``<model_dir>/vallakohtufailid-trained-nertagger/<test_file>``.

    Parameters
    ----------
    model_dir : str
        Directory of the trained model; also the parent of the output folder.
    testing_files : iterable of str
        ``.json`` file names whose ``.txt`` counterparts are tagged.
    use_vabamorfcorpustagger : bool
        When true (or when ``"vabamorf"`` occurs in `model_dir`), the
        ``morph_analysis`` layer is replaced by the output of
        ``vm_corpus_tagger``.

    Raises
    ------
    FileNotFoundError
        If the raw ``.txt`` file for a test file cannot be found.
    """
    # Popped in this order after tagging; 'flat_ner' is the layer that remains.
    removed_layers = ['sentences', 'morph_analysis', 'compound_tokens', 'ner', 'words', 'tokens']
    nertagger = NerTagger(model_dir)

    raw_text_root = os.path.join('..', 'data', 'vallakohtufailid')
    # Loop-invariant, so created once up front; exist_ok avoids the
    # check-then-create race of os.path.exists + os.mkdir.
    output_dir = os.path.join(model_dir, 'vallakohtufailid-trained-nertagger')
    os.makedirs(output_dir, exist_ok=True)

    print("(!) Märgendan")
    start = time.time()
    for index, test_file in enumerate(testing_files, start=1):
        txt_name = test_file.replace(".json", ".txt")
        txt_path = find(txt_name, raw_text_root)
        if txt_path is None:
            # find() returns None on a miss; fail with a readable message
            # instead of the TypeError that open(None) would raise.
            raise FileNotFoundError(f"{txt_name} not found under {raw_text_root}")
        with open(txt_path, 'r', encoding='UTF-8') as f:
            text = f.read()

        # Special case for this one file: '..' is split into '. .' before
        # preprocessing.
        if test_file == "Tartu_V6nnu_Ahja_id3502_1882a.json":
            text = text.replace('..', '. .')
        text = preprocess_text(Text(text))

        if use_vabamorfcorpustagger or "vabamorf" in model_dir:
            text.pop_layer('morph_analysis')
            # The corpus tagger takes a list of texts.
            texts = [text]
            vm_corpus_tagger.tag(texts)
            text = texts[0]
        nertagger.tag(text)
        text.add_layer(flatten(text['ner'], 'flat_ner'))

        for layer_name in removed_layers:
            text.pop_layer(layer_name)

        text_to_json(text, file=os.path.join(output_dir, test_file))

        print(f'{index}. Märgendatud fail {test_file}')
    print(f"(!) Failid märgendatud {time.time() - start} sekundiga")


In [15]:
# Test corpus listing: file name -> subdistribution, parsed from lines of the
# form "<file>:<subdistribution>" in
# ../data/corpus_subdistribution_without_hand_tagged.txt.
test_files = {}

with open(os.path.join('..', 'data', 'corpus_subdistribution_without_hand_tagged.txt'), 'r', encoding = 'UTF-8') as f:
    txt = f.readlines()

for fileName in txt:
    # A blank (e.g. trailing) line carries no "<file>:<subdistribution>" pair
    # and would make the unpacking below raise ValueError.
    if not fileName.strip():
        continue
    file, subdistribution = fileName.split(":")
    test_files[file] = subdistribution.rstrip("\n")

In [34]:
def train_model(model_directory):
    """Train, apply and evaluate one NER model end to end.

    Steps, all driven by module-level state (``files``, ``test_files``,
    ``use_vabamorfcorpustagger``):

    1. build training texts from the keys of ``files``;
    2. train a model into ``models/<model_directory>``;
    3. tag the files named by the keys of ``test_files`` with that model;
    4. evaluate the tagged files with ``extract_results``.

    Parameters
    ----------
    model_directory : str
        Model directory relative to ``models``.

    Returns
    -------
    None
        The return value of ``extract_results`` is not propagated.
    """
    # Get the filenames to be trained on from the files dictionary
    filenames = list(files)

    # Create training_texts from the aforementioned filenames
    training_texts = create_training_texts(filenames)

    # Set up the trainer and training
    new_model_dir = os.path.join('models', model_directory)
    train_nertagger(training_texts, new_model_dir)

    # Tag the files using the newly trained model. tag_files builds its own
    # NerTagger from new_model_dir, so no tagger is constructed here.
    testing_files = list(test_files)
    tag_files(new_model_dir, testing_files, use_vabamorfcorpustagger)

    # Get results of model
    extract_results(model_directory, test_files)

    print(f"(!) Mudel {model_directory} treenitud")

In [38]:
# Train, tag and evaluate the model stored under
# models/model_morph_with_lemmas_and_sentences_and_gazetteer_and_global_features/model_gaz_loc_variants_best.
# NOTE(review): the output saved below this cell shows a returned tuple and none of
# the "(!)" progress messages, although train_model has no return statement and
# prints several of them -- the saved output looks stale; re-run to confirm.
train_model(os.path.join('model_morph_with_lemmas_and_sentences_and_gazetteer_and_global_features', 'model_gaz_loc_variants_best'))

Tulemuste ammutamine on lõpetatud.


({'ent_type': {'correct': 3894,
   'incorrect': 110,
   'partial': 0,
   'missed': 235,
   'spurious': 86,
   'possible': 4239,
   'actual': 4090,
   'precision': 0.952078239608802,
   'recall': 0.9186128803963199,
   'f1': 0.935046224036499},
  'partial': {'correct': 3833,
   'incorrect': 0,
   'partial': 171,
   'missed': 235,
   'spurious': 86,
   'possible': 4239,
   'actual': 4090,
   'precision': 0.9580684596577017,
   'recall': 0.9243925454116537,
   'f1': 0.9409292832272783},
  'strict': {'correct': 3782,
   'incorrect': 222,
   'partial': 0,
   'missed': 235,
   'spurious': 86,
   'possible': 4239,
   'actual': 4090,
   'precision': 0.9246943765281174,
   'recall': 0.8921915546119368,
   'f1': 0.9081522391643656},
  'exact': {'correct': 3833,
   'incorrect': 171,
   'partial': 0,
   'missed': 235,
   'spurious': 86,
   'possible': 4239,
   'actual': 4090,
   'precision': 0.9371638141809291,
   'recall': 0.9042226940316113,
   'f1': 0.9203986072757834}},
 {'ORG': {'ent_type': {

In [39]:
# Overall strict-match precision, recall and F1 of the model, recomputed from
# the counts stored in its results.txt.
df = {}
results_path = os.path.join(
    'models',
    'model_morph_with_lemmas_and_sentences_and_gazetteer_and_global_features',
    'model_gaz_loc_variants_best',
    'results.txt',
)
with open(results_path, 'r', encoding='UTF-8') as in_f:
    results_json = json.load(in_f)

# Element 0 holds the overall results; only the 'strict' counts are used.
strict_counts = results_json[0]['strict']
correct = strict_counts['correct']
actual = strict_counts['actual']
possible = strict_counts['possible']

precision = correct / actual
recall = correct / possible
f1 = 2 * ((precision * recall) / (precision + recall))

df['model_gaz_loc_variants'] = [precision, recall, f1]

In [5]:
# Load the evaluation results saved for model_gaz_loc_variants_best.
saved_results_path = os.path.join(
    'models',
    'model_morph_with_lemmas_and_sentences_and_gazetteer_and_global_features',
    'model_gaz_loc_variants_best',
    'results.txt',
)
with open(saved_results_path, 'r', encoding='UTF-8') as in_f:
    results_json = json.load(in_f)

In [17]:
# Per-entity-type strict precision / recall / F1, shown as a single 'Total' column.
display(pd.DataFrame(results_by_named_entity(results_json)))

Unnamed: 0,Total
LOC_ORG_f1score,0.737968
LOC_ORG_precision,0.786325
LOC_ORG_recall,0.695214
LOC_f1score,0.539924
LOC_precision,0.601695
LOC_recall,0.489655
MISC_f1score,0.626506
MISC_precision,0.684211
MISC_recall,0.577778
ORG_f1score,0.808511


In [40]:
# Overall strict scores: one column per key of `df`, rows labelled Precision / Recall / F1.
display(pd.DataFrame(df, index=['Precision', 'Recall', 'F1']))

Unnamed: 0,model_gaz_loc_variants
Precision,0.924694
Recall,0.892192
F1,0.908152
