In [1]:
import json
import time
import os
import re

from modules.preprocessing_protocols import preprocess_text
from modules.results_extraction import extract_results, \
                                       results_by_subdistribution, \
                                       results_by_named_entity, \
                                       confusion_matrix
from modules.tools import find

from estnltk import Text
from estnltk.taggers import NerTagger
from estnltk.converters import text_to_json
from estnltk.converters import json_to_text
from estnltk.layer_operations import flatten

from estnltk.taggers.estner.ner_trainer import NerTrainer
from estnltk.taggers.estner.model_storage_util import ModelStorageUtil

from estnltk.taggers import VabamorfCorpusTagger
# Shared VabamorfCorpusTagger instance. create_training_texts() and tag_files()
# call its tag() method on a text after popping that text's 'morph_analysis' layer.
vm_corpus_tagger = VabamorfCorpusTagger()

### Flags & variables:

In [2]:
# When True, create_training_texts() pops each text's 'morph_analysis' layer and
# runs vm_corpus_tagger on the text; train_model() forwards the same flag to tag_files().
use_vabamorfcorpustagger = False

In [2]:
# Every input used by this notebook lives under ../data.
data_dir = os.path.join('..', 'data')

# Listing that maps each file to its subdistribution.
divided_corpus = os.path.join(data_dir, 'divided_corpus.txt')
# Flattened JSON versions of the corpus files (read when building training texts).
json_files_location = os.path.join(data_dir, 'vallakohtufailid-json-flattened')
# Plain-text source files (read when tagging).
vallakohtufailid_location = os.path.join(data_dir, 'vallakohtufailid')
# List of files that must be skipped during training.
no_goldstandard_tags_location = os.path.join(data_dir, 'files_without_goldstandard_annotations.txt')
# Passed to extract_results(); currently the same directory as json_files_location.
testing_files_location = os.path.join(data_dir, 'vallakohtufailid-json-flattened')

# Layers popped from every tagged text before it is saved, in this order.
removed_layers = [
    'sentences',
    'morph_analysis',
    'compound_tokens',
    'ner',
    'words',
    'tokens',
]

---

Read the list of files that have no gold-standard annotations

In [5]:
# Load the listing of files to skip; every line of the listing becomes one entry.
with open(no_goldstandard_tags_location, 'r', encoding='UTF-8') as in_f:
    lines = list(in_f)

# Drop the trailing newline and any surrounding whitespace from each entry.
no_goldstandard_annotations = list(map(str.strip, lines))

Get all files for the first five subdistributions

In [3]:
# Maps each filename to its subdistribution label (kept as a string).
files = {}

with open(divided_corpus, 'r', encoding = 'UTF-8') as in_f:
    txt = in_f.readlines()

# Each line has the form "<filename>:<subdistribution>".
for filename in txt:
    # A blank line (e.g. a trailing empty line at the end of the listing)
    # cannot be split into two parts and would otherwise raise ValueError.
    if not filename.strip():
        continue
    file, subdistribution = filename.split(":")
    files[file] = subdistribution.strip()

Return the subdistribution used for testing and the list of subdistributions used for training
(e.g. test on `1` and train on `2`, `3`, `4`, `5`; or test on `2` and train on `1`, `3`, `4`, `5`)

In [6]:
def get_testing_and_training_subdistribution(subdistribution):
    """Split the known subdistributions into one test fold and the training folds.

    Reads the module-level ``files`` mapping (filename -> subdistribution label)
    to find out which subdistributions exist.

    Args:
        subdistribution: Number of the subdistribution to hold out for testing.

    Returns:
        A ``(testing, training)`` tuple. ``testing`` is the held-out
        subdistribution as an int; ``training`` is the list of all other
        subdistributions as ints, in ascending numeric order.

    Raises:
        ValueError: If ``subdistribution`` is not one of the labels in ``files``.
    """
    # Convert before sorting so that labels order numerically ('10' after '2').
    labels = sorted({int(label) for label in files.values()})

    # Without this check an unknown subdistribution would leave `testing`
    # unassigned and fail with an opaque UnboundLocalError.
    if subdistribution not in labels:
        raise ValueError(
            f"Subdistribution {subdistribution!r} not found; available: {labels}"
        )

    testing = labels[labels.index(subdistribution)]
    training = [label for label in labels if label != subdistribution]
    return testing, training

Create a list of Text objects from the files read in before (the subdistributions meant for training)

In [7]:
def create_training_texts(filenames):
    """Build the list of preprocessed texts used for training.

    Files listed in the module-level ``no_goldstandard_annotations`` are skipped.
    Every other file is read from ``json_files_location``, converted with
    ``json_to_text`` and passed through ``preprocess_text``. If the module-level
    ``use_vabamorfcorpustagger`` flag is set, the text's 'morph_analysis' layer
    is popped and ``vm_corpus_tagger`` is applied to the text.

    Args:
        filenames: Iterable of file names relative to ``json_files_location``.

    Returns:
        List of the resulting text objects, in the order of ``filenames``.
    """
    print('(!) Preparing training texts')

    training_texts = []
    annotated = (name for name in filenames if name not in no_goldstandard_annotations)

    for name in annotated:
        json_path = os.path.join(json_files_location, name)
        with open(json_path, 'r', encoding='UTF-8') as in_f:
            raw_json = in_f.read()
        tagged_text = preprocess_text(json_to_text(raw_json))

        if use_vabamorfcorpustagger:
            # Discard the existing analysis before the corpus tagger runs.
            tagged_text.pop_layer('morph_analysis')
            vm_corpus_tagger.tag([tagged_text])

        training_texts.append(tagged_text)

    print('(!) Training texts done')
    return training_texts

Train the NerTagger model using settings from the model directory

In [8]:
def train_nertagger(training_texts, new_model_dir):
    """Train a NER model on ``training_texts`` using the settings of ``new_model_dir``.

    Settings are loaded through ``ModelStorageUtil(new_model_dir).load_settings()``
    and handed to ``NerTrainer``; training uses the 'gold_wordner' layer and is
    given ``new_model_dir`` as its ``model_dir``.

    Args:
        training_texts: Texts to train on (as produced by create_training_texts()).
        new_model_dir: Model directory holding the settings; also passed to the
            trainer as ``model_dir``.
    """
    print('(!) Training NerTagger')

    settings = ModelStorageUtil(new_model_dir).load_settings()
    NerTrainer(settings).train(training_texts, layer='gold_wordner', model_dir=new_model_dir)
    print('(!) NerTagger training done\n')

Tag the test files: locate each file's source text under `vallakohtufailid_location`,
preprocess it, tag it with the trained NerTagger, remove the layers listed in `removed_layers`
to keep the output files small, and save the result to a new directory inside the model folder.

In [9]:
def tag_files(model_dir, testing_files, use_vabamorfcorpustagger):
    """Tag the given test files with the model in ``model_dir`` and save them as JSON.

    For each file the matching ``.txt`` source is located with ``find`` under
    ``vallakohtufailid_location``, preprocessed, NER-tagged, given a flattened
    'flat_ner' layer, stripped of the layers in ``removed_layers`` and written to
    ``<model_dir>/vallakohtufailid-trained-nertagger/<test_file>``.

    Args:
        model_dir: Directory of the trained model; also the parent of the output
            directory.
        testing_files: Iterable of ``.json`` file names to tag.
        use_vabamorfcorpustagger: If true (or if ``model_dir`` contains the
            substring "vabamorf"), the 'morph_analysis' layer is popped and
            ``vm_corpus_tagger`` is applied before NER tagging.
    """
    nertagger = NerTagger(model_dir)

    print("(!) Tagging...")
    for position, test_file in enumerate(testing_files, start=1):
        source_path = find(test_file.replace(".json", ".txt"), vallakohtufailid_location)
        with open(source_path, 'r', encoding='UTF-8') as f:
            raw = f.read()

        # Special case for this one file: separate consecutive dots.
        # NOTE(review): presumably a tokenisation workaround; confirm the reason.
        if test_file == "Tartu_V6nnu_Ahja_id3502_1882a.json":
            raw = raw.replace('..', '. .')
        text = preprocess_text(Text(raw))

        if use_vabamorfcorpustagger or "vabamorf" in model_dir:
            text.pop_layer('morph_analysis')
            # The corpus tagger takes a list of texts, so wrap and unwrap.
            batch = [text]
            vm_corpus_tagger.tag(batch)
            text = batch[0]
        nertagger.tag(text)
        text.add_layer(flatten(text['ner'], 'flat_ner'))

        # Pop in the configured order.
        for layer_name in removed_layers:
            text.pop_layer(layer_name)

        output_dir = os.path.join(model_dir, 'vallakohtufailid-trained-nertagger')
        if not os.path.exists(output_dir):
            os.mkdir(output_dir)

        text_to_json(text, file=os.path.join(output_dir, test_file))

        print(f'{position}. Tagged file {test_file}')
    print('(!) Files tagged')


In [8]:
# Train one model configuration: train and tag once per subdistribution, then extract results.
def train_model(model_directory):
    """Train, tag and evaluate the model stored under ``models/<model_directory>``.

    Each subdistribution found in the module-level ``files`` mapping is held out
    in turn: the model is trained on the files of all other subdistributions and
    the held-out files are then tagged with it. After all rounds,
    ``extract_results`` is called once for the whole model.

    Every round trains into the same ``models/<model_directory>`` directory.
    NOTE(review): presumably each round replaces the previous round's model
    there; confirm against NerTrainer.

    Args:
        model_directory: Path of the model directory relative to ``models``.
    """
    # Loop-invariant: the same directory is used for training, tagging and results.
    new_model_dir = os.path.join('models', model_directory)

    for subdistribution in sorted(set(files.values())):
        testing, training = get_testing_and_training_subdistribution(int(subdistribution))

        # Files belonging to the training subdistributions
        filenames = [key for key, value in files.items() if int(value) in training]

        # Build the training texts and train the model on them
        training_texts = create_training_texts(filenames)
        train_nertagger(training_texts, new_model_dir)

        # Tag the held-out files. tag_files() constructs its own NerTagger from
        # new_model_dir, so no tagger is created here.
        testing_files = [key for key, value in files.items() if int(value) == testing]
        tag_files(new_model_dir, testing_files, use_vabamorfcorpustagger)

    # Get results of model
    extract_results(model_directory, #model directory path
                    files,
                    no_goldstandard_annotations,
                    os.path.join(new_model_dir, 'vallakohtufailid-trained-nertagger'), #training files location
                    testing_files_location,
                    new_model_dir) #results.txt location

    print(f"(!) Model {model_directory} trained")

NB! Make sure the values (location of the training files and location of the `results.txt` file) in the `extract_results()` function are correct as these cannot be referenced before the model directory is defined.

To train the model, the `model_directory` (given to the `train_model()` function) must contain a `settings.py` file.

In [None]:
# Parent directory shared by the gazetteer + global-features model variants.
global_features_root = 'model_morph_with_lemmas_and_sentences_and_gazetteer_and_global_features'
global_features_variants = [
    'model_initial',
    'model_vabamorf_gazetteer',
    'model_vabamorf_gazetteer2',
    'model_vabamorf_gazetteer1and2',
    'model_gaz_loc',
    'model_gaz_loc_variants',
]

# Model directories (relative to 'models') to train, in this order.
# Names are used as directory paths by train_model(), so their spelling
# (including 'gazzetteer') is kept exactly as is.
models = [
    'model_default_with_vabamorftagger',
    'model_local_features_without_morph',
    'model_morph_without_lemmas',
    'model_morph_with_lemmas',
    'model_morph_with_lemmas_and_sentences',
    'model_morph_with_lemmas_and_sentences_and_gazzetteer',
] + [os.path.join(global_features_root, variant) for variant in global_features_variants]

for model in models:
    train_model(model)