In [1]:
import json
import time
import os
import re

from modules.preprocessing_protocols import preprocess_text
from modules.results_extraction import extract_results, \
                                       results_by_subdistribution, \
                                       results_by_named_entity, \
                                       confusion_matrix
from modules.tools import find

from estnltk import Text
from estnltk.taggers import NerTagger
from estnltk.taggers import WordLevelNerTagger
from estnltk.converters import text_to_json
from estnltk.converters import json_to_text
from estnltk.layer_operations import flatten

from estnltk.taggers.estner.ner_trainer import NerTrainer
from estnltk.taggers.estner.model_storage_util import ModelStorageUtil

from estnltk.taggers import VabamorfCorpusTagger
# Single shared VabamorfCorpusTagger instance; create_training_texts() and
# tag_files() call its tag() method whenever Vabamorf corpus tagging is enabled.
vm_corpus_tagger = VabamorfCorpusTagger()

### Flags & variables:

In [2]:
# When True, create_training_texts() pops each text's 'morph_analysis' layer and
# re-tags the text with vm_corpus_tagger; train_model() also forwards this flag
# to tag_files().
use_vabamorfcorpustagger = False

In [3]:
# Every input location below sits in the same '../data' directory
# (relative to the current working directory).
_data_dir = os.path.join('..', 'data')

divided_corpus = os.path.join(_data_dir, 'divided_corpus.txt')
vallakohtufailid_location = os.path.join(_data_dir, 'vallakohtufailid')
no_goldstandard_tags_location = os.path.join(_data_dir, 'files_without_goldstandard_annotations.txt')

# Both names point at the same directory of flattened JSON files.
json_files_location = os.path.join(_data_dir, 'vallakohtufailid-json-flattened')
testing_files_location = json_files_location

# Layers popped from every tagged text in tag_files() before it is saved.
removed_layers = ['sentences', 'morph_analysis', 'compound_tokens', 'ner', 'words', 'tokens']

---

Read the list of files that have no gold-standard annotations

In [4]:
# Load the names of the files that lack gold-standard annotations: one entry
# per line of the listing file, with surrounding whitespace stripped.
with open(no_goldstandard_tags_location, mode='r', encoding='UTF-8') as in_f:
    lines = list(in_f)

no_goldstandard_annotations = list(map(str.strip, lines))

Get all files for the first five subdistributions

In [5]:
# Map every corpus file name to its subdistribution label (kept as a string).
# Each non-blank line of the divided-corpus file is expected to have the form
# "<filename>:<subdistribution>".
files = {}

with open(divided_corpus, 'r', encoding='UTF-8') as in_f:
    txt = in_f.readlines()

for line_number, filename in enumerate(txt, start=1):
    # A blank line (e.g. an extra newline at the end of the file) carries no
    # entry; without this guard it would crash the unpacking below.
    if not filename.strip():
        continue

    # Split on the LAST colon only, so a colon inside the file name cannot
    # produce more than two fields.
    file, separator, subdistribution = filename.rpartition(":")
    if not separator:
        raise ValueError(
            f"{divided_corpus}, line {line_number}: "
            f"expected 'filename:subdistribution', got {filename!r}"
        )
    files[file] = subdistribution.strip()

Return the subdistribution used for testing and the subdistributions used for training
(e.g. test on `1` and train on `2`, `3`, `4`, `5`; or test on `2` and train on `1`, `3`, `4`, `5`)

In [6]:
def get_testing_and_training_subdistribution(subdistribution, file_map=None):
    """Split the known subdistributions into one test fold and the training folds.

    Parameters
    ----------
    subdistribution : int
        Number of the subdistribution to hold out for testing.
    file_map : dict, optional
        Mapping of file name -> subdistribution label, where every label is
        convertible with ``int``. Defaults to the notebook-level ``files``
        dictionary.

    Returns
    -------
    tuple
        ``(testing, training)``: ``testing`` is the held-out subdistribution
        as an ``int``; ``training`` is a list of all other subdistributions as
        ``int``, in the sorted order of the original labels.

    Raises
    ------
    ValueError
        If ``subdistribution`` does not occur among the labels. Without this
        check the function would fail with an UnboundLocalError on ``testing``.
    """
    if file_map is None:
        file_map = files

    testing = None
    training = []
    # Labels are sorted in their stored form before conversion, which keeps
    # the order of the returned training list unchanged for existing callers.
    for label in sorted(set(file_map.values())):
        number = int(label)
        if number == subdistribution:
            testing = number
        else:
            training.append(number)

    if testing is None:
        raise ValueError(
            f"subdistribution {subdistribution!r} not found among {sorted(training)}"
        )
    return testing, training

Create a list of Text objects from the files read in before (the subdistributions meant for training)

In [7]:
def create_training_texts(filenames):
    """Load and preprocess the JSON files that will be used for training.

    Every name found in ``no_goldstandard_annotations`` is skipped. Each
    remaining file is read from ``json_files_location``, converted with
    ``json_to_text`` and passed through ``preprocess_text``. When the
    notebook-level ``use_vabamorfcorpustagger`` flag is set, the
    ``morph_analysis`` layer is popped and the text is re-tagged with
    ``vm_corpus_tagger``.

    Returns the prepared texts as a list, in the order of ``filenames``.
    """
    print('(!) Preparing training texts')

    prepared = []
    for name in filenames:
        if name not in no_goldstandard_annotations:
            json_path = os.path.join(json_files_location, name)
            with open(json_path, 'r', encoding='UTF-8') as in_f:
                raw_json = in_f.read()
            tagged_text = preprocess_text(json_to_text(raw_json))

            if use_vabamorfcorpustagger:
                tagged_text.pop_layer('morph_analysis')
                vm_corpus_tagger.tag([tagged_text])

            prepared.append(tagged_text)

    print('(!) Training texts done')
    return prepared

Train the NerTagger model using settings from the model directory

In [8]:
def train_nertagger(training_texts, new_model_dir):
    """Train a NER model on ``training_texts`` and store it in ``new_model_dir``.

    The trainer settings are loaded from ``new_model_dir`` through
    ``ModelStorageUtil``; training uses the ``gold_wordner`` layer of the
    given texts and is told to write to the same directory.
    """
    print('(!) Training NerTagger')

    settings = ModelStorageUtil(new_model_dir).load_settings()
    NerTrainer(settings).train(training_texts, layer='gold_wordner', model_dir=new_model_dir)

    print('(!) NerTagger training done\n')

Tag the test files: find the matching text file in `vallakohtufailid_location`,
preprocess and tag it, remove layers to keep the output files small, and
save the result to a new directory inside the model folder.

In [9]:
def tag_files(model_dir, testing_files, use_vabamorfcorpustagger, tag_wordner):
    """Tag the test files with the model in ``model_dir`` and save them as JSON.

    Parameters
    ----------
    model_dir : str
        Directory of the trained model. Results are written to its
        ``vallakohtufailid-trained-nertagger`` subdirectory.
    testing_files : iterable of str
        Names of the ``.json`` test files. The text of each one is read from
        the ``.txt`` file of the same name located with ``find`` under
        ``vallakohtufailid_location``.
    use_vabamorfcorpustagger : bool
        If true (or if ``model_dir`` contains ``"vabamorf"``), the
        ``morph_analysis`` layer is popped and the text is re-tagged with
        ``vm_corpus_tagger`` before NER tagging.
    tag_wordner : bool
        If true, additionally tag with ``WordLevelNerTagger`` and add a
        ``flat_wordner`` layer.
    """
    nertagger = NerTagger(model_dir)
    # Loop-invariant: build the word-level tagger once rather than once per file.
    wordnertagger = WordLevelNerTagger(model_dir) if tag_wordner else None

    # The output directory is the same for every file, so prepare it up front.
    output_dir = os.path.join(model_dir, 'vallakohtufailid-trained-nertagger')
    os.makedirs(output_dir, exist_ok=True)

    retag_morph = use_vabamorfcorpustagger or "vabamorf" in model_dir

    print("(!) Tagging...")
    for iterator, test_file in enumerate(testing_files, start=1):
        # NOTE(review): assumes find() always returns a usable path - confirm
        # how it behaves when the file is missing.
        source_path = find(test_file.replace(".json", ".txt"), vallakohtufailid_location)
        with open(source_path, 'r', encoding='UTF-8') as f:
            text = f.read()

        # File-specific fix-up kept from the original code; presumably works
        # around a preprocessing problem with '..' in this file - TODO confirm.
        if test_file == "Tartu_V6nnu_Ahja_id3502_1882a.json":
            text = text.replace('..', '. .')
        text = preprocess_text(Text(text))

        if retag_morph:
            text.pop_layer('morph_analysis')
            # The corpus tagger is given a list; read the text back from it afterwards.
            batch = [text]
            vm_corpus_tagger.tag(batch)
            text = batch[0]
        nertagger.tag(text)
        text.add_layer(flatten(text['ner'], 'flat_ner'))

        if tag_wordner:
            print('(!) Tagging Word Level NER')
            wordnertagger.tag(text)
            text.add_layer(flatten(text['wordner'], 'flat_wordner'))

        # Drop the layers listed in removed_layers before saving.
        for layer_name in removed_layers:
            text.pop_layer(layer_name)

        text_to_json(text, file=os.path.join(output_dir, test_file))

        print(f'{iterator}. Tagged file {test_file}')
    print('(!) Files tagged')


In [10]:
def train_model(model_directory, tag_wordner):
    """Train, tag and evaluate one model configuration over every subdistribution.

    For each subdistribution found in ``files``: the files of all other
    subdistributions are prepared as training texts, a model is trained into
    ``models/<model_directory>``, and the files of the held-out subdistribution
    are tagged with it. After the last round ``extract_results`` is called on
    the tagged files.

    Parameters
    ----------
    model_directory : str
        Model directory relative to ``models``; settings are loaded from it
        and the trained model is written to it.
    tag_wordner : bool
        Forwarded to ``tag_files``; enables word-level NER tagging.
    """
    # Both paths are the same for every round, so build them once.
    new_model_dir = os.path.join('models', model_directory)
    tagged_files_dir = os.path.join(new_model_dir, 'vallakohtufailid-trained-nertagger')

    for subdistribution in sorted(set(files.values())):
        testing, training = get_testing_and_training_subdistribution(int(subdistribution))

        # Get the filenames to be trained on from the files dictionary
        filenames = [key for key, value in files.items() if int(value) in training]

        # Create training_texts from the aforementioned filenames
        training_texts = create_training_texts(filenames)

        # Train; each round writes its model into the same directory.
        train_nertagger(training_texts, new_model_dir)

        # Tag the held-out files. tag_files() loads the freshly trained model
        # itself, so no separate NerTagger instance is created here.
        testing_files = [key for key, value in files.items() if int(value) == testing]
        tag_files(new_model_dir, testing_files, use_vabamorfcorpustagger, tag_wordner)

    # Get results of model
    extract_results(files,
                    no_goldstandard_annotations,
                    tagged_files_dir,  # training files location
                    testing_files_location,
                    new_model_dir)  # results.txt location

    print(f"(!) Model {model_directory} trained")

NB! Make sure the values (location of the training files and location of the `results.txt` file) in the `extract_results()` function are correct as these cannot be referenced before the model directory is defined.

To train the model the `model_directory` (given to the `train_model()` function) must contain a `settings.py` file

In [None]:
# Model directories (relative to 'models') to train and evaluate one after another.
global_features_dir = 'model_morph_with_lemmas_and_sentences_and_gazetteer_and_global_features'

# NOTE(review): 'gazzetteer' (double z) in the last entry differs from the
# 'gazetteer' spelling of global_features_dir - confirm it matches the
# directory name on disk.
models = [
    'model_default_with_vabamorftagger',
    'model_local_features_without_morph',
    'model_morph_without_lemmas',
    'model_morph_with_lemmas',
    'model_morph_with_lemmas_and_sentences',
    'model_morph_with_lemmas_and_sentences_and_gazzetteer',
]

# Variants that all live inside the shared global-features directory.
models += [
    os.path.join(global_features_dir, variant)
    for variant in (
        'model_initial',
        'model_vabamorf_gazetteer',
        'model_vabamorf_gazetteer2',
        'model_vabamorf_gazetteer1and2',
        'model_gaz_loc',
        'model_gaz_loc_variants',
    )
]

for model in models:
    train_model(model, tag_wordner=False)

In [11]:
# Train the 'model_gaz_loc_variants' configuration again, this time with
# word-level NER tagging switched on.
path = os.path.join(
    'model_morph_with_lemmas_and_sentences_and_gazetteer_and_global_features',
    'model_gaz_loc_variants',
)

train_model(model_directory=path, tag_wordner=True)

(!) Preparing training texts
(!) Training texts done
(!) Training NerTagger
Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 347546
Seconds required: 2.086

Stochastic Gradient Descent (SGD)
c2: 0.001000
max_iterations: 1000
period: 10
delta: 0.000001

Calibrating the learning rate (eta)
calibration.eta: 0.100000
calibration.rate: 2.000000
calibration.samples: 1000
calibration.candidates: 10
calibration.max_trials: 20
Initial loss: 30474.851022
Trial #1 (eta = 0.100000): 2509.798853
Trial #2 (eta = 0.200000): 3571.168525
Trial #3 (eta = 0.400000): 6982.558767
Trial #4 (eta = 0.800000): 13567.187870
Trial #5 (eta = 1.600000): 33517.948069 (worse)
Trial #6 (eta = 0.050000): 2188.020241
Trial #7 (eta = 0.025000): 2373.892858
Trial #8 (eta = 0.012500): 2796.282357
Trial #9 (eta = 0.006250): 3388.747438
Trial #10 (eta = 0.003125): 4204.634551
Trial #11 

***** Epoch #37 *****
Loss: 451.173951
Improvement ratio: 0.306801
Feature L2-norm: 104.994102
Learning rate (eta): 0.049816
Total number of feature updates: 600991
Seconds required for this iteration: 0.336

***** Epoch #38 *****
Loss: 436.505935
Improvement ratio: 0.286233
Feature L2-norm: 105.614194
Learning rate (eta): 0.049811
Total number of feature updates: 617234
Seconds required for this iteration: 0.326

***** Epoch #39 *****
Loss: 427.891405
Improvement ratio: 0.290678
Feature L2-norm: 106.222061
Learning rate (eta): 0.049806
Total number of feature updates: 633477
Seconds required for this iteration: 0.340

***** Epoch #40 *****
Loss: 413.717372
Improvement ratio: 0.322163
Feature L2-norm: 106.824378
Learning rate (eta): 0.049801
Total number of feature updates: 649720
Seconds required for this iteration: 0.346

***** Epoch #41 *****
Loss: 416.581246
Improvement ratio: 0.207907
Feature L2-norm: 107.410644
Learning rate (eta): 0.049796
Total number of feature updates: 665963

***** Epoch #79 *****
Loss: 248.877650
Improvement ratio: 0.118087
Feature L2-norm: 122.925089
Learning rate (eta): 0.049608
Total number of feature updates: 1283197
Seconds required for this iteration: 0.317

***** Epoch #80 *****
Loss: 251.542934
Improvement ratio: 0.084226
Feature L2-norm: 123.221516
Learning rate (eta): 0.049603
Total number of feature updates: 1299440
Seconds required for this iteration: 0.315

***** Epoch #81 *****
Loss: 235.989812
Improvement ratio: 0.067638
Feature L2-norm: 123.517995
Learning rate (eta): 0.049598
Total number of feature updates: 1315683
Seconds required for this iteration: 0.358

***** Epoch #82 *****
Loss: 244.563722
Improvement ratio: 0.177566
Feature L2-norm: 123.812238
Learning rate (eta): 0.049593
Total number of feature updates: 1331926
Seconds required for this iteration: 0.645

***** Epoch #83 *****
Loss: 243.362781
Improvement ratio: 0.098897
Feature L2-norm: 124.102649
Learning rate (eta): 0.049588
Total number of feature updates: 13

(!) Tagging Word Level NER
27. Tagged file L22ne_Emmaste_Emmaste_id15087_1895a.json
(!) Tagging Word Level NER
28. Tagged file V6ru_Vastseliina_Misso_id19866_1882a.json
(!) Tagging Word Level NER
29. Tagged file J2rva_Tyri_Kirna_id24452_1880a.json
(!) Tagging Word Level NER
30. Tagged file Tartu_V6nnu_Ahja_id12638_1875a.json
(!) Tagging Word Level NER
31. Tagged file Tartu_Kodavere_Pala_id16229_1849a.json
(!) Tagging Word Level NER
32. Tagged file Harju_Rapla_Rapla_id365_1873a.json
(!) Tagging Word Level NER
33. Tagged file Viljandi_P6ltsamaa_Adavere_id20278_1890a.json
(!) Tagging Word Level NER
34. Tagged file Viljandi_Viljandi_Karula_id19366_1868a.json
(!) Tagging Word Level NER
35. Tagged file Tartu_Kodavere_Pala_id22898_1872a.json
(!) Tagging Word Level NER
36. Tagged file P2rnu_Audru_V6lla_id5904_1878a.json
(!) Tagging Word Level NER
37. Tagged file Harju_J6el2htme_J6el2htme_id7612_1869a.json
(!) Tagging Word Level NER
38. Tagged file J2rva_Peetri_V2ike-Kareda_id19169_1869a.json
(

126. Tagged file Tartu_V6nnu_Ahja_id18630_1886a.json
(!) Tagging Word Level NER
127. Tagged file V6ru_Vastseliina_Misso_id13714_1886a.json
(!) Tagging Word Level NER
128. Tagged file V6ru_R2pina_Kahkva_id24938_1868a.json
(!) Tagging Word Level NER
129. Tagged file J2rva_Tyri_S2revere_id12762_1879a.json
(!) Tagging Word Level NER
130. Tagged file Tartu_Kodavere_Alatskivi_id21934_1881a.json
(!) Tagging Word Level NER
131. Tagged file V6ru_Vastseliina_Misso_id7468_1885a.json
(!) Tagging Word Level NER
132. Tagged file Saare_K2rla_K2rla_id5736_1827a.json
(!) Tagging Word Level NER
133. Tagged file Tartu_Sangaste_Kuigatsi_id16414_1872a.json
(!) Tagging Word Level NER
134. Tagged file Harju_Kose_Kose-Uuem6isa_id2174_1867a.json
(!) Tagging Word Level NER
135. Tagged file Viljandi_P6ltsamaa_Pajusi_id2717_1871a.json
(!) Tagging Word Level NER
136. Tagged file Viljandi_P6ltsamaa_Vana-P6ltsamaa_id8104_1888a.json
(!) Tagging Word Level NER
137. Tagged file Viljandi_K6pu_Suure-K6pu_id7189_1884a.jso

224. Tagged file Tartu_V6nnu_Ahja_id20664_1889a.json
(!) Tagging Word Level NER
225. Tagged file Harju_Rapla_Rapla_id15629_1867a.json
(!) Tagging Word Level NER
226. Tagged file Tartu_V6nnu_Ahja_id16115_1883a.json
(!) Tagging Word Level NER
227. Tagged file V6ru_Urvaste_Vaabina_id798_1876a.json
(!) Tagging Word Level NER
228. Tagged file V6ru_R2pina_Kahkva_id7319_1888a.json
(!) Tagging Word Level NER
229. Tagged file Tartu_Kodavere_Pala_id22626_1871a.json
(!) Tagging Word Level NER
230. Tagged file Harju_Kose_Palvere_id15360_1880a.json
(!) Tagging Word Level NER
231. Tagged file Tartu_Kodavere_Alatskivi_id6838_1879a.json
(!) Tagging Word Level NER
232. Tagged file Viljandi_P6ltsamaa_Pajusi_id2106_1870a.json
(!) Tagging Word Level NER
233. Tagged file V6ru_R6uge_Saaluse_id9979_1879a.json
(!) Tagging Word Level NER
234. Tagged file Tartu_V6nnu_Ahja_id23400_1893a.json
(!) Tagging Word Level NER
235. Tagged file J2rva_Anna_Purdi_id18906_1870a.json
(!) Tagging Word Level NER
236. Tagged fil

***** Epoch #26 *****
Loss: 643.418905
Improvement ratio: 0.586129
Feature L2-norm: 96.868894
Learning rate (eta): 0.049870
Total number of feature updates: 421226
Seconds required for this iteration: 0.335

***** Epoch #27 *****
Loss: 613.490752
Improvement ratio: 0.587452
Feature L2-norm: 97.755125
Learning rate (eta): 0.049865
Total number of feature updates: 437427
Seconds required for this iteration: 0.346

***** Epoch #28 *****
Loss: 594.890996
Improvement ratio: 0.510492
Feature L2-norm: 98.615812
Learning rate (eta): 0.049860
Total number of feature updates: 453628
Seconds required for this iteration: 0.350

***** Epoch #29 *****
Loss: 583.003786
Improvement ratio: 0.496444
Feature L2-norm: 99.439642
Learning rate (eta): 0.049855
Total number of feature updates: 469829
Seconds required for this iteration: 0.333

***** Epoch #30 *****
Loss: 551.861776
Improvement ratio: 0.481745
Feature L2-norm: 100.238830
Learning rate (eta): 0.049850
Total number of feature updates: 486030
Sec

***** Epoch #66 *****
Loss: 281.598319
Improvement ratio: 0.259069
Feature L2-norm: 119.019864
Learning rate (eta): 0.049672
Total number of feature updates: 1069266
Seconds required for this iteration: 0.352

***** Epoch #67 *****
Loss: 307.675712
Improvement ratio: 0.088739
Feature L2-norm: 119.380901
Learning rate (eta): 0.049667
Total number of feature updates: 1085467
Seconds required for this iteration: 0.365

***** Epoch #68 *****
Loss: 293.714445
Improvement ratio: 0.134223
Feature L2-norm: 119.733183
Learning rate (eta): 0.049662
Total number of feature updates: 1101668
Seconds required for this iteration: 0.355

***** Epoch #69 *****
Loss: 289.918656
Improvement ratio: 0.125724
Feature L2-norm: 120.078536
Learning rate (eta): 0.049657
Total number of feature updates: 1117869
Seconds required for this iteration: 0.334

***** Epoch #70 *****
Loss: 282.575739
Improvement ratio: 0.065642
Feature L2-norm: 120.425327
Learning rate (eta): 0.049652
Total number of feature updates: 11

(!) Tagging Word Level NER
42. Tagged file L22ne_Kullamaa_Sooniste_id3541_1880a.json
(!) Tagging Word Level NER
43. Tagged file J2rva_Tyri_Tyri-Alliku_id2315_1897a.json
(!) Tagging Word Level NER
44. Tagged file J2rva_Tyri_S2revere_id11683_1874a.json
(!) Tagging Word Level NER
45. Tagged file Saare_Kaarma_Loona_id7575_1899a.json
(!) Tagging Word Level NER
46. Tagged file V6ru_P6lva_K2hri_id21590_1851a.json
(!) Tagging Word Level NER
47. Tagged file Tartu_V6nnu_Ahja_id16351_1884a.json
(!) Tagging Word Level NER
48. Tagged file Tartu_V6nnu_Ahja_id11361_1872a.json
(!) Tagging Word Level NER
49. Tagged file Tartu_V6nnu_Ahja_id16121_1883a.json
(!) Tagging Word Level NER
50. Tagged file Tartu_V6nnu_Ahja_id21444_1866a.json
(!) Tagging Word Level NER
51. Tagged file J2rva_Tyri_S2revere_id14702_1887a.json
(!) Tagging Word Level NER
52. Tagged file L22ne_Martna_Martna_id12705_1885a.json
(!) Tagging Word Level NER
53. Tagged file Tartu_Torma_Avinurme_id6291_1861a.json
(!) Tagging Word Level NER
5

(!) Tagging Word Level NER
142. Tagged file Tartu_Otep22_Pyhaj2rve_id6032_1885a.json
(!) Tagging Word Level NER
143. Tagged file Tartu_V6nnu_Ahja_id12252_1873a.json
(!) Tagging Word Level NER
144. Tagged file Tartu_Laiuse_Kivij2rve_id1751_1860a.json
(!) Tagging Word Level NER
145. Tagged file Tartu_Kodavere_Pala_id15778_1842a.json
(!) Tagging Word Level NER
146. Tagged file Tartu_V6nnu_Ahja_id9743_1871a.json
(!) Tagging Word Level NER
147. Tagged file Tartu_V6nnu_Ahja_id11995_1873a.json
(!) Tagging Word Level NER
148. Tagged file Harju_J6el2htme_J6el2htme_id8161_1888a.json
(!) Tagging Word Level NER
149. Tagged file Tartu_Torma_Avinurme_id2545_1868a.json
(!) Tagging Word Level NER
150. Tagged file J2rva_Tyri_S2revere_id7477_1884a.json
(!) Tagging Word Level NER
151. Tagged file Tartu_Laiuse_Kivij2rve_id7913_1865a.json
(!) Tagging Word Level NER
152. Tagged file Viljandi_K6pu_Suure-K6pu_id6432_1884a.json
(!) Tagging Word Level NER
153. Tagged file Saare_Kaarma_Loona_id7805_1910a.json
(!

(!) Tagging Word Level NER
241. Tagged file J2rva_Tyri_Kirna_id23286_1872a.json
(!) Tagging Word Level NER
242. Tagged file Tartu_V6nnu_Ahja_id19466_1888a.json
(!) Tagging Word Level NER
243. Tagged file J2rva_Tyri_V22tsa_id18472_1889a.json
(!) Tagging Word Level NER
244. Tagged file Saare_Kihelkonna_Kotlandi_id15249_1860a.json
(!) Tagging Word Level NER
245. Tagged file Tartu_V6nnu_Ahja_id11346_1872a.json
(!) Tagging Word Level NER
246. Tagged file Harju_Rapla_Rapla_id22195_1871a.json
(!) Tagging Word Level NER
247. Tagged file Tartu_V6nnu_Ahja_id13021_1876a.json
(!) Tagging Word Level NER
248. Tagged file Harju_Jyri_Rae_id268_1874a.json
(!) Tagging Word Level NER
249. Tagged file Tartu_Kodavere_Alatskivi_id5700_1879a.json
(!) Tagging Word Level NER
250. Tagged file Tartu_Torma_Avinurme_id3955_1858a.json
(!) Files tagged
(!) Preparing training texts
(!) Training texts done
(!) Training NerTagger
Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
featur

***** Epoch #33 *****
Loss: 498.692489
Improvement ratio: 0.435111
Feature L2-norm: 102.647611
Learning rate (eta): 0.049836
Total number of feature updates: 533973
Seconds required for this iteration: 0.359

***** Epoch #34 *****
Loss: 493.195181
Improvement ratio: 0.358705
Feature L2-norm: 103.349276
Learning rate (eta): 0.049831
Total number of feature updates: 550154
Seconds required for this iteration: 0.367

***** Epoch #35 *****
Loss: 476.446423
Improvement ratio: 0.387052
Feature L2-norm: 104.037330
Learning rate (eta): 0.049826
Total number of feature updates: 566335
Seconds required for this iteration: 0.353

***** Epoch #36 *****
Loss: 465.025482
Improvement ratio: 0.376104
Feature L2-norm: 104.699110
Learning rate (eta): 0.049821
Total number of feature updates: 582516
Seconds required for this iteration: 0.352

***** Epoch #37 *****
Loss: 446.144079
Improvement ratio: 0.350464
Feature L2-norm: 105.354092
Learning rate (eta): 0.049816
Total number of feature updates: 598697

***** Epoch #76 *****
Loss: 270.065901
Improvement ratio: 0.032721
Feature L2-norm: 122.434944
Learning rate (eta): 0.049623
Total number of feature updates: 1229756
Seconds required for this iteration: 0.359

***** Epoch #77 *****
Loss: 264.016780
Improvement ratio: 0.136070
Feature L2-norm: 122.742156
Learning rate (eta): 0.049618
Total number of feature updates: 1245937
Seconds required for this iteration: 0.352

***** Epoch #78 *****
Loss: 255.522731
Improvement ratio: 0.098851
Feature L2-norm: 123.049045
Learning rate (eta): 0.049613
Total number of feature updates: 1262118
Seconds required for this iteration: 0.346

***** Epoch #79 *****
Loss: 260.173118
Improvement ratio: 0.042995
Feature L2-norm: 123.351780
Learning rate (eta): 0.049608
Total number of feature updates: 1278299
Seconds required for this iteration: 0.348

***** Epoch #80 *****
Loss: 254.510147
Improvement ratio: 0.140051
Feature L2-norm: 123.648179
Learning rate (eta): 0.049603
Total number of feature updates: 12

***** Epoch #115 *****
Loss: 189.208469
Improvement ratio: 0.128532
Feature L2-norm: 132.256787
Learning rate (eta): 0.049432
Total number of feature updates: 1860815
Seconds required for this iteration: 0.378

***** Epoch #116 *****
Loss: 219.333087
Improvement ratio: -0.005785
Feature L2-norm: 132.462477
Learning rate (eta): 0.049427
Total number of feature updates: 1876996
Seconds required for this iteration: 0.357

SGD terminated with the stopping criteria
Loss: 189.208469
Total seconds required for training: 42.074

Storing the model
Number of active features: 355996 (355996)
Number of active attributes: 321742 (321742)
Number of active labels: 11 (11)
Writing labels
Writing attributes
Writing feature references for transitions
Writing feature references for attributes
Seconds required: 1.044

(!) NerTagger training done

(!) Tagging...
(!) Tagging Word Level NER
1. Tagged file P2rnu_Tori_Sindi_id20212_1838a.json
(!) Tagging Word Level NER
2. Tagged file Viljandi_K6pu_Suure-K6pu_i

(!) Tagging Word Level NER
90. Tagged file L22ne_Vormsi_Vormsi_id24526_1888a.json
(!) Tagging Word Level NER
91. Tagged file J2rva_Tyri_S2revere_id14656_1887a.json
(!) Tagging Word Level NER
92. Tagged file Tartu_Maarja-Magdaleena_J6e_id12945_1877a.json
(!) Tagging Word Level NER
93. Tagged file J2rva_Peetri_Silmsi_id22492_1867a.json
(!) Tagging Word Level NER
94. Tagged file L22ne_Pyhalepa_K2rdla_id23150_1872a.json
(!) Tagging Word Level NER
95. Tagged file Viljandi_K6pu_Suure-K6pu_id4473_1883a.json
(!) Tagging Word Level NER
96. Tagged file Tartu_Kodavere_Ranna_id15127_1864a.json
(!) Tagging Word Level NER
97. Tagged file Tartu_V6nnu_Ahja_id14086_1882a.json
(!) Tagging Word Level NER
98. Tagged file Saare_Kihelkonna_Pidula_id5682_1890a.json
(!) Tagging Word Level NER
99. Tagged file Tartu_Kodavere_Pala_id22067_1871a.json
(!) Tagging Word Level NER
100. Tagged file L22ne_Pyhalepa_K2rdla_id23206_1872a.json
(!) Tagging Word Level NER
101. Tagged file Tartu_N6o_Pangodi_id5083_1889a.json


(!) Tagging Word Level NER
189. Tagged file Tartu_V6nnu_Ahja_id22666_1881a.json
(!) Tagging Word Level NER
190. Tagged file Harju_J6el2htme_J6el2htme_id6475_1868a.json
(!) Tagging Word Level NER
191. Tagged file V6ru_R6uge_Saaluse_id11773_1880a.json
(!) Tagging Word Level NER
192. Tagged file L22ne_Vormsi_Vormsi_id25013_1888a.json
(!) Tagging Word Level NER
193. Tagged file V6ru_Urvaste_Vaabina_id785_1876a.json
(!) Tagging Word Level NER
194. Tagged file V6ru_R2pina_R2pina_id12011_1866a.json
(!) Tagging Word Level NER
195. Tagged file Tartu_V6nnu_Ahja_id20646_1889a.json
(!) Tagging Word Level NER
196. Tagged file Harju_Rapla_Rapla_id20938_1870a.json
(!) Tagging Word Level NER
197. Tagged file J2rva_Tyri_S2revere_id10443_1868a.json
(!) Tagging Word Level NER
198. Tagged file L22ne_Vormsi_Vormsi_id13660_1888a.json
(!) Tagging Word Level NER
199. Tagged file Tartu_Torma_Avinurme_id20772_1871a.json
(!) Tagging Word Level NER
200. Tagged file V6ru_P6lva_Kiuma_id6113_1880a.json
(!) Tagging W

***** Epoch #12 *****
Loss: 1296.589067
Improvement ratio: 5.260294
Feature L2-norm: 77.123105
Learning rate (eta): 0.049940
Total number of feature updates: 184140
Seconds required for this iteration: 0.336

***** Epoch #13 *****
Loss: 1199.162338
Improvement ratio: 3.768998
Feature L2-norm: 78.894400
Learning rate (eta): 0.049935
Total number of feature updates: 199485
Seconds required for this iteration: 0.333

***** Epoch #14 *****
Loss: 1112.172480
Improvement ratio: 2.805396
Feature L2-norm: 80.554207
Learning rate (eta): 0.049930
Total number of feature updates: 214830
Seconds required for this iteration: 0.335

***** Epoch #15 *****
Loss: 1025.144642
Improvement ratio: 2.413545
Feature L2-norm: 82.098644
Learning rate (eta): 0.049925
Total number of feature updates: 230175
Seconds required for this iteration: 0.336

***** Epoch #16 *****
Loss: 962.428341
Improvement ratio: 1.922827
Feature L2-norm: 83.558921
Learning rate (eta): 0.049920
Total number of feature updates: 245520


***** Epoch #53 *****
Loss: 313.388454
Improvement ratio: 0.194981
Feature L2-norm: 110.862044
Learning rate (eta): 0.049736
Total number of feature updates: 813285
Seconds required for this iteration: 0.345

***** Epoch #54 *****
Loss: 309.131775
Improvement ratio: 0.184210
Feature L2-norm: 111.291069
Learning rate (eta): 0.049731
Total number of feature updates: 828630
Seconds required for this iteration: 0.344

***** Epoch #55 *****
Loss: 308.333389
Improvement ratio: 0.165089
Feature L2-norm: 111.712822
Learning rate (eta): 0.049727
Total number of feature updates: 843975
Seconds required for this iteration: 0.346

***** Epoch #56 *****
Loss: 302.311528
Improvement ratio: 0.167260
Feature L2-norm: 112.129452
Learning rate (eta): 0.049722
Total number of feature updates: 859320
Seconds required for this iteration: 0.334

***** Epoch #57 *****
Loss: 296.043034
Improvement ratio: 0.170630
Feature L2-norm: 112.539778
Learning rate (eta): 0.049717
Total number of feature updates: 874665

***** Epoch #94 *****
Loss: 209.603986
Improvement ratio: 0.085229
Feature L2-norm: 124.063361
Learning rate (eta): 0.049534
Total number of feature updates: 1442430
Seconds required for this iteration: 0.369

***** Epoch #95 *****
Loss: 210.480954
Improvement ratio: 0.045432
Feature L2-norm: 124.305813
Learning rate (eta): 0.049529
Total number of feature updates: 1457775
Seconds required for this iteration: 0.349

***** Epoch #96 *****
Loss: 206.610219
Improvement ratio: 0.068495
Feature L2-norm: 124.547803
Learning rate (eta): 0.049525
Total number of feature updates: 1473120
Seconds required for this iteration: 0.338

***** Epoch #97 *****
Loss: 205.524305
Improvement ratio: 0.087289
Feature L2-norm: 124.784978
Learning rate (eta): 0.049520
Total number of feature updates: 1488465
Seconds required for this iteration: 0.339

***** Epoch #98 *****
Loss: 204.672875
Improvement ratio: 0.071693
Feature L2-norm: 125.019509
Learning rate (eta): 0.049515
Total number of feature updates: 15

***** Epoch #134 *****
Loss: 173.030619
Improvement ratio: 0.040919
Feature L2-norm: 132.210275
Learning rate (eta): 0.049339
Total number of feature updates: 2056230
Seconds required for this iteration: 0.337

***** Epoch #135 *****
Loss: 170.278876
Improvement ratio: 0.030953
Feature L2-norm: 132.381256
Learning rate (eta): 0.049334
Total number of feature updates: 2071575
Seconds required for this iteration: 0.336

***** Epoch #136 *****
Loss: 169.229278
Improvement ratio: 0.046432
Feature L2-norm: 132.550037
Learning rate (eta): 0.049329
Total number of feature updates: 2086920
Seconds required for this iteration: 0.336

***** Epoch #137 *****
Loss: 168.359135
Improvement ratio: 0.051090
Feature L2-norm: 132.717439
Learning rate (eta): 0.049324
Total number of feature updates: 2102265
Seconds required for this iteration: 0.335

***** Epoch #138 *****
Loss: 169.674593
Improvement ratio: 0.031554
Feature L2-norm: 132.884184
Learning rate (eta): 0.049319
Total number of feature update

***** Epoch #173 *****
Loss: 149.327476
Improvement ratio: 0.043861
Feature L2-norm: 138.053787
Learning rate (eta): 0.049150
Total number of feature updates: 2654685
Seconds required for this iteration: 0.332

***** Epoch #174 *****
Loss: 152.346429
Improvement ratio: 0.015064
Feature L2-norm: 138.183683
Learning rate (eta): 0.049145
Total number of feature updates: 2670030
Seconds required for this iteration: 0.333

***** Epoch #175 *****
Loss: 150.211894
Improvement ratio: 0.020422
Feature L2-norm: 138.314473
Learning rate (eta): 0.049140
Total number of feature updates: 2685375
Seconds required for this iteration: 0.332

***** Epoch #176 *****
Loss: 147.933119
Improvement ratio: 0.031078
Feature L2-norm: 138.443279
Learning rate (eta): 0.049135
Total number of feature updates: 2700720
Seconds required for this iteration: 0.329

***** Epoch #177 *****
Loss: 150.763254
Improvement ratio: 0.014988
Feature L2-norm: 138.571279
Learning rate (eta): 0.049130
Total number of feature update

(!) Tagging...
(!) Tagging Word Level NER
1. Tagged file Tartu_Kodavere_Kokora_id1325_1872a.json
(!) Tagging Word Level NER
2. Tagged file J2rva_Tyri_V22tsa_id22266_1913a.json
(!) Tagging Word Level NER
3. Tagged file L22ne_Martna_Martna_id18100_1871a.json
(!) Tagging Word Level NER
4. Tagged file Tartu_V6nnu_Ahja_id16620_1884a.json
(!) Tagging Word Level NER
5. Tagged file V6ru_R2pina_R2pina_id12080_1866a.json
(!) Tagging Word Level NER
6. Tagged file P2rnu_Halliste_Pornuse_id3474_1868a.json
(!) Tagging Word Level NER
7. Tagged file Viljandi_Viljandi_Karula_id19401_1868a.json
(!) Tagging Word Level NER
8. Tagged file J2rva_Tyri_V22tsa_id17427_1888a.json
(!) Tagging Word Level NER
9. Tagged file Harju_Kose_Kose-Uuem6isa_id3144_1867a.json
(!) Tagging Word Level NER
10. Tagged file Tartu_V6nnu_Ahja_id16098_1883a.json
(!) Tagging Word Level NER
11. Tagged file V6ru_Vastseliina_Misso_id21127_1882a.json
(!) Tagging Word Level NER
12. Tagged file Tartu_Otep22_Pyhaj2rve_id1280_1884a.json
(!) 

100. Tagged file L22ne_Pyhalepa_K2rdla_id23208_1872a.json
(!) Tagging Word Level NER
101. Tagged file Tartu_V6nnu_Ahja_id17090_1884a.json
(!) Tagging Word Level NER
102. Tagged file Harju_Kose_Palvere_id18191_1882a.json
(!) Tagging Word Level NER
103. Tagged file J2rva_J2rva-Jaani_Karinu_id1193_1866a.json
(!) Tagging Word Level NER
104. Tagged file V6ru_Kanepi_Krootuse_id16254_1888a.json
(!) Tagging Word Level NER
105. Tagged file Tartu_V6nnu_Ahja_id14796_1882a.json
(!) Tagging Word Level NER
106. Tagged file V6ru_R2pina_R2pina_id1168_1863a.json
(!) Tagging Word Level NER
107. Tagged file V6ru_R6uge_Saaluse_id9577_1878a.json
(!) Tagging Word Level NER
108. Tagged file V6ru_R2pina_Kahkva_id5750_1887a.json
(!) Tagging Word Level NER
109. Tagged file J2rva_Tyri_Tyri-Alliku_id1370_1894a.json
(!) Tagging Word Level NER
110. Tagged file V6ru_R2pina_Kahkva_id19205_1866a.json
(!) Tagging Word Level NER
111. Tagged file J2rva_Tyri_S2revere_id13737_1884a.json
(!) Tagging Word Level NER
112. Tagg

199. Tagged file Tartu_Torma_Avinurme_id17369_1871a.json
(!) Tagging Word Level NER
200. Tagged file Harju_Keila_Keila_id13018_1890a.json
(!) Tagging Word Level NER
201. Tagged file V6ru_Vastseliina_Misso_id21891_1883a.json
(!) Tagging Word Level NER
202. Tagged file Harju_Hageri_Kohila_id4966_1888a.json
(!) Tagging Word Level NER
203. Tagged file Viljandi_P6ltsamaa_Uue-P6ltsamaa_id8714_1854a.json
(!) Tagging Word Level NER
204. Tagged file V6ru_P6lva_Kiuma_id9794_1881a.json
(!) Tagging Word Level NER
205. Tagged file Tartu_V6nnu_Kiidj2rve_id24401_1865a.json
(!) Tagging Word Level NER
206. Tagged file J2rva_Tyri_V22tsa_id16408_1885a.json
(!) Tagging Word Level NER
207. Tagged file J2rva_Tyri_Tyri-Alliku_id3994_1903a.json
(!) Tagging Word Level NER
208. Tagged file P2rnu_Audru_V6lla_id5150_1877a.json
(!) Tagging Word Level NER
209. Tagged file V6ru_R2pina_Kahkva_id8829_1888a.json
(!) Tagging Word Level NER
210. Tagged file L22ne_Karuse_Saastna_id22146_1867a.json
(!) Tagging Word Level N

***** Epoch #16 *****
Loss: 1002.038645
Improvement ratio: 1.981774
Feature L2-norm: 84.986986
Learning rate (eta): 0.049920
Total number of feature updates: 249824
Seconds required for this iteration: 0.326

***** Epoch #17 *****
Loss: 939.802186
Improvement ratio: 1.690263
Feature L2-norm: 86.393819
Learning rate (eta): 0.049915
Total number of feature updates: 265438
Seconds required for this iteration: 0.322

***** Epoch #18 *****
Loss: 890.885643
Improvement ratio: 1.415766
Feature L2-norm: 87.722122
Learning rate (eta): 0.049910
Total number of feature updates: 281052
Seconds required for this iteration: 0.336

***** Epoch #19 *****
Loss: 835.923823
Improvement ratio: 1.285679
Feature L2-norm: 88.981328
Learning rate (eta): 0.049905
Total number of feature updates: 296666
Seconds required for this iteration: 0.325

***** Epoch #20 *****
Loss: 787.997047
Improvement ratio: 1.128133
Feature L2-norm: 90.182740
Learning rate (eta): 0.049900
Total number of feature updates: 312280
Sec

***** Epoch #56 *****
Loss: 321.903113
Improvement ratio: 0.206285
Feature L2-norm: 114.282871
Learning rate (eta): 0.049722
Total number of feature updates: 874384
Seconds required for this iteration: 0.323

***** Epoch #57 *****
Loss: 324.435576
Improvement ratio: 0.141148
Feature L2-norm: 114.692085
Learning rate (eta): 0.049717
Total number of feature updates: 889998
Seconds required for this iteration: 0.317

***** Epoch #58 *****
Loss: 298.642348
Improvement ratio: 0.179408
Feature L2-norm: 115.098436
Learning rate (eta): 0.049712
Total number of feature updates: 905612
Seconds required for this iteration: 0.328

***** Epoch #59 *****
Loss: 316.739260
Improvement ratio: 0.171727
Feature L2-norm: 115.503341
Learning rate (eta): 0.049707
Total number of feature updates: 921226
Seconds required for this iteration: 0.311

***** Epoch #60 *****
Loss: 292.676873
Improvement ratio: 0.157564
Feature L2-norm: 115.896642
Learning rate (eta): 0.049702
Total number of feature updates: 936840

***** Epoch #98 *****
Loss: 227.076789
Improvement ratio: 0.063516
Feature L2-norm: 127.406215
Learning rate (eta): 0.049515
Total number of feature updates: 1530172
Seconds required for this iteration: 0.305

***** Epoch #99 *****
Loss: 208.020045
Improvement ratio: 0.152963
Feature L2-norm: 127.642771
Learning rate (eta): 0.049510
Total number of feature updates: 1545786
Seconds required for this iteration: 0.304

***** Epoch #100 *****
Loss: 223.668018
Improvement ratio: 0.048575
Feature L2-norm: 127.882465
Learning rate (eta): 0.049505
Total number of feature updates: 1561400
Seconds required for this iteration: 0.305

***** Epoch #101 *****
Loss: 222.577811
Improvement ratio: 0.041595
Feature L2-norm: 128.113796
Learning rate (eta): 0.049500
Total number of feature updates: 1577014
Seconds required for this iteration: 0.304

***** Epoch #102 *****
Loss: 220.763008
Improvement ratio: 0.063052
Feature L2-norm: 128.343379
Learning rate (eta): 0.049495
Total number of feature updates:

(!) Tagging Word Level NER
65. Tagged file Tartu_Kodavere_Ranna_id14234_1857a.json
(!) Tagging Word Level NER
66. Tagged file V6ru_P6lva_K2hri_id21791_1851a.json
(!) Tagging Word Level NER
67. Tagged file Tartu_V6nnu_Ahja_id23744_1896a.json
(!) Tagging Word Level NER
68. Tagged file V6ru_R2pina_Kahkva_id21602_1867a.json
(!) Tagging Word Level NER
69. Tagged file Tartu_Kodavere_Ranna_id19767_1865a.json
(!) Tagging Word Level NER
70. Tagged file Tartu_Torma_Avinurme_id23582_1872a.json
(!) Tagging Word Level NER
71. Tagged file Tartu_R6ngu_Aakre_id2550_1888a.json
(!) Tagging Word Level NER
72. Tagged file J2rva_Tyri_Kirna_id24064_1879a.json
(!) Tagging Word Level NER
73. Tagged file Tartu_V6nnu_Ahja_id19682_1888a.json
(!) Tagging Word Level NER
74. Tagged file Viljandi_Paistu_Holstre_id10573_1900a.json
(!) Tagging Word Level NER
75. Tagged file Tartu_Kodavere_Ranna_id355_1883a.json
(!) Tagging Word Level NER
76. Tagged file Harju_Hageri_Kohila_id4930_1887a.json
(!) Tagging Word Level NER


164. Tagged file P2rnu_P2rnu-Elisabethi_Sauga_id18745_1877a.json
(!) Tagging Word Level NER
165. Tagged file Tartu_V6nnu_Ahja_id18234_1886a.json
(!) Tagging Word Level NER
166. Tagged file Tartu_Torma_Avinurme_id20454_1871a.json
(!) Tagging Word Level NER
167. Tagged file Harju_Rapla_Rapla_id18671_1869a.json
(!) Tagging Word Level NER
168. Tagged file Tartu_V6nnu_Ahja_id18214_1886a.json
(!) Tagging Word Level NER
169. Tagged file Tartu_Laiuse_Kivij2rve_id1436_1856a.json
(!) Tagging Word Level NER
170. Tagged file Tartu_V6nnu_Ahja_id22561_1878a.json
(!) Tagging Word Level NER
171. Tagged file Tartu_Kodavere_Ranna_id14286_1858a.json
(!) Tagging Word Level NER
172. Tagged file P2rnu_Halliste_Penuja_id758_1885a.json
(!) Tagging Word Level NER
173. Tagged file Saare_Kihelkonna_Atla_id6893_1872a.json
(!) Tagging Word Level NER
174. Tagged file Harju_Hageri_Kohila_id10769_1873a.json
(!) Tagging Word Level NER
175. Tagged file Tartu_V6nnu_Ahja_id16318_1883a.json
(!) Tagging Word Level NER
176.