In [1]:
import json
import time
import os
import sklearn_crfsuite
import re
import nereval
import pandas as pd

from estnltk import Text
from estnltk.taggers import NerTagger
from estnltk.taggers import WordLevelNerTagger
from estnltk.converters import text_to_json
from estnltk.converters import json_to_text
from estnltk.layer_operations import flatten
from sklearn.metrics import classification_report
from estnltk.taggers import Retagger
from estnltk.taggers import CompoundTokenTagger
from sklearn_crfsuite import metrics

from nervaluate import Evaluator

from estnltk.taggers.estner.ner_trainer import NerTrainer
from estnltk.taggers.estner.model_storage_util import ModelStorageUtil
from estnltk.core import DEFAULT_PY3_NER_MODEL_DIR

In [2]:
def find(name, path):
    for root, dirs, files in os.walk(path):
        if name in files:
            return os.path.join(root, name)

In [3]:
class TokenSplitter( Retagger ):
    """Splits tokens into smaller tokens based on regular expression patterns.""" 
    conf_param = ['patterns', 'break_group_name']
    
    def __init__(self, patterns, break_group_name:str='end'):
        # Set input/output layers
        self.input_layers = ['tokens']
        self.output_layer = 'tokens'
        self.output_attributes = ()
        # Set other configuration parameters
        if not (isinstance(break_group_name, str) and len(break_group_name) > 0):
            raise TypeError('(!) break_group_name should be a non-empty string.')
        self.break_group_name = break_group_name
        # Assert that all patterns are regular expressions in the valid format
        if not isinstance(patterns, list):
            raise TypeError('(!) patterns should be a list of compiled regular expressions.')
        # TODO: we use an adhoc way to verify that patterns are regular expressions 
        #       because there seems to be no common way of doing it both in py35 
        #       and py36
        for pat in patterns:
            # Check for the existence of methods/attributes
            has_match   = callable(getattr(pat, "match", None))
            has_search  = callable(getattr(pat, "search", None))
            has_pattern = getattr(pat, "pattern", None) is not None
            for (k,v) in (('method match()',has_match),\
                          ('method search()',has_search),\
                          ('attribute pattern',has_pattern)):
                if v is False:
                    raise TypeError('(!) Unexpected regex pattern: {!r} is missing {}.'.format(pat, k))
            symbolic_groups = pat.groupindex
            if self.break_group_name not in symbolic_groups.keys():
                raise TypeError('(!) Pattern {!r} is missing symbolic group named {!r}.'.format(pat, self.break_group_name))
        self.patterns = patterns

    def _change_layer(self, text, layers, status):
        # Get changeble layer
        changeble_layer = layers[self.output_layer]
        # Iterate over tokens
        add_spans    = []
        remove_spans = []
        for span in changeble_layer:
            token_str = text.text[span.start:span.end]
            for pat in self.patterns:
                m = pat.search(token_str)
                if m:
                    break_group_end = m.end( self.break_group_name )
                    if break_group_end > -1 and \
                       break_group_end > 0  and \
                       break_group_end < len(token_str):
                        # Make the split
                        add_spans.append( (span.start, span.start+break_group_end) )
                        add_spans.append( (span.start+break_group_end, span.end) )
                        remove_spans.append( span )
                        # Once a token has been split, then break and move on to 
                        # the next token ...
                        break
        if add_spans:
            assert len(remove_spans) > 0
            for old_span in remove_spans:
                changeble_layer.remove_span( old_span )
            for new_span in add_spans:
                changeble_layer.add_annotation( new_span )

token_splitter = TokenSplitter(patterns=[re.compile(r'(?P<end>[A-ZÕÄÖÜ]{1}\w+)[A-ZÕÄÖÜ]{1}\w+'),\
                                         re.compile(r'(?P<end>Piebenomme)metsawaht'),\
                                         re.compile(r'(?P<end>maa)peal'),\
                                         re.compile(r'(?P<end>reppi)käest'),\
                                         re.compile(r'(?P<end>Kiidjerwelt)J'),\
                                         re.compile(r'(?P<end>Ameljanow)Persitski'),\
                                         re.compile(r'(?P<end>mõistmas)Mihkel'),\
                                         re.compile(r'(?P<end>tema)Käkk'),\
                                         re.compile(r'(?P<end>Ahjawalla)liikmed'),\
                                         re.compile(r'(?P<end>kohtumees)A'),\
                                         re.compile(r'(?P<end>Pechmann)x'),\
                                         re.compile(r'(?P<end>pölli)Anni'),\
                                         re.compile(r'(?P<end>külla)Rauba'),\
                                         re.compile(r'(?P<end>kohtowannem)Jaak'),\
                                         re.compile(r'(?P<end>rannast)Leno'),\
                                         re.compile(r'(?P<end>wallast)Kiiwita'),\
                                         re.compile(r'(?P<end>wallas)Kristjan'),\
                                         re.compile(r'(?P<end>Pedoson)rahul'),\
                                         re.compile(r'(?P<end>pere)Jaan'),\
                                         re.compile(r'(?P<end>kohtu)poolest'),\
                                         re.compile(r'(?P<end>Kurrista)kaudo'),\
                                         re.compile(r'(?P<end>mölder)Gottlieb'),\
                                         re.compile(r'(?P<end>wöörmündri)Jaan'),\
                                         re.compile(r'(?P<end>Oinas)ja'),\
                                         re.compile(r'(?P<end>ette)Leenu'),\
                                         re.compile(r'(?P<end>Tommingas)peab'),\
                                         re.compile(r'(?P<end>wäljaja)Kotlep'),\
                                         re.compile(r'(?P<end>pea)A'),\
                                         re.compile(r'(?P<end>talumees)Nikolai')])

These files don't work because the protocols are written in a different language, which the goldstandard didn't recognise, hence have no goldstandard tags.

In [4]:
files_not_working = ['J2rva_Tyri_V22tsa_id22177_1911a.json', \
                     'J2rva_Tyri_V22tsa_id18538_1894a.json', \
                     'J2rva_Tyri_V22tsa_id22155_1911a.json', \
                     'Saare_Kihelkonna_Kotlandi_id18845_1865a.json', \
                     'P2rnu_Halliste_Abja_id257_1844a.json', \
                     'Saare_Kaarma_Loona_id7575_1899a.json', \
                     'J2rva_Tyri_V22tsa_id22266_1913a.json', \
                     'J2rva_Tyri_V22tsa_id22178_1912a.json']

In [5]:
files = {}

with open('divided_corpus.txt', 'r', encoding = 'UTF-8') as f:
    txt = f.readlines()

for fileName in txt:
    file, subdistribution = fileName.split(":")
    files[file] = subdistribution.rstrip("\n")

In [6]:
all_results = {}

for subdistribution in [1, 2, 3, 4, 5]:
    training_subdistributions = []
    for y in [1, 2, 3, 4, 5]:
        if y == subdistribution:
            subdistribution_for_testing = y
        else:
            training_subdistributions.append(y)
    
    # Getting the filenames to be trained on from the files dictionary.
    filenames = {key: value for key, value in files.items() if int(value) in training_subdistributions}
    
    # Creating training_texts from the aforementioned filenames.
    print("Defineerin treenimistekstid.")
    start = time.time()
    training_texts = []
    for filename in filenames:
        with open('./vallakohtufailid_json/' + str(filename), 'r', encoding='UTF-8') as file:
            if filename in files_not_working:
                continue
            else:
                training_texts.append(json_to_text(file.read()).tag_layer(['sentences', 'morph_analysis']))
    print(f"Treenimistekstid defineeritud {time.time() - start} sekundiga.")
    # Setting up the trainer and training.
    print("\n\nAlustan nertaggeri treenimist.")
    start = time.time()
    model_dir=DEFAULT_PY3_NER_MODEL_DIR
    modelUtil = ModelStorageUtil(model_dir)
    nersettings = modelUtil.load_settings()
    trainer = NerTrainer(nersettings)
    trainer.train( training_texts, layer='gold_wordner', model_dir='test' )
    print(f"NerTagger treenitud {time.time() - start} sekundiga.")
    # Setting up the new trained nertagger and defining layers to be removed later on.
    nertagger = NerTagger(model_dir = 'test')
    removed_layers = ['sentences', 'morph_analysis', 'compound_tokens', 'ner', 'words', 'tokens']
    
    # Tagging the files using the new nertagger.
    print("\n\nAlustan failide taggimist.")
    start = time.time()
    for file in {key: value for key, value in files.items() if int(value) == subdistribution_for_testing}:
        with open(find(file.replace(".json", ".txt"), "./vallakohtufailid/"), 'r', encoding='UTF-8') as f:
            text = Text(f.read())
            if f == "Tartu_V6nnu_Ahja_id3502_1882a.txt":
                text = text.replace('..', '. .')

            text = text.tag_layer(['tokens'])
            token_splitter.retag(text)
            CompoundTokenTagger(tag_initials = False, tag_abbreviations = False, tag_hyphenations = False).tag(text)
            text.tag_layer('morph_analysis')

            nertagger.tag(text)
            text.add_layer(flatten(text['ner'], 'flat_ner'))

            for x in removed_layers:
                text.pop_layer(x)
            text_to_json(text, file=os.getcwd() + "/vallakohtufailid_nertagger/" + file)
            print(f'Täägitud fail {file}')
    print(f"Failid taggitud {time.time() - start} sekundiga.")
    
    # Chaning the tags into a readable formats for the evaluator.
    print("\n\nAlustan tulemuste ammutamist.")

    #gold = []
    #test = []
    gold_ner = []
    test_ner = []

    for file in {key: value for key, value in files.items() if int(value) == subdistribution_for_testing}:
        appendable_gold_ner = []
        appendable_test_ner = []

        if file.endswith(".json"):
            if file in files_not_working:
                continue
            else:
                with open("./vallakohtufailid_nertagger/" + str(file), 'r', encoding='UTF-8') as f_test, \
                    open("./vallakohtufailid_json_flat/" + str(file), 'r', encoding='UTF-8') as f_gold:
                        test_import = json_to_text(f_test.read())
                        gold_import = json_to_text(f_gold.read())

                        # The commented part is needed for word-level-ner.
                        '''
                        for i in range(len(gold_import['flat_gold_wordner'])):
                            tag = gold_import['flat_gold_wordner'][i].nertag[0]
                            gold.append(tag)
                        for i in range(len(test_import['flat_wordner'])):
                            tag = test_import['flat_wordner'][i].nertag[0]
                            test.append(tag)
                        '''
                        
                        for i in range(len(gold_import['gold_ner'])):
                            ner = gold_import['gold_ner'][i]
                            label = ner.nertag[0]
                            start = int(ner.start)
                            end = int(ner.end)
                            appendable_gold_ner.append({"label": label, "start": start, "end": end})

                        for i in range(len(test_import['flat_ner'])):
                            ner = test_import['flat_ner'][i]
                            label = ner.nertag[0]
                            start = int(ner.start)
                            end = int(ner.end)
                            appendable_test_ner.append({"label": label, "start": start, "end": end})

        gold_ner.append(appendable_gold_ner)
        test_ner.append(appendable_test_ner)

    evaluator = Evaluator(gold_ner, test_ner, tags=['ORG', 'PER', 'MISC', 'LOC', 'LOC_ORG'])
    results, results_per_tag = evaluator.evaluate()
    all_results[subdistribution_for_testing] = (results, results_per_tag)
print("Programm on lõpetanud oma töö.")

Defineerin treenimistekstid.
Treenimistekstid defineeritud 129.58176922798157 sekundiga.


Alustan nertaggeri treenimist.
Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 347968
Seconds required: 2.995

Stochastic Gradient Descent (SGD)
c2: 0.001000
max_iterations: 1000
period: 10
delta: 0.000001

Calibrating the learning rate (eta)
calibration.eta: 0.100000
calibration.rate: 2.000000
calibration.samples: 1000
calibration.candidates: 10
calibration.max_trials: 20
Initial loss: 30474.851022
Trial #1 (eta = 0.100000): 2552.120197
Trial #2 (eta = 0.200000): 3478.921649
Trial #3 (eta = 0.400000): 6748.410511
Trial #4 (eta = 0.800000): 14876.855292
Trial #5 (eta = 1.600000): 31515.541111 (worse)
Trial #6 (eta = 0.050000): 2212.233214
Trial #7 (eta = 0.025000): 2383.132649
Trial #8 (eta = 0.012500): 2798.960243
Trial #9 (eta = 0.006250): 3386.244078
Tria

***** Epoch #37 *****
Loss: 468.751031
Improvement ratio: 0.285963
Feature L2-norm: 106.152285
Learning rate (eta): 0.049816
Total number of feature updates: 600991
Seconds required for this iteration: 0.598

***** Epoch #38 *****
Loss: 450.419669
Improvement ratio: 0.288188
Feature L2-norm: 106.784204
Learning rate (eta): 0.049811
Total number of feature updates: 617234
Seconds required for this iteration: 0.609

***** Epoch #39 *****
Loss: 447.370878
Improvement ratio: 0.273365
Feature L2-norm: 107.406555
Learning rate (eta): 0.049806
Total number of feature updates: 633477
Seconds required for this iteration: 0.598

***** Epoch #40 *****
Loss: 429.997910
Improvement ratio: 0.320287
Feature L2-norm: 108.021482
Learning rate (eta): 0.049801
Total number of feature updates: 649720
Seconds required for this iteration: 0.600

***** Epoch #41 *****
Loss: 428.141771
Improvement ratio: 0.213239
Feature L2-norm: 108.616889
Learning rate (eta): 0.049796
Total number of feature updates: 665963

***** Epoch #78 *****
Loss: 261.729642
Improvement ratio: 0.093913
Feature L2-norm: 124.087536
Learning rate (eta): 0.049613
Total number of feature updates: 1266954
Seconds required for this iteration: 0.600

***** Epoch #79 *****
Loss: 252.664477
Improvement ratio: 0.127244
Feature L2-norm: 124.394524
Learning rate (eta): 0.049608
Total number of feature updates: 1283197
Seconds required for this iteration: 0.604

***** Epoch #80 *****
Loss: 258.010012
Improvement ratio: 0.081723
Feature L2-norm: 124.694441
Learning rate (eta): 0.049603
Total number of feature updates: 1299440
Seconds required for this iteration: 0.617

***** Epoch #81 *****
Loss: 248.514813
Improvement ratio: 0.037315
Feature L2-norm: 124.991687
Learning rate (eta): 0.049598
Total number of feature updates: 1315683
Seconds required for this iteration: 0.600

***** Epoch #82 *****
Loss: 248.338920
Improvement ratio: 0.189743
Feature L2-norm: 125.291294
Learning rate (eta): 0.049593
Total number of feature updates: 13

Täägitud fail P2rnu_Audru_V6lla_id5904_1878a.json
Täägitud fail Harju_J6el2htme_J6el2htme_id7612_1869a.json
Täägitud fail J2rva_Peetri_V2ike-Kareda_id19169_1869a.json
Täägitud fail Tartu_V6nnu_Ahja_id20825_1889a.json
Täägitud fail P2rnu_T6stamaa_Kihnu_id25292_1843a.json
Täägitud fail Tartu_Kodavere_Alatskivi_id12944_1880a.json
Täägitud fail Viljandi_Paistu_Holstre_id11341_1848a.json
Täägitud fail J2rva_Anna_Eivere_id985_1868a.json
Täägitud fail Tartu_Kodavere_Ranna_id14285_1858a.json
Täägitud fail V6ru_R2pina_R2pina_id9603_1862a.json
Täägitud fail Tartu_Kodavere_Alatskivi_id9028_1865a.json
Täägitud fail Harju_Juuru_Kaiu_id1203_1895a.json
Täägitud fail V6ru_R6uge_Saaluse_id10045_1879a.json
Täägitud fail Tartu_V6nnu_Ahja_id17204_1884a.json
Täägitud fail Tartu_Torma_Avinurme_id20727_1871a.json
Täägitud fail L22ne_Vormsi_Vormsi_id24539_1888a.json
Täägitud fail Tartu_R6ngu_Aakre_id14645_1829a.json
Täägitud fail Tartu_V6nnu_Ahja_id19396_1887a.json
Täägitud fail Tartu_Laiuse_Kivij2rve_id6107_

Täägitud fail Tartu_V6nnu_Ahja_id17125_1884a.json
Täägitud fail L22ne_Pyhalepa_Kassari_id20363_1890a.json
Täägitud fail Tartu_V6nnu_Ahja_id17321_1885a.json
Täägitud fail Tartu_Laiuse_Kivij2rve_id4876_1863a.json
Täägitud fail Harju_Juuru_Kaiu_id18571_1873a.json
Täägitud fail V6ru_P6lva_Kiuma_id1633_1880a.json
Täägitud fail V6ru_Vastseliina_Misso_id25270_1886a.json
Täägitud fail V6ru_Vastseliina_Misso_id10250_1886a.json
Täägitud fail Viljandi_Suure-Jaani_Syrgavere_id20998_1877a.json
Täägitud fail J2rva_Tyri_V22tsa_id16934_1886a.json
Täägitud fail J2rva_Ambla_Uudekyla_id13485_1867a.json
Täägitud fail V6ru_P6lva_Peri_id10138_1891a.json
Täägitud fail Viljandi_Kolga-Jaani_Paenasti_id330_1874a.json
Täägitud fail Tartu_Torma_Avinurme_id17385_1871a.json
Täägitud fail V6ru_Vastseliina_Misso_id18830_1882a.json
Täägitud fail Tartu_Torma_Avinurme_id4086_1858a.json
Täägitud fail J2rva_Tyri_S2revere_id10947_1871a.json
Täägitud fail Harju_Kose_Habaja_id735_1874a.json
Täägitud fail Viru_Haljala_Vihula_

***** Epoch #21 *****
Loss: 807.969372
Improvement ratio: 1.029981
Feature L2-norm: 92.764372
Learning rate (eta): 0.049895
Total number of feature updates: 340221
Seconds required for this iteration: 0.741

***** Epoch #22 *****
Loss: 787.543997
Improvement ratio: 0.844961
Feature L2-norm: 93.895602
Learning rate (eta): 0.049890
Total number of feature updates: 356422
Seconds required for this iteration: 0.764

***** Epoch #23 *****
Loss: 737.604301
Improvement ratio: 0.820850
Feature L2-norm: 94.967266
Learning rate (eta): 0.049885
Total number of feature updates: 372623
Seconds required for this iteration: 0.697

***** Epoch #24 *****
Loss: 700.068989
Improvement ratio: 0.792834
Feature L2-norm: 95.994880
Learning rate (eta): 0.049880
Total number of feature updates: 388824
Seconds required for this iteration: 0.697

***** Epoch #25 *****
Loss: 676.610275
Improvement ratio: 0.678371
Feature L2-norm: 96.979409
Learning rate (eta): 0.049875
Total number of feature updates: 405025
Seco

***** Epoch #64 *****
Loss: 315.521147
Improvement ratio: 0.077801
Feature L2-norm: 119.733503
Learning rate (eta): 0.049682
Total number of feature updates: 1036864
Seconds required for this iteration: 0.627

***** Epoch #65 *****
Loss: 303.473409
Improvement ratio: 0.092978
Feature L2-norm: 120.108482
Learning rate (eta): 0.049677
Total number of feature updates: 1053065
Seconds required for this iteration: 0.637

***** Epoch #66 *****
Loss: 288.039969
Improvement ratio: 0.249072
Feature L2-norm: 120.474994
Learning rate (eta): 0.049672
Total number of feature updates: 1069266
Seconds required for this iteration: 0.627

***** Epoch #67 *****
Loss: 313.646841
Improvement ratio: 0.095891
Feature L2-norm: 120.843284
Learning rate (eta): 0.049667
Total number of feature updates: 1085467
Seconds required for this iteration: 0.627

***** Epoch #68 *****
Loss: 301.294049
Improvement ratio: 0.126560
Feature L2-norm: 121.201557
Learning rate (eta): 0.049662
Total number of feature updates: 11

Täägitud fail Tartu_Kambja_Vana-Prangli_id19091_1909a.json
Täägitud fail Tartu_V6nnu_Ahja_id14900_1882a.json
Täägitud fail Tartu_V6nnu_Ahja_id19074_1887a.json
Täägitud fail L22ne_Kullamaa_Piirsalu_id15463_1883a.json
Täägitud fail Harju_Kose_Palvere_id14358_1880a.json
Täägitud fail L22ne_Vormsi_Vormsi_id24517_1888a.json
Täägitud fail Harju_Kose_Palvere_id18727_1883a.json
Täägitud fail Saare_P8ide_Laimjala_id6593_1917a.json
Täägitud fail V6ru_R6uge_Leevi_id24854_1875a.json
Täägitud fail Harju_Keila_Keila_id13472_1890a.json
Täägitud fail Tartu_R6ngu_Aakre_id2817_1888a.json
Täägitud fail Tartu_V6nnu_Ahja_id13953_1882a.json
Täägitud fail Tartu_Kodavere_Alatskivi_id12235_1856a.json
Täägitud fail J2rva_Peetri_Silmsi_id23715_1869a.json
Täägitud fail P2rnu_Tori_Tori_id25326_1890a.json
Täägitud fail Tartu_Kodavere_Alatskivi_id1266_1865a.json
Täägitud fail Tartu_Kodavere_Pala_id17804_1861a.json
Täägitud fail L22ne_Pyhalepa_Kassari_id20356_1889a.json
Täägitud fail Tartu_Otep22_Pyhaj2rve_id4865_188

Täägitud fail Harju_Kose_Triigi_id11552_1871a.json
Täägitud fail L22ne_Kullamaa_Kuij6e_id15386_1874a.json
Täägitud fail Harju_Hageri_Kohila_id4177_1883a.json
Täägitud fail V6ru_R6uge_Saaluse_id9629_1878a.json
Täägitud fail J2rva_Tyri_S2revere_id13094_1880a.json
Täägitud fail L22ne_Kullamaa_Piirsalu_id7491_1884a.json
Täägitud fail L22ne_Pyhalepa_K2rdla_id10158_1884a.json
Täägitud fail V6ru_Vastseliina_Misso_id13577_1881a.json
Täägitud fail J2rva_Tyri_V22tsa_id22488_1913a.json
Täägitud fail Harju_Hageri_Kohila_id5465_1889a.json
Täägitud fail Tartu_V6nnu_Ahja_id21646_1867a.json
Täägitud fail J2rva_J2rva-Jaani_Einmanni_id6497_1868a.json
Täägitud fail P2rnu_T6stamaa_Kihnu_id25042_1843a.json
Täägitud fail Tartu_Maarja-Magdaleena_J6e_id10704_1876a.json
Täägitud fail Saare_Kihelkonna_Kihelkonna_id22956_1885a.json
Täägitud fail Tartu_V6nnu_Ahja_id18088_1885a.json
Täägitud fail Viljandi_K6pu_Suure-K6pu_id7185_1884a.json
Täägitud fail Tartu_V6nnu_Ahja_id15141_1883a.json
Täägitud fail Tartu_V6nnu_

***** Epoch #29 *****
Loss: 577.916708
Improvement ratio: 0.531367
Feature L2-norm: 100.461071
Learning rate (eta): 0.049855
Total number of feature updates: 469249
Seconds required for this iteration: 0.645

***** Epoch #30 *****
Loss: 566.830309
Improvement ratio: 0.477653
Feature L2-norm: 101.267869
Learning rate (eta): 0.049850
Total number of feature updates: 485430
Seconds required for this iteration: 0.645

***** Epoch #31 *****
Loss: 538.776981
Improvement ratio: 0.471477
Feature L2-norm: 102.053431
Learning rate (eta): 0.049845
Total number of feature updates: 501611
Seconds required for this iteration: 0.639

***** Epoch #32 *****
Loss: 544.640238
Improvement ratio: 0.371480
Feature L2-norm: 102.818411
Learning rate (eta): 0.049841
Total number of feature updates: 517792
Seconds required for this iteration: 0.659

***** Epoch #33 *****
Loss: 505.292557
Improvement ratio: 0.459362
Feature L2-norm: 103.560457
Learning rate (eta): 0.049836
Total number of feature updates: 533973

***** Epoch #69 *****
Loss: 274.252323
Improvement ratio: 0.176503
Feature L2-norm: 121.371261
Learning rate (eta): 0.049657
Total number of feature updates: 1116489
Seconds required for this iteration: 0.647

***** Epoch #70 *****
Loss: 298.788905
Improvement ratio: 0.078689
Feature L2-norm: 121.721355
Learning rate (eta): 0.049652
Total number of feature updates: 1132670
Seconds required for this iteration: 0.641

***** Epoch #71 *****
Loss: 282.816670
Improvement ratio: 0.126400
Feature L2-norm: 122.063896
Learning rate (eta): 0.049648
Total number of feature updates: 1148851
Seconds required for this iteration: 0.639

***** Epoch #72 *****
Loss: 281.141637
Improvement ratio: 0.102240
Feature L2-norm: 122.396628
Learning rate (eta): 0.049643
Total number of feature updates: 1165032
Seconds required for this iteration: 0.655

***** Epoch #73 *****
Loss: 278.911439
Improvement ratio: 0.090364
Feature L2-norm: 122.723710
Learning rate (eta): 0.049638
Total number of feature updates: 11

Täägitud fail V6ru_R2pina_Kahkva_id14001_1889a.json
Täägitud fail Harju_Kose_Palvere_id23127_1887a.json
Täägitud fail Tartu_V6nnu_Ahja_id22345_1868a.json
Täägitud fail Tartu_N6o_Pangodi_id3054_1889a.json
Täägitud fail J2rva_Tyri_V22tsa_id22024_1912a.json
Täägitud fail Tartu_V6nnu_Ahja_id22715_1883a.json
Täägitud fail Tartu_V6nnu_Rasina_id2088_1906a.json
Täägitud fail V6ru_R6uge_Saaluse_id11045_1879a.json
Täägitud fail Harju_Rapla_Rapla_id24008_1873a.json
Täägitud fail Tartu_Kodavere_Pala_id22811_1872a.json
Täägitud fail L22ne_K2ina_Putkaste_id8765_1867a.json
Täägitud fail Saare_P8ide_Laimjala_id7049_1915a.json
Täägitud fail Tartu_V6nnu_Ahja_id18951_1887a.json
Täägitud fail Tartu_Torma_Avinurme_id21503_1872a.json
Täägitud fail Viljandi_Paistu_Holstre_id10774_1910a.json
Täägitud fail Viljandi_K6pu_Suure-K6pu_id12857_1884a.json
Täägitud fail Viru_Haljala_Vihula_id10881_1883a.json
Täägitud fail V6ru_Kanepi_Krootuse_id24412_1885a.json
Täägitud fail Tartu_V6nnu_Ahja_id20417_1888a.json
Täägit

0....1....2....3....4....5....6....7....8....9....10
Number of features: 349603
Seconds required: 2.965

Stochastic Gradient Descent (SGD)
c2: 0.001000
max_iterations: 1000
period: 10
delta: 0.000001

Calibrating the learning rate (eta)
calibration.eta: 0.100000
calibration.rate: 2.000000
calibration.samples: 1000
calibration.candidates: 10
calibration.max_trials: 20
Initial loss: 32954.274734
Trial #1 (eta = 0.100000): 2317.294957
Trial #2 (eta = 0.200000): 3329.189692
Trial #3 (eta = 0.400000): 6770.340546
Trial #4 (eta = 0.800000): 12579.261987
Trial #5 (eta = 1.600000): 28351.895258
Trial #6 (eta = 3.200000): 57633.099450 (worse)
Trial #7 (eta = 0.050000): 2151.302467
Trial #8 (eta = 0.025000): 2372.117218
Trial #9 (eta = 0.012500): 2791.591205
Trial #10 (eta = 0.006250): 3378.264529
Trial #11 (eta = 0.003125): 4196.800415
Trial #12 (eta = 0.001563): 5422.753843
Trial #13 (eta = 0.000781): 7353.554580
Trial #14 (eta = 0.000391): 10399.903043
Trial #15 (eta = 0.000195): 14780.901532

***** Epoch #38 *****
Loss: 424.975871
Improvement ratio: 0.317033
Feature L2-norm: 104.064039
Learning rate (eta): 0.049811
Total number of feature updates: 583110
Seconds required for this iteration: 0.634

***** Epoch #39 *****
Loss: 415.878330
Improvement ratio: 0.300172
Feature L2-norm: 104.670838
Learning rate (eta): 0.049806
Total number of feature updates: 598455
Seconds required for this iteration: 0.634

***** Epoch #40 *****
Loss: 404.984537
Improvement ratio: 0.306419
Feature L2-norm: 105.259128
Learning rate (eta): 0.049801
Total number of feature updates: 613800
Seconds required for this iteration: 0.645

***** Epoch #41 *****
Loss: 399.211244
Improvement ratio: 0.269103
Feature L2-norm: 105.832624
Learning rate (eta): 0.049796
Total number of feature updates: 629145
Seconds required for this iteration: 0.642

***** Epoch #42 *****
Loss: 391.262434
Improvement ratio: 0.265465
Feature L2-norm: 106.397308
Learning rate (eta): 0.049791
Total number of feature updates: 644490

***** Epoch #78 *****
Loss: 243.244496
Improvement ratio: 0.089508
Feature L2-norm: 120.851544
Learning rate (eta): 0.049613
Total number of feature updates: 1196910
Seconds required for this iteration: 0.636

***** Epoch #79 *****
Loss: 239.948119
Improvement ratio: 0.104801
Feature L2-norm: 121.149615
Learning rate (eta): 0.049608
Total number of feature updates: 1212255
Seconds required for this iteration: 0.640

***** Epoch #80 *****
Loss: 234.804732
Improvement ratio: 0.108465
Feature L2-norm: 121.447516
Learning rate (eta): 0.049603
Total number of feature updates: 1227600
Seconds required for this iteration: 0.633

***** Epoch #81 *****
Loss: 235.156810
Improvement ratio: 0.087094
Feature L2-norm: 121.736714
Learning rate (eta): 0.049598
Total number of feature updates: 1242945
Seconds required for this iteration: 0.646

***** Epoch #82 *****
Loss: 235.399741
Improvement ratio: 0.081426
Feature L2-norm: 122.022486
Learning rate (eta): 0.049593
Total number of feature updates: 12

***** Epoch #118 *****
Loss: 184.861148
Improvement ratio: 0.045719
Feature L2-norm: 130.520931
Learning rate (eta): 0.049417
Total number of feature updates: 1810710
Seconds required for this iteration: 0.650

***** Epoch #119 *****
Loss: 184.523903
Improvement ratio: 0.029444
Feature L2-norm: 130.717399
Learning rate (eta): 0.049412
Total number of feature updates: 1826055
Seconds required for this iteration: 0.642

***** Epoch #120 *****
Loss: 182.944552
Improvement ratio: 0.072516
Feature L2-norm: 130.912179
Learning rate (eta): 0.049407
Total number of feature updates: 1841400
Seconds required for this iteration: 0.647

***** Epoch #121 *****
Loss: 180.332957
Improvement ratio: 0.069584
Feature L2-norm: 131.106973
Learning rate (eta): 0.049402
Total number of feature updates: 1856745
Seconds required for this iteration: 0.680

***** Epoch #122 *****
Loss: 181.668000
Improvement ratio: 0.051082
Feature L2-norm: 131.299921
Learning rate (eta): 0.049397
Total number of feature update

***** Epoch #161 *****
Loss: 156.338225
Improvement ratio: 0.057804
Feature L2-norm: 137.745069
Learning rate (eta): 0.049208
Total number of feature updates: 2470545
Seconds required for this iteration: 0.645

***** Epoch #162 *****
Loss: 160.171461
Improvement ratio: 0.023554
Feature L2-norm: 137.888180
Learning rate (eta): 0.049203
Total number of feature updates: 2485890
Seconds required for this iteration: 0.634

***** Epoch #163 *****
Loss: 156.667527
Improvement ratio: 0.027065
Feature L2-norm: 138.031483
Learning rate (eta): 0.049198
Total number of feature updates: 2501235
Seconds required for this iteration: 0.639

***** Epoch #164 *****
Loss: 156.406177
Improvement ratio: 0.017843
Feature L2-norm: 138.172302
Learning rate (eta): 0.049193
Total number of feature updates: 2516580
Seconds required for this iteration: 0.645

***** Epoch #165 *****
Loss: 156.909889
Improvement ratio: 0.022556
Feature L2-norm: 138.311834
Learning rate (eta): 0.049188
Total number of feature update

***** Epoch #201 *****
Loss: 143.472234
Improvement ratio: 0.021735
Feature L2-norm: 142.863971
Learning rate (eta): 0.049015
Total number of feature updates: 3084345
Seconds required for this iteration: 0.644

***** Epoch #202 *****
Loss: 144.346412
Improvement ratio: 0.018657
Feature L2-norm: 142.976660
Learning rate (eta): 0.049010
Total number of feature updates: 3099690
Seconds required for this iteration: 0.637

***** Epoch #203 *****
Loss: 141.695081
Improvement ratio: 0.022853
Feature L2-norm: 143.090439
Learning rate (eta): 0.049005
Total number of feature updates: 3115035
Seconds required for this iteration: 0.645

***** Epoch #204 *****
Loss: 141.817270
Improvement ratio: 0.024328
Feature L2-norm: 143.202635
Learning rate (eta): 0.049000
Total number of feature updates: 3130380
Seconds required for this iteration: 0.636

***** Epoch #205 *****
Loss: 142.184832
Improvement ratio: -0.012671
Feature L2-norm: 143.314540
Learning rate (eta): 0.048996
Total number of feature updat

Täägitud fail Tartu_N6o_Pangodi_id4146_1889a.json
Täägitud fail Tartu_Kodavere_Ranna_id14405_1860a.json
Täägitud fail J2rva_Peetri_V2ike-Kareda_id19122_1868a.json
Täägitud fail Tartu_V6nnu_Ahja_id19084_1887a.json
Täägitud fail Harju_Hageri_Kohila_id4010_1890a.json
Täägitud fail V6ru_R2pina_Kahkva_id6489_1887a.json
Täägitud fail Harju_Kose_Palvere_id13989_1880a.json
Täägitud fail Viljandi_Paistu_Holstre_id6625_1828a.json
Täägitud fail V6ru_Vastseliina_Misso_id22084_1883a.json
Täägitud fail L22ne_Vormsi_Vormsi_id24683_1888a.json
Täägitud fail Tartu_Kodavere_Pala_id22815_1872a.json
Täägitud fail L22ne_Martna_Martna_id18619_1871a.json
Täägitud fail Tartu_V6nnu_Ahja_id20555_1889a.json
Täägitud fail J2rva_Ambla_Ambla_id5939_1888a.json
Täägitud fail J2rva_Tyri_Kirna_id24973_1881a.json
Täägitud fail Tartu_Torma_Avinurme_id20542_1871a.json
Täägitud fail Harju_Kose_Triigi_id10000_1870a.json
Täägitud fail Harju_Juuru_Juuru_id23774_1873a.json
Täägitud fail Viljandi_Paistu_Holstre_id11504_1848a.jso

***** Epoch #3 *****
Loss: 6118.693107
Feature L2-norm: 49.838116
Learning rate (eta): 0.049985
Total number of feature updates: 46842
Seconds required for this iteration: 0.628

***** Epoch #4 *****
Loss: 4636.374189
Feature L2-norm: 55.201369
Learning rate (eta): 0.049980
Total number of feature updates: 62456
Seconds required for this iteration: 0.640

***** Epoch #5 *****
Loss: 3735.239502
Feature L2-norm: 60.091907
Learning rate (eta): 0.049975
Total number of feature updates: 78070
Seconds required for this iteration: 0.633

***** Epoch #6 *****
Loss: 3450.588016
Feature L2-norm: 63.732778
Learning rate (eta): 0.049970
Total number of feature updates: 93684
Seconds required for this iteration: 0.627

***** Epoch #7 *****
Loss: 2621.089086
Feature L2-norm: 66.968792
Learning rate (eta): 0.049965
Total number of feature updates: 109298
Seconds required for this iteration: 0.639

***** Epoch #8 *****
Loss: 2276.136464
Feature L2-norm: 69.916408
Learning rate (eta): 0.049960
Total nu

***** Epoch #45 *****
Loss: 390.794076
Improvement ratio: 0.266110
Feature L2-norm: 110.526872
Learning rate (eta): 0.049776
Total number of feature updates: 702630
Seconds required for this iteration: 0.632

***** Epoch #46 *****
Loss: 392.014912
Improvement ratio: 0.211480
Feature L2-norm: 111.046606
Learning rate (eta): 0.049771
Total number of feature updates: 718244
Seconds required for this iteration: 0.639

***** Epoch #47 *****
Loss: 366.098666
Improvement ratio: 0.240310
Feature L2-norm: 111.567994
Learning rate (eta): 0.049766
Total number of feature updates: 733858
Seconds required for this iteration: 0.653

***** Epoch #48 *****
Loss: 360.553375
Improvement ratio: 0.322122
Feature L2-norm: 112.074469
Learning rate (eta): 0.049761
Total number of feature updates: 749472
Seconds required for this iteration: 0.627

***** Epoch #49 *****
Loss: 394.019725
Improvement ratio: 0.095608
Feature L2-norm: 112.567297
Learning rate (eta): 0.049756
Total number of feature updates: 765086



Alustan failide taggimist.
Täägitud fail V6ru_R6uge_Saaluse_id8753_1877a.json
Täägitud fail Viljandi_Helme_Leebiku_id14149_1888a.json
Täägitud fail L22ne_Martna_Martna_id14159_1884a.json
Täägitud fail V6ru_Vastseliina_Misso_id24907_1886a.json
Täägitud fail V6ru_R2pina_R2pina_id10213_1870a.json
Täägitud fail J2rva_Tyri_V22tsa_id16362_1885a.json
Täägitud fail J2rva_Tyri_S2revere_id5550_1881a.json
Täägitud fail Tartu_Kodavere_Alatskivi_id10860_1880a.json
Täägitud fail Harju_Juuru_Kaiu_id16280_1884a.json
Täägitud fail Tartu_Kodavere_Alatskivi_id1271_1865a.json
Täägitud fail L22ne_Reigi_K6rgessaare_id23306_1895a.json
Täägitud fail L22ne_Ridala_Sinalepa_id25437_1889a.json
Täägitud fail J2rva_Tyri_Tyri-Alliku_id3082_1900a.json
Täägitud fail V6ru_R6uge_Saaluse_id10962_1879a.json
Täägitud fail V6ru_Vastseliina_Misso_id22085_1883a.json
Täägitud fail Tartu_V6nnu_Ahja_id13957_1882a.json
Täägitud fail Harju_Kose_Palvere_id16729_1881a.json
Täägitud fail Viljandi_K6pu_Suure-K6pu_id4665_1883a.json
T

Täägitud fail Viljandi_K6pu_Suure-K6pu_id7180_1884a.json
Täägitud fail J2rva_Tyri_S2revere_id8880_1886a.json
Täägitud fail Tartu_V6nnu_Ahja_id21240_1889a.json
Täägitud fail Harju_Kose_Palvere_id21893_1886a.json
Täägitud fail Saare_P8ide_Laimjala_id5898_1914a.json
Täägitud fail V6ru_R2pina_Kahkva_id7463_1888a.json
Täägitud fail Tartu_V6nnu_Ahja_id14891_1882a.json
Täägitud fail Harju_Kose_Kose-Uuem6isa_id6353_1869a.json
Täägitud fail J2rva_Tyri_V22tsa_id17496_1888a.json
Täägitud fail P2rnu_P2rnu-Elisabethi_Sauga_id18745_1877a.json
Täägitud fail Tartu_V6nnu_Ahja_id18234_1886a.json
Täägitud fail Tartu_Torma_Avinurme_id20454_1871a.json
Täägitud fail Harju_Rapla_Rapla_id18671_1869a.json
Täägitud fail Tartu_V6nnu_Ahja_id18214_1886a.json
Täägitud fail Tartu_Laiuse_Kivij2rve_id1436_1856a.json
Täägitud fail Tartu_V6nnu_Ahja_id22561_1878a.json
Täägitud fail Tartu_Kodavere_Ranna_id14286_1858a.json
Täägitud fail P2rnu_Halliste_Penuja_id758_1885a.json
Täägitud fail Saare_Kihelkonna_Atla_id6893_1872a

In [8]:
with open("results.txt", "w+"):
    results_file.write(json.dumps(all_results))

<strong>Micro-average of precision</strong> = (TP1 + TP2) / (TP1 + TP2 + FP1 + FP2)<br>
<strong>Micro-average of recall</strong> = (TP1 + TP2) / (TP1 + TP2 + FN1 + FN2)

In [38]:
correct_all = 0
actual_all = 0
possible_all = 0

for i in ['1', '2', '3', '4', '5']:
    train = []
    for j in ['1', '2', '3', '4', '5']:
        if j == i:
            subdistribution_for_testing = j
        else:
            train.append(j)
    print(f'Testitav alamhulk oli {subdistribution_for_testing} ning treenitavad hulgad {train}:')
    correct = json[i][0]['ent_type']['correct']
    correct_all += correct
    actual = json[i][0]['ent_type']['actual']
    actual_all += actual
    possible = json[i][0]['ent_type']['possible']
    possible_all += possible
    precision = (correct / actual)
    recall = (correct / possible)
    f1 = 2 * ((precision * recall) / (precision + recall))
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F1-score: {f1}\n')


print('Tulemused üle alamhulkade:')
precision = correct_all / actual_all
recall = correct_all / possible_all
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-score: {2 * ((precision * recall) / (precision + recall))}')

Testitav alamhulk oli 1 ning treenitavad hulgad ['2', '3', '4', '5']:
Precision: 0.9511579912864022
Recall: 0.9176991150442478
F1-score: 0.9341290395225763

Testitav alamhulk oli 2 ning treenitavad hulgad ['1', '3', '4', '5']:
Precision: 0.9487296864271
Recall: 0.9176444542838167
F1-score: 0.9329282016655414

Testitav alamhulk oli 3 ning treenitavad hulgad ['1', '2', '4', '5']:
Precision: 0.9409860557768924
Recall: 0.9147906076010651
F1-score: 0.9277034491223762

Testitav alamhulk oli 4 ning treenitavad hulgad ['1', '2', '3', '5']:
Precision: 0.9315041836933255
Recall: 0.8892810700352963
F1-score: 0.9099030602547045

Testitav alamhulk oli 5 ning treenitavad hulgad ['1', '2', '3', '4']:
Precision: 0.9431321084864392
Recall: 0.9077894736842105
F1-score: 0.9251233640849603

Tulemused üle alamhulkade:
Precision: 0.9427350046756022
Recall: 0.9085876142654822
F1-score: 0.9253463875169369
