In [1]:
import json
import time
import os
import sklearn_crfsuite
import re
import nereval
import pandas as pd
from preprocessing_protocols import preprocess_text

from estnltk import Text
from estnltk.taggers import NerTagger
from estnltk.taggers import WordLevelNerTagger
from estnltk.converters import text_to_json
from estnltk.converters import json_to_text
from estnltk.layer_operations import flatten
from sklearn.metrics import classification_report
from sklearn_crfsuite import metrics

from nervaluate import Evaluator

from estnltk.taggers.estner.ner_trainer import NerTrainer
from estnltk.taggers.estner.model_storage_util import ModelStorageUtil
from estnltk.core import DEFAULT_PY3_NER_MODEL_DIR

In [51]:
def find(name, path):
    for root, dirs, files in os.walk(path):
        if name in files:
            return os.path.join(root, name)

These files don't work because the protocols are written in a different language, which the goldstandard didn't recognise, hence have no goldstandard tags.

In [50]:
files_not_working = ['J2rva_Tyri_V22tsa_id22177_1911a.json', \
                     'J2rva_Tyri_V22tsa_id18538_1894a.json', \
                     'J2rva_Tyri_V22tsa_id22155_1911a.json', \
                     'Saare_Kihelkonna_Kotlandi_id18845_1865a.json', \
                     'P2rnu_Halliste_Abja_id257_1844a.json', \
                     'Saare_Kaarma_Loona_id7575_1899a.json', \
                     'J2rva_Tyri_V22tsa_id22266_1913a.json', \
                     'J2rva_Tyri_V22tsa_id22178_1912a.json']

In [48]:
files = {}

with open('divided_corpus.txt', 'r', encoding = 'UTF-8') as f:
    txt = f.readlines()

for fileName in txt:
    file, subdistribution = fileName.split(":")
    files[file] = subdistribution.rstrip("\n")

In [5]:
all_results = {}

for subdistribution in [1, 2, 3, 4, 5]:
    training_subdistributions = []
    for y in [1, 2, 3, 4, 5]:
        if y == subdistribution:
            subdistribution_for_testing = y
        else:
            training_subdistributions.append(y)
    
    # Getting the filenames to be trained on from the files dictionary.
    filenames = {key: value for key, value in files.items() if int(value) in training_subdistributions}
    
    # Creating training_texts from the aforementioned filenames.
    print("Valmistan ette treenimistekste.")
    start = time.time()
    training_texts = []
    for filename in filenames:
        with open('./vallakohtufailid-json-flattened/' + str(filename), 'r', encoding='UTF-8') as file:
            if filename in files_not_working:
                continue
            else:
                training_texts.append(preprocess_text(json_to_text(file.read())))
    print(f"Treenimistekstid ette valmistatud {time.time() - start} sekundiga.")
    
    # Setting up the trainer and training.
    print("\n\nAlustan nertaggeri treenimist.")
    start = time.time()
    model_dir=DEFAULT_PY3_NER_MODEL_DIR
    modelUtil = ModelStorageUtil(model_dir)
    nersettings = modelUtil.load_settings()
    trainer = NerTrainer(nersettings)
    trainer.train( training_texts, layer='gold_wordner', model_dir='test' )
    print(f"NerTagger treenitud {time.time() - start} sekundiga.")
    
    # Setting up the new trained nertagger and defining layers to be removed later on.
    nertagger = NerTagger(model_dir = 'test')
    removed_layers = ['sentences', 'morph_analysis', 'compound_tokens', 'ner', 'words', 'tokens']
    
    # Tagging the files using the new nertagger.
    print("\n\nAlustan failide taggimist.")
    start = time.time()
    for file in {key: value for key, value in files.items() if int(value) == subdistribution_for_testing}:
        with open(find(file.replace(".json", ".txt"), "./vallakohtufailid/"), 'r', encoding='UTF-8') as f:
            text = f.read()
            if file == "Tartu_V6nnu_Ahja_id3502_1882a.txt":
                text = text.replace('..', '. .')
            text = preprocess_text(Text(text))
            nertagger.tag(text)
            text.add_layer(flatten(text['ner'], 'flat_ner'))

            for x in removed_layers:
                text.pop_layer(x)
            
            text_to_json(text, file=os.getcwd() + "/vallakohtufailid-trained-nertagger/" + file)
            print(f'Täägitud fail {file}')
    print(f"Failid taggitud {time.time() - start} sekundiga.")
    
    # Chaning the tags into a readable formats for the evaluator.
    print("\n\nAlustan tulemuste ammutamist.")

    #gold = []
    #test = []
    gold_ner = []
    test_ner = []

    for file in {key: value for key, value in files.items() if int(value) == subdistribution_for_testing}:
        appendable_gold_ner = []
        appendable_test_ner = []

        if file.endswith(".json"):
            if file in files_not_working:
                continue
            else:
                with open("./vallakohtufailid-trained-nertagger/" + str(file), 'r', encoding='UTF-8') as f_test, \
                    open("./vallakohtufailid-json-flattened/" + str(file), 'r', encoding='UTF-8') as f_gold:
                        test_import = json_to_text(f_test.read())
                        gold_import = json_to_text(f_gold.read())

                        # The commented part is needed for word-level-ner.
                        '''
                        for i in range(len(gold_import['flat_gold_wordner'])):
                            tag = gold_import['flat_gold_wordner'][i].nertag[0]
                            gold.append(tag)
                        for i in range(len(test_import['flat_wordner'])):
                            tag = test_import['flat_wordner'][i].nertag[0]
                            test.append(tag)
                        '''

                        for i in range(len(gold_import['gold_ner'])):
                            ner = gold_import['gold_ner'][i]
                            label = ner.nertag
                            start = int(ner.start)
                            end = int(ner.end)
                            appendable_gold_ner.append({"label": label, "start": start, "end": end})

                        for i in range(len(test_import['flat_ner'])):
                            ner = test_import['flat_ner'][i]
                            label = ner.nertag[0]
                            start = int(ner.start)
                            end = int(ner.end)
                            appendable_test_ner.append({"label": label, "start": start, "end": end})
                
            gold_ner.append(appendable_gold_ner)
            test_ner.append(appendable_test_ner)
        evaluator = Evaluator(gold_ner, test_ner, tags=['ORG', 'PER', 'MISC', 'LOC', 'LOC_ORG'])
        results, results_per_tag = evaluator.evaluate()
        all_results[subdistribution_for_testing] = (results, results_per_tag)
print("Programm on lõpetanud oma töö.")

Valmistan ette treenimistekste.
Treenimistekstid ette valmistatud 114.23665285110474 sekundiga.


Alustan nertaggeri treenimist.
Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 347971
Seconds required: 1.402

Stochastic Gradient Descent (SGD)
c2: 0.001000
max_iterations: 1000
period: 10
delta: 0.000001

Calibrating the learning rate (eta)
calibration.eta: 0.100000
calibration.rate: 2.000000
calibration.samples: 1000
calibration.candidates: 10
calibration.max_trials: 20
Initial loss: 30999.990087
Trial #1 (eta = 0.100000): 4317.052042
Trial #2 (eta = 0.200000): 7494.701077
Trial #3 (eta = 0.400000): 13784.670564
Trial #4 (eta = 0.800000): 27677.081044
Trial #5 (eta = 1.600000): ERROR: overflow loss
nan (worse)
Trial #6 (eta = 0.050000): 3281.998028
Trial #7 (eta = 0.025000): 3066.810447
Trial #8 (eta = 0.012500): 3285.958481
Trial #9 (eta = 0.00625

***** Epoch #36 *****
Loss: 910.604066
Improvement ratio: 0.366425
Feature L2-norm: 86.109141
Learning rate (eta): 0.024955
Total number of feature updates: 584748
Seconds required for this iteration: 0.504

***** Epoch #37 *****
Loss: 872.108045
Improvement ratio: 0.395578
Feature L2-norm: 86.785024
Learning rate (eta): 0.024954
Total number of feature updates: 600991
Seconds required for this iteration: 0.495

***** Epoch #38 *****
Loss: 845.057402
Improvement ratio: 0.369127
Feature L2-norm: 87.441311
Learning rate (eta): 0.024953
Total number of feature updates: 617234
Seconds required for this iteration: 0.501

***** Epoch #39 *****
Loss: 825.777719
Improvement ratio: 0.359622
Feature L2-norm: 88.083879
Learning rate (eta): 0.024951
Total number of feature updates: 633477
Seconds required for this iteration: 0.498

***** Epoch #40 *****
Loss: 800.623362
Improvement ratio: 0.351267
Feature L2-norm: 88.709391
Learning rate (eta): 0.024950
Total number of feature updates: 649720
Seco

***** Epoch #79 *****
Loss: 423.046636
Improvement ratio: 0.126279
Feature L2-norm: 105.550833
Learning rate (eta): 0.024902
Total number of feature updates: 1283197
Seconds required for this iteration: 0.505

***** Epoch #80 *****
Loss: 418.530228
Improvement ratio: 0.122443
Feature L2-norm: 105.863536
Learning rate (eta): 0.024900
Total number of feature updates: 1299440
Seconds required for this iteration: 0.513

***** Epoch #81 *****
Loss: 411.669602
Improvement ratio: 0.138382
Feature L2-norm: 106.171334
Learning rate (eta): 0.024899
Total number of feature updates: 1315683
Seconds required for this iteration: 0.520

***** Epoch #82 *****
Loss: 413.817761
Improvement ratio: 0.101972
Feature L2-norm: 106.475168
Learning rate (eta): 0.024898
Total number of feature updates: 1331926
Seconds required for this iteration: 0.532

***** Epoch #83 *****
Loss: 402.438225
Improvement ratio: 0.148405
Feature L2-norm: 106.773321
Learning rate (eta): 0.024897
Total number of feature updates: 13

***** Epoch #119 *****
Loss: 308.584893
Improvement ratio: 0.044525
Feature L2-norm: 115.661448
Learning rate (eta): 0.024852
Total number of feature updates: 1932917
Seconds required for this iteration: 0.491

***** Epoch #120 *****
Loss: 299.864994
Improvement ratio: 0.075837
Feature L2-norm: 115.867514
Learning rate (eta): 0.024851
Total number of feature updates: 1949160
Seconds required for this iteration: 0.490

***** Epoch #121 *****
Loss: 299.754030
Improvement ratio: 0.058795
Feature L2-norm: 116.070471
Learning rate (eta): 0.024850
Total number of feature updates: 1965403
Seconds required for this iteration: 0.505

***** Epoch #122 *****
Loss: 297.867382
Improvement ratio: 0.076807
Feature L2-norm: 116.271904
Learning rate (eta): 0.024848
Total number of feature updates: 1981646
Seconds required for this iteration: 0.490

***** Epoch #123 *****
Loss: 294.740437
Improvement ratio: 0.077056
Feature L2-norm: 116.473266
Learning rate (eta): 0.024847
Total number of feature update

***** Epoch #162 *****
Loss: 239.062353
Improvement ratio: 0.056616
Feature L2-norm: 123.237141
Learning rate (eta): 0.024799
Total number of feature updates: 2631366
Seconds required for this iteration: 0.497

***** Epoch #163 *****
Loss: 238.367905
Improvement ratio: 0.049804
Feature L2-norm: 123.388960
Learning rate (eta): 0.024798
Total number of feature updates: 2647609
Seconds required for this iteration: 0.505

***** Epoch #164 *****
Loss: 240.687896
Improvement ratio: 0.038210
Feature L2-norm: 123.538581
Learning rate (eta): 0.024797
Total number of feature updates: 2663852
Seconds required for this iteration: 0.515

***** Epoch #165 *****
Loss: 237.536621
Improvement ratio: 0.046055
Feature L2-norm: 123.688160
Learning rate (eta): 0.024795
Total number of feature updates: 2680095
Seconds required for this iteration: 0.526

***** Epoch #166 *****
Loss: 237.840522
Improvement ratio: 0.040428
Feature L2-norm: 123.835297
Learning rate (eta): 0.024794
Total number of feature update

***** Epoch #202 *****
Loss: 200.425571
Improvement ratio: 0.077815
Feature L2-norm: 128.638416
Learning rate (eta): 0.024750
Total number of feature updates: 3281086
Seconds required for this iteration: 0.498

***** Epoch #203 *****
Loss: 213.264685
Improvement ratio: -0.006155
Feature L2-norm: 128.759244
Learning rate (eta): 0.024749
Total number of feature updates: 3297329
Seconds required for this iteration: 0.496

SGD terminated with the stopping criteria
Loss: 200.425571
Total seconds required for training: 104.750

Storing the model
Number of active features: 347971 (347971)
Number of active attributes: 314289 (314289)
Number of active labels: 11 (11)
Writing labels
Writing attributes
Writing feature references for transitions
Writing feature references for attributes
Seconds required: 0.588

NerTagger treenitud 873.1751489639282 sekundiga.


Alustan failide taggimist.
Täägitud fail V6ru_R2pina_Kahkva_id24674_1868a.json
Täägitud fail L22ne_Martna_Martna_id14205_1869a.json
Täägit

Täägitud fail Tartu_V6nnu_Kiidj2rve_id25125_1870a.json
Täägitud fail L22ne_Reigi_K6rgessaare_id22613_1892a.json
Täägitud fail J2rva_Peetri_V2ike-Kareda_id19114_1867a.json
Täägitud fail Harju_Hageri_Kohila_id22158_1890a.json
Täägitud fail P2rnu_P2rnu-Elisabethi_Sauga_id17814_1868a.json
Täägitud fail Harju_Kose_Palvere_id561_1867a.json
Täägitud fail V6ru_R2pina_R2pina_id21267_1867a.json
Täägitud fail J2rva_Tyri_V22tsa_id20541_1902a.json
Täägitud fail Tartu_V6nnu_Ahja_id21768_1867a.json
Täägitud fail Tartu_V6nnu_Ahja_id20314_1888a.json
Täägitud fail Tartu_R6ngu_Aakre_id4282_1888a.json
Täägitud fail J2rva_Tyri_S2revere_id6796_1883a.json
Täägitud fail V6ru_R2pina_R2pina_id10711_1868a.json
Täägitud fail Harju_Kose_Palvere_id23525_1887a.json
Täägitud fail Tartu_R6ngu_Aakre_id13836_1829a.json
Täägitud fail P2rnu_Halliste_Penuja_id657_1885a.json
Täägitud fail Tartu_Kodavere_Alatskivi_id11390_1880a.json
Täägitud fail V6ru_Vastseliina_Misso_id11633_1886a.json
Täägitud fail Viljandi_P6ltsamaa_Adav

***** Epoch #8 *****
Loss: 3910.701971
Feature L2-norm: 52.024728
Learning rate (eta): 0.024990
Total number of feature updates: 129608
Seconds required for this iteration: 0.509

***** Epoch #9 *****
Loss: 3538.434502
Feature L2-norm: 54.363536
Learning rate (eta): 0.024989
Total number of feature updates: 145809
Seconds required for this iteration: 0.507

***** Epoch #10 *****
Loss: 3202.007048
Feature L2-norm: 56.489410
Learning rate (eta): 0.024988
Total number of feature updates: 162010
Seconds required for this iteration: 0.507

***** Epoch #11 *****
Loss: 2948.430130
Improvement ratio: 5.616339
Feature L2-norm: 58.459768
Learning rate (eta): 0.024986
Total number of feature updates: 178211
Seconds required for this iteration: 0.508

***** Epoch #12 *****
Loss: 2734.105424
Improvement ratio: 2.877791
Feature L2-norm: 60.333891
Learning rate (eta): 0.024985
Total number of feature updates: 194412
Seconds required for this iteration: 0.506

***** Epoch #13 *****
Loss: 2528.767787
I

***** Epoch #54 *****
Loss: 621.005193
Improvement ratio: 0.200923
Feature L2-norm: 96.242393
Learning rate (eta): 0.024933
Total number of feature updates: 874854
Seconds required for this iteration: 0.510

***** Epoch #55 *****
Loss: 609.892639
Improvement ratio: 0.216895
Feature L2-norm: 96.699454
Learning rate (eta): 0.024931
Total number of feature updates: 891055
Seconds required for this iteration: 0.507

***** Epoch #56 *****
Loss: 598.876859
Improvement ratio: 0.201252
Feature L2-norm: 97.149345
Learning rate (eta): 0.024930
Total number of feature updates: 907256
Seconds required for this iteration: 0.514

***** Epoch #57 *****
Loss: 582.900638
Improvement ratio: 0.208251
Feature L2-norm: 97.589516
Learning rate (eta): 0.024929
Total number of feature updates: 923457
Seconds required for this iteration: 0.508

***** Epoch #58 *****
Loss: 579.782821
Improvement ratio: 0.188766
Feature L2-norm: 98.024467
Learning rate (eta): 0.024928
Total number of feature updates: 939658
Seco

***** Epoch #96 *****
Loss: 373.640522
Improvement ratio: 0.076360
Feature L2-norm: 110.578113
Learning rate (eta): 0.024881
Total number of feature updates: 1555296
Seconds required for this iteration: 0.509

***** Epoch #97 *****
Loss: 372.238031
Improvement ratio: 0.077951
Feature L2-norm: 110.835087
Learning rate (eta): 0.024879
Total number of feature updates: 1571497
Seconds required for this iteration: 0.507

***** Epoch #98 *****
Loss: 368.885123
Improvement ratio: 0.088869
Feature L2-norm: 111.090256
Learning rate (eta): 0.024878
Total number of feature updates: 1587698
Seconds required for this iteration: 0.513

***** Epoch #99 *****
Loss: 364.438005
Improvement ratio: 0.091863
Feature L2-norm: 111.342338
Learning rate (eta): 0.024877
Total number of feature updates: 1603899
Seconds required for this iteration: 0.507

***** Epoch #100 *****
Loss: 362.951687
Improvement ratio: 0.089669
Feature L2-norm: 111.591288
Learning rate (eta): 0.024876
Total number of feature updates: 1

***** Epoch #135 *****
Loss: 286.928881
Improvement ratio: 0.073310
Feature L2-norm: 119.045169
Learning rate (eta): 0.024832
Total number of feature updates: 2187135
Seconds required for this iteration: 0.511

***** Epoch #136 *****
Loss: 277.943132
Improvement ratio: 0.076640
Feature L2-norm: 119.228085
Learning rate (eta): 0.024831
Total number of feature updates: 2203336
Seconds required for this iteration: 0.506

***** Epoch #137 *****
Loss: 293.403842
Improvement ratio: 0.020004
Feature L2-norm: 119.408625
Learning rate (eta): 0.024830
Total number of feature updates: 2219537
Seconds required for this iteration: 0.511

***** Epoch #138 *****
Loss: 281.047146
Improvement ratio: 0.058640
Feature L2-norm: 119.589154
Learning rate (eta): 0.024829
Total number of feature updates: 2235738
Seconds required for this iteration: 0.506

***** Epoch #139 *****
Loss: 274.498419
Improvement ratio: 0.087067
Feature L2-norm: 119.767108
Learning rate (eta): 0.024827
Total number of feature update

***** Epoch #177 *****
Loss: 237.658820
Improvement ratio: 0.035803
Feature L2-norm: 125.750570
Learning rate (eta): 0.024781
Total number of feature updates: 2867577
Seconds required for this iteration: 0.508

***** Epoch #178 *****
Loss: 236.185826
Improvement ratio: 0.043232
Feature L2-norm: 125.890488
Learning rate (eta): 0.024779
Total number of feature updates: 2883778
Seconds required for this iteration: 0.511

***** Epoch #179 *****
Loss: 232.864976
Improvement ratio: 0.056136
Feature L2-norm: 126.029110
Learning rate (eta): 0.024778
Total number of feature updates: 2899979
Seconds required for this iteration: 0.512

***** Epoch #180 *****
Loss: 236.212862
Improvement ratio: 0.028353
Feature L2-norm: 126.166226
Learning rate (eta): 0.024777
Total number of feature updates: 2916180
Seconds required for this iteration: 0.507

***** Epoch #181 *****
Loss: 233.535448
Improvement ratio: 0.044610
Feature L2-norm: 126.302909
Learning rate (eta): 0.024776
Total number of feature update

Täägitud fail Tartu_V6nnu_Ahja_id17984_1885a.json
Täägitud fail Tartu_Kodavere_Ranna_id14138_1855a.json
Täägitud fail L22ne_Kullamaa_Sooniste_id3541_1880a.json
Täägitud fail J2rva_Tyri_Tyri-Alliku_id2315_1897a.json
Täägitud fail J2rva_Tyri_S2revere_id11683_1874a.json
Täägitud fail Saare_Kaarma_Loona_id7575_1899a.json
Täägitud fail V6ru_P6lva_K2hri_id21590_1851a.json
Täägitud fail Tartu_V6nnu_Ahja_id16351_1884a.json
Täägitud fail Tartu_V6nnu_Ahja_id11361_1872a.json
Täägitud fail Tartu_V6nnu_Ahja_id16121_1883a.json
Täägitud fail Tartu_V6nnu_Ahja_id21444_1866a.json
Täägitud fail J2rva_Tyri_S2revere_id14702_1887a.json
Täägitud fail L22ne_Martna_Martna_id12705_1885a.json
Täägitud fail Tartu_Torma_Avinurme_id6291_1861a.json
Täägitud fail Harju_Kose_Palvere_id16297_1881a.json
Täägitud fail P2rnu_P2rnu-Elisabethi_Sauga_id18115_1877a.json
Täägitud fail Tartu_Kambja_Vana-Prangli_id19091_1909a.json
Täägitud fail Tartu_V6nnu_Ahja_id14900_1882a.json
Täägitud fail Tartu_V6nnu_Ahja_id19074_1887a.json

Täägitud fail Tartu_R6ngu_Aakre_id8042_1827a.json
Täägitud fail Tartu_V6nnu_Ahja_id13144_1876a.json
Täägitud fail Tartu_V6nnu_Ahja_id17542_1885a.json
Täägitud fail Tartu_Kodavere_Pala_id17298_1857a.json
Täägitud fail Tartu_Otep22_Pyhaj2rve_id1642_1884a.json
Täägitud fail Tartu_Kodavere_Alatskivi_id14538_1866a.json
Täägitud fail Tartu_Torma_Avinurme_id24645_1823a.json
Täägitud fail Harju_Keila_Keila_id11680_1886a.json
Täägitud fail Tartu_V6nnu_Ahja_id17059_1884a.json
Täägitud fail Harju_Hageri_Kohila_id10480_1870a.json
Täägitud fail V6ru_Kanepi_Krootuse_id24518_1885a.json
Täägitud fail Harju_Kose_Triigi_id11470_1871a.json
Täägitud fail Tartu_V6nnu_Ahja_id15395_1883a.json
Täägitud fail Tartu_V6nnu_Ahja_id15584_1883a.json
Täägitud fail Harju_J6el2htme_J6el2htme_id9507_1889a.json
Täägitud fail Tartu_V6nnu_Ahja_id12372_1874a.json
Täägitud fail Harju_Kose_Triigi_id11552_1871a.json
Täägitud fail L22ne_Kullamaa_Kuij6e_id15386_1874a.json
Täägitud fail Harju_Hageri_Kohila_id4177_1883a.json
Täägi

***** Epoch #22 *****
Loss: 758.588633
Improvement ratio: 0.925725
Feature L2-norm: 93.879125
Learning rate (eta): 0.049890
Total number of feature updates: 355982
Seconds required for this iteration: 0.617

***** Epoch #23 *****
Loss: 719.921441
Improvement ratio: 0.849792
Feature L2-norm: 94.955260
Learning rate (eta): 0.049885
Total number of feature updates: 372163
Seconds required for this iteration: 0.619

***** Epoch #24 *****
Loss: 692.191174
Improvement ratio: 0.762637
Feature L2-norm: 95.987080
Learning rate (eta): 0.049880
Total number of feature updates: 388344
Seconds required for this iteration: 0.622

***** Epoch #25 *****
Loss: 666.303962
Improvement ratio: 0.722529
Feature L2-norm: 96.978796
Learning rate (eta): 0.049875
Total number of feature updates: 404525
Seconds required for this iteration: 0.621

***** Epoch #26 *****
Loss: 650.347865
Improvement ratio: 0.625819
Feature L2-norm: 97.916365
Learning rate (eta): 0.049870
Total number of feature updates: 420706
Seco

***** Epoch #62 *****
Loss: 297.290890
Improvement ratio: 0.196477
Feature L2-norm: 118.867445
Learning rate (eta): 0.049692
Total number of feature updates: 1003222
Seconds required for this iteration: 0.637

***** Epoch #63 *****
Loss: 308.223050
Improvement ratio: 0.131105
Feature L2-norm: 119.247994
Learning rate (eta): 0.049687
Total number of feature updates: 1019403
Seconds required for this iteration: 0.615

***** Epoch #64 *****
Loss: 306.944368
Improvement ratio: 0.105537
Feature L2-norm: 119.623529
Learning rate (eta): 0.049682
Total number of feature updates: 1035584
Seconds required for this iteration: 0.619

***** Epoch #65 *****
Loss: 289.695812
Improvement ratio: 0.186610
Feature L2-norm: 120.003652
Learning rate (eta): 0.049677
Total number of feature updates: 1051765
Seconds required for this iteration: 0.616

***** Epoch #66 *****
Loss: 294.427639
Improvement ratio: 0.103483
Feature L2-norm: 120.378313
Learning rate (eta): 0.049672
Total number of feature updates: 10

Storing the model
Number of active features: 356474 (356474)
Number of active attributes: 322491 (322491)
Number of active labels: 11 (11)
Writing labels
Writing attributes
Writing feature references for transitions
Writing feature references for attributes
Seconds required: 0.699

NerTagger treenitud 879.1425447463989 sekundiga.


Alustan failide taggimist.
Täägitud fail P2rnu_Tori_Sindi_id20212_1838a.json
Täägitud fail Viljandi_K6pu_Suure-K6pu_id12190_1884a.json
Täägitud fail Tartu_V6nnu_Ahja_id22375_1869a.json
Täägitud fail Tartu_Torma_Avinurme_id17128_1871a.json
Täägitud fail J2rva_Tyri_S2revere_id16142_1889a.json
Täägitud fail P2rnu_Audru_V6lla_id2931_1878a.json
Täägitud fail V6ru_R2pina_R2pina_id5391_1912a.json
Täägitud fail Tartu_V6nnu_Ahja_id22887_1887a.json
Täägitud fail V6ru_Kanepi_Krootuse_id25466_1885a.json
Täägitud fail Tartu_V6nnu_Kiidj2rve_id24772_1867a.json
Täägitud fail J2rva_Tyri_S2revere_id8223_1885a.json
Täägitud fail Tartu_Laiuse_Kivij2rve_id5885_1864a.json
Täägitu

Täägitud fail Tartu_Kodavere_Pala_id21385_1869a.json
Täägitud fail V6ru_Vastseliina_Misso_id22118_1883a.json
Täägitud fail Viljandi_Pilistvere_K6o_id25231_1843a.json
Täägitud fail Harju_J6el2htme_J6el2htme_id7659_1870a.json
Täägitud fail P2rnu_P2rnu-Jaagupi_Soosalu_id14278_1868a.json
Täägitud fail Harju_Hageri_Kohila_id3017_1884a.json
Täägitud fail L22ne_Kullamaa_Kuij6e_id15112_1868a.json
Täägitud fail V6ru_R2pina_R2pina_id1170_1863a.json
Täägitud fail Harju_Hageri_Kohila_id4902_1887a.json
Täägitud fail J2rva_Tyri_Kirna_id23198_1871a.json
Täägitud fail Tartu_Otep22_Pyhaj2rve_id1480_1884a.json
Täägitud fail L22ne_Kullamaa_Kuij6e_id15113_1868a.json
Täägitud fail Harju_Juuru_Kaiu_id17271_1903a.json
Täägitud fail J2rva_Tyri_S2revere_id15373_1888a.json
Täägitud fail Tartu_V6nnu_Ahja_id21776_1868a.json
Täägitud fail Tartu_V6nnu_Ahja_id14675_1882a.json
Täägitud fail Tartu_V6nnu_Ahja_id21777_1868a.json
Täägitud fail Harju_Rapla_Rapla_id17272_1868a.json
Täägitud fail Harju_Kuusalu_Kolga_id11722

***** Epoch #10 *****
Loss: 1678.296915
Feature L2-norm: 73.399340
Learning rate (eta): 0.049950
Total number of feature updates: 153450
Seconds required for this iteration: 0.550

***** Epoch #11 *****
Loss: 1486.809041
Improvement ratio: 11.484025
Feature L2-norm: 75.520653
Learning rate (eta): 0.049945
Total number of feature updates: 168795
Seconds required for this iteration: 0.545

***** Epoch #12 *****
Loss: 1345.556797
Improvement ratio: 5.167262
Feature L2-norm: 77.462608
Learning rate (eta): 0.049940
Total number of feature updates: 184140
Seconds required for this iteration: 0.549

***** Epoch #13 *****
Loss: 1239.339295
Improvement ratio: 3.623297
Feature L2-norm: 79.284892
Learning rate (eta): 0.049935
Total number of feature updates: 199485
Seconds required for this iteration: 0.537

***** Epoch #14 *****
Loss: 1142.263123
Improvement ratio: 2.888465
Feature L2-norm: 80.962589
Learning rate (eta): 0.049930
Total number of feature updates: 214830
Seconds required for this 

***** Epoch #51 *****
Loss: 331.318288
Improvement ratio: 0.199769
Feature L2-norm: 111.041518
Learning rate (eta): 0.049746
Total number of feature updates: 782595
Seconds required for this iteration: 0.521

***** Epoch #52 *****
Loss: 328.014263
Improvement ratio: 0.187948
Feature L2-norm: 111.492971
Learning rate (eta): 0.049741
Total number of feature updates: 797940
Seconds required for this iteration: 0.536

***** Epoch #53 *****
Loss: 320.956114
Improvement ratio: 0.181872
Feature L2-norm: 111.937451
Learning rate (eta): 0.049736
Total number of feature updates: 813285
Seconds required for this iteration: 0.551

***** Epoch #54 *****
Loss: 315.188256
Improvement ratio: 0.184361
Feature L2-norm: 112.372662
Learning rate (eta): 0.049731
Total number of feature updates: 828630
Seconds required for this iteration: 0.544

***** Epoch #55 *****
Loss: 310.557868
Improvement ratio: 0.171691
Feature L2-norm: 112.800111
Learning rate (eta): 0.049727
Total number of feature updates: 843975

***** Epoch #95 *****
Loss: 210.630526
Improvement ratio: 0.078858
Feature L2-norm: 125.532651
Learning rate (eta): 0.049529
Total number of feature updates: 1457775
Seconds required for this iteration: 0.522

***** Epoch #96 *****
Loss: 210.029630
Improvement ratio: 0.069325
Feature L2-norm: 125.775581
Learning rate (eta): 0.049525
Total number of feature updates: 1473120
Seconds required for this iteration: 0.521

***** Epoch #97 *****
Loss: 210.620720
Improvement ratio: 0.063373
Feature L2-norm: 126.014819
Learning rate (eta): 0.049520
Total number of feature updates: 1488465
Seconds required for this iteration: 0.527

***** Epoch #98 *****
Loss: 207.588849
Improvement ratio: 0.080937
Feature L2-norm: 126.254858
Learning rate (eta): 0.049515
Total number of feature updates: 1503810
Seconds required for this iteration: 0.521

***** Epoch #99 *****
Loss: 205.916817
Improvement ratio: 0.072708
Feature L2-norm: 126.489456
Learning rate (eta): 0.049510
Total number of feature updates: 15

***** Epoch #134 *****
Loss: 172.919387
Improvement ratio: 0.050666
Feature L2-norm: 133.539324
Learning rate (eta): 0.049339
Total number of feature updates: 2056230
Seconds required for this iteration: 0.519

***** Epoch #135 *****
Loss: 173.401333
Improvement ratio: 0.045704
Feature L2-norm: 133.710680
Learning rate (eta): 0.049334
Total number of feature updates: 2071575
Seconds required for this iteration: 0.519

***** Epoch #136 *****
Loss: 172.798538
Improvement ratio: 0.048161
Feature L2-norm: 133.881759
Learning rate (eta): 0.049329
Total number of feature updates: 2086920
Seconds required for this iteration: 0.528

***** Epoch #137 *****
Loss: 171.947241
Improvement ratio: 0.031444
Feature L2-norm: 134.051440
Learning rate (eta): 0.049324
Total number of feature updates: 2102265
Seconds required for this iteration: 0.522

***** Epoch #138 *****
Loss: 171.263749
Improvement ratio: 0.034802
Feature L2-norm: 134.221266
Learning rate (eta): 0.049319
Total number of feature update

***** Epoch #174 *****
Loss: 154.093259
Improvement ratio: 0.013447
Feature L2-norm: 139.575932
Learning rate (eta): 0.049145
Total number of feature updates: 2670030
Seconds required for this iteration: 0.526

***** Epoch #175 *****
Loss: 151.648463
Improvement ratio: 0.045121
Feature L2-norm: 139.707522
Learning rate (eta): 0.049140
Total number of feature updates: 2685375
Seconds required for this iteration: 0.519

***** Epoch #176 *****
Loss: 149.970384
Improvement ratio: 0.029313
Feature L2-norm: 139.839922
Learning rate (eta): 0.049135
Total number of feature updates: 2700720
Seconds required for this iteration: 0.520

***** Epoch #177 *****
Loss: 153.663515
Improvement ratio: 0.007148
Feature L2-norm: 139.969335
Learning rate (eta): 0.049130
Total number of feature updates: 2716065
Seconds required for this iteration: 0.520

***** Epoch #178 *****
Loss: 151.579534
Improvement ratio: 0.023599
Feature L2-norm: 140.099125
Learning rate (eta): 0.049126
Total number of feature update

Täägitud fail Harju_Hageri_Kohila_id24217_1875a.json
Täägitud fail Tartu_V6nnu_Ahja_id17527_1885a.json
Täägitud fail P2rnu_Tori_Sindi_id11974_1885a.json
Täägitud fail Viljandi_P6ltsamaa_Adavere_id17828_1894a.json
Täägitud fail Tartu_Kodavere_Alatskivi_id14506_1876a.json
Täägitud fail Harju_Juuru_Juuru_id19472_1887a.json
Täägitud fail Harju_Kose_Palvere_id18184_1882a.json
Täägitud fail Tartu_Laiuse_Kivij2rve_id4917_1863a.json
Täägitud fail J2rva_Peetri_V2ike-Kareda_id21217_1875a.json
Täägitud fail J2rva_Tyri_S2revere_id14565_1886a.json
Täägitud fail V6ru_R2pina_R2pina_id10635_1868a.json
Täägitud fail Tartu_Kodavere_Alatskivi_id14511_1876a.json
Täägitud fail Tartu_Kodavere_Pala_id22058_1871a.json
Täägitud fail V6ru_Kanepi_Krootuse_id25390_1885a.json
Täägitud fail Tartu_V6nnu_Ahja_id14012_1882a.json
Täägitud fail Tartu_Kodavere_Alatskivi_id22167_1881a.json
Täägitud fail Tartu_Kodavere_Pala_id22543_1871a.json
Täägitud fail Viljandi_Paistu_Holstre_id9042_1836a.json
Täägitud fail Tartu_Kodav

Täägitud fail J2rva_Tyri_V22tsa_id16408_1885a.json
Täägitud fail J2rva_Tyri_Tyri-Alliku_id3994_1903a.json
Täägitud fail P2rnu_Audru_V6lla_id5150_1877a.json
Täägitud fail V6ru_R2pina_Kahkva_id8829_1888a.json
Täägitud fail L22ne_Karuse_Saastna_id22146_1867a.json
Täägitud fail Tartu_V6nnu_Ahja_id22857_1886a.json
Täägitud fail Tartu_V6nnu_Ahja_id12159_1873a.json
Täägitud fail Tartu_Torma_Avinurme_id5061_1860a.json
Täägitud fail Tartu_R6ngu_Aakre_id6659_1826a.json
Täägitud fail Tartu_V6nnu_Kiidj2rve_id24500_1866a.json
Täägitud fail Viru_Rakvere_Rakvere_id5143_1871a.json
Täägitud fail Tartu_V6nnu_Ahja_id17913_1885a.json
Täägitud fail Tartu_Kodavere_Pala_id23275_1872a.json
Täägitud fail L22ne_Reigi_K6rgessaare_id23087_1894a.json
Täägitud fail Tartu_N6o_Aru_id5374_1890a.json
Täägitud fail Harju_Juuru_Juuru_id20228_1868a.json
Täägitud fail Tartu_V6nnu_Rasina_id13313_1865a.json
Täägitud fail Tartu_V6nnu_Ahja_id15819_1883a.json
Täägitud fail Tartu_V6nnu_Ahja_id13002_1875a.json
Täägitud fail Tartu

***** Epoch #25 *****
Loss: 656.994614
Improvement ratio: 0.723579
Feature L2-norm: 96.389820
Learning rate (eta): 0.049875
Total number of feature updates: 390350
Seconds required for this iteration: 0.598

***** Epoch #26 *****
Loss: 647.638765
Improvement ratio: 0.613312
Feature L2-norm: 97.332915
Learning rate (eta): 0.049870
Total number of feature updates: 405964
Seconds required for this iteration: 0.604

***** Epoch #27 *****
Loss: 620.109720
Improvement ratio: 0.616911
Feature L2-norm: 98.253606
Learning rate (eta): 0.049865
Total number of feature updates: 421578
Seconds required for this iteration: 0.602

***** Epoch #28 *****
Loss: 595.886809
Improvement ratio: 0.574674
Feature L2-norm: 99.121473
Learning rate (eta): 0.049860
Total number of feature updates: 437192
Seconds required for this iteration: 0.600

***** Epoch #29 *****
Loss: 581.522826
Improvement ratio: 0.487664
Feature L2-norm: 99.962799
Learning rate (eta): 0.049855
Total number of feature updates: 452806
Seco

***** Epoch #69 *****
Loss: 273.931712
Improvement ratio: 0.116081
Feature L2-norm: 120.754217
Learning rate (eta): 0.049657
Total number of feature updates: 1077366
Seconds required for this iteration: 0.616

***** Epoch #70 *****
Loss: 269.671840
Improvement ratio: 0.243345
Feature L2-norm: 121.095540
Learning rate (eta): 0.049652
Total number of feature updates: 1092980
Seconds required for this iteration: 0.650

***** Epoch #71 *****
Loss: 296.260628
Improvement ratio: 0.002884
Feature L2-norm: 121.438184
Learning rate (eta): 0.049648
Total number of feature updates: 1108594
Seconds required for this iteration: 0.653

***** Epoch #72 *****
Loss: 276.279478
Improvement ratio: 0.062100
Feature L2-norm: 121.771645
Learning rate (eta): 0.049643
Total number of feature updates: 1124208
Seconds required for this iteration: 0.633

***** Epoch #73 *****
Loss: 281.747648
Improvement ratio: 0.088412
Feature L2-norm: 122.099274
Learning rate (eta): 0.049638
Total number of feature updates: 11

Täägitud fail Harju_Juuru_Juuru_id20490_1869a.json
Täägitud fail P2rnu_Halliste_Penuja_id416_1885a.json
Täägitud fail Tartu_Kodavere_Alatskivi_id2041_1878a.json
Täägitud fail L22ne_Ridala_Sinalepa_id24333_1882a.json
Täägitud fail Harju_Kose_Kose-Uuem6isa_id5292_1869a.json
Täägitud fail V6ru_R6uge_Leevi_id24491_1875a.json
Täägitud fail Harju_Rapla_Rapla_id19152_1870a.json
Täägitud fail L22ne_Pyhalepa_Kassari_id23159_1867a.json
Täägitud fail Harju_Kose_Palvere_id13236_1879a.json
Täägitud fail Tartu_Kodavere_Pala_id20761_1867a.json
Täägitud fail J2rva_Peetri_V2ike-Kareda_id20031_1872a.json
Täägitud fail Tartu_V6nnu_Ahja_id9655_1871a.json
Täägitud fail Tartu_Kodavere_Pala_id21194_1868a.json
Täägitud fail Harju_J6el2htme_J6el2htme_id8154_1888a.json
Täägitud fail Tartu_V6nnu_Ahja_id15090_1883a.json
Täägitud fail Harju_Kose_Palvere_id24675_1872a.json
Täägitud fail L22ne_Kullamaa_Piirsalu_id7871_1890a.json
Täägitud fail Tartu_Otep22_Pyhaj2rve_id4314_1885a.json
Täägitud fail Tartu_V6nnu_Ahja_id

In [57]:
gold_ner = []
test_ner = []

for subdistribution in [1, 2, 3, 4, 5]:
    training_subdistributions = []
    for y in [1, 2, 3, 4, 5]:
        if y == subdistribution:
            subdistribution_for_testing = y
        else:
            training_subdistributions.append(y)
    

    for file in {key: value for key, value in files.items() if int(value) == subdistribution_for_testing}:
        appendable_gold_ner = []
        appendable_test_ner = []

        if file.endswith(".json"):
            if file in files_not_working:
                continue
            else:
                with open("./vallakohtufailid-trained-nertagger/" + str(file), 'r', encoding='UTF-8') as f_test, \
                    open("./vallakohtufailid-json-flattened/" + str(file), 'r', encoding='UTF-8') as f_gold:
                        test_import = json_to_text(f_test.read())
                        gold_import = json_to_text(f_gold.read())

                        # The commented part is needed for word-level-ner.
                        '''
                        for i in range(len(gold_import['flat_gold_wordner'])):
                            tag = gold_import['flat_gold_wordner'][i].nertag[0]
                            gold.append(tag)
                        for i in range(len(test_import['flat_wordner'])):
                            tag = test_import['flat_wordner'][i].nertag[0]
                            test.append(tag)
                        '''

                        for i in range(len(gold_import['gold_ner'])):
                            ner = gold_import['gold_ner'][i]
                            label = ner.nertag
                            start = int(ner.start)
                            end = int(ner.end)
                            appendable_gold_ner.append({"label": label, "start": start, "end": end})

                        for i in range(len(test_import['flat_ner'])):
                            ner = test_import['flat_ner'][i]
                            label = ner.nertag[0]
                            start = int(ner.start)
                            end = int(ner.end)
                            appendable_test_ner.append({"label": label, "start": start, "end": end})
                
            gold_ner.append(appendable_gold_ner)
            test_ner.append(appendable_test_ner)
print("Programm on lõpetanud oma töö.")

Programm on lõpetanud oma töö.


In [52]:
with open("results_new.txt", "w+") as results_file:
    results_file.write(json.dumps(all_results))

In [3]:
with open("results_new.txt", "r") as f:
    results_json = json.loads(f.read())

### Tulemused alamhulkade kaupa:

In [43]:
correct_all = 0
actual_all = 0
possible_all = 0
df = dict()

for i in ['1', '2', '3', '4', '5']:
    train = []
    for j in ['1', '2', '3', '4', '5']:
        if j == i:
            subdistribution_for_testing = j
        else:
            train.append(j)
    correct = results_json[i][0]['strict']['correct']
    correct_all += correct
    actual = results_json[i][0]['strict']['actual']
    actual_all += actual
    possible = results_json[i][0]['strict']['possible']
    possible_all += possible
    precision = (correct / actual)
    recall = (correct / possible)
    f1 = 2 * ((precision * recall) / (precision + recall))
    df[str(subdistribution_for_testing)] = [precision, recall, f1]

precision = correct_all / actual_all
recall = correct_all / possible_all
f1 = 2 * ((precision * recall) / (precision + recall))
df["Total"] = [precision, recall, f1]

dataframe = pd.DataFrame(df, index=["Precision", "Recall", "F1-score"])
dataframe.columns.name = "Alamhulk"
display(dataframe)

Alamhulk,1,2,3,4,5,Total
Precision,0.911927,0.914806,0.905153,0.882717,0.911048,0.904403
Recall,0.879646,0.889086,0.880174,0.847297,0.881895,0.874555
F1-score,0.895495,0.901763,0.892489,0.864645,0.896234,0.889228


### Tulemused nimeüksuste liigi kaupa:

In [46]:
df = dict()
for i in ['1', '2', '3', '4', '5']:
    train = []
    by_kind = dict()
    for j in ['1', '2', '3', '4', '5']:
        if j == i:
            subdistribution_for_testing = j
        else:
            train.append(j)
            
    for key in list(results_json[i][1].keys()):
        correct = results_json[i][1][str(key)]['strict']['correct']
        actual = results_json[i][1][str(key)]['strict']['actual']
        possible = results_json[i][1][str(key)]['strict']['possible']
        precision = (correct / actual)
        recall = (correct / possible)
        f1 = 2 * ((precision * recall) / (precision + recall))
        by_kind[str(key) + "_precision"] = precision
        by_kind[str(key) + "_recall"] = recall
        by_kind[str(key) + "_f1score"] = f1
    df[str(subdistribution_for_testing)] = by_kind

display(pd.DataFrame(df))

Unnamed: 0,1,2,3,4,5
ORG_precision,0.777778,0.777778,0.848485,0.764706,0.728395
ORG_recall,0.711864,0.742424,0.8,0.712329,0.75641
ORG_f1score,0.743363,0.75969,0.823529,0.737589,0.742138
PER_precision,0.939442,0.944829,0.939235,0.908825,0.946711
PER_recall,0.927235,0.92801,0.928118,0.905157,0.938812
PER_f1score,0.933298,0.936344,0.933643,0.906987,0.942745
MISC_precision,0.7,0.868421,0.74359,0.756757,0.631579
MISC_recall,0.512195,0.647059,0.690476,0.8,0.6
MISC_f1score,0.591549,0.741573,0.716049,0.777778,0.615385
LOC_precision,0.533981,0.650602,0.641667,0.556391,0.666667


### Confusion matrix

In [135]:
uus_gold_ner = []
uus_test_ner = []

for i in range(len(gold_ner)):
    for j in range(len(test_ner[i])):
        element_test = test_ner[i][j]
        for element_gold in gold_ner[i]:
            if element_test['start'] == element_gold['start'] and element_test['end'] == element_gold['end']:
                uus_gold_ner.append(element_gold)
                uus_test_ner.append(element_test)

In [136]:
y_true = pd.Series([x['label'] for x in uus_gold_ner], name="Actual")
y_pred = pd.Series([x['label'] for x in uus_test_ner], name="Predicted")

In [137]:
pd.crosstab(y_true, y_pred)

Predicted,LOC,LOC_ORG,MISC,ORG,PER
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LOC,406,58,0,0,43
LOC_ORG,58,1501,2,1,87
MISC,2,2,135,0,10
ORG,0,4,0,258,0
PER,7,23,1,0,18078
