In [7]:
import json
import time
import os
import sklearn_crfsuite
import re
import nereval
import pandas as pd
from preprocessing_protocols import preprocess_text

from estnltk import Text
from estnltk.taggers import NerTagger
from estnltk.taggers import WordLevelNerTagger
from estnltk.converters import text_to_json
from estnltk.converters import json_to_text
from estnltk.layer_operations import flatten
from sklearn.metrics import classification_report
from sklearn_crfsuite import metrics

from nervaluate import Evaluator

from estnltk.taggers.estner.ner_trainer import NerTrainer
from estnltk.taggers.estner.model_storage_util import ModelStorageUtil
from estnltk.core import DEFAULT_PY3_NER_MODEL_DIR

In [8]:
def find(name, path):
    for root, dirs, files in os.walk(path):
        if name in files:
            return os.path.join(root, name)

These files don't work because the protocols are written in a different language, which the goldstandard didn't recognise, hence have no goldstandard tags.

In [9]:
files_not_working = ['J2rva_Tyri_V22tsa_id22177_1911a.json', \
                     'J2rva_Tyri_V22tsa_id18538_1894a.json', \
                     'J2rva_Tyri_V22tsa_id22155_1911a.json', \
                     'Saare_Kihelkonna_Kotlandi_id18845_1865a.json', \
                     'P2rnu_Halliste_Abja_id257_1844a.json', \
                     'Saare_Kaarma_Loona_id7575_1899a.json', \
                     'J2rva_Tyri_V22tsa_id22266_1913a.json', \
                     'J2rva_Tyri_V22tsa_id22178_1912a.json']

In [10]:
files = {}

with open('divided_corpus.txt', 'r', encoding = 'UTF-8') as f:
    txt = f.readlines()

for fileName in txt:
    file, subdistribution = fileName.split(":")
    files[file] = subdistribution.rstrip("\n")

In [11]:
all_results = {}

for subdistribution in [1, 2, 3, 4, 5]:
    training_subdistributions = []
    for y in [1, 2, 3, 4, 5]:
        if y == subdistribution:
            subdistribution_for_testing = y
        else:
            training_subdistributions.append(y)
    
    # Getting the filenames to be trained on from the files dictionary.
    filenames = {key: value for key, value in files.items() if int(value) in training_subdistributions}
    
    # Creating training_texts from the aforementioned filenames.
    print("Valmistan ette treenimistekste.")
    start = time.time()
    training_texts = []
    for filename in filenames:
        with open('./vallakohtufailid-json-flattened/' + str(filename), 'r', encoding='UTF-8') as file:
            if filename in files_not_working:
                continue
            else:
                training_texts.append(preprocess_text(json_to_text(file.read())))
    print(f"Treenimistekstid ette valmistatud {time.time() - start} sekundiga.")
    
    # Setting up the trainer and training.
    print("\n\nAlustan nertaggeri treenimist.")
    start = time.time()
    model_dir=DEFAULT_PY3_NER_MODEL_DIR
    modelUtil = ModelStorageUtil(model_dir)
    nersettings = modelUtil.load_settings()
    nersettings.FEATURE_EXTRACTORS = ('models.protocols_fex.NerEmptyFeatureTagger',
                                      'models.protocols_fex.NerLocalFeatureWithoutMorphTagger' )
    trainer = NerTrainer(nersettings)
    trainer.train( training_texts, layer='gold_wordner', model_dir='test' )
    print(f"NerTagger treenitud {time.time() - start} sekundiga.")
    
    # Setting up the new trained nertagger and defining layers to be removed later on.
    nertagger = NerTagger(model_dir = 'test')
    removed_layers = ['sentences', 'morph_analysis', 'compound_tokens', 'ner', 'words', 'tokens']
    
    # Tagging the files using the new nertagger.
    print("\n\nAlustan failide taggimist.")
    start = time.time()
    for file in {key: value for key, value in files.items() if int(value) == subdistribution_for_testing}:
        with open(find(file.replace(".json", ".txt"), "./vallakohtufailid/"), 'r', encoding='UTF-8') as f:
            text = f.read()
            if file == "Tartu_V6nnu_Ahja_id3502_1882a.txt":
                text = text.replace('..', '. .')
            text = preprocess_text(Text(text))
            nertagger.tag(text)
            text.add_layer(flatten(text['ner'], 'flat_ner'))

            for x in removed_layers:
                text.pop_layer(x)
            
            text_to_json(text, file=os.getcwd() + "/vallakohtufailid-trained-nertagger/" + file)
            print(f'Täägitud fail {file}')
    print(f"Failid taggitud {time.time() - start} sekundiga.")
print("Programm on lõpetanud oma töö.")

Valmistan ette treenimistekste.
Treenimistekstid ette valmistatud 116.6378743648529 sekundiga.


Alustan nertaggeri treenimist.
Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 33508
Seconds required: 0.196

Stochastic Gradient Descent (SGD)
c2: 0.001000
max_iterations: 1000
period: 10
delta: 0.000001

Calibrating the learning rate (eta)
calibration.eta: 0.100000
calibration.rate: 2.000000
calibration.samples: 1000
calibration.candidates: 10
calibration.max_trials: 20
Initial loss: 32903.918933
Trial #1 (eta = 0.100000): 2990.159087
Trial #2 (eta = 0.200000): 3857.921941
Trial #3 (eta = 0.400000): 6726.280473
Trial #4 (eta = 0.800000): 13488.032333
Trial #5 (eta = 1.600000): 26283.318047
Trial #6 (eta = 3.200000): 53761.528785 (worse)
Trial #7 (eta = 0.050000): 3027.620163
Trial #8 (eta = 0.025000): 3379.571104
Trial #9 (eta = 0.012500): 3868.78700

Täägitud fail Tartu_V6nnu_Ahja_id3502_1882a.json
Täägitud fail J2rva_Tyri_V22tsa_id22541_1914a.json
Täägitud fail L22ne_Pyhalepa_K2rdla_id24804_1877a.json
Täägitud fail Tartu_Kodavere_Alatskivi_id10108_1880a.json
Täägitud fail Tartu_V6nnu_Ahja_id13562_1881a.json
Täägitud fail Harju_Kose_Palvere_id15594_1881a.json
Täägitud fail L22ne_Emmaste_Emmaste_id15087_1895a.json
Täägitud fail V6ru_Vastseliina_Misso_id19866_1882a.json
Täägitud fail J2rva_Tyri_Kirna_id24452_1880a.json
Täägitud fail Tartu_V6nnu_Ahja_id12638_1875a.json
Täägitud fail Tartu_Kodavere_Pala_id16229_1849a.json
Täägitud fail Harju_Rapla_Rapla_id365_1873a.json
Täägitud fail Viljandi_P6ltsamaa_Adavere_id20278_1890a.json
Täägitud fail Viljandi_Viljandi_Karula_id19366_1868a.json
Täägitud fail Tartu_Kodavere_Pala_id22898_1872a.json
Täägitud fail P2rnu_Audru_V6lla_id5904_1878a.json
Täägitud fail Harju_J6el2htme_J6el2htme_id7612_1869a.json
Täägitud fail J2rva_Peetri_V2ike-Kareda_id19169_1869a.json
Täägitud fail Tartu_V6nnu_Ahja_id2

Täägitud fail Viljandi_Paistu_Holstre_id11321_1848a.json
Täägitud fail J2rva_Anna_Eivere_id981_1868a.json
Täägitud fail Harju_Kose_Triigi_id10535_1870a.json
Täägitud fail Tartu_V6nnu_Ahja_id18345_1886a.json
Täägitud fail Tartu_Torma_Avinurme_id14451_1903a.json
Täägitud fail Tartu_V6nnu_Ahja_id19549_1888a.json
Täägitud fail Harju_Kose_Habaja_id675_1874a.json
Täägitud fail Harju_Hageri_Kohila_id21386_1886a.json
Täägitud fail P2rnu_Tori_Sindi_id7854_1885a.json
Täägitud fail Tartu_V6nnu_Ahja_id22271_1869a.json
Täägitud fail Viljandi_P6ltsamaa_Pajusi_id2463_1870a.json
Täägitud fail Tartu_R6ngu_Aakre_id5938_1890a.json
Täägitud fail Tartu_Kodavere_Pala_id18246_1863a.json
Täägitud fail J2rva_Tyri_S2revere_id12169_1876a.json
Täägitud fail Tartu_Torma_Avinurme_id17501_1871a.json
Täägitud fail Tartu_V6nnu_Ahja_id16602_1884a.json
Täägitud fail Tartu_V6nnu_Ahja_id17125_1884a.json
Täägitud fail L22ne_Pyhalepa_Kassari_id20363_1890a.json
Täägitud fail Tartu_V6nnu_Ahja_id17321_1885a.json
Täägitud fail 

***** Epoch #17 *****
Loss: 5732.668587
Improvement ratio: 0.463816
Feature L2-norm: 100.420753
Learning rate (eta): 0.049915
Total number of feature updates: 275417
Seconds required for this iteration: 0.202

***** Epoch #18 *****
Loss: 5404.883387
Improvement ratio: 0.499263
Feature L2-norm: 102.527901
Learning rate (eta): 0.049910
Total number of feature updates: 291618
Seconds required for this iteration: 0.200

***** Epoch #19 *****
Loss: 5211.024517
Improvement ratio: 0.501847
Feature L2-norm: 104.595835
Learning rate (eta): 0.049905
Total number of feature updates: 307819
Seconds required for this iteration: 0.201

***** Epoch #20 *****
Loss: 5089.614989
Improvement ratio: 0.409245
Feature L2-norm: 106.607707
Learning rate (eta): 0.049900
Total number of feature updates: 324020
Seconds required for this iteration: 0.202

***** Epoch #21 *****
Loss: 4960.831184
Improvement ratio: 0.364715
Feature L2-norm: 108.538699
Learning rate (eta): 0.049895
Total number of feature updates: 3

Täägitud fail Saare_Kihelkonna_Atla_id7135_1873a.json
Täägitud fail Tartu_Kodavere_Alatskivi_id22077_1881a.json
Täägitud fail Harju_Kuusalu_Kolga_id15956_1871a.json
Täägitud fail Tartu_V6nnu_Ahja_id23394_1893a.json
Täägitud fail Viljandi_Pilistvere_Arussaare_id24509_1855a.json
Täägitud fail Tartu_Torma_Avinurme_id14475_1903a.json
Täägitud fail L22ne_Kullamaa_Kuij6e_id15780_1889a.json
Täägitud fail Tartu_Kodavere_Alatskivi_id7557_1879a.json
Täägitud fail Tartu_V6nnu_Ahja_id14767_1882a.json
Täägitud fail Tartu_N6o_Luke_id4149_1872a.json
Täägitud fail Viljandi_Viljandi_Karula_id19357_1867a.json
Täägitud fail Tartu_Otep22_Pyhaj2rve_id1517_1884a.json
Täägitud fail Tartu_Kodavere_Alatskivi_id15347_1876a.json
Täägitud fail Tartu_V6nnu_Ahja_id21806_1868a.json
Täägitud fail Harju_Juuru_Kaiu_id3479_1886a.json
Täägitud fail Tartu_Torma_Avinurme_id10136_1901a.json
Täägitud fail Tartu_Kodavere_Pala_id21272_1869a.json
Täägitud fail Tartu_V6nnu_Ahja_id16981_1884a.json
Täägitud fail Tartu_Torma_Avinur

Täägitud fail Tartu_V6nnu_Ahja_id20995_1889a.json
Täägitud fail Tartu_V6nnu_Ahja_id13250_1866a.json
Täägitud fail L22ne_Reigi_K6rgessaare_id22876_1893a.json
Täägitud fail J2rva_Tyri_V22tsa_id16656_1886a.json
Täägitud fail Harju_Kose_Triigi_id12028_1880a.json
Täägitud fail J2rva_Tyri_Kirna_id22825_1869a.json
Täägitud fail Tartu_Kodavere_Alatskivi_id13801_1881a.json
Täägitud fail J2rva_Tyri_Kirna_id25139_1881a.json
Täägitud fail Harju_Kose_Kose-Uuem6isa_id3340_1868a.json
Täägitud fail V6ru_Vastseliina_Misso_id16574_1882a.json
Täägitud fail J2rva_Tyri_Kirna_id23402_1872a.json
Täägitud fail J2rva_Tyri_Kirna_id22809_1868a.json
Täägitud fail J2rva_Tyri_Kirna_id22602_1867a.json
Täägitud fail Tartu_Kodavere_Kokora_id627_1872a.json
Täägitud fail L22ne_Kullamaa_Kuij6e_id15513_1880a.json
Täägitud fail J2rva_Tyri_S2revere_id11688_1874a.json
Täägitud fail Tartu_V6nnu_Ahja_id11263_1872a.json
Täägitud fail J2rva_Ambla_Ambla_id7441_1887a.json
Täägitud fail Tartu_Kodavere_Pala_id18456_1863a.json
Täägit

***** Epoch #18 *****
Loss: 5527.149563
Improvement ratio: 0.443271
Feature L2-norm: 103.220216
Learning rate (eta): 0.049910
Total number of feature updates: 291258
Seconds required for this iteration: 0.206

***** Epoch #19 *****
Loss: 5218.224133
Improvement ratio: 0.423519
Feature L2-norm: 105.306763
Learning rate (eta): 0.049905
Total number of feature updates: 307439
Seconds required for this iteration: 0.204

***** Epoch #20 *****
Loss: 5096.575834
Improvement ratio: 0.398967
Feature L2-norm: 107.359883
Learning rate (eta): 0.049900
Total number of feature updates: 323620
Seconds required for this iteration: 0.204

***** Epoch #21 *****
Loss: 4950.695584
Improvement ratio: 0.404308
Feature L2-norm: 109.340269
Learning rate (eta): 0.049895
Total number of feature updates: 339801
Seconds required for this iteration: 0.208

***** Epoch #22 *****
Loss: 4855.724004
Improvement ratio: 0.342098
Feature L2-norm: 111.234695
Learning rate (eta): 0.049890
Total number of feature updates: 3

Täägitud fail Tartu_Kodavere_Alatskivi_id9807_1879a.json
Täägitud fail Tartu_V6nnu_Ahja_id19012_1887a.json
Täägitud fail Viljandi_K6pu_Suure-K6pu_id13155_1884a.json
Täägitud fail Saare_Kaarma_Loona_id7769_1910a.json
Täägitud fail Tartu_Kodavere_Alatskivi_id6270_1879a.json
Täägitud fail Tartu_Kursi_Puurmani_id11055_1874a.json
Täägitud fail Tartu_V6nnu_Ahja_id19102_1887a.json
Täägitud fail Viljandi_K6pu_Suure-K6pu_id3746_1883a.json
Täägitud fail Tartu_N6o_Aru_id4068_1890a.json
Täägitud fail P2rnu_Tori_Sindi_id20034_1836a.json
Täägitud fail Tartu_Kodavere_Pala_id20260_1866a.json
Täägitud fail L22ne_Vormsi_Vormsi_id24532_1888a.json
Täägitud fail Tartu_Kodavere_Pala_id25066_1873a.json
Täägitud fail Tartu_Kodavere_Ranna_id19679_1865a.json
Täägitud fail J2rva_Peetri_V2ike-Kareda_id19197_1869a.json
Täägitud fail Harju_J6el2htme_J6el2htme_id8180_1888a.json
Täägitud fail Harju_Kose_Triigi_id11473_1871a.json
Täägitud fail V6ru_R2pina_R2pina_id11101_1863a.json
Täägitud fail Saare_Kihelkonna_Kotlan

Täägitud fail J2rva_Tyri_V22tsa_id19055_1898a.json
Täägitud fail Harju_Kose_Palvere_id13987_1880a.json
Täägitud fail V6ru_Vastseliina_Misso_id24810_1886a.json
Täägitud fail Tartu_Torma_Avinurme_id22547_1872a.json
Täägitud fail Tartu_N6o_Pangodi_id2808_1889a.json
Täägitud fail V6ru_P6lva_Kiuma_id7167_1880a.json
Täägitud fail Tartu_V6nnu_Ahja_id14727_1882a.json
Täägitud fail Tartu_Kodavere_Alatskivi_id23068_1881a.json
Täägitud fail Tartu_Torma_Avinurme_id3646_1868a.json
Täägitud fail V6ru_Vastseliina_Misso_id11543_1886a.json
Täägitud fail Tartu_V6nnu_Ahja_id22666_1881a.json
Täägitud fail Harju_J6el2htme_J6el2htme_id6475_1868a.json
Täägitud fail V6ru_R6uge_Saaluse_id11773_1880a.json
Täägitud fail L22ne_Vormsi_Vormsi_id25013_1888a.json
Täägitud fail V6ru_Urvaste_Vaabina_id785_1876a.json
Täägitud fail V6ru_R2pina_R2pina_id12011_1866a.json
Täägitud fail Tartu_V6nnu_Ahja_id20646_1889a.json
Täägitud fail Harju_Rapla_Rapla_id20938_1870a.json
Täägitud fail J2rva_Tyri_S2revere_id10443_1868a.json


***** Epoch #19 *****
Loss: 4815.708852
Improvement ratio: 0.341794
Feature L2-norm: 142.795579
Learning rate (eta): 0.099621
Total number of feature updates: 291555
Seconds required for this iteration: 0.196

***** Epoch #20 *****
Loss: 4795.730135
Improvement ratio: 0.274479
Feature L2-norm: 145.396219
Learning rate (eta): 0.099602
Total number of feature updates: 306900
Seconds required for this iteration: 0.195

***** Epoch #21 *****
Loss: 3724.464049
Improvement ratio: 0.594644
Feature L2-norm: 147.574978
Learning rate (eta): 0.099582
Total number of feature updates: 322245
Seconds required for this iteration: 0.198

***** Epoch #22 *****
Loss: 3626.028226
Improvement ratio: 0.441189
Feature L2-norm: 149.657180
Learning rate (eta): 0.099562
Total number of feature updates: 337590
Seconds required for this iteration: 0.198

***** Epoch #23 *****
Loss: 3538.725912
Improvement ratio: 0.567157
Feature L2-norm: 151.727449
Learning rate (eta): 0.099542
Total number of feature updates: 3

Täägitud fail Harju_Keila_Keila_id13005_1890a.json
Täägitud fail V6ru_R2pina_Kahkva_id7771_1888a.json
Täägitud fail Tartu_N6o_Pangodi_id5095_1889a.json
Täägitud fail Tartu_N6o_Pangodi_id4146_1889a.json
Täägitud fail Tartu_Kodavere_Ranna_id14405_1860a.json
Täägitud fail J2rva_Peetri_V2ike-Kareda_id19122_1868a.json
Täägitud fail Tartu_V6nnu_Ahja_id19084_1887a.json
Täägitud fail Harju_Hageri_Kohila_id4010_1890a.json
Täägitud fail V6ru_R2pina_Kahkva_id6489_1887a.json
Täägitud fail Harju_Kose_Palvere_id13989_1880a.json
Täägitud fail Viljandi_Paistu_Holstre_id6625_1828a.json
Täägitud fail V6ru_Vastseliina_Misso_id22084_1883a.json
Täägitud fail L22ne_Vormsi_Vormsi_id24683_1888a.json
Täägitud fail Tartu_Kodavere_Pala_id22815_1872a.json
Täägitud fail L22ne_Martna_Martna_id18619_1871a.json
Täägitud fail Tartu_V6nnu_Ahja_id20555_1889a.json
Täägitud fail J2rva_Ambla_Ambla_id5939_1888a.json
Täägitud fail J2rva_Tyri_Kirna_id24973_1881a.json
Täägitud fail Tartu_Torma_Avinurme_id20542_1871a.json
Täägi

***** Epoch #3 *****
Loss: 11124.645991
Feature L2-norm: 72.939613
Learning rate (eta): 0.099940
Total number of feature updates: 46842
Seconds required for this iteration: 0.198

***** Epoch #4 *****
Loss: 11435.964817
Feature L2-norm: 83.262036
Learning rate (eta): 0.099920
Total number of feature updates: 62456
Seconds required for this iteration: 0.194

***** Epoch #5 *****
Loss: 11370.811709
Feature L2-norm: 91.032337
Learning rate (eta): 0.099900
Total number of feature updates: 78070
Seconds required for this iteration: 0.196

***** Epoch #6 *****
Loss: 8217.628451
Feature L2-norm: 95.994643
Learning rate (eta): 0.099880
Total number of feature updates: 93684
Seconds required for this iteration: 0.196

***** Epoch #7 *****
Loss: 7961.668956
Feature L2-norm: 102.254220
Learning rate (eta): 0.099860
Total number of feature updates: 109298
Seconds required for this iteration: 0.197

***** Epoch #8 *****
Loss: 6969.197774
Feature L2-norm: 106.723164
Learning rate (eta): 0.099840
Tot

Täägitud fail Tartu_V6nnu_Ahja_id16349_1884a.json
Täägitud fail Harju_Kuusalu_Kolga_id11902_1888a.json
Täägitud fail J2rva_Peetri_V2ike-Kareda_id22150_1879a.json
Täägitud fail Tartu_Kodavere_Pala_id22870_1872a.json
Täägitud fail Tartu_V6nnu_Ahja_id13565_1881a.json
Täägitud fail Harju_Juuru_Juuru_id17866_1869a.json
Täägitud fail Harju_Juuru_Kaiu_id931_1912a.json
Täägitud fail J2rva_Anna_Eivere_id6607_1880a.json
Täägitud fail L22ne_Kullamaa_Piirsalu_id16723_1888a.json
Täägitud fail J2rva_Tyri_V22tsa_id17604_1888a.json
Täägitud fail Tartu_V6nnu_Ahja_id14708_1882a.json
Täägitud fail Tartu_Kodavere_Alatskivi_id9914_1880a.json
Täägitud fail L22ne_Pyhalepa_K2rdla_id23461_1873a.json
Täägitud fail Tartu_V6nnu_Ahja_id21842_1868a.json
Täägitud fail Harju_J6el2htme_J6el2htme_id7633_1870a.json
Täägitud fail Tartu_Kodavere_Alatskivi_id13040_1856a.json
Täägitud fail Saare_P8ide_Laimjala_id6577_1917a.json
Täägitud fail J2rva_Tyri_Kirna_id24066_1879a.json
Täägitud fail L22ne_Ridala_Sinalepa_id25489_189

Täägitud fail J2rva_Peetri_V2ike-Kareda_id22431_1880a.json
Täägitud fail Saare_Kihelkonna_Kotlandi_id21549_1869a.json
Täägitud fail J2rva_Tyri_V22tsa_id18782_1897a.json
Täägitud fail J2rva_Tyri_V22tsa_id16955_1887a.json
Täägitud fail J2rva_J2rva-Jaani_Einmanni_id9007_1868a.json
Täägitud fail Tartu_V6nnu_Ahja_id23568_1895a.json
Täägitud fail Tartu_Kodavere_Alatskivi_id1504_1865a.json
Täägitud fail P2rnu_P2rnu-Elisabethi_Sauga_id18104_1877a.json
Täägitud fail L22ne_Emmaste_Emmaste_id8296_1894a.json
Täägitud fail Tartu_V6nnu_Ahja_id13380_1881a.json
Täägitud fail Tartu_Otep22_Pyhaj2rve_id9556_1885a.json
Täägitud fail Tartu_V6nnu_Ahja_id22940_1889a.json
Täägitud fail Viljandi_K6pu_Suure-K6pu_id4128_1883a.json
Täägitud fail J2rva_Tyri_S2revere_id16068_1889a.json
Täägitud fail Tartu_Kodavere_Pala_id23072_1872a.json
Täägitud fail Harju_Kose_Triigi_id11698_1876a.json
Täägitud fail V6ru_P6lva_Peri_id10140_1891a.json
Täägitud fail Viljandi_K6pu_Suure-K6pu_id12747_1884a.json
Täägitud fail J2rva_Ty

In [12]:
gold_ner = []
test_ner = []

for subdistribution in [1, 2, 3, 4, 5]:
    training_subdistributions = []
    for y in [1, 2, 3, 4, 5]:
        if y == subdistribution:
            subdistribution_for_testing = y
        else:
            training_subdistributions.append(y)
    

    for file in {key: value for key, value in files.items() if int(value) == subdistribution_for_testing}:
        appendable_gold_ner = []
        appendable_test_ner = []

        if file.endswith(".json"):
            if file in files_not_working:
                continue
            else:
                with open("./vallakohtufailid-trained-nertagger/" + str(file), 'r', encoding='UTF-8') as f_test, \
                    open("./vallakohtufailid-json-flattened/" + str(file), 'r', encoding='UTF-8') as f_gold:
                        test_import = json_to_text(f_test.read())
                        gold_import = json_to_text(f_gold.read())

                        # The commented part is needed for word-level-ner.
                        '''
                        for i in range(len(gold_import['flat_gold_wordner'])):
                            tag = gold_import['flat_gold_wordner'][i].nertag[0]
                            gold.append(tag)
                        for i in range(len(test_import['flat_wordner'])):
                            tag = test_import['flat_wordner'][i].nertag[0]
                            test.append(tag)
                        '''

                        for i in range(len(gold_import['gold_ner'])):
                            ner = gold_import['gold_ner'][i]
                            label = ner.nertag
                            start = int(ner.start)
                            end = int(ner.end)
                            appendable_gold_ner.append({"label": label, "start": start, "end": end})

                        for i in range(len(test_import['flat_ner'])):
                            ner = test_import['flat_ner'][i]
                            label = ner.nertag[0]
                            start = int(ner.start)
                            end = int(ner.end)
                            appendable_test_ner.append({"label": label, "start": start, "end": end})
                
            gold_ner.append(appendable_gold_ner)
            test_ner.append(appendable_test_ner)
    evaluator = Evaluator(gold_ner, test_ner, tags=['ORG', 'PER', 'MISC', 'LOC', 'LOC_ORG'])
    results, results_per_tag = evaluator.evaluate()
    all_results[subdistribution_for_testing] = (results, results_per_tag)
print("Programm on lõpetanud oma töö.")

Programm on lõpetanud oma töö.


In [13]:
with open("results_new_ver1.txt", "w+") as results_file:
    results_file.write(json.dumps(all_results))

In [14]:
with open("results_new_ver1.txt", "r") as f:
    results_json = json.loads(f.read())

### Tulemused alamhulkade kaupa:

In [15]:
correct_all = 0
actual_all = 0
possible_all = 0
df = dict()

for i in ['1', '2', '3', '4', '5']:
    train = []
    for j in ['1', '2', '3', '4', '5']:
        if j == i:
            subdistribution_for_testing = j
        else:
            train.append(j)
    correct = results_json[i][0]['strict']['correct']
    correct_all += correct
    actual = results_json[i][0]['strict']['actual']
    actual_all += actual
    possible = results_json[i][0]['strict']['possible']
    possible_all += possible
    precision = (correct / actual)
    recall = (correct / possible)
    f1 = 2 * ((precision * recall) / (precision + recall))
    df[str(subdistribution_for_testing)] = [precision, recall, f1]

precision = correct_all / actual_all
recall = correct_all / possible_all
f1 = 2 * ((precision * recall) / (precision + recall))
df["Total"] = [precision, recall, f1]

dataframe = pd.DataFrame(df, index=["Precision", "Recall", "F1-score"])
dataframe.columns.name = "Alamhulk"
display(dataframe)

Alamhulk,1,2,3,4,5,Total
Precision,0.753394,0.770397,0.774204,0.776462,0.790536,0.778208
Recall,0.736726,0.71985,0.707419,0.710128,0.69755,0.708368
F1-score,0.744966,0.744266,0.739306,0.741815,0.741138,0.741648


### Tulemused nimeüksuste liigi kaupa:

In [20]:
df = dict()
totals = dict()
    
for i in ['1', '2', '3', '4', '5']:
    train = []
    by_kind = dict()
    for j in ['1', '2', '3', '4', '5']:
        if j == i:
            subdistribution_for_testing = j
        else:
            train.append(j)

    for key in list(results_json[i][1].keys()):
        correct_all = 0
        actual_all = 0
        possible_all = 0
        correct = results_json[i][1][str(key)]['strict']['correct']
        correct_all += correct
        actual = results_json[i][1][str(key)]['strict']['actual']
        actual_all += actual
        possible = results_json[i][1][str(key)]['strict']['possible']
        possible_all += possible
        
        precision = (correct / actual)
        recall = (correct / possible)
        f1 = 2 * ((precision * recall) / (precision + recall))
        
        precisionname = str(key) + "_precision"
        recallname = str(key) + "_recall"
        f1scorename = str(key) + "_f1score"

        by_kind[precisionname] = precision
        by_kind[recallname] = recall
        by_kind[f1scorename] = f1

    df[str(subdistribution_for_testing)] = by_kind

for key, value in df.items():
    for name, score in value.items():
        if name in totals:
            totals[name] = (totals.get(name) + score)
        else:
            totals[name] = score

for key, value in totals.items():
    totals[key] = value/5

df["Total"] = totals
display(pd.DataFrame(df))

Unnamed: 0,1,2,3,4,5,Total
ORG_precision,0.207921,0.234568,0.270142,0.291513,0.299187,0.260666
ORG_recall,0.711864,0.608,0.584615,0.589552,0.531792,0.605165
ORG_f1score,0.321839,0.33853,0.36953,0.390123,0.382934,0.360591
PER_precision,0.830798,0.841239,0.841739,0.841118,0.851565,0.841292
PER_recall,0.794958,0.779371,0.770592,0.781839,0.770935,0.779539
PER_f1score,0.812483,0.809124,0.804596,0.810396,0.809246,0.809169
MISC_precision,0.444444,0.513514,0.552632,0.538462,0.574468,0.524704
MISC_recall,0.390244,0.413043,0.470149,0.497041,0.516746,0.457445
MISC_f1score,0.415584,0.457831,0.508065,0.516923,0.544081,0.488497
LOC_precision,0.129032,0.149485,0.18638,0.172603,0.175355,0.162571


### Confusion matrix

In [17]:
uus_gold_ner = []
uus_test_ner = []

for i in range(len(gold_ner)):
    for j in range(len(test_ner[i])):
        element_test = test_ner[i][j]
        for element_gold in gold_ner[i]:
            if element_test['start'] == element_gold['start'] and element_test['end'] == element_gold['end']:
                uus_gold_ner.append(element_gold)
                uus_test_ner.append(element_test)

In [18]:
y_true = pd.Series([x['label'] for x in uus_gold_ner], name="Actual")
y_pred = pd.Series([x['label'] for x in uus_test_ner], name="Predicted")

In [19]:
pd.crosstab(y_true, y_pred)

Predicted,LOC,LOC_ORG,MISC,ORG,PER
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LOC,74,88,1,0,53
LOC_ORG,22,818,1,5,59
MISC,0,2,108,1,5
ORG,0,7,0,184,1
PER,11,62,4,2,15071
