In [1]:
import json
import time
import os
import re
import math

from nervaluate import Evaluator
from modules.preprocessing_protocols import preprocess_text
#from modules.extract_results import extract_results_to_txt_file, \
#                                    display_results_by_subdistribution,\
#                                    display_results_by_named_entity, \
#                                    display_confusion_matrix
from estnltk import Text
from estnltk.taggers import NerTagger
from estnltk.converters import text_to_json
from estnltk.converters import json_to_text
from estnltk.layer_operations import flatten

In [2]:
# Build the mapping {corpus file name -> subdistribution label} from the
# index file, whose lines have the form "<file name>:<subdistribution>".
files = {}

corpus_index_path = os.path.join('..', '..', 'data', 'divided_corpus.txt')
with open(corpus_index_path, 'r', encoding = 'UTF-8') as f:
    txt = f.readlines()

for index_line in txt:
    corpus_file, label = index_line.split(":")
    files[corpus_file] = label.rstrip("\n")

In [3]:
# Tagged files that are excluded from evaluation (skipped by
# extract_results_to_txt_file).
files_not_working = [
    'J2rva_Tyri_V22tsa_id22177_1911a.json',
    'J2rva_Tyri_V22tsa_id18538_1894a.json',
    'J2rva_Tyri_V22tsa_id22155_1911a.json',
    'Saare_Kihelkonna_Kotlandi_id18845_1865a.json',
    'P2rnu_Halliste_Abja_id257_1844a.json',
    'Saare_Kaarma_Loona_id7575_1899a.json',
    'J2rva_Tyri_V22tsa_id22266_1913a.json',
    'J2rva_Tyri_V22tsa_id22178_1912a.json',
]

In [4]:
def extract_results_to_txt_file(model_dir, files):
    """Evaluate a model's tagged documents against the gold annotations.

    Every name in ``files`` that ends in ``.json`` and is not listed in the
    module-level ``files_not_working`` is loaded twice: the tagged version
    from ``<model_dir>/vallakohtufailid-trained-nertagger`` and the gold
    version from ``../../data/vallakohtufailid-json-flattened``. The spans of
    the ``gold_ner`` and ``flat_ner`` layers are converted to
    ``{"label", "start", "end"}`` dicts and handed to nervaluate's
    ``Evaluator``.

    Despite its name, this function does not write any file itself; it only
    prints a completion message.

    Args:
        model_dir: Directory of the model whose tagged output is evaluated.
        files: Iterable of file names to consider.

    Returns:
        The first of the two values returned by ``Evaluator.evaluate()``.
    """
    tagged_dir = os.path.join(model_dir, 'vallakohtufailid-trained-nertagger')
    gold_dir = os.path.join('..', '..', 'data', 'vallakohtufailid-json-flattened')

    gold_ner = []
    test_ner = []

    for file in files:
        usable = file.endswith(".json") and file not in files_not_working
        if not usable:
            continue

        with open(os.path.join(tagged_dir, file), 'r', encoding='UTF-8') as f_test, \
             open(os.path.join(gold_dir, file), 'r', encoding='UTF-8') as f_gold:
            test_import = json_to_text(f_test.read())
            gold_import = json_to_text(f_gold.read())

        # A word-level variant would instead collect the first `nertag` of
        # every span in the 'flat_gold_wordner' (gold) and 'flat_wordner'
        # (tagged) layers.

        # Gold spans use `nertag` as-is.
        gold_entities = [
            {"label": span.nertag, "start": int(span.start), "end": int(span.end)}
            for span in gold_import['gold_ner']
        ]
        # Tagged spans use the first element of `nertag`.
        test_entities = [
            {"label": span.nertag[0], "start": int(span.start), "end": int(span.end)}
            for span in test_import['flat_ner']
        ]

        gold_ner.append(gold_entities)
        test_ner.append(test_entities)

    evaluator = Evaluator(gold_ner, test_ner, tags=['ORG', 'PER', 'MISC', 'LOC', 'LOC_ORG'])
    # Two-value unpacking is kept on purpose; only the first value is returned.
    results, _per_tag = evaluator.evaluate()
    print("Tulemuste ammutamine on lõpetatud.")

    return results

In [5]:
def find(name, path):
    """Return the path of the first file called ``name`` found under ``path``.

    The tree is traversed with ``os.walk``; the first directory whose file
    list contains ``name`` wins.

    Args:
        name: Exact file name to look for.
        path: Root directory of the search.

    Returns:
        The joined path of the first match, or ``None`` when nothing matches.
    """
    hits = (
        os.path.join(folder, name)
        for folder, _subdirs, entries in os.walk(path)
        if name in entries
    )
    return next(hits, None)

In [6]:
def chunks(lst, n):
    """Split ``lst`` into consecutive slices of at most ``n`` items.

    Args:
        lst: Sliceable sequence to split.
        n: Maximum length of each slice.

    Returns:
        A list of slices in original order; the last one may be shorter.
        An empty ``lst`` yields an empty list.
    """
    return [lst[offset:offset + n] for offset in range(0, len(lst), n)]

In [7]:
def test_model(model_dir, files, increment):
    """Tag the corpus with a trained NER model, evaluating after every batch.

    The keys of ``files`` are processed in batches of ``increment`` names.
    For each name the matching ``.txt`` source is located under
    ``../../data/vallakohtufailid``, preprocessed, tagged, reduced to the
    flattened ``flat_ner`` layer and saved as JSON to
    ``<model_dir>/vallakohtufailid-trained-nertagger``. After each batch,
    everything currently present in that output directory is evaluated and
    the result is dumped as JSON to ``<model_dir>/results/results<k>.txt``,
    where ``k`` is the 1-based batch number.

    Args:
        model_dir: Directory of the trained model; also the base directory
            for the tagged output and the results.
        files: Mapping whose keys are the ``.json`` file names to tag.
        increment: Number of files per batch.

    Raises:
        FileNotFoundError: If the ``.txt`` source of a file cannot be found.
    """
    removed_layers = ['sentences', 'morph_analysis', 'compound_tokens', 'ner', 'words', 'tokens']

    source_dir = os.path.join('..', '..', 'data', 'vallakohtufailid')
    tagged_dir = os.path.join(model_dir, 'vallakohtufailid-trained-nertagger')
    results_dir = os.path.join(model_dir, 'results')

    nertagger = NerTagger(model_dir)

    # Create both output directories once, up front. Previously the tagged
    # directory was checked on every file and the results directory was never
    # created, so the first results write failed on a fresh model directory.
    os.makedirs(tagged_dir, exist_ok=True)
    os.makedirs(results_dir, exist_ok=True)

    print("(!) Märgendan")
    testing_files = chunks(list(files.keys()), increment)
    for i, chunk in enumerate(testing_files, start=1):
        for test_file in chunk:
            source_name = test_file.replace(".json", ".txt")
            source_path = find(source_name, source_dir)
            if source_path is None:
                # find() returns None on a miss; fail with a clear message
                # instead of the TypeError that open(None) would raise.
                raise FileNotFoundError(
                    f"Source text '{source_name}' not found under '{source_dir}'"
                )

            with open(source_path, 'r', encoding='UTF-8') as f:
                text = f.read()

            # File-specific workaround: separate consecutive dots.
            # NOTE(review): presumably avoids a tokenisation problem in this
            # one document - confirm.
            if test_file == "Tartu_V6nnu_Ahja_id3502_1882a.json":
                text = text.replace('..', '. .')

            text = preprocess_text(Text(text))

            nertagger.tag(text)
            text.add_layer(flatten(text['ner'], 'flat_ner'))

            # Drop the listed layers so that the saved JSON does not carry them.
            for x in removed_layers:
                text.pop_layer(x)

            text_to_json(text, file=os.path.join(tagged_dir, test_file))

            print(f'Märgendatud fail {test_file}')

        print("Eraldan alamosa tulemusi")
        # The whole output directory is listed, so each evaluation covers all
        # files tagged so far (including any already present before this run).
        all_results = extract_results_to_txt_file(model_dir, os.listdir(tagged_dir))

        with open(os.path.join(results_dir, f'results{i}.txt'), 'w', encoding='UTF-8') as results_file:
            results_file.write(json.dumps(all_results))
        

In [8]:
# Run configuration passed to test_model below.
increment = 250                       # number of files per batch
model_dir = 'model_gaz_loc_variants'  # directory of the model to test

In [9]:
# Tag the corpus batch by batch and evaluate after each batch.
test_model(model_dir=model_dir, files=files, increment=increment)

(!) Märgendan
Märgendatud fail V6ru_R2pina_Kahkva_id24674_1868a.json
Märgendatud fail L22ne_Martna_Martna_id14205_1869a.json
Märgendatud fail Harju_Juuru_Juuru_id19451_1886a.json
Märgendatud fail Tartu_Kodavere_Ranna_id11316_1845a.json
Märgendatud fail J2rva_Peetri_V2ike-Kareda_id22448_1881a.json
Märgendatud fail L22ne_Vormsi_Vormsi_id24908_1888a.json
Märgendatud fail J2rva_Tyri_V22tsa_id20382_1901a.json
Märgendatud fail Tartu_Laiuse_Kivij2rve_id13162_1866a.json
Märgendatud fail Tartu_V6nnu_Ahja_id20418_1888a.json
Märgendatud fail L22ne_Vormsi_Vormsi_id24521_1888a.json
Märgendatud fail P2rnu_Mihkli_Mihkli_id1099_1852a.json
Märgendatud fail Tartu_V6nnu_Ahja_id16184_1883a.json
Märgendatud fail Tartu_V6nnu_Ahja_id10343_1871a.json
Märgendatud fail L22ne_Ridala_V6nnu_id2373_1889a.json
Märgendatud fail Harju_Hageri_Kohila_id2634_1882a.json
Märgendatud fail L22ne_Martna_Martna_id10803_1890a.json
Märgendatud fail V6ru_Vastseliina_Misso_id13294_1887a.json
Märgendatud fail P2rnu_Tori_Sindi_id377

Märgendatud fail Tartu_V6nnu_Ahja_id21768_1867a.json
Märgendatud fail Tartu_V6nnu_Ahja_id20314_1888a.json
Märgendatud fail Tartu_R6ngu_Aakre_id4282_1888a.json
Märgendatud fail J2rva_Tyri_S2revere_id6796_1883a.json
Märgendatud fail V6ru_R2pina_R2pina_id10711_1868a.json
Märgendatud fail Harju_Kose_Palvere_id23525_1887a.json
Märgendatud fail Tartu_R6ngu_Aakre_id13836_1829a.json
Märgendatud fail P2rnu_Halliste_Penuja_id657_1885a.json
Märgendatud fail Tartu_Kodavere_Alatskivi_id11390_1880a.json
Märgendatud fail V6ru_Vastseliina_Misso_id11633_1886a.json
Märgendatud fail Viljandi_P6ltsamaa_Adavere_id20850_1896a.json
Märgendatud fail Tartu_Otep22_Pyhaj2rve_id3008_1884a.json
Märgendatud fail J2rva_Ambla_Ambla_id7255_1886a.json
Märgendatud fail Tartu_Torma_Avinurme_id23576_1872a.json
Märgendatud fail Harju_J6el2htme_J6el2htme_id7375_1869a.json
Märgendatud fail Tartu_Kodavere_Pala_id18163_1862a.json
Märgendatud fail L22ne_Vormsi_Vormsi_id24037_1888a.json
Märgendatud fail Tartu_V6nnu_Ahja_id22714_

KeyboardInterrupt: 

In [None]:
# Disabled draft, kept as a bare string literal so that it is not executed:
# it would read every result file under <model_dir>/results and collect
# element [2] of display_results_by_subdistribution(...) into `all_f1`.
# NOTE(review): the loop only accepts names ending in '.json', while test_model
# writes its results as 'results<k>.txt', so as written it would match nothing;
# the import of display_results_by_subdistribution is also commented out at the
# top of the notebook. Both need fixing before this is re-enabled.
'''
all_f1 = []
path = os.path.join(model_dir, 'results')
for file in os.listdir(path):
    if file.endswith('.json'):
        with open(os.path.join(path, file), 'r', encoding='UTF-8') as in_f:
            val = display_results_by_subdistribution(json.loads(in_f.read()))[2]
            print(val)
            all_f1.append(val)
'''