In [4]:
import os
import re

from estnltk import EnvelopingBaseSpan
from estnltk import Text, Layer, Annotation, EnvelopingSpan, Span
from estnltk.converters import text_to_json

# If the annotated text contains a newline character, the offsets contain ';' at the
# line break (e.g. "388 393;394 398"). The pattern captures the very first start offset
# and the very last end offset so that the fragments can be collapsed into one range.
indexes_on_line_split = re.compile(r' (\d+) (\d+;\d+ ){1,}(\d+)$')

def collect_annotations( in_f ):
    """Collect the entity annotations from the lines of an .ann file.

    An entity line has three tab-separated columns: an identifier, a
    ``"<tag> <start> <end>"`` column and the annotated text. When the offsets
    column contains ``;`` the annotated text continues on the following
    line(s) of the file: one continuation line is consumed per ``;`` and is
    appended to the text of the entity (without any separator), and the
    fragment offsets are collapsed into a single ``start``/``end`` pair.

    Lines that do not describe an entity are skipped: lines without exactly
    three columns, and three-column lines whose middle column is not a tag
    followed by two numeric offsets (previously such a line raised
    ``ValueError`` or produced an entry with non-numeric offsets).

    :param in_f: iterable of lines, e.g. an opened .ann file
    :return: list of ``(tag, start, end, text, identifier)`` tuples in file
             order; ``start`` and ``end`` are kept as strings
    """
    annotations = []
    split_lines_ahead = 0
    for line in in_f:
        line = line.rstrip('\n')
        if split_lines_ahead > 0:
            # This line is the continuation of the previous entity's text.
            split_lines_ahead -= 1
            tag, start, end, text, identifier = annotations[-1]
            annotations[-1] = (tag, start, end, text + line, identifier)
            continue
        items = line.split('\t')
        if len(items) != 3:
            continue
        identifier, indexes_str, text = items
        fields = indexes_on_line_split.sub(' \\1 \\3', indexes_str).split()
        if len(fields) != 3 or not (fields[1].isdecimal() and fields[2].isdecimal()):
            # Three columns, but not "<tag> <start> <end>": not an entity line.
            continue
        # Every ';' in the offsets announces one continuation line.
        split_lines_ahead = indexes_str.count(';')
        tag, start, end = fields
        annotations.append( (tag, start, end, text, identifier) )
    return annotations

# Conversion script: for every .txt file (with its sibling .ann file) in the listed
# directories, build an EstNLTK Text object carrying two gold-standard layers and save
# it as JSON:
#   * "gold_ner"     - one enveloping span per annotated entity, attribute 'nertag'
#   * "gold_wordner" - one span per word, 'nertag' is B-<TAG> / I-<TAG> / O
cwd = os.getcwd()

directories = ["vallakohus_esimene", "vallakohus_teine", "vallakohus_kolmas", "vallakohus_neljas"]

# Tag names used in the .ann files -> NER tags written to the gold layers.
ANN_TAG_TO_NERTAG = {
    "Isik": "PER",
    "KO_koht": "LOC_ORG",
    "KO_org": "LOC_ORG",
    "Koht": "LOC",
    "Org": "ORG",
    "Muu": "MISC",
    "Teadmata": "MISC",
    "ese": "MISC",
}


def _entity_word_indexes(words, first, end_index):
    """Return the indexes of the words forming the entity that starts at word `first`.

    A word that ends exactly at `end_index` is a one-word entity; in every other
    case the entity is taken to be that word plus the following one (if there is
    a following word).
    """
    # NOTE(review): an entity is never given more than two words, so annotations
    # covering three or more words are truncated. This mirrors the previous
    # behaviour, which also served annotations whose end offset matches no word
    # end - confirm the intended handling before extending it.
    if words[first].end == end_index or first + 1 >= len(words):
        return [first]
    return [first, first + 1]


output_directory = os.path.join(cwd, "vallakohtufailid_json")

for directory in directories:
    path = os.path.join(cwd, directory)
    for file in os.listdir(path):
        if not file.endswith(".txt"):
            continue
        # splitext keeps file names that contain additional dots intact
        stem = os.path.splitext(file)[0]
        with open(os.path.join(path, file), 'r', encoding="utf-8") as txt, \
                open(os.path.join(path, stem + ".ann"), 'r', encoding="utf-8") as ann:
            textfile = txt.read()
            fixed_annotations = collect_annotations(ann)

        # converting the text from the .txt file into an EstNLTK Text object and giving it the "words" layer
        text = Text(textfile)
        text.meta['origin_directory'] = str(directory)
        text = text.tag_layer(['words'])
        words = text.words

        gold_ner_layer = Layer(name="gold_ner", text_object=text, attributes=['nertag'])
        gold_wordner_layer = Layer(name="gold_wordner", text_object=text, attributes=['nertag'], parent="words")

        # Keyed by the annotation identifier: a later line with the same identifier
        # replaces the earlier one. The annotated text itself is not needed below.
        annotation_dictionary = {}
        for tag, start, end, _entity, trigger in fixed_annotations:
            annotation_dictionary[trigger] = (tag, int(start), int(end))

        dictionary_for_wordner = {}  # word index -> "B-<TAG>" / "I-<TAG>"
        for ner, ann_start, ann_end in annotation_dictionary.values():
            # counting preceding newlines to deduct from the indexes
            # NOTE(review): presumably the .ann offsets count one extra character per
            # line break compared to text.text - confirm against the annotation tool.
            preceding_newlines = text.text[0:ann_start].count("\n")
            startIndex = ann_start - preceding_newlines  # starting index from the .ann file
            endIndex = ann_end - preceding_newlines  # ending index from the .ann file

            nertag = ANN_TAG_TO_NERTAG.get(ner)

            for i, word in enumerate(words):  # every word, including the last one
                if word.start != startIndex:
                    continue
                covered = _entity_word_indexes(words, i, endIndex)
                base_span = EnvelopingBaseSpan([words[k].base_span for k in covered])
                new_span = EnvelopingSpan(base_span, layer=gold_ner_layer)

                if nertag is not None:
                    new_span.add_annotation(Annotation(new_span, nertag=nertag))
                    # first word of the entity gets B-, the remaining ones I-
                    for position, k in enumerate(covered):
                        dictionary_for_wordner[k] = ("B-" if position == 0 else "I-") + nertag

                try:
                    gold_ner_layer.add_span(new_span)
                except Exception:
                    # Any failure of add_span ends up here; the message assumes the
                    # cause is that the span is already present in the layer.
                    print(f"Tekkis probleem reaga {new_span}, mis pärineb failist {file}: see Span juba eksisteerib.")
                else:
                    break
        text.add_layer(gold_ner_layer)

        # Every word gets exactly one label; words outside of entities are "O".
        for i, word in enumerate(words):
            new_span = Span(base_span=word.base_span, layer=gold_wordner_layer)
            new_span.add_annotation(Annotation(new_span, nertag=str(dictionary_for_wordner.get(i, "O"))))
            gold_wordner_layer.add_span(new_span)

        text.add_layer(gold_wordner_layer)
        os.makedirs(output_directory, exist_ok=True)
        text_to_json(text, file=os.path.join(output_directory, stem + ".json"))

Tekkis probleem reaga EnvelopingSpan(['Peter', 'Barnabas'], [{'nertag': 'PER'}]), mis pärineb failist Harju_Hageri_Kohila_id4010_1890a.txt: see Span juba eksisteerib.
Tekkis probleem reaga EnvelopingSpan(['Peeter', 'Laas'], [{'nertag': 'PER'}]), mis pärineb failist Harju_Juuru_Juuru_id556_1877a.txt: see Span juba eksisteerib.
Tekkis probleem reaga EnvelopingSpan(['Toomas', 'Rassohw'], [{'nertag': 'PER'}]), mis pärineb failist Harju_Juuru_Kaiu_id12588_1883a.txt: see Span juba eksisteerib.
Tekkis probleem reaga EnvelopingSpan(['Mik', 'Lewet'], [{'nertag': 'PER'}]), mis pärineb failist Harju_Kose_Triigi_id9684_1869a.txt: see Span juba eksisteerib.
Tekkis probleem reaga EnvelopingSpan(['Jaan', 'Anni'], [{'nertag': 'PER'}]), mis pärineb failist Tartu_Kodavere_Pala_id22870_1872a.txt: see Span juba eksisteerib.
Tekkis probleem reaga EnvelopingSpan(['Jaan', 'Anni'], [{'nertag': 'PER'}]), mis pärineb failist Tartu_Kodavere_Pala_id22870_1872a.txt: see Span juba eksisteerib.
Tekkis probleem reaga