In [9]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [21]:
import os
import re

from estnltk import EnvelopingBaseSpan
from estnltk import Text, Layer, Annotation, EnvelopingSpan, Span
from estnltk.converters import text_to_json

In [73]:
# If the annotation contains newline character, then indexes will contain ';' at the linebreak (e.g. 388 393;394 398 )
indexes_on_line_split = re.compile(r' (\d+) (\d+;\d+ ){1,}(\d+)$')

def collect_annotations( in_f ):
    annotations = []
    split_lines_ahead = 0
    for line in in_f:
        line = line.rstrip('\n')
        items = line.split('\t')
        if split_lines_ahead > 0:
            split_lines_ahead -= 1
            last_item = annotations[-1]
            new_tuple = (last_item[0],last_item[1],last_item[2],last_item[3]+line,last_item[4])
            annotations[-1] = new_tuple
            continue
        if len(items) == 3:
            indexes_str = items[1]
            if indexes_str.count(';') > 0:
                split_lines_ahead += indexes_str.count(';')
            indexes_str = indexes_on_line_split.sub(' \\1 \\3', indexes_str)
            tag, start, end = indexes_str.split()
            annotations.append( (tag, start, end, items[2], items[0]) )
    seen = set()
    removed_duplicates_annotations = []
    for a, b, c, d, e in annotations:
        if not b in seen:
            seen.add(b)
            removed_duplicates_annotations.append((a, b, c, d, e))
        else:
            for index, item in enumerate(removed_duplicates_annotations):
                if item[1] == b and item[2] > c:
                    tuple_without_n = (a, b, c, d, e)
                    item = tuple_without_n
                    removed_duplicates_annotations[index] = item
                elif item[1] == b and item[2] < c:
                    tuple_without_n = (a, b, item[2], d, e)
                    item = tuple_without_n
                    removed_duplicates_annotations[index] = item
                else:
                    continue
                
    annotations = sorted(list(set(removed_duplicates_annotations)), key=lambda x: int(x[1]))
    return annotations

cwd = os.getcwd()

rownr = 1
directories = ["vallakohus_esimene", "vallakohus_teine", "vallakohus_kolmas", "vallakohus_neljas"]
for directory in directories:
    path = cwd + "\\" + directory + "\\"
    for file in os.listdir(path):
        if file.endswith(".txt"):
            with open(path + file, 'r', encoding="utf-8") as txt, open(path + file.split(".")[0] + ".ann", 'r', encoding="utf-8") as ann:
                textfile = txt.read().replace(u"\xa0", u" ")

                # converting the text form .txt file into an EstNLTK Text object and giving it the "words" layer
                text = Text(textfile)
                text.meta['origin_directory'] = str(directory)
                text = text.tag_layer(['words'])

                gold_ner_layer = Layer(name="gold_ner", text_object=text, attributes=['nertag'])
                gold_wordner_layer = Layer(name="gold_wordner", text_object=text, attributes=['nertag'], parent="words")

                fixed_annotations = collect_annotations(ann)

                annotation_dictionary = {}
                for annotation in fixed_annotations:
                    trigger = annotation[4]
                    location = annotation[0] + " " + annotation[1] + " " + annotation[2]
                    entity = annotation[3]
                    annotation_dictionary[trigger] = [location, entity]

                for key in annotation_dictionary:
                    name = []
                    value = annotation_dictionary.get(key)
                    
                    entity = value[1]
                    ner, startIndex, endIndex = value[0].split(" ")
                    
                    if entity.count(".") > 1: #TODO: SIIN ON PROBLEEM
                        entity.replace(".", " . ")

                    #TODO: Probleem on siin: ma ei saa kontrollida preceding_newlinese kuidagi, vaid ma pean programmi ümber kirjutama nii, et mitte if tekstisõna index on sama, mis algusindeks, vaid
                    #tekstisõna = annotationi sõna algusega ja siis arvutama preceding_newlinese. Praegu on .ann failis näiteks algusindeks 705, kui ma loen enne seda kokku 46 preceding newline'i sellest
                    #tekstifailist, siis neid on liiga palju, sest teksti enda indeksid on juba teistsugused...
                    
                    
                    for i in range(0, len(text.words)):
                        if text.words[i].text == entity.split(" ")[0]:
                            
                            preceding_newlines = text.text[:int(text.words[i].start)].count("\n")
                            startIndex = int(startIndex) - int(preceding_newlines)
                            endIndex = int(endIndex) - int(preceding_newlines)
                            
                            if text.words[i].start == startIndex:
                                if text.words[i].end == endIndex:
                                    base_span = EnvelopingBaseSpan([text.words[i].base_span])
                                    name = [text.words[i]] #adding the name element(s) to a list to use in the wordner dictionary later on
                                else:
                                    if text.words[i+1].end == endIndex: #if the next word's endindex is the same as the endindex in the .ann file, add a new span
                                        name = [text.words[i], text.words[i+1]]
                                    else:
                                        for j in range(len(entity.split(" "))):
                                            name.append(text.words[i+j])
                                    base_span = EnvelopingBaseSpan([s.base_span for s in name])
                                new_span = EnvelopingSpan(base_span, layer=gold_ner_layer)

                                dictionary_for_wordner = dict()
                                if ner == "Isik":
                                    new_span.add_annotation(Annotation(new_span, nertag="PER"))
                                    for k in range(0, len(name)):
                                        if k == 0:
                                            dictionary_for_wordner[i] = "B-PER" 
                                        else:
                                            dictionary_for_wordner[i+k] = "I-PER"
                                if ner == "KO_koht" or ner == "KO_org":
                                    new_span.add_annotation(Annotation(new_span, nertag="LOC_ORG"))
                                    for k in range(0, len(name)):
                                        if k == 0:
                                            dictionary_for_wordner[i] = "B-LOC_ORG"
                                        else:
                                            dictionary_for_wordner[i+k] = "I-LOC_ORG"
                                if ner == "Koht":
                                    new_span.add_annotation(Annotation(new_span, nertag="LOC"))
                                    for k in range(0, len(name)):
                                        if k == 0:
                                            dictionary_for_wordner[i] = "B-LOC"
                                        else:
                                            dictionary_for_wordner[i+k] = "I-LOC"
                                if ner == "Org":
                                    new_span.add_annotation(Annotation(new_span, nertag="ORG"))
                                    for k in range(0, len(name)):
                                        if k == 0:
                                            dictionary_for_wordner[i] = "B-ORG" 
                                        else:
                                            dictionary_for_wordner[i+k] = "I-ORG"
                                if ner == "Muu" or ner == "Teadmata" or ner == "ese":
                                    new_span.add_annotation(Annotation(new_span, nertag="MISC"))
                                    for k in range(0, len(name)):
                                        if k == 0:
                                            dictionary_for_wordner[i] = "B-MISC" 
                                        else:
                                            dictionary_for_wordner[i+k] = "I-MISC"
                                gold_ner_layer.add_span(new_span)
                                break
                text.add_layer(gold_ner_layer)
                if len(text.gold_ner) != len(fixed_annotations):
                    print(str(rownr) + ". " + str(file) + ":", len(text.gold_ner), len(fixed_annotations))
                    rownr += 1
                for i in range(0, len(text.words)):
                    for key in dictionary_for_wordner.keys():
                        new_span = Span(base_span=text.words[i].base_span, layer=gold_wordner_layer)

                        if i == key:
                            new_span.add_annotation(Annotation(new_span, nertag=str(dictionary_for_wordner.get(key))))
                            gold_wordner_layer.add_span(new_span)
                            break
                        else:
                            if i in dictionary_for_wordner.keys():
                                continue
                            else:
                                new_span.add_annotation(Annotation(new_span, nertag="O"))
                            gold_wordner_layer.add_span(new_span)
                            break
                
                text.add_layer(gold_wordner_layer)
                text_to_json(text, file=cwd + "\\vallakohtufailid_json\\" + file.replace(".txt", ".json"))
    print(f"Kaust {directory} on läbitud.")
print("Programm on lõpetanud oma töö.")

1. Harju_Hageri_Kohila_id10509_1871a.txt: 8 13
2. Harju_Hageri_Kohila_id1346_1888a.txt: 7 10
3. Harju_Hageri_Kohila_id20604_1883a.txt: 5 10
4. Harju_Hageri_Kohila_id21431_1885a.txt: 5 11
5. Harju_Hageri_Kohila_id23811_1873a.txt: 20 34
6. Harju_Hageri_Kohila_id3108_1885a.txt: 7 8
E. Jb. Rändith
E. Jb. Rändith
7. Harju_Hageri_Kohila_id4010_1890a.txt: 47 77
8. Harju_Hageri_Kohila_id4177_1883a.txt: 29 43
9. Harju_J6el2htme_J6el2htme_id7364_1868a.txt: 7 9
10. Harju_J6el2htme_J6el2htme_id7659_1870a.txt: 21 28
11. Harju_Juuru_Juuru_id19451_1886a.txt: 11 14
12. Harju_Juuru_Juuru_id19472_1887a.txt: 16 24
13. Harju_Juuru_Juuru_id23775_1873a.txt: 12 25
14. Harju_Juuru_Juuru_id556_1877a.txt: 5 9
15. Harju_Juuru_Kaiu_id12588_1883a.txt: 9 15
16. Harju_Juuru_Kaiu_id18571_1873a.txt: 10 18
17. Harju_Juuru_Kaiu_id3479_1886a.txt: 10 15
18. Harju_Juuru_Kaiu_id9788_1882a.txt: 11 16
19. Harju_Jyri_Rae_id3658_1888a.txt: 8 10
20. Harju_Keila_Saue_id14410_1886a.txt: 4 10
21. Harju_Kose_Kose-Uuem6isa_id3340_186

KeyboardInterrupt: 