In [2]:
import os
import re

from modules.preprocessing_protocols import preprocess_text
from estnltk import EnvelopingBaseSpan
from estnltk import Text, Layer, Annotation, EnvelopingSpan, Span
from estnltk.converters import text_to_json
from estnltk.layer_operations import extract_sections
from estnltk.layer_operations import flatten

In [2]:
# If the annotation contains newline character, then indexes will contain ';' at the linebreak (e.g. 388 393;394 398 )
indexes_on_line_split = re.compile(r' (\d+) (\d+;\d+ ){1,}(\d+)$')

def collect_annotations( in_f ):
    annotations = []
    split_lines_ahead = 0
    for line in in_f:
        line = line.rstrip('\n')
        items = line.split('\t')
        if split_lines_ahead > 0:
            split_lines_ahead -= 1
            last_item = annotations[-1]
            new_tuple = (last_item[0],last_item[1],last_item[2],(last_item[3]+line),last_item[4])
            annotations[-1] = new_tuple
            continue
        if len(items) == 3:
            indexes_str = items[1]
            if indexes_str.count(';') > 0:
                split_lines_ahead += indexes_str.count(';')
            indexes_str = indexes_on_line_split.sub(' \\1 \\3', indexes_str)
            tag, start, end = indexes_str.split()
            annotations.append( (tag, start, end, items[2], items[0]) )
    seen = set()
    removed_duplicates_annotations = []
    for a, b, c, d, e in annotations:
        if not b in seen:
            seen.add(b)
            removed_duplicates_annotations.append((a, b, c, d, e))
        else:
            for index, item in enumerate(removed_duplicates_annotations):
                if item[1] == b and item[2] > c:
                    tuple_without_n = (a, b, c, d, e)
                    item = tuple_without_n
                    removed_duplicates_annotations[index] = item
                elif item[1] == b and item[2] < c:
                    tuple_without_n = (a, b, item[2], d, e)
                    item = tuple_without_n
                    removed_duplicates_annotations[index] = item
                else:
                    continue
    
    for index, item in enumerate(removed_duplicates_annotations):
        if "\xa0" in item[3]:
            replaced = re.sub(r'\s\s+', r' ', item[3].replace(u'\xa0', u' '))
            removed_duplicates_annotations[index] = ( item[0], item[1], item[2], replaced, item[4] )
    
    annotations = sorted(list(set(removed_duplicates_annotations)), key=lambda x: int(x[1]))
    return annotations

rownr = 1
directories = ["vallakohus_esimene/", "vallakohus_teine/", "vallakohus_kolmas/", "vallakohus_neljas/"]
for directory in directories:
    path = "./vallakohtufailid/" + directory
    for file in os.listdir(path):
        if file.endswith(".txt"):
            with open(path + file, 'r', encoding="UTF-8") as txt, \
                 open(path + file.split(".")[0] + ".ann", 'r', encoding="UTF-8") as ann:
                textfile = txt.read().replace(u'\xa0', ' ')
                
                if file == "Tartu_V6nnu_Ahja_id3502_1882a.txt":
                    textfile = textfile.replace('..', '. .')
                
                dictionary_for_wordner = dict()
                
                # Converting the text form .txt file into an EstNLTK Text object and giving it the "morph_analysis" layer
                text = Text(textfile)
                text.meta['origin_directory'] = str(directory)
                text = preprocess_text(text)
                
                # Creating NER layers
                gold_ner_layer = Layer(name="unflattened_gold_ner", text_object=text, attributes=['nertag'])
                gold_wordner_layer = Layer(name="unflattened_gold_wordner", text_object=text, attributes=['nertag'], parent="words")
                
                # Fixing annotations (some annotations are on different rows)
                fixed_annotations = collect_annotations(ann)

                # Collecting the annotations in a separate dictionary
                annotation_dictionary = {}
                for annotation in fixed_annotations:
                    trigger = annotation[4]
                    location = annotation[0] + " " + annotation[1] + " " + annotation[2]
                    entity = annotation[3]
                    annotation_dictionary[trigger] = [location, entity]

                # Iterating through the keys (triggers) of the dictionary
                for key in annotation_dictionary:
                    name = []
                    
                    location, entity = annotation_dictionary.get(key)
    
                    ner, startIndex, endIndex = location.split(" ")
      
                    for i in range(len(text.words)):
                        if text.words[i].start == (int(startIndex) - text.text[:int(text.words[i].start)].count("\n")):  
                            
                            preceding_newlines = text.text[:int(text.words[i].start)].count("\n")
                            startIndex = int(startIndex) - int(preceding_newlines)
                            endIndex = int(endIndex) - int(preceding_newlines)
                            
                            # Exceptions in some files
                            if entity == "Gustav  Waddi" or (text.words[i].text == entity[:-1] and entity[-1] == " "):
                                endIndex -= 1
                            if text.words[i] == "..":
                                endindex -= 2
                            if "\n" in text.text[startIndex:endIndex] and entity != "Gustav  Waddi":
                                endIndex -= text.text[startIndex:endIndex].count("\n")
                            if entity == "Jaan Park" and text.words[i+1].text == "Park_":
                                endIndex += 1
                            
                            # Creating a base span based on the start index and end index.
                            if text.words[i].start == startIndex:
                                if text.words[i].end == endIndex:
                                    base_span = EnvelopingBaseSpan([text.words[i].base_span])
                                    name = [text.words[i]]                                    
                                else:
                                    if text.words[i+1].end == endIndex: 
                                        name = [text.words[i], text.words[i+1]]
                                    else:
                                        iterator = 0
                                        while True:
                                            if text.words[i+iterator].end == endIndex:
                                                name.append(text.words[i+iterator])
                                                iterator = 0
                                                break
                                            else:
                                                name.append(text.words[i+iterator])
                                                iterator += 1
                                
                                base_span = EnvelopingBaseSpan([s.base_span for s in name])
                                new_span = EnvelopingSpan(base_span, layer=gold_ner_layer)
                                
                                # Creating named entities based on aforementioned spans
                                if ner == "Isik":
                                    new_span.add_annotation(Annotation(new_span, nertag="PER"))
                                    for k in range(0, len(name)):
                                        if k == 0:
                                            dictionary_for_wordner[i] = "B-PER"
                                        else:
                                            dictionary_for_wordner[i+k] = "I-PER"
                                if ner == "KO_koht" or ner == "KO_org":
                                    new_span.add_annotation(Annotation(new_span, nertag="LOC_ORG"))
                                    for k in range(0, len(name)):
                                        if k == 0:
                                            dictionary_for_wordner[i] = "B-LOC_ORG"
                                        else:
                                            dictionary_for_wordner[i+k] = "I-LOC_ORG"
                                if ner == "Koht":
                                    new_span.add_annotation(Annotation(new_span, nertag="LOC"))
                                    for k in range(0, len(name)):
                                        if k == 0:
                                            dictionary_for_wordner[i] = "B-LOC"
                                        else:
                                            dictionary_for_wordner[i+k] = "I-LOC"
                                if ner == "Org":
                                    new_span.add_annotation(Annotation(new_span, nertag="ORG"))
                                    for k in range(0, len(name)):
                                        if k == 0:
                                            dictionary_for_wordner[i] = "B-ORG" 
                                        else:
                                            dictionary_for_wordner[i+k] = "I-ORG"
                                if ner == "Muu" or ner == "Teadmata" or ner == "ese":
                                    new_span.add_annotation(Annotation(new_span, nertag="MISC"))
                                    for k in range(0, len(name)):
                                        if k == 0:
                                            dictionary_for_wordner[i] = "B-MISC" 
                                        else:
                                            dictionary_for_wordner[i+k] = "I-MISC"
                                gold_ner_layer.add_span(new_span)
                            break
                text.add_layer(gold_ner_layer)
                
                # Finding the difference between annotations in the file and annotations on the text
                set1 = [re.sub(r'\s\s+', r' ', element[3].strip()) for element in fixed_annotations]
                set2 = [re.sub(r'\s\s+', r' ', element.enclosing_text.strip()) for element in text.unflattened_gold_ner]
                diff = list(list(set(set1)-set(set2)) + list(set(set2)-set(set1)))
                if (diff):
                    print(f"{rownr}. Probleem failis {file}, mille erinevus on {diff}")
                    rownr += 1
                
                # Creating WordNer annotations:
                for i in range(0, len(text.words)):
                    for key in dictionary_for_wordner.keys():
                        new_span = Span(base_span=text.words[i].base_span, layer=gold_wordner_layer)
                        if i == key:
                            new_span.add_annotation(Annotation(new_span, nertag=str(dictionary_for_wordner.get(key))))
                            gold_wordner_layer.add_span(new_span)
                            break
                        else:
                            if i in dictionary_for_wordner.keys():
                                continue
                            else:
                                new_span.add_annotation(Annotation(new_span, nertag="O"))
                        gold_wordner_layer.add_span(new_span)
                        break
                
                text.add_layer(gold_wordner_layer)
                
                # Flattening the layers to save up on space.
                text.add_layer(flatten(text['unflattened_gold_ner'], 'gold_ner'))
                text.add_layer(flatten(text['unflattened_gold_wordner'], 'gold_wordner'))
                text.pop_layer('words')
                text.pop_layer('tokens')
                text.pop_layer('unflattened_gold_ner')
                text.gold_wordner.ambiguous = False
                text.gold_ner.ambiguous = False
                # Saving the new json files to a separate folder.
                text_to_json(text, file="./vallakohtufailid-json-flattened/" + file.replace(".txt", ".json"))
    print(f"Kaust {directory} on läbitud.")
print("Programm on lõpetanud oma töö.")

1. Probleem failis Viljandi_Paistu_Holstre_id9042_1836a.txt, mille erinevus on ['Jaan Park_']
2. Probleem failis Tartu_Kodavere_Pala_id22870_1872a.txt, mille erinevus on ['Jaan\nAnni']
Kaust vallakohus_esimene/ on läbitud.
Kaust vallakohus_teine/ on läbitud.
Kaust vallakohus_kolmas/ on läbitud.
3. Probleem failis Tartu_Laiuse_Kivij2rve_id13164_1866a.txt, mille erinevus on ['Thomas Peterson']
Kaust vallakohus_neljas/ on läbitud.
Programm on lõpetanud oma töö.
