# Convert BRAT-annotated files to gold-standard .json files

In [1]:
import os
import re

from modules.preprocessing_protocols import preprocess_text
from modules.collect_annotations import collect_annotations
from modules.get_layer_difference import find_difference

from estnltk import EnvelopingBaseSpan
from estnltk import Text, Layer, Annotation, EnvelopingSpan, Span
from estnltk.converters import text_to_json
from estnltk.layer_operations import flatten

<b>Define the directories that contain the BRAT-annotated files. Each directory (for example, <code>vallakohus_esimene</code> or <code>vallakohus_teine</code>) holds pairs of files: a \*.txt file and its matching \*.ann file:</b>

In [2]:
# Sub-directories that are processed, in this order.
directories = [
    "vallakohus_esimene",
    "vallakohus_teine",
    "vallakohus_kolmas",
    "vallakohus_neljas",
]

<b>Define where these directories are located relative to the Jupyter Notebook, and where the resulting .json files will be saved:</b>

In [3]:
# Input root: the parent folder of the annotated sub-directories.
vallakohtufailid_location = os.path.join(os.pardir, "data", "vallakohtufailid")
# Output folder for the generated .json files.
json_files_location = os.path.join(os.pardir, "data", "vallakohtufailid-json-flattened")

<b>Define how the entity labels used in the BRAT annotations map to the named-entity tags of the gold standard:</b>

In [4]:
# Maps each BRAT entity label to the NER tag written to the gold layers.
# Written as "tag -> labels" so it is easy to see which labels share a tag;
# the resulting dict is keyed by label.
named_entities = {
    brat_label: ner_tag
    for ner_tag, brat_labels in (
        ("PER", ("Isik",)),
        ("LOC_ORG", ("KO_koht", "KO_org")),
        ("LOC", ("Koht",)),
        ("ORG", ("Org",)),
        ("MISC", ("Muu", "Teadmata", "ese")),
    )
    for brat_label in brat_labels
}

---

In [5]:
# Convert every BRAT-annotated *.txt / *.ann pair found in `directories` into
# one .json file holding the text plus two flattened layers:
#   gold_ner      - one enveloping span per annotated entity
#   gold_wordner  - one tag per word ("B-<tag>", "I-<tag>" or "O")

# Create the output directory once, up front (missing parents included).
os.makedirs(json_files_location, exist_ok=True)

for directory in directories:
    path = os.path.join(vallakohtufailid_location, directory)
    files = [filename for filename in os.listdir(path) if filename.endswith('.txt')]
    for file in files:
        # Strip only the final extension, so names containing extra dots keep their full stem.
        file_stem = os.path.splitext(file)[0]

        with open(os.path.join(path, file), 'r', encoding="UTF-8") as txt_file:
            in_txt = txt_file.read().replace(u'\xa0', ' ')

        # NB! File-specific exception: put a space between consecutive dots
        # (presumably so that they are tokenized separately - TODO confirm).
        if file == "Tartu_V6nnu_Ahja_id3502_1882a.txt":
            in_txt = in_txt.replace('..', '. .')

        # word index -> "B-<tag>" / "I-<tag>" for every word that is part of an entity
        dictionary_for_wordner = dict()

        # Convert text into EstNLTK Text object and preprocess it
        text = Text(in_txt)
        text.meta['origin_directory'] = str(directory)
        preprocess_text(text)

        words = text.words
        raw_text = text.text
        # Number of newlines before each word start, computed once per file
        # instead of once per word for every annotation.
        newlines_before = [raw_text[:int(word.start)].count("\n") for word in words]

        # Create NER layers
        gold_ner_layer = Layer(name="unflattened_gold_ner", text_object=text, attributes=['nertag'])
        gold_wordner_layer = Layer(name="unflattened_gold_wordner", text_object=text, attributes=['nertag'], parent="words")

        # Fix annotations
        with open(os.path.join(path, file_stem + ".ann"), 'r', encoding="UTF-8") as in_ann:
            fixed_annotations = collect_annotations(in_ann)

        # Collect the annotations in a separate dictionary keyed by annotation[4];
        # a later annotation with the same key replaces an earlier one.
        annotation_dictionary = {}
        for annotation in fixed_annotations:
            annotation_dictionary[annotation[4]] = (annotation[0], annotation[1], annotation[2], annotation[3])

        # Turn each annotation into an enveloping span over the words it covers
        for ner, raw_start, raw_end, entity in annotation_dictionary.values():
            for i, word in enumerate(words):
                preceding_newlines = newlines_before[i]

                # The annotation offsets are compared after subtracting the
                # newlines that precede the candidate word.
                if word.start != int(raw_start) - preceding_newlines:
                    continue

                startIndex = int(raw_start) - preceding_newlines
                endIndex = int(raw_end) - preceding_newlines

                # NB! Exceptions in some of the files:
                if entity == "Gustav  Waddi" or (word.text == entity[:-1] and entity[-1] == " "):
                    endIndex -= 1
                # NOTE(review): this compares a span object with a str and so
                # presumably never matches; the guard is kept as it was. Only the
                # misspelled `endindex` in the body (a NameError if ever reached)
                # is corrected. Confirm whether `word.text == ".."` was intended.
                if word == "..":
                    endIndex -= 2
                if "\n" in raw_text[startIndex:endIndex] and entity != "Gustav  Waddi":
                    endIndex -= raw_text[startIndex:endIndex].count("\n")
                if entity == "Jaan Park" and i + 1 < len(words) and words[i + 1].text == "Park_":
                    endIndex += 1

                # Collect consecutive words, starting at the matched word, up to
                # and including the word that ends exactly at endIndex.
                name = []
                for j in range(i, len(words)):
                    name.append(words[j])
                    if words[j].end == endIndex:
                        break
                else:
                    # No word ends at endIndex: skip this annotation instead of
                    # running past the last word (IndexError).
                    name = []
                    print(f"(!) No word ends at offset {endIndex} for annotation {entity!r} in {file}; annotation skipped")

                if name:
                    # Create a base span based on the collected words
                    base_span = EnvelopingBaseSpan([s.base_span for s in name])
                    new_span = EnvelopingSpan(base_span, layer=gold_ner_layer)

                    # Create named entities based on aforementioned spans
                    tag = named_entities[ner]
                    new_span.add_annotation(Annotation(new_span, nertag=tag))
                    dictionary_for_wordner[i] = f'B-{tag}'
                    for k in range(1, len(name)):
                        dictionary_for_wordner[i + k] = f'I-{tag}'

                    gold_ner_layer.add_span(new_span)

                # Only the first word whose start matches is considered
                break
        text.add_layer(gold_ner_layer)

        # Find the difference between annotations in the file and annotations on the text
        find_difference(file, fixed_annotations, text.unflattened_gold_ner)

        # Create wordner annotations: every word gets a tag, "O" when it is not
        # part of any entity (also when the file produced no entities at all).
        for i, word in enumerate(words):
            new_span = Span(base_span=word.base_span, layer=gold_wordner_layer)
            new_span.add_annotation(Annotation(new_span, nertag=dictionary_for_wordner.get(i, "O")))
            gold_wordner_layer.add_span(new_span)

        text.add_layer(gold_wordner_layer)

        # Flatten the layers and remove extra layers to save space
        remove_layers = ['words', 'tokens', 'unflattened_gold_ner']

        text.add_layer(flatten(text['unflattened_gold_ner'], 'gold_ner'))
        text.add_layer(flatten(text['unflattened_gold_wordner'], 'gold_wordner'))

        for layer in remove_layers:
            text.pop_layer(layer)

        text.gold_wordner.ambiguous = False
        text.gold_ner.ambiguous = False

        # Save the new json file to the output directory
        text_to_json(text, file=os.path.join(json_files_location, file_stem + ".json"))

    print(f"(!) Directory {directory} is completed")
print("(!) The code has finished!")

(!) Difference in Viljandi_Paistu_Holstre_id9042_1836a.txt correct annotations and output annotations: ['Jaan Park_']
(!) Difference in Tartu_Kodavere_Pala_id22870_1872a.txt correct annotations and output annotations: ['Jaan\nAnni']
(!) Directory vallakohus_esimene is completed
(!) Directory vallakohus_teine is completed
(!) Directory vallakohus_kolmas is completed
(!) Difference in Tartu_Laiuse_Kivij2rve_id13164_1866a.txt correct annotations and output annotations: ['Thomas Peterson']
(!) Directory vallakohus_neljas is completed
(!) The code has finished!
