In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [3]:
import os
import re

from estnltk import EnvelopingBaseSpan
from estnltk import Text, Layer, Annotation, EnvelopingSpan, Span
from estnltk.converters import text_to_json
from estnltk.layer_operations import extract_sections

In [4]:
# If the annotation contains newline character, then indexes will contain ';' at the linebreak (e.g. 388 393;394 398 )
indexes_on_line_split = re.compile(r' (\d+) (\d+;\d+ ){1,}(\d+)$')

def collect_annotations( in_f ):
    annotations = []
    split_lines_ahead = 0
    for line in in_f:
        line = line.rstrip('\n')
        items = line.split('\t')
        if split_lines_ahead > 0:
            split_lines_ahead -= 1
            last_item = annotations[-1]
            new_tuple = (last_item[0],last_item[1],last_item[2],last_item[3]+line,last_item[4])
            annotations[-1] = new_tuple
            continue
        if len(items) == 3:
            indexes_str = items[1]
            if indexes_str.count(';') > 0:
                split_lines_ahead += indexes_str.count(';')
            indexes_str = indexes_on_line_split.sub(' \\1 \\3', indexes_str)
            tag, start, end = indexes_str.split()
            annotations.append( (tag, start, end, items[2], items[0]) )
    seen = set()
    removed_duplicates_annotations = []
    for a, b, c, d, e in annotations:
        if not b in seen:
            seen.add(b)
            removed_duplicates_annotations.append((a, b, c, d, e))
        else:
            for index, item in enumerate(removed_duplicates_annotations):
                if item[1] == b and item[2] > c:
                    tuple_without_n = (a, b, c, d, e)
                    item = tuple_without_n
                    removed_duplicates_annotations[index] = item
                elif item[1] == b and item[2] < c:
                    tuple_without_n = (a, b, item[2], d, e)
                    item = tuple_without_n
                    removed_duplicates_annotations[index] = item
                else:
                    continue
                
    annotations = sorted(list(set(removed_duplicates_annotations)), key=lambda x: int(x[1]))
    return annotations

cwd = os.getcwd()

rownr = 1
directories = ["vallakohus_esimene", "vallakohus_teine", "vallakohus_kolmas", "vallakohus_neljas"]
for directory in directories:
    path = cwd + "/" + directory + "/"
    for file in os.listdir(path):
        if file.endswith(".txt"):
            with open(path + file, 'r', encoding="utf-8") as txt, open(path + file.split(".")[0] + ".ann", 'r', encoding="utf-8") as ann:
                textfile = txt.read().replace(u"\xa0", u" ")

                # converting the text form .txt file into an EstNLTK Text object and giving it the "words" layer
                text = Text(textfile)
                text.meta['origin_directory'] = str(directory)
                text = text.tag_layer(['words'])
                
                # creating NER layers
                gold_ner_layer = Layer(name="gold_ner", text_object=text, attributes=['nertag'])
                gold_wordner_layer = Layer(name="gold_wordner", text_object=text, attributes=['nertag'], parent="words")
                
                #fixing annotations 
                fixed_annotations = collect_annotations(ann)

                annotation_dictionary = {}
                for annotation in fixed_annotations:
                    trigger = annotation[4]
                    location = annotation[0] + " " + annotation[1] + " " + annotation[2]
                    entity = annotation[3]
                    annotation_dictionary[trigger] = [location, entity]

                for key in annotation_dictionary:
                    name = []
                    
                    location, entity = annotation_dictionary.get(key)
    
                    ner, startIndex, endIndex = location.split(" ")
        
                    entity = re.sub(r'  ', r' ', re.sub(r'([^a-zA-ZõäöüÄÖÕÜ ])', r' \1 ', entity).rstrip())
                    
                    for i in range(len(text.words)):
                        if text.words[i].start == (int(startIndex) - text.text[:int(text.words[i].start)].count("\n")):  
                            preceding_newlines = text.text[:int(text.words[i].start)].count("\n")
                            startIndex = int(startIndex) - int(preceding_newlines)
                            endIndex = int(endIndex) - int(preceding_newlines)
                            
                            if text.words[i].start == startIndex:
                                if text.words[i].end == endIndex:
                                    base_span = EnvelopingBaseSpan([text.words[i].base_span])
                                    name = [text.words[i]]                                    
                                else:
                                    if text.words[i+1].end == endIndex: 
                                        name = [text.words[i], text.words[i+1]]
                                    else:
                                        entity = entity.split(" ")
                                        for j in range(len(entity)):
                                            if len(entity[j]) != len(text.words[i+j].text):
                                                '''
                                                span1, span2 = text.words[i].text[:len(entity[j])], text.words[i].text[len(entity[j]):]
                                                text.words[i] = Text(span1).tag_layer().words[0]
                                                insertThis = Text(span2).tag_layer().words[0]
                                                text.words.spans.insert(i+1, insertThis)
                                                name.append(text.words[i])
                                                '''
                                                something = startIndex + len(entity[j])
                                                sections = extract_sections(text, sections=[(startIndex, something), (something, something + len(text.words[i+j].text) - len(entity[j]))])
                                                #PROBLEEM SIIN
                                                print(sections[0], sections[1])
                                                name.append(sections[0])
                                            else:
                                                name.append(text.words[i+j])

                                base_span = EnvelopingBaseSpan([s.base_span for s in name])
                                new_span = EnvelopingSpan(base_span, layer=gold_ner_layer)

                                dictionary_for_wordner = dict()
                                
                                if ner == "Isik":
                                    new_span.add_annotation(Annotation(new_span, nertag="PER"))
                                    for k in range(0, len(name)):
                                        if k == 0:
                                            dictionary_for_wordner[i] = "B-PER"
                                        else:
                                            dictionary_for_wordner[i+k] = "I-PER"
                                if ner == "KO_koht" or ner == "KO_org":
                                    new_span.add_annotation(Annotation(new_span, nertag="LOC_ORG"))
                                    for k in range(0, len(name)):
                                        if k == 0:
                                            dictionary_for_wordner[i] = "B-LOC_ORG"
                                        else:
                                            dictionary_for_wordner[i+k] = "I-LOC_ORG"
                                if ner == "Koht":
                                    new_span.add_annotation(Annotation(new_span, nertag="LOC"))
                                    for k in range(0, len(name)):
                                        if k == 0:
                                            dictionary_for_wordner[i] = "B-LOC"
                                        else:
                                            dictionary_for_wordner[i+k] = "I-LOC"
                                if ner == "Org":
                                    new_span.add_annotation(Annotation(new_span, nertag="ORG"))
                                    for k in range(0, len(name)):
                                        if k == 0:
                                            dictionary_for_wordner[i] = "B-ORG" 
                                        else:
                                            dictionary_for_wordner[i+k] = "I-ORG"
                                if ner == "Muu" or ner == "Teadmata" or ner == "ese":
                                    new_span.add_annotation(Annotation(new_span, nertag="MISC"))
                                    for k in range(0, len(name)):
                                        if k == 0:
                                            dictionary_for_wordner[i] = "B-MISC" 
                                        else:
                                            dictionary_for_wordner[i+k] = "I-MISC"
                                gold_ner_layer.add_span(new_span)
                            break
                text.add_layer(gold_ner_layer)
                
                set1 = list()
                set2 = list()
                for TUPLE in fixed_annotations:
                    set1.append(TUPLE[3])
                for NER in text.gold_ner:
                    set2.append(NER.enclosing_text)
                
                if set1 != set2:
                    print(rownr, "Probleem failis", file, "pikkused:", len(set1), len(set2))
                    rownr += 1
                
                for i in range(0, len(text.words)):
                    for key in dictionary_for_wordner.keys():
                        new_span = Span(base_span=text.words[i].base_span, layer=gold_wordner_layer)
                        if i == key:
                            new_span.add_annotation(Annotation(new_span, nertag=str(dictionary_for_wordner.get(key))))
                            gold_wordner_layer.add_span(new_span)
                            break
                        else:
                            if i in dictionary_for_wordner.keys():
                                continue
                            else:
                                new_span.add_annotation(Annotation(new_span, nertag="O"))
                        gold_wordner_layer.add_span(new_span)
                        break
                
                text.add_layer(gold_wordner_layer)
                text_to_json(text, file=cwd + "/vallakohtufailid_json/" + file.replace(".txt", ".json"))
    print(f"Kaust {directory} on läbitud.")
print("Programm on lõpetanud oma töö.")

Text(text='Wändramees') Text(text='Hans')


AttributeError: 'Text' object has no layer 'base_span'