In [178]:
import os
import re

from estnltk import EnvelopingBaseSpan
from estnltk import Text, Layer, Annotation, EnvelopingSpan, Span

In [179]:
with open("vallakohus_esimene\Harju_Hageri_Kohila_id20604_1883a.ann", encoding="UTF8") as annotated_file:
    annotation = annotated_file.read()
with open("vallakohus_esimene\Harju_Hageri_Kohila_id20604_1883a.txt", encoding="UTF8") as txt:
    textfile = txt.read()

In [180]:
print(annotation)

T1	Isik 34 42	J. Krais
T2	Isik 56 72	Juhan Lesserberg
T3	Isik 76 86	Juhan Asse
T4	Isik 146 158	J.Lesserberg
T5	Isik 190 200	Juhan Asse
T6	Isik 231 241	Lesserberg
T7	Isik 357 373	Juhan Lesserberg
T8	Isik 418 428	Juhan Asse
T9	Isik 584 600	Juhan Lesserberg
T10	Isik 740 750	Juhan Asse



In [181]:
# converting the text form .txt file into an EstNLTK Text object and giving it the "words" layer
text = Text(textfile)
text = text.tag_layer(['words'])

In [182]:
# creating new layers to add to the Text object later on
gold_ner_layer = Layer(name="gold_ner", text_object=text, attributes=['nertag'])
gold_wordner_layer = Layer(name="gold_wordner", text_object=text, attributes=['nertag'], parent="words")


In [183]:
# creating a dictionary to hold the annotation data, the dictionary looks as follows:
# {"TRIGGER": ['NERTAG INDEX1 INDEX2', 'NAMED-ENTITY'], ...}
annotation_dictionary = {}

splitted_annotation = annotation.split("\n")
for row in splitted_annotation:
    splitted_row = row.split("\t")
    if (len(splitted_row) == 3):
        trigger = splitted_row[0]
        location = splitted_row[1]
        entity = splitted_row[2]
        annotation_dictionary[trigger] = [location, entity]
    else:
        print("Rea '" + row + "' lugemisel tekkis probleem.")

Rea '' lugemisel tekkis probleem.


In [184]:
# looping through every key and value in dictionary to add to the layer
for key in annotation_dictionary:    
    value = annotation_dictionary.get(key)
    print(key, value)

T1 ['Isik 34 42', 'J. Krais']
T2 ['Isik 56 72', 'Juhan Lesserberg']
T3 ['Isik 76 86', 'Juhan Asse']
T4 ['Isik 146 158', 'J.Lesserberg']
T5 ['Isik 190 200', 'Juhan Asse']
T6 ['Isik 231 241', 'Lesserberg']
T7 ['Isik 357 373', 'Juhan Lesserberg']
T8 ['Isik 418 428', 'Juhan Asse']
T9 ['Isik 584 600', 'Juhan Lesserberg']
T10 ['Isik 740 750', 'Juhan Asse']


In [185]:
dictionary_for_wordner = {} #the dictionary for wordner

for key in annotation_dictionary:
    name = []
    value = annotation_dictionary.get(key)
    ner, loc = value[0].split(" ")[0], (value[0].split(" ")[1], value[0].split(" ")[2])
    preceding_newlines = text.text[0:int(loc[0])].count("\n") #counting preceding newlines to deduct from the indexes
    
    startIndex = int(loc[0]) - int(preceding_newlines) #starting index form the .ann file
    endIndex = int(loc[1]) - int(preceding_newlines) #ending index from the .ann file
    
    for i in range(0, len(text.words) - 1): #looping through every word in text.words
        if text.words[i].start == startIndex: #if the startindex is same in the .ann file and in the loop, add an new span 
            if text.words[i].end == endIndex:
                base_span = EnvelopingBaseSpan([text.words[i].base_span])
                name = [text.words[i]] #adding the name element(s) to a list to use in the wordner dictionary later on
            else:
                if text.words[i+1].end == endIndex: #if the next word's endindex is the same as the endindex in the .ann file, add a new span
                    name = [text.words[i], text.words[i+1]] 
                    base_span = EnvelopingBaseSpan([s.base_span for s in name])
                else:
                    for j in range(len(value[0].split(" ") - 1)):
                        name.append(text.words[i+j])
                base_span = EnvelopingBaseSpan([s.base_span for s in name])
                
            new_span = EnvelopingSpan(base_span, layer=gold_ner_layer)
            
            if ner == "Isik":
                new_span.add_annotation(Annotation(new_span, nertag="PER"))
                print(f"ISIK: {new_span} on lisatud.")
                for k in range(0, len(name)):
                    if k == 0:
                        dictionary_for_wordner[i] = "B-PER"
                        continue
                    else:
                        dictionary_for_wordner[i+k] = "I-PER"
            if ner == "KO_koht" or ner == "KO_org":
                new_span.add_annotation(Annotation(new_span, nertag="LOC"))
                new_span.add_annotation(Annotation(new_span, nertag="ORG"))
                print(f"LOC_ORG: {new_span} on lisatud.")
                for k in range(0, len(name)):
                    if k == 0:
                        dictionary_for_wordner[i] = "B-LOC"
                        dictionary_for_wordner[i] = "B-ORG"
                        continue
                        
                    else:
                        dictionary_for_wordner[i+k] = "I-LOC"
                        dictionary_for_wordner[i+k] = "I-ORG"
            if ner == "Koht":
                new_span.add_annotation(Annotation(new_span, nertag="LOC"))
                print(f"LOC: {new_span} on lisatud.")
                for k in range(0, len(name)):
                    if k == 0:
                        dictionary_for_wordner[i] = "B-LOC"
                        continue
                    else:
                        dictionary_for_wordner[i+k] = "I-LOC"
            if ner == "Org":
                new_span.add_annotation(Annotation(new_span, nertag="ORG"))
                print(f"ORG: {new_span} on lisatud.")
                for k in range(0, len(name)):
                    if k == 0:
                        dictionary_for_wordner[i] = "B-ORG"
                        continue
                    else:
                        dictionary_for_wordner[i+k] = "I-ORG"
            if ner == "Muu" or ner == "Teadmata" or ner == "ese":
                new_span.add_annotation(Annotation(new_span, nertag="MISC"))
                print(f"MISC: {new_span} on lisatud.")
                for k in range(0, len(name)):
                    if k == 0:
                        dictionary_for_wordner[i] = "B-MISC"
                        continue
                    else:
                        dictionary_for_wordner[i+k] = "I-MISC"
            gold_ner_layer.add_span(new_span)
            break
text.add_layer(gold_ner_layer)

ISIK: EnvelopingSpan(['J. Krais'], [{'nertag': 'PER'}]) on lisatud.
ISIK: EnvelopingSpan(['Juhan', 'Lesserberg'], [{'nertag': 'PER'}]) on lisatud.
ISIK: EnvelopingSpan(['Juhan', 'Asse'], [{'nertag': 'PER'}]) on lisatud.
ISIK: EnvelopingSpan(['J.Lesserberg'], [{'nertag': 'PER'}]) on lisatud.
ISIK: EnvelopingSpan(['Juhan', 'Asse'], [{'nertag': 'PER'}]) on lisatud.
ISIK: EnvelopingSpan(['Lesserberg'], [{'nertag': 'PER'}]) on lisatud.
ISIK: EnvelopingSpan(['Juhan', 'Lesserberg'], [{'nertag': 'PER'}]) on lisatud.
ISIK: EnvelopingSpan(['Juhan', 'Asse'], [{'nertag': 'PER'}]) on lisatud.
ISIK: EnvelopingSpan(['Juhan', 'Lesserberg'], [{'nertag': 'PER'}]) on lisatud.
ISIK: EnvelopingSpan(['Juhan', 'Asse'], [{'nertag': 'PER'}]) on lisatud.


In [186]:
for i in range(0, len(text.words) - 1):
    for key in dictionary_for_wordner.keys():
        new_span = Span(base_span=text.words[i].base_span, layer=gold_wordner_layer)

        if i == key:
            new_span.add_annotation(Annotation(new_span, nertag=str(dictionary_for_wordner.get(key))))
            gold_wordner_layer.add_span(new_span)
            break
        else:
            if i in keys:
                continue
            else:
                new_span.add_annotation(Annotation(new_span, nertag="O"))
            gold_wordner_layer.add_span(new_span)
            break

text.add_layer(gold_wordner_layer)

In [187]:
text.gold_ner

layer name,attributes,parent,enveloping,ambiguous,span count
gold_ner,nertag,,,False,10

text,nertag
['J. Krais'],PER
"['Juhan', 'Lesserberg']",PER
"['Juhan', 'Asse']",PER
['J.Lesserberg'],PER
"['Juhan', 'Asse']",PER
['Lesserberg'],PER
"['Juhan', 'Lesserberg']",PER
"['Juhan', 'Asse']",PER
"['Juhan', 'Lesserberg']",PER
"['Juhan', 'Asse']",PER


In [188]:
text.gold_wordner

layer name,attributes,parent,enveloping,ambiguous,span count
gold_wordner,nertag,words,,False,129

text,nertag
Astus,O
kohtu,O
ette,O
Koolivöörmünder,O
J. Krais,B-PER
ja,O
kaebas,O
et,O
Juhan,B-PER
Lesserberg,I-PER
