In [1]:
# Stretch the notebook container to the full browser width.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
def Diff(li1, li2):
    """Return the items that occur in exactly one of the two collections.

    Both arguments are converted to sets, so duplicates are ignored and the
    items must be hashable. The result lists the items found only in `li1`
    first, followed by the items found only in `li2`.
    """
    first, second = set(li1), set(li2)
    exclusive = list(first - second)
    exclusive.extend(second - first)
    return exclusive

In [3]:
import os
import re
import unicodedata

from estnltk import EnvelopingBaseSpan
from estnltk import Text, Layer, Annotation, EnvelopingSpan, Span
from estnltk.converters import text_to_json
from estnltk.layer_operations import extract_sections
from estnltk.taggers import Retagger

class TokenSplitter( Retagger ):
    """Retagger that cuts tokens in two at positions found by regular expressions.

    Every pattern has to define a symbolic group (named 'end' by default).
    When a pattern is found inside a token, the end of that group marks the
    position at which the token is divided.
    """
    conf_param = ['patterns', 'break_group_name']

    def __init__(self, patterns, break_group_name:str='end'):
        """Validate and store the configuration.

        :param patterns: list of compiled regular expressions; each must
            contain a symbolic group named `break_group_name`.
        :param break_group_name: name of the group whose end marks the split.
        :raises TypeError: if `break_group_name` is not a non-empty string,
            `patterns` is not a list, or a pattern fails the checks below.
        """
        # The 'tokens' layer is both the input and the layer being rewritten
        self.input_layers = ['tokens']
        self.output_layer = 'tokens'
        self.output_attributes = ()
        if not isinstance(break_group_name, str) or len(break_group_name) == 0:
            raise TypeError('(!) break_group_name should be a non-empty string.')
        self.break_group_name = break_group_name
        if not isinstance(patterns, list):
            raise TypeError('(!) patterns should be a list of compiled regular expressions.')
        # Duck-typing check: a usable pattern offers match(), search() and
        # a pattern attribute, and knows the configured symbolic group.
        for regex in patterns:
            requirements = (
                ('method match()', callable(getattr(regex, "match", None))),
                ('method search()', callable(getattr(regex, "search", None))),
                ('attribute pattern', getattr(regex, "pattern", None) is not None),
            )
            for description, is_present in requirements:
                if not is_present:
                    raise TypeError('(!) Unexpected regex pattern: {!r} is missing {}.'.format(regex, description))
            group_names = regex.groupindex.keys()
            if self.break_group_name not in group_names:
                raise TypeError('(!) Pattern {!r} is missing symbolic group named {!r}.'.format(regex, self.break_group_name))
        self.patterns = patterns

    def _break_position(self, token_str):
        """Return the split offset inside `token_str`, or None if no pattern applies.

        Patterns are tried in order; the first one whose break group ends
        strictly inside the token (neither at offset 0 nor at its end) wins.
        """
        for regex in self.patterns:
            found = regex.search(token_str)
            if found:
                cut = found.end( self.break_group_name )
                if 0 < cut < len(token_str):
                    return cut
        return None

    def _change_layer(self, text, layers, status):
        """Replace every splittable token of the output layer by its two halves.

        Each token is split at most once. All replacements are collected
        first and applied only after the layer has been fully traversed.
        """
        token_layer = layers[self.output_layer]
        obsolete_spans = []
        replacement_spans = []
        for token in token_layer:
            cut = self._break_position( text.text[token.start:token.end] )
            if cut is None:
                continue
            middle = token.start + cut
            replacement_spans.append( (token.start, middle) )
            replacement_spans.append( (middle, token.end) )
            obsolete_spans.append( token )
        if replacement_spans:
            assert len(obsolete_spans) > 0
            for old_span in obsolete_spans:
                token_layer.remove_span( old_span )
            for new_span in replacement_spans:
                token_layer.add_annotation( new_span )

In [28]:
# If an annotation spans a line break, its offsets contain ';' at the break
# (e.g. "388 393;394 398"); the pattern keeps only the first and the last offset.
indexes_on_line_split = re.compile(r' (\d+) (\d+;\d+ ){1,}(\d+)$')

def collect_annotations( in_f ):
    """Read entity annotations from an iterable of .ann lines.

    A line is taken as an annotation when it has exactly three tab-separated
    fields: id, "<tag> <start> <end>" and the annotated text. For every ';'
    in the offset field, one following line is treated as a continuation of
    the annotated text and appended to it without a separator.

    Annotations sharing a start offset are merged into one entry that keeps
    the smaller end offset together with the tag, text and id of the
    annotation seen last. End offsets are compared as integers, so "99" is
    smaller than "102". Non-breaking spaces in the text are replaced by
    ordinary spaces and double spaces produced that way are collapsed.

    :param in_f: iterable of lines, e.g. an open .ann file.
    :return: list of (tag, start, end, text, id) tuples sorted by the numeric
        start offset; start and end are returned as strings.
    """
    annotations = []
    split_lines_ahead = 0
    for line in in_f:
        line = line.rstrip('\n')
        if split_lines_ahead > 0:
            # Continuation of the previous annotation's text
            split_lines_ahead -= 1
            tag, start, end, entity_text, ann_id = annotations[-1]
            annotations[-1] = (tag, start, end, entity_text + line, ann_id)
            continue
        items = line.split('\t')
        if len(items) == 3:
            indexes_str = items[1]
            split_lines_ahead += indexes_str.count(';')
            indexes_str = indexes_on_line_split.sub(' \\1 \\3', indexes_str)
            tag, start, end = indexes_str.split()
            annotations.append( (tag, start, end, items[2], items[0]) )

    # Merge annotations that start at the same offset. The dict remembers
    # where the entry for each start offset is stored.
    position_by_start = {}
    merged = []
    for tag, start, end, entity_text, ann_id in annotations:
        if start not in position_by_start:
            position_by_start[start] = len(merged)
            merged.append( (tag, start, end, entity_text, ann_id) )
            continue
        position = position_by_start[start]
        kept_end = merged[position][2]
        # Offsets are strings; compare them numerically, not lexicographically
        if int(kept_end) > int(end):
            merged[position] = (tag, start, end, entity_text, ann_id)
        elif int(kept_end) < int(end):
            merged[position] = (tag, start, kept_end, entity_text, ann_id)

    for position, item in enumerate(merged):
        if "\xa0" in item[3]:
            replaced = item[3].replace(u'\xa0', u' ').replace("  ", " ")
            merged[position] = ( item[0], item[1], item[2], replaced, item[4] )

    return sorted(set(merged), key=lambda x: int(x[1]))

# Directory the notebook is run from; the corpus directories are looked up relative to it.
cwd = os.getcwd()

# Sources of the regular expressions handed to TokenSplitter. In each of them
# the end of the group named 'end' is the position where a token gets split.
# The first three are generic (a capital letter following other word
# characters); the remaining ones target individual names.
# NOTE(review): the dots in the name-specific patterns are unescaped, so they
# match any character, not only a literal '.' - confirm this is intended.
split_pattern_sources = [
    r'(?P<end>[A-ZÕÄÖÜ]{1}\w+)[A-ZÕÄÖÜ]{1}\w+',
    r'(?P<end>\w+)[A-ZÕÄÖÜ]{1}\w+',
    r'(?P<end>\w+)[A-ZÕÄÖÜ]{1}',
    r'(?P<end>I.\n)Prussak',
    r'(?P<end>R.)Mart',
    r'(?P<end>Dr. )Th. Saag',
    r'(?P<end>V. )Johan',
    r'(?P<end>I. )Peeter',
    r'(?P<end>I. )Wildi',
    r'(?P<end>N. )Lear',
    r'(?P<end>Abbi-)Josep',
    r'(?P<end>P.R. )Maddis',
    r'(?P<end>L.)Jaani',
    r'(?P<end>I. )Jaan',
    r'(?P<end>I. )Karel',
]

token_splitter = TokenSplitter(patterns=[re.compile(source) for source in split_pattern_sources])


# Entity types used in the .ann files mapped to the tag stored in the gold layers.
# A type missing from this table still gets a gold_ner span, but no annotation
# and no word-level tags.
NER_TAG_BY_ENTITY_TYPE = {
    "Isik": "PER",
    "KO_koht": "LOC_ORG",
    "KO_org": "LOC_ORG",
    "Koht": "LOC",
    "Org": "ORG",
    "Muu": "MISC",
    "Teadmata": "MISC",
    "ese": "MISC",
}

# Make sure the output directory exists before the first JSON file is written.
output_directory = cwd + "/vallakohtufailid_json/"
os.makedirs(output_directory, exist_ok=True)

rownr = 1  # running number of the problem reports printed below
directories = ["vallakohus_esimene", "vallakohus_teine", "vallakohus_kolmas", "vallakohus_neljas"]
for directory in directories:
    path = cwd + "/" + directory + "/"
    for file in os.listdir(path):
        if not file.endswith(".txt"):
            continue
        # splitext keeps file names that contain additional dots intact
        file_stem = os.path.splitext(file)[0]
        with open(path + file, 'r', encoding="utf-8") as txt, \
             open(path + file_stem + ".ann", 'r', encoding="utf-8") as ann:
            textfile = txt.read().replace(u'\xa0', u' ')
            # Cleaned-up (tag, start, end, text, id) tuples of the .ann file
            fixed_annotations = collect_annotations(ann)

        # Word index -> B-/I- tag of the words covered by a gold entity
        dictionary_for_wordner = dict()
        # Convert the raw text into an EstNLTK Text object; tokens are
        # re-split by token_splitter before the "words" layer is built
        text = Text(textfile)
        text.meta['origin_directory'] = str(directory)
        text = text.tag_layer(['tokens'])
        token_splitter.retag(text)
        text = text.tag_layer(['words'])

        # Gold layers: entity spans and per-word tags
        gold_ner_layer = Layer(name="gold_ner", text_object=text, attributes=['nertag'])
        gold_wordner_layer = Layer(name="gold_wordner", text_object=text, attributes=['nertag'], parent="words")

        annotation_dictionary = {}
        for annotation in fixed_annotations:
            trigger = annotation[4]
            location = annotation[0] + " " + annotation[1] + " " + annotation[2]
            entity = annotation[3]
            annotation_dictionary[trigger] = [location, entity]

        words = text.words
        word_count = len(words)
        # Number of line breaks in front of each word. Annotation offsets are
        # reduced by this amount before being compared with word offsets;
        # presumably the .ann offsets count every line break as two
        # characters - TODO confirm.
        raw_text = text.text
        newlines_before_word = [raw_text[:word.start].count("\n") for word in words]

        for key in annotation_dictionary:
            name = []
            location, entity = annotation_dictionary[key]
            ner, startIndex, endIndex = location.split(" ")
            startIndex = int(startIndex)
            endIndex = int(endIndex)

            # Surround every non-word character with spaces and collapse
            # whitespace runs, so that splitting on ' ' yields token-like pieces
            entity = re.sub(r'\s\s+', r' ', re.sub(r'([^ \wõäöüÕÄÖÜ])', r' \1 ', entity)).rstrip()
            # startswith() also copes with an empty entity string
            if entity.startswith(" "):
                startIndex += 1
                entity = entity.lstrip()

            for i in range(word_count):
                preceding_newlines = newlines_before_word[i]
                if words[i].start != startIndex - preceding_newlines:
                    continue
                # First word of the entity found: shift both offsets into
                # the coordinates of the Text object
                startIndex -= preceding_newlines
                endIndex -= preceding_newlines

                if words[i].end == endIndex:
                    name = [words[i]]
                elif i + 1 < word_count and words[i+1].end == endIndex:
                    name = [words[i], words[i+1]]
                else:
                    entity_tokens = entity.split(" ")
                    # A first piece with two capitalised parts covers one word more
                    if re.match(r'([A-ZÕÄÖÜ]{1}\w+)[A-ZÕÄÖÜ]{1}\w+', entity_tokens[0]):
                        entity_tokens.append("")
                    # Special case: this entity is covered by a single word
                    if entity_tokens == ["J", ".", "Laan"]:
                        covered_words = 1
                    else:
                        covered_words = len(entity_tokens)
                    # Never reach past the last word of the text
                    name = [words[i+j] for j in range(covered_words) if i + j < word_count]

                base_span = EnvelopingBaseSpan([s.base_span for s in name])
                new_span = EnvelopingSpan(base_span, layer=gold_ner_layer)

                nertag = NER_TAG_BY_ENTITY_TYPE.get(ner)
                if nertag is not None:
                    new_span.add_annotation(Annotation(new_span, nertag=nertag))
                    for k in range(len(name)):
                        dictionary_for_wordner[i+k] = ("B-" if k == 0 else "I-") + nertag
                gold_ner_layer.add_span(new_span)
                # Only the first word matching the start offset is used
                break
        text.add_layer(gold_ner_layer)

        # Report annotations whose text did not end up in the gold_ner layer
        set1 = [annotation[3] for annotation in fixed_annotations]
        set2 = [ner_span.enclosing_text for ner_span in text.gold_ner]
        not_in_layer = Diff(set1, set2)
        if len(not_in_layer) > 0 and len(set1) != len(set2):
            print(rownr, "Probleem failis", file, "layerisse ei jõudnud:", not_in_layer)
            rownr += 1

        # Every word gets a tag: the stored B-/I- tag or "O". This also holds
        # for texts in which no entity was matched at all.
        for i in range(word_count):
            new_span = Span(base_span=words[i].base_span, layer=gold_wordner_layer)
            new_span.add_annotation(Annotation(new_span, nertag=str(dictionary_for_wordner.get(i, "O"))))
            gold_wordner_layer.add_span(new_span)

        text.add_layer(gold_wordner_layer)
        text_to_json(text, file=output_directory + file_stem + ".json")
    print(f"Kaust {directory} on läbitud.")
print("Programm on lõpetanud oma töö.")

1 Probleem failis Tartu_V6nnu_Ahja_id20417_1888a.txt layerisse ei jõudnud: ['Th. Saag Wulfiusse']
2 Probleem failis J2rva_Tyri_V22tsa_id22259_1913a.txt layerisse ei jõudnud: ['Prussak']
Kaust vallakohus_esimene on läbitud.
3 Probleem failis P2rnu_Halliste_Pornuse_id4791_1869a.txt layerisse ei jõudnud: ['Mart Jersolaw']


KeyboardInterrupt: 