In [45]:
import json
import time
import os
import sklearn_crfsuite
import re

from estnltk import Text
from estnltk.taggers import NerTagger
from estnltk.taggers import WordLevelNerTagger
from estnltk.converters import text_to_json
from estnltk.converters import json_to_text
from estnltk.layer_operations import flatten
from sklearn.metrics import classification_report
from estnltk.taggers import Retagger
from estnltk.taggers import CompoundTokenTagger


nertagger = NerTagger()
word_level_ner = WordLevelNerTagger()

In [41]:
class TokenSplitter( Retagger ):
    """Splits tokens into smaller tokens based on regular expression patterns.""" 
    conf_param = ['patterns', 'break_group_name']
    
    def __init__(self, patterns, break_group_name:str='end'):
        # Set input/output layers
        self.input_layers = ['tokens']
        self.output_layer = 'tokens'
        self.output_attributes = ()
        # Set other configuration parameters
        if not (isinstance(break_group_name, str) and len(break_group_name) > 0):
            raise TypeError('(!) break_group_name should be a non-empty string.')
        self.break_group_name = break_group_name
        # Assert that all patterns are regular expressions in the valid format
        if not isinstance(patterns, list):
            raise TypeError('(!) patterns should be a list of compiled regular expressions.')
        # TODO: we use an adhoc way to verify that patterns are regular expressions 
        #       because there seems to be no common way of doing it both in py35 
        #       and py36
        for pat in patterns:
            # Check for the existence of methods/attributes
            has_match   = callable(getattr(pat, "match", None))
            has_search  = callable(getattr(pat, "search", None))
            has_pattern = getattr(pat, "pattern", None) is not None
            for (k,v) in (('method match()',has_match),\
                          ('method search()',has_search),\
                          ('attribute pattern',has_pattern)):
                if v is False:
                    raise TypeError('(!) Unexpected regex pattern: {!r} is missing {}.'.format(pat, k))
            symbolic_groups = pat.groupindex
            if self.break_group_name not in symbolic_groups.keys():
                raise TypeError('(!) Pattern {!r} is missing symbolic group named {!r}.'.format(pat, self.break_group_name))
        self.patterns = patterns

    def _change_layer(self, text, layers, status):
        # Get changeble layer
        changeble_layer = layers[self.output_layer]
        # Iterate over tokens
        add_spans    = []
        remove_spans = []
        for span in changeble_layer:
            token_str = text.text[span.start:span.end]
            for pat in self.patterns:
                m = pat.search(token_str)
                if m:
                    break_group_end = m.end( self.break_group_name )
                    if break_group_end > -1 and \
                       break_group_end > 0  and \
                       break_group_end < len(token_str):
                        # Make the split
                        add_spans.append( (span.start, span.start+break_group_end) )
                        add_spans.append( (span.start+break_group_end, span.end) )
                        remove_spans.append( span )
                        # Once a token has been split, then break and move on to 
                        # the next token ...
                        break
        if add_spans:
            assert len(remove_spans) > 0
            for old_span in remove_spans:
                changeble_layer.remove_span( old_span )
            for new_span in add_spans:
                changeble_layer.add_annotation( new_span )

In [44]:
token_splitter = TokenSplitter(patterns=[re.compile(r'(?P<end>[A-ZÕÄÖÜ]{1}\w+)[A-ZÕÄÖÜ]{1}\w+'),\
                                         re.compile(r'(?P<end>Piebenomme)metsawaht'),\
                                         re.compile(r'(?P<end>maa)peal'),\
                                         re.compile(r'(?P<end>reppi)käest'),\
                                         re.compile(r'(?P<end>Kiidjerwelt)J'),\
                                         re.compile(r'(?P<end>Ameljanow)Persitski'),\
                                         re.compile(r'(?P<end>mõistmas)Mihkel'),\
                                         re.compile(r'(?P<end>tema)Käkk'),\
                                         re.compile(r'(?P<end>Ahjawalla)liikmed'),\
                                         re.compile(r'(?P<end>kohtumees)A'),\
                                         re.compile(r'(?P<end>Pechmann)x'),\
                                         re.compile(r'(?P<end>pölli)Anni'),\
                                         re.compile(r'(?P<end>külla)Rauba'),\
                                         re.compile(r'(?P<end>kohtowannem)Jaak'),\
                                         re.compile(r'(?P<end>rannast)Leno'),\
                                         re.compile(r'(?P<end>wallast)Kiiwita'),\
                                         re.compile(r'(?P<end>wallas)Kristjan'),\
                                         re.compile(r'(?P<end>Pedoson)rahul'),\
                                         re.compile(r'(?P<end>pere)Jaan'),\
                                         re.compile(r'(?P<end>kohtu)poolest'),\
                                         re.compile(r'(?P<end>Kurrista)kaudo'),\
                                         re.compile(r'(?P<end>mölder)Gottlieb'),\
                                         re.compile(r'(?P<end>wöörmündri)Jaan'),\
                                         re.compile(r'(?P<end>Oinas)ja'),\
                                         re.compile(r'(?P<end>ette)Leenu'),\
                                         re.compile(r'(?P<end>Tommingas)peab'),\
                                         re.compile(r'(?P<end>wäljaja)Kotlep'),\
                                         re.compile(r'(?P<end>pea)A'),\
                                         re.compile(r'(?P<end>talumees)Nikolai')])

# Reading in files from the distributed corpus:

In [47]:
files = {}

with open('divided_corpus.txt', 'r', encoding = 'UTF-8') as f:
    txt = f.readlines()

for fileName in txt:
    file, subdistribution = fileName.split(":")
    files[file] = subdistribution.rstrip("\n")

# Making `ner` and `wordner` layers from goldstandard files

In [50]:
%%time
for file in files:
    if int(files[file]) in (1, 2, 3):
        with open("vallakohtufailid_json_flat/" + file, 'r', encoding='UTF-8') as f:
            text_import = json_to_text(f.read())
            text = Text(text_import.text)
            text = text.tag_layer()
            text.pop_layer('compound_tokens')
            token_splitter.retag(text)
            CompoundTokenTagger(tag_initials = False, tag_abbreviations = False, tag_hyphenations = False).tag(text)
            nertagger.tag(text)
            word_level_ner.tag(text)
            
            text.add_layer(flatten(text['ner'], 'flat_ner'))
            text.add_layer(flatten(text['wordner'], 'flat_wordner'))
            
            removed_layers = ['sentences', 'morph_analysis', 'wordner', 'compound_tokens', 'ner', 'words', 'tokens']
            for x in removed_layers:
                text.pop_layer(x)
            
            text_to_json(text, file=os.getcwd() + "/vallakohtufailid_nertagger/" + file)

CPU times: user 20min 29s, sys: 6.52 s, total: 20min 35s
Wall time: 33min 22s


# Calculating the f1-scores
Layer `ner` against `gold_ner` and `wordner` against `gold_wordner`.

In [55]:
for file in os.listdir("./vallakohtufailid_nertagger/"):
    X_train = []
    y_train = []
    X_test = []
    y_test = []
    with open("./vallakohtufailid_nertagger/" + str(file), 'r', encoding='UTF-8') as test, \
    open("./vallakohtufailid_json_flat/" + str(file), 'r', encoding='UTF-8') as train:
        test_import = json_to_text(test.read())
        train_import = json_to_text(train.read())
        
        for i in range(len(train_import['flat_gold_wordner'])):
            y_train.append(train_import['flat_gold_wordner'][i].nertag[0])
        for i in range(len(test_import['flat_wordner'])):
            y_test.append(test_import['flat_wordner'][i].nertag[0])
        
    labels = set(y_train + y_test)
    labels = sorted(labels,key=lambda name: (name[1:], name[0]))
    print(classification_report(y_train, y_test, labels=labels))


ValueError: Found input variables with inconsistent numbers of samples: [155, 154]

In [54]:
print(len(train_import.flat_gold_wordner))
print(len(test_import['flat_wordner']))
print(list(set(train_import.flat_gold_wordner.text) - set(test_import.flat_wordner.text)))
print(test_import)
print(train_import)

155
154
['Kautjallast']
Text(text='\nWata Protokoll № 46. astus Jürri Welmann ette ja ütles et mitte tõssi ei olle et temma Jaan Kütti on lubband püssiga mahha lasta, waid et karjane Jaan Kütt on temma härjad metsa jätnud, ja härjad on Nabbalasse kinni aetud ja Jürri Welmann piddand 90 kopp. trahwi maksma slle ülle on Jürri Welmann karjatse ülle pahhandanud ja karjatse kohta öölnud sa olled nago hunt ja ei karda keddagi muud kui agga püssi hirmo.\n\nTunnistuseks karjane Jaan KüttKautjallast pilleti peal soldat Juhhan Tapp ette kes ütles kuulnud ollwad, et Jürri Welmann on Jaan Küttil öölnud, sind olleks ammu tahtnud mahha lasta.\n\nKautjalla Tallitaja Kristian Tapp astus ette ja ütles kui temma Jürri Welmannil on käsko viind et ta härjad Nabbals kinni on, siis on Jürri Welmann melepahhaga karjatse kohta öölnud, nisuggune tahhaks püssiga lasta.\n\nKohhus pakkus neile leppitust ja nemmad leppisid kohtu ees ärra.\n\n\n')
Text(text='\nWata Protokoll № 46. astus Jürri Welmann ette ja ütles 