In [5]:
import json
import time
import os
import sklearn_crfsuite
import re

from estnltk import Text
from estnltk.taggers import NerTagger
from estnltk.taggers import WordLevelNerTagger
from estnltk.converters import text_to_json
from estnltk.converters import json_to_text
from estnltk.layer_operations import flatten
from sklearn.metrics import classification_report
from estnltk.taggers import Retagger
from estnltk.taggers import CompoundTokenTagger


nertagger = NerTagger()
word_level_ner = WordLevelNerTagger()

### Adding TokenSplitter to make an equal amount of tokens.

In [6]:
class TokenSplitter( Retagger ):
    """Splits tokens into smaller tokens based on regular expression patterns.""" 
    conf_param = ['patterns', 'break_group_name']
    
    def __init__(self, patterns, break_group_name:str='end'):
        # Set input/output layers
        self.input_layers = ['tokens']
        self.output_layer = 'tokens'
        self.output_attributes = ()
        # Set other configuration parameters
        if not (isinstance(break_group_name, str) and len(break_group_name) > 0):
            raise TypeError('(!) break_group_name should be a non-empty string.')
        self.break_group_name = break_group_name
        # Assert that all patterns are regular expressions in the valid format
        if not isinstance(patterns, list):
            raise TypeError('(!) patterns should be a list of compiled regular expressions.')
        # TODO: we use an adhoc way to verify that patterns are regular expressions 
        #       because there seems to be no common way of doing it both in py35 
        #       and py36
        for pat in patterns:
            # Check for the existence of methods/attributes
            has_match   = callable(getattr(pat, "match", None))
            has_search  = callable(getattr(pat, "search", None))
            has_pattern = getattr(pat, "pattern", None) is not None
            for (k,v) in (('method match()',has_match),\
                          ('method search()',has_search),\
                          ('attribute pattern',has_pattern)):
                if v is False:
                    raise TypeError('(!) Unexpected regex pattern: {!r} is missing {}.'.format(pat, k))
            symbolic_groups = pat.groupindex
            if self.break_group_name not in symbolic_groups.keys():
                raise TypeError('(!) Pattern {!r} is missing symbolic group named {!r}.'.format(pat, self.break_group_name))
        self.patterns = patterns

    def _change_layer(self, text, layers, status):
        # Get changeble layer
        changeble_layer = layers[self.output_layer]
        # Iterate over tokens
        add_spans    = []
        remove_spans = []
        for span in changeble_layer:
            token_str = text.text[span.start:span.end]
            for pat in self.patterns:
                m = pat.search(token_str)
                if m:
                    break_group_end = m.end( self.break_group_name )
                    if break_group_end > -1 and \
                       break_group_end > 0  and \
                       break_group_end < len(token_str):
                        # Make the split
                        add_spans.append( (span.start, span.start+break_group_end) )
                        add_spans.append( (span.start+break_group_end, span.end) )
                        remove_spans.append( span )
                        # Once a token has been split, then break and move on to 
                        # the next token ...
                        break
        if add_spans:
            assert len(remove_spans) > 0
            for old_span in remove_spans:
                changeble_layer.remove_span( old_span )
            for new_span in add_spans:
                changeble_layer.add_annotation( new_span )

In [7]:
token_splitter = TokenSplitter(patterns=[re.compile(r'(?P<end>[A-ZÕÄÖÜ]{1}\w+)[A-ZÕÄÖÜ]{1}\w+'),\
                                         re.compile(r'(?P<end>Piebenomme)metsawaht'),\
                                         re.compile(r'(?P<end>maa)peal'),\
                                         re.compile(r'(?P<end>reppi)käest'),\
                                         re.compile(r'(?P<end>Kiidjerwelt)J'),\
                                         re.compile(r'(?P<end>Ameljanow)Persitski'),\
                                         re.compile(r'(?P<end>mõistmas)Mihkel'),\
                                         re.compile(r'(?P<end>tema)Käkk'),\
                                         re.compile(r'(?P<end>Ahjawalla)liikmed'),\
                                         re.compile(r'(?P<end>kohtumees)A'),\
                                         re.compile(r'(?P<end>Pechmann)x'),\
                                         re.compile(r'(?P<end>pölli)Anni'),\
                                         re.compile(r'(?P<end>külla)Rauba'),\
                                         re.compile(r'(?P<end>kohtowannem)Jaak'),\
                                         re.compile(r'(?P<end>rannast)Leno'),\
                                         re.compile(r'(?P<end>wallast)Kiiwita'),\
                                         re.compile(r'(?P<end>wallas)Kristjan'),\
                                         re.compile(r'(?P<end>Pedoson)rahul'),\
                                         re.compile(r'(?P<end>pere)Jaan'),\
                                         re.compile(r'(?P<end>kohtu)poolest'),\
                                         re.compile(r'(?P<end>Kurrista)kaudo'),\
                                         re.compile(r'(?P<end>mölder)Gottlieb'),\
                                         re.compile(r'(?P<end>wöörmündri)Jaan'),\
                                         re.compile(r'(?P<end>Oinas)ja'),\
                                         re.compile(r'(?P<end>ette)Leenu'),\
                                         re.compile(r'(?P<end>Tommingas)peab'),\
                                         re.compile(r'(?P<end>wäljaja)Kotlep'),\
                                         re.compile(r'(?P<end>pea)A'),\
                                         re.compile(r'(?P<end>talumees)Nikolai')])

# Reading in files from the distributed corpus:

In [2]:
files = {}

with open('divided_corpus.txt', 'r', encoding = 'UTF-8') as f:
    txt = f.readlines()

for fileName in txt:
    file, subdistribution = fileName.split(":")
    files[file] = subdistribution.rstrip("\n")

### Making `ner` and `wordner` layers from goldstandard files

In [27]:
%%time
def find(name, path):
    for root, dirs, files in os.walk(path):
        if name in files:
            return os.path.join(root, name)
        
counter = 0
removed_layers = ['sentences', 'morph_analysis', 'wordner', 'compound_tokens', 'ner', 'words', 'tokens']
for file in {key: value for key, value in files.items() if value == '1'}:
    with open(find(file.replace(".json", ".txt"), "./vallakohtufailid/"), 'r', encoding='UTF-8') as f:
        text = Text(f.read())
        text = text.tag_layer(['tokens'])
        token_splitter.retag(text)
        CompoundTokenTagger(tag_initials = False, tag_abbreviations = False, tag_hyphenations = False).tag(text)
        text.tag_layer('morph_analysis')
        nertagger.tag(text)
        word_level_ner.tag(text)

        text.add_layer(flatten(text['ner'], 'flat_ner'))
        text.add_layer(flatten(text['wordner'], 'flat_wordner'))

        for x in removed_layers:
            text.pop_layer(x)
        text_to_json(text, file=os.getcwd() + "/vallakohtufailid_nertagger/" + file)
        counter += 1
    print(f"File {counter}")

File 1
File 2
File 3
File 4
File 5
File 6
File 7
File 8
File 9
File 10
File 11
File 12
File 13
File 14
File 15
File 16
File 17
File 18
File 19
File 20
File 21
File 22
File 23
File 24
File 25
File 26
File 27
File 28
File 29
File 30
File 31
File 32
File 33
File 34
File 35
File 36
File 37
File 38
File 39
File 40
File 41
File 42
File 43
File 44
File 45
File 46
File 47
File 48
File 49
File 50
File 51
File 52
File 53
File 54
File 55
File 56
File 57
File 58
File 59
File 60
File 61
File 62
File 63
File 64
File 65
File 66
File 67
File 68
File 69
File 70
File 71
File 72
File 73
File 74
File 75
File 76
File 77
File 78
File 79
File 80
File 81
File 82
File 83
File 84
File 85
File 86
File 87
File 88
File 89
File 90
File 91
File 92
File 93


AttributeError: ("'NerMorphFeatureTagger' object has no attribute 'retag'", "in the 'WordLevelNerTagger'")

In [16]:
%%time
counter = 0
removed_layers = ['sentences', 'morph_analysis', 'wordner', 'compound_tokens', 'ner', 'words', 'tokens']
for file in files:
    if int(files[file]) == 1:
        with open("vallakohtufailid_json_flat/" + file, 'r', encoding='UTF-8') as f:
            text_import = json_to_text(f.read())
            text = Text(text_import.text)
            text = text.tag_layer(['tokens'])
            token_splitter.retag(text)
            CompoundTokenTagger(tag_initials = False, tag_abbreviations = False, tag_hyphenations = False).tag(text)
            text.tag_layer('morph_analysis')
            nertagger.tag(text)
            word_level_ner.tag(text)
            
            text.add_layer(flatten(text['ner'], 'flat_ner'))
            text.add_layer(flatten(text['wordner'], 'flat_wordner'))
            
            for x in removed_layers:
                text.pop_layer(x)
            text_to_json(text, file=os.getcwd() + "/vallakohtufailid_nertagger/" + file)
            counter += 1
    print(f"File {counter}")

File 1
File 2
File 3
File 4
File 5
File 6
File 7
File 8
File 9
File 10
File 11
File 12
File 13
File 14
File 15
File 16
File 17
File 18
File 19
File 20
File 21
File 22
File 23
File 24
File 25
File 26
File 27
File 28
File 29
File 30
File 31
File 32
File 33
File 34
File 35
File 36
File 37
File 38
File 39
File 40
File 41
File 42
File 43
File 44
File 45
File 46
File 47
File 48
File 49
File 50
File 51
File 52
File 53
File 54
File 55
File 56
File 57
File 58
File 59
File 60
File 61
File 62
File 63
File 64
File 65
File 66
File 67
File 68
File 69
File 70
File 71
File 72
File 73
File 74
File 75
File 76
File 77
File 78
File 79
File 80
File 81
File 82
File 83
File 84
File 85
File 86
File 87
File 88
File 89
File 90


KeyboardInterrupt: 

### Calculating the f1-scores
Layer `ner` against `gold_ner` and `wordner` against `gold_wordner`.

In [33]:
y_train = []
y_test = []
for file in os.listdir("./vallakohtufailid_nertagger/"):
    if file.endswith(".json"):
        with open("./vallakohtufailid_nertagger/" + str(file), 'r', encoding='UTF-8') as test, \
            open("./vallakohtufailid_json_flat/" + str(file), 'r', encoding='UTF-8') as train:
                test_import = json_to_text(test.read())
                train_import = json_to_text(train.read())
                
                for i in range(len(train_import['flat_gold_wordner'])):
                    y_train.append(train_import['flat_gold_wordner'][i].nertag[0])
                for i in range(len(test_import['flat_wordner'])):
                    y_test.append(test_import['flat_wordner'][i].nertag[0])

labels = set(y_train + y_test)
labels = sorted(labels,key=lambda name: (name[1:], name[0]))
print(classification_report(y_train, y_test, labels=labels))


ValueError: Found input variables with inconsistent numbers of samples: [23403, 23554]

In [34]:
print(len(train_import.flat_gold_wordner))
print(len(test_import['flat_wordner']))
print(list(set(train_import.flat_gold_wordner.text) - set(test_import.flat_wordner.text)))
print(test_import)
print(train_import)

179
179
[]
Text(text='Prottokoll sel 15mal Juunis 1881\nKoos olid kohtu Peawanem: Jaan Saarmann\nJaan Kreep\nHans Simsohn\nKirjutaja: H. Saarmu\n\n 32.\n\nKohtu laua ette astus Käru walla mees  Kaarel Liwersohn ja kaebas: et Hans Saarmann ja Hans Wanaisak on teda hirmsaste Kirna Reopalu kõrtsus peksnud ja et Toomas Wirkaus, Hans Poak ja Mihkel  Nass on seda peksmist pealt näinud.\n Toomas Wirkaus, Hans Poak ja Mihkel Nass astusid ette ja ütlesid: et  Kaarel Liwersohni kaebdus tõsi on.\nHans Saarmann ja Hans Wanaisak astusid ette ja ei wõinud seda mitte ära salata, seepärast, et need ülemal nimetud tunistuse mehed on seda pealt waatanud.\nNemad lepisid selle moodiga ära, et Hans Saarmann ja Hans Wanaisaak maksid 11 Rubla 50 kop. kus sest rahast 10 Rubla Kaarel Liwersohnile sai tema peksu eest, ja 1 Rubla 50 kop käemestele.\nJa on ka täidetud. \n\nKirna walla kogukonna kohtu nimel,\nPeawanem: Jaan Saarmann XXX\nJaan Kreeps XXX\nHans Simsohn XXX\nTunnistuseks kirjutaja: H. Saarmu /allkiri