In [7]:
import json
import time
import os
import sklearn_crfsuite
import re
import nereval

from estnltk import Text
from estnltk.taggers import NerTagger
from estnltk.taggers import WordLevelNerTagger
from estnltk.converters import text_to_json
from estnltk.converters import json_to_text
from estnltk.layer_operations import flatten
from sklearn.metrics import classification_report
from estnltk.taggers import Retagger
from estnltk.taggers import CompoundTokenTagger
from sklearn_crfsuite import metrics

from nervaluate import Evaluator

# Shared tagger instances, created once and reused for every processed file.
# `nertagger.tag(text)` is called in the tagging loop and its result is read from the `ner` layer.
nertagger = NerTagger()
# `word_level_ner.tag(text)` is called right after it and its result is read from the `wordner` layer.
# NOTE(review): both taggers are built with default arguments -- presumably the default EstNLTK models; confirm.
word_level_ner = WordLevelNerTagger()

### Adding a `TokenSplitter` to obtain an equal number of tokens.

In [5]:
class TokenSplitter( Retagger ):
    """Retagger that cuts a token in two where a regular expression says so.

    Each pattern must contain a named group (``break_group_name``, 'end' by
    default). When a pattern is found inside a token, the end position of that
    group is the place where the token is divided.
    """
    conf_param = ['patterns', 'break_group_name']

    def __init__(self, patterns, break_group_name:str='end'):
        """Validate and store the splitting patterns.

        Parameters
        ----------
        patterns : list
            Compiled regular expressions; each one must define the named
            group ``break_group_name``.
        break_group_name : str
            Name of the group whose end marks the split position.

        Raises
        ------
        TypeError
            If ``break_group_name`` is not a non-empty string, ``patterns`` is
            not a list, or a pattern does not look like a compiled regex with
            the required named group.
        """
        # The 'tokens' layer is both read and rewritten; no attributes are produced
        self.input_layers = ['tokens']
        self.output_layer = 'tokens'
        self.output_attributes = ()
        # Name of the group that marks the split position
        if not isinstance(break_group_name, str) or len(break_group_name) == 0:
            raise TypeError('(!) break_group_name should be a non-empty string.')
        self.break_group_name = break_group_name
        if not isinstance(patterns, list):
            raise TypeError('(!) patterns should be a list of compiled regular expressions.')
        # Compiled patterns are recognised by duck typing (match/search/pattern),
        # as in the original implementation, instead of by an isinstance check.
        for regex in patterns:
            required_members = (
                ('method match()', callable(getattr(regex, "match", None))),
                ('method search()', callable(getattr(regex, "search", None))),
                ('attribute pattern', getattr(regex, "pattern", None) is not None),
            )
            for description, is_present in required_members:
                if not is_present:
                    raise TypeError('(!) Unexpected regex pattern: {!r} is missing {}.'.format(regex, description))
            if self.break_group_name not in regex.groupindex:
                raise TypeError('(!) Pattern {!r} is missing symbolic group named {!r}.'.format(regex, self.break_group_name))
        self.patterns = patterns

    def _change_layer(self, text, layers, status):
        """Replace every token that matches a pattern with its two halves.

        Patterns are tried in order and a token is split at most once: the
        first pattern that yields a cut position strictly inside the token wins.
        """
        token_layer = layers[self.output_layer]
        obsolete_tokens = []   # spans to be removed from the layer
        new_boundaries  = []   # (start, end) pairs of the spans replacing them
        for token in token_layer:
            surface = text.text[token.start:token.end]
            for regex in self.patterns:
                hit = regex.search(surface)
                if not hit:
                    continue
                cut = hit.end( self.break_group_name )
                # A cut at the very beginning or end (or -1 for a group that did
                # not take part in the match) would leave an empty half
                if 0 < cut < len(surface):
                    new_boundaries.extend( [(token.start, token.start+cut),
                                            (token.start+cut, token.end)] )
                    obsolete_tokens.append( token )
                    # One split per token: continue with the next token
                    break
        if new_boundaries:
            assert len(obsolete_tokens) > 0
            # All removals are done before the additions
            for token in obsolete_tokens:
                token_layer.remove_span( token )
            for boundaries in new_boundaries:
                token_layer.add_annotation( boundaries )

In [6]:
# Splitter for tokens that were written together in the source texts.
# In every expression the named group `end` covers the part that becomes the first token.
token_splitter = TokenSplitter(patterns=[
    re.compile(split_regex) for split_regex in (
        # Generic case: two capitalised words glued together
        r'(?P<end>[A-ZÕÄÖÜ]{1}\w+)[A-ZÕÄÖÜ]{1}\w+',
        # Specific glued-together word pairs
        r'(?P<end>Piebenomme)metsawaht',
        r'(?P<end>maa)peal',
        r'(?P<end>reppi)käest',
        r'(?P<end>Kiidjerwelt)J',
        r'(?P<end>Ameljanow)Persitski',
        r'(?P<end>mõistmas)Mihkel',
        r'(?P<end>tema)Käkk',
        r'(?P<end>Ahjawalla)liikmed',
        r'(?P<end>kohtumees)A',
        r'(?P<end>Pechmann)x',
        r'(?P<end>pölli)Anni',
        r'(?P<end>külla)Rauba',
        r'(?P<end>kohtowannem)Jaak',
        r'(?P<end>rannast)Leno',
        r'(?P<end>wallast)Kiiwita',
        r'(?P<end>wallas)Kristjan',
        r'(?P<end>Pedoson)rahul',
        r'(?P<end>pere)Jaan',
        r'(?P<end>kohtu)poolest',
        r'(?P<end>Kurrista)kaudo',
        r'(?P<end>mölder)Gottlieb',
        r'(?P<end>wöörmündri)Jaan',
        r'(?P<end>Oinas)ja',
        r'(?P<end>ette)Leenu',
        r'(?P<end>Tommingas)peab',
        r'(?P<end>wäljaja)Kotlep',
        r'(?P<end>pea)A',
        r'(?P<end>talumees)Nikolai',
    )
])

### Reading in files from the divided corpus:

In [3]:
# Maps every corpus file name to the label of the subdistribution it belongs to.
# Each non-empty line of 'divided_corpus.txt' has the form "<file name>:<subdistribution>".
files = {}

with open('divided_corpus.txt', 'r', encoding='UTF-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            # A blank (e.g. trailing) line carries no entry and must not break the unpacking below
            continue
        # The label is the last colon-separated field; surrounding whitespace and '\r' are
        # stripped so that the label can later be compared against plain values such as '1'
        file_name, subdistribution = line.rsplit(":", 1)
        files[file_name.strip()] = subdistribution.strip()

### Making `ner` and `wordner` layers from gold-standard files

In [2]:
def find(name, path):
    """Return the path of the first file called `name` found under `path`.

    The directory tree is traversed with os.walk; if no directory contains a
    file with that exact name, None is returned.
    """
    matches = (
        os.path.join(folder, name)
        for folder, _subfolders, filenames in os.walk(path)
        if name in filenames
    )
    return next(matches, None)

In [None]:
%%time
# Intermediate layers that are dropped before saving; the flattened layers
# `flat_ner` and `flat_wordner` added below are not in this list and are kept.
removed_layers = ['sentences', 'morph_analysis', 'wordner', 'compound_tokens', 'ner', 'words', 'tokens']

source_dir = "./vallakohtufailid/"
output_dir = os.path.join(os.getcwd(), "vallakohtufailid_nertagger")
os.makedirs(output_dir, exist_ok=True)

# The tagger configuration does not depend on the file, so it is built once
compound_token_tagger = CompoundTokenTagger(tag_initials = False, tag_abbreviations = False, tag_hyphenations = False)

for json_name, subdistribution in files.items():
    # Only the files of subdistributions 1-3 are tagged here
    if subdistribution not in ('1', '2', '3'):
        continue

    txt_name = json_name.replace(".json", ".txt")
    txt_path = find(txt_name, source_dir)
    if txt_path is None:
        raise FileNotFoundError('(!) {!r} was not found under {!r}.'.format(txt_name, source_dir))

    with open(txt_path, 'r', encoding='UTF-8') as f:
        raw_text = f.read()

    # Special case for one file: separate double full stops.
    # The original check compared the open file object with the file name, so it never
    # fired; the replacement is applied to the raw string before the Text object is built.
    if txt_name == "Tartu_V6nnu_Ahja_id3502_1882a.txt":
        raw_text = raw_text.replace('..', '. .')

    text = Text(raw_text)
    text = text.tag_layer(['tokens'])
    # Split glued-together tokens before the higher layers are built on top of `tokens`
    token_splitter.retag(text)
    compound_token_tagger.tag(text)
    text.tag_layer('morph_analysis')

    nertagger.tag(text)
    word_level_ner.tag(text)

    text.add_layer(flatten(text['ner'], 'flat_ner'))
    text.add_layer(flatten(text['wordner'], 'flat_wordner'))

    for layer_name in removed_layers:
        text.pop_layer(layer_name)
    text_to_json(text, file=os.path.join(output_dir, json_name))


### Calculating the f1-scores
The `ner` layer is evaluated against `gold_ner`, and the `wordner` layer against `gold_wordner`.

In [8]:
# Files that are left out of the evaluation below.
# NOTE(review): the reason why these files do not work is not recorded in the notebook.
files_not_working = [
    'J2rva_Tyri_V22tsa_id22177_1911a.json',
    'J2rva_Tyri_V22tsa_id18538_1894a.json',
    'Tartu_V6nnu_Ahja_id3502_1882a.json',
    'J2rva_Tyri_V22tsa_id22155_1911a.json',
    'Saare_Kihelkonna_Kotlandi_id18845_1865a.json',
    'P2rnu_Halliste_Abja_id257_1844a.json',
    'Saare_Kaarma_Loona_id7575_1899a.json',
    'Tartu_R6ngu_Aakre_id14648_1829a.json',
]

In [13]:
# Tagger output and gold standard are stored under the same file names in two directories
test_dir = "./vallakohtufailid_nertagger/"
gold_dir = "./vallakohtufailid_json_flat/"

gold = []      # word-level gold tags of all evaluated files, concatenated
test = []      # word-level predicted tags, aligned index by index with `gold`
gold_ner = []  # per evaluated file: list of {"label", "start", "end"} dicts (gold entities)
test_ner = []  # per evaluated file: predicted entities, in the same file order as `gold_ner`

for file in sorted(os.listdir(test_dir)):
    # Skip directory entries that are not JSON documents as well as the excluded files;
    # nothing is appended for them, so both entity lists hold exactly one item per evaluated file
    if not file.endswith(".json") or file in files_not_working:
        continue

    with open(test_dir + str(file), 'r', encoding='UTF-8') as f_test, \
         open(gold_dir + str(file), 'r', encoding='UTF-8') as f_gold:
        test_import = json_to_text(f_test.read())
        gold_import = json_to_text(f_gold.read())

    file_gold = [span.nertag[0] for span in gold_import['flat_gold_wordner']]
    file_test = [span.nertag[0] for span in test_import['flat_wordner']]
    # The word-level lists are compared position by position, so a file with a different
    # number of words would silently shift the alignment of every following file
    if len(file_gold) != len(file_test):
        raise ValueError('(!) {!r}: {} gold word tags but {} predicted word tags.'.format(
            file, len(file_gold), len(file_test)))
    gold.extend(file_gold)
    test.extend(file_test)

    gold_ner.append([{"label": ner.nertag[0], "start": int(ner.start), "end": int(ner.end)}
                     for ner in gold_import['flat_gold_ner']])
    test_ner.append([{"label": ner.nertag[0], "start": int(ner.start), "end": int(ner.end)}
                     for ner in test_import['flat_ner']])

# Labels come from the gold data; the sort key orders them by entity type first and by
# prefix second, so 'O' comes first and every 'B-X' directly precedes its 'I-X'
labels = set(gold)
sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))

In [10]:
# Word-level report over the concatenated `gold` / `test` tag lists. Only labels that occur
# in the gold data (`sorted_labels`) are listed; zero_division=0 sets the score to 0 where a
# metric is undefined (see the scikit-learn documentation for the exact semantics).
print(classification_report(gold, test, labels=sorted_labels, zero_division=0))

              precision    recall  f1-score   support

           O       0.96      0.97      0.97    133687
       B-LOC       0.14      0.23      0.18       509
       I-LOC       0.05      0.02      0.03       212
   B-LOC_ORG       0.00      0.00      0.00      1190
   I-LOC_ORG       0.00      0.00      0.00       699
      B-MISC       0.00      0.00      0.00       133
      I-MISC       0.00      0.00      0.00       392
       B-ORG       0.03      0.30      0.06       195
       I-ORG       0.05      0.21      0.08       462
       B-PER       0.79      0.74      0.76     11088
       I-PER       0.85      0.68      0.75     10386

    accuracy                           0.92    158953
   macro avg       0.26      0.29      0.26    158953
weighted avg       0.92      0.92      0.92    158953



In [16]:
# Entity-level evaluation with nervaluate. `gold_ner` / `test_ner` hold one list of
# {"label", "start", "end"} dicts per document; `tags` names the entity types to score.
evaluator = Evaluator(gold_ner, test_ner, tags=['ORG', 'PER', 'MISC', 'LOC', 'LOC_ORG'])

In [17]:
# `results` holds the overall scores shown below; `results_per_tag` is presumably the same
# breakdown per entity type.
# NOTE(review): the number of values returned by evaluate() may differ between nervaluate
# releases -- confirm against the installed version.
results, results_per_tag = evaluator.evaluate()

In [18]:
# Display the overall scores for the four evaluation schemes (ent_type, partial, strict, exact)
results

{'ent_type': {'correct': 8814,
  'incorrect': 2056,
  'partial': 0,
  'missed': 2245,
  'spurious': 2044,
  'possible': 13115,
  'actual': 12914,
  'precision': 0.6825150998915905,
  'recall': 0.6720548989706443,
  'f1': 0.6772446117791694},
 'partial': {'correct': 8114,
  'incorrect': 0,
  'partial': 2756,
  'missed': 2245,
  'spurious': 2044,
  'possible': 13115,
  'actual': 12914,
  'precision': 0.7350162614217128,
  'recall': 0.7237514296606938,
  'f1': 0.7293403511467976},
 'strict': {'correct': 7420,
  'incorrect': 3450,
  'partial': 0,
  'missed': 2245,
  'spurious': 2044,
  'possible': 13115,
  'actual': 12914,
  'precision': 0.5745702338547313,
  'recall': 0.5657643919176516,
  'f1': 0.570133312843367},
 'exact': {'correct': 8114,
  'incorrect': 2756,
  'partial': 0,
  'missed': 2245,
  'spurious': 2044,
  'possible': 13115,
  'actual': 12914,
  'precision': 0.6283103608486913,
  'recall': 0.61868089973313,
  'f1': 0.6234584501901724}}