# Evaluate

In [1]:
import logging
import regex
import lemma
# Configure root logging once for the notebook: records at DEBUG level and above
# are emitted, formatted as "<LEVELNAME> : <message>".
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.DEBUG)

In [2]:
def _parse_ud_line(line):
    return line.split("\t")[1:4]

lemmatizer = lemma.load()
UD_TRAIN_FILE = "./data/UD_Danish/da-ud-train.conllu"

# Tokens whose form or lemma contains a digit, one of . ' / " _ - or "é" are
# excluded from the evaluation. Raw string: the previous non-raw literal relied
# on the invalid escape sequence "\." which newer Python versions warn about.
SKIP_PATTERN = regex.compile(r"""[0-9.'/"é_-]""")

correct = 0
incorrect = 0
ambiguous = 0
mistakes = {}  # (pos, orth, lemma_expected, lemma_actual) -> number of occurrences
pos_prev = ""

# CoNLL-U files are UTF-8; be explicit so Danish letters (æ, ø, å) decode the
# same way on every platform, and close the file deterministically.
with open(UD_TRAIN_FILE, encoding="utf-8") as ud_file:
    for line in ud_file:
        if line.startswith("#"):
            # Comment / metadata line.
            continue
        if line.strip() == "":
            # A blank line resets the previous-POS context.
            pos_prev = ""
            continue

        result = _parse_ud_line(line)
        orth, lemma_expected, pos = result
        # NOTE(review): the two `continue`s below leave `pos_prev` pointing at the
        # token *before* the skipped one, so the next token is lemmatized with a
        # stale previous POS. Kept as-is to preserve the reported numbers; confirm
        # whether the skipped token's POS should be recorded instead.
        if SKIP_PATTERN.search(orth + lemma_expected):
            continue
        if pos == "PRON" and orth.lower() == "det" and lemma_expected.lower() == "det":
            # UD has lemma("PRON", "det") == "det" while DSN has lemma("PRON", "det") == "den".
            # We ignore these.
            continue

        lemmas_actual = lemmatizer.lemmatize(pos, orth.lower(), pos_previous=pos_prev)

        lemma_actual = lemmas_actual[0]
        if len(lemmas_actual) > 1:
            # More than one candidate lemma: counted separately, not as right/wrong.
            ambiguous += 1
        elif lemma_actual.lower() == lemma_expected.lower():
            correct += 1
        else:
            mistake_key = (pos, orth, lemma_expected, lemma_actual)
            mistakes[mistake_key] = mistakes.get(mistake_key, 0) + 1
            incorrect += 1
        pos_prev = pos

total = correct + incorrect + ambiguous

print("* correct:", correct)
print("* incorrect:", incorrect)
print("* ambiguous:", ambiguous)
if total:
    # Strict accuracy (ambiguous counted as wrong), then lenient accuracy
    # (ambiguous counted as right).
    print("*", correct / total)
    print("*", (correct + ambiguous) / total)
else:
    # Avoid ZeroDivisionError when the file yields no evaluable tokens.
    print("* no tokens evaluated")

* correct: 70590
* incorrect: 297
* ambiguous: 583
* 0.9876871414579543
* 0.9958444102420596


In [3]:
# List the mistakes ordered case-insensitively by surface form (second element
# of the key tuple); ties are broken by comparing the full (key, count) item.
sorted(
    mistakes.items(),
    key=lambda entry: (entry[0][1].lower(), entry),
)

[(('NOUN', 'aftes', 'aften', 'aftes'), 5),
 (('ADV', 'allerhelst', 'helst', 'allerhelst'), 1),
 (('ADJ', 'allerstørste', 'stor', 'allerstørst'), 1),
 (('VERB', 'Bor', 'bo', 'bor'), 2),
 (('VERB', 'bor', 'bo', 'bor'), 1),
 (('VERB', 'Bortset', 'bortset', 'bortse'), 1),
 (('VERB', 'bortset', 'bortset', 'bortse'), 3),
 (('PRON', 'De', 'de', 'den'), 1),
 (('PRON', 'Deres', 'Deres', 'de'), 1),
 (('PRON', 'deres', 'deres', 'de'), 1),
 (('NOUN', 'drikkevarer', 'drikkevarer', 'drikkevare'), 2),
 (('NOUN', 'dyrenes', 'dyrenes', 'dyr'), 1),
 (('ADJ', 'Flere', 'mange', 'flere'), 7),
 (('ADJ', 'flere', 'mange', 'flere'), 64),
 (('ADJ', 'flest', 'mange', 'flest'), 1),
 (('ADJ', 'fleste', 'mange', 'flest'), 30),
 (('NOUN', 'forvejen', 'forvej', 'forvejen'), 4),
 (('VERB', 'følges', 'følges', 'følge'), 1),
 (('NOUN', 'g', 'gram', 'g'), 8),
 (('VERB', 'gør', 'gøre', 'gø'), 4),
 (('VERB', 'gør', 'gøre', 'gør'), 3),
 (('PRON', 'hans', 'hans', 'han'), 2),
 (('PRON', 'hendes', 'hendes', 'hun'), 1),
 (('NO