# Train a Lemmatizer using the `lemma` package

In [None]:
import logging
import random
import pandas as pd
from lemma import Lemmatizer, SimpleLemmatizer

In [None]:
# Configure the root logger: DEBUG threshold, timestamp/level/message layout.
logging.basicConfig(
    format="%(asctime)s : %(levelname)s : %(message)s",
    level=logging.DEBUG,
)

In [None]:
def load_data(filename):
    """Load lemmatizer training data from a CSV file.

    Only the first three columns are read, interpreted in order as
    word class, full form and lemma. ``keep_default_na=False`` keeps cells
    such as ``"NA"``, ``"null"`` or the empty string as literal strings
    instead of converting them to NaN.

    Args:
        filename: Path of the CSV file (anything ``pandas.read_csv`` accepts).

    Returns:
        A tuple ``(X, y)`` where ``X`` is a list of
        ``(word_class, full_form)`` tuples and ``y`` is the parallel list of
        lemmas, both in file order.
    """
    # Read from the argument, not from a module-level variable, so the
    # function works for any path the caller passes in.
    df = pd.read_csv(filename, usecols=[0, 1, 2], keep_default_na=False)

    # Pull the columns out once instead of walking iterrows() twice;
    # iterrows() builds a Series per row and does not preserve dtypes.
    word_classes = df.iloc[:, 0].tolist()
    full_forms = df.iloc[:, 1].tolist()
    lemmas = df.iloc[:, 2].tolist()

    X = list(zip(word_classes, full_forms))
    y = lemmas
    return X, y

def split_data(X, y):
    """Randomly hold out 1/500 of the samples as a test set.

    ``len(y) // 500`` distinct positions are drawn with ``random.sample``
    (seed the ``random`` module beforehand for a reproducible split). The
    remaining samples form the training set. Relative order of the samples
    is kept in both splits.

    Args:
        X: Sequence of inputs, indexable by position.
        y: Sequence of targets, parallel to ``X``.

    Returns:
        ``(X_train, y_train, X_test, y_test)`` as four lists.
    """
    n_samples = len(y)
    held_out = set(random.sample(range(n_samples), n_samples // 500))

    X_train, y_train = [], []
    X_test, y_test = [], []
    for position in range(n_samples):
        # Route each sample to the test or training pair of lists.
        if position in held_out:
            inputs, targets = X_test, y_test
        else:
            inputs, targets = X_train, y_train
        inputs.append(X[position])
        targets.append(y[position])

    return X_train, y_train, X_test, y_test

def print_examples(lemmatizer):
    """Lemmatize a fixed list of sample words and print each result.

    Args:
        lemmatizer: Object exposing ``lemmatize(word_class, full_form)``.
            Whatever it returns is printed with ``%s`` formatting.
    """
    samples = (
        ("verb", "drak"),
        ("noun", "kattene"),
        ("noun", "ukrudtet"),
        ("noun", "slaraffenlandet"),
        ("noun", "alen"),
        ("noun", "skaber"),
        ("noun", "venskaber"),
        ("noun", "tilbageførelser"),
        ("noun", "aftenbønnerne"),
        ("noun", "altankassepassere"),
    )
    for pos, surface in samples:
        result = lemmatizer.lemmatize(pos, surface)
        print("(%s, %s) -> %s" % (pos, surface, result))

def calculate_accuracy(lemmatizer, X, y):
    """Evaluate a lemmatizer on labelled data and print summary statistics.

    Each ``(word_class, full_form)`` pair in ``X`` is lemmatized and compared
    with the target at the same position in ``y``:

    * more than one returned candidate counts as *ambiguous*;
    * exactly one candidate equal to the target counts as *correct*;
    * anything else (wrong candidate, or no candidate at all) counts as
      neither.

    Args:
        lemmatizer: Object exposing ``lemmatize(word_class, full_form)`` that
            returns a sequence of candidate lemmas.
        X: Sequence of ``(word_class, full_form)`` pairs.
        y: Sequence of target lemmas, parallel to ``X``.

    Returns:
        None. The counts and ratios are printed to stdout. For an empty data
        set the ratios are printed as ``nan`` instead of raising
        ``ZeroDivisionError``.
    """
    total = len(y)
    correct = 0
    ambiguous = 0

    for index, target in enumerate(y):
        word_class, full_form = X[index]
        predicted = lemmatizer.lemmatize(word_class, full_form)
        if len(predicted) > 1:
            ambiguous += 1
        elif predicted and predicted[0] == target:
            # ``predicted`` is checked first so an empty result is scored as
            # a miss rather than crashing on ``predicted[0]``.
            correct += 1

    def ratio(count):
        # An empty data set (e.g. a test split of fewer than 500 rows) has
        # no meaningful ratio; report NaN instead of dividing by zero.
        return count / total if total else float("nan")

    print("correct:", correct)
    print("ambiguous:", ambiguous)
    print("total:", total)
    print("accuracy:", ratio(correct))
    print("ambiguous%:", ratio(ambiguous))
    print("ambiguous + accuracy:", ratio(ambiguous + correct))

## Load Data

In [None]:
# Location of the CSV training data; load_data() reads its first three
# columns into the inputs X and the targets y.
csv_file = "./temp/lemma_data.csv"
X, y = load_data(csv_file)

## Split Data

In [None]:
# Seed the RNG first so random.sample() inside split_data() selects the same
# test rows on every run.
random.seed(0)
X_train, y_train, X_test, y_test = split_data(X, y)

In [None]:
# Sizes of the full data set, the test split and the training split, in
# that order.
for dataset in (X, X_test, X_train):
    print(len(dataset))

## Train CST-like Lemmatizer

In [None]:
# Fit on the training split only, so the test split stays unseen.
lemmatizer = Lemmatizer()
lemmatizer.fit(X_train, y_train)

In [None]:
# Print the fitted lemmatizer's output for the hard-coded example words.
print_examples(lemmatizer)

In [None]:
# Statistics on the same data the lemmatizer was fitted on.
calculate_accuracy(lemmatizer, X_train, y_train)

In [None]:
# Statistics on the held-out test split.
calculate_accuracy(lemmatizer, X_test, y_test)

## Explore Ambiguities

In [None]:
# Number of entries under rules['noun'] whose value holds more than one item.
sum(1 for values in lemmatizer.rules['noun'].values() if len(values) > 1)

In [None]:
# First (key, value) entry under rules['noun'] whose value holds more than
# one item; raises IndexError if there is none.
ambiguous_noun_rules = [
    (key, values)
    for key, values in lemmatizer.rules['noun'].items()
    if len(values) > 1
]
ambiguous_noun_rules[0]