In [84]:
import spacy
import pyconll
import json

import plac
import random
from pathlib import Path
from spacy.util import minibatch, compounding

import sddk

In [85]:
conf = sddk.configure("SDAM_root", "648597@au.dk")

sciencedata.dk username (format '123456@au.dk'): 648597@au.dk
sciencedata.dk password: ········
connection with shared folder established with you as its owner
endpoint variable has been configured to: https://sciencedata.dk/files/SDAM_root/


# Initialize the model

In [3]:
!python3 -m spacy init-model xx ../models/spacy_grc_model_5_vectors --vectors-loc ../data/word2vec_win2.txt

[2K[38;5;2m✔ Successfully created model[0m
134209it [00:06, 20598.20it/s]/word2vec_win2.txt
[2K[38;5;2m✔ Loaded vectors from ../data/word2vec_win2.txt[0m
[38;5;2m✔ Sucessfully compiled vocab[0m
134382 entries, 134209 vectors


In [41]:
# load it into our envoronment
nlp = spacy.load("../models/spacy_grc_model_5_vectors")
tagger = nlp.create_pipe("tagger")
nlp.add_pipe(tagger)
optimizer = nlp.begin_training()

# Add lookup tables

In [71]:
ag_lemma_lookup = json.load(open("../data/ag_lemma_lookup.json"))
ag_lemma_lookup_merged = json.load(open("../data/ag_lemma_lookup_merged.json"))

In [72]:
ag_lemma_lookup["NOUN"]["ἐνεργειῶν"]

'ἐνέργεια'

ok, this works but there is a substantial number of cases in which there is a wrong tag, e.g. "VERB" - in this case the lemmatization is unsuccessful

In [73]:
ag_lemma_lookup_merged["ἐνεργειῶν"]

'ἐνέργεια'

In [74]:
try:
    nlp.vocab.lookups.remove_table("lemma_lookup")
    nlp.vocab.lookups.remove_table("lemma_lookup_merged")
except:
    pass

In [75]:
# let add our table to our model
table = nlp.vocab.lookups.add_table("lemma_lookup", ag_lemma_lookup)
table = nlp.vocab.lookups.add_table("lemma_lookup_merged", ag_lemma_lookup_merged)
# nlp.vocab.lookups.remove_table("lemma_exc")

In [76]:
# load it back from the model (to check functionality for future usage)
ag_lemma_lookup = nlp.vocab.lookups.get_table("lemma_lookup")
ag_lemma_lookup_merged = nlp.vocab.lookups.get_table("lemma_lookup_merged")


ag_lemma_lookup["VERB"]["μνήσομαι"]

'μιμνήσκω'

# Lemmatization functions

In [93]:
# !pip install greek-accentuation
from greek_accentuation.characters import strip_accents
from greek_accentuation.syllabify import *
from greek_accentuation.accentuation import *



def grave_to_acute(string):
    GRAVE = "\u0300"
    ACUTE = "\u0301"
    return unicodedata.normalize("NFC", "".join(unicodedata.normalize("NFD", string).replace(GRAVE, ACUTE)))


def list_of_possible_accentuations(morph):
    try:
        if isinstance(morph, str):
            morph = strip_accents(morph)
            morph = rebreath(morph.lower())
            s = syllabify(morph)
            morph_vars = []
            for accentuation in possible_accentuations(s, default_short=True):
                pos, accent = accentuation #add_accentuation(s, accentuation))
                final = s[1 - pos:] if pos > 1 else [""]
                morph_acc_var = "".join(s[:-pos] + [syllable_add_accent(s[-pos], accent)] + final)
                morph_vars.append(morph_acc_var)
                morph_vars.append(morph_acc_var.capitalize())
            return morph_vars
        else:
            return []
    except:
        return []


def lemmatizer_v0(token_text, tag, old_lemma=None):
    if (old_lemma==None) or (token_text == old_lemma):
        lemma = token_text # start with assigning the word as it stands
        try:
            lemma = ag_lemma_lookup[tag][token_text]
        except: 
            try:
                lemma = ag_lemma_lookup[tag][grave_to_acute(token_text)]
            except:
                morph_vars = list_of_possible_accentuations(token_text)
                for var in morph_vars:
                    lemma = morph_vars[0]
                    try:
                        lemma = ag_lemma_lookup[tag][var] 
                        break
                    except: 
                        pass
        return lemma
    else:
        return old_lemma

def update_lemmata(doc):
    for token in doc:
        token.lemma_ = lemmatizer(token.text, token.pos_, token.lemma_)
    return doc   

def apply_nlp(sentences_list):
    spacy_docs = []
    for sentence in sentences_list:
        doc = nlp(sentence)
        doc = update_lemmata(doc)
        spacy_docs.append(doc)
    return spacy_docs
    
def lemmatizer(token_text, tag, old_lemma=None):
    if (old_lemma==None) or (token_text == old_lemma):
        lemma = token_text # start with assigning the word as it stands
        try:
            lemma = ag_lemma_lookup[tag][token_text]
        except:
            try:
                lemma = ag_lemma_lookup_merged[token_text]
            except:
                try:
                    lemma = ag_lemma_lookup[tag][grave_to_acute(token_text)]
                except:
                    try:
                        lemma = ag_lemma_lookup_merged[grave_to_acute(token_text)]
                    except:
                        morph_vars = list_of_possible_accentuations(token_text)
                        for var in morph_vars:
                            lemma = morph_vars[0]
                            try:
                                lemma = ag_lemma_lookup[tag][var] 
                                break
                            except:
                                try:
                                    lemma = ag_lemma_lookup_merged[var]
                                    break
                                except:
                                    pass
        return lemma
    else:
        return old_lemma

In [78]:
lemmatizer("ΠΑΥΛΟΣ", "NOUN")

'Παῦλος'

In [49]:
lemmatizer("ΒΙΒΛΟΣ", "NOUN")

'βίβλος'

In [50]:
lemmatizer("Ἰησοῦ", "NOUN")

'Ἰησοῦς'

In [51]:
# nonexistent word
lemmatizer("ΒΙΒΒΙΒΛΟΣ", "NOUN")

'βιββιβλός'

In [52]:
# it works even with wrong POS-tag
lemmatizer("ἐνεργειῶν", "VERB")

'ἐνέργεια'

# Training our own model with gold-parse

In [53]:
urls = [
    "https://raw.githubusercontent.com/UniversalDependencies/UD_Ancient_Greek-Perseus/master/grc_perseus-ud-train.conllu",
    "https://raw.githubusercontent.com/UniversalDependencies/UD_Ancient_Greek-Perseus/master/grc_perseus-ud-dev.conllu",
    "https://raw.githubusercontent.com/UniversalDependencies/UD_Ancient_Greek-PROIEL/master/grc_proiel-ud-train.conllu",
    "https://raw.githubusercontent.com/UniversalDependencies/UD_Ancient_Greek-PROIEL/master/grc_proiel-ud-dev.conllu"]

In [54]:
def tagged_data_from_url(url):
    corpus = pyconll.load.iter_from_url(url)
    tagged_data = []
    for sentence in corpus:
        words, tags, lemmata = [], [], []
        for token in sentence:
            words.append(token.form)
            tags.append(token.upos)
            lemmata.append(token.lemma)
        if "-dev." in url:
            tagged_data.append((sentence.text, {"words" : words, "tags" : tags, "lemmata" : lemmata}))
        else:
            tagged_data.append((sentence.text, {"words" : words, "tags" : tags}))
    print("File name: {0}; number of sentences: {1}".format(url.rpartition("/")[2], len(tagged_data)))
    return tagged_data

In [55]:
perseus_train = tagged_data_from_url(urls[0])
perseus_dev = tagged_data_from_url(urls[1])
proiel_train = tagged_data_from_url(urls[2])
proiel_dev = tagged_data_from_url(urls[3])

File name: grc_perseus-ud-train.conllu; number of sentences: 11476
File name: grc_perseus-ud-dev.conllu; number of sentences: 1137
File name: grc_proiel-ud-train.conllu; number of sentences: 15014
File name: grc_proiel-ud-dev.conllu; number of sentences: 1019


In [56]:
print(perseus_train[0])

('ἐρᾷ μὲν ἁγνὸς οὐρανὸς τρῶσαι χθόνα, ἔρως δὲ γαῖαν λαμβάνει γάμου τυχεῖν·', {'words': ['ἐρᾷ', 'μὲν', 'ἁγνὸς', 'οὐρανὸς', 'τρῶσαι', 'χθόνα', ',', 'ἔρως', 'δὲ', 'γαῖαν', 'λαμβάνει', 'γάμου', 'τυχεῖν', '·'], 'tags': ['VERB', 'ADV', 'ADJ', 'NOUN', 'VERB', 'NOUN', 'PUNCT', 'NOUN', 'CCONJ', 'NOUN', 'VERB', 'NOUN', 'VERB', 'PUNCT']})


In [57]:
train_data = perseus_train + proiel_train

In [58]:
len(train_data)

26490

In [59]:
proiel_dev_len = sum([len(sent[1]["tags"]) for sent in proiel_dev])
print("tokens in proiel_dev: " + str(proiel_dev_len))

perseus_dev_len = sum([len(sent[1]["tags"]) for sent in perseus_dev])
print("tokens in perseus_dev: " + str(perseus_dev_len))
total_len = proiel_dev_len + perseus_dev_len

tokens in proiel_dev: 13652
tokens in perseus_dev: 22135


In [80]:
n_iter = 10

from spacy.util import decaying
dropout = decaying(0.6, 0.2, 1e-4)

model_tests = []
for i in range(n_iter):
    
    ### TRAIN THE MODEL
    random.shuffle(train_data)
    losses = {}
        # batch up the examples using spaCy's minibatch
    batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
        texts, annotations = zip(*batch)
        nlp.update(texts, annotations, drop=0.6, sgd=optimizer, losses=losses)
    
    ### TEST THE MODEL
    model_test = {}
    model_test["Losses"] = losses
    proiel_tags_correct = 0
    proiel_lemmata_correct = 0
    for sent in proiel_dev:
        doc = nlp(sent[0])
        predicted_tags = [token.pos_ for token in doc]
        predicted_lemmata = [lemmatizer(token.text, token.pos_) for token in doc]
        given_tags = sent[1]["tags"]
        given_lemmata = sent[1]["lemmata"]
        proiel_tags_correct += len([p for p, g in zip(predicted_tags, given_tags) if p ==g])
        proiel_lemmata_correct += len([p for p, g in zip(predicted_lemmata, given_lemmata) if p ==g])
    model_test["proiel_tags_correct%"] = (proiel_tags_correct / proiel_dev_len) * 100
    model_test["proiel_lemmata_correct%"] = (proiel_lemmata_correct / proiel_dev_len) * 100

    perseus_tags_correct = 0
    perseus_lemmata_correct = 0
    for sent in perseus_dev:
        doc = nlp(sent[0])
        predicted_tags = [token.pos_ for token in doc]
        predicted_lemmata = [lemmatizer(token.text, token.pos_) for token in doc]
        given_tags = sent[1]["tags"]
        given_lemmata = sent[1]["lemmata"]
        perseus_tags_correct += len([p for p, g in zip(predicted_tags, given_tags) if p ==g])
        perseus_lemmata_correct += len([p for p, g in zip(predicted_lemmata, given_lemmata) if p ==g])
    model_test["perseus_tags_correct%"] = (perseus_tags_correct / perseus_dev_len) * 100
    model_test["perseus_lemmata_correct%"] = (perseus_lemmata_correct / perseus_dev_len) * 100
    print(model_test)
    model_tests.append(model_test)

{'Losses': {'tagger': 51263.093717992306}, 'proiel_tags_correct%': 88.17755640199239, 'proiel_lemmata_correct%': 88.0457075886317, 'perseus_tags_correct%': 84.85656200587304, 'perseus_lemmata_correct%': 88.11836458098034}
{'Losses': {'tagger': 49777.75423413515}, 'proiel_tags_correct%': 88.22150600644594, 'proiel_lemmata_correct%': 88.06768239085848, 'perseus_tags_correct%': 85.14117912807771, 'perseus_lemmata_correct%': 88.17257736616219}
{'Losses': {'tagger': 48755.168744176626}, 'proiel_tags_correct%': 88.1629065338412, 'proiel_lemmata_correct%': 88.0457075886317, 'perseus_tags_correct%': 84.74361870341089, 'perseus_lemmata_correct%': 88.14547097357128}
{'Losses': {'tagger': 47687.73197457008}, 'proiel_tags_correct%': 88.27278054497509, 'proiel_lemmata_correct%': 88.03838265455612, 'perseus_tags_correct%': 85.05534221820646, 'perseus_lemmata_correct%': 88.15450643776825}
{'Losses': {'tagger': 47103.34317180514}, 'proiel_tags_correct%': 88.00908291825374, 'proiel_lemmata_correct%': 8

In [81]:
nlp.to_disk("../models/spacy_grc_model_5_tagger")

In [79]:
# COMPARE WITH OUR FIRST MODEL
nlp_1 = spacy.load("../models/spacy_grc_model_1")

model_test = {}
proiel_tags_correct = 0
proiel_lemmata_correct = 0
for sent in proiel_dev:
    doc = nlp_1(sent[0])
    predicted_tags = [token.pos_ for token in doc]
    predicted_lemmata = [lemmatizer(token.text, token.pos_) for token in doc]
    given_tags = sent[1]["tags"]
    given_lemmata = sent[1]["lemmata"]
    proiel_tags_correct += len([p for p, g in zip(predicted_tags, given_tags) if p ==g])
    proiel_lemmata_correct += len([p for p, g in zip(predicted_lemmata, given_lemmata) if p ==g])
model_test["proiel_tags_correct%"] = (proiel_tags_correct / proiel_dev_len) * 100
model_test["proiel_lemmata_correct%"] = (proiel_lemmata_correct / proiel_dev_len) * 100

perseus_tags_correct = 0
perseus_lemmata_correct = 0
for sent in perseus_dev:
    doc = nlp_1(sent[0])
    predicted_tags = [token.pos_ for token in doc]
    predicted_lemmata = [lemmatizer(token.text, token.pos_) for token in doc]
    given_tags = sent[1]["tags"]
    given_lemmata = sent[1]["lemmata"]
    perseus_tags_correct += len([p for p, g in zip(predicted_tags, given_tags) if p ==g])
    perseus_lemmata_correct += len([p for p, g in zip(predicted_lemmata, given_lemmata) if p ==g])
model_test["perseus_tags_correct%"] = (perseus_tags_correct / perseus_dev_len) * 100
model_test["perseus_lemmata_correct%"] = (perseus_lemmata_correct / perseus_dev_len) * 100

model_test

{'proiel_tags_correct%': 87.90653384119544,
 'proiel_lemmata_correct%': 88.13360679753882,
 'perseus_tags_correct%': 85.05534221820646,
 'perseus_lemmata_correct%': 88.1364355093743}

our results with first model:
```
{'proiel_tags_correct%': 87.90653384119544,
 'proiel_lemmata_correct%': 85.62115440961031,
 'perseus_tags_correct%': 85.05534221820646,
 'perseus_lemmata_correct%': 82.66997967020556}
 ```

# Testing

In [86]:
AGT = sddk.read_file("SDAM_data/AGT/AGT_preprocessed_20201127.json", "df", conf)

In [87]:
doc = update_lemmata(nlp(AGT[AGT["author_id"]=="tlg0031paul"].iloc[3]["sentences"][0]))
print([(token.text, token.pos_, token.lemma_) for token in doc])

[('ΠΑΥΛΟΣ', 'VERB', 'Παῦλος'), ('ἀπόστολος', 'NOUN', 'ἀπόστολος'), (',', 'PUNCT', ','), ('οὐκ', 'ADV', 'οὐ'), ('ἀπʼ', 'ADP', 'ἄπʼ'), ('ἀνθρώπων', 'NOUN', 'ἄνθρωπος'), ('οὐδὲ', 'CCONJ', 'οὐδέ'), ('διʼ', 'ADP', 'δίʼ'), ('ἀνθρώπου', 'NOUN', 'ἄνθρωπος'), ('ἀλλὰ', 'CCONJ', 'ἀλλά'), ('διὰ', 'ADP', 'διά'), ('Ἰησοῦ', 'PROPN', 'Ἰησοῦς'), ('Χριστοῦ', 'PROPN', 'Χριστός'), ('καὶ', 'CCONJ', 'καί'), ('θεοῦ', 'NOUN', 'θεός'), ('πατρὸς', 'NOUN', 'πατήρ'), ('τοῦ', 'DET', 'ὁ'), ('ἐγείραντος', 'VERB', 'ἐγείρω'), ('αὐτὸν', 'PRON', 'αὐτός'), ('ἐκ', 'ADP', 'ἐκ'), ('νεκρῶν', 'ADJ', 'νεκρός'), (',', 'PUNCT', ','), ('καὶ', 'CCONJ', 'καί'), ('οἱ', 'DET', 'ὁ'), ('σὺν', 'ADP', 'σύν'), ('ἐμοὶ', 'PRON', 'ἐγώ'), ('πάντες', 'ADJ', 'πᾶς'), ('ἀδελφοί', 'NOUN', 'ἀδελφός'), (',', 'PUNCT', ','), ('ταῖς', 'DET', 'ὁ'), ('ἐκκλησίαις', 'NOUN', 'ἐκκλησία'), ('τῆς', 'DET', 'ὁ'), ('Γαλατίας', 'NOUN', 'Γαλατία'), ('·', 'PUNCT', '·')]


In [94]:
arist_test = apply_nlp(AGT[AGT["doc_id"]=="tlg0086.tlg010"]["sentences"].tolist()[0])
for doc in arist_test[:10]:
    print([(token.text, token.pos_, token.lemma_) for token in doc])

[('πᾶσα', 'DET', 'πᾶς'), ('τέχνη', 'NOUN', 'τέχνη'), ('καὶ', 'CCONJ', 'καί'), ('πᾶσα', 'DET', 'πᾶς'), ('μέθοδος', 'NOUN', 'μέθοδος'), (',', 'PUNCT', ','), ('ὁμοίως', 'ADV', 'ὁμοίως'), ('δὲ', 'PART', 'δέ'), ('πρᾶξίς', 'ADJ', 'πρᾶξις'), ('τε', 'PART', 'τε'), ('καὶ', 'CCONJ', 'καί'), ('προαίρεσις', 'NOUN', 'προαίρεσις'), (',', 'PUNCT', ','), ('ἀγαθοῦ', 'ADJ', 'ἀγαθός'), ('τινὸς', 'PRON', 'τίς'), ('ἐφίεσθαι', 'VERB', 'ἐφίημι'), ('δοκεῖ', 'VERB', 'δοκέω'), ('·', 'PUNCT', '·')]
[('διὸ', 'ADV', 'διό'), ('καλῶς', 'ADV', 'καλῶς'), ('ἀπεφήναντο', 'VERB', 'ἀποφαίνω'), ('τἀγαθόν', 'NOUN', 'ἀγαθός'), (',', 'PUNCT', ','), ('οὗ', 'PRON', 'ὅς'), ('πάντʼ', 'ADJ', 'πάντʼ'), ('ἐφίεται', 'VERB', 'ἐφίημι'), ('.', 'PUNCT', '.')]
[('διαφορὰ', 'NOUN', 'διαφορά'), ('δέ', 'ADV', 'δέ'), ('τις', 'ADJ', 'τὶς'), ('φαίνεται', 'VERB', 'φαίνω'), ('τῶν', 'DET', 'ὁ'), ('τελῶν', 'NOUN', 'τέλος'), ('·', 'PUNCT', '·')]
[('τὰ', 'DET', 'ὁ'), ('μὲν', 'PART', 'μέν'), ('γάρ', 'PART', 'γάρ'), ('εἰσιν', 'VERB', 'εἰμί'), ('ἐνέργει