In [36]:
import spacy
import pyconll
import json

In [37]:
import plac
import random
from pathlib import Path
from spacy.util import minibatch, compounding

In [13]:
# Build an empty pipeline for language code "el" and attach an untrained tagger.
# NOTE(review): the training cell further down rebuilds `nlp` and `tagger` with
# lang "xx", so the objects created here are replaced before any training runs.
lang="el"
# Initialize an empty model.
nlp = spacy.blank(lang)
# Add the tagger to the pipeline.
# nlp.create_pipe works for built-ins that are registered with spaCy.
tagger = nlp.create_pipe("tagger")
# Tags would have to be added before training starts. The loop below is
# disabled (TAG_MAP is not defined in the visible cells), so no labels are
# added explicitly here.
#for tag, values in TAG_MAP.items():
#    tagger.add_label(tag, values)
nlp.add_pipe(tagger)

In [24]:
# Load the lemma lookup data from its JSON file.
# The context manager closes the file handle as soon as parsing finishes, and
# the explicit UTF-8 encoding keeps the Greek text intact on platforms whose
# default text encoding is something else.
path = "../data/ag_lemma_lookup.json"
with open(path, encoding="utf-8") as lookup_file:
    ag_lemma_lookup = json.load(lookup_file)

In [25]:
from spacy.lookups import Lookups
# Empty lookups container; the lemma table is registered on it in the next cell.
lookups = Lookups()

In [26]:
# Read the lemma lookup JSON and register its contents on `lookups` as a table
# named "ag_lemma_lookup".
# The file is opened in a context manager so the handle is always closed, and
# with an explicit UTF-8 encoding so the Greek text does not depend on the
# platform's default encoding.
path = "../data/ag_lemma_lookup.json"
with open(path, encoding="utf-8") as lookup_file:
    ag_lemma_lookup = json.load(lookup_file)
table = lookups.add_table("ag_lemma_lookup", ag_lemma_lookup)

In [28]:
from spacy.lemmatizer import Lemmatizer
# Build a lemmatizer backed by the `lookups` container created above.
# NOTE(review): spaCy v2.2's Lemmatizer is documented to read tables named
# "lemma_lookup", "lemma_rules", "lemma_index" and "lemma_exc"; the only table
# registered on `lookups` here is "ag_lemma_lookup" — confirm it is actually used.
# `lemmatizer` is not referenced again in the visible cells.
lemmatizer = Lemmatizer(lookups)

# Training our own model

In [38]:
# Download the training splits of the two Ancient Greek Universal Dependencies
# treebanks (Perseus and PROIEL).
# load_from_url materialises each corpus in memory, so the cells that iterate
# over it can be re-run safely; the lazy iter_from_url generator can be consumed
# only once and silently yields nothing on a second pass.
# NOTE(review): both URLs track the `master` branch, so the downloaded data can
# change between runs — consider pinning a release tag or commit.
corpus_perseus = pyconll.load.load_from_url("https://raw.githubusercontent.com/UniversalDependencies/UD_Ancient_Greek-Perseus/master/grc_perseus-ud-train.conllu")

corpus_proiel = pyconll.load.load_from_url("https://raw.githubusercontent.com/UniversalDependencies/UD_Ancient_Greek-PROIEL/master/grc_proiel-ud-train.conllu")


In [39]:
# One (raw sentence text, annotations) pair per Perseus sentence. The
# annotations hold the token forms and their UPOS tags as two parallel lists.
train_data_perseus = [
    (
        sentence.text,
        {
            "words": [token.form for token in sentence],
            "tags": [token.upos for token in sentence],
        },
    )
    for sentence in corpus_perseus
]

In [40]:
# One (raw sentence text, annotations) pair per PROIEL sentence. The
# annotations hold the token forms and their UPOS tags as two parallel lists.
train_data_proiel = [
    (
        sentence.text,
        {
            "words": [token.form for token in sentence],
            "tags": [token.upos for token in sentence],
        },
    )
    for sentence in corpus_proiel
]

In [41]:
# Peek at the first three PROIEL examples to check the (text, annotations) structure.
print(train_data_proiel[:3])

[('Ἡροδότου Ἁλικαρνησσέος ἱστορίης ἀπόδεξις ἥδε ὡς μήτε τὰ γενόμενα ἐξ ἀνθρώπων τῷ χρόνῳ ἐξίτηλα γένηται μήτε ἔργα μεγάλα τε καὶ θωμαστά τὰ μὲν Ἕλλησι τὰ δὲ βαρβάροισι ἀποδεχθέντα ἀκλεᾶ γένηται τά τε ἄλλα καὶ δι’ ἣν αἰτίην ἐπολέμησαν ἀλλήλοισι', {'words': ['Ἡροδότου', 'Ἁλικαρνησσέος', 'ἱστορίης', 'ἀπόδεξις', 'ἥδε', 'ὡς', 'μήτε', 'τὰ', 'γενόμενα', 'ἐξ', 'ἀνθρώπων', 'τῷ', 'χρόνῳ', 'ἐξίτηλα', 'γένηται', 'μήτε', 'ἔργα', 'μεγάλα', 'τε', 'καὶ', 'θωμαστά', 'τὰ', 'μὲν', 'Ἕλλησι', 'τὰ', 'δὲ', 'βαρβάροισι', 'ἀποδεχθέντα', 'ἀκλεᾶ', 'γένηται', 'τά', 'τε', 'ἄλλα', 'καὶ', 'δι’', 'ἣν', 'αἰτίην', 'ἐπολέμησαν', 'ἀλλήλοισι'], 'tags': ['PROPN', 'NOUN', 'NOUN', 'NOUN', 'DET', 'SCONJ', 'CCONJ', 'DET', 'VERB', 'ADP', 'NOUN', 'DET', 'NOUN', 'ADJ', 'VERB', 'CCONJ', 'NOUN', 'ADJ', 'CCONJ', 'CCONJ', 'ADJ', 'PRON', 'ADV', 'NOUN', 'PRON', 'ADV', 'NOUN', 'VERB', 'ADJ', 'VERB', 'DET', 'CCONJ', 'ADJ', 'CCONJ', 'ADP', 'PRON', 'NOUN', 'VERB', 'PRON']}), ('Περσέων μέν νυν οἱ λόγιοι Φοίνικας αἰτίους φασὶ γενέσθαι τῆς δι

In [42]:
# Combine both treebanks into one list of examples, Perseus sentences first.
train_data = [*train_data_perseus, *train_data_proiel]

In [43]:
# Number of training examples across both treebanks.
len(train_data)

26490

In [39]:
# Shuffle the combined examples in place so the two treebanks are interleaved.
# Seeding Python's RNG first makes the resulting order repeatable after a
# kernel restart instead of differing on every run.
random.seed(0)
random.shuffle(train_data)

In [None]:
# Train a tagger from scratch on `train_data`.
#
# Fix the random seeds before the model is created so that weight
# initialisation and the per-epoch shuffles are repeatable for a given input
# order. Without this, two runs of this cell produce different taggers.
spacy.util.fix_random_seed(0)

# Start from a blank pipeline that contains only an untrained tagger.
# NOTE(review): "xx" is presumably used as spaCy's language-neutral code because
# there is no dedicated Ancient Greek language class — confirm.
lang="xx"
nlp = spacy.blank(lang)
tagger = nlp.create_pipe("tagger")

nlp.add_pipe(tagger)

optimizer = nlp.begin_training()

# Number of passes over the full training set.
n_iter = 10
for i in range(n_iter):
    # Re-shuffle in place every epoch so batches differ between passes.
    random.shuffle(train_data)
    # Reset per epoch; nlp.update accumulates the loss into this dict.
    losses = {}
    # Batch up the examples using spaCy's minibatch; the batch size compounds
    # from 4 up to 32 by a factor of 1.001 per batch.
    batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
        # Split the (text, annotations) pairs into two parallel tuples.
        texts, annotations = zip(*batch)
        nlp.update(texts, annotations, sgd=optimizer, losses=losses)
    print("Losses", losses)

In [55]:
# Smoke test: run the trained pipeline over one sentence of Greek verse and
# show, per token, its text, fine-grained tag and coarse part of speech.
test_text = "ἀρχόμενος σέο, Φοῖβε, παλαιγενέων κλέα φωτῶν μνήσομαι, οἳ Πόντοιο κατὰ στόμα καὶ διὰ πέτρας Κυανέας βασιλῆος ἐφημοσύνῃ Πελίαο χρύσειον μετὰ κῶας ἐύζυγον ἤλασαν Ἀργώ."
doc = nlp(test_text)
tagged_tokens = [(token.text, token.tag_, token.pos_) for token in doc]
print("Tags", tagged_tokens)

Tags [('ἀρχόμενος', 'VERB', 'VERB'), ('σέο', 'PRON', 'PRON'), (',', 'PUNCT', 'PUNCT'), ('Φοῖβε', 'NOUN', 'NOUN'), (',', 'PUNCT', 'PUNCT'), ('παλαιγενέων', 'ADJ', 'ADJ'), ('κλέα', 'NOUN', 'NOUN'), ('φωτῶν', 'NOUN', 'NOUN'), ('μνήσομαι', 'VERB', 'VERB'), (',', 'PUNCT', 'PUNCT'), ('οἳ', 'PRON', 'PRON'), ('Πόντοιο', 'NOUN', 'NOUN'), ('κατὰ', 'ADP', 'ADP'), ('στόμα', 'NOUN', 'NOUN'), ('καὶ', 'CCONJ', 'CCONJ'), ('διὰ', 'ADP', 'ADP'), ('πέτρας', 'NOUN', 'NOUN'), ('Κυανέας', 'NOUN', 'NOUN'), ('βασιλῆος', 'NOUN', 'NOUN'), ('ἐφημοσύνῃ', 'VERB', 'VERB'), ('Πελίαο', 'NOUN', 'NOUN'), ('χρύσειον', 'ADJ', 'ADJ'), ('μετὰ', 'ADP', 'ADP'), ('κῶας', 'NOUN', 'NOUN'), ('ἐύζυγον', 'VERB', 'VERB'), ('ἤλασαν', 'VERB', 'VERB'), ('Ἀργώ', 'ADV', 'ADV'), ('.', 'PUNCT', 'PUNCT')]


In [42]:
# Same smoke test as the previous cell: tag one sentence and print
# (text, tag, pos) for every token.
# NOTE(review): this cell duplicates the one above, yet its saved output differs
# for a few tokens — presumably it was executed against a different training
# run. Consider keeping only one of the two cells.
test_text = "ἀρχόμενος σέο, Φοῖβε, παλαιγενέων κλέα φωτῶν μνήσομαι, οἳ Πόντοιο κατὰ στόμα καὶ διὰ πέτρας Κυανέας βασιλῆος ἐφημοσύνῃ Πελίαο χρύσειον μετὰ κῶας ἐύζυγον ἤλασαν Ἀργώ."
doc = nlp(test_text)
print("Tags", [(t.text, t.tag_, t.pos_) for t in doc])

Tags [('ἀρχόμενος', 'VERB', 'VERB'), ('σέο', 'PRON', 'PRON'), (',', 'PUNCT', 'PUNCT'), ('Φοῖβε', 'NOUN', 'NOUN'), (',', 'PUNCT', 'PUNCT'), ('παλαιγενέων', 'VERB', 'VERB'), ('κλέα', 'NOUN', 'NOUN'), ('φωτῶν', 'NOUN', 'NOUN'), ('μνήσομαι', 'VERB', 'VERB'), (',', 'PUNCT', 'PUNCT'), ('οἳ', 'PRON', 'PRON'), ('Πόντοιο', 'NOUN', 'NOUN'), ('κατὰ', 'ADP', 'ADP'), ('στόμα', 'NOUN', 'NOUN'), ('καὶ', 'CCONJ', 'CCONJ'), ('διὰ', 'ADP', 'ADP'), ('πέτρας', 'NOUN', 'NOUN'), ('Κυανέας', 'NOUN', 'NOUN'), ('βασιλῆος', 'NOUN', 'NOUN'), ('ἐφημοσύνῃ', 'ADV', 'ADV'), ('Πελίαο', 'NOUN', 'NOUN'), ('χρύσειον', 'ADJ', 'ADJ'), ('μετὰ', 'ADP', 'ADP'), ('κῶας', 'NOUN', 'NOUN'), ('ἐύζυγον', 'VERB', 'VERB'), ('ἤλασαν', 'VERB', 'VERB'), ('Ἀργώ', 'NOUN', 'NOUN'), ('.', 'PUNCT', 'PUNCT')]


In [57]:
# Persist the trained pipeline to disk.
nlp.to_disk("../data/spacy_model")

# Loading the model back

In [35]:
# Load the trained pipeline back from the directory it was saved to above.
# The path previously read "../spacy_model", which does not match the save
# location "../data/spacy_model"; a fresh top-to-bottom run could only succeed
# if a model happened to exist there from an earlier session.
nlp = spacy.load("../data/spacy_model")

In [6]:
# Load the lemma lookup data from the large-files directory.
# The context manager closes the file handle once parsing is done, and the
# explicit UTF-8 encoding keeps the Greek text intact regardless of the
# platform's default encoding.
path = "../data/large_files/ag_lemma_lookup.json"
with open(path, encoding="utf-8") as lookup_file:
    ag_lemma_lookup = json.load(lookup_file)

In [32]:
# Register the lemma data on the pipeline's vocab as the "lemma_exc" table
# (an earlier version of this cell used the name "ag_lemma_lookup").
# add_table raises a ValueError when a table with that name already exists,
# which happens when this cell is re-run or when the loaded model was saved
# with the table in place, so drop any existing table first.
if nlp.vocab.lookups.has_table("lemma_exc"):
    nlp.vocab.lookups.remove_table("lemma_exc")
table = nlp.vocab.lookups.add_table("lemma_exc", ag_lemma_lookup)

In [10]:
# Save the pipeline again, now including the lemma table added to its vocab.
# NOTE(review): this writes to "../spacy_model", whereas the trained model was
# first saved to "../data/spacy_model" — confirm two locations are intended.
nlp.to_disk("../spacy_model")

In [14]:
# Read the lemma table back from the vocab and look up one verb form.
# The table is registered under "lemma_exc" in the cell that adds it; asking
# for "ag_lemma_lookup" fails unless a table of that name was left behind by an
# earlier session.
ag_lemma_lookup = nlp.vocab.lookups.get_table("lemma_exc")
ag_lemma_lookup["VERB"]["μνήσομαι"]