In [None]:
import spacy
import pyconll
import json

In [None]:
import plac
import random
from pathlib import Path
from spacy.util import minibatch, compounding

In [None]:
# Build an empty pipeline for the chosen language code and attach a tagger.
lang = "el"
nlp = spacy.blank(lang)

# create_pipe resolves the names of built-in components registered with spaCy.
tagger = nlp.create_pipe("tagger")

# Tag labels, if needed, have to be registered at this point (before training):
#   for tag, values in TAG_MAP.items():
#       tagger.add_label(tag, values)
# That step is left disabled here.
nlp.add_pipe(tagger)

In [None]:
# Load the lemma lookup data from its JSON file.
# The file is opened in a `with` block so the handle is always closed, and it
# is decoded explicitly as UTF-8 so reading the Greek entries does not depend
# on the platform's default encoding.
path = "../data/ag_lemma_lookup.json"
with open(path, encoding="utf-8") as lemma_file:
    ag_lemma_lookup = json.load(lemma_file)

In [None]:
# Build a Lemmatizer backed by the lookup data loaded into `ag_lemma_lookup`.
# The Lemmatizer needs a Lookups container, so one is created here explicitly
# instead of relying on a `lookups` name that no cell defines.
from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups

lookups = Lookups()
# NOTE(review): "lemma_lookup" matches the table name used when the same JSON
# data is registered on the vocab later in this notebook — confirm that this
# is the table the lemmatizer is expected to read.
lookups.add_table("lemma_lookup", ag_lemma_lookup)
lemmatizer = Lemmatizer(lookups)

# Training our own model

In [None]:
# Open sentence iterators over the two UD Ancient Greek training treebanks
# (Perseus first, then PROIEL), fetched directly from GitHub.
# NOTE(review): if these iterators are one-shot, the conversion cells below can
# only consume them once per execution of this cell — confirm.
corpus_perseus = pyconll.load.iter_from_url(
    "https://raw.githubusercontent.com/UniversalDependencies/UD_Ancient_Greek-Perseus/master/grc_perseus-ud-train.conllu"
)
corpus_proiel = pyconll.load.iter_from_url(
    "https://raw.githubusercontent.com/UniversalDependencies/UD_Ancient_Greek-PROIEL/master/grc_proiel-ud-train.conllu"
)


In [None]:
# Turn every Perseus sentence into a (text, annotations) training pair, where
# the annotations hold parallel lists of token forms and their UPOS tags.
train_data_perseus = []
for sentence in corpus_perseus:
    words = [token.form for token in sentence]
    tags = [token.upos for token in sentence]
    train_data_perseus.append((sentence.text, {"words": words, "tags": tags}))

In [None]:
# Turn every PROIEL sentence into a (text, annotations) training pair, where
# the annotations hold parallel lists of token forms and their UPOS tags.
train_data_proiel = []
for sentence in corpus_proiel:
    words = [token.form for token in sentence]
    tags = [token.upos for token in sentence]
    train_data_proiel.append((sentence.text, {"words": words, "tags": tags}))

In [None]:
# Peek at the first three PROIEL training pairs to sanity-check their structure.
print(train_data_proiel[:3])

In [None]:
# Merge both treebanks into one training list (Perseus pairs first, then PROIEL).
train_data = [*train_data_perseus, *train_data_proiel]

In [None]:
# Total number of training sentences across both treebanks.
len(train_data)

In [None]:
# Shuffle the combined examples in place so the two treebanks are interleaved.
# The RNG is seeded first so that a fresh "Restart & Run All" reproduces the
# same ordering (and the same per-epoch reshuffles that follow).
random.seed(0)
random.shuffle(train_data)

In [None]:
# Train a tagger from scratch on the combined training pairs.
# A new blank pipeline is created for the "xx" language code, replacing the
# `nlp` and `tagger` objects built earlier in the notebook.
lang = "xx"
nlp = spacy.blank(lang)
tagger = nlp.create_pipe("tagger")
nlp.add_pipe(tagger)

optimizer = nlp.begin_training()

n_iter = 10
for i in range(n_iter):
    # Reshuffle at the start of every pass over the data.
    random.shuffle(train_data)
    losses = {}
    # Per-batch sizes come from spaCy's compounding(4.0, 32.0, 1.001) generator.
    batch_sizes = compounding(4.0, 32.0, 1.001)
    for batch in minibatch(train_data, size=batch_sizes):
        # Split the (text, annotations) pairs into two parallel tuples.
        texts, annotations = zip(*batch)
        nlp.update(texts, annotations, sgd=optimizer, losses=losses)
    print("Losses", losses)

In [None]:
# Run the pipeline on a sample sentence and print, for every token, the triple
# (text, tag_, pos_).
test_text = "ἀρχόμενος σέο, Φοῖβε, παλαιγενέων κλέα φωτῶν μνήσομαι, οἳ Πόντοιο κατὰ στόμα καὶ διὰ πέτρας Κυανέας βασιλῆος ἐφημοσύνῃ Πελίαο χρύσειον μετὰ κῶας ἐύζυγον ἤλασαν Ἀργώ."
doc = nlp(test_text)
token_tags = [(token.text, token.tag_, token.pos_) for token in doc]
print("Tags", token_tags)

In [None]:
# Tag the same sample sentence again and print (text, tag_, pos_) per token.
# NOTE(review): this cell repeats the previous one verbatim.
test_text = "ἀρχόμενος σέο, Φοῖβε, παλαιγενέων κλέα φωτῶν μνήσομαι, οἳ Πόντοιο κατὰ στόμα καὶ διὰ πέτρας Κυανέας βασιλῆος ἐφημοσύνῃ Πελίαο χρύσειον μετὰ κῶας ἐύζυγον ἤλασαν Ἀργώ."
doc = nlp(test_text)
print("Tags", [(t.text, t.tag_, t.pos_) for t in doc])

In [None]:
# Persist the trained pipeline to disk.
# NOTE(review): the reload cell further down reads from "../spacy_model", not
# from this "../data/spacy_model" directory — confirm which location is meant.
nlp.to_disk("../data/spacy_model")

# Loading the model back

In [None]:
# Load a saved pipeline back from disk, replacing the in-memory `nlp`.
# NOTE(review): this path ("../spacy_model") differs from the directory the
# training section saved to ("../data/spacy_model") — confirm which model is
# supposed to be loaded here.
nlp = spacy.load("../spacy_model")

In [None]:
# Inspect which lookup tables are currently attached to the loaded vocab.
nlp.vocab.lookups.tables

In [None]:
# Drop the lookup tables that are about to be replaced.
# Each removal is guarded because remove_table raises if the table is absent,
# which would make this cell fail when re-run or when the loaded model never
# had these tables.
for table_name in ("ag_lemma_lookup", "lemma_exc"):
    if nlp.vocab.lookups.has_table(table_name):
        nlp.vocab.lookups.remove_table(table_name)

In [14]:
# Load the lemma lookup data from its JSON file.
# The file is opened in a `with` block so the handle is always closed, and it
# is decoded explicitly as UTF-8 so reading the Greek entries does not depend
# on the platform's default encoding.
path = "../data/ag_lemma_lookup.json"
with open(path, encoding="utf-8") as lemma_file:
    lemma_lookup = json.load(lemma_file)

In [15]:
# Register the lemma data on the model's vocab under the name "lemma_lookup".
# add_table refuses to overwrite an existing table, so any previous copy is
# removed first; this keeps the cell safe to re-run.
if nlp.vocab.lookups.has_table("lemma_lookup"):
    nlp.vocab.lookups.remove_table("lemma_lookup")
table = nlp.vocab.lookups.add_table("lemma_lookup", lemma_lookup)

In [17]:
# Write the pipeline, including its current vocab lookup tables, back to disk.
nlp.to_disk("../spacy_model")

In [16]:
# Fetch the table back from the vocab and look up one verb form as a check.
# Note: this rebinds `lemma_lookup` from the raw JSON dict to the vocab's table.
lemma_lookup = nlp.vocab.lookups.get_table("lemma_lookup")
# The data is indexed by POS tag first, then by surface form.
lemma_lookup["VERB"]["μνήσομαι"]

'μιμνήσκω'