In [2]:
import spacy

In [13]:
import pyconll

In [4]:
# Load the pipeline that is installed under the name "xx".
model_name = "xx"
nlp = spacy.load(model_name)

In [5]:
# Run the loaded pipeline over one sample sentence.
sample_sentence = "This is my sentence I want to play with."
doc = nlp(sample_sentence)

In [6]:
# Print every token together with its coarse part-of-speech attribute.
for tok in doc:
    text_and_pos = (tok.text, tok.pos_)
    print(text_and_pos)

('This', '')
('is', '')
('my', '')
('sentence', '')
('I', '')
('want', '')
('to', '')
('play', '')
('with', '')
('.', '')


# Training our own model

# Simple test with a few English sentences

In [7]:
from __future__ import unicode_literals, print_function

import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding


In [8]:
# Toy training set: each example is (raw text, annotations), where the
# annotations give the gold tokenisation ("words") and one tag per word ("tags").
TRAIN_DATA = [
    (
        "I like green eggs",
        {
            "words": ["I", "like", "green", "eggs"],
            "tags": ["N", "V", "J", "N"],
        },
    ),
    (
        "Eat blue ham",
        {
            "words": ["Eat", "blue", "ham"],
            "tags": ["V", "J", "N"],
        },
    ),
]


In [9]:
# Maps each short tag used in TRAIN_DATA to the attributes registered for it
# on the tagger (here only the coarse "pos" value).
TAG_MAP = {
    "N": {"pos": "NOUN"},
    "V": {"pos": "VERB"},
    "J": {"pos": "ADJ"},
}

In [11]:
# Shuffle the toy examples in place (the list object itself is reordered).
random.shuffle(TRAIN_DATA)

In [10]:
# Start from an empty pipeline for the chosen language code and give it a
# freshly created tagger component.
lang = "xx"
nlp = spacy.blank(lang)
tagger = nlp.create_pipe("tagger")

# Every tag has to be registered on the tagger before training starts.
for label, attrs in TAG_MAP.items():
    tagger.add_label(label, attrs)
nlp.add_pipe(tagger)

optimizer = nlp.begin_training()

# Train for a fixed number of passes, reshuffling the examples each pass and
# printing the loss accumulated over that pass.
n_iter = 10
for epoch in range(n_iter):
    random.shuffle(TRAIN_DATA)
    losses = {}
    batch_sizes = compounding(4.0, 32.0, 1.001)
    for batch in minibatch(TRAIN_DATA, size=batch_sizes):
        batch_texts, batch_annotations = zip(*batch)
        nlp.update(batch_texts, batch_annotations, sgd=optimizer, losses=losses)
    print("Losses", losses)

Losses {'tagger': 6.681818962097168}
Losses {'tagger': 6.5615010261535645}
Losses {'tagger': 6.222362995147705}
Losses {'tagger': 5.625188827514648}
Losses {'tagger': 4.727659225463867}
Losses {'tagger': 3.612746238708496}
Losses {'tagger': 2.466280221939087}
Losses {'tagger': 1.4634302854537964}
Losses {'tagger': 0.7310025691986084}
Losses {'tagger': 0.3093547523021698}


In [11]:
# Tag an unseen sentence and show (text, fine-grained tag, coarse POS) per token.
test_text = "I like blue eggs"
doc = nlp(test_text)
tagged_tokens = [(tok.text, tok.tag_, tok.pos_) for tok in doc]
print("Tags", tagged_tokens)

Tags [('I', 'N', 'NOUN'), ('like', 'V', 'VERB'), ('blue', 'J', 'ADJ'), ('eggs', 'N', 'NOUN')]


# Greek test

In [44]:
# Download the two Ancient Greek UD training treebanks (Perseus and PROIEL).
#
# load_from_url is used instead of iter_from_url on purpose: iter_from_url
# hands back a one-shot generator, so a conversion cell that is re-run without
# re-running this cell would silently iterate over nothing and produce an empty
# training set. load_from_url returns a fully loaded collection of sentences
# that can be iterated any number of times.
corpus_perseus = pyconll.load.load_from_url("https://raw.githubusercontent.com/UniversalDependencies/UD_Ancient_Greek-Perseus/master/grc_perseus-ud-train.conllu")

corpus_proiel = pyconll.load.load_from_url("https://raw.githubusercontent.com/UniversalDependencies/UD_Ancient_Greek-PROIEL/master/grc_proiel-ud-train.conllu")


In [49]:
# Convert every Perseus sentence into a (text, annotations) training example:
# "words" holds the token forms and "tags" the matching UPOS value per token.
train_data_perseus = []
for sent in corpus_perseus:
    forms = [tok.form for tok in sent]
    upos_tags = [tok.upos for tok in sent]
    annotations = {"words": forms, "tags": upos_tags}
    train_data_perseus.append((sent.text, annotations))

In [51]:
# Peek at the first three converted Perseus examples.
perseus_preview = train_data_perseus[:3]
print(perseus_preview)

[('ἐρᾷ μὲν ἁγνὸς οὐρανὸς τρῶσαι χθόνα, ἔρως δὲ γαῖαν λαμβάνει γάμου τυχεῖν·', {'words': ['ἐρᾷ', 'μὲν', 'ἁγνὸς', 'οὐρανὸς', 'τρῶσαι', 'χθόνα', ',', 'ἔρως', 'δὲ', 'γαῖαν', 'λαμβάνει', 'γάμου', 'τυχεῖν', '·'], 'tags': ['VERB', 'ADV', 'ADJ', 'NOUN', 'VERB', 'NOUN', 'PUNCT', 'NOUN', 'CCONJ', 'NOUN', 'VERB', 'NOUN', 'VERB', 'PUNCT']}), ('ὄμβρος δ̓ ἀπ̓ εὐνάοντος οὐρανοῦ πεσὼν ἔκυσε γαῖαν·', {'words': ['ὄμβρος', 'δ̓', 'ἀπ̓', 'εὐνάοντος', 'οὐρανοῦ', 'πεσὼν', 'ἔκυσε', 'γαῖαν', '·'], 'tags': ['NOUN', 'ADV', 'ADP', 'ADJ', 'NOUN', 'VERB', 'VERB', 'NOUN', 'PUNCT']}), ('ἡ δὲ τίκτεται βροτοῖς μήλων τε βοσκὰς καὶ βίον Δημήτριον·', {'words': ['ἡ', 'δὲ', 'τίκτεται', 'βροτοῖς', 'μήλων', 'τε', 'βοσκὰς', 'καὶ', 'βίον', 'Δημήτριον', '·'], 'tags': ['DET', 'ADV', 'VERB', 'NOUN', 'NOUN', 'ADV', 'NOUN', 'CCONJ', 'NOUN', 'ADJ', 'PUNCT']})]


In [46]:
# Convert every PROIEL sentence into a (text, annotations) training example,
# pairing the token forms ("words") with their UPOS values ("tags").
train_data_proiel = [
    (
        sent.text,
        {
            "words": [tok.form for tok in sent],
            "tags": [tok.upos for tok in sent],
        },
    )
    for sent in corpus_proiel
]

In [48]:
# Peek at the first three converted PROIEL examples.
proiel_preview = train_data_proiel[:3]
print(proiel_preview)

[('Περσέων μέν νυν οἱ λόγιοι Φοίνικας αἰτίους φασὶ γενέσθαι τῆς διαφορῆς', {'words': ['Περσέων', 'μέν', 'νυν', 'οἱ', 'λόγιοι', 'Φοίνικας', 'αἰτίους', 'φασὶ', 'γενέσθαι', 'τῆς', 'διαφορῆς'], 'tags': ['NOUN', 'ADV', 'ADV', 'DET', 'NOUN', 'NOUN', 'ADJ', 'VERB', 'VERB', 'DET', 'NOUN']}), ('τούτους γὰρ ἀπὸ τῆς Ἐρυθρῆς καλεομένης θαλάσσης ἀπικομένους ἐπὶ τήνδε τὴν θάλασσαν καὶ οἰκήσαντας τοῦτον τὸν χῶρον τὸν καὶ νῦν οἰκέουσι αὐτίκα ναυτιλίῃσι μακρῇσι ἐπιθέσθαι ἀπαγινέοντας δὲ φορτία Αἰγύπτιά τε καὶ Ἀσσύρια τῇ τε ἄλλῃ ἐσαπικνέεσθαι καὶ δὴ καὶ ἐς Ἄργος', {'words': ['τούτους', 'γὰρ', 'ἀπὸ', 'τῆς', 'Ἐρυθρῆς', 'καλεομένης', 'θαλάσσης', 'ἀπικομένους', 'ἐπὶ', 'τήνδε', 'τὴν', 'θάλασσαν', 'καὶ', 'οἰκήσαντας', 'τοῦτον', 'τὸν', 'χῶρον', 'τὸν', 'καὶ', 'νῦν', 'οἰκέουσι', 'αὐτίκα', 'ναυτιλίῃσι', 'μακρῇσι', 'ἐπιθέσθαι', 'ἀπαγινέοντας', 'δὲ', 'φορτία', 'Αἰγύπτιά', 'τε', 'καὶ', 'Ἀσσύρια', 'τῇ', 'τε', 'ἄλλῃ', 'ἐσαπικνέεσθαι', 'καὶ', 'δὴ', 'καὶ', 'ἐς', 'Ἄργος'], 'tags': ['ADJ', 'ADV', 'ADP', 'DET', 'ADJ', 'VER

In [52]:
# Merge both treebanks into one new list (Perseus examples first, then PROIEL);
# the two source lists are left untouched.
train_data = list(train_data_perseus)
train_data.extend(train_data_proiel)

In [53]:
# Seed Python's RNG before the first shuffle so that the example order used
# for training is the same on every fresh run of the notebook. This only fixes
# the order produced by the `random` module; other sources of randomness are
# not affected.
random.seed(0)
random.shuffle(train_data)

In [54]:
# Start from an empty pipeline for the chosen language code and give it a
# freshly created tagger component.
lang = "xx"
nlp = spacy.blank(lang)
tagger = nlp.create_pipe("tagger")
# No labels are registered explicitly for this run (unlike the toy example).
# NOTE(review): training presumably relies on the tag map that the blank
# pipeline already ships with covering the UPOS values -- confirm.
nlp.add_pipe(tagger)

optimizer = nlp.begin_training()

# Train for a fixed number of passes, reshuffling the examples each pass and
# printing the loss accumulated over that pass.
n_iter = 10
for epoch in range(n_iter):
    random.shuffle(train_data)
    losses = {}
    batch_sizes = compounding(4.0, 32.0, 1.001)
    for batch in minibatch(train_data, size=batch_sizes):
        batch_texts, batch_annotations = zip(*batch)
        nlp.update(batch_texts, batch_annotations, sgd=optimizer, losses=losses)
    print("Losses", losses)

Losses {'tagger': 67534.63089752197}
Losses {'tagger': 40686.78945232928}
Losses {'tagger': 30711.836481655948}
Losses {'tagger': 24673.386923532}
Losses {'tagger': 20724.32412486011}
Losses {'tagger': 17717.56746959647}
Losses {'tagger': 15355.740474200109}
Losses {'tagger': 13496.945424583624}
Losses {'tagger': 12036.107190164388}
Losses {'tagger': 10968.727297964346}


In [55]:
# Tag an unseen Greek passage and show (text, fine-grained tag, coarse POS)
# for every token.
test_text = "ἀρχόμενος σέο, Φοῖβε, παλαιγενέων κλέα φωτῶν μνήσομαι, οἳ Πόντοιο κατὰ στόμα καὶ διὰ πέτρας Κυανέας βασιλῆος ἐφημοσύνῃ Πελίαο χρύσειον μετὰ κῶας ἐύζυγον ἤλασαν Ἀργώ."
doc = nlp(test_text)
tagged_tokens = [(tok.text, tok.tag_, tok.pos_) for tok in doc]
print("Tags", tagged_tokens)

Tags [('ἀρχόμενος', 'VERB', 'VERB'), ('σέο', 'PRON', 'PRON'), (',', 'PUNCT', 'PUNCT'), ('Φοῖβε', 'NOUN', 'NOUN'), (',', 'PUNCT', 'PUNCT'), ('παλαιγενέων', 'ADJ', 'ADJ'), ('κλέα', 'NOUN', 'NOUN'), ('φωτῶν', 'NOUN', 'NOUN'), ('μνήσομαι', 'VERB', 'VERB'), (',', 'PUNCT', 'PUNCT'), ('οἳ', 'PRON', 'PRON'), ('Πόντοιο', 'NOUN', 'NOUN'), ('κατὰ', 'ADP', 'ADP'), ('στόμα', 'NOUN', 'NOUN'), ('καὶ', 'CCONJ', 'CCONJ'), ('διὰ', 'ADP', 'ADP'), ('πέτρας', 'NOUN', 'NOUN'), ('Κυανέας', 'NOUN', 'NOUN'), ('βασιλῆος', 'NOUN', 'NOUN'), ('ἐφημοσύνῃ', 'VERB', 'VERB'), ('Πελίαο', 'NOUN', 'NOUN'), ('χρύσειον', 'ADJ', 'ADJ'), ('μετὰ', 'ADP', 'ADP'), ('κῶας', 'NOUN', 'NOUN'), ('ἐύζυγον', 'VERB', 'VERB'), ('ἤλασαν', 'VERB', 'VERB'), ('Ἀργώ', 'ADV', 'ADV'), ('.', 'PUNCT', 'PUNCT')]


In [42]:
# Same passage tagged again; collect one (text, tag, pos) triple per token
# and print the resulting list.
test_text = "ἀρχόμενος σέο, Φοῖβε, παλαιγενέων κλέα φωτῶν μνήσομαι, οἳ Πόντοιο κατὰ στόμα καὶ διὰ πέτρας Κυανέας βασιλῆος ἐφημοσύνῃ Πελίαο χρύσειον μετὰ κῶας ἐύζυγον ἤλασαν Ἀργώ."
doc = nlp(test_text)
triples = []
for tok in doc:
    triples.append((tok.text, tok.tag_, tok.pos_))
print("Tags", triples)

Tags [('ἀρχόμενος', 'VERB', 'VERB'), ('σέο', 'PRON', 'PRON'), (',', 'PUNCT', 'PUNCT'), ('Φοῖβε', 'NOUN', 'NOUN'), (',', 'PUNCT', 'PUNCT'), ('παλαιγενέων', 'VERB', 'VERB'), ('κλέα', 'NOUN', 'NOUN'), ('φωτῶν', 'NOUN', 'NOUN'), ('μνήσομαι', 'VERB', 'VERB'), (',', 'PUNCT', 'PUNCT'), ('οἳ', 'PRON', 'PRON'), ('Πόντοιο', 'NOUN', 'NOUN'), ('κατὰ', 'ADP', 'ADP'), ('στόμα', 'NOUN', 'NOUN'), ('καὶ', 'CCONJ', 'CCONJ'), ('διὰ', 'ADP', 'ADP'), ('πέτρας', 'NOUN', 'NOUN'), ('Κυανέας', 'NOUN', 'NOUN'), ('βασιλῆος', 'NOUN', 'NOUN'), ('ἐφημοσύνῃ', 'ADV', 'ADV'), ('Πελίαο', 'NOUN', 'NOUN'), ('χρύσειον', 'ADJ', 'ADJ'), ('μετὰ', 'ADP', 'ADP'), ('κῶας', 'NOUN', 'NOUN'), ('ἐύζυγον', 'VERB', 'VERB'), ('ἤλασαν', 'VERB', 'VERB'), ('Ἀργώ', 'NOUN', 'NOUN'), ('.', 'PUNCT', 'PUNCT')]


In [57]:
# Write the trained pipeline to a directory relative to the working directory.
model_dir = "nlp_model_angr"
nlp.to_disk(model_dir)

In [None]:
# Restore the pipeline from the directory it was saved to.
saved_model_dir = "nlp_model_angr"
nlp = spacy.load(saved_model_dir)