In [1]:
import spacy
import pyconll
import json
import re
import unicodedata

import plac
import random
from pathlib import Path
from spacy.util import minibatch, compounding

import sddk

In [2]:
conf = sddk.configure("SDAM_root", "648597@au.dk")

sciencedata.dk username (format '123456@au.dk'): 648597@au.dk
sciencedata.dk password: ········
connection with shared folder established with you as its owner
endpoint variable has been configured to: https://sciencedata.dk/files/SDAM_root/


# Initialize the model

In [3]:
!python3 -m spacy init-model el ../models/spacy_grc_model_7_vectors --vectors-loc ../data/word2vec_win2.txt

[2K[38;5;2m✔ Successfully created model[0m
132846it [00:06, 20486.41it/s]/word2vec_win2.txt
[2K[38;5;2m✔ Loaded vectors from ../data/word2vec_win2.txt[0m
[38;5;2m✔ Sucessfully compiled vocab[0m
133518 entries, 132846 vectors


In [32]:
# Load the model created by the init-model command above into our environment,
# append a newly created "tagger" component to its pipeline, and keep the
# optimizer returned by begin_training() for the nlp.update() calls below.
nlp = spacy.load("../models/spacy_grc_model_7_vectors")
tagger = nlp.create_pipe("tagger")
nlp.add_pipe(tagger)
optimizer = nlp.begin_training()

# Add lookup tables

In [33]:
# Read both lookup tables explicitly as UTF-8 (the keys are polytonic Greek)
# and close each file as soon as it has been parsed.
with open("../data/ag_lemma_lookup.json", encoding="utf-8") as lookup_file:
    ag_lemma_lookup = json.load(lookup_file)
with open("../data/ag_lemma_lookup_merged.json", encoding="utf-8") as merged_file:
    ag_lemma_lookup_merged = json.load(merged_file)

In [34]:
print(list(ag_lemma_lookup.keys()))

['ADJ', 'PRON', 'ADP', 'PUNCT', 'DET', 'CCONJ', 'INTJ', 'PART', 'PROPN', 'AUX', 'SCONJ', 'ADV', 'NOUN', 'X', 'NUM', 'VERB']


In [35]:
ag_lemma_lookup["NOUN"]["ἐνεργειῶν"]

'ἐνέργεια'

OK, this works, but in a substantial number of cases the token carries the wrong tag (e.g. "VERB" for this noun); a lookup restricted to that tag then fails and the lemmatization is unsuccessful.

In [36]:
ag_lemma_lookup_merged["ἐνεργειῶν"]

'ἐνέργεια'

In [37]:
# Drop lookup tables left over from an earlier run so this cell can be
# re-executed. Each table is checked on its own: with one try/except around
# both removals, a missing "lemma_lookup" would skip the removal of
# "lemma_lookup_merged", and re-adding that table afterwards would fail.
for table_name in ("lemma_lookup", "lemma_lookup_merged"):
    if nlp.vocab.lookups.has_table(table_name):
        nlp.vocab.lookups.remove_table(table_name)

In [38]:
# Register both lookup dicts as named tables in the model's vocab lookups.
# NOTE: `table` is overwritten by the second call and is not read again in
# the cells shown here.
table = nlp.vocab.lookups.add_table("lemma_lookup", ag_lemma_lookup)
table = nlp.vocab.lookups.add_table("lemma_lookup_merged", ag_lemma_lookup_merged)
# nlp.vocab.lookups.remove_table("lemma_exc")

In [39]:
# load it back from the model (to check functionality for future usage)
ag_lemma_lookup = nlp.vocab.lookups.get_table("lemma_lookup")
ag_lemma_lookup_merged = nlp.vocab.lookups.get_table("lemma_lookup_merged")


ag_lemma_lookup["VERB"]["μνήσομαι"]

'μιμνήσκω'

# Lemmatization functions

In [40]:
# !pip install greek-accentuation
from greek_accentuation.characters import strip_accents
from greek_accentuation.syllabify import *
from greek_accentuation.accentuation import *



def grave_to_acute(string):
    """Return `string` with every grave accent replaced by an acute accent.

    The text is decomposed (NFD) so that accents become separate combining
    marks, the combining grave is swapped for the combining acute, and the
    result is recomposed (NFC).
    """
    combining_grave, combining_acute = "\u0300", "\u0301"
    decomposed = unicodedata.normalize("NFD", string)
    swapped = decomposed.replace(combining_grave, combining_acute)
    return unicodedata.normalize("NFC", swapped)


def list_of_possible_accentuations(morph):
    """Generate re-accented spellings of a word form.

    The form is stripped of accents, lower-cased, passed through `rebreath`
    and syllabified; for every (position, accent) pair yielded by
    `possible_accentuations` the accent is put on the chosen syllable.
    Each spelling is returned twice: as built and capitalised.

    Parameters
    ----------
    morph : str
        The word form. Any non-string value yields an empty list.

    Returns
    -------
    list of str
        Candidate spellings, or [] when the form cannot be processed.
    """
    if not isinstance(morph, str):
        return []
    try:
        bare = rebreath(strip_accents(morph).lower())
        syllables = syllabify(bare)
        variants = []
        for pos, accent in possible_accentuations(syllables, default_short=True):
            # `pos` indexes the syllable from the end of the word (syllables[-pos]);
            # `tail` is everything after the accented syllable.
            tail = syllables[1 - pos:] if pos > 1 else [""]
            accented = "".join(
                syllables[:-pos] + [syllable_add_accent(syllables[-pos], accent)] + tail
            )
            variants.append(accented)
            variants.append(accented.capitalize())
        return variants
    except Exception:
        # Best effort: a form the accentuation helpers cannot handle simply has
        # no variants. KeyboardInterrupt/SystemExit are deliberately not caught,
        # so a long batch run can still be interrupted.
        return []


def grc_doc_lemmatizer(doc):
    """Pipeline component: overwrite each token's lemma and POS tag with the
    pair returned by `lemmatizer`, then return the same doc."""
    for tok in doc:
        new_lemma, new_pos = lemmatizer(tok.text, tok.pos_, tok.lemma_)
        tok.lemma_ = new_lemma
        tok.pos_ = new_pos
    return doc

def apply_nlp(sentences_list):
    """Run the module-level `nlp` pipeline on every sentence and return the
    resulting docs in input order."""
    return [nlp(sentence) for sentence in sentences_list]

def check_char_validity(token_text, tag):
    """Clean non-word characters out of a token and fix its tag if needed.

    * A token made up only of non-word characters is tagged "PUNCT".
    * Otherwise non-word characters are stripped from the token, unless it
      contains the apostrophe "’", in which case it is left untouched.
    * A token without non-word characters is returned unchanged.

    Parameters
    ----------
    token_text : str
        Surface form of the token.
    tag : str
        POS tag currently assigned to the token.

    Returns
    -------
    tuple of (str, str)
        The (possibly cleaned) token text and the (possibly corrected) tag.
    """
    # Raw strings: "\W" in a normal literal is an invalid escape sequence.
    if re.search(r"\W", token_text):
        if re.match(r"\W+$", token_text):
            tag = "PUNCT"
        elif "’" not in token_text:
            token_text = re.sub(r"\W", "", token_text)
    return token_text, tag

# Alternative tags are tried in exactly this order; the first tag whose table
# contains the form wins.
keys = ['PRON', 'ADP', 'PUNCT', 'DET', 'CCONJ', 'INTJ', 'PART', 'ADJ', 'AUX', 'SCONJ', 'ADV', 'NOUN','PROPN', 'VERB','X', 'NUM']


def check_other_tags(token_text, tag):
    """Search the per-tag lookup tables for `token_text` under any tag.

    Parameters
    ----------
    token_text : str
        Word form to look up.
    tag : str
        Tag to report when no table contains the form.

    Returns
    -------
    tuple of (lemma, tag, match)
        On a hit: the stored lemma, the tag whose table held it, and True.
        On a miss: `token_text` itself, the incoming `tag`, and False.
    """
    for new_tag in keys:
        try:
            return ag_lemma_lookup[new_tag][token_text], new_tag, True
        except (KeyError, TypeError):
            # Only a failed lookup counts as a miss; other errors (for example
            # the lookup table not being loaded yet) are allowed to surface.
            continue
    return token_text, tag, False


def lemmatizer_v1(token_text, tag, old_lemma=None):
    """First-generation lemmatizer: look the form up without changing the tag.

    An `old_lemma` that differs from `token_text` is returned untouched.
    Otherwise the lemma is the first hit in this sequence:

    1. the tag-specific table, exact form;
    2. the merged table, exact form;
    3. the tag-specific table, grave accents turned into acutes;
    4. the merged table, grave accents turned into acutes;
    5. for each re-accented variant: tag-specific table, then merged table.

    With no hit at all, the lower-cased form is used as the lemma.
    """
    if old_lemma is not None and token_text != old_lemma:
        return old_lemma, tag

    fallback = token_text.lower()

    # Steps 1-4, each evaluated lazily so a failing step just moves on.
    attempts = (
        lambda: ag_lemma_lookup[tag][token_text],
        lambda: ag_lemma_lookup_merged[token_text],
        lambda: ag_lemma_lookup[tag][grave_to_acute(token_text)],
        lambda: ag_lemma_lookup_merged[grave_to_acute(token_text)],
    )
    for attempt in attempts:
        try:
            return attempt(), tag
        except:
            continue

    # Step 5: accentuation variants, tag-specific table before merged table.
    for variant in list_of_possible_accentuations(token_text):
        for table_lookup in (
            lambda: ag_lemma_lookup[tag][variant],
            lambda: ag_lemma_lookup_merged[variant],
        ):
            try:
                return table_lookup(), tag
            except:
                pass

    return fallback, tag
    
def lemmatizer(token_text, tag, old_lemma=None):
    """Look up the lemma of a word form, correcting its POS tag when needed.

    An `old_lemma` that differs from `token_text` is kept as it is.
    Otherwise the token is first cleaned by `check_char_validity` and the
    lemma is the first hit in this sequence:

    1. the table of the current tag, exact form;
    2. the table of the current tag, grave accents turned into acutes;
    3. the tables of all other tags (`check_other_tags`), exact form;
       a hit here also replaces the tag;
    4. for each re-accented variant of the form: the table of the current
       tag, then the tables of all other tags.

    With no hit at all, the lower-cased (cleaned) form is the lemma.

    Parameters
    ----------
    token_text : str
        Surface form of the token.
    tag : str
        POS tag currently assigned to the token.
    old_lemma : str or None
        Lemma already present on the token, if any.

    Returns
    -------
    tuple of (str, str)
        The lemma and the (possibly corrected) tag.
    """
    # A lemma that differs from the surface form was already assigned: keep it.
    if old_lemma is not None and token_text != old_lemma:
        return old_lemma, tag

    token_text, tag = check_char_validity(token_text, tag)
    fallback = token_text.lower()
    missing = object()  # sentinel, so any stored value counts as a hit

    def lookup(form):
        """Return the lemma stored for `form` under the current tag, or `missing`."""
        try:
            return ag_lemma_lookup[tag][form]
        except (KeyError, TypeError):
            return missing

    # 1. the form exactly as written
    lemma = lookup(token_text)
    if lemma is not missing:
        return lemma, tag

    # 2. the same form with grave accents normalised to acutes
    lemma = lookup(grave_to_acute(token_text))
    if lemma is not missing:
        return lemma, tag

    # 3. the exact form under any other tag (one scan, result reused)
    lemma, other_tag, matched = check_other_tags(token_text, tag)
    if matched:
        return lemma, other_tag

    # 4. re-accented variants: current tag first, then every other tag
    for variant in list_of_possible_accentuations(token_text):
        lemma = lookup(variant)
        if lemma is not missing:
            return lemma, tag
        lemma, other_tag, matched = check_other_tags(variant, tag)
        if matched:
            return lemma, other_tag

    return fallback, tag

In [41]:
lemmatizer('ἀπʼ', 'PUNCT')

('ἀπό', 'ADP')

In [42]:
lemmatizer("δι’", "ADP")

('δι’', 'ADP')

In [43]:
lemmatizer("ΠΑΥΛΟΣ", "VERB")

('Παῦλος', 'NOUN')

In [44]:
lemmatizer("ΒΙΒΛΟΣ", "ADJ")

('βίβλος', 'NOUN')

In [45]:
lemmatizer("Ἰησοῦ", "NOUN")

('Ἰησοῦς', 'NOUN')

In [46]:
# nonexistent word
lemmatizer("ΒΙΒΒΙΒΛΟΣ", "NOUN")

('βιββιβλος', 'NOUN')

In [47]:
# it works even with wrong POS-tag
lemmatizer("ἐνεργειῶν", "VERB")

('ἐνέργεια', 'NOUN')

In [48]:
lemmatizer("θεοῦ.", "PUNCT")

('θεός', 'NOUN')

# Training our own model with gold-parse

In [49]:
urls = [
    "https://raw.githubusercontent.com/UniversalDependencies/UD_Ancient_Greek-Perseus/master/grc_perseus-ud-train.conllu",
    "https://raw.githubusercontent.com/UniversalDependencies/UD_Ancient_Greek-Perseus/master/grc_perseus-ud-dev.conllu",
    "https://raw.githubusercontent.com/UniversalDependencies/UD_Ancient_Greek-PROIEL/master/grc_proiel-ud-train.conllu",
    "https://raw.githubusercontent.com/UniversalDependencies/UD_Ancient_Greek-PROIEL/master/grc_proiel-ud-dev.conllu"]

In [50]:
def normalize_encoding(string):
    """Return `string` in Unicode NFC (canonically composed) form."""
    composed = unicodedata.normalize("NFC", string)
    return composed
v_ud = "’"
v_agt = "ʼ"

v_ud == v_agt # because of that, we will everywhere use the agt version, since it does not confuse the word tokenizer

False

In [51]:
def tagged_data_from_url(url):
    """Download a CoNLL-U treebank and turn it into (text, annotation) pairs.

    Every sentence becomes a tuple of its text and a dict with the keys
    "words" and "tags"; for URLs containing "-dev." the dict also carries
    "lemmata". Word forms and sentence texts have the `v_ud` apostrophe
    replaced by `v_agt` and are NFC-normalised; lemmata are NFC-normalised.
    A one-line summary (file name, sentence count) is printed.
    """
    def _clean(text):
        # swap the apostrophe variant, then compose to NFC
        return normalize_encoding(re.sub(v_ud, v_agt, text))

    with_lemmata = "-dev." in url
    examples = []
    for sentence in pyconll.load.iter_from_url(url):
        rows = [
            (_clean(tok.form), tok.upos, normalize_encoding(tok.lemma))
            for tok in sentence
        ]
        annotation = {
            "words": [row[0] for row in rows],
            "tags": [row[1] for row in rows],
        }
        if with_lemmata:
            annotation["lemmata"] = [row[2] for row in rows]
        examples.append((_clean(sentence.text), annotation))

    file_name = url.rpartition("/")[2]
    print("File name: {0}; number of sentences: {1}".format(file_name, len(examples)))
    return examples

In [52]:
perseus_train = tagged_data_from_url(urls[0])
perseus_dev = tagged_data_from_url(urls[1])
proiel_train = tagged_data_from_url(urls[2])
proiel_dev = tagged_data_from_url(urls[3])

File name: grc_perseus-ud-train.conllu; number of sentences: 11476
File name: grc_perseus-ud-dev.conllu; number of sentences: 1137
File name: grc_proiel-ud-train.conllu; number of sentences: 15014
File name: grc_proiel-ud-dev.conllu; number of sentences: 1019


In [53]:
print(perseus_train[0])

('ἐρᾷ μὲν ἁγνὸς οὐρανὸς τρῶσαι χθόνα, ἔρως δὲ γαῖαν λαμβάνει γάμου τυχεῖν·', {'words': ['ἐρᾷ', 'μὲν', 'ἁγνὸς', 'οὐρανὸς', 'τρῶσαι', 'χθόνα', ',', 'ἔρως', 'δὲ', 'γαῖαν', 'λαμβάνει', 'γάμου', 'τυχεῖν', '·'], 'tags': ['VERB', 'ADV', 'ADJ', 'NOUN', 'VERB', 'NOUN', 'PUNCT', 'NOUN', 'CCONJ', 'NOUN', 'VERB', 'NOUN', 'VERB', 'PUNCT']})


In [54]:
train_data = perseus_train + proiel_train

In [55]:
len(train_data)

26490

In [56]:
proiel_dev_len = sum([len(sent[1]["tags"]) for sent in proiel_dev])
print("tokens in proiel_dev: " + str(proiel_dev_len))

perseus_dev_len = sum([len(sent[1]["tags"]) for sent in perseus_dev])
print("tokens in perseus_dev: " + str(perseus_dev_len))
total_len = proiel_dev_len + perseus_dev_len

tokens in proiel_dev: 13652
tokens in perseus_dev: 22135


In [57]:
model_tests = []

In [58]:
# add lemmatizer to the pipeline
#nlp.remove_pipe("grc_doc_lemmatizer")
nlp.add_pipe(grc_doc_lemmatizer, "grc_doc_lemmatizer", after="tagger") # grc_doc_lemmatizer

In [60]:
n_iter = 25


def _score_dev_set(dev_set, n_gold_tokens):
    """Score the current `nlp` pipeline on one development set.

    Each sentence text is run through `nlp` and the predicted tags and
    lemmata are paired with the gold ones position by position (`zip`), so
    the comparison stops at the shorter of the two sequences.

    Parameters
    ----------
    dev_set : list of (text, {"tags": [...], "lemmata": [...]})
        Gold-annotated sentences.
    n_gold_tokens : int
        Number of gold tokens, used as the denominator of both scores.

    Returns
    -------
    tuple of (float, float)
        Percentage of correct tags and percentage of correct lemmata.
    """
    tags_correct = 0
    lemmata_correct = 0
    for text, gold in dev_set:
        doc = nlp(text)
        tags_correct += sum(tok.pos_ == g for tok, g in zip(doc, gold["tags"]))
        lemmata_correct += sum(tok.lemma_ == g for tok, g in zip(doc, gold["lemmata"]))
    return (tags_correct / n_gold_tokens) * 100, (lemmata_correct / n_gold_tokens) * 100


model_tests = []
for i in range(n_iter):

    ### TRAIN THE MODEL
    random.shuffle(train_data)
    losses = {}
    # batch up the examples using spaCy's minibatch
    batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
        texts, annotations = zip(*batch)
        nlp.update(texts, annotations, drop=0.3, sgd=optimizer, losses=losses)

    ### TEST THE MODEL
    # Both dev sets are scored with the same helper after every epoch.
    proiel_tags_pct, proiel_lemmata_pct = _score_dev_set(proiel_dev, proiel_dev_len)
    perseus_tags_pct, perseus_lemmata_pct = _score_dev_set(perseus_dev, perseus_dev_len)

    model_test = {
        "Losses": losses,
        "proiel_tags_correct%": proiel_tags_pct,
        "proiel_lemmata_correct%": proiel_lemmata_pct,
        "perseus_tags_correct%": perseus_tags_pct,
        "perseus_lemmata_correct%": perseus_lemmata_pct,
    }
    print(model_test)
    model_tests.append(model_test)

{'Losses': {'tagger': 56364.95995467901}, 'proiel_tags_correct%': 95.01171989452095, 'proiel_lemmata_correct%': 95.29739232346908, 'perseus_tags_correct%': 87.53557714027558, 'perseus_lemmata_correct%': 87.06573300203297}
{'Losses': {'tagger': 46929.60791128874}, 'proiel_tags_correct%': 95.89071198359214, 'proiel_lemmata_correct%': 95.35599179607384, 'perseus_tags_correct%': 87.88344251185904, 'perseus_lemmata_correct%': 87.04766207363903}
{'Losses': {'tagger': 41977.65289503336}, 'proiel_tags_correct%': 95.407266334603, 'proiel_lemmata_correct%': 95.45121593905654, 'perseus_tags_correct%': 88.44364129207139, 'perseus_lemmata_correct%': 87.21933589338153}
{'Losses': {'tagger': 38295.55491784215}, 'proiel_tags_correct%': 96.08848520363317, 'proiel_lemmata_correct%': 95.43656607090536, 'perseus_tags_correct%': 88.00090354641969, 'perseus_lemmata_correct%': 87.03410887734357}
{'Losses': {'tagger': 35758.74683788419}, 'proiel_tags_correct%': 95.64166422502197, 'proiel_lemmata_correct%': 95

In [61]:
# The weaker performance on Perseus is largely caused by imperfect tokenization:
# for some sentences `nlp` produces a different number of tokens than the gold
# annotation, so predictions and gold labels are compared out of alignment.

all_tags_len = 0  # total number of tokens produced by `nlp` over perseus_dev
inconsistencies = []  # (gold words, predicted (text, lemma) pairs) for mismatched sentences
perseus_tags_correct = 0
perseus_lemmata_correct = 0
for sent in perseus_dev:
        doc = nlp(sent[0])
        predicted_tags = [token.pos_ for token in doc]
        all_tags_len = all_tags_len + len(predicted_tags)
        predicted_lemmata = [token.lemma_ for token in doc]
        #predicted_lemmata = [lemmatizer(token.text, token.pos_) for token in doc]
        given_tags = sent[1]["tags"]
        given_lemmata = sent[1]["lemmata"]
        # A length mismatch means the tokenizer split the sentence differently
        # from the treebank; keep both versions for inspection.
        if len(predicted_lemmata) != len(given_lemmata):
            inconsistencies.append((sent[1]["words"], [(token.text, token.lemma_) for token in doc]))
        perseus_tags_correct += len([p for p, g in zip(predicted_tags, given_tags) if p ==g])
        perseus_lemmata_correct += len([p for p, g in zip(predicted_lemmata, given_lemmata) if p ==g])
# NOTE(review): `model_test` is not created in this cell; these two lines
# overwrite the Perseus entries of whatever dict an earlier cell left behind.
model_test["perseus_tags_correct%"] = (perseus_tags_correct / perseus_dev_len) * 100
model_test["perseus_lemmata_correct%"] = (perseus_lemmata_correct / perseus_dev_len) * 100
print(inconsistencies[:3])

[(['πολλῷ', 'γὰρ', 'ὕστερον', 'ἔτι', 'καὶ', 'τῶν', 'Τρωικῶν', 'γενόμενος', 'οὐδαμοῦ', 'τοὺς', 'ξύμπαντας', 'ὠνόμασεν', ',', 'οὐδ̓', 'ἄλλους', 'ἢ', 'τοὺς', 'μετ̓', 'Ἀχιλλέως', 'ἐκ', 'τῆς', 'Φθιώτιδος', ',', 'οἵπερ', 'καὶ', 'πρῶτοι', 'Ἕλληνες', 'ἦσαν', ',', 'Δαναοὺς', 'δὲ', 'ἐν', 'τοῖς', 'ἔπεσι', 'καὶ', 'Ἀργείους', 'καὶ', 'Ἀχαιοὺς', 'ἀνακαλεῖ', '.'], [('πολλῷ', 'πολύς'), ('γὰρ', 'γάρ'), ('ὕστερον', 'ὕστερος'), ('ἔτι', 'ἔτι'), ('καὶ', 'καί'), ('τῶν', 'ὁ'), ('Τρωικῶν', 'Τρωικός'), ('γενόμενος', 'γίγνομαι'), ('οὐδαμοῦ', 'οὐδαμοῦ'), ('τοὺς', 'ὁ'), ('ξύμπαντας', 'σύμπας'), ('ὠνόμασεν', 'ὀνομάζω'), (',', ','), ('οὐδ̓', 'οὐδ'), ('ἄλλους', 'ἄλλος'), ('ἢ', 'ἤ'), ('τοὺς', 'ὁ'), ('μετ̓', 'μετ'), ('Ἀχιλλέως', 'Ἀχιλλεύς'), ('ἐκ', 'ἐκ'), ('τῆς', 'ὁ'), ('Φθιώτιδος', 'Φθιῶτις'), (',', ','), ('οἵπερ', 'ὅσπερ'), ('καὶ', 'καί'), ('πρῶτοι', 'πρῶτος'), ('Ἕλληνες', 'Ἕλλην'), ('ἦσαν', 'εἰμί'), (',', ','), ('Δαναοὺς', 'Δαναοί'), ('δὲ', 'δέ'), ('ἐν', 'ἐν'), ('τοῖς', 'ὁ'), ('ἔπεσι', 'ἔπος'), ('καὶ', 'καί'), ('Ἀργ

In [62]:
all_tags_len

22107

In [63]:
# COMPARE WITH OUR FIRST MODEL
# NOTE(review): `nlp_1` is loaded here but never used below; both loops call
# `nlp`, so the scores computed in this cell belong to the current model.
# Confirm whether `nlp_1` was meant to be evaluated instead.
nlp_1 = spacy.load("../models/spacy_grc_model_1")

model_test = {}
proiel_tags_correct = 0
proiel_lemmata_correct = 0
for sent in proiel_dev:
    doc = nlp(sent[0])
    predicted_tags = [token.pos_ for token in doc]
    # NOTE(review): `all_tags_len` is not reset in this cell, so it keeps
    # growing from the value an earlier cell left in it.
    all_tags_len = all_tags_len + len(predicted_tags)
    predicted_lemmata = [token.lemma_ for token in doc]
    given_tags = sent[1]["tags"]
    given_lemmata = sent[1]["lemmata"]
    # Position-by-position comparison; zip stops at the shorter sequence.
    proiel_tags_correct += len([p for p, g in zip(predicted_tags, given_tags) if p ==g])
    proiel_lemmata_correct += len([p for p, g in zip(predicted_lemmata, given_lemmata) if p ==g])
model_test["proiel_tags_correct%"] = (proiel_tags_correct / proiel_dev_len) * 100
model_test["proiel_lemmata_correct%"] = (proiel_lemmata_correct / proiel_dev_len) * 100

perseus_tags_correct = 0
perseus_lemmata_correct = 0
for sent in perseus_dev:
    doc = nlp(sent[0])
    predicted_tags = [token.pos_ for token in doc]
    all_tags_len = all_tags_len + len(predicted_tags)
    predicted_lemmata = [token.lemma_ for token in doc]
    given_tags = sent[1]["tags"]
    given_lemmata = sent[1]["lemmata"]
    perseus_tags_correct += len([p for p, g in zip(predicted_tags, given_tags) if p ==g])
    perseus_lemmata_correct += len([p for p, g in zip(predicted_lemmata, given_lemmata) if p ==g])
model_test["perseus_tags_correct%"] = (perseus_tags_correct / perseus_dev_len) * 100
model_test["perseus_lemmata_correct%"] = (perseus_lemmata_correct / perseus_dev_len) * 100

# Displayed as the cell output.
model_test

{'proiel_tags_correct%': 96.60123058892471,
 'proiel_lemmata_correct%': 95.51714034573689,
 'perseus_tags_correct%': 88.9405918229049,
 'perseus_lemmata_correct%': 87.15608764400271}

For comparison, our results with the first model were:
```
{'proiel_tags_correct%': 87.90653384119544,
 'proiel_lemmata_correct%': 85.62115440961031,
 'perseus_tags_correct%': 85.05534221820646,
 'perseus_lemmata_correct%': 82.66997967020556}
 ```

In [68]:
nlp.remove_pipe('grc_doc_lemmatizer')
nlp.to_disk("../models/spacy_grc_model_7")

# Testing with real data

In [64]:
AGT = sddk.read_file("SDAM_data/AGT/AGT_preprocessed_20201209.json", "df", conf)

In [69]:
nlp = spacy.load("../models/spacy_grc_model_7")

In [70]:
nlp.add_pipe(grc_doc_lemmatizer, "grc_doc_lemmatizer", after="tagger") # grc_doc_lemmatizer

In [71]:
doc = nlp(AGT[AGT["author_id"]=="tlg0031paul"].iloc[3]["sentences"][0])
print([(token.text, token.pos_, token.lemma_) for token in doc])

[('ΠΑΥΛΟΣ', 'NOUN', 'Παῦλος'), ('ἀπόστολος', 'NOUN', 'ἀπόστολος'), (',', 'PUNCT', ','), ('οὐκ', 'ADV', 'οὐ'), ('ἀπʼ', 'ADP', 'ἀπό'), ('ἀνθρώπων', 'NOUN', 'ἄνθρωπος'), ('οὐδὲ', 'CCONJ', 'οὐδέ'), ('διʼ', 'ADP', 'διά'), ('ἀνθρώπου', 'NOUN', 'ἄνθρωπος'), ('ἀλλὰ', 'CCONJ', 'ἀλλά'), ('διὰ', 'ADP', 'διά'), ('Ἰησοῦ', 'PROPN', 'Ἰησοῦς'), ('Χριστοῦ', 'PROPN', 'Χριστός'), ('καὶ', 'CCONJ', 'καί'), ('θεοῦ', 'NOUN', 'θεός'), ('πατρὸς', 'NOUN', 'πατήρ'), ('τοῦ', 'DET', 'ὁ'), ('ἐγείραντος', 'VERB', 'ἐγείρω'), ('αὐτὸν', 'PRON', 'αὐτός'), ('ἐκ', 'ADP', 'ἐκ'), ('νεκρῶν', 'NOUN', 'νεκρός'), (',', 'PUNCT', ','), ('καὶ', 'CCONJ', 'καί'), ('οἱ', 'DET', 'ὁ'), ('σὺν', 'ADP', 'σύν'), ('ἐμοὶ', 'PRON', 'ἐγώ'), ('πάντες', 'ADJ', 'πᾶς'), ('ἀδελφοί', 'NOUN', 'ἀδελφός'), (',', 'PUNCT', ','), ('ταῖς', 'DET', 'ὁ'), ('ἐκκλησίαις', 'NOUN', 'ἐκκλησία'), ('τῆς', 'DET', 'ὁ'), ('Γαλατίας', 'NOUN', 'Γαλατία'), ('·', 'PUNCT', '·')]


In [72]:
doc = nlp(AGT[AGT["author_id"]=="tlg0031paul"].iloc[3]["sentences"][0])
print([(token.text, token.pos_, token.lemma_) for token in doc])

[('ΠΑΥΛΟΣ', 'NOUN', 'Παῦλος'), ('ἀπόστολος', 'NOUN', 'ἀπόστολος'), (',', 'PUNCT', ','), ('οὐκ', 'ADV', 'οὐ'), ('ἀπʼ', 'ADP', 'ἀπό'), ('ἀνθρώπων', 'NOUN', 'ἄνθρωπος'), ('οὐδὲ', 'CCONJ', 'οὐδέ'), ('διʼ', 'ADP', 'διά'), ('ἀνθρώπου', 'NOUN', 'ἄνθρωπος'), ('ἀλλὰ', 'CCONJ', 'ἀλλά'), ('διὰ', 'ADP', 'διά'), ('Ἰησοῦ', 'PROPN', 'Ἰησοῦς'), ('Χριστοῦ', 'PROPN', 'Χριστός'), ('καὶ', 'CCONJ', 'καί'), ('θεοῦ', 'NOUN', 'θεός'), ('πατρὸς', 'NOUN', 'πατήρ'), ('τοῦ', 'DET', 'ὁ'), ('ἐγείραντος', 'VERB', 'ἐγείρω'), ('αὐτὸν', 'PRON', 'αὐτός'), ('ἐκ', 'ADP', 'ἐκ'), ('νεκρῶν', 'NOUN', 'νεκρός'), (',', 'PUNCT', ','), ('καὶ', 'CCONJ', 'καί'), ('οἱ', 'DET', 'ὁ'), ('σὺν', 'ADP', 'σύν'), ('ἐμοὶ', 'PRON', 'ἐγώ'), ('πάντες', 'ADJ', 'πᾶς'), ('ἀδελφοί', 'NOUN', 'ἀδελφός'), (',', 'PUNCT', ','), ('ταῖς', 'DET', 'ὁ'), ('ἐκκλησίαις', 'NOUN', 'ἐκκλησία'), ('τῆς', 'DET', 'ὁ'), ('Γαλατίας', 'NOUN', 'Γαλατία'), ('·', 'PUNCT', '·')]


In [73]:
arist_test = apply_nlp(AGT[AGT["doc_id"]=="tlg0086.tlg010"]["sentences"].tolist()[0])
for doc in arist_test[:10]:
    print([(token.text, token.pos_, token.lemma_) for token in doc])

[('πᾶσα', 'DET', 'πᾶς'), ('τέχνη', 'NOUN', 'τέχνη'), ('καὶ', 'CCONJ', 'καί'), ('πᾶσα', 'ADJ', 'πᾶς'), ('μέθοδος', 'NOUN', 'μέθοδος'), (',', 'PUNCT', ','), ('ὁμοίως', 'ADV', 'ὁμοίως'), ('δὲ', 'CCONJ', 'δέ'), ('πρᾶξίς', 'NOUN', 'πρᾶξις'), ('τε', 'PART', 'τε'), ('καὶ', 'CCONJ', 'καί'), ('προαίρεσις', 'NOUN', 'προαίρεσις'), (',', 'PUNCT', ','), ('ἀγαθοῦ', 'ADJ', 'ἀγαθός'), ('τινὸς', 'PRON', 'τίς'), ('ἐφίεσθαι', 'VERB', 'ἐφίημι'), ('δοκεῖ', 'VERB', 'δοκέω'), ('·', 'PUNCT', '·')]
[('διὸ', 'ADV', 'διό'), ('καλῶς', 'ADV', 'καλῶς'), ('ἀπεφήναντο', 'VERB', 'ἀποφαίνω'), ('τἀγαθόν', 'ADJ', 'ἀγαθός'), (',', 'PUNCT', ','), ('οὗ', 'ADV', 'οὗ'), ('πάντʼ', 'ADJ', 'πᾶς'), ('ἐφίεται', 'VERB', 'ἐφίημι'), ('.', 'PUNCT', '.')]
[('διαφορὰ', 'NOUN', 'διαφορά'), ('δέ', 'ADV', 'δέ'), ('τις', 'PRON', 'τίς'), ('φαίνεται', 'VERB', 'φαίνω'), ('τῶν', 'DET', 'ὁ'), ('τελῶν', 'NOUN', 'τέλος'), ('·', 'PUNCT', '·')]
[('τὰ', 'PRON', 'ὁ'), ('μὲν', 'PART', 'μέν'), ('γάρ', 'PART', 'γάρ'), ('εἰσιν', 'VERB', 'εἰμί'), ('ἐνέργει