In [2]:
import spacy
import pyconll
import json
import re
import unicodedata

import plac
import random
from pathlib import Path
from spacy.util import minibatch, compounding

import sddk

In [3]:
conf = sddk.configure("SDAM_root", "648597@au.dk")

sciencedata.dk username (format '123456@au.dk'): 648597@au.dk
sciencedata.dk password: ········
connection with shared folder established with you as its owner
endpoint variable has been configured to: https://sciencedata.dk/files/SDAM_root/


# Initialize the model

In [1]:
!python3 -m spacy init-model xx ../models/spacy_grc_model_6_vectors --vectors-loc ../data/word2vec_win2.txt

[2K[38;5;2m✔ Successfully created model[0m
134209it [00:06, 20760.69it/s]/word2vec_win2.txt
[2K[38;5;2m✔ Loaded vectors from ../data/word2vec_win2.txt[0m
[38;5;2m✔ Sucessfully compiled vocab[0m
134382 entries, 134209 vectors


In [5]:
# load it into our envoronment
nlp = spacy.load("../models/spacy_grc_model_6_vectors")
tagger = nlp.create_pipe("tagger")
nlp.add_pipe(tagger)
optimizer = nlp.begin_training()

  **kwargs
  **kwargs


# Add lookup tables

In [4]:
ag_lemma_lookup = json.load(open("../data/ag_lemma_lookup.json"))
ag_lemma_lookup_merged = json.load(open("../data/ag_lemma_lookup_merged.json"))

In [5]:
ag_lemma_lookup["NOUN"]["ἐνεργειῶν"]

'ἐνέργεια'

ok, this works but there is a substantial number of cases in which there is a wrong tag, e.g. "VERB" - in this case the lemmatization is unsuccessful

In [6]:
ag_lemma_lookup_merged["ἐνεργειῶν"]

'ἐνέργεια'

In [7]:
try:
    nlp.vocab.lookups.remove_table("lemma_lookup")
    nlp.vocab.lookups.remove_table("lemma_lookup_merged")
except:
    pass

In [8]:
# let add our table to our model
table = nlp.vocab.lookups.add_table("lemma_lookup", ag_lemma_lookup)
table = nlp.vocab.lookups.add_table("lemma_lookup_merged", ag_lemma_lookup_merged)
# nlp.vocab.lookups.remove_table("lemma_exc")

NameError: name 'nlp' is not defined

In [9]:
# load it back from the model (to check functionality for future usage)
ag_lemma_lookup = nlp.vocab.lookups.get_table("lemma_lookup")
ag_lemma_lookup_merged = nlp.vocab.lookups.get_table("lemma_lookup_merged")


ag_lemma_lookup["VERB"]["μνήσομαι"]

NameError: name 'nlp' is not defined

# Lemmatization functions

In [104]:
# !pip install greek-accentuation
from greek_accentuation.characters import strip_accents
from greek_accentuation.syllabify import *
from greek_accentuation.accentuation import *



def grave_to_acute(string):
    GRAVE = "\u0300"
    ACUTE = "\u0301"
    return unicodedata.normalize("NFC", "".join(unicodedata.normalize("NFD", string).replace(GRAVE, ACUTE)))


def list_of_possible_accentuations(morph):
    try:
        if isinstance(morph, str):
            morph = strip_accents(morph)
            morph = rebreath(morph.lower())
            s = syllabify(morph)
            morph_vars = []
            for accentuation in possible_accentuations(s, default_short=True):
                pos, accent = accentuation #add_accentuation(s, accentuation))
                final = s[1 - pos:] if pos > 1 else [""]
                morph_acc_var = "".join(s[:-pos] + [syllable_add_accent(s[-pos], accent)] + final)
                morph_vars.append(morph_acc_var)
                morph_vars.append(morph_acc_var.capitalize())
            return morph_vars
        else:
            return []
    except:
        return []


def lemmatizer_v0(token_text, tag, old_lemma=None):
    if (old_lemma==None) or (token_text == old_lemma):
        lemma = token_text # start with assigning the word as it stands
        try:
            lemma = ag_lemma_lookup[tag][token_text]
        except: 
            try:
                lemma = ag_lemma_lookup[tag][grave_to_acute(token_text)]
            except:
                morph_vars = list_of_possible_accentuations(token_text)
                for var in morph_vars:
                    lemma = morph_vars[0]
                    try:
                        lemma = ag_lemma_lookup[tag][var] 
                        break
                    except: 
                        pass
        return lemma
    else:
        return old_lemma

def grc_doc_lemmatizer(doc):
    for token in doc:
        token.lemma_, token.pos_ = lemmatizer(token.text, token.pos_, token.lemma_)
    return doc   

def apply_nlp(sentences_list):
    spacy_docs = []
    for sentence in sentences_list:
        doc = nlp(sentence)
        doc = update_lemmata(doc)
        spacy_docs.append(doc)
    return spacy_docs
    
def check_other_tags(token_text, tag):
    lemma = token_text
    match = False
    # for new tags in specific order...
    for new_tag in ["PUNCT",'ADP', 'AUX', 'DET', 'INTJ', 'PART', "ADV", "ADJ", "NOUN", "PROPN", "VERB", "NUM"]:
        try:
            lemma = ag_lemma_lookup[new_tag][token_text]
            tag = new_tag
            match = True
            break
        except:
            pass
    return lemma, tag, match
    
def lemmatizer(token_text, tag, old_lemma=None):
    if (old_lemma==None) or (token_text == old_lemma):
        lemma = token_text.lower() # start with assigning the word as it stands
        try:
            lemma = ag_lemma_lookup[tag][token_text]
        except:
            lemma, tag, match = check_other_tags(token_text, tag)
            if match == False:
                try:
                    lemma = ag_lemma_lookup[tag][grave_to_acute(token_text)]
                except:
                    lemma, tag, match = check_other_tags(token_text, tag)
                    if match == False:
                        morph_vars = list_of_possible_accentuations(token_text)
                        for var in morph_vars:
                            try:
                                lemma = ag_lemma_lookup[tag][var] 
                                break
                            except:
                                lemma, tag, match = check_other_tags(var, tag)
                                if match ==True:
                                    break
        return lemma, tag
    else:
        return old_lemma, tag

In [105]:
lemmatizer("δι’", "DET")

('διά', 'ADP')

In [106]:
lemmatizer("ΠΑΥΛΟΣ", "VERB")

('Παῦλος', 'NOUN')

In [80]:
lemmatizer("ΒΙΒΛΟΣ", "ADJ")

('βίβλος', 'NOUN')

In [81]:
lemmatizer("Ἰησοῦ", "NOUN")

('Ἰησοῦς', 'NOUN')

In [82]:
# nonexistent word
lemmatizer("ΒΙΒΒΙΒΛΟΣ", "NOUN")

('Βίββιβλος', 'NOUN')

In [83]:
# it works even with wrong POS-tag
lemmatizer("ἐνεργειῶν", "VERB")

('ἐνέργεια', 'NOUN')

# Training our own model with gold-parse

In [85]:
urls = [
    "https://raw.githubusercontent.com/UniversalDependencies/UD_Ancient_Greek-Perseus/master/grc_perseus-ud-train.conllu",
    "https://raw.githubusercontent.com/UniversalDependencies/UD_Ancient_Greek-Perseus/master/grc_perseus-ud-dev.conllu",
    "https://raw.githubusercontent.com/UniversalDependencies/UD_Ancient_Greek-PROIEL/master/grc_proiel-ud-train.conllu",
    "https://raw.githubusercontent.com/UniversalDependencies/UD_Ancient_Greek-PROIEL/master/grc_proiel-ud-dev.conllu"]

In [87]:
def normalize_encoding(string):
    return unicodedata.normalize("NFC", string)
v_ud = "’"
v_agt = "ʼ"

v_ud == v_agt # because of that, we will everywhere use the agt version, since it does not confuse the word tokenizer

False

In [88]:
def tagged_data_from_url(url):
    corpus = pyconll.load.iter_from_url(url)
    tagged_data = []
    for sentence in corpus:
        words, tags, lemmata = [], [], []
        for token in sentence:
            words.append(normalize_encoding(re.sub(v_ud, v_agt, token.form)))
            tags.append(token.upos)
            lemmata.append(normalize_encoding(token.lemma))
        if "-dev." in url:
            tagged_data.append((normalize_encoding(re.sub(v_ud, v_agt, sentence.text)), {"words" : words, "tags" : tags, "lemmata" : lemmata}))
        else:
            tagged_data.append((normalize_encoding(re.sub(v_ud, v_agt, sentence.text)), {"words" : words, "tags" : tags}))
    print("File name: {0}; number of sentences: {1}".format(url.rpartition("/")[2], len(tagged_data)))
    return tagged_data

In [89]:
perseus_train = tagged_data_from_url(urls[0])
perseus_dev = tagged_data_from_url(urls[1])
proiel_train = tagged_data_from_url(urls[2])
proiel_dev = tagged_data_from_url(urls[3])

File name: grc_perseus-ud-train.conllu; number of sentences: 11476
File name: grc_perseus-ud-dev.conllu; number of sentences: 1137
File name: grc_proiel-ud-train.conllu; number of sentences: 15014
File name: grc_proiel-ud-dev.conllu; number of sentences: 1019


In [90]:
print(perseus_train[0])

('ἐρᾷ μὲν ἁγνὸς οὐρανὸς τρῶσαι χθόνα, ἔρως δὲ γαῖαν λαμβάνει γάμου τυχεῖν·', {'words': ['ἐρᾷ', 'μὲν', 'ἁγνὸς', 'οὐρανὸς', 'τρῶσαι', 'χθόνα', ',', 'ἔρως', 'δὲ', 'γαῖαν', 'λαμβάνει', 'γάμου', 'τυχεῖν', '·'], 'tags': ['VERB', 'ADV', 'ADJ', 'NOUN', 'VERB', 'NOUN', 'PUNCT', 'NOUN', 'CCONJ', 'NOUN', 'VERB', 'NOUN', 'VERB', 'PUNCT']})


In [91]:
train_data = perseus_train + proiel_train

In [92]:
len(train_data)

26490

In [93]:
proiel_dev_len = sum([len(sent[1]["tags"]) for sent in proiel_dev])
print("tokens in proiel_dev: " + str(proiel_dev_len))

perseus_dev_len = sum([len(sent[1]["tags"]) for sent in perseus_dev])
print("tokens in perseus_dev: " + str(perseus_dev_len))
total_len = proiel_dev_len + perseus_dev_len

tokens in proiel_dev: 13652
tokens in perseus_dev: 22135


In [94]:
model_tests = []

In [141]:
annotations_list = []


In [142]:
annotations_list

[]

In [139]:
annotations_list = []

n_iter = 1

model_tests = []
for i in range(n_iter):
    
    ### TRAIN THE MODEL
    random.shuffle(train_data)
    losses = {}
        # batch up the examples using spaCy's minibatch
    batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
        texts, annotations = zip(*batch)
        #for token_text, tag in zip
        annotations_list.append(annotations)
        #nlp.update(texts, annotations, drop=0.3, sgd=optimizer, losses=losses)

In [127]:
corrected_tags = []
all_tags = 0
corrected_tags_N = 0
for annotations_entry in annotations_list:
    all_tags = all_tags + len(annotations_entry["tags"])
    for token_text, tag in zip(annotations_entry["words"], annotations_entry["tags"]):
        corrected_tag = lemmatizer(token_text, tag)[1]
        if corrected_tag != tag:
            corrected_tags_N = corrected_tags_N + 1
            corrected_tags.append(corrected_tag)
        else:
            corrected_tags.append(tag)
    annotations_entry["tags"] = corrected_tags

In [128]:
all_tags

9190122720

In [125]:
np.round(corrected_tags_N / all_tags, 4)

0.0001

In [29]:
n_iter = 10

nlp.add_pipe(grc_doc_lemmatizer, "grc_doc_lemmatizer", after="tagger") # grc_doc_lemmatizer

model_tests = []
for i in range(n_iter):
    
    ### TRAIN THE MODEL
    random.shuffle(train_data)
    losses = {}
        # batch up the examples using spaCy's minibatch
    batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
        texts, annotations = zip(*batch)
        nlp.update(texts, annotations, drop=0.3, sgd=optimizer, losses=losses)
    
    ### TEST THE MODEL
    model_test = {}
    model_test["Losses"] = losses
    proiel_tags_correct = 0
    proiel_lemmata_correct = 0
    for sent in proiel_dev:
        doc = nlp(sent[0])
        predicted_tags = [token.pos_ for token in doc]
        predicted_lemmata = [lemmatizer(token.text, token.pos_) for token in doc]
        given_tags = sent[1]["tags"]
        given_lemmata = sent[1]["lemmata"]
        proiel_tags_correct += len([p for p, g in zip(predicted_tags, given_tags) if p ==g])
        proiel_lemmata_correct += len([p for p, g in zip(predicted_lemmata, given_lemmata) if p ==g])
    model_test["proiel_tags_correct%"] = (proiel_tags_correct / proiel_dev_len) * 100
    model_test["proiel_lemmata_correct%"] = (proiel_lemmata_correct / proiel_dev_len) * 100

    perseus_tags_correct = 0
    perseus_lemmata_correct = 0
    for sent in perseus_dev:
        doc = nlp(sent[0])
        predicted_tags = [token.pos_ for token in doc]
        predicted_lemmata = [lemmatizer(token.text, token.pos_) for token in doc]
        given_tags = sent[1]["tags"]
        given_lemmata = sent[1]["lemmata"]
        perseus_tags_correct += len([p for p, g in zip(predicted_tags, given_tags) if p ==g])
        perseus_lemmata_correct += len([p for p, g in zip(predicted_lemmata, given_lemmata) if p ==g])
    model_test["perseus_tags_correct%"] = (perseus_tags_correct / perseus_dev_len) * 100
    model_test["perseus_lemmata_correct%"] = (perseus_lemmata_correct / perseus_dev_len) * 100
    print(model_test)
    model_tests.append(model_test)

{'Losses': {'tagger': 28335.17343378486}, 'proiel_tags_correct%': 95.32669205977146, 'proiel_lemmata_correct%': 91.49575153823616, 'perseus_tags_correct%': 87.07928619832845, 'perseus_lemmata_correct%': 89.26586853399593}
{'Losses': {'tagger': 27338.618908771314}, 'proiel_tags_correct%': 95.34134192792266, 'proiel_lemmata_correct%': 91.5616759449165, 'perseus_tags_correct%': 86.92116557488141, 'perseus_lemmata_correct%': 89.28845719448837}
{'Losses': {'tagger': 26388.92041359283}, 'proiel_tags_correct%': 95.44389100498096, 'proiel_lemmata_correct%': 91.58365074714328, 'perseus_tags_correct%': 86.86695278969957, 'perseus_lemmata_correct%': 89.41495369324599}
{'Losses': {'tagger': 25767.658849308267}, 'proiel_tags_correct%': 95.64166422502197, 'proiel_lemmata_correct%': 91.5543510108409, 'perseus_tags_correct%': 87.01152021685115, 'perseus_lemmata_correct%': 89.36525864016264}
{'Losses': {'tagger': 25209.311948458664}, 'proiel_tags_correct%': 95.51714034573689, 'proiel_lemmata_correct%':

In [31]:
nlp.to_disk("../models/spacy_grc_model_6")

In [30]:
# COMPARE WITH OUR FIRST MODEL
nlp_1 = spacy.load("../models/spacy_grc_model_1")

model_test = {}
proiel_tags_correct = 0
proiel_lemmata_correct = 0
for sent in proiel_dev:
    doc = nlp_1(sent[0])
    predicted_tags = [token.pos_ for token in doc]
    predicted_lemmata = [lemmatizer(token.text, token.pos_) for token in doc]
    given_tags = sent[1]["tags"]
    given_lemmata = sent[1]["lemmata"]
    proiel_tags_correct += len([p for p, g in zip(predicted_tags, given_tags) if p ==g])
    proiel_lemmata_correct += len([p for p, g in zip(predicted_lemmata, given_lemmata) if p ==g])
model_test["proiel_tags_correct%"] = (proiel_tags_correct / proiel_dev_len) * 100
model_test["proiel_lemmata_correct%"] = (proiel_lemmata_correct / proiel_dev_len) * 100

perseus_tags_correct = 0
perseus_lemmata_correct = 0
for sent in perseus_dev:
    doc = nlp_1(sent[0])
    predicted_tags = [token.pos_ for token in doc]
    predicted_lemmata = [lemmatizer(token.text, token.pos_) for token in doc]
    given_tags = sent[1]["tags"]
    given_lemmata = sent[1]["lemmata"]
    perseus_tags_correct += len([p for p, g in zip(predicted_tags, given_tags) if p ==g])
    perseus_lemmata_correct += len([p for p, g in zip(predicted_lemmata, given_lemmata) if p ==g])
model_test["perseus_tags_correct%"] = (perseus_tags_correct / perseus_dev_len) * 100
model_test["perseus_lemmata_correct%"] = (perseus_lemmata_correct / perseus_dev_len) * 100

model_test

{'proiel_tags_correct%': 93.59068268385585,
 'proiel_lemmata_correct%': 91.31995312042191,
 'perseus_tags_correct%': 85.05534221820646,
 'perseus_lemmata_correct%': 89.01739326857917}

our results with first model:
```
{'proiel_tags_correct%': 87.90653384119544,
 'proiel_lemmata_correct%': 85.62115440961031,
 'perseus_tags_correct%': 85.05534221820646,
 'perseus_lemmata_correct%': 82.66997967020556}
 ```

In [76]:
#nlp.pipe_names
nlp.pipe_names

['tagger']

In [61]:
nlp.to_disk("../models/spacy_grc_model_6")

# Testing with real data

In [10]:
AGT = sddk.read_file("SDAM_data/AGT/AGT_preprocessed_20201127.json", "df", conf)

In [19]:
nlp = spacy.load("../models/spacy_grc_model_6")

In [20]:
nlp.add_pipe(grc_doc_lemmatizer, "grc_doc_lemmatizer", after="tagger") # grc_doc_lemmatizer

In [72]:
doc = nlp(AGT[AGT["author_id"]=="tlg0031paul"].iloc[3]["sentences"][0])
print([(token.text, token.pos_, token.lemma_) for token in doc])

[('ΠΑΥΛΟΣ', 'VERB', 'Παῦλος'), ('ἀπόστολος', 'NOUN', 'ἀπόστολος'), (',', 'PUNCT', ','), ('οὐκ', 'ADV', 'οὐ'), ('ἀπʼ', 'ADP', 'ἀπό'), ('ἀνθρώπων', 'NOUN', 'ἄνθρωπος'), ('οὐδὲ', 'CCONJ', 'οὐδέ'), ('διʼ', 'ADP', 'διά'), ('ἀνθρώπου', 'NOUN', 'ἄνθρωπος'), ('ἀλλὰ', 'CCONJ', 'ἀλλά'), ('διὰ', 'ADP', 'διά'), ('Ἰησοῦ', 'PROPN', 'Ἰησοῦς'), ('Χριστοῦ', 'PROPN', 'Χριστός'), ('καὶ', 'CCONJ', 'καί'), ('θεοῦ', 'NOUN', 'θεός'), ('πατρὸς', 'NOUN', 'πατήρ'), ('τοῦ', 'DET', 'ὁ'), ('ἐγείραντος', 'VERB', 'ἐγείρω'), ('αὐτὸν', 'PRON', 'αὐτός'), ('ἐκ', 'ADP', 'ἐκ'), ('νεκρῶν', 'NOUN', 'νεκρών'), (',', 'PUNCT', ','), ('καὶ', 'CCONJ', 'καί'), ('οἱ', 'DET', 'ὁ'), ('σὺν', 'ADP', 'σύν'), ('ἐμοὶ', 'PRON', 'ἐγώ'), ('πάντες', 'ADJ', 'πᾶς'), ('ἀδελφοί', 'NOUN', 'ἀδελφός'), (',', 'PUNCT', ','), ('ταῖς', 'DET', 'ὁ'), ('ἐκκλησίαις', 'NOUN', 'ἐκκλησία'), ('τῆς', 'DET', 'ὁ'), ('Γαλατίας', 'NOUN', 'Γαλατία'), ('·', 'PUNCT', '·')]


In [73]:
nlp.remove_pipe("grc_doc_lemmatizer")

('grc_doc_lemmatizer', <function __main__.grc_doc_lemmatizer(doc)>)

In [74]:
nlp.to_disk("../models/spacy_grc_model_6")

In [77]:
# let's continue with testing our old functions

In [36]:
doc = update_lemmata(nlp(AGT[AGT["author_id"]=="tlg0031paul"].iloc[3]["sentences"][0]))
print([(token.text, token.pos_, token.lemma_) for token in doc])

[('ΠΑΥΛΟΣ', 'VERB', 'Παῦλος'), ('ἀπόστολος', 'NOUN', 'ἀπόστολος'), (',', 'PUNCT', ','), ('οὐκ', 'ADV', 'οὐ'), ('ἀπʼ', 'ADP', 'ἀπό'), ('ἀνθρώπων', 'NOUN', 'ἄνθρωπος'), ('οὐδὲ', 'CCONJ', 'οὐδέ'), ('διʼ', 'ADP', 'διά'), ('ἀνθρώπου', 'NOUN', 'ἄνθρωπος'), ('ἀλλὰ', 'CCONJ', 'ἀλλά'), ('διὰ', 'ADP', 'διά'), ('Ἰησοῦ', 'PROPN', 'Ἰησοῦς'), ('Χριστοῦ', 'PROPN', 'Χριστός'), ('καὶ', 'CCONJ', 'καί'), ('θεοῦ', 'NOUN', 'θεός'), ('πατρὸς', 'NOUN', 'πατήρ'), ('τοῦ', 'DET', 'ὁ'), ('ἐγείραντος', 'VERB', 'ἐγείρω'), ('αὐτὸν', 'PRON', 'αὐτός'), ('ἐκ', 'ADP', 'ἐκ'), ('νεκρῶν', 'ADJ', 'νεκρός'), (',', 'PUNCT', ','), ('καὶ', 'CCONJ', 'καί'), ('οἱ', 'DET', 'ὁ'), ('σὺν', 'ADP', 'σύν'), ('ἐμοὶ', 'PRON', 'ἐγώ'), ('πάντες', 'ADJ', 'πᾶς'), ('ἀδελφοί', 'NOUN', 'ἀδελφός'), (',', 'PUNCT', ','), ('ταῖς', 'DET', 'ὁ'), ('ἐκκλησίαις', 'NOUN', 'ἐκκλησία'), ('τῆς', 'DET', 'ὁ'), ('Γαλατίας', 'NOUN', 'Γαλατία'), ('·', 'PUNCT', '·')]


In [94]:
arist_test = apply_nlp(AGT[AGT["doc_id"]=="tlg0086.tlg010"]["sentences"].tolist()[0])
for doc in arist_test[:10]:
    print([(token.text, token.pos_, token.lemma_) for token in doc])

[('πᾶσα', 'DET', 'πᾶς'), ('τέχνη', 'NOUN', 'τέχνη'), ('καὶ', 'CCONJ', 'καί'), ('πᾶσα', 'DET', 'πᾶς'), ('μέθοδος', 'NOUN', 'μέθοδος'), (',', 'PUNCT', ','), ('ὁμοίως', 'ADV', 'ὁμοίως'), ('δὲ', 'PART', 'δέ'), ('πρᾶξίς', 'ADJ', 'πρᾶξις'), ('τε', 'PART', 'τε'), ('καὶ', 'CCONJ', 'καί'), ('προαίρεσις', 'NOUN', 'προαίρεσις'), (',', 'PUNCT', ','), ('ἀγαθοῦ', 'ADJ', 'ἀγαθός'), ('τινὸς', 'PRON', 'τίς'), ('ἐφίεσθαι', 'VERB', 'ἐφίημι'), ('δοκεῖ', 'VERB', 'δοκέω'), ('·', 'PUNCT', '·')]
[('διὸ', 'ADV', 'διό'), ('καλῶς', 'ADV', 'καλῶς'), ('ἀπεφήναντο', 'VERB', 'ἀποφαίνω'), ('τἀγαθόν', 'NOUN', 'ἀγαθός'), (',', 'PUNCT', ','), ('οὗ', 'PRON', 'ὅς'), ('πάντʼ', 'ADJ', 'πάντʼ'), ('ἐφίεται', 'VERB', 'ἐφίημι'), ('.', 'PUNCT', '.')]
[('διαφορὰ', 'NOUN', 'διαφορά'), ('δέ', 'ADV', 'δέ'), ('τις', 'ADJ', 'τὶς'), ('φαίνεται', 'VERB', 'φαίνω'), ('τῶν', 'DET', 'ὁ'), ('τελῶν', 'NOUN', 'τέλος'), ('·', 'PUNCT', '·')]
[('τὰ', 'DET', 'ὁ'), ('μὲν', 'PART', 'μέν'), ('γάρ', 'PART', 'γάρ'), ('εἰσιν', 'VERB', 'εἰμί'), ('ἐνέργει