In [7]:
#import pandas as pd
import spacy
import pandas as pd
from spacy.tokens import Doc
import sddk
import re
import regex
import unicodedata
from greek_accentuation.characters import strip_accents
from greek_accentuation.syllabify import *
from greek_accentuation.accentuation import *
import nltk
import os
import pickle
import json

In [2]:
#!sudo pip install https://huggingface.co/Jacobo/grc_proiel_trf/resolve/main/grc_proiel_trf-3.7.5-py3-none-any.whl
nlp = spacy.load('grc_proiel_trf')

In [3]:
spacy.__version__

'3.8.4'

In [32]:
doc = nlp('δοκῶ μοι περὶ ὧν πυνθάνεσθε οὐκ ἀμελέτητος εἶναι, δοκῶ μοι περὶ ὧν πυνθάνεσθε οὐκ ἀμελέτητος εἶναι.')

In [33]:
for t in doc:
    print(t.text, t.lemma_, t.pos_)

δοκῶ δοκέω VERB
μοι ἐγώ PRON
περὶ περί ADP
ὧν ὅς PRON
πυνθάνεσθε πυνθάνομαι VERB
οὐκ οὐ ADV
ἀμελέτητος ἀμελέτητος ADJ
εἶναι εἰμί AUX
, , PUNCT
δοκῶ δοκέω VERB
μοι ἐγώ PRON
περὶ περί ADP
ὧν ὅς PRON
πυνθάνεσθε πυνθάνομαι VERB
οὐκ οὐ ADV
ἀμελέτητος ἀμελέτητος ADJ
εἶναι εἰμί AUX
. . PUNCT


Let's improve it: pre-clean the string and make the pipeline suitable for large documents.

# Text preprocessing & function development

In [14]:
# functions for cleaning the string

def grave_to_acute(string):
    """Replace every grave accent in `string` with an acute accent.

    The text is decomposed (NFD) so accents become separate combining marks,
    each combining grave (U+0300) is swapped for a combining acute (U+0301),
    and the result is recomposed (NFC).

    Returns the NFC-normalised string.
    """
    combining_grave = "\u0300"
    combining_acute = "\u0301"
    decomposed = unicodedata.normalize("NFD", string)
    swapped = decomposed.replace(combining_grave, combining_acute)
    return unicodedata.normalize("NFC", swapped)

def possible_accentuation(morph):
    """Return `morph` re-accented with the first accentuation candidate.

    For string input the existing accents are stripped, `rebreath` is applied,
    the word is syllabified, and the first candidate yielded by
    `possible_accentuations(..., default_short=True)` is applied and returned.

    Fallbacks:
      * non-string input is returned unchanged;
      * if no candidate is yielded, the stripped/rebreathed form is returned;
      * if any step raises an ordinary exception, whatever `morph` holds at
        that point is returned (the input, or its already stripped form when
        the failure happened in a later step).

    KeyboardInterrupt and SystemExit are not swallowed, so a long cleaning run
    can still be interrupted.
    """
    try:
        if isinstance(morph, str):
            morph = strip_accents(morph)
            morph = rebreath(morph)
            s = syllabify(morph)
            for accentuation in possible_accentuations(s, default_short=True):
                # `pos` counts syllables from the end of the word (s[-pos] is accented).
                pos, accent = accentuation
                # Syllables after the accented one; none when the accent is on the last syllable.
                final = s[1 - pos:] if pos > 1 else [""]
                morph_acc_var = "".join(s[:-pos] + [syllable_add_accent(s[-pos], accent)] + final)
                return morph_acc_var  # Return the first accentuation immediately
        return morph  # Non-string input, or no accentuation was proposed
    except Exception:
        # Best effort: accentuation problems must not abort the cleaning run.
        return morph

def decap(token):
    """Normalise a token whose second character is upper-case.

    Such a token (e.g. a word written in capitals) keeps its first character,
    has the remainder lower-cased and is then passed through
    `possible_accentuation`. Any other token, including one shorter than two
    characters, is returned unchanged.
    """
    if len(token) <= 1 or not token[1].isupper():
        return token
    lowered = token[0] + token[1:].lower()
    return possible_accentuation(lowered)

def clean_string(string):
    """Normalise a raw Greek text string before it is parsed.

    Steps, in order:
      1. lunate sigma "ϲ" becomes final "ς" when no word character follows it
         (including at the very end of the string) and medial "σ" otherwise;
      2. grave accents are turned into acute accents (`grave_to_acute`);
      3. em dashes are padded with spaces;
      4. every character that is neither Greek script nor punctuation is
         replaced by a space;
      5. each whitespace-separated token is passed through `decap`;
      6. runs of spaces are collapsed and "·" is replaced by ".".

    Returns the cleaned string.
    """
    # Lookahead instead of a capturing group: the following character is not
    # consumed, so both sigmas of a doubled "ϲϲ" are converted, and a lunate
    # sigma that ends the string is treated as word-final.
    string = re.sub(r"ϲ(?!\w)", "ς", string)
    # Every remaining lunate sigma is followed by a word character.
    string = string.replace("ϲ", "σ")
    string = grave_to_acute(string)
    string = string.replace("—", " — ")
    cleaned_string = regex.sub(r'[^\p{Greek}\p{P}]', ' ', string)
    cleaned_string = " ".join([decap(token) for token in cleaned_string.split()])
    cleaned_string = regex.sub(' +', ' ', cleaned_string)
    cleaned_string = cleaned_string.replace("·", ".")
    return cleaned_string

In [15]:
def _split_into_segments(string, segment_len):
    """Cut `string` into consecutive pieces that end on a sentence boundary where possible.

    The string is read in windows of `segment_len` characters. Each full-length
    window is cut after its last ". " / "· " style sentence end; the remainder
    is carried over and prepended to the next piece. Concatenating the returned
    pieces reproduces `string` exactly.
    """
    sentence_end = re.compile(r'(\.\s|·\s)')
    segments = []
    carry = ""  # tail of the previous window that follows its last sentence end
    for start in range(0, len(string), segment_len):
        window = string[start:start + segment_len]
        if len(window) < segment_len:
            # Final, shorter window: nothing left to align with, take it whole.
            segments.append(carry + window)
            carry = ""
            continue
        ends = [m.end(0) for m in sentence_end.finditer(window)]
        if ends:
            split_at = ends[-1]  # end index of the last sentence boundary
            segments.append(carry + window[:split_at])
            carry = window[split_at:]
        else:
            # No sentence boundary in this window: keep it as one piece.
            segments.append(carry + window)
            carry = ""
    if carry:
        # Only reachable when the last window was full-length (the string length
        # is a multiple of segment_len); without this flush the text after the
        # last sentence end would be lost.
        segments.append(carry)
    return segments


def get_doc(string, segment_len=100000):
    """Parse `string` with the module-level `nlp` pipeline, segmenting long input.

    Parameters
    ----------
    string : str
        Text to parse.
    segment_len : int
        Window size in characters. Strings no longer than this are parsed in a
        single call; longer strings are split near sentence boundaries, each
        piece is parsed separately and the results are merged with
        `Doc.from_docs`.

    Returns
    -------
    The parsed document for the whole string.
    """
    if len(string) > segment_len:
        segment_docs = [nlp(segment) for segment in _split_into_segments(string, segment_len)]
        doc = Doc.from_docs(segment_docs)
    else:
        doc = nlp(string)
    return doc

In [46]:
# extract lemmata from the sentences
# target_path = "../data/large_files/sents_data/"
target_path = "/srv/data/greek/exprecce_sentences/"
# Create the output directory (and any missing parents). An already existing
# directory is fine; any other failure (e.g. missing permissions) is raised
# here instead of being silently ignored and surfacing later in os.listdir.
os.makedirs(target_path, exist_ok=True)

# File names already present in the output directory.
sents_data_ready = os.listdir(target_path)


#pos_dict = {"VERB" : "v", "ADJ" : "a", "PROPN" : "n", "NOUN" : "n"}
# Maps upper-case POS labels (the values passed in from `t.pos_`) onto
# one-letter codes. The one-letter codes map onto themselves, so tags that were
# already converted pass through unchanged.
reformat_tags_dict = {
    "NOUN": "n",
    "VERB": "v",
    "ADJ": "a",
    "ADV": "r",
    "PRON": "p",
    "DET": "l",
    "ADP": "r",
    "CCONJ": "c",
    "SCONJ": "c",
    "PROPN": "n",
    "PUNCT": "u",
    "n": "n",
    "v": "v",
    "a": "a",
    "r": "r",
    "p": "p",
    "l": "l",
    "c": "c",
    "u": "u"  # Assuming punctuation remains "u"
}


def reformat_tags(tag):
    """Return the one-letter code for `tag`, or "x" when the tag has no mapping.

    Any value missing from `reformat_tags_dict` (for example "AUX" or None) and
    any unhashable value yields "x".
    """
    try:
        return reformat_tags_dict[tag]
    except (KeyError, TypeError):
        # KeyError: unmapped tag; TypeError: unhashable value such as a list.
        return "x"


def get_sents_data(doc, doc_id):
    """Collect per-sentence data from a parsed document.

    Parameters
    ----------
    doc
        Parsed document; its `sents` are iterated, and each token's `text`,
        `lemma_`, `pos_` and `idx` are read.
    doc_id
        Identifier stored with every sentence record.

    Returns
    -------
    tuple
        `(doc_sentdata, lemmatized_sentences, sentences)` where
        * `doc_sentdata` holds one `(doc_id, sentence_number, sentence_text,
          token_data)` tuple per sentence, `token_data` being a list of
          `(text, lower-cased lemma, reformatted POS tag, (start, end))` with
          offsets relative to the first token of the sentence;
        * `lemmatized_sentences` holds, per sentence, the lemmata of NOUN, VERB,
          ADJ and PROPN tokens with every non-Greek character removed;
        * `sentences` holds the sentence texts.
    """
    content_pos = ["NOUN", "VERB", "ADJ", "PROPN"]
    sentences = []
    lemmatized_sentences = []
    doc_sentdata = []
    for sent_n, sent in enumerate(doc.sents):
        sentences.append(sent.text)

        # Lemmata of content words only, stripped of non-Greek characters.
        lemmata = []
        for tok in sent:
            if tok.pos_ in content_pos:
                lemmata.append(regex.sub(r'[^\p{Greek}]', "", tok.lemma_))
        lemmatized_sentences.append(lemmata)

        # Full token records with sentence-relative character offsets.
        token_data = []
        for tok in sent:
            rel_start = tok.idx - sent[0].idx
            record = (tok.text, tok.lemma_.lower(), reformat_tags(tok.pos_), (rel_start, rel_start + len(tok)))
            token_data.append(record)

        doc_sentdata.append((doc_id, sent_n, sent.text, token_data))
    return doc_sentdata, lemmatized_sentences, sentences


In [48]:
exprecce = pd.read_json("../data/exprecce.json")
exprecce.head()


Unnamed: 0,title,doc_id,author_id,raw_date,not_before,not_after,genre,author,provenience,source,string
0,The Letter of the Churches of Vienne and Lyons,tlg1352.tlg001,tlg1352,A.D. 2,101,200.0,Hagiogr.,,christian,exprecce,οἱ ἐν Βιέννῃ καὶ Λουγδούνῳ τῆς Γαλλίας παροικο...
1,Martyrium Pionii presbyteri et sodalium,tlg2005.tlg001,tlg2005,A.D. 4,301,400.0,Hagiogr.,,christian,exprecce,Μαρτύριον τοῦ ἁγίου Πιονίου τοῦ πρεσβυτέρου κα...
2,"Martyrium Agapae, Irenae, Chionae et sodalium",tlg2011.tlg001,tlg2011,A.D. 4,301,400.0,Hagiogr.,,christian,exprecce,"Μαρτύριον τῶν ἁγίων Ἀγάπης, Εἰρήνης καὶ Χιόνης..."
3,Testamentum xl martyrum,tlg2015.tlg001,tlg2015,post A.D. 4,401,,Hagiogr.,,christian,exprecce,ΔΙΑΘΗΚΗ τῶν ἁγίων καὶ ἐνδόξων τοῦ Χριστοῦ τεσσ...
4,Passio sancti Sabae Gothi (sub auctore Athanar...,tlg5451.tlg001,tlg5451,A.D. 4,301,400.0,Hagiogr.,,christian,exprecce,Μαρτύριον τοῦ ἁγίου Σάβα τοῦ Γότθου. Ἡ ἐκκλησί...


## Get exprecce sentence data

In [49]:
def get_row_data(row):
    """Parse one document row, pickle its sentence data and return its token count.

    Parameters
    ----------
    row
        Mapping-like row providing "doc_id" and "string".

    Returns
    -------
    int
        Number of tokens in the parsed document.

    Side effect: writes `<target_path><doc_id>.pickle` containing the sentence
    data produced by `get_sents_data`.
    """
    doc_id = row["doc_id"]
    string = row["string"]
    doc = get_doc(string)
    tokencount = len(doc)
    # get_sents_data returns (sentence data, lemmatized sentences, sentences),
    # in that order; only the sentence data is persisted here.
    sents_data, lemmatized_sentences, sentences = get_sents_data(doc, doc_id)
    with open(target_path + doc_id + ".pickle", "wb") as f:
        pickle.dump(sents_data, f)
    return tokencount

# Apply function
tokencount = exprecce.apply(lambda row: get_row_data(row), axis=1)

# Assign the results to separate columns in your DataFrame
#exprecce["sentences"] = result[0]  # Assign sentences
#exprecce["lemmatized_sentences"] = result[1]  # Assign lemmatized_sentences


In [42]:
exprecce["tokencount"] = tokencount

In [44]:
exprecce.drop("string", axis=1, inplace=True)

In [None]:
exprecce.to_json("../data/exprecce_sentences.json", orient="records")

In [23]:
os.listdir(target_path)[:10]

['tlg0389.tlg001.pickle',
 'tlg2062.tlg050.pickle',
 'tlg2017.tlg065.pickle',
 'tlg2012.tlg001.pickle',
 'tlg0304.tlg001.pickle',
 'tlg2798.tlg005.pickle',
 'tlg0388.tlg002.pickle',
 'tlg2062.tlg043.pickle',
 'tlg2010.tlg001.pickle',
 'tlg2008.tlg001.pickle']

In [25]:
with open(target_path + os.listdir(target_path)[0], "rb") as f:
    doc_sentdata = pickle.load(f)
doc_sentdata[:3]

[('tlg0389.tlg001',
  0,
  'Μαρτύριον τοῦ ἁγίου ἀποστόλου Πέτρου.',
  [('Μαρτύριον', 'μαρτύριος', 'NOUN', (0, 9)),
   ('τοῦ', 'ὁ', 'DET', (10, 13)),
   ('ἁγίου', 'ἅγιος', 'ADJ', (14, 19)),
   ('ἀποστόλου', 'ἀποστόλου', 'NOUN', (20, 29)),
   ('Πέτρου', 'πέτρος', 'PROPN', (30, 36)),
   ('.', '.', 'PUNCT', (36, 37))]),
 ('tlg0389.tlg001',
  1,
  'Κυριακῆς οὔσης, ὁμιλοῦντος τοῦ Πέτρου τοῖς ἀδελφοῖς, καὶ προτρέποντος εἰς τὴν τοῦ Χριστοῦ πίστιν, παρόντων πολλῶν συγκλητικῶν καὶ ἱππικῶν πλειόνων καὶ γυναικῶν πλουσίων καὶ ματρωνῶν καὶ στηριζομένων τῇ πίστει,',
  [('Κυριακῆς', 'κυριακός', 'ADJ', (0, 8)),
   ('οὔσης', 'εἰμί', 'AUX', (9, 14)),
   (',', ',', 'PUNCT', (14, 15)),
   ('ὁμιλοῦντος', 'ὁμιλοῦντος', 'VERB', (16, 26)),
   ('τοῦ', 'ὁ', 'DET', (27, 30)),
   ('Πέτρου', 'πέτρος', 'PROPN', (31, 37)),
   ('τοῖς', 'ὁ', 'DET', (38, 42)),
   ('ἀδελφοῖς', 'ἀδελφός', 'NOUN', (43, 51)),
   (',', ',', 'PUNCT', (51, 52)),
   ('καὶ', 'καί', 'CCONJ', (53, 56)),
   ('προτρέποντος', 'προτρέπω', 'VERB', (57,

In [20]:
LAGT[LAGT["lemmatized_sentences"].isnull()].sample(10)

Unnamed: 0,author_id,doc_id,filename,author,title,string,wordcount,source,lemmatized_sentences,lemmata_source,tlg_date,not_before,not_after,date_uncertain,tlg_epithet,provenience,lemmatacount,sentences,raw_date,genre
684,tlg2035,tlg2035.tlg132,tlg2035.tlg132.1st1K-grc1.xml,Athanasius,Oratio III contra Arianos,"1. ΟΙ Ἀρειομανῖται, ὡς ἔοικε, κρίναντες ἅπαξ ἀ...",26026.0,1Kgr,,grecy,A.D. 4,301.0,400.0,False,[Theologici],christian,10132.0,,,
844,tlg2200,tlg2200.tlg00543,tlg2200.tlg00543.opp-grc1.xml,Libanius,Declamatio 43,καὶ ἡ γυνὴ φιλόπολις καὶ ὁ δῆμος φιλε- R IV 79...,11263.0,1Kgr,,grecy,A.D. 4,301.0,400.0,False,"[Rhetorici, Sophistae]",pagan,3489.0,,,
793,tlg2200,tlg2200.tlg00456,tlg2200.tlg00456.opp-grc1.xml,Libanius,Oratio 56,εἰμὶ μὲν τῶν ἀπηντηκότων Λουκιανῷ οὐ\n\nδιὰ τὸ...,4417.0,1Kgr,,grecy,A.D. 4,301.0,400.0,False,"[Rhetorici, Sophistae]",pagan,1298.0,,,
942,tlg4102,tlg4102.tlg043,tlg4102.tlg043.1st1K-grc1.xml,Catenae (Novum Testamentum),Catena In Epistulam Joannis I,ΥΠΟΘΕΣΙΣ. Ἐπειδη αὐτὸς ὁ Ἰωάννης τὸ Εὐαγγέλιον...,11904.0,1Kgr,,grecy,p. A.D. 5,501.0,,False,[],christian,5206.0,,,
705,tlg2042,tlg2042.tlg019,tlg2042.tlg019.1st1K-grc1.xml,Origen,Philocalia,Ἐκλογὴν ἡ παροῦσα περιέχει βίβλος γραφικῶν ζητ...,74499.0,1Kgr,,grecy,A.D. 2-3,101.0,300.0,False,[Theologici],christian,30547.0,,,
836,tlg2200,tlg2200.tlg00535,tlg2200.tlg00535.opp-grc1.xml,Libanius,Declamatio 35,"τῷ μὲν δήμῳ πολλὴ χάρις, ὦ βουλή, μηδ’ RIV 227...",4805.0,1Kgr,,grecy,A.D. 4,301.0,400.0,False,"[Rhetorici, Sophistae]",pagan,1636.0,,,
1698,tlg2003,tlg2003.tlg007,tlg2003.tlg007.perseus-grc1.xml,Julian the Emperor,To the Cynic Heracleios,\n*)=h polla\ gi/netai e)n makrw=| xro/nw|: to...,8829.0,perseus,,grecy,A.D. 4,301.0,400.0,False,[Philosophici/-ae],pagan,0.0,,,
862,tlg2655,tlg2655.tlg002,tlg2655.tlg002.1st1K-grc1.xml,Damigeron,De lapidibus,1] Τηρήσει τὰς κριθὰς ἀβλαβεῖς δάφνης καρποφόρ...,2657.0,1Kgr,,grecy,Incertum,-200.0,,False,[],pagan,1277.0,,,
756,tlg2200,tlg2200.tlg00405,tlg2200.tlg00405.opp-grc1.xml,Libanius,Oratio V,Αὐτὸ τοῦτο τὸ νῦν ἐμὲ καὶ ζῆν καὶ λέγειν καὶ R...,4703.0,1Kgr,,grecy,A.D. 4,301.0,400.0,False,"[Rhetorici, Sophistae]",pagan,1612.0,,,
764,tlg2200,tlg2200.tlg00427,tlg2200.tlg00427.opp-grc1.xml,Libanius,Oratio 27,ἐπαινῶν ἠξιωκὼς ἃ τῆς βελτίονος ἦν μοί- R II ...,5965.0,1Kgr,,grecy,A.D. 4,301.0,400.0,False,"[Rhetorici, Sophistae]",pagan,1961.0,,,


In [26]:
# there is an invalid document:
row_index = LAGT[LAGT["doc_id"] == "tlg0530.tlg006"].index[0]
LAGT = LAGT.drop(row_index)

In [27]:
def get_row_data(row):
    """Load the pickled sentence data of a document and derive its sentences and lemmata.

    Note: this redefines the `get_row_data` used earlier for sentence extraction.

    Parameters
    ----------
    row
        Mapping-like row providing "doc_id"; `<target_path><doc_id>.pickle` is read.

    Returns
    -------
    tuple
        `(sentences, lemmatized_sentences)`: the sentence texts, and per
        sentence the lemmata of tokens tagged as noun, adjective or verb (in
        either the one-letter or the upper-case tag form). Both are None when
        the pickle cannot be read or does not have the expected layout.
    """
    doc_id = row["doc_id"]
    # source = row["source"]
    # lemmata_source = row["lemmata_source"]
    content_tags = ("n", "a", "v", "NOUN", "PROPN", "ADJ", "VERB")
    try:
        file_path = target_path + doc_id + ".pickle"
        # NOTE: pickle.load can execute arbitrary code; only read pickles
        # written by this pipeline.
        with open(file_path, "rb") as f:
            sentences_data = pickle.load(f)
        sentences = [sent[2] for sent in sentences_data]
        lemmatized_sentences = [
            [t[1] for t in sent[3] if t[2] in content_tags]
            for sent in sentences_data
        ]
        #source = "glaux1"
        #lemmata_source = "glaux1"
    except Exception:
        # Best effort: a missing or malformed pickle marks the row as having no
        # data. Kernel interrupts are no longer swallowed.
        sentences = None
        lemmatized_sentences = None
    return sentences, lemmatized_sentences # , source, lemmata_source

In [28]:
result = LAGT.apply(lambda row: pd.Series(get_row_data(row)), axis=1)

In [29]:
result.sample(10)

Unnamed: 0,0,1
192,[περὶ μὲν οὖν τῶν πρώτων αἰτίων τῆς φύσεως καὶ...,"[[πρῶτος, αἴτιος, φύσις, κίνησις, φυσικός, φορ..."
401,[πᾶν μέρος λόγου εὕρηται μὲν ἐπὶ μηνύσει πράγμ...,"[[μέρος, λόγος, εὑρίσκω, μήνυσις, πρᾶγμα, καιρ..."
1474,[ἡγεῖσθαι μὲν τῆς πάσης στρατιᾶς τοὺς κατασκόπ...,"[[ἡγέομαι, στρατιά, κατάσκοπος, ἱππεύς, τάσσω,..."
1094,[εἰς τοῦτο δὴ τὸ δικαστήριον καὶ αὐτὸς εἰσάγων...,"[[δικαστήριον, εἰσάγω, γίγνομαι, διάλεξις, οἴο..."
551,"[ὁδοὶ δύο εἰσί, μία τῆς ζωῆς καὶ μία τοῦ θανάτ...","[[ὁδός, εἰμί, ζωή, θάνατος, διαφορά, πολύς, ὁδ..."
1099,[διὰ τί τὸ θαλάττιον ὕδωρ οὐ τρέφει τὰ δένδρα;...,"[[θαλάσσιος, ὕδωρ, τρέφω, δένδρον], [αἰτία, ζῷ..."
177,[περὶ δὲ τῆς μαντικῆς τῆς ἐν τοῖς ὕπνοις γινομ...,"[[μαντικός, ὕπνος, γίγνομαι, λέγω, συμβαίνω, ἐ..."
1763,"[τὰς δὲ βροτοὶ καλέουσι Πελειάδας., χειμέριαι ...","[[βροτός, καλέω, Πελειάς], [χειμέριος, δύω, Πε..."
1314,[Παῦλος ἀπόστολος Χριστοῦ Ἰησοῦ διὰ θελήματος ...,"[[Παῦλος, ἀπόστολος, Χριστός, Ἰησοῦς, θέλημα, ..."
572,[Μισθοφόροι τόν ποταμόν τοῖς πολεμίοις ἐπήγαγο...,"[[μισθοφόρος, ποταμός, πολέμιος, ἐπήγαγος], []..."


In [30]:
LAGT["sentences"] = result[0]  # Extract sentences
LAGT["lemmatized_sentences"] = result[1]

In [31]:
LAGT.loc[1963]

author_id                                                         tlg2798
doc_id                                                     tlg2798.tlg005
filename                                                             None
author                                        Pseudo-Dionysius Areopagita
title                                                De mystica theologia
string                  Τριὰς ὑπερούσιε, καὶ ὑπέρθεε, καὶ ὑπεράγαθε, τ...
wordcount                                                             NaN
source                                                           exprecce
lemmatized_sentences    [[τριάς, ὑπερούσιε, ὑπέρθεε, ὑπεράγαθε, χριστι...
lemmata_source                                                       None
tlg_date                                                             None
not_before                                                          401.0
not_after                                                           600.0
date_uncertain                        

In [32]:
LAGT["lemmatized_sentences"].isnull().sum()

0

In [34]:
len(LAGT)

1963

In [35]:
LAGT = LAGT.drop_duplicates(subset="doc_id", keep="first")
len(LAGT)

1958

### Checking what is in what shape

In [36]:
LAGT[LAGT["source"]=="exprecce"]

Unnamed: 0,author_id,doc_id,filename,author,title,string,wordcount,source,lemmatized_sentences,lemmata_source,tlg_date,not_before,not_after,date_uncertain,tlg_epithet,provenience,lemmatacount,sentences,raw_date,genre
1939,tlg1352,tlg1352.tlg001,,,The Letter of the Churches of Vienne and Lyons,οἱ ἐν Βιέννῃ καὶ Λουγδούνῳ τῆς Γαλλίας παροικο...,,exprecce,"[[βιέννῃ, λουγδούνῳς, γαλλία, παροικέω, δοῦλος...",,,101.0,200.0,,,christian,,[οἱ ἐν Βιέννῃ καί Λουγδούνῳ τῆς Γαλλίας παροικ...,A.D. 2,Hagiogr.
1940,tlg2005,tlg2005.tlg001,,,Martyrium Pionii presbyteri et sodalium,Μαρτύριον τοῦ ἁγίου Πιονίου τοῦ πρεσβυτέρου κα...,,exprecce,"[[μαρτύριος, ἅγιος, πιονίας, πρέσβυς], [μνεία,...",,,301.0,400.0,,,christian,,[Μαρτύριον τοῦ ἁγίου Πιονίου τοῦ πρεσβυτέρου κ...,A.D. 4,Hagiogr.
1941,tlg2011,tlg2011.tlg001,,,"Martyrium Agapae, Irenae, Chionae et sodalium","Μαρτύριον τῶν ἁγίων Ἀγάπης, Εἰρήνης καὶ Χιόνης...",,exprecce,"[[μαρτύριος, ἅγιος, ἀγάπη, εἰρήνης, χιόνης, μα...",,,301.0,400.0,,,christian,,"[Μαρτύριον τῶν ἁγίων Ἀγάπης, Εἰρήνης καί Χιόνη...",A.D. 4,Hagiogr.
1942,tlg2015,tlg2015.tlg001,,,Testamentum xl martyrum,ΔΙΑΘΗΚΗ τῶν ἁγίων καὶ ἐνδόξων τοῦ Χριστοῦ τεσσ...,,exprecce,"[[διαθηκή, ἅγιος, ἔνδοξος, χριστός, μάρτυς, σε...",,,401.0,,,,christian,,[Διαθηκή τῶν ἁγίων καί ἐνδόξων τοῦ Χριστοῦ τεσ...,post A.D. 4,Hagiogr.
1943,tlg5451,tlg5451.tlg001,,,Passio sancti Sabae Gothi (sub auctore Athanar...,Μαρτύριον τοῦ ἁγίου Σάβα τοῦ Γότθου. Ἡ ἐκκλησί...,,exprecce,"[[μαρτύριος, ἅγιος, σάβας, γότθης], [ἐκκλησία,...",,,301.0,400.0,,,christian,,"[Μαρτύριον τοῦ ἁγίου Σάβα τοῦ Γότθου., Ἡ ἐκκλη...",A.D. 4,Hagiogr.
1945,tlg2010,tlg2010.tlg001,,,Martyrium Dasii,Μαρτύριον τοῦ ἁγίου Δασίου. Κύριε εὐλόγησον. Τ...,,exprecce,"[[μαρτύριος, ἅγιος, δασῖος], [κύριος, εὐλόγησο...",,,401.0,,,,christian,,"[Μαρτύριον τοῦ ἁγίου Δασίου., Κύριε εὐλόγησον....",post A.D. 4,Hagiogr.
1946,tlg2008,tlg2008.tlg001,,,Martyrium Cononis,Πάλιν ὦ τῆς δυσσεβοῦς κρίσεως. Μετὰ τὸ τελειωθ...,,exprecce,"[[δυσσεβής, κρίσις], [τελειόω, ἅγιος, χριστός,...",,,401.0,,,,christian,,"[Πάλιν ὦ τῆς δυσσεβοῦς κρίσεως., Μετά τό τελει...",post A.D. 4,Hagiogr.
1948,tlg2012,tlg2012.tlg001,,,Acta Eupli - RECENSIO GRAECA,Τοῖς κυρίοις ἡμῶν Διοκλιτιανῷ τὸ ἔννατον καὶ Μ...,,exprecce,"[[κύριος, διοκλιτιανῷ, ἔννατος, μαξιμιανός, ὀγ...",,,401.0,,,,christian,,[Τοῖς κυρίοις ἡμῶν Διοκλιτιανῷ τό ἔννατον καί ...,post A.D. 4,Hagiogr.
1949,tlg0390,tlg0390.tlg001,,,"Martyrium sanctorum Carpi, Papyli et Agathonicae",Μαρτύριον τῶν ἁγίων Κάρπου Παπύλου καὶ Ἀγαθονί...,,exprecce,"[[μαρτύριος, ἅγιος, κάρπος, παπύλας, ἀγαθονίκη...",,,101.0,200.0,,,christian,,[Μαρτύριον τῶν ἁγίων Κάρπου Παπύλου καί Ἀγαθον...,A.D. 2,Hagiogr.
1950,tlg0304,tlg0304.tlg001,,,Acta et martyrium Apollonii,Μαρτύριον τοῦ ἁγίου καὶ πανευφήμου ἀποστόλου Ἀ...,,exprecce,"[[μαρτύριος, ἅγιος, πανευφήμος, ἀποστόλου, ἀπο...",,,101.0,400.0,,,christian,,[Μαρτύριον τοῦ ἁγίου καί πανευφήμου ἀποστόλου ...,A.D. 2/4,Hagiogr.


In [37]:
def get_wordcout(doc_id):
    """Count the tokens stored in the pickled sentence data of `doc_id`.

    Reads `<target_path><doc_id>.pickle` and sums the length of the token list
    (fourth field) of every sentence record. Returns 0 when the pickle cannot
    be read or does not have the expected layout.
    """
    try:
        file_path = target_path + doc_id + ".pickle"
        # NOTE: pickle.load can execute arbitrary code; only read pickles
        # written by this pipeline.
        with open(file_path, "rb") as f:
            sentences_data = pickle.load(f)
        wordcount = sum(len(sent_data[3]) for sent_data in sentences_data)
    except Exception:
        # Best effort: documents without usable sentence data count as empty.
        # Kernel interrupts are no longer swallowed.
        wordcount = 0
    return wordcount

In [38]:
LAGT["wordcount"] = LAGT["doc_id"].apply(get_wordcout)

In [39]:
LAGT["wordcount"].sum()

35809325

In [40]:
lemmatized_sentences = [s for work in LAGT["lemmatized_sentences"] for s in work]
lemmatized_sentences[:10]

[[],
 [],
 [],
 [],
 ['πινυτός', 'ἀντιγράφω'],
 ['θαυμάζω', 'ἀποδέχω', 'διονύσιος'],
 ['ἀντιπαρακαλέω', 'στεῤῥοτέρας', 'μεταδίδωμι', 'τροφή'],
 ['τελειοτέρω',
  'γράμμα',
  'λαός',
  'ὑποθρέψαντα',
  'διατέλος',
  'γαλακτώδεμι',
  'ἐνδιατρίβω',
  'λόγος',
  'νηπιώδης',
  'ἀγωγή',
  'λανθάνω',
  'καταγηράζω'],
 ['ἐπιστολή',
  'πινυτός',
  'πίστις',
  'ὀρθοδοξία',
  'φροντίς',
  'ὑπήκοος',
  'ὡφελεία',
  'λόγιον',
  'θεῖος',
  'σύνεσις'],
 ['ἀκριβεστάτης', 'ἀναδείκνυται', 'εἰκών']]

In [41]:
# update lemmatacount
LAGT["lemmatacount"] = LAGT["lemmatized_sentences"].apply(lambda x: len([l for s in x for l in s]))

In [42]:
LAGT["lemmatacount"]

0          34
1          59
2       10315
3        1496
4         125
        ...  
1959     1011
1960     1227
1961     1505
1962     1893
1963      578
Name: lemmatacount, Length: 1958, dtype: int64

In [43]:
LAGT['lemmata_source'] = LAGT['lemmata_source'].fillna("grecy")

In [44]:
placeholder = "glaux_tmp"
LAGT['lemmata_source'] = LAGT['lemmata_source'].replace("glaux", placeholder)
LAGT['lemmata_source'] = LAGT['lemmata_source'].replace(placeholder, "glaux1")

In [45]:
LAGT.groupby("lemmata_source").size() #.isnull().sum()

lemmata_source
glaux1    1698
grecy      260
dtype: int64

In [46]:
LAGT.groupby("source").size() #.isnull().sum()


source
1Kgr         218
exprecce      20
glaux1      1696
perseus       24
dtype: int64

In [47]:
LAGT.columns

Index(['author_id', 'doc_id', 'filename', 'author', 'title', 'string',
       'wordcount', 'source', 'lemmatized_sentences', 'lemmata_source',
       'tlg_date', 'not_before', 'not_after', 'date_uncertain', 'tlg_epithet',
       'provenience', 'lemmatacount', 'sentences', 'raw_date', 'genre'],
      dtype='object')

In [48]:
LAGT = LAGT[['author_id', 'doc_id', 'filename', 'author', 'title',  'sentences', 'lemmatized_sentences', 'source', 'lemmata_source', 'not_before', 'not_after', 'tlg_epithet', 'genre', 'provenience', 'wordcount', 'lemmatacount', ]]

In [49]:
# Fill missing titles/authors with empty strings and bind the result back to
# LAGT. Calling fillna(..., inplace=True) on a column selection is chained
# assignment: under pandas Copy-on-Write it acts on a temporary object and
# leaves LAGT unchanged.
LAGT = LAGT.assign(
    title=LAGT["title"].fillna(""),
    author=LAGT["author"].fillna(""),
)


In [50]:
LAGT[LAGT["author"].str.contains("Septuagint")]

Unnamed: 0,author_id,doc_id,filename,author,title,sentences,lemmatized_sentences,source,lemmata_source,not_before,not_after,tlg_epithet,genre,provenience,wordcount,lemmatacount
282,tlg0527,tlg0527.tlg001,tlg0527.tlg001.opp-grc2.xml,Septuaginta,Genesis,[ἐν ἀρχῇ ἐποίησεν ὁ Θεὸς τὸν οὐρανὸν καὶ τὴν γ...,"[[ἀρχή, ποιέω, θεός, οὐρανός, γῆ], [γῆ, εἰμί, ...",glaux1,glaux1,-300.0,-101.0,[],,jewish,37943,14951
283,tlg0527,tlg0527.tlg002,tlg0527.tlg002.opp-grc2.xml,Septuaginta,Exodus,[ταῦτα τὰ ὀνόματα τῶν υἱῶν Ἰσραὴλ εἰσπεπορευμέ...,"[[ὄνομα, υἱός, Ἰσραήλ, εἰσπορεύω, Αἴγυπτος, Ἰα...",glaux1,glaux1,-300.0,-101.0,[],,jewish,28399,11279
284,tlg0527,tlg0527.tlg003,tlg0527.tlg003.opp-grc2.xml,Septuaginta,Leviticus,"[καὶ ἀνεκάλεσε Μωυσῆν, καὶ ἐλάλησε Κύριος αὐτῷ...","[[ἀνακαλέω, Μωϋσῆς, λαλέω, κύριος, σκηνή, μαρτ...",glaux1,glaux1,-300.0,-101.0,[],,jewish,21967,8479
285,tlg0527,tlg0527.tlg004,tlg0527.tlg004.opp-grc2.xml,Septuaginta,Numeri,[καὶ ἐλάλησε Κύριος πρὸς Μωυσῆν ἐν τῇ ἐρήμῳ τῇ...,"[[λαλέω, κύριος, Μωϋσῆς, ἐρῆμος, Σινά, σκηνή, ...",glaux1,glaux1,-300.0,-101.0,[],,jewish,29032,11768
286,tlg0527,tlg0527.tlg005,tlg0527.tlg005.opp-grc2.xml,Septuaginta,Deuteronomium,"[οὗτοι οἱ λόγοι, οὓς ἐλάλησε Μωυσῆς παντὶ Ἰσρα...","[[λόγος, λαλέω, Μωϋσῆς, Ἰσραήλ, Ἰορδάνης, ἐρῆμ...",glaux1,glaux1,-300.0,-101.0,[],,jewish,26149,9749
287,tlg0527,tlg0527.tlg006,tlg0527.tlg006.opp-grc2.xml,Septuaginta,Josue (Cod. Vaticanus + Cod. Alexandrinus),"[καὶ ἐγένετο μετὰ τὴν τελευτὴν Μωυσῆ, εἶπε Κύρ...","[[γίγνομαι, τελευτή, Μωϋσῆς, λέγω, κύριος, Ἰησ...",glaux1,glaux1,-300.0,-101.0,[],,jewish,16799,6854
288,tlg0527,tlg0527.tlg008,tlg0527.tlg008.opp-grc2.xml,Septuaginta,Judices (Cod. Alexandrinus),[Καί ἐγένετο μετά τήν τελευτήν Ἰησοῦ καί ἐπηρώ...,"[[γίγνομαι, τελευτή, ἰησοῦς, ἐπερωτάω, υἱός, ἰ...",glaux1,glaux1,-300.0,-101.0,[],,jewish,29420,10764
289,tlg0527,tlg0527.tlg010,tlg0527.tlg010.opp-grc2.xml,Septuaginta,Ruth,[καὶ ἐγένετο ἐν τῷ κρίνειν τοὺς κριτὰς καὶ ἐγέ...,"[[γίγνομαι, κρίνω, κριτής, γίγνομαι, λιμός, γῆ...",glaux1,glaux1,-300.0,-101.0,[],,jewish,2460,908
290,tlg0527,tlg0527.tlg011,tlg0527.tlg011.opp-grc2.xml,Septuaginta,Regnorum I (Samuelis I in textu Masoretico),"[ἄνθρωπος ἦν ἐξ Ἀρμαθαὶμ Σιφά, ἐξ ὄρους Ἐφραίμ...","[[ἄνθρωπος, εἰμί, Ἀρμαθαίμ, Σιφά, ὄρος, Ἐφραίμ...",glaux1,glaux1,-300.0,-101.0,[],,jewish,24247,9549
291,tlg0527,tlg0527.tlg012,tlg0527.tlg012.opp-grc2.xml,Septuaginta,Regnorum II (Samuelis II In Textu Masoretico),[καὶ ἐγένετο μετὰ τὸ ἀποθανεῖν Σαοὺλ καὶ Δαυὶδ...,"[[γίγνομαι, ἀποθνῄσκω, Σαῦλος, Δαυίδ, ἀναστρέφ...",glaux1,glaux1,-300.0,-101.0,[],,jewish,20459,8185


In [51]:
LAGT[LAGT["lemmata_source"]=="grecy"].sample(10)

Unnamed: 0,author_id,doc_id,filename,author,title,sentences,lemmatized_sentences,source,lemmata_source,not_before,not_after,tlg_epithet,genre,provenience,wordcount,lemmatacount
813,tlg2200,tlg2200.tlg00512,tlg2200.tlg00512.opp-grc1.xml,Libanius,Declamatio 12,"[Χαλεπόν τό ζήτημα., δύο γάρ ἐναντία ἀλλή- λοι...","[[χαλεπός, ζήτημα], [ἐναντίος, ἀλλή, λος, ἦθος...",1Kgr,grecy,301.0,400.0,"[Rhetorici, Sophistae]",,pagan,7265,2875
891,tlg4016,tlg4016.tlg003,tlg4016.tlg003.1st1K-grc1.xml,Ammonius,In Aristotelis Librum De Interpretatione Comme...,"[Α, ,, , Πολύ μέν ἐν σοφοῖσι κοὐκ ἀνώνυμον τό ...","[[], [], [πολύς, σοφός, ἀνώνυμος, ἑρμηνεία, ἀρ...",1Kgr,grecy,401.0,500.0,[Philosophici/-ae],,pagan,154056,53815
740,tlg2057,tlg2057.tlg002,tlg2057.tlg002.1st1K-grc1.xml,"Socrates, Scholasticus",Historia Ecclesiastica,"[Τό τοῦ βιβλίου προοίμιον., Εὐσέβιος ὁ Παμφίλο...","[[βιβλίον, προοίμιον], [εὐσέβιος, παίφμλος, ὅλ...",1Kgr,grecy,301.0,500.0,[Historici/-ae],,christian,120491,51190
760,tlg2200,tlg2200.tlg00409,tlg2200.tlg00409.opp-grc1.xml,Libanius,Oratio 9,[Ἡ μέν οὖν ἑορτή καί αὐτή προσάγει τό αὑτῆς εὖ...,"[[ἑορτή, αὐτός, προσάγω, ποιήσω], [πρότερος, π...",1Kgr,grecy,301.0,400.0,"[Rhetorici, Sophistae]",,pagan,1236,498
750,tlg2189,tlg2189.tlg003,tlg2189.tlg003.1st1K-grc1.xml,Callinicus,Fragmenta,[Οὔτε ἡ καθ’ ὑμᾶς γῆ πρός ἕτερον οὐδέν εὐαρμόσ...,"[[γῆ, ἕτερος, ἔχω, βασιλεία, βασιλεία, χωρίον,...",1Kgr,grecy,201.0,300.0,[Sophistae],,pagan,185,86
680,tlg2035,tlg2035.tlg003,tlg2035.tlg003.1st1K-grc1.xml,Athanasius of Alexandria,De decretis Nicaenae synodi (Chapters 41 and 42),"[Κατά Εὐσεβίου καί Θεογνίου, ., », κωνσταντινό...","[[εὐσεβῖος, θεογνῖος], [], [], [κωνσταντινός, ...",1Kgr,grecy,301.0,400.0,[Theologici],,christian,1485,615
1693,tlg2003,tlg2003.tlg002,tlg2003.tlg002.perseus-grc1.xml,Julian the Emperor,Panegyric on the Empress Eusebia,"[* / )/ \ \ ) / / \ / / ,, ), / \ / ) \ ) / , ...","[[/, \, \, \, /, /], [], [\, /, \, /, \, /, \,...",perseus,grecy,301.0,400.0,[Philosophici/-ae],,pagan,8697,3972
836,tlg2200,tlg2200.tlg00535,tlg2200.tlg00535.opp-grc1.xml,Libanius,Declamatio 35,"[τῷ μέν δήμῳ πολλή χάρις,, ὦ βουλή, μηδ’ ὑπό τ...","[[δῆμος, πολύς, χάρις], [βουλή, πάρειμι, συμφο...",1Kgr,grecy,301.0,400.0,"[Rhetorici, Sophistae]",,pagan,4204,1699
913,tlg4089,tlg4089.tlg003,tlg4089.tlg003.opp-grc1.xml,Theodoretus,Historia ecclesiastica,[Τάδε ἔνεστιν ἐν τῷ πρώτῳ τόμῳ τῆς Θεοδωρήτου ...,"[[ὅδε, ἔνειμι, πρῶτος, τόμος, θεοδωρήτης, ἐκκλ...",1Kgr,grecy,301.0,500.0,"[Scriptores Ecclesiastici, Theologici]",,christian,119273,44883
898,tlg4019,tlg4019.tlg003,tlg4019.tlg003.1st1K-grc1.xml,Olympiodorus,In Aristotelis meteora commentaria,"[Πρᾶξις α . ., Περί μέν οὖν τῶν πρώτων αἰτίων ...","[[πρᾶξις], [πρῶτος, αἰτέω, φύσις], [προοίμιον,...",1Kgr,grecy,501.0,600.0,[Philosophici/-ae],,pagan,145277,53704


In [52]:
LAGT.to_parquet("../data/large_files/LAGT_grecy.parquet")

In [None]:
# save metadata for future usage
LAGT[['author_id', 'doc_id', 'filename', 'author', 'title', 'source', 'lemmata_source', 'not_before',
       'not_after', 'tlg_epithet', 'genre', 'provenience', 'wordcount',
       'lemmatacount']].to_csv("../data/LAGT_v4-0_metadata.csv", index=False)

### From sents_data pickles to jsons

In [1]:
source_path = "../data/large_files/sents_data/"
target_path = "../data/large_files/sents_data_jsons/"
# Create the JSON output directory (and any missing parents). An existing
# directory is fine; any other failure is raised instead of silently ignored.
os.makedirs(target_path, exist_ok=True)

In [114]:
fn = "tlg0527.tlg048.pickle"
with open(source_path + fn, "rb") as f:
    sents_data = pickle.load(f)
sents_data[:10]

[('0527-048',
  0,
  'ὅρασις, ἣν εἶδεν Ἡσαΐας υἱὸς Ἀμώς, ἣν εἶδε κατὰ τῆς Ἰουδαίας καὶ κατὰ Ἱερουσαλὴμ ἐν βασιλεία Ὀζίου καὶ Ἰωάθαμ καὶ Ἄχαζ καὶ Ἐζεκίου, οἵ ἐβασίλευσαν τῆς Ἰουδαίας. E',
  [('ὅρασις', 'ὅρασις', 'n', (0, 6)),
   (',', ',', 'u', (6, 7)),
   ('ἣν', 'ὅς', 'p', (8, 10)),
   ('εἶδεν', 'ὁράω', 'v', (11, 16)),
   ('Ἡσαΐας', 'Ἡσαΐας', 'n', (17, 23)),
   ('υἱὸς', 'υἱός', 'n', (24, 28)),
   ('Ἀμώς', 'Ἀμώς', 'n', (29, 33)),
   (',', ',', 'u', (33, 34)),
   ('ἣν', 'ὅς', 'p', (35, 37)),
   ('εἶδε', 'ὁράω', 'v', (38, 42)),
   ('κατὰ', 'κατά', 'r', (43, 47)),
   ('τῆς', 'ὁ', 'l', (48, 51)),
   ('Ἰουδαίας', 'Ἰουδαία', 'n', (52, 60)),
   ('καὶ', 'καί', 'c', (61, 64)),
   ('κατὰ', 'κατά', 'r', (65, 69)),
   ('Ἱερουσαλὴμ', 'Ἱεροσόλυμα', 'n', (70, 80)),
   ('ἐν', 'ἐν', 'r', (81, 83)),
   ('βασιλεία', 'βασιλεία', 'n', (84, 92)),
   ('Ὀζίου', 'Ὄζιος', 'n', (93, 98)),
   ('καὶ', 'καί', 'c', (99, 102)),
   ('Ἰωάθαμ', 'Ἰωάθαμ', 'n', (103, 109)),
   ('καὶ', 'καί', 'c', (110, 113)),
   ('Ἄχαζ', '

In [4]:
# Maps upper-case POS labels onto one-letter codes. The one-letter codes map
# onto themselves, so sentence data that was already converted passes through
# unchanged.
reformat_tags_dict = {
    "NOUN": "n",
    "VERB": "v",
    "ADJ": "a",
    "ADV": "r",
    "PRON": "p",
    "DET": "l",
    "ADP": "r",
    "CCONJ": "c",
    "SCONJ": "c",
    "PROPN": "n",
    "PUNCT": "u",
    "n" : "n",
    "v": "v",
    "a": "a",
    "r": "r",
    "p": "p",
    "l": "l",
    "c": "c",
    "u": "u" # Assuming punctuation remains "u"
}

def reformat_tags(tag):
    """Return the one-letter code for `tag`, or "x" when the tag has no mapping.

    Any value missing from `reformat_tags_dict` (for example "AUX" or None) and
    any unhashable value yields "x".
    """
    try:
        return reformat_tags_dict[tag]
    except (KeyError, TypeError):
        # KeyError: unmapped tag; TypeError: unhashable value such as a list.
        return "x"
    
# Convert every sentence-data pickle in `source_path` into a JSON file with the
# same base name in `target_path`, normalising each token's POS tag on the way.
for fn in os.listdir(source_path):
    if not fn.endswith(".pickle"):
        # Skip stray directory entries (hidden files, checkpoint folders, ...)
        # that cannot be unpickled.
        continue
    doc_id =  fn.rpartition(".")[0]
    # NOTE: pickle.load can execute arbitrary code; only read pickles written
    # by this pipeline.
    with open(os.path.join(source_path, fn), "rb") as f:
        sents_data = pickle.load(f)
    sents_data_updated = []
    # The id stored inside the pickle is replaced by the one derived from the
    # file name (the stored id is not used).
    for _stored_id, sent_n, sent_text, sent_data in sents_data:
        sent_data_updated = [(t[0], t[1], reformat_tags(t[2]), t[3]) for t in sent_data]
        sents_data_updated.append((doc_id, sent_n, sent_text, sent_data_updated))
    with open(os.path.join(target_path, doc_id + ".json"), "w") as f:
        json.dump(sents_data_updated, f)

In [5]:

fn = "tlg2640.tlg001.json"
with open(target_path + fn, "rb") as f:
    sents_data = json.load(f)
sents_data[:10]

[['tlg2640.tlg001', 0, '.', [['.', '.', 'u', [0, 1]]]],
 ['tlg2640.tlg001',
  1,
  'Ἐχθρός γενοίμην μηδενός, φίλος δέ τοῦ αἰεί καί παραμενέοντος·',
  [['Ἐχθρός', 'ἐχθρός', 'a', [0, 6]],
   ['γενοίμην', 'γίγνομαι', 'v', [7, 15]],
   ['μηδενός', 'μηδείς', 'a', [16, 23]],
   [',', ',', 'u', [23, 24]],
   ['φίλος', 'φίλος', 'n', [25, 30]],
   ['δέ', 'δέ', 'r', [31, 33]],
   ['τοῦ', 'ὁ', 'l', [34, 37]],
   ['αἰεί', 'ἀεί', 'r', [38, 42]],
   ['καί', 'καί', 'c', [43, 46]],
   ['παραμενέοντος', 'παραμενέω', 'v', [47, 60]],
   ['·', '·', 'u', [60, 61]]]],
 ['tlg2640.tlg001',
  2,
  'καί μήκοτε μέν διενεχθείην πρός τούς οἰκειοτάτους,',
  [['καί', 'καί', 'c', [0, 3]],
   ['μήκοτε', 'μήκοτε', 'r', [4, 10]],
   ['μέν', 'μέν', 'r', [11, 14]],
   ['διενεχθείην', 'διενεχθείην', 'v', [15, 26]],
   ['πρός', 'πρός', 'r', [27, 31]],
   ['τούς', 'ὁ', 'l', [32, 36]],
   ['οἰκειοτάτους', 'οἰκεῖος', 'a', [37, 49]],
   [',', ',', 'u', [49, 50]]]],
 ['tlg2640.tlg001',
  3,
  'διενεχθείς δέ διαλλαχθείην ὡς τάχισ

In [20]:
LAGT = pd.read_parquet("../data/large_files/LAGT_v4-0.parquet")
LAGT.head()


Unnamed: 0,author_id,doc_id,filename,author,title,sentences,lemmatized_sentences,source,lemmata_source,not_before,not_after,tlg_epithet,genre,provenience,wordcount,lemmatacount
0,ogl0001,ogl0001.ogl001,ogl0001.ogl001.1st1K-grc1.xml,Pinytus,De Epistola Pinyti ad Dionysium,"[, , ., . . . . ., — πρός ἥν ( ,, ), ὁ Πινυτός...","[[], [], [], [], [πινυτός, ἀντιγράφω], [θαυμάζ...",glaux1,glaux1,101.0,200.0,[],,christian,109,34
1,tlg0005,tlg0005.tlg003,tlg0005.tlg003.1st1K-grc1.xml,Theocritus,Syrinx,[Οὐδενός εὐνάτειρα Μακροπτολέμοιο δέ μάτηρ μαί...,"[[εὐνητήρ, μακροπτολέμον, μήτηρ, μαῖα, ἀντιπέτ...",glaux1,glaux1,-400.0,-201.0,[Bucolici],,pagan,95,59
2,tlg0006,tlg0006.tlg020,tlg0006.tlg020.1st1K-grc1.xml,Euripides,Fragmenta,[ποίαν σε φῶμεν γαῖαν ἐκλελοιπότα πόλει ξενοῦσ...,"[[φημί, γῆ, ἐκλείπω, πόλις, ξενοῦσθαι], [πάτρα...",glaux1,glaux1,-500.0,-401.0,[Tragici],,pagan,21516,10315
3,tlg0007,tlg0007.tlg146,tlg0007.tlg146.1st1K-grc1.xml,Plutarch,Παροιμίαι αἷς Ἀλεξανδρεῖς ἐχρῶντο,[Οἴκοι τά Μιλήσια: ἐπί τῶν ὅποι μή προςήκει τή...,"[[μιλήσιος, πργοςήκω, τρυφή, ἐπιδείκνυμι], [ἀρ...",glaux1,glaux1,1.0,200.0,"[Biographi, Philosophici/-ae]",,pagan,3206,1496
4,tlg0007,tlg0007.tlg147,tlg0007.tlg147.1st1K-grc1.xml,Plutarch,Ἐκλογὴ περὶ τῶν ἀδυνάτων,"[Κατά πετρῶν σπείρεις., Πλίνθον πλύνεις., Δικτ...","[[πέτρα, σπείρω], [πλίνθος, πλύνω], [δίκτυον, ...",glaux1,glaux1,1.0,200.0,"[Biographi, Philosophici/-ae]",,pagan,195,125


In [21]:
path = "../data/large_files/sents_data/"
# List the directory once and use a set for the membership test; previously the
# directory was re-listed for every row.
available_pickles = set(os.listdir(path))
# Rows whose sentence-data pickle is missing from `path`.
LAGT[LAGT["doc_id"].apply(lambda x: f"{x}.pickle" not in available_pickles)]

Unnamed: 0,author_id,doc_id,filename,author,title,sentences,lemmatized_sentences,source,lemmata_source,not_before,not_after,tlg_epithet,genre,provenience,wordcount,lemmatacount
339,tlg0530,tlg0530.tlg006,tlg0530.tlg006.1st1K-grc1.xml,Pseudo-Galen,Ad Gaurum quomodo animetur fetus,[],[],1Kgr,grecy,201.0,,[Medici],,pagan,0,0


In [22]:
len(set(LAGT["doc_id"]))

1959

### Backup of an old approach...

In [21]:
# put the string cleaning, doc creation and lemmata together into one function
def from_string_to_lemsents(string, segment_len=50000):
    """Clean a raw text, parse it and return its lemmatized sentences.

    The text is passed through ``clean_string``, turned into a doc by
    ``get_doc`` and handed to ``get_lemmatized_sentences`` (all three are
    helpers defined elsewhere in this notebook).

    Parameters
    ----------
    string : str
        Raw text of one document.
    segment_len : int, optional
        Forwarded to ``get_doc`` as its ``segment_len`` argument; defaults to
        the previously hard-coded value of 50000.

    Returns
    -------
    The value returned by ``get_lemmatized_sentences`` (presumably a list of
    lemma lists, one per sentence -- confirm against that helper), or ``None``
    when any step of the pipeline raises.
    """
    try:
        doc = get_doc(clean_string(string), segment_len=segment_len)
        lemmatized_sentences = get_lemmatized_sentences(doc)
    except Exception:
        # Best effort: a failing document is marked with None so the corpus-wide
        # run can continue. `Exception` (rather than a bare `except`) keeps
        # KeyboardInterrupt/SystemExit working, so a long run can still be stopped.
        lemmatized_sentences = None
    return lemmatized_sentences

In [27]:
# doc_ids of the documents that had to be lemmatized in this session
grecy_lemmatized_list = []
def get_grecy_lemmata(string, lemmatized_sentences, doc_id):
    """Return existing lemmatized sentences, computing them only when missing.

    Parameters
    ----------
    string : str
        Raw text of the document; only used when lemmata are missing.
    lemmatized_sentences : object or None
        Lemmata already available for the document, or ``None`` if there are none.
    doc_id : str
        Identifier of the document, recorded and printed for progress tracking.

    Returns
    -------
    ``lemmatized_sentences`` unchanged when it is not ``None``; otherwise the
    result of ``from_string_to_lemsents(string)`` (which may itself be ``None``).

    Side effects
    ------------
    For every document that is lemmatized here, ``doc_id`` is appended to the
    module-level ``grecy_lemmatized_list`` and a progress line (running count,
    doc_id, length of ``string``) is printed.
    """
    # Identity check: `== None` would call the value's own __eq__, which is
    # ambiguous (and raises inside `if`) for array-like cell values.
    if lemmatized_sentences is None:
        lemmatized_sentences = from_string_to_lemsents(string)
        grecy_lemmatized_list.append(doc_id)
        print(len(grecy_lemmatized_list), doc_id, len(string))
    return lemmatized_sentences

#sample_lemmatized_sentences = LAGT.sample(10, random_state=1).apply(lambda row: get_grecy_lemmata(row["string"], row["lemmatized_sentences"], row["doc_id"]), axis=1)

In [28]:
#sample_lemmatized_sentences

In [29]:
#LAGT.loc[sample_lemmatized_sentences.index]

In [30]:
#grecy_lemmatized_list

# Applying the main function

In [31]:
%%time
# Time consuming on the full corpus: get_grecy_lemmata is called once per row.
# Rows that already carry lemmatized sentences are passed through unchanged; rows
# where the value is None are lemmatized and reported with a progress line
# (running count, doc_id, length of the string).
LAGT["lemmatized_sentences"] = LAGT.apply(lambda row: get_grecy_lemmata(row["string"], row["lemmatized_sentences"], row["doc_id"]), axis=1)

1 ggm0001.ggm001 3857
2 ogl0001.ogl001 1257
3 stoa0033a.tlg028 41978
4 stoa0033a.tlg043 20816
5 stoa0121.stoa001 171855
6 stoa0146d.stoa001 27696
7 tlg0005.tlg003 572
8 tlg0006.tlg020 108623
9 tlg0007.tlg146 18130
10 tlg0007.tlg147 1069
11 tlg0018.tlg001 143932
12 tlg0018.tlg002 260468
13 tlg0018.tlg003 76774
14 tlg0018.tlg004 141567
15 tlg0018.tlg005 96145
16 tlg0018.tlg006 97880
17 tlg0018.tlg007 32667
18 tlg0018.tlg008 94218
19 tlg0018.tlg009 92533
20 tlg0018.tlg010 89337
21 tlg0018.tlg011 108351
22 tlg0018.tlg012 33173
23 tlg0018.tlg013 93899
24 tlg0018.tlg014 110248
25 tlg0018.tlg015 172311
26 tlg0018.tlg016 90855
27 tlg0018.tlg017 114022
28 tlg0018.tlg018 114895
29 tlg0018.tlg019 246616
30 tlg0018.tlg020 151034
31 tlg0018.tlg021 139575
32 tlg0018.tlg022 365437
33 tlg0018.tlg023 95569
34 tlg0018.tlg024 642298
35 tlg0018.tlg025 173969
36 tlg0018.tlg026 99048
37 tlg0018.tlg027 107520
38 tlg0018.tlg028 66001
39 tlg0018.tlg029 118642
40 tlg0018.tlg030 91632
41 tlg0018.tlg031 186072
42

In [39]:
# Rows for which no lemmatized sentences are available yet.
LAGT.loc[LAGT["lemmatized_sentences"].isna()]

Unnamed: 0,author_id,doc_id,filename,author,title,string,wordcount,source,lemmatized_sentences,lemmata_source
411,tlg0530,tlg0530.tlg006,tlg0530.tlg006.1st1K-grc1.xml,Pseudo-Galen,Ad Gaurum quomodo animetur fetus,Τὸ περὶ τῆς εἰς τὰ σώματα τῶν ψυχῶν εἰσκρίσεως...,9595,1Kgr,,
824,tlg2042,tlg2042.tlg009,tlg2042.tlg009.opp-grc1.xml,Origenes,In Jeremiam (Homiliae 1-11),\n Ὁ θεὸς εἰς ἀγαθοποιίαν πρόχειρός\n ...,36685,1Kgr,,
834,tlg2042,tlg2042.tlg021,tlg2042.tlg021.opp-grc1.xml,Origenes,In Jeremiam (Homiliae 12-20),"Ὃ προστάσσεται ὁ προφήτης λέγειν ὑπὸ θεοῦ, ὀφε...",48360,1Kgr,,


In [42]:
# Index labels of the rows that still lack lemmatized sentences.
missing_i = LAGT.loc[LAGT["lemmatized_sentences"].isnull()].index

In [64]:
# Pick the second of the still-missing documents as a test case.
string = LAGT.loc[LAGT["lemmatized_sentences"].isnull(), "string"].iloc[1]

In [65]:
# Sanity check: inspect the Python type of the selected text.
type(string)

str

In [66]:
# Run the notebook's clean_string helper on the selected text.
cleaned_string = clean_string(string)

In [67]:
# Collapse any run of three or more dots into a plain "...".
# Raw string: "\." in a normal string literal is an invalid escape sequence and
# triggers a warning on current Python versions; the pattern itself is unchanged.
cleaned_string = re.sub(r"\.{3}\.*", "...", cleaned_string)

In [68]:
# Parse the whole cleaned text with the spaCy pipeline in a single call
# (nlp is applied directly here; get_doc is not used).
doc = nlp(cleaned_string)

In [69]:
def dealing_with_missing(string):
    """Lemmatize a document that the main pipeline left without lemmata.

    The input is coerced to ``str``, cleaned with ``clean_string``, runs of
    three or more dots are collapsed to ``"..."``, and the result is parsed by
    ``nlp`` in a single call before ``get_lemmatized_sentences`` is applied.

    Parameters
    ----------
    string : object
        Raw text of the document; converted with ``str()`` first.

    Returns
    -------
    The value returned by ``get_lemmatized_sentences`` for the parsed doc.
    Unlike ``from_string_to_lemsents`` this function does not catch errors.
    """
    string = str(string)
    cleaned_string = clean_string(string)
    # Raw string: "\." in a normal string literal is an invalid escape sequence
    # and triggers a warning on current Python versions; the pattern is unchanged.
    cleaned_string = re.sub(r"\.{3}\.*", "...", cleaned_string)
    doc = nlp(cleaned_string)
    lemmatized_sentences = get_lemmatized_sentences(doc)
    return lemmatized_sentences

# Lemmatize the texts of all rows that still have no lemmatized sentences.
lemmatized_missing = LAGT.loc[LAGT["lemmatized_sentences"].isnull(), "string"].apply(dealing_with_missing)

In [70]:
# Display the lemmata recovered for the previously failing documents.
lemmatized_missing

411    [[σῶμα, ψυχή, εἰσκρίσις, ζῳογονία, δόγμα, πολύ...
824    [[θεός, ἀγαθοποιία, πρόχειρος, κολάζω, ἄξιος, ...
834    [[προστάσσω, προφήτης, λέγω, θεός], [ὀφείλω, ἄ...
Name: string, dtype: object

In [71]:
# Write the recovered lemmata back into the rows that were null.
# missing_i and lemmatized_missing were both selected with the same
# isnull() mask on "lemmatized_sentences".
LAGT.loc[missing_i, "lemmatized_sentences"] = lemmatized_missing

In [72]:
# Spot-check ten rows; the fixed random_state keeps the sample repeatable.
LAGT.sample(10, random_state=1)

Unnamed: 0,author_id,doc_id,filename,author,title,string,wordcount,source,lemmatized_sentences,lemmata_source
1532,tlg0031,tlg0031.tlg008,tlg0031.tlg008.perseus-grc2.xml,,New Testament - 2 Corinthians,\n\n ΠΑΥΛΟΣ ἀπόστολος Χριστ...,4470,perseus,"[[Παῦλος, ἀπόστολος, Χριστός, Ἰησοῦς, θέλημα, ...",morphgnt
1619,tlg0060,tlg0060.tlg001,tlg0060.tlg001.perseus-grc6.xml,Diodorus Siculus,Βιβλιοθήκη Ἱστορική (Books 18-20),τάδε ἔνεστιν ἐν τῇ ὀκτωκαιδεκάτῃ τῶν Διοδώρου...,79283,perseus,"[[θεός, δισσός, παλαιός, ἄνθρωπος, μεταγενής, ...",glaux
581,tlg0732,tlg0732.tlg013,tlg0732.tlg013.1st1K-grc1.xml,Alexander of Aphrodisias,Ἠθικὰ προβλήματα [Sp.],α. Ἀπορίαι πρὸς τοὺς τὸ ζῆν οὐκ ἀγαθὸν\n ...,23343,1Kgr,"[[Ἀπορίαις, ζῶ, ἀγαθός, λέγω], [ὁμοειδέω, ἡδον...",
474,tlg0591,tlg0591.1st1K001,tlg0591.1st1K001.1st1K-grc1.xml,Antisthenes,Αἴας,Ἐβουλόμην ἂν τοὺς αὐτοὺς ἡμῖν δικάζειν οἵπερ \...,502,1Kgr,"[[Ἐβουλόμης, αὐτός, δικάζω, πρᾶγμα, πάρειμι], ...",
704,tlg1699,tlg1699.tlg004,tlg1699.tlg004.1st1K-grc1.xml,Teles Megarenesis,Περὶ συγκρίσεως πενίας καὶ πλούτου,Δοκεῖ μοι ἡ τῶν χρημάτων κτῆσις σπάνεως καὶ ἐν...,1195,1Kgr,"[[δοκέω, χρῆμα, κτῆσις, σπάνις, ἔνδεια, ἀπολύω...",
274,tlg0087,tlg0087.tlg013,tlg0087.tlg013.1st1K-grc1.xml,Herodianus,Περὶ κλίσεως ὀνομάτων,"1. Anecd.Ox.IV 333, 6: Ἐπιτομὴ τῶν ὀνοματικῶν ...",59506,1Kgr,"[[Ἐπιτομή, ὀνοματικός, κανών, Ἡρωδιανός], [ὅμο...",
200,tlg0066,tlg0066.tlg001,tlg0066.tlg001.1st1K-grc1.xml,Ps. Dicaearchus,"Dicaearchi, ut fertur, potius vero Athenaei De...",Ἐντεῦθεν εἰς τὸ Ἀθηναίων ἔπεισιν \n\n ...,2757,1Kgr,"[[Ἀθηναῖος, ἔπειμι, ἄστυ], [ὁδός, ἡδύς, γεωργο...",
652,tlg1383,tlg1383.tlg001,tlg1383.tlg001.1st1K-grc1.xml,Geminus,Elementa astronomia,"Ὁ τῶν ζῳδίων κύκλος διαιρεῖται εἰς μέρη ιβ′, ...",20574,1Kgr,"[[ζωίδιον, κύκλος, διαιρέω, μέρος, καλέω, ἕκασ...",
1107,tlg4102,tlg4102.tlg037,tlg4102.tlg037.1st1K-grc1.xml,Catenae (Novum Testamentum),Catena In Epistulam Ad Philemonem,ΑΝΗΡ τις ἦν Φιλήμων τῶν πιστῶν καὶ γενναίων ἀν...,2185,1Kgr,"[[Ἀνήρ, Φιλήμων, πιστός, γενναῖος, ἀνήρ], [οὗτ...",
1283,tlg0007,tlg0007.tlg089,tlg0007.tlg089.perseus-grc2.xml,Plutarch,Περὶ Ἴσιδος και Ὀσίριδος,"πάντα μέν, ὦ Κλέα, δεῖ τἀγαθὰ τοὺς νοῦν ἔχοντα...",18332,perseus,"[[Κλέα, δεῖ, ἀγαθός, νόος, ἔχω, αἰτέω, θεός], ...",


In [74]:
# Label every row whose lemmata_source is None as "grecy";
# all other values are kept as they are.
LAGT["lemmata_source"] = LAGT["lemmata_source"].map(
    lambda source: source if source is not None else "grecy"
)

# Simple explorations of what is in the lemmata

In [83]:
# Flatten each document's sentences into one list of lemmata per document ...
lemmata_series = LAGT["lemmatized_sentences"].apply(
    lambda sentences: [lemma for sentence in sentences for lemma in sentence]
)
# ... then pool the lemmata of all documents into a single flat list.
lemmata_all = [lemma for doc_lemmata in lemmata_series for lemma in doc_lemmata]
# The 100 most frequent lemmata across the whole corpus.
nltk.FreqDist(lemmata_all).most_common(100)

[('οὗτος', 364348),
 ('λέγω', 273729),
 ('εἰμί', 229129),
 ('αὐτός', 219506),
 ('γίγνομαι', 173868),
 ('ἔχω', 166248),
 ('πολύς', 135222),
 ('φημί', 116779),
 ('πᾶς', 111288),
 ('ἄλλος', 106778),
 ('ποιέω', 97082),
 ('λόγος', 93057),
 ('τις', 92160),
 ('τίς', 89570),
 ('θεός', 67982),
 ('μέγας', 62129),
 ('ἐκεῖνος', 59461),
 ('ἄνθρωπος', 58719),
 ('πρῶτος', 55821),
 ('ἕτερος', 52649),
 ('οὐδείς', 49900),
 ('πόλις', 49567),
 ('σῶμα', 48549),
 ('τοιοῦτος', 48415),
 ('λαμβάνω', 48313),
 ('ἀγαθός', 47307),
 ('ὁράω', 47064),
 ('φύσις', 45476),
 ('μόνος', 44592),
 ('ἀρχή', 42234),
 ('δοκέω', 41371),
 ('ἀνήρ', 41105),
 ('δύναμαι', 38233),
 ('ἕκαστος', 37620),
 ('δύναμις', 37101),
 ('δίδωμι', 36702),
 ('καλέω', 36355),
 ('ψυχή', 35405),
 ('γῆ', 35381),
 ('χρόνος', 34719),
 ('μέρος', 34201),
 ('ὑπάρχω', 33759),
 ('κινέω', 32630),
 ('δέω', 32382),
 ('ἅπας', 31728),
 ('βασιλεύς', 31257),
 ('οἶδα', 29574),
 ('ὅλος', 28608),
 ('τόπος', 28354),
 ('βούλομαι', 28263),
 ('χράω', 27548),
 ('ἡμέρα', 2723

In [82]:
# Same frequency list, restricted to documents whose lemmata_source is "grecy".
lemmata_series = LAGT.loc[LAGT["lemmata_source"] == "grecy", "lemmatized_sentences"].apply(
    lambda sentences: [lemma for sentence in sentences for lemma in sentence]
)
# Pool the per-document lemma lists into a single flat list.
lemmata_all = [lemma for doc_lemmata in lemmata_series for lemma in doc_lemmata]
# The 100 most frequent lemmata in this subset.
nltk.FreqDist(lemmata_all).most_common(100)

[('οὗτος', 203072),
 ('λέγω', 191259),
 ('γίγνομαι', 105697),
 ('ἔχω', 103927),
 ('φημί', 81432),
 ('πολύς', 70140),
 ('τίς', 66012),
 ('λόγος', 62317),
 ('ἄλλος', 57911),
 ('ποιέω', 56310),
 ('πᾶς', 48551),
 ('θεός', 43913),
 ('ἄνθρωπος', 39751),
 ('αὐτός', 37861),
 ('μόνος', 35407),
 ('πρῶτος', 34811),
 ('μέγας', 33667),
 ('ἕτερος', 33187),
 ('σῶμα', 32234),
 ('φύσις', 31748),
 ('ἐκεῖνος', 31561),
 ('λαμβάνω', 29963),
 ('ὁράω', 28237),
 ('ψυχή', 27962),
 ('κινέω', 27778),
 ('ἀγαθός', 27685),
 ('ἀρχή', 27153),
 ('δεῖ', 26878),
 ('δύναμαι', 24669),
 ('τοιοῦτος', 23815),
 ('οὐδείς', 23610),
 ('ὑπάρχω', 23510),
 ('γῆ', 22548),
 ('δοκέω', 22514),
 ('ἕκαστος', 22488),
 ('εἶδος', 22412),
 ('πόλις', 22142),
 ('χρόνος', 22057),
 ('μέρος', 21609),
 ('δύναμις', 21541),
 ('δείκνυμι', 21349),
 ('τόπος', 21248),
 ('καλέω', 21223),
 ('δίδωμι', 20088),
 ('ς', 19563),
 ('ἀνήρ', 19097),
 ('ὅλος', 19077),
 ('οὐσία', 18602),
 ('γένος', 18407),
 ('συμβαίνω', 17823),
 ('πατήρ', 17081),
 ('ὄνομα', 17048),


In [75]:
# Save the complete table, including the new lemmata, as a local JSON file.
LAGT.to_json("../data/large_files/LAGT_grecy_20240116.json")

In [76]:
# NOTE(review): the session line below is commented out, so this cell relies on
# an `s` session object that already exists in the kernel; on a fresh kernel the
# write_file call fails unless `s` is created in an earlier cell -- confirm.
#s = sddk.cloudSession("sciencedata.dk", "SDAM_root", "648597@au.dk")
# Upload the same DataFrame to the shared cloud folder under the same file name.
s.write_file("SDAM_data/AGT/LAGT_grecy_20240116.json", LAGT)

A file with the same name ("LAGT_grecy_20240116.json") already exists in this location.
Your <class 'pandas.core.frame.DataFrame'> object has been succesfully written as "https://sciencedata.dk/sharingout/648597%40au.dk/SDAM_root/SDAM_data/AGT/LAGT_grecy_20240116.json"
