In [1]:
import pandas as pd
import spacy
from spacy.tokens import Doc
import sddk
import re
import regex
import unicodedata
from greek_accentuation.characters import strip_accents
from greek_accentuation.syllabify import *
from greek_accentuation.accentuation import *
import nltk

In [2]:
#! ../lagt_venv/bin/python -m pip install grecy
#! ../lagt_venv/bin/python -m grecy install grc_proiel_trf
nlp = spacy.load('grc_proiel_trf')

In [3]:
doc = nlp('δοκῶ μοι περὶ ὧν πυνθάνεσθε οὐκ ἀμελέτητος εἶναι')

In [4]:
# Smoke test of the loaded pipeline: parse one short sentence and list every
# token together with its lemma and coarse part-of-speech tag.
sample_sentence = 'δοκῶ μοι περὶ ὧν πυνθάνεσθε οὐκ ἀμελέτητος εἶναι'
doc = nlp(sample_sentence)
for token in doc:
    print(token.text, token.lemma_, token.pos_)

δοκῶ δοκέω VERB
μοι ἐγώ PRON
περὶ περί ADP
ὧν ὅς PRON
πυνθάνεσθε πυνθάνομαι VERB
οὐκ οὐ ADV
ἀμελέτητος ἀμελέτητος ADJ
εἶναι εἰμί AUX


In [5]:
LAGT = pd.read_json("../data/large_files/LAGT_treebanks_20240116.json")

### greCy Test

In [6]:
string = LAGT[LAGT["lemmatized_sentences"].isnull()]["string"].tolist()[0]

In [7]:
# Parse the test document and print, sentence by sentence, the lemmata of the
# content words only (nouns, adjectives, verbs and proper nouns).
content_pos = ["NOUN", "ADJ", "VERB", "PROPN"]
doc = nlp(string)
for sentence in doc.sents:
    content_lemmata = []
    for token in sentence:
        if token.pos_ in content_pos:
            content_lemmata.append(token.lemma_)
    print(content_lemmata)

['Χρὴ', 'γιγνώσκω', 'γῆ', 'περίμετρος']
['στάδιος', 'μυριάς']
['μῆκος']
['ἡμέτερος', 'οἰκουμένη', 'στόμα', 'Γάγγης', 'Γαδείρων', 'στάδιον']
[]
['πλάτος', 'Αἰθιοπικός', 'θάλασσα', 'Τανάϊδος', 'ποταμός', 'στάδιον']
['Εὐφράτης', 'Τίγρις', 'ποταμός', 'καλέω', 'Μεσοποτάμιος', 'διάστημα', 'ἔχω', 'στάδιον']
[]
['ἀναμέτρησις', 'ποιέω', 'Ἐρατοσθένης', 'ἀρχαῖος', 'μαθητικώτατος']
[]
['Βυζάντιον', 'Σωσθένιος', 'στάδιον', 'μίλιον', 'ἥμισυς']
[]
['Σωσθενίας', 'Ἱερόν', 'στάδιον', 'μίλιον']
['ἥμισυς']
['πᾶς', 'μίλιον']
[]
['Ἱερόν', 'Ζεύς', 'Οὐρῖος', 'στόμα', 'Πόντος', 'ἱερόν', 'στόμα', 'Ἴστρος', 'ποταμός', 'στάδιον', 'μίλιον', 'ἥμισυς']
[]
['Ἱερόν', 'Ζεύς', 'Οὐρίης', 'Βορυσθένης', 'ποταμός', 'Δανάπρις', 'καλέω', 'στάδιον']
['μίλιον']
[]
['ἥμισυς']
['Ἱερόν', 'Ζεύς', 'Οὐρίης', 'Πορθμῖος', 'πόλις', 'τέλος', 'Εὐρώπη', 'Πόντος', 'μέρος', 'στομίον', 'Μαιώτης', 'λίμνη', 'Βόσπορος', 'Κιμμερῖος', 'καλέω', 'στάδιον']
['μίλιον']
[]
['λέγω', 'Εὐρώπη', 'Ποντικός', 'περίπλοος', 'ἴσος', 'περίπλοος', 'Ἀσία', 'μέρος'

Let's improve it: pre-clean the string and make the pipeline suitable for large documents.

# Text preprocessing & function development

In [8]:
# Test case: one of the longest documents that still lack lemmata
# (documents are ranked by word count, longest first; position 23 is used).
unlemmatized = LAGT[LAGT["lemmatized_sentences"].isnull()]
longest_first = unlemmatized.sort_values("wordcount", ascending=False)
string = longest_first["string"].tolist()[23]
string[:1000]

'p. 980 a 22 Πάντες ἄνθρωποι τοῦ εἰδέναι ὀρέγονται φύσει. σημεῖον\n δὲ ἡ τῶν αἰσθήσεων ἀγάπησις. Δεῖ ἡμᾶς ἀρχομένους τῆς παρούσης πραγματείας εἰπεῖν τὸν σκοπόν,\nτὴν τάξιν,. τὴν αἰτίαν τῆς ἐπιγραφῆς. σκοπὸς μὲν οὖν ἐστι τῆς παρούσης\nπραγματείας τὸ θεολογῆσαι· θεολογεῖ γὰρ ἐν αὐτῇ Ἀριστοτέλης. ἡ δὲ τάξις,\nὅτι ἐκ τῶν φύσει ὑστέρων ἡμεῖς τὰς ἀρχὰς ποιούμεθα, ἐπειδὴ ταῦτα μᾶλλον\n συνεγνωσμένα ἡμῖν ὑπάρχουσι. διὰ τοῦτο τοίνυν ὁ Ἀριστοτέλης πρότερον\nδιελέχθη ἡμῖν περὶ τῶν φυσικῶν πραγμάτων· ταῦτα γὰρ τῇ φύσει ὕστερα\nὑπάρχουσιν, ἡμῖν δὲ πρότερα. ἡ δὲ παροῦσα πραγματεία τῇ μὲν φύσει\nπροτέρα ὧς τὸ τέλειον ἔχουσα, ἡμῖν δὲ ὑστέρα· πρότερα γὰρ τὰ ἄφθαρτα\nτῶν φθαρτῶν καὶ τὰ ἀγένητα τῶν γινομένων. διὰ τοῦτο τοίνυν ὁ Ἀριστοτέλης\n πρότερον διελέχθη ἡμῖν περὶ τῶν ἀτάκτως κινουμένων ἐν τοῖς Μετεώροις,\nκαὶ πάλιν περὶ τῶν τεταγμένως κινουμένων ἐν τῇ Περὶ οὐρανοῦ,\nφημὶ δὴ περὶ ἀστέρων καὶ σφαιρῶν· καὶ λοιπὸν ἐν ταύτῃ τῇ πραγματείᾳ\nδιαλέγεται ἡμῖν περὶ τῶν πάντῃ ἀκινήτων. τοῦτο δέ ἐστι θεολογία ·

In [9]:
# functions for cleaning the string

def grave_to_acute(string):
    """Return `string` with every grave accent replaced by an acute accent.

    The text is decomposed (NFD) so that accents become separate combining
    marks, each combining grave (U+0300) is swapped for a combining acute
    (U+0301), and the result is recomposed (NFC).
    """
    combining_grave = "\u0300"
    combining_acute = "\u0301"
    decomposed = unicodedata.normalize("NFD", string)
    swapped = decomposed.replace(combining_grave, combining_acute)
    return unicodedata.normalize("NFC", swapped)

def possible_accentuation(morph):
    """Return `morph` carrying the first accentuation proposed for it.

    The word is passed through `strip_accents` and `rebreath`, split with
    `syllabify`, and the first (position, accent) pair yielded by
    `possible_accentuations(..., default_short=True)` is applied to the
    syllable counted `position` from the end.

    Parameters
    ----------
    morph : str or any
        Word form to accentuate. Non-string values are returned untouched.

    Returns
    -------
    The accentuated word, or `morph` exactly as it was passed in when it is
    not a string, when no accentuation is proposed, or when any step fails.
    """
    if not isinstance(morph, str):
        return morph
    try:
        # Work on a separate name so that every fallback below can hand back
        # the caller's input unchanged instead of a half-processed form.
        bare = rebreath(strip_accents(morph))
        syllables = syllabify(bare)
        for pos, accent in possible_accentuations(syllables, default_short=True):
            # Syllables after the accented one; [""] when the accent falls on
            # the last syllable, so the concatenation below stays uniform.
            final = syllables[1 - pos:] if pos > 1 else [""]
            accented = syllable_add_accent(syllables[-pos], accent)
            # Only the first candidate is wanted, so return immediately.
            return "".join(syllables[:-pos] + [accented] + final)
        return morph
    except Exception:
        # Best effort: words the accentuation library cannot handle are kept
        # as they are. `Exception` (not a bare except) lets KeyboardInterrupt
        # and SystemExit propagate.
        return morph

def decap(token):
    """Turn a token written in capitals into a capitalised, accented word.

    A token whose second character is upper case keeps its first character,
    has the rest lower-cased, and is then passed to `possible_accentuation`.
    Any other token (including tokens shorter than two characters) is
    returned unchanged.
    """
    if len(token) <= 1 or not token[1].isupper():
        return token
    lowered = token[0] + token[1:].lower()
    return possible_accentuation(lowered)

def clean_string(string):
    """Normalise a raw Greek text before it is handed to the spaCy pipeline.

    Steps, in order:
      1. lunate sigma "ϲ" becomes "σ" before a word character and "ς"
         everywhere else (before a non-word character or at the end of the
         text);
      2. grave accents become acute accents (`grave_to_acute`);
      3. em dashes are surrounded by spaces;
      4. every character that is neither Greek script nor punctuation is
         replaced by a space;
      5. each whitespace-separated token goes through `decap`, and tokens are
         re-joined with single spaces;
      6. the middle dot "·" is replaced by a full stop.

    Parameters
    ----------
    string : str
        Raw text.

    Returns
    -------
    str
        The cleaned text.
    """
    # The lookahead leaves the following character unconsumed, so both sigmas
    # of a doubled lunate sigma ("ϲϲ") are converted; the plain replace then
    # turns every remaining (word-final or text-final) lunate sigma into "ς".
    string = re.sub(r"ϲ(?=\w)", "σ", string)
    string = string.replace("ϲ", "ς")
    string = grave_to_acute(string)
    string = string.replace("—", " — ")
    cleaned_string = regex.sub(r'[^\p{Greek}\p{P}]', ' ', string)
    # split()/join already collapses every run of whitespace to one space,
    # so no separate space-squeezing pass is needed afterwards.
    cleaned_string = " ".join(decap(token) for token in cleaned_string.split())
    cleaned_string = cleaned_string.replace("·", ".")
    return cleaned_string

In [10]:
def _split_at_sentence_ends(string, segment_len):
    """Cut `string` into consecutive pieces of at most `segment_len` characters.

    Each piece ends right after the last sentence boundary (a full stop or a
    middle dot followed by whitespace) found inside its window; a window
    without any such boundary is cut at exactly `segment_len` characters.
    The pieces concatenate back to `string`, so no text is dropped.
    """
    if segment_len <= 0:
        raise ValueError("segment_len must be a positive integer")
    pieces = []
    start = 0
    while len(string) - start > segment_len:
        window = string[start:start + segment_len]
        cut = segment_len  # fallback when the window holds no boundary
        for boundary in re.finditer(r'(\.\s|·\s)', window):
            cut = boundary.end(0)  # keep the end of the last boundary seen
        pieces.append(window[:cut])
        start += cut
    if start < len(string):
        # Whatever is left after the last full window, however short.
        pieces.append(string[start:])
    return pieces


def get_doc(string, segment_len=100000):
    """Parse `string` with the global `nlp` pipeline, piecewise if it is long.

    Texts of at most `segment_len` characters are parsed in one call. Longer
    texts are cut at sentence boundaries into pieces of at most `segment_len`
    characters, each piece is parsed separately, and the resulting docs are
    merged with `Doc.from_docs`.

    Every character of the input ends up in exactly one piece; in particular
    the text following the last sentence boundary of the final window is
    parsed too, also when the text length is an exact multiple of
    `segment_len`.

    Parameters
    ----------
    string : str
        Text to parse (normally the output of `clean_string`).
    segment_len : int, default 100000
        Maximum number of characters handed to `nlp` at once.

    Returns
    -------
    The parsed document returned by `nlp` or by `Doc.from_docs`.

    Raises
    ------
    ValueError
        If the text has to be split and `segment_len` is not positive.
    """
    if len(string) <= segment_len:
        return nlp(string)
    segment_docs = [nlp(piece) for piece in _split_at_sentence_ends(string, segment_len)]
    return Doc.from_docs(segment_docs)

In [11]:
len(string)

1196236

In [12]:
%%time
#doc = get_doc(clean_string(string), segment_len=50000)

CPU times: user 1 µs, sys: 1 µs, total: 2 µs
Wall time: 5.01 µs


In [13]:
(LAGT["string"].apply(len)>1000000).sum()

47

In [14]:
# extract lemmata from the sentences
def get_lemmatized_sentences(doc):
    """Collect the content-word lemmata of `doc`, grouped by sentence.

    For every sentence in `doc.sents`, the lemmata of tokens tagged NOUN,
    VERB, ADJ or PROPN are taken, stripped of all non-Greek characters, and
    kept if anything remains. Sentences left without any lemma are skipped.

    Returns a list of lists of strings.
    """
    content_pos = ["NOUN", "VERB", "ADJ", "PROPN"]
    lemmatized_sentences = []
    for sent in doc.sents:
        sentence_lemmata = []
        for token in sent:
            if token.pos_ not in content_pos:
                continue
            greek_only = regex.sub(r'[^\p{Greek}]', "", token.lemma_)
            if greek_only != "":
                sentence_lemmata.append(greek_only)
        if sentence_lemmata:
            lemmatized_sentences.append(sentence_lemmata)
    return lemmatized_sentences

In [21]:
# put the string cleaning, doc creation and lemmata together into one function
def from_string_to_lemsents(string, segment_len=50000):
    """Clean `string`, parse it, and return its lemmatized sentences.

    Chains `clean_string`, `get_doc` and `get_lemmatized_sentences`.

    Parameters
    ----------
    string : str
        Raw text of one document.
    segment_len : int, default 50000
        Maximum number of characters parsed at once, passed on to `get_doc`.

    Returns
    -------
    list of list of str, or None
        The lemmatized sentences, or None when any step of the pipeline
        raised an exception (the reason is printed).
    """
    try:
        doc = get_doc(clean_string(string), segment_len=segment_len)
        return get_lemmatized_sentences(doc)
    except Exception as error:
        # Best effort: one failing document must not abort a corpus-wide run,
        # but the reason is reported so the document can be fixed later.
        # Catching `Exception` instead of using a bare except keeps the long
        # run interruptible (KeyboardInterrupt is not swallowed).
        print(f"lemmatization failed ({type(error).__name__}): {error}")
        return None

In [27]:
# doc_ids of the documents sent through the greCy pipeline, in processing order
grecy_lemmatized_list = []

def get_grecy_lemmata(string, lemmatized_sentences, doc_id):
    """Return existing lemmata, or compute them when there are none yet.

    Parameters
    ----------
    string : str
        Raw text of the document.
    lemmatized_sentences : list or None
        Lemmata the document already has; None means "not lemmatized yet".
    doc_id : str
        Identifier recorded in `grecy_lemmatized_list` and in the progress line.

    Returns
    -------
    `lemmatized_sentences` itself when it is not None; otherwise the result
    of `from_string_to_lemsents(string)`, which may again be None if that
    pipeline failed.

    Side effects
    ------------
    For every document that had no lemmata, `doc_id` is appended to
    `grecy_lemmatized_list` and a progress line (running count, doc_id, text
    length) is printed, whether or not the pipeline succeeded.
    """
    if lemmatized_sentences is None:
        lemmatized_sentences = from_string_to_lemsents(string)
        grecy_lemmatized_list.append(doc_id)
        print(len(grecy_lemmatized_list), doc_id, len(string))
    return lemmatized_sentences

#sample_lemmatized_sentences = LAGT.sample(10, random_state=1).apply(lambda row: get_grecy_lemmata(row["string"], row["lemmatized_sentences"], row["doc_id"]), axis=1)

In [28]:
#sample_lemmatized_sentences

In [29]:
#LAGT.loc[sample_lemmatized_sentences.index]

In [30]:
#grecy_lemmatized_list

# Applying the main function

In [31]:
%%time
### will be time consuming...
# Row-wise pass over the whole frame: each row's text, current lemmata and
# doc_id are handed to get_grecy_lemmata, and the returned value replaces the
# "lemmatized_sentences" column. get_grecy_lemmata prints one progress line
# (running count, doc_id, text length) per document it has to lemmatize.
LAGT["lemmatized_sentences"] = LAGT.apply(lambda row: get_grecy_lemmata(row["string"], row["lemmatized_sentences"], row["doc_id"]), axis=1)

1 ggm0001.ggm001 3857
2 ogl0001.ogl001 1257
3 stoa0033a.tlg028 41978
4 stoa0033a.tlg043 20816
5 stoa0121.stoa001 171855
6 stoa0146d.stoa001 27696
7 tlg0005.tlg003 572
8 tlg0006.tlg020 108623
9 tlg0007.tlg146 18130
10 tlg0007.tlg147 1069
11 tlg0018.tlg001 143932
12 tlg0018.tlg002 260468
13 tlg0018.tlg003 76774
14 tlg0018.tlg004 141567
15 tlg0018.tlg005 96145
16 tlg0018.tlg006 97880
17 tlg0018.tlg007 32667
18 tlg0018.tlg008 94218
19 tlg0018.tlg009 92533
20 tlg0018.tlg010 89337
21 tlg0018.tlg011 108351
22 tlg0018.tlg012 33173
23 tlg0018.tlg013 93899
24 tlg0018.tlg014 110248
25 tlg0018.tlg015 172311
26 tlg0018.tlg016 90855
27 tlg0018.tlg017 114022
28 tlg0018.tlg018 114895
29 tlg0018.tlg019 246616
30 tlg0018.tlg020 151034
31 tlg0018.tlg021 139575
32 tlg0018.tlg022 365437
33 tlg0018.tlg023 95569
34 tlg0018.tlg024 642298
35 tlg0018.tlg025 173969
36 tlg0018.tlg026 99048
37 tlg0018.tlg027 107520
38 tlg0018.tlg028 66001
39 tlg0018.tlg029 118642
40 tlg0018.tlg030 91632
41 tlg0018.tlg031 186072
42

In [39]:
LAGT[LAGT["lemmatized_sentences"].isnull()]

Unnamed: 0,author_id,doc_id,filename,author,title,string,wordcount,source,lemmatized_sentences,lemmata_source
411,tlg0530,tlg0530.tlg006,tlg0530.tlg006.1st1K-grc1.xml,Pseudo-Galen,Ad Gaurum quomodo animetur fetus,Τὸ περὶ τῆς εἰς τὰ σώματα τῶν ψυχῶν εἰσκρίσεως...,9595,1Kgr,,
824,tlg2042,tlg2042.tlg009,tlg2042.tlg009.opp-grc1.xml,Origenes,In Jeremiam (Homiliae 1-11),\n Ὁ θεὸς εἰς ἀγαθοποιίαν πρόχειρός\n ...,36685,1Kgr,,
834,tlg2042,tlg2042.tlg021,tlg2042.tlg021.opp-grc1.xml,Origenes,In Jeremiam (Homiliae 12-20),"Ὃ προστάσσεται ὁ προφήτης λέγειν ὑπὸ θεοῦ, ὀφε...",48360,1Kgr,,


In [42]:
missing_i = LAGT[LAGT["lemmatized_sentences"].isnull()].index

In [64]:
string = LAGT[LAGT["lemmatized_sentences"].isnull()]["string"].tolist()[1]

In [65]:
type(string)

str

In [66]:
cleaned_string = clean_string(string)

In [67]:
# Collapse every run of three or more dots into exactly three dots.
# Raw string: "\." in a plain string literal is an invalid escape sequence.
cleaned_string = re.sub(r"\.{3}\.*", "...", cleaned_string)

In [68]:
doc = nlp(cleaned_string)

In [69]:
def dealing_with_missing(string):
    """Lemmatize a document for which the standard pipeline returned nothing.

    The input is coerced to `str`, cleaned with `clean_string`, has every run
    of three or more dots collapsed to "...", and is then parsed by `nlp` in
    a single call (no segmentation) before the lemmata are extracted with
    `get_lemmatized_sentences`.

    Unlike `from_string_to_lemsents`, exceptions are not caught here.

    Parameters
    ----------
    string : any
        Raw text of the document; converted with `str()`.

    Returns
    -------
    list of list of str
        The lemmatized sentences.
    """
    cleaned_string = clean_string(str(string))
    # NOTE(review): presumably long dot runs are what broke the standard
    # pipeline for these documents; confirm against the failing texts.
    # Raw string: "\." in a plain string literal is an invalid escape sequence.
    cleaned_string = re.sub(r"\.{3}\.*", "...", cleaned_string)
    doc = nlp(cleaned_string)
    return get_lemmatized_sentences(doc)

lemmatized_missing = LAGT[LAGT["lemmatized_sentences"].isnull()]["string"].apply(dealing_with_missing)

In [70]:
lemmatized_missing

411    [[σῶμα, ψυχή, εἰσκρίσις, ζῳογονία, δόγμα, πολύ...
824    [[θεός, ἀγαθοποιία, πρόχειρος, κολάζω, ἄξιος, ...
834    [[προστάσσω, προφήτης, λέγω, θεός], [ὀφείλω, ἄ...
Name: string, dtype: object

In [71]:
LAGT.loc[missing_i, "lemmatized_sentences"] = lemmatized_missing

In [72]:
LAGT.sample(10, random_state=1)

Unnamed: 0,author_id,doc_id,filename,author,title,string,wordcount,source,lemmatized_sentences,lemmata_source
1532,tlg0031,tlg0031.tlg008,tlg0031.tlg008.perseus-grc2.xml,,New Testament - 2 Corinthians,\n\n ΠΑΥΛΟΣ ἀπόστολος Χριστ...,4470,perseus,"[[Παῦλος, ἀπόστολος, Χριστός, Ἰησοῦς, θέλημα, ...",morphgnt
1619,tlg0060,tlg0060.tlg001,tlg0060.tlg001.perseus-grc6.xml,Diodorus Siculus,Βιβλιοθήκη Ἱστορική (Books 18-20),τάδε ἔνεστιν ἐν τῇ ὀκτωκαιδεκάτῃ τῶν Διοδώρου...,79283,perseus,"[[θεός, δισσός, παλαιός, ἄνθρωπος, μεταγενής, ...",glaux
581,tlg0732,tlg0732.tlg013,tlg0732.tlg013.1st1K-grc1.xml,Alexander of Aphrodisias,Ἠθικὰ προβλήματα [Sp.],α. Ἀπορίαι πρὸς τοὺς τὸ ζῆν οὐκ ἀγαθὸν\n ...,23343,1Kgr,"[[Ἀπορίαις, ζῶ, ἀγαθός, λέγω], [ὁμοειδέω, ἡδον...",
474,tlg0591,tlg0591.1st1K001,tlg0591.1st1K001.1st1K-grc1.xml,Antisthenes,Αἴας,Ἐβουλόμην ἂν τοὺς αὐτοὺς ἡμῖν δικάζειν οἵπερ \...,502,1Kgr,"[[Ἐβουλόμης, αὐτός, δικάζω, πρᾶγμα, πάρειμι], ...",
704,tlg1699,tlg1699.tlg004,tlg1699.tlg004.1st1K-grc1.xml,Teles Megarenesis,Περὶ συγκρίσεως πενίας καὶ πλούτου,Δοκεῖ μοι ἡ τῶν χρημάτων κτῆσις σπάνεως καὶ ἐν...,1195,1Kgr,"[[δοκέω, χρῆμα, κτῆσις, σπάνις, ἔνδεια, ἀπολύω...",
274,tlg0087,tlg0087.tlg013,tlg0087.tlg013.1st1K-grc1.xml,Herodianus,Περὶ κλίσεως ὀνομάτων,"1. Anecd.Ox.IV 333, 6: Ἐπιτομὴ τῶν ὀνοματικῶν ...",59506,1Kgr,"[[Ἐπιτομή, ὀνοματικός, κανών, Ἡρωδιανός], [ὅμο...",
200,tlg0066,tlg0066.tlg001,tlg0066.tlg001.1st1K-grc1.xml,Ps. Dicaearchus,"Dicaearchi, ut fertur, potius vero Athenaei De...",Ἐντεῦθεν εἰς τὸ Ἀθηναίων ἔπεισιν \n\n ...,2757,1Kgr,"[[Ἀθηναῖος, ἔπειμι, ἄστυ], [ὁδός, ἡδύς, γεωργο...",
652,tlg1383,tlg1383.tlg001,tlg1383.tlg001.1st1K-grc1.xml,Geminus,Elementa astronomia,"Ὁ τῶν ζῳδίων κύκλος διαιρεῖται εἰς μέρη ιβ′, ...",20574,1Kgr,"[[ζωίδιον, κύκλος, διαιρέω, μέρος, καλέω, ἕκασ...",
1107,tlg4102,tlg4102.tlg037,tlg4102.tlg037.1st1K-grc1.xml,Catenae (Novum Testamentum),Catena In Epistulam Ad Philemonem,ΑΝΗΡ τις ἦν Φιλήμων τῶν πιστῶν καὶ γενναίων ἀν...,2185,1Kgr,"[[Ἀνήρ, Φιλήμων, πιστός, γενναῖος, ἀνήρ], [οὗτ...",
1283,tlg0007,tlg0007.tlg089,tlg0007.tlg089.perseus-grc2.xml,Plutarch,Περὶ Ἴσιδος και Ὀσίριδος,"πάντα μέν, ὦ Κλέα, δεῖ τἀγαθὰ τοὺς νοῦν ἔχοντα...",18332,perseus,"[[Κλέα, δεῖ, ἀγαθός, νόος, ἔχω, αἰτέω, θεός], ...",


In [74]:
# Documents without a recorded lemmata source are the ones lemmatized above
# with greCy. fillna treats None and NaN alike, matching the .isnull() tests
# used to select unlemmatized documents elsewhere in this notebook.
LAGT["lemmata_source"] = LAGT["lemmata_source"].fillna("grecy")

# Simple explorations of what is in the lemmata

In [83]:
# Whole corpus: flatten each document's sentences into one list of lemmata,
# pool all documents, and show the 100 most frequent lemmata.
lemmata_series = LAGT["lemmatized_sentences"].apply(
    lambda sentences: [lemma for sentence in sentences for lemma in sentence]
)
lemmata_all = [lemma for doc_lemmata in lemmata_series for lemma in doc_lemmata]
nltk.FreqDist(lemmata_all).most_common(100)

[('οὗτος', 364348),
 ('λέγω', 273729),
 ('εἰμί', 229129),
 ('αὐτός', 219506),
 ('γίγνομαι', 173868),
 ('ἔχω', 166248),
 ('πολύς', 135222),
 ('φημί', 116779),
 ('πᾶς', 111288),
 ('ἄλλος', 106778),
 ('ποιέω', 97082),
 ('λόγος', 93057),
 ('τις', 92160),
 ('τίς', 89570),
 ('θεός', 67982),
 ('μέγας', 62129),
 ('ἐκεῖνος', 59461),
 ('ἄνθρωπος', 58719),
 ('πρῶτος', 55821),
 ('ἕτερος', 52649),
 ('οὐδείς', 49900),
 ('πόλις', 49567),
 ('σῶμα', 48549),
 ('τοιοῦτος', 48415),
 ('λαμβάνω', 48313),
 ('ἀγαθός', 47307),
 ('ὁράω', 47064),
 ('φύσις', 45476),
 ('μόνος', 44592),
 ('ἀρχή', 42234),
 ('δοκέω', 41371),
 ('ἀνήρ', 41105),
 ('δύναμαι', 38233),
 ('ἕκαστος', 37620),
 ('δύναμις', 37101),
 ('δίδωμι', 36702),
 ('καλέω', 36355),
 ('ψυχή', 35405),
 ('γῆ', 35381),
 ('χρόνος', 34719),
 ('μέρος', 34201),
 ('ὑπάρχω', 33759),
 ('κινέω', 32630),
 ('δέω', 32382),
 ('ἅπας', 31728),
 ('βασιλεύς', 31257),
 ('οἶδα', 29574),
 ('ὅλος', 28608),
 ('τόπος', 28354),
 ('βούλομαι', 28263),
 ('χράω', 27548),
 ('ἡμέρα', 2723

In [82]:
# Same frequency list, restricted to the documents lemmatized with greCy.
grecy_docs = LAGT[LAGT["lemmata_source"]=="grecy"]
lemmata_series = grecy_docs["lemmatized_sentences"].apply(
    lambda sentences: [lemma for sentence in sentences for lemma in sentence]
)
lemmata_all = [lemma for doc_lemmata in lemmata_series for lemma in doc_lemmata]
nltk.FreqDist(lemmata_all).most_common(100)

[('οὗτος', 203072),
 ('λέγω', 191259),
 ('γίγνομαι', 105697),
 ('ἔχω', 103927),
 ('φημί', 81432),
 ('πολύς', 70140),
 ('τίς', 66012),
 ('λόγος', 62317),
 ('ἄλλος', 57911),
 ('ποιέω', 56310),
 ('πᾶς', 48551),
 ('θεός', 43913),
 ('ἄνθρωπος', 39751),
 ('αὐτός', 37861),
 ('μόνος', 35407),
 ('πρῶτος', 34811),
 ('μέγας', 33667),
 ('ἕτερος', 33187),
 ('σῶμα', 32234),
 ('φύσις', 31748),
 ('ἐκεῖνος', 31561),
 ('λαμβάνω', 29963),
 ('ὁράω', 28237),
 ('ψυχή', 27962),
 ('κινέω', 27778),
 ('ἀγαθός', 27685),
 ('ἀρχή', 27153),
 ('δεῖ', 26878),
 ('δύναμαι', 24669),
 ('τοιοῦτος', 23815),
 ('οὐδείς', 23610),
 ('ὑπάρχω', 23510),
 ('γῆ', 22548),
 ('δοκέω', 22514),
 ('ἕκαστος', 22488),
 ('εἶδος', 22412),
 ('πόλις', 22142),
 ('χρόνος', 22057),
 ('μέρος', 21609),
 ('δύναμις', 21541),
 ('δείκνυμι', 21349),
 ('τόπος', 21248),
 ('καλέω', 21223),
 ('δίδωμι', 20088),
 ('ς', 19563),
 ('ἀνήρ', 19097),
 ('ὅλος', 19077),
 ('οὐσία', 18602),
 ('γένος', 18407),
 ('συμβαίνω', 17823),
 ('πατήρ', 17081),
 ('ὄνομα', 17048),


In [75]:
LAGT.to_json("../data/large_files/LAGT_grecy_20240116.json")

In [76]:
# Upload the result to sciencedata.dk. The session is opened only when this
# kernel does not hold one yet, so the cell works after Restart & Run All and
# does not ask for credentials again on a re-run.
if "s" not in globals():
    s = sddk.cloudSession("sciencedata.dk", "SDAM_root", "648597@au.dk")
s.write_file("SDAM_data/AGT/LAGT_grecy_20240116.json", LAGT)

A file with the same name ("LAGT_grecy_20240116.json") already exists in this location.
Your <class 'pandas.core.frame.DataFrame'> object has been succesfully written as "https://sciencedata.dk/sharingout/648597%40au.dk/SDAM_root/SDAM_data/AGT/LAGT_grecy_20240116.json"
