In [183]:
import pandas as pd
import spacy
from spacy.tokens import Doc
import sddk
import re
import regex
import unicodedata
from greek_accentuation.characters import strip_accents
from greek_accentuation.syllabify import *
from greek_accentuation.accentuation import *

In [10]:
# One-off setup, kept commented out: install greCy and its 'grc_proiel_trf' model
# with the interpreter of the project virtual environment.
#! ../lagt_venv/bin/python -m pip install grecy
#! ../lagt_venv/bin/python -m grecy install grc_proiel_trf
# Load the pipeline once; every later cell uses this global `nlp` object.
nlp = spacy.load('grc_proiel_trf')



In [11]:
# NOTE(review): this repeats the load from the previous cell; running both cells
# loads the same pipeline twice. One of the two cells can probably be dropped.
nlp = spacy.load('grc_proiel_trf')

In [12]:
# Smoke test: run the pipeline on one short Greek sentence.
doc = nlp('δοκῶ μοι περὶ ὧν πυνθάνεσθε οὐκ ἀμελέτητος εἶναι')

In [13]:
# Parse the same sentence again and print each token next to its lemma.
doc = nlp('δοκῶ μοι περὶ ὧν πυνθάνεσθε οὐκ ἀμελέτητος εἶναι')
for t in doc:
    print(t.text, t.lemma_)

δοκῶ δοκέω
μοι ἐγώ
περὶ περί
ὧν ὅς
πυνθάνεσθε πυνθάνομαι
οὐκ οὐ
ἀμελέτητος ἀμελέτητος
εἶναι εἰμί


In [15]:
# Load the LAGT dataset; later cells read its "string", "lemmatized_sentences",
# "wordcount" and "doc_id" columns.
LAGT = pd.read_json("../data/large_files/LAGT_treebanks_20240116.json")

### greCy Test

In [208]:
# Take the raw text of the first row that has no lemmatized sentences yet.
string = LAGT[LAGT["lemmatized_sentences"].isnull()]["string"].tolist()[0]

In [209]:
# Parse the whole text in one call and, sentence by sentence, print the lemmata
# of the tokens tagged NOUN, ADJ, VERB or PROPN.
doc = nlp(string)
for sent in doc.sents:
    print([t.lemma_ for t in sent if t.pos_ in ["NOUN", "ADJ", "VERB", "PROPN"]])

['Χρὴ', 'γιγνώσκω', 'γῆ', 'περίμετρος']
['στάδιος', 'μυριάς']
['μῆκος']
['ἡμέτερος', 'οἰκουμένη', 'στόμα', 'Γάγγης', 'Γαδείρων', 'στάδιον']
[]
['πλάτος', 'Αἰθιοπικός', 'θάλασσα', 'Τανάϊδος', 'ποταμός', 'στάδιον']
['Εὐφράτης', 'Τίγρις', 'ποταμός', 'καλέω', 'Μεσοποτάμιος', 'διάστημα', 'ἔχω', 'στάδιον']
[]
['ἀναμέτρησις', 'ποιέω', 'Ἐρατοσθένης', 'ἀρχαῖος', 'μαθητικώτατος']
[]
['Βυζάντιον', 'Σωσθένιος', 'στάδιον', 'μίλιον', 'ἥμισυς']
[]
['Σωσθενίας', 'Ἱερόν', 'στάδιον', 'μίλιον']
['ἥμισυς']
['πᾶς', 'μίλιον']
[]
['Ἱερόν', 'Ζεύς', 'Οὐρῖος', 'στόμα', 'Πόντος', 'ἱερόν', 'στόμα', 'Ἴστρος', 'ποταμός', 'στάδιον', 'μίλιον', 'ἥμισυς']
[]
['Ἱερόν', 'Ζεύς', 'Οὐρίης', 'Βορυσθένης', 'ποταμός', 'Δανάπρις', 'καλέω', 'στάδιον']
['μίλιον']
[]
['ἥμισυς']
['Ἱερόν', 'Ζεύς', 'Οὐρίης', 'Πορθμῖος', 'πόλις', 'τέλος', 'Εὐρώπη', 'Πόντος', 'μέρος', 'στομίον', 'Μαιώτης', 'λίμνη', 'Βόσπορος', 'Κιμμερῖος', 'καλέω', 'στάδιον']
['μίλιον']
[]
['λέγω', 'Εὐρώπη', 'Ποντικός', 'περίπλοος', 'ἴσος', 'περίπλοος', 'Ἀσία', 'μέρος'

Let's improve it: pre-clean the string and split long documents into segments so that large texts can be processed.

In [210]:
def get_doc(string, segment_len=100000):
    """Parse `string` with the global `nlp` pipeline, in segments if it is long.

    Strings of at most `segment_len` characters are passed to `nlp` in one call.
    Longer strings are read in consecutive windows of `segment_len` characters.
    Every window except the last is cut after its last sentence boundary
    (a full stop or a middle dot followed by whitespace); the text after that
    boundary is carried over and prepended to the next segment, so a segment can
    be longer than `segment_len`. A window without any boundary is emitted whole.
    The last window is always emitted in full together with the carried-over
    text, so no input is lost even when `len(string)` is an exact multiple of
    `segment_len`.

    Parameters
    ----------
    string : str
        Text to parse.
    segment_len : int, default 100000
        Window size in characters used to split long texts.

    Returns
    -------
    The object returned by `nlp(string)` for short texts, otherwise the result
    of `Doc.from_docs` applied to the list of per-segment docs.
    """
    if len(string) <= segment_len:
        return nlp(string)

    # Raw string: "\." and "\s" are regex escapes, not Python string escapes.
    boundary = re.compile(r'(\.\s|·\s)')
    segment_docs = []
    carry = ""  # text after the last boundary of the previous window
    for start in range(0, len(string), segment_len):
        window = string[start:start + segment_len]
        is_last_window = start + segment_len >= len(string)
        # The last window is never cut: whatever follows its last boundary has
        # no later segment to be carried into.
        boundary_ends = [] if is_last_window else [m.end(0) for m in boundary.finditer(window)]
        if boundary_ends:
            split_at = boundary_ends[-1]  # end index of the last boundary match
            current_segment = carry + window[:split_at]
            carry = window[split_at:]
        else:
            current_segment = carry + window
            carry = ""
        segment_docs.append(nlp(current_segment))

    return Doc.from_docs(segment_docs)

In [211]:
#check with some of the longest document we have...
# Pick one of the longest unlemmatized documents: rows without lemmatized
# sentences are sorted by "wordcount" in descending order and the row at
# position 23 is taken. The first 1000 characters are displayed for inspection.
string = LAGT[LAGT["lemmatized_sentences"].isnull()].sort_values("wordcount", ascending=False)["string"].tolist()[23]
string[:1000]

'p. 980 a 22 Πάντες ἄνθρωποι τοῦ εἰδέναι ὀρέγονται φύσει. σημεῖον\n δὲ ἡ τῶν αἰσθήσεων ἀγάπησις. Δεῖ ἡμᾶς ἀρχομένους τῆς παρούσης πραγματείας εἰπεῖν τὸν σκοπόν,\nτὴν τάξιν,. τὴν αἰτίαν τῆς ἐπιγραφῆς. σκοπὸς μὲν οὖν ἐστι τῆς παρούσης\nπραγματείας τὸ θεολογῆσαι· θεολογεῖ γὰρ ἐν αὐτῇ Ἀριστοτέλης. ἡ δὲ τάξις,\nὅτι ἐκ τῶν φύσει ὑστέρων ἡμεῖς τὰς ἀρχὰς ποιούμεθα, ἐπειδὴ ταῦτα μᾶλλον\n συνεγνωσμένα ἡμῖν ὑπάρχουσι. διὰ τοῦτο τοίνυν ὁ Ἀριστοτέλης πρότερον\nδιελέχθη ἡμῖν περὶ τῶν φυσικῶν πραγμάτων· ταῦτα γὰρ τῇ φύσει ὕστερα\nὑπάρχουσιν, ἡμῖν δὲ πρότερα. ἡ δὲ παροῦσα πραγματεία τῇ μὲν φύσει\nπροτέρα ὧς τὸ τέλειον ἔχουσα, ἡμῖν δὲ ὑστέρα· πρότερα γὰρ τὰ ἄφθαρτα\nτῶν φθαρτῶν καὶ τὰ ἀγένητα τῶν γινομένων. διὰ τοῦτο τοίνυν ὁ Ἀριστοτέλης\n πρότερον διελέχθη ἡμῖν περὶ τῶν ἀτάκτως κινουμένων ἐν τοῖς Μετεώροις,\nκαὶ πάλιν περὶ τῶν τεταγμένως κινουμένων ἐν τῇ Περὶ οὐρανοῦ,\nφημὶ δὴ περὶ ἀστέρων καὶ σφαιρῶν· καὶ λοιπὸν ἐν ταύτῃ τῇ πραγματείᾳ\nδιαλέγεται ἡμῖν περὶ τῶν πάντῃ ἀκινήτων. τοῦτο δέ ἐστι θεολογία ·

In [214]:
# functions for cleaning the string

def grave_to_acute(string):
    """Replace every grave accent in `string` with an acute accent.

    The text is decomposed (NFD) so accents become separate combining marks,
    each combining grave (U+0300) is swapped for a combining acute (U+0301),
    and the result is recomposed (NFC).
    """
    grave_mark, acute_mark = "\u0300", "\u0301"
    decomposed = unicodedata.normalize("NFD", string)
    swapped = decomposed.replace(grave_mark, acute_mark)
    return unicodedata.normalize("NFC", swapped)

def possible_accentuation(morph):
    """Return `morph` re-accented with the first accentuation that is proposed.

    For a string, the accents are stripped (`strip_accents`), `rebreath` is
    applied, the word is syllabified and the first candidate yielded by
    `possible_accentuations(..., default_short=True)` is applied to it. These
    helpers come from the `greek_accentuation` imports at the top of the notebook.

    Parameters
    ----------
    morph : Any
        Word form to accentuate. Non-string values are returned unchanged.

    Returns
    -------
    The accentuated form, or the untouched input when no accentuation is
    proposed or when any of the helper calls raises.
    """
    if not isinstance(morph, str):
        return morph
    try:
        syllables = syllabify(rebreath(strip_accents(morph)))
        for pos, accent in possible_accentuations(syllables, default_short=True):
            # `pos` counts syllables from the end of the word; rebuild the word
            # as (syllables before) + (accented syllable) + (syllables after).
            final = syllables[1 - pos:] if pos > 1 else [""]
            # Only the first candidate is used.
            return "".join(syllables[:-pos] + [syllable_add_accent(syllables[-pos], accent)] + final)
    except Exception:
        # Best effort: a word the helpers cannot handle is kept as it came in.
        # `Exception` (not a bare except) so KeyboardInterrupt still propagates.
        return morph
    # No accentuation available: return the original, not the stripped form.
    return morph

def decap(token):
    """Turn a token written in capitals into an accentuated lowercase-tail form.

    A token is treated as capitalised throughout when it has at least two
    characters and its second character is uppercase. In that case everything
    after the first character is lowercased and the result is passed through
    `possible_accentuation`. Any other token is returned unchanged.
    """
    written_in_capitals = len(token) > 1 and token[1].isupper()
    if not written_in_capitals:
        return token
    head, tail = token[0], token[1:]
    return possible_accentuation(head + tail.lower())

def clean_string(string):
    """Normalise a raw Greek text before it is passed to the pipeline.

    Steps, in order:
    1. lunate sigma "ϲ" becomes "ς" at the end of a word (before a non-word
       character or the end of the text) and "σ" everywhere else;
    2. grave accents become acute accents (`grave_to_acute`);
    3. every character that is neither Greek script nor punctuation is
       replaced by a space;
    4. each whitespace-separated token goes through `decap`;
    5. runs of spaces are collapsed and "·" is replaced by ".".

    Parameters
    ----------
    string : str
        Raw text.

    Returns
    -------
    str
        The cleaned text, tokens separated by single spaces.
    """
    # Lookahead instead of a consuming group: the character after "ϲ" must not
    # be swallowed, otherwise the second sigma of a double lunate sigma is
    # skipped, and a "ϲ" that ends the text is never matched.
    string = re.sub(r"ϲ(?=\W|\Z)", "ς", string)
    # Every lunate sigma still left is followed by a word character.
    string = string.replace("ϲ", "σ")
    string = grave_to_acute(string)
    cleaned_string = regex.sub(r'[^\p{Greek}\p{P}]', ' ', string)
    cleaned_string = " ".join([decap(token) for token in cleaned_string.split()])
    cleaned_string = regex.sub(' +', ' ', cleaned_string)
    cleaned_string = cleaned_string.replace("·", ".")
    return cleaned_string

In [None]:
%%time
# Clean the long test document and parse it with a 50,000-character window.
doc = get_doc(clean_string(string), segment_len=50000)

In [None]:
# extract lemmata from the sentences
def get_lemmatized_sentences(doc):
    """Collect, sentence by sentence, the lemmata of the content words in `doc`.

    Only tokens tagged NOUN, VERB, ADJ or PROPN are considered. Every
    non-Greek character is removed from a lemma; lemmata that end up empty and
    sentences that end up without any lemma are left out.

    Returns a list of sentences, each a list of lemma strings.
    """
    content_pos = ["NOUN", "VERB", "ADJ", "PROPN"]
    lemmatized_sentences = []
    for sent in doc.sents:
        kept = []
        for token in sent:
            if token.pos_ not in content_pos:
                continue
            lemma = regex.sub(r'[^\p{Greek}]', "", token.lemma_)
            if lemma != "":
                kept.append(lemma)
        if kept:
            lemmatized_sentences.append(kept)
    return lemmatized_sentences

In [None]:
# put the string cleaning, doc creation and lemmata together into one function
def from_string_to_lemsents(string, segment_len=50000):
    """Run the full chain on a raw text: clean it, parse it, extract lemmata.

    Parameters
    ----------
    string : str
        Raw text.
    segment_len : int, default 50000
        Window size handed to `get_doc` for splitting long texts. The default
        is the value this function previously hard-coded.

    Returns
    -------
    The value returned by `get_lemmatized_sentences` for the parsed text.
    """
    doc = get_doc(clean_string(string), segment_len=segment_len)
    return get_lemmatized_sentences(doc)

In [28]:
%%time
# doc_ids of the documents that were lemmatized by this notebook.
grecy_lemmatized_list = []
def get_grecy_lemmata(string, lemmatized_sentences, doc_id):
    """Return existing lemmata, or compute them when they are missing.

    Parameters
    ----------
    string : str
        Raw text of the document.
    lemmatized_sentences : list or None or float NaN
        Lemmata already available for the document; `None` or NaN means missing.
    doc_id : Any
        Identifier appended to the global `grecy_lemmatized_list` whenever the
        lemmata are computed here.

    Returns
    -------
    The given `lemmatized_sentences` when present, otherwise the result of
    `from_string_to_lemsents(string)`.
    """
    # The rows to process are selected elsewhere in the notebook with
    # `.isnull()`, which is true for None and for NaN, so both are treated as
    # missing here. `x != x` holds only for NaN.
    is_missing = lemmatized_sentences is None or (
        isinstance(lemmatized_sentences, float) and lemmatized_sentences != lemmatized_sentences
    )
    if is_missing:
        lemmatized_sentences = from_string_to_lemsents(string)
        grecy_lemmatized_list.append(doc_id)
    return lemmatized_sentences

# Try the function on 10 randomly sampled rows; the row's "string",
# "lemmatized_sentences" and "doc_id" values are passed in.
# NOTE(review): the sample is not seeded, and the result is kept in a separate
# Series rather than written back to LAGT.
sample_lemmatized_sentences = LAGT.sample(10).apply(lambda row: get_grecy_lemmata(row["string"], row["lemmatized_sentences"], row["doc_id"]), axis=1)

ValueError: [E088] Text of length 1419780 exceeds maximum of 1000000. The parser and NER models require roughly 1GB of temporary memory per 100,000 characters in the input. This means long texts may cause memory allocation errors. If you're not using the parser or NER, it's probably safe to increase the `nlp.max_length` limit. The limit is in number of characters, so you can check whether your inputs are too long by checking `len(text)`.

In [30]:
# Number of doc_ids recorded by get_grecy_lemmata so far.
len(grecy_lemmatized_list)

77

In [31]:
# Write the LAGT frame to a local JSON file.
# NOTE(review): no visible cell assigns the newly computed lemmata to LAGT
# before this export — confirm the frame was actually updated.
LAGT.to_json("../data/large_files/LAGT_grecy_20240116.json")

In [ ]:
# Open an sddk session and write the LAGT frame to a remote JSON file.
# NOTE(review): the arguments are presumably the host, a shared folder name and
# the account that owns it — confirm against the sddk documentation. The account
# identifier is hard-coded; consider reading it from configuration instead.
s = sddk.cloudSession("sciencedata.dk", "SDAM_root", "648597@au.dk")
s.write_file("SDAM_data/AGT/LAGT_grecy_20240116.json", LAGT)