# Requirements

In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import re
import pickle
from collections import defaultdict
import requests
import unicodedata

In [2]:
# load LAGT v 3.0, as we will use it as a source for metadata
#LAGT = pd.read_parquet("../data/large_files/LAGT_v3-0.parquet")
#len(LAGT)

In [3]:
# LAGT.head(5)

In [4]:
# currently from Glaux:
#sum(LAGT["lemmata_source"] == "glaux")

### Load newest local Glaux

In [5]:
greek_data_dir = "/srv/data/greek/"

In [6]:
source_dir = os.path.join(greek_data_dir, "glaux/xml/")
glaux_filenames = os.listdir(source_dir)
glaux_filenames[:10]

['0018-016.xml',
 '0007-131.xml',
 '0057-002.xml',
 '0032-013.xml',
 '0062-022.xml',
 '0284-024.xml',
 '0592-003.xml',
 '1157-001.xml',
 '1183-001.xml',
 '0014-022.xml']

In [7]:
len(glaux_filenames)

1421

In [8]:
with open(source_dir  + glaux_filenames[0], "r") as f:
    soup = BeautifulSoup(f.read())
print(soup.prettify()[:5000])

<html>
 <body>
  <treebank version="2" xml:lang="grc">
   <sentence analysis="auto" document_id="0018-016" id="1" struct_id="349278">
    <word div_section="1" form='"' head="0" id="106302716" lemma='"' postag="u--------" relation="AuxG">
    </word>
    <word div_section="1" form="Σάρα" head="106302723" id="106302717" lemma="Σάρα" postag="n-s---fn-" relation="SBJ">
    </word>
    <word div_section="1" form="δὲ" head="106302723" id="106302718" lemma="δέ" postag="c--------" relation="AuxY">
    </word>
    <word div_section="1" form="ἡ" head="106302720" id="106302719" lemma="ὁ" postag="l-s---fn-" relation="ATR">
    </word>
    <word div_section="1" form="γυνὴ" head="106302717" id="106302720" lemma="γυνή" postag="n-s---fn-" relation="APOS">
    </word>
    <word div_section="1" form="Ἀβραὰμ" head="106302720" id="106302721" lemma="Ἀβραάμ" postag="n--------" relation="ATR">
    </word>
    <word div_section="1" form="οὐκ" head="106302723" id="106302722" lemma="οὐ" postag="d

In [9]:
glaux_tlgs = [re.sub(r'(\d{4})-(\d{3}).xml', r'tlg\1.tlg\2', fn) for fn in glaux_filenames]
glaux_tlgs[:10]

['tlg0018.tlg016',
 'tlg0007.tlg131',
 'tlg0057.tlg002',
 'tlg0032.tlg013',
 'tlg0062.tlg022',
 'tlg0284.tlg024',
 'tlg0592.tlg003',
 'tlg1157.tlg001',
 'tlg1183.tlg001',
 'tlg0014.tlg022']

In [11]:
#LAGT["doc_id"].isin(glaux_tlgs).sum()

In [12]:
#not_lagt = [glaux_fn for glaux_fn in glaux_filenames if re.sub(r'(\d{4})-(\d{3}).xml', r'tlg\1.tlg\2', glaux_fn) not in list(LAGT["doc_id"])]
#not_lagt[:10]

In [13]:
#len(not_lagt)

In [14]:
with open(source_dir  + glaux_filenames[0], "r") as f:
    soup = BeautifulSoup(f.read())

In [15]:
print(soup.prettify()[:20000])

<html>
 <body>
  <treebank version="2" xml:lang="grc">
   <sentence analysis="auto" document_id="0018-016" id="1" struct_id="349278">
    <word div_section="1" form='"' head="0" id="106302716" lemma='"' postag="u--------" relation="AuxG">
    </word>
    <word div_section="1" form="Σάρα" head="106302723" id="106302717" lemma="Σάρα" postag="n-s---fn-" relation="SBJ">
    </word>
    <word div_section="1" form="δὲ" head="106302723" id="106302718" lemma="δέ" postag="c--------" relation="AuxY">
    </word>
    <word div_section="1" form="ἡ" head="106302720" id="106302719" lemma="ὁ" postag="l-s---fn-" relation="ATR">
    </word>
    <word div_section="1" form="γυνὴ" head="106302717" id="106302720" lemma="γυνή" postag="n-s---fn-" relation="APOS">
    </word>
    <word div_section="1" form="Ἀβραὰμ" head="106302720" id="106302721" lemma="Ἀβραάμ" postag="n--------" relation="ATR">
    </word>
    <word div_section="1" form="οὐκ" head="106302723" id="106302722" lemma="οὐ" postag="d

In [16]:
#glaux_metadata = pd.read_csv("../../glaux/metadata.txt", sep="\t")
#glaux_metadata.head(5)

### Extract Glaux sentence level data

In [20]:
target_path = os.path.join(greek_data_dir, "glaux_sentences_2025-08/")
try:
    os.mkdir(target_path)
except:
    pass

In [21]:
os.listdir(target_path)

[]

In [22]:
# Normalize a string to a specified Unicode form
def norm(text):
    return unicodedata.normalize('NFC', text)

In [23]:
def from_filename_to_sentence_data(fn):
    with open(source_dir + fn, "r") as f:
        soup = BeautifulSoup(f.read(), "xml")
    sentences_data = []
    tlg_doc_id = re.sub(r'(\d{4})-(\d{3})\w?\.xml', r'tlg\1.tlg\2', fn)
    for sent_n, sent in enumerate(soup.find_all("sentence")):
        sentence = ""
        sent_data = []
        start_index = 0
        for n, w in enumerate(sent.find_all("word")):
            token = norm(w["form"])
            ref = {}
            for attr in ["div_book", "div_section", "line", "stephanus_page", "div_fragment", "div_jebb_page", "div_ed2page", "div_bekker_page"]:
                try:
                    ref[attr] = w[attr]
                except KeyError:
                    pass

            if n == 0 or w.get("relation") in ["AuxX", "AuxK", "PUNCT"]:
                start_index = len(sentence)
                sentence += token
            else:
                start_index = len(sentence) + 1
                sentence += " " + token

            end_index = start_index + len(token)

            try:
                sent_data.append((token, norm(w["lemma"]), w["postag"][0], start_index, end_index))
            except (KeyError, IndexError):
                sent_data.append((token, token, None, start_index, end_index))
            if w.get("relation") == "AuxK":
                break
        sentences_data.append((tlg_doc_id, sent_n, sentence, sent_data))
    target_fn = tlg_doc_id + ".pickle"
    with open(target_path + target_fn, "wb") as f:
        pickle.dump(sentences_data, f)

In [None]:
%%time
for fn in glaux_filenames:
    from_filename_to_sentence_data(fn)

In [34]:
# ok, this approach does not work for all the files....
files_to_merge = [fn for fn in os.listdir(target_path) if "xml" in fn]
files_to_merge[:10]

['0627-024b.xml.pickle',
 '0007-082a.xml.pickle',
 '0007-052b.xml.pickle',
 '0007-052a.xml.pickle',
 '0541-042c.xml.pickle',
 '0007-084b.xml.pickle',
 '4150-001a.xml.pickle',
 '0541-042a.xml.pickle',
 '0632-002d.xml.pickle',
 '0007-082b.xml.pickle']

In [35]:
# let's rename and merge these files now.
file_groups = defaultdict(list)
pattern = re.compile(r'(\d{4})-(\d{3})[a-z]')

for fn in files_to_merge:
    match = pattern.match(fn)
    if match:
        doc_id = f'tlg{match.group(1)}.tlg{match.group(2)}'
        file_groups[doc_id].append(fn)

# Step 3: Iterate over groups and merge files
for doc_id, files in file_groups.items():
    files.sort()
    merged_list = []
    for fn in files:
        file_path = os.path.join(target_path, fn)
        with open(file_path, "rb") as f:
            data = pickle.load(f)
            merged_list.extend(data)

    # Step 4: Save the merged list
    target_fn = os.path.join(target_path, doc_id + ".pickle")
    with open(target_fn, "wb") as f:
        pickle.dump(merged_list, f)

In [36]:
for fn in files_to_merge:
    os.remove(os.path.join(target_path, fn))

### BACKUP: Add current glaux data to LAGT

In [100]:

os.listdir(target_path)[:10]

['tlg0007.tlg121.pickle',
 'tlg0527.tlg020.pickle',
 'tlg0540.tlg019.pickle',
 'tlg0026.tlg004.pickle',
 'tlg0018.tlg020.pickle',
 'tlg0540.tlg015.pickle',
 'tlg2042.tlg086.pickle',
 'tlg0057.tlg078.pickle',
 'tlg0062.tlg023.pickle',
 'tlg0284.tlg001.pickle']

In [102]:
# open a test file to explore its structure
file_path = target_path + "tlg0007.tlg121.pickle"
with open(file_path, "rb") as f:
    sentences_data = pickle.load(f)
sentences_data[:5]

[('0007-121',
  0,
  'Ἀντιφῶν Σοφίλου μὲν ἦν πατρὸς τῶν δὲ δήμων Ῥαμνούσιος·',
  [('Ἀντιφῶν', 'Ἀντιφῶν', 'n', (0, 7)),
   ('Σοφίλου', 'Σόφιλος', 'n', (8, 15)),
   ('μὲν', 'μέν', 'g', (16, 19)),
   ('ἦν', 'εἰμί', 'v', (20, 22)),
   ('πατρὸς', 'πατήρ', 'n', (23, 29)),
   ('τῶν', 'ὁ', 'l', (30, 33)),
   ('δὲ', 'δέ', 'c', (34, 36)),
   ('δήμων', 'δῆμος', 'n', (37, 42)),
   ('Ῥαμνούσιος', 'Ῥαμνούσιος', 'a', (43, 53)),
   ('·', '·', 'u', (53, 54))]),
 ('0007-121',
  1,
  'μαθητεύσας δὲ τῷ πατρὶ ( ἦν γὰρ σοφιστής, † ὧ καὶ Ἀλκιβιάδης † παρ’ αὐτὸν ἔτι παῖδα ὄντα φοιτῆσαι ) καὶ δύναμιν λόγων κτησάμενος, ὥς τινες νομίζουσιν, ἀπ’ οἰκείας φύσεως, παρέπεμψε μὲν πολιτεύεσθαι, διατριβὴν δὲ συνέστησε καὶ Σωκράτει τῷ φιλοσόφῳ διεφέρετο τὴν ὑπὲρ τῶν λόγων διαφορὰν οὐ φιλονείκως ἀλλ’ ἐλεγκτικῶς, ὡς Ξενοφῶν ἱστόρηκεν ἐν τοῖς Ἀπομνημονεύμασι. E',
  [('μαθητεύσας', 'μαθητεύω', 'v', (0, 10)),
   ('δὲ', 'δέ', 'c', (11, 13)),
   ('τῷ', 'ὁ', 'l', (14, 16)),
   ('πατρὶ', 'πατήρ', 'n', (17, 22)),
   ('(', '(', 'u'

In [103]:
lemmatized_sentences = [[t[1] for t in sent[3] if t[2] in ["n", "a", "v", "NOUN", "PROPN", "ADJ", "VERB"]] for sent in sentences_data]
lemmatized_sentences

[['Ἀντιφῶν', 'Σόφιλος', 'εἰμί', 'πατήρ', 'δῆμος', 'Ῥαμνούσιος'],
 ['μαθητεύω',
  'πατήρ',
  'εἰμί',
  'σοφιστής',
  'Ἀλκιβιάδης',
  'παῖς',
  'εἰμί',
  'φοιτάω',
  'δύναμις',
  'λόγος',
  'κτάομαι',
  'νομίζω',
  'οἰκεῖος',
  'φύσις',
  'παραπέμπω',
  'πολιτεύω',
  'διατριβή',
  'συνίστημι',
  'Σωκράτης',
  'φιλόσοφος',
  'διαφέρω',
  'λόγος',
  'διαφορά',
  'Ξενοφῶν',
  'ἱστορέω',
  'Ἀπομνημόνευμα'],
 ['λόγος',
  'δέω',
  'πολίτης',
  'συγγράφω',
  'δικαστήριον',
  'ἀγών',
  'πρῶτος',
  'τρέπω',
  'φημί'],
 ['γίγνομαι',
  'φέρω',
  'δικανικός',
  'λόγος',
  'ἔθος',
  'συγγράφω',
  'εἰμί',
  'Θεμιστοκλῆς',
  'Ἀριστείδης',
  'Περικλῆς',
  'πολύς',
  'ἀφορμή',
  'ἀνάγκη',
  'παρέχω',
  'καιρός'],
 ['ἀσθένεια',
  'ἀπολείπω',
  'συγγράφω',
  'δῆλος',
  'λέγω',
  'συγγραφεύς',
  'προλέγω',
  'ἀνήρ'],
 ['ἔχω',
  'παλαιός',
  'ἀναφέρω',
  'ἀπομνημονεύω',
  'ἰδέα',
  'λόγος',
  'μεταχειρίζω',
  'εὑρίσκω',
  'ἐπιβάλλω',
  'Ἀντιφών',
  'πρεσβύτης',
  'εἰμί',
  'Ἀλκιβιάδης',
  'Κριτίας',
  'Λυσία

In [104]:
def get_row_data(row):
    doc_id = row["doc_id"]
    source = row["source"]
    lemmata_source = row["lemmata_source"]
    try:
        file_path = target_path + doc_id + ".pickle"
        with open(file_path, "rb") as f:
            sentences_data = pickle.load(f)
        sentences = [sent[2] for sent in sentences_data]
        lemmatized_sentences = [[t[1] for t in sent[3] if t[2] in ["n", "a", "v", "NOUN", "PROPN", "ADJ", "VERB"]] for sent in sentences_data]
        source = "glaux1"
        lemmata_source = "glaux1"
    except:
        sentences = None
        lemmatized_sentences = None
    return sentences, lemmatized_sentences, source, lemmata_source

In [105]:
%%time
result = LAGT.apply(lambda row: pd.Series(get_row_data(row)), axis=1)

CPU times: user 16.7 s, sys: 367 ms, total: 17 s
Wall time: 17 s


In [106]:

len(result)

1710

In [107]:
LAGT["glaux1_lemmatized_sentences"] = result[1]

In [108]:
LAGT["sentences_n_lagt3"] = LAGT["lemmatized_sentences"].apply(len)
LAGT["sentences_n_glaux1"] = LAGT["glaux1_lemmatized_sentences"].apply(lambda x: len(x) if isinstance(x, list) else 0)


In [109]:
LAGT.sample(10)

Unnamed: 0,author_id,doc_id,filename,author,title,string,wordcount,source,lemmatized_sentences,lemmata_source,tlg_date,not_before,not_after,date_uncertain,tlg_epithet,provenience,lemmatacount,glaux1_lemmatized_sentences,sentences_n_lagt3,sentences_n_glaux1
876,tlg2137,tlg2137.tlg001,tlg2137.tlg001.1st1K-grc1.xml,Gaudentius,Harmonica introductio,"„Ἀείδω ξυνετοῖσι, θύρας δʼ ἐπίθεσθε βέβηλοι“\n...",6511,1Kgr,"[[ἀειδλώ, ξυνετός], [θύρα, ἐπιτίθημι, βέβηλος,...",grecy,,101.0,500.0,,[],pagan,2583,"[[ἀείδω, συνετός, θύρα, ἐπιτίθημι, βέβηλος, ἁρ...",442,282
771,tlg2018,tlg2018.tlg011,tlg2018.tlg011.1st1K-grc1.xml,Eusebius,Onomasticon,Εὐσεβίου τοῦ Παμφίλου ἐπισκόπου Καισαρείας τῆς...,19324,1Kgr,"[[Εὐσεβίας, Παμφίλος, ἐπίσκοπος, Καισάρεια, Πα...",grecy,A.D. 4,301.0,400.0,False,"[Scriptores Ecclesiastici, Theologici]",christian,8602,,1178,0
177,tlg0057,tlg0057.tlg092,tlg0057.tlg092.1st1K-grc1.xml,Galen,In Hippocratis Aphorismos,"Ὅτι μὲν οὖν οὗτος ὁ λόγος, εἴθ’ εἷς ἀφορισμός ...",89005,1Kgr,"[[οὗτος, λόγος, εἷς, ἀφορισμός, εἰμί, δύο, προ...",glaux,A.D. 2,101.0,200.0,False,[Medici],pagan,44724,"[[βίος, βραχύς, τέχνη, μακρός, καιρός, ὀξύς, π...",4122,4645
1545,tlg0031e,tlg0031.tlg021,tlg0031.tlg021.perseus-grc2.xml,,New Testament - 1 Peter,\n\n ΠΕΤΡΟΣ ἀπόστολος Ἰησοῦ...,1668,perseus,"[[Πέτρος, ἀπόστολος, Ἰησοῦς, Χριστός, ἐκλεκτός...",morphgnt,A.D. 1,1.0,100.0,False,[],christian,872,"[[Πέτρος, ἀπόστολος, Ἰησοῦς, Χριστός, ἐκλεκτός...",95,88
1816,tlg0533,tlg0533.tlg015,tlg0533.tlg015.perseus-grc4.xml,Callimachus,εἰς Δία,Ζηνὸς ἔοι τί κεν ἄλλο παρὰ σπονδῆισιν ἀείδειν ...,703,perseus,"[[Ζεύς, εἰμί, τίς, ἄλλος, σπονδῆισις, ἀείδω, λ...",glaux,4-3 B.C.,-400.0,-201.0,False,[Philologi],pagan,425,"[[Ζεύς, εἰμί, σπονδή, ἀείδω, λωΐων, θεός, μέγα...",62,62
810,tlg2035,tlg2035.tlg131,tlg2035.tlg131.1st1K-grc1.xml,Athanasius,Oratio II contra Arianos,1. ἘΓΩ μὲν ᾤμην τοὺς τῆς Ἀρείου μανίας ὑποκριτ...,31636,1Kgr,"[[οἴομαι, Ἄρειας, μανία, ὑποκριτής, προερέω, ο...",grecy,A.D. 4,301.0,400.0,False,[Theologici],christian,12621,,2768,0
162,tlg0057,tlg0057.tlg073,tlg0057.tlg073.1st1K-grc2.xml,Galen,Quos quibus catharticis medicamentis et quando...,Τοὺϲ ὑγιεινὰ τὰ ϲώματα ἔχονταϲ ἐργῶδεϲ καθαίρε...,2930,1Kgr,"[[ὑγιεινός, σῶμα, ἔχω, ἐργώδης, καθαίρω], [ἰλι...",glaux,A.D. 2,101.0,200.0,False,[Medici],pagan,1009,"[[ὑγιεινός, σῶμα, ἔχω, ἐργώδης, καθαίρω], [ἰλι...",96,96
1043,tlg4016,tlg4016.tlg001,tlg4016.tlg001.1st1K-grc1.xml,Ammonius,In Porphyrii Isagogen Sive Quinque Voces,"D=Laurentianus 10,26 E=Marcianus 225 F=Parisin...",59111,1Kgr,"[[Μέλλω, ἄρχω, φιλόσοφος, λόγος, ἀναγκαῖος, μα...",grecy,A.D. 5,401.0,500.0,False,[Philosophici/-ae],pagan,20199,,4761,0
1799,tlg0284,tlg0284.tlg052,tlg0284.tlg052.perseus-grc2.xml,Aelius Aristides,Πρεσβευτικὸς πρὸς Ἀχιλλέα,"ἄριστε Ἀχιλλεῦ, τὸ μὲν θυμοῦσθαί σε καὶ χαλεπ...",3439,perseus,"[[ἀγαθός, Ἀχιλλεύς, θυμόω, χαλεπαίνω, ὑβρίσθνω...",glaux,A.D. 2,101.0,200.0,False,[Rhetorici],pagan,1701,"[[ἀγαθός, Ἀχιλλεύς, θυμόω, χαλεπαίνω, ὑβρίζω, ...",195,195
1360,tlg0010,tlg0010.tlg024,tlg0010.tlg024.perseus-grc2.xml,Isocrates,To Archidamus,"εἰδώς, ὦ Ἀρχίδαμε, πολλοὺς ὡρμημένους ἐγκωμιάζ...",1143,perseus,"[[οἶδα, Ἀρχίδαμος, πολύς, ὁρμάω, ἐγκωμιάζω, πα...",glaux,5-4 B.C.,-500.0,-301.0,False,[Oratores],pagan,567,"[[οἶδα, Ἀρχίδαμος, πολύς, ὁρμάω, ἐγκωμιάζω, πα...",35,35


In [110]:
LAGT[LAGT["sentences_n_glaux1"]==0]

Unnamed: 0,author_id,doc_id,filename,author,title,string,wordcount,source,lemmatized_sentences,lemmata_source,tlg_date,not_before,not_after,date_uncertain,tlg_epithet,provenience,lemmatacount,glaux1_lemmatized_sentences,sentences_n_lagt3,sentences_n_glaux1
2,ogl0001,ogl0001.ogl001,ogl0001.ogl001.1st1K-grc1.xml,Pinytus,De Epistola Pinyti ad Dionysium,"FRAGMENTUM BEATI PINYTI, CNOSSI IN CRETA EPISC...",180,1Kgr,"[[Πινυτός, ἀντιγράφω, θαυμάζω, ἀποδέχω, Διονύσ...",grecy,,101.0,200.0,,[],christian,34,,3,0
8,tlg0005,tlg0005.tlg003,tlg0005.tlg003.1st1K-grc1.xml,Theocritus,Syrinx,Οὐδενὸς εὐνάτειρα Μακροπτολέμοιο δὲ μάτηρ μαί...,77,1Kgr,"[[οὐδενός, εὐνητήρ], [μακροπτολέμοιο, μήτηρ, μ...",grecy,4-3 B.C.,-400.0,-201.0,False,[Bucolici],pagan,61,,8,0
9,tlg0006,tlg0006.tlg020,tlg0006.tlg020.1st1K-grc1.xml,Euripides,Fragmenta,ποίαν σε φῶμεν γαῖαν ἐκλελοιπότα πόλει ξενοῦσθ...,17708,1Kgr,"[[φημί, γῆ, ἐκλείπω, πόλις, ξενοῦσθαι], [πάτρα...",grecy,5 B.C.,-500.0,-401.0,False,[Tragici],pagan,10277,,2036,0
10,tlg0007,tlg0007.tlg146,tlg0007.tlg146.1st1K-grc1.xml,Plutarch,Παροιμίαι αἷς Ἀλεξανδρεῖς ἐχρῶντο,Οἴκοι τὰ Μιλήσια: ἐπὶ τῶν ὅποι μὴ προςήκει τὴν...,2685,1Kgr,"[[Μιλήσιος], [προςήκω, τρυφή, ἐπιδείκνυμι], [Ἀ...",grecy,A.D. 1-2,1.0,200.0,False,"[Biographi, Philosophici/-ae]",pagan,1488,,297,0
11,tlg0007,tlg0007.tlg147,tlg0007.tlg147.1st1K-grc1.xml,Plutarch,Ἐκλογὴ περὶ τῶν ἀδυνάτων,Κατὰ πετρῶν σπείρεις. Πλίνθον πλύνεις. Δικτύῳ ...,143,1Kgr,"[[πέτρα, σπείρω], [Πλίνθος, πλύνω, Δίκτυον, ἄν...",grecy,A.D. 1-2,1.0,200.0,False,"[Biographi, Philosophici/-ae]",pagan,125,,33,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1951,tlg2003,tlg2003.tlg017,tlg2003.tlg017.perseus-grc1.xml,Julian the Emperor,Contra Galilaeos,\n Καλῶς ἔχειν ἔμοιγε φαίνεται τὰς αἰτίας ἐκθέ...,10662,perseus,"[[ἔχω, φαίνω, αἰτία, ἐκθέσθαι, ἄνθρωπος, ἐπιέν...",grecy,A.D. 4,301.0,400.0,False,[Philosophici/-ae],pagan,4889,,857,0
1952,tlg2018,tlg2018.tlg002,tlg2018.tlg002.perseus-grc1.xml,Eusebius of Caesarea,Historia ecclesiastica,\nΑ\nΤάδε ἡ πρώτη περιέχει βίβλος τῆς\nἘκκλησι...,103161,perseus,"[[ὅδε, πρῶτος, περιέχω, βίβλος, Ἐκκλησιαστικός...",grecy,A.D. 4,301.0,400.0,False,"[Scriptores Ecclesiastici, Theologici]",christian,49216,,6089,0
1953,tlg2040,tlg2040.tlg002,tlg2040.tlg002.perseus-grc1.xml,"Basil, Saint, Bishop of Caesarea","To Young Men, On How They Might Dervice Profit...",*polla/ me ta\ parakalou=nta/ e)sti cumbouleu=...,4969,perseus,[],grecy,A.D. 4,301.0,400.0,False,[Theologici],christian,0,,0,0
1954,tlg2040,tlg2040.tlg004,tlg2040.tlg004.perseus-grc1.xml,"Basil, Saint, Bishop of Caesarea",Epistulae,Ἀπειρηκότα με ἤδη πρὸς τὰς παρὰ τῆς λεγομένηςλ...,140861,perseus,"[[Ἀπειρηκότα, λεγομένηςλεγομένη, τύχη, ἐπηρεία...",grecy,A.D. 4,301.0,400.0,False,[Theologici],christian,62724,,9581,0


In [77]:
LAGT["sentences_n_glaux1"].apply(lambda x: x  0).sum()

np.int64(1709)

In [52]:
%%time
result = LAGT.apply(lambda row: pd.Series(get_row_data(row)), axis=1)
LAGT["sentences"] = result[0]  # Extract sentences
LAGT["lemmatized_sentences"] = result[1]
LAGT["source"] = result[2]
LAGT["lemmata_source"] = result[3]

CPU times: user 15.3 s, sys: 979 ms, total: 16.3 s
Wall time: 18.1 s


In [53]:
LAGT["lemmatized_sentences"].notnull().sum()

1696

### Adding additional files from glaux

In [54]:
not_lagt = [glaux_fn for glaux_fn in os.listdir(target_path) if glaux_fn.rpartition(".")[0] not in list(LAGT["doc_id"])]
not_lagt[:10]

[]

In [55]:
doc_id = "tlg0497.tlg005"
url = "http://data.perseus.org/catalog/urn:cts:greekLit:{}".format(doc_id)
soup = BeautifulSoup(requests.get(url).text)#

In [38]:
def get_dd_content(dt_text):
    dt_tag = soup.find('dt', string=dt_text)
    if dt_tag is not None:
        dd_tag = dt_tag.find_next_sibling('dd')
        if dd_tag is not None:
            return dd_tag.get_text(strip=True)  # Extract text and strip whitespace
    return None


work_title = get_dd_content("Work title:")
author = get_dd_content("Author:")

In [39]:
work_title


'Epigramma'

In [40]:
not_lagt_data = []
for fn in not_lagt:
    doc_id = fn.rpartition(".")[0]
    author_id = doc_id.partition(".")[0]
    author = ""
    title = ""
    try:
        url = "http://data.perseus.org/catalog/urn:cts:greekLit:{}".format(doc_id)
        soup = BeautifulSoup(requests.get(url).text)#
        try:
            author = get_dd_content("Author:")
            title = get_dd_content("Work title:")
        except:
            url = "http://data.perseus.org/catalog/urn:cts:greekLit:{}".format(author_id)
            soup = BeautifulSoup(requests.get(url).text)
            try:
                author = get_dd_content("Author:")
            except:
                pass
    except:
        pass
    file_path = target_path + doc_id + ".pickle"
    with open(file_path, "rb") as f:
        sentences_data = pickle.load(f)
    sentences = [sent[2] for sent in sentences_data]
    lemmatized_sentences = [[t[1] for t in sent[3] if t[2] in ["n", "a", "v"]] for sent in sentences_data]
    not_lagt_data.append({
        "author_id" : author_id,
        "doc_id" : doc_id,
        "author" : author,
        "title" : title,
        "sentences" : sentences,
        "lemmatized_sentences" : lemmatized_sentences,
        "source" : "glaux1",
        "lemmata_source" : "glaux1"}
    )

In [41]:
LAGT_glaux1 = pd.DataFrame(not_lagt_data)
len(LAGT_glaux1)

229

In [42]:
len(LAGT)

1710

In [56]:
LAGT = pd.concat([LAGT, LAGT_glaux1], ignore_index=True)

In [57]:
LAGT.sample(10)

Unnamed: 0,author_id,doc_id,filename,author,title,string,wordcount,source,lemmatized_sentences,lemmata_source,tlg_date,not_before,not_after,date_uncertain,tlg_epithet,provenience,lemmatacount,sentences
25,tlg0018,tlg0018.tlg020,tlg0018.tlg020.1st1K-grc1.xml,Philo Judaeus,De Abrahamo,Τῶν ἱερῶν νόμων ἐν πέντε βίβλοις ἀναγραφέντων...,25197.0,glaux1,"[[ἱερός, νόμος, βίβλος, ἀναγράφω, πρῶτος, καλέ...",glaux1,1 B.C.-A.D. 1,-100.0,100.0,False,[Philosophici/-ae],jewish,9790.0,[τῶν ἱερῶν νόμων ἐν πέντε βίβλοις ἀναγραφέντων...
1396,tlg0059,tlg0059.tlg036,tlg0059.tlg036.perseus-grc2.xml,Plato,Epistles,\n\n\nΠλάτων Διονυσίῳ εὖ πράττειν.διατρίψας ἐγ...,16876.0,glaux1,"[[Πλάτων, Διονύσιος, πράσσω], [διατρίβω, χρόνο...",glaux1,5-4 B.C.,-500.0,-301.0,False,[Philosophici/-ae],pagan,8597.0,"[Πλάτων Διονυσίῳ εὖ πράττειν. E, διατρίψας ἐγὼ..."
1560,tlg0284,tlg0284.tlg052,tlg0284.tlg052.perseus-grc2.xml,Aelius Aristides,Πρεσβευτικὸς πρὸς Ἀχιλλέα,"ἄριστε Ἀχιλλεῦ, τὸ μὲν θυμοῦσθαί σε καὶ χαλεπ...",3439.0,glaux1,"[[ἀγαθός, Ἀχιλλεύς, θυμόω, χαλεπαίνω, ὑβρίζω, ...",glaux1,A.D. 2,101.0,200.0,False,[Rhetorici],pagan,1701.0,"[ἄριστε Ἀχιλλεῦ, τὸ μὲν θυμοῦσθαί σε καὶ χαλεπ..."
214,tlg0087,tlg0087.tlg005,tlg0087.tlg005.1st1K-grc1.xml,Herodianus,Περὶ πνευμάτων,"1. Theognost. Cr. II 19, 30, Bekk. An. 1428: Π...",72.0,glaux1,"[[], [], [], [ζ, ξ, ψ, υ, ἀρχή, λέξις, εὑρίσκω...",glaux1,A.D. 2,101.0,200.0,False,"[Grammatici, Rhetorici]",pagan,23.0,"[., . ., , , . ., : Πρό τόῦ ζ ἤ ξ ἤ ψ τό υ κατ..."
205,tlg0086,tlg0086.tlg044,tlg0086.tlg044.1st1K-grc1.xml,Aristotle,Topica,"Ἡ ΜΕΝ πρόθεσις τῆς πραγματείας μέθοδον εὑρεῖν,...",44116.0,glaux1,"[[πρόθεσις, πραγματεία, μέθοδος, εὑρίσκω, δύνα...",glaux1,4 B.C.,-400.0,-301.0,False,[Philosophici/-ae],pagan,20961.0,[ἡ μὲν πρόθεσις τῆς πραγματείας μέθοδον εὑρεῖν...
1453,tlg0062,tlg0062.tlg056,tlg0062.tlg056.perseus-grc3.xml,Lucian of Samosata,Ἡρόδοτος ἢ Ἀετίων,Ἡροδότου εἴθε μὲν καὶ τὰ ἄλλα μιμήσασθαι δυνα...,961.0,glaux1,"[[Ἡρόδοτος, μιμέομαι, δυνατός, εἰμί], [φημί, π...",glaux1,A.D. 2,101.0,200.0,False,[Sophistae],pagan,500.0,[Ἡροδότου εἴθε μὲν καὶ τὰ ἄλλα μιμήσασθαι δυνα...
470,tlg0656,tlg0656.tlg001,tlg0656.tlg001.1st1K-grc1.xml,Dioscurides Pedianus,De materia medica,Πολλῶν οὐ μόνον ἀρχαίων ἀλλὰ καὶ νέων συνταξαμ...,269358.0,glaux1,"[[πολύς, ἀρχαῖος, νέος, συντάσσω, φάρμακον, σκ...",glaux1,A.D. 1,1.0,100.0,False,[Medici],pagan,87941.0,[πολλῶν οὐ μόνον ἀρχαίων ἀλλὰ καὶ νέων συνταξα...
659,tlg2022,tlg2022.tlg010,tlg2022.tlg010.1st1K-grc1.xml,Gregorius Nazianzenus,De Filio (Orat. 30),Ἐπειδή σοι τὰς μὲν ἐκ τῶν λογισμῶν στροφὰς καὶ...,15916.0,glaux1,"[[λογισμός, στροφή, πλοξ, διεσείω, δύναμις, πν...",glaux1,A.D. 4,301.0,400.0,False,[Theologici],christian,2866.0,[Ἐπειδή σοι τάς μέν ἐκ τῶν λογισμῶν στροφάς κα...
1842,tlg0242,tlg0242.tlg003,,Asius Samius 6. Jh. v. Chr,Fragmenta Epica,,,glaux1,"[[Ἀντιόπη, τίκτω, Ζῆθος, Ἀμφίων, δῖος, Ἀσωπός,...",glaux1,,,,,,,,[Ἀντιόπη δ’ ἔτεκε Ζῆθον κα- κἀμφίονα δῖον Ἀσωπ...
647,tlg2018,tlg2018.tlg011,tlg2018.tlg011.1st1K-grc1.xml,Eusebius,Onomasticon,Εὐσεβίου τοῦ Παμφίλου ἐπισκόπου Καισαρείας τῆς...,19324.0,glaux1,"[[εὐσεβῖος, παμφίλου, ἐπισκόπης, καισαρεία, πα...",glaux1,A.D. 4,301.0,400.0,False,"[Scriptores Ecclesiastici, Theologici]",christian,8602.0,[Εὐσεβίου τοῦ Παμφίλου ἐπισκόπου Καισαρείας τῆ...


In [58]:
len(LAGT)

2168

In [60]:
LAGT = LAGT.drop_duplicates(subset=['doc_id'], keep='first')

In [61]:
LAGT.to_parquet("../data/large_files/LAGT_glauxed.parquet", engine="pyarrow")