# Requirements

In [17]:
import pandas as pd
import os
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import re
import pickle
from collections import defaultdict
import requests
import unicodedata

In [2]:
# load LAGT v 3.0, as we will use it as a source for metadata
LAGT = pd.read_parquet("../data/large_files/LAGT_v3-0.parquet")

In [3]:
LAGT.head(5)

Unnamed: 0,author_id,doc_id,filename,author,title,string,wordcount,source,lemmatized_sentences,lemmata_source,tlg_date,not_before,not_after,date_uncertain,tlg_epithet,provenience,lemmatacount
2,ogl0001,ogl0001.ogl001,ogl0001.ogl001.1st1K-grc1.xml,Pinytus,De Epistola Pinyti ad Dionysium,"FRAGMENTUM BEATI PINYTI, CNOSSI IN CRETA EPISC...",180,1Kgr,"[[Πινυτός, ἀντιγράφω, θαυμάζω, ἀποδέχω, Διονύσ...",grecy,,101.0,200.0,,[],christian,34
8,tlg0005,tlg0005.tlg003,tlg0005.tlg003.1st1K-grc1.xml,Theocritus,Syrinx,Οὐδενὸς εὐνάτειρα Μακροπτολέμοιο δὲ μάτηρ μαί...,77,1Kgr,"[[οὐδενός, εὐνητήρ], [μακροπτολέμοιο, μήτηρ, μ...",grecy,4-3 B.C.,-400.0,-201.0,False,[Bucolici],pagan,61
9,tlg0006,tlg0006.tlg020,tlg0006.tlg020.1st1K-grc1.xml,Euripides,Fragmenta,ποίαν σε φῶμεν γαῖαν ἐκλελοιπότα πόλει ξενοῦσθ...,17708,1Kgr,"[[φημί, γῆ, ἐκλείπω, πόλις, ξενοῦσθαι], [πάτρα...",grecy,5 B.C.,-500.0,-401.0,False,[Tragici],pagan,10277
10,tlg0007,tlg0007.tlg146,tlg0007.tlg146.1st1K-grc1.xml,Plutarch,Παροιμίαι αἷς Ἀλεξανδρεῖς ἐχρῶντο,Οἴκοι τὰ Μιλήσια: ἐπὶ τῶν ὅποι μὴ προςήκει τὴν...,2685,1Kgr,"[[Μιλήσιος], [προςήκω, τρυφή, ἐπιδείκνυμι], [Ἀ...",grecy,A.D. 1-2,1.0,200.0,False,"[Biographi, Philosophici/-ae]",pagan,1488
11,tlg0007,tlg0007.tlg147,tlg0007.tlg147.1st1K-grc1.xml,Plutarch,Ἐκλογὴ περὶ τῶν ἀδυνάτων,Κατὰ πετρῶν σπείρεις. Πλίνθον πλύνεις. Δικτύῳ ...,143,1Kgr,"[[πέτρα, σπείρω], [Πλίνθος, πλύνω, Δίκτυον, ἄν...",grecy,A.D. 1-2,1.0,200.0,False,"[Biographi, Philosophici/-ae]",pagan,125


In [4]:
# currently from Glaux:
sum(LAGT["lemmata_source"] == "glaux")

835

### Load newest local Glaux

In [5]:
source_dir = "../../glaux/xml/"
glaux_filenames = os.listdir(source_dir)
glaux_filenames[:10]

['0057-094.xml',
 '0632-004.xml',
 '2042-010.xml',
 '0658-002.xml',
 '0010-009.xml',
 '1342-002.xml',
 '0057-057.xml',
 '0010-021.xml',
 '0057-043.xml',
 '0007-049.xml']

In [6]:
glaux_tlgs = [re.sub(r'(\d{4})-(\d{3}).xml', r'tlg\1.tlg\2', fn) for fn in glaux_filenames]
glaux_tlgs[:10]

['tlg0057.tlg094',
 'tlg0632.tlg004',
 'tlg2042.tlg010',
 'tlg0658.tlg002',
 'tlg0010.tlg009',
 'tlg1342.tlg002',
 'tlg0057.tlg057',
 'tlg0010.tlg021',
 'tlg0057.tlg043',
 'tlg0007.tlg049']

In [7]:
LAGT["doc_id"].isin(glaux_tlgs).sum()

1173

In [8]:
not_lagt = [glaux_fn for glaux_fn in glaux_filenames if re.sub(r'(\d{4})-(\d{3}).xml', r'tlg\1.tlg\2', glaux_fn) not in list(LAGT["doc_id"])]
not_lagt[:10]

['0632-004.xml',
 '0658-002.xml',
 '1342-002.xml',
 '0336-003.xml',
 '1772-001.xml',
 '1223-001.xml',
 '0007-082a.xml',
 '0632-002a.xml',
 '0005-005.xml',
 '0261-003.xml']

In [9]:
len(not_lagt)

248

In [10]:
with open(source_dir  + not_lagt[0], "r") as f:
    soup = BeautifulSoup(f.read())

In [11]:
print(soup.prettify())

<treebank version="2" xml:lang="grc">
 <sentence analysis="auto" document_id="0632-004" id="1" struct_id="178136">
  <word div_page="114" form="εἰ" head="102947308" id="102947284" lemma="εἰ" postag="c--------" relation="AuxC">
  </word>
  <word div_page="114" form="μὲν" head="102947308" id="102947285" lemma="μέν" postag="g--------" relation="AuxY">
  </word>
  <word div_page="114" form="εὑρίσκεις" head="102947284" id="102947286" lemma="εὑρίσκω" postag="v2spia---" relation="ADV">
  </word>
  <word div_page="114" form="τοὺς" head="102947288" id="102947287" lemma="ὁ" postag="l-p---ma-" relation="ATR">
  </word>
  <word div_page="114" form="ἀστέρας" head="102947286" id="102947288" lemma="ἀστήρ" postag="n-p---ma-" relation="OBJ">
  </word>
  <word div_page="114" form="τοὺς" head="102947290" id="102947289" lemma="ὁ" postag="l-p---ma-" relation="ATR">
  </word>
  <word div_page="114" form="ἔχοντας" head="102947288" id="102947290" lemma="ἔχω" postag="v-pppama-" relation="AT

### Extract Glaux sentence level data

In [12]:
target_path = "../data/large_files/sents_data/"
try:
    os.mkdir(target_path)
except:
    pass

In [13]:
# Normalize a string to a specified Unicode form
def norm(text):
    return unicodedata.normalize('NFC', text)

In [24]:
def from_filename_to_sentence_data(fn):
    with open(source_dir    + fn, "r") as f:
        soup = BeautifulSoup(f.read())
    sentences_data = []
    for sent_n, sent in enumerate(soup.find_all("sentence")):
        sentence = ""
        sent_data = []
        start_index = 0
        for n, w in enumerate(sent.find_all("word")):
            token = norm(w["form"])
            if n == 0 or w.get("relation") in ["AuxX", "AuxK", "PUNCT"]:
                start_index = len(sentence)
                sentence += token
            else:
                start_index = len(sentence) + 1
                sentence += " " + token
        
            end_index = start_index + len(token)
            word_index = (start_index, end_index)
        
            try:
                sent_data.append((token, norm(w["lemma"]), w["postag"][0], word_index))
            except:
                sent_data.append((token, token, None, word_index))
        sentences_data.append((fn.partition(".")[0], sent_n, sentence, sent_data))
    target_fn = re.sub(r'(\d{4})-(\d{3}).xml', r'tlg\1.tlg\2', fn) + ".pickle"
    with open(target_path + target_fn, "wb") as f:
        pickle.dump(sentences_data, f)

In [25]:
%%time
for fn in glaux_filenames:
    from_filename_to_sentence_data(fn)

CPU times: user 10min 5s, sys: 6.97 s, total: 10min 12s
Wall time: 10min 28s


In [26]:
# ok, this approach does not work for all the files....
files_to_merge = [fn for fn in os.listdir(target_path) if "xml" in fn]
files_to_merge[:10]

['0632-002a.xml.pickle',
 '0541-042b.xml.pickle',
 '0627-024a.xml.pickle',
 '0007-051b.xml.pickle',
 '0007-082b.xml.pickle',
 '0632-002d.xml.pickle',
 '0632-002c.xml.pickle',
 '0093-010b.xml.pickle',
 '0096-002b.xml.pickle',
 '0007-052b.xml.pickle']

In [27]:
# let's rename and merge these files now.
file_groups = defaultdict(list)
pattern = re.compile(r'(\d{4})-(\d{3})[a-z]')

for fn in files_to_merge:
    match = pattern.match(fn)
    if match:
        doc_id = f'tlg{match.group(1)}.tlg{match.group(2)}'
        file_groups[doc_id].append(fn)

# Step 3: Iterate over groups and merge files
for doc_id, files in file_groups.items():
    files.sort()
    merged_list = []
    for fn in files:
        file_path = os.path.join(target_path, fn)
        with open(file_path, "rb") as f:
            data = pickle.load(f)
            merged_list.extend(data)

    # Step 4: Save the merged list
    target_fn = os.path.join(target_path, doc_id + ".pickle")
    with open(target_fn, "wb") as f:
        pickle.dump(merged_list, f)

In [28]:
for fn in files_to_merge:
    os.remove(os.path.join(target_path, fn))

### Add current glaux data to LAGT

In [45]:
# open a test file to explore its structure
file_path = target_path + "tlg0732.tlg010.pickle"
with open(file_path, "rb") as f:
    sentences_data = pickle.load(f)
sentences_data[:5]

[('tlg0732.tlg010',
  0,
  'Ἡ μέν πρόθεσις ἡμῖν, περί ψυχῆς εἰπεῖν τῆς τοῦ ἐν γενέσει τε καί φθορᾷ σώματος',
  [('Ἡ', 'ὁ', 'DET', (0, 1)),
   ('μέν', 'μέν', 'ADV', (2, 5)),
   ('πρόθεσις', 'πρόθεσις', 'NOUN', (6, 14)),
   ('ἡμῖν', 'ἡμεῖς', 'PRON', (15, 19)),
   (',', ',', 'PUNCT', (19, 20)),
   ('περί', 'περί', 'ADP', (21, 25)),
   ('ψυχῆς', 'ψυχή', 'NOUN', (26, 31)),
   ('εἰπεῖν', 'λέγω', 'VERB', (32, 38)),
   ('τῆς', 'ὁ', 'DET', (39, 42)),
   ('τοῦ', 'ὁ', 'DET', (43, 46)),
   ('ἐν', 'ἐν', 'ADP', (47, 49)),
   ('γενέσει', 'γένεσις', 'NOUN', (50, 57)),
   ('τε', 'τε', 'CCONJ', (58, 60)),
   ('καί', 'καί', 'CCONJ', (61, 64)),
   ('φθορᾷ', 'φθορά', 'NOUN', (65, 70)),
   ('σώματος', 'σῶμα', 'NOUN', (71, 78))]),
 ('tlg0732.tlg010', 1, ',', [(',', ',', 'PUNCT', (0, 1))]),
 ('tlg0732.tlg010',
  2,
  'τίς τέ ἐστιν αὐτῆς ἡ οὐσία καί τίνες αἱ δυνάμεις καί πόσαι, καί τίς αὐτῶν ἡ πρός ἀλλήλας διαφορά.',
  [('τίς', 'τίς', 'PRON', (0, 3)),
   ('τέ', 'τε', 'CCONJ', (4, 6)),
   ('ἐστιν', 'εἰμί', 'AUX

In [47]:
[sent[3] for sent in sentences_data][:3]

[[('Ἡ', 'ὁ', 'DET', (0, 1)),
  ('μέν', 'μέν', 'ADV', (2, 5)),
  ('πρόθεσις', 'πρόθεσις', 'NOUN', (6, 14)),
  ('ἡμῖν', 'ἡμεῖς', 'PRON', (15, 19)),
  (',', ',', 'PUNCT', (19, 20)),
  ('περί', 'περί', 'ADP', (21, 25)),
  ('ψυχῆς', 'ψυχή', 'NOUN', (26, 31)),
  ('εἰπεῖν', 'λέγω', 'VERB', (32, 38)),
  ('τῆς', 'ὁ', 'DET', (39, 42)),
  ('τοῦ', 'ὁ', 'DET', (43, 46)),
  ('ἐν', 'ἐν', 'ADP', (47, 49)),
  ('γενέσει', 'γένεσις', 'NOUN', (50, 57)),
  ('τε', 'τε', 'CCONJ', (58, 60)),
  ('καί', 'καί', 'CCONJ', (61, 64)),
  ('φθορᾷ', 'φθορά', 'NOUN', (65, 70)),
  ('σώματος', 'σῶμα', 'NOUN', (71, 78))],
 [(',', ',', 'PUNCT', (0, 1))],
 [('τίς', 'τίς', 'PRON', (0, 3)),
  ('τέ', 'τε', 'CCONJ', (4, 6)),
  ('ἐστιν', 'εἰμί', 'AUX', (7, 12)),
  ('αὐτῆς', 'αὐτός', 'PRON', (13, 18)),
  ('ἡ', 'ὁ', 'DET', (19, 20)),
  ('οὐσία', 'οὐσία', 'NOUN', (21, 26)),
  ('καί', 'καί', 'CCONJ', (27, 30)),
  ('τίνες', 'τίς', 'PRON', (31, 36)),
  ('αἱ', 'ὁ', 'DET', (37, 39)),
  ('δυνάμεις', 'δύναμις', 'NOUN', (40, 48)),
  ('καί',

In [48]:
lemmatized_sentences = [[t[1] for t in sent[3] if t[2] in ["n", "a", "v", "NOUN", "PROPN", "ADJ", "VERB"]] for sent in sentences_data]
lemmatized_sentences

[['πρόθεσις', 'ψυχή', 'λέγω', 'γένεσις', 'φθορά', 'σῶμα'],
 [],
 ['οὐσία', 'δύναμις', 'διαφορά'],
 ['δεῖ',
  'ἄλλος',
  'ἅπας',
  'πείθω',
  'προστάσσω',
  'θεός',
  'προστάττω',
  'προκηρύττομαι',
  'γιγνώσκω',
  'πύθιος',
  'θεός',
  'μόνος',
  'θεός',
  'προγινώσκω',
  'μέλλω',
  'πιστεύω',
  'προλέγοντος',
  'προαγορεύω'],
 ['γνῶσις', 'ἕκαστος', 'περιεσομένος', 'φύσις', 'βίος'],
 [],
 ['γνῶσις', '—', 'παραμυθία', ']'],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['ἀλέξανδρος', 'ἀρροδισής', 'σχολικέω', 'ψυχή', 'ἀφροδισέως'],
 [],
 [],
 ['ἀλέξανδρος', 'ἀφροδισέως', 'τολύς', 'αὐτοκράτωρ', 'μείρομαι'],
 [],
 ['ἀφροδισιέως'],
 [],
 [],
 ['γίγνομαι'],
 ['προλέγω'],
 ['λόγος', 'ψυχή', 'γένεσις', 'φθορά', 'φθορά', 'προκηρῦττεμαι'],
 [],
 ['ἑκάστω', 'γνῶσις', 'ψυχή', 'ἄνθρωπος', 'ἄνθρωπος'],
 ['ἀναγκαῖος',
  'βούλομαι',
  'πείθω',
  'θεός',
  'φύσις',
  'ζῶ',
  'βίος',
  'ψυχή',
  'πρῶτος',
  'διαλαμβάνω',
  'οὗτος',
  'γιγνώσκω'],
 ['ἄλλος',
  'ἀριστοτέλης',
  'πρεσβε

In [51]:
def get_row_data(row):
    doc_id = row["doc_id"]
    source = row["source"]
    lemmata_source = row["lemmata_source"]
    try:
        file_path = target_path + doc_id + ".pickle"
        with open(file_path, "rb") as f:
            sentences_data = pickle.load(f)
        sentences = [sent[2] for sent in sentences_data]
        lemmatized_sentences = [[t[1] for t in sent[3] if t[2] in ["n", "a", "v", "NOUN", "PROPN", "ADJ", "VERB"]] for sent in sentences_data]
        source = "glaux1"
        lemmata_source = "glaux1"
    except:
        sentences = None
        lemmatized_sentences = None
    return sentences, lemmatized_sentences, source, lemmata_source

In [52]:
%%time
result = LAGT.apply(lambda row: pd.Series(get_row_data(row)), axis=1)
LAGT["sentences"] = result[0]  # Extract sentences
LAGT["lemmatized_sentences"] = result[1]
LAGT["source"] = result[2]
LAGT["lemmata_source"] = result[3]

CPU times: user 15.3 s, sys: 979 ms, total: 16.3 s
Wall time: 18.1 s


In [53]:
LAGT["lemmatized_sentences"].notnull().sum()

1696

### Adding additional files from glaux

In [54]:
not_lagt = [glaux_fn for glaux_fn in os.listdir(target_path) if glaux_fn.rpartition(".")[0] not in list(LAGT["doc_id"])]
not_lagt[:10]

[]

In [55]:
doc_id = "tlg0497.tlg005"
url = "http://data.perseus.org/catalog/urn:cts:greekLit:{}".format(doc_id)
soup = BeautifulSoup(requests.get(url).text)#

In [38]:
def get_dd_content(dt_text):
    dt_tag = soup.find('dt', string=dt_text)
    if dt_tag is not None:
        dd_tag = dt_tag.find_next_sibling('dd')
        if dd_tag is not None:
            return dd_tag.get_text(strip=True)  # Extract text and strip whitespace
    return None


work_title = get_dd_content("Work title:")
author = get_dd_content("Author:")

In [39]:
work_title


'Epigramma'

In [40]:
not_lagt_data = []
for fn in not_lagt:
    doc_id = fn.rpartition(".")[0]
    author_id = doc_id.partition(".")[0]
    author = ""
    title = ""
    try:
        url = "http://data.perseus.org/catalog/urn:cts:greekLit:{}".format(doc_id)
        soup = BeautifulSoup(requests.get(url).text)#
        try:
            author = get_dd_content("Author:")
            title = get_dd_content("Work title:")
        except:
            url = "http://data.perseus.org/catalog/urn:cts:greekLit:{}".format(author_id)
            soup = BeautifulSoup(requests.get(url).text)
            try:
                author = get_dd_content("Author:")
            except:
                pass
    except:
        pass
    file_path = target_path + doc_id + ".pickle"
    with open(file_path, "rb") as f:
        sentences_data = pickle.load(f)
    sentences = [sent[2] for sent in sentences_data]
    lemmatized_sentences = [[t[1] for t in sent[3] if t[2] in ["n", "a", "v"]] for sent in sentences_data]
    not_lagt_data.append({
        "author_id" : author_id,
        "doc_id" : doc_id,
        "author" : author,
        "title" : title,
        "sentences" : sentences,
        "lemmatized_sentences" : lemmatized_sentences,
        "source" : "glaux1",
        "lemmata_source" : "glaux1"}
    )

In [41]:
LAGT_glaux1 = pd.DataFrame(not_lagt_data)
len(LAGT_glaux1)

229

In [42]:
len(LAGT)

1710

In [56]:
LAGT = pd.concat([LAGT, LAGT_glaux1], ignore_index=True)

In [57]:
LAGT.sample(10)

Unnamed: 0,author_id,doc_id,filename,author,title,string,wordcount,source,lemmatized_sentences,lemmata_source,tlg_date,not_before,not_after,date_uncertain,tlg_epithet,provenience,lemmatacount,sentences
25,tlg0018,tlg0018.tlg020,tlg0018.tlg020.1st1K-grc1.xml,Philo Judaeus,De Abrahamo,Τῶν ἱερῶν νόμων ἐν πέντε βίβλοις ἀναγραφέντων...,25197.0,glaux1,"[[ἱερός, νόμος, βίβλος, ἀναγράφω, πρῶτος, καλέ...",glaux1,1 B.C.-A.D. 1,-100.0,100.0,False,[Philosophici/-ae],jewish,9790.0,[τῶν ἱερῶν νόμων ἐν πέντε βίβλοις ἀναγραφέντων...
1396,tlg0059,tlg0059.tlg036,tlg0059.tlg036.perseus-grc2.xml,Plato,Epistles,\n\n\nΠλάτων Διονυσίῳ εὖ πράττειν.διατρίψας ἐγ...,16876.0,glaux1,"[[Πλάτων, Διονύσιος, πράσσω], [διατρίβω, χρόνο...",glaux1,5-4 B.C.,-500.0,-301.0,False,[Philosophici/-ae],pagan,8597.0,"[Πλάτων Διονυσίῳ εὖ πράττειν. E, διατρίψας ἐγὼ..."
1560,tlg0284,tlg0284.tlg052,tlg0284.tlg052.perseus-grc2.xml,Aelius Aristides,Πρεσβευτικὸς πρὸς Ἀχιλλέα,"ἄριστε Ἀχιλλεῦ, τὸ μὲν θυμοῦσθαί σε καὶ χαλεπ...",3439.0,glaux1,"[[ἀγαθός, Ἀχιλλεύς, θυμόω, χαλεπαίνω, ὑβρίζω, ...",glaux1,A.D. 2,101.0,200.0,False,[Rhetorici],pagan,1701.0,"[ἄριστε Ἀχιλλεῦ, τὸ μὲν θυμοῦσθαί σε καὶ χαλεπ..."
214,tlg0087,tlg0087.tlg005,tlg0087.tlg005.1st1K-grc1.xml,Herodianus,Περὶ πνευμάτων,"1. Theognost. Cr. II 19, 30, Bekk. An. 1428: Π...",72.0,glaux1,"[[], [], [], [ζ, ξ, ψ, υ, ἀρχή, λέξις, εὑρίσκω...",glaux1,A.D. 2,101.0,200.0,False,"[Grammatici, Rhetorici]",pagan,23.0,"[., . ., , , . ., : Πρό τόῦ ζ ἤ ξ ἤ ψ τό υ κατ..."
205,tlg0086,tlg0086.tlg044,tlg0086.tlg044.1st1K-grc1.xml,Aristotle,Topica,"Ἡ ΜΕΝ πρόθεσις τῆς πραγματείας μέθοδον εὑρεῖν,...",44116.0,glaux1,"[[πρόθεσις, πραγματεία, μέθοδος, εὑρίσκω, δύνα...",glaux1,4 B.C.,-400.0,-301.0,False,[Philosophici/-ae],pagan,20961.0,[ἡ μὲν πρόθεσις τῆς πραγματείας μέθοδον εὑρεῖν...
1453,tlg0062,tlg0062.tlg056,tlg0062.tlg056.perseus-grc3.xml,Lucian of Samosata,Ἡρόδοτος ἢ Ἀετίων,Ἡροδότου εἴθε μὲν καὶ τὰ ἄλλα μιμήσασθαι δυνα...,961.0,glaux1,"[[Ἡρόδοτος, μιμέομαι, δυνατός, εἰμί], [φημί, π...",glaux1,A.D. 2,101.0,200.0,False,[Sophistae],pagan,500.0,[Ἡροδότου εἴθε μὲν καὶ τὰ ἄλλα μιμήσασθαι δυνα...
470,tlg0656,tlg0656.tlg001,tlg0656.tlg001.1st1K-grc1.xml,Dioscurides Pedianus,De materia medica,Πολλῶν οὐ μόνον ἀρχαίων ἀλλὰ καὶ νέων συνταξαμ...,269358.0,glaux1,"[[πολύς, ἀρχαῖος, νέος, συντάσσω, φάρμακον, σκ...",glaux1,A.D. 1,1.0,100.0,False,[Medici],pagan,87941.0,[πολλῶν οὐ μόνον ἀρχαίων ἀλλὰ καὶ νέων συνταξα...
659,tlg2022,tlg2022.tlg010,tlg2022.tlg010.1st1K-grc1.xml,Gregorius Nazianzenus,De Filio (Orat. 30),Ἐπειδή σοι τὰς μὲν ἐκ τῶν λογισμῶν στροφὰς καὶ...,15916.0,glaux1,"[[λογισμός, στροφή, πλοξ, διεσείω, δύναμις, πν...",glaux1,A.D. 4,301.0,400.0,False,[Theologici],christian,2866.0,[Ἐπειδή σοι τάς μέν ἐκ τῶν λογισμῶν στροφάς κα...
1842,tlg0242,tlg0242.tlg003,,Asius Samius 6. Jh. v. Chr,Fragmenta Epica,,,glaux1,"[[Ἀντιόπη, τίκτω, Ζῆθος, Ἀμφίων, δῖος, Ἀσωπός,...",glaux1,,,,,,,,[Ἀντιόπη δ’ ἔτεκε Ζῆθον κα- κἀμφίονα δῖον Ἀσωπ...
647,tlg2018,tlg2018.tlg011,tlg2018.tlg011.1st1K-grc1.xml,Eusebius,Onomasticon,Εὐσεβίου τοῦ Παμφίλου ἐπισκόπου Καισαρείας τῆς...,19324.0,glaux1,"[[εὐσεβῖος, παμφίλου, ἐπισκόπης, καισαρεία, πα...",glaux1,A.D. 4,301.0,400.0,False,"[Scriptores Ecclesiastici, Theologici]",christian,8602.0,[Εὐσεβίου τοῦ Παμφίλου ἐπισκόπου Καισαρείας τῆ...


In [58]:
len(LAGT)

2168

In [60]:
LAGT = LAGT.drop_duplicates(subset=['doc_id'], keep='first')

In [61]:
LAGT.to_parquet("../data/large_files/LAGT_glauxed.parquet", engine="pyarrow")