In [2]:
import numpy as np
import pandas as pd
import os
from bs4 import BeautifulSoup
import json
import pickle
import re
from nltk.tokenize import word_tokenize
import gspread
from gspread_dataframe import get_as_dataframe, set_with_dataframe
from google.oauth2 import service_account # based on google-auth library

In [3]:
pd.set_option('display.max_columns', None)

In [4]:
file_data = json.load(open(os.path.expanduser("../../../ServiceAccountsKey.json")))
# (2) transform the content into crendentials object
credentials = service_account.Credentials.from_service_account_info(file_data)
# (3) specify your usage of the credentials
scoped_credentials = credentials.with_scopes(
    ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive'])
# (4) use the constrained credentials for authentication of gspread package
gc = gspread.Client(auth=scoped_credentials)

lagt_metadata_gs = gc.open_by_url(
    "https://docs.google.com/spreadsheets/d/10pGulpiwzjUozVEVstKBwtftyDisSY9h7-kl82TVs0A/edit?usp=sharing")

In [5]:
greek_data_dir = "/srv/data/greek/"
os.listdir(greek_data_dir)

['oga_graphannis',
 'oga_sentences_2025-08',
 'oga_sentences',
 'glaux_sentences',
 'grela_v0-2.duckdb',
 'OGA',
 'glaux',
 'exprecce_sentences_2025-08',
 'exprecce',
 'grela.duckdb.tmp',
 'grela.duckdb',
 'glaux_sentences_2025-08',
 'LAGT',
 'OGA_0-2-0.zip',
 'grela_v0-2.duckdb.wal',
 'opera_graeca_adnotata_v0.2.0',
 'exprecce_sentences']

In [8]:
oga_sentences_dir = greek_data_dir + "oga_sentences_2025-08/"
glaux_sentences_dir = greek_data_dir + "glaux_sentences_2025-08/"
exprecce_sentences_dir = greek_data_dir + "exprecce_sentences_2025-08/"
sentences_data = {
    "oga": {"dir" : oga_sentences_dir},
    "glaux": {"dir" : glaux_sentences_dir},
    "exprecce": {"dir", exprecce_sentences_dir}
}
paths = [oga_sentences_dir, glaux_sentences_dir, exprecce_sentences_dir]

In [9]:
# test
# from 5th file in each source corpus
# print 5th sentence
for path in paths:
    fn = os.listdir(path)[5]
    with open(os.path.join(path, fn), "rb") as f:
        sents_data = pickle.load(f)
    print(sents_data[5])

('tlg0007.tlg074', 5, 'ἥ τε γὰρ σωφροσύνη φρόνησίς τίς ἐστιν ὥς φασι καὶ εὐβουλία, καὶ ἡ δικαιοσύνη τῆς φρονήσεως δεῖται παρούσης·', [('ἥ', 'ὅς', 'DET', {'oga_cts': '2', 'oga_tid': 't_188'}, 0, 1), ('τε', 'τε', 'CCONJ', {'oga_cts': '2', 'oga_tid': 't_189'}, 2, 4), ('γὰρ', 'γάρ', 'ADV', {'oga_cts': '2', 'oga_tid': 't_190'}, 5, 8), ('σωφροσύνη', 'σωφροσύνη', 'NOUN', {'oga_cts': '2', 'oga_tid': 't_191'}, 9, 18), ('φρόνησίς', 'φρόνησις', 'NOUN', {'oga_cts': '2', 'oga_tid': 't_192'}, 19, 27), ('τίς', 'τις', 'PRON', {'oga_cts': '2', 'oga_tid': 't_193'}, 28, 31), ('ἐστιν', 'εἰμί', 'VERB', {'oga_cts': '2', 'oga_tid': 't_194'}, 32, 37), ('ὥς', 'ὡς', 'CCONJ', {'oga_cts': '2', 'oga_tid': 't_195'}, 38, 40), ('φασι', 'φημί', 'VERB', {'oga_cts': '2', 'oga_tid': 't_196'}, 41, 45), ('καὶ', 'καί', 'CCONJ', {'oga_cts': '2', 'oga_tid': 't_197'}, 46, 49), ('εὐβουλία', 'εὐβουλία', 'NOUN', {'oga_cts': '2', 'oga_tid': 't_198'}, 50, 58), (',', ',', 'PUNCT', {'oga_cts': '2', 'oga_tid': 't_199'}, 58, 59), ('καὶ

## LAGT v3.0

In [9]:
LAGT = pd.read_parquet(greek_data_dir + "LAGT/LAGT_v3-0.parquet")

In [10]:
LAGT.head(5)

Unnamed: 0,author_id,doc_id,filename,author,title,string,wordcount,source,lemmatized_sentences,lemmata_source,tlg_date,not_before,not_after,date_uncertain,tlg_epithet,provenience,lemmatacount
2,ogl0001,ogl0001.ogl001,ogl0001.ogl001.1st1K-grc1.xml,Pinytus,De Epistola Pinyti ad Dionysium,"FRAGMENTUM BEATI PINYTI, CNOSSI IN CRETA EPISC...",180,1Kgr,"[[Πινυτός, ἀντιγράφω, θαυμάζω, ἀποδέχω, Διονύσ...",grecy,,101.0,200.0,,[],christian,34
8,tlg0005,tlg0005.tlg003,tlg0005.tlg003.1st1K-grc1.xml,Theocritus,Syrinx,Οὐδενὸς εὐνάτειρα Μακροπτολέμοιο δὲ μάτηρ μαί...,77,1Kgr,"[[οὐδενός, εὐνητήρ], [μακροπτολέμοιο, μήτηρ, μ...",grecy,4-3 B.C.,-400.0,-201.0,False,[Bucolici],pagan,61
9,tlg0006,tlg0006.tlg020,tlg0006.tlg020.1st1K-grc1.xml,Euripides,Fragmenta,ποίαν σε φῶμεν γαῖαν ἐκλελοιπότα πόλει ξενοῦσθ...,17708,1Kgr,"[[φημί, γῆ, ἐκλείπω, πόλις, ξενοῦσθαι], [πάτρα...",grecy,5 B.C.,-500.0,-401.0,False,[Tragici],pagan,10277
10,tlg0007,tlg0007.tlg146,tlg0007.tlg146.1st1K-grc1.xml,Plutarch,Παροιμίαι αἷς Ἀλεξανδρεῖς ἐχρῶντο,Οἴκοι τὰ Μιλήσια: ἐπὶ τῶν ὅποι μὴ προςήκει τὴν...,2685,1Kgr,"[[Μιλήσιος], [προςήκω, τρυφή, ἐπιδείκνυμι], [Ἀ...",grecy,A.D. 1-2,1.0,200.0,False,"[Biographi, Philosophici/-ae]",pagan,1488
11,tlg0007,tlg0007.tlg147,tlg0007.tlg147.1st1K-grc1.xml,Plutarch,Ἐκλογὴ περὶ τῶν ἀδυνάτων,Κατὰ πετρῶν σπείρεις. Πλίνθον πλύνεις. Δικτύῳ ...,143,1Kgr,"[[πέτρα, σπείρω], [Πλίνθος, πλύνω, Δίκτυον, ἄν...",grecy,A.D. 1-2,1.0,200.0,False,"[Biographi, Philosophici/-ae]",pagan,125


In [11]:
LAGT.groupby("source").size()

source
1Kgr       949
perseus    761
dtype: int64

In [12]:
LAGT.groupby("lemmata_source").size()

lemmata_source
agdt         25
glaux       835
gorman        2
grecy       762
lxxmorph     55
morphgnt     27
pedalion      4
dtype: int64

In [13]:
LAGT["tokencount"] = LAGT["string"].apply(lambda x: len(word_tokenize(x)))

In [14]:
LAGT3_metadata = LAGT[['doc_id', 'lemmata_source', 'tokencount']].copy()
for col in LAGT3_metadata.columns:
    if col !="doc_id":
        LAGT3_metadata.rename(columns={col : "lagt3_" + col}, inplace=True)

## LAGT v4.1

In [15]:
LAGT = pd.read_parquet(greek_data_dir + "LAGT/LAGT_v4-1.parquet")

In [16]:
LAGT.columns

Index(['author_id', 'doc_id', 'filename', 'author', 'title', 'sentences',
       'lemmatized_sentences', 'source', 'lemmata_source', 'not_before',
       'not_after', 'tlg_epithet', 'genre', 'provenience', 'wordcount',
       'lemmatacount'],
      dtype='object')

In [17]:
LAGT["string"] = LAGT["sentences"].apply(lambda x: " ".join(x))

In [18]:
LAGT["tokencount"] = LAGT["string"].apply(lambda x: len(word_tokenize(x)))

In [19]:
LAGT.head(5)

Unnamed: 0,author_id,doc_id,filename,author,title,sentences,lemmatized_sentences,source,lemmata_source,not_before,not_after,tlg_epithet,genre,provenience,wordcount,lemmatacount,string,tokencount
0,ogl0001,ogl0001.ogl001,ogl0001.ogl001.1st1K-grc1.xml,Pinytus,De Epistola Pinyti ad Dionysium,"[, , ., . . . . ., — πρός ἥν ( ,, ), ὁ Πινυτός...","[[], [], [], [], [πινυτός, ἀντιγράφω], [θαυμάζ...",glaux1,glaux1,101.0,200.0,[],[],christian,109,34,", , . . . . . . — πρός ἥν ( , ) ὁ Πινυτός ἀντι...",108
1,tlg0005,tlg0005.tlg003,tlg0005.tlg003.1st1K-grc1.xml,Theocritus,Syrinx,[Οὐδενός εὐνάτειρα Μακροπτολέμοιο δέ μάτηρ μαί...,"[[εὐνητήρ, μακροπτολέμον, μήτηρ, μαῖα, ἀντιπέτ...",glaux1,glaux1,-400.0,-201.0,[Bucolici],[],pagan,95,59,Οὐδενός εὐνάτειρα Μακροπτολέμοιο δέ μάτηρ μαία...,94
2,tlg0006,tlg0006.tlg020,tlg0006.tlg020.1st1K-grc1.xml,Euripides,Fragmenta,[ποίαν σε φῶμεν γαῖαν ἐκλελοιπότα πόλει ξενοῦσ...,"[[φημί, γῆ, ἐκλείπω, πόλις, ξενοῦσθαι], [πάτρα...",glaux1,glaux1,-500.0,-401.0,[Tragici],[],pagan,21516,10315,ποίαν σε φῶμεν γαῖαν ἐκλελοιπότα πόλει ξενοῦσθ...,21729
3,tlg0007,tlg0007.tlg146,tlg0007.tlg146.1st1K-grc1.xml,Plutarch,Παροιμίαι αἷς Ἀλεξανδρεῖς ἐχρῶντο,[Οἴκοι τά Μιλήσια: ἐπί τῶν ὅποι μή προςήκει τή...,"[[μιλήσιος, πργοςήκω, τρυφή, ἐπιδείκνυμι], [ἀρ...",glaux1,glaux1,1.0,200.0,"[Biographi, Philosophici/-ae]",[],pagan,3206,1496,Οἴκοι τά Μιλήσια: ἐπί τῶν ὅποι μή προςήκει τήν...,3178
4,tlg0007,tlg0007.tlg147,tlg0007.tlg147.1st1K-grc1.xml,Plutarch,Ἐκλογὴ περὶ τῶν ἀδυνάτων,"[Κατά πετρῶν σπείρεις., Πλίνθον πλύνεις., Δικτ...","[[πέτρα, σπείρω], [πλίνθος, πλύνω], [δίκτυον, ...",glaux1,glaux1,1.0,200.0,"[Biographi, Philosophici/-ae]",[],pagan,195,125,Κατά πετρῶν σπείρεις. Πλίνθον πλύνεις. Δικτύῳ ...,194


In [21]:
LAGT41_metadata = LAGT[['doc_id', 'author', 'title', 'not_before',
       'not_after', 'tlg_epithet', 'source', 'genre', 'provenience', "tokencount"]].copy()
for col in LAGT41_metadata.columns:
    if col !="doc_id":
        LAGT41_metadata.rename(columns={col : "lagt4-1_" + col}, inplace=True)
LAGT41_metadata.head(5)

Unnamed: 0,doc_id,lagt4-1_author,lagt4-1_title,lagt4-1_not_before,lagt4-1_not_after,lagt4-1_tlg_epithet,lagt4-1_source,lagt4-1_genre,lagt4-1_provenience,lagt4-1_tokencount
0,ogl0001.ogl001,Pinytus,De Epistola Pinyti ad Dionysium,101.0,200.0,[],glaux1,[],christian,108
1,tlg0005.tlg003,Theocritus,Syrinx,-400.0,-201.0,[Bucolici],glaux1,[],pagan,94
2,tlg0006.tlg020,Euripides,Fragmenta,-500.0,-401.0,[Tragici],glaux1,[],pagan,21729
3,tlg0007.tlg146,Plutarch,Παροιμίαι αἷς Ἀλεξανδρεῖς ἐχρῶντο,1.0,200.0,"[Biographi, Philosophici/-ae]",glaux1,[],pagan,3178
4,tlg0007.tlg147,Plutarch,Ἐκλογὴ περὶ τῶν ἀδυνάτων,1.0,200.0,"[Biographi, Philosophici/-ae]",glaux1,[],pagan,194


## Glaux metadata

In [22]:
glaux_metadata = pd.read_csv("/srv/data/greek/glaux/metadata.txt", sep="\t")
glaux_metadata.head(5)

Unnamed: 0,GLAUX_TEXT_ID,TLG,STARTDATE,ENDDATE,AUTHOR_STANDARD,TITLE_STANDARD,GENRE_STANDARD,DIALECT,SOURCE,SOURCE_LICENSE,SOURCE_FORMAT,TOKENS,TM_TEXT
0,1,0012-001,-800,-701,Homerus,Ilias,Epic poetry,Ionic/Epic,Perseus,CC BY-SA 4.0,XML,129604,511
1,2,0012-002,-800,-701,Homerus,Odyssea,Epic poetry,Ionic/Epic,Perseus,CC BY-SA 4.0,XML,104364,512
2,3,0012-003,-800,-701,Homerus,Epigrammata,Lyric poetry,Ionic/Epic,Perseus,CC BY-SA 4.0,XML,26,12612
3,4,1351-001,-800,-701,Epigoni,Epigoni,Epic poetry,Ionic/Epic,https://sententiaeantiquae.com,,TXT,60,13805 / 15768
4,5,1547-001,-800,-701,Oedipodea,Oedipodea,Epic poetry,Ionic/Epic,https://sententiaeantiquae.com,,TXT,15,12913


In [23]:
glaux_tlg = "0012-001"
groups = re.search(r"(\d{4})\-(\d{3})", glaux_tlg).groups()

In [24]:
def tlg_parsing(glaux_tlg):
    groups = re.search(r"(\d{4})\-(\d{3})", glaux_tlg).groups()
    tlg_id = "tlg"+ groups[0] + ".tlg" + groups[1]
    return tlg_id
glaux_metadata["tlg_id"] = glaux_metadata["TLG"].apply(lambda x: tlg_parsing(x))

In [25]:
for col in glaux_metadata.columns:
    glaux_metadata.rename(columns={col : "glaux_" + col}, inplace=True)
glaux_metadata.head(5)

Unnamed: 0,glaux_GLAUX_TEXT_ID,glaux_TLG,glaux_STARTDATE,glaux_ENDDATE,glaux_AUTHOR_STANDARD,glaux_TITLE_STANDARD,glaux_GENRE_STANDARD,glaux_DIALECT,glaux_SOURCE,glaux_SOURCE_LICENSE,glaux_SOURCE_FORMAT,glaux_TOKENS,glaux_TM_TEXT,glaux_tlg_id
0,1,0012-001,-800,-701,Homerus,Ilias,Epic poetry,Ionic/Epic,Perseus,CC BY-SA 4.0,XML,129604,511,tlg0012.tlg001
1,2,0012-002,-800,-701,Homerus,Odyssea,Epic poetry,Ionic/Epic,Perseus,CC BY-SA 4.0,XML,104364,512,tlg0012.tlg002
2,3,0012-003,-800,-701,Homerus,Epigrammata,Lyric poetry,Ionic/Epic,Perseus,CC BY-SA 4.0,XML,26,12612,tlg0012.tlg003
3,4,1351-001,-800,-701,Epigoni,Epigoni,Epic poetry,Ionic/Epic,https://sententiaeantiquae.com,,TXT,60,13805 / 15768,tlg1351.tlg001
4,5,1547-001,-800,-701,Oedipodea,Oedipodea,Epic poetry,Ionic/Epic,https://sententiaeantiquae.com,,TXT,15,12913,tlg1547.tlg001


In [26]:
glaux_metadata["doc_id"] = glaux_metadata["glaux_tlg_id"]

## OGA metadata

In [27]:
# load the following file with beautiful soup
filepath = "/srv/data/greek/opera_graeca_adnotata_v0.2.0/work_chronology/texts/chronology_greek_works_plus_date_label.xml"
with open(filepath, 'r', encoding='utf-8') as file:
    soup = BeautifulSoup(file, 'xml')

In [28]:
# Extract records
records = []
for record in soup.find_all("record"):
    record_data = {}
    for field in record.find_all():
        record_data[field.name] = field.text.strip()  # Use tag name as key, text content as value
    records.append(record_data)

# Convert to DataFrame
oga_metadata = pd.DataFrame(records)

In [29]:
oga_metadata.head(5)

Unnamed: 0,id,urn_cts,title_labels,title_from_print_edition,print_edition,author,estimated_work_date,is_temporary_work_date,date_source,date_source_link,comment_on_chronology,formatted_work_date,date_label
0,1,ogl0001.ogl001,De Epistola Pinyti ad Dionysium,De Epistola Pinyti ad Dionysium,"Pinytus, Saint, Bishop of Knossos. Reliquiae S...","Pinytus, Saint, Bishop of Knossos",2nd Century AD (101-200),No,"Kritiko Agiologio, Holy Archibisophry of Crete",http://www.iak.gr/gr/ekklisia-kritis/kritiko_a...,Church of Crete Saints' Days Catalogue. The da...,+0101-01/+0200-12,p2_2
1,2,stoa0033a.tlg028,De mundo,De mundo,"pseudo-Aristotle, De mundo, Aristotelis Opera,...",Pseudo-Aristotle,250 BC-50BC,No,Pseudo-Aristotle: De Mundo (On the Cosmos). Ca...,https://www.cambridge.org/core/books/pseudoari...,Written between the middle of the 3rd and the ...,—0249-01/—0049-12,m3_2/m2_1/m2_2/m1_1
2,3,stoa0033a.tlg043,De spiritu,De spiritu,"pseudo-Aristotle, De spiritu, Aristotelis Oper...",Pseudo-Aristotle,275 BC-250 BC,No,Pseud-Aristotelian De Spiritu: A New Case agai...,,Follows Jaeger's proposed chronology.,—0274-01/—0249-12,m3_1
3,4,stoa0121.stoa001,Breviarium historiae romanae,Breviarium historiae romanae,Eutropius. Breviarium historiae romanae. Droys...,Eutropius,364 AD-378 AD,No,"Eutropius, Livius.org, 2020",https://www.livius.org/articles/person/eutropius/,Proposes an exact date of 369 AD.,+0364-01/+0378-12,p4_2
4,5,stoa0146d.stoa001,Acta Archelai,Acta Archelai,"Hegemonius. Acta Archelai. Beeson, Charles Hen...",Hegemonius,280 AD - 350 AD,No,"Archelaos, Wikisource",https://de.wikisource.org/wiki/RE:Archelaos_40,Based on the fact that he was bishop around 28...,+0280-01/+0350-12,p4_1


In [30]:
len(oga_metadata)

1911

In [31]:
oga_metadata["formatted_work_date"].tolist()[:10]

['+0101-01/+0200-12',
 '—0249-01/—0049-12',
 '—0274-01/—0249-12',
 '+0364-01/+0378-12',
 '+0280-01/+0350-12',
 '—0299-01/—0200-12',
 '—0430-01/—0410-12',
 '+0222-01/+0235-12',
 '—0299-01/—0259-12',
 '—0299-01/—0259-12']

In [32]:
def parse_dates(formatted_date):
    # Split the string into not_before and not_after parts
    not_before, not_after = formatted_date.split("/")

    # Convert not_before and not_after consistently
    not_before = not_before.replace("+", "").replace("—", "-")
    if not_before[0]=="-":
        not_before = int(not_before[:5])
    else:
        not_before = int(not_before[:4])

    # Replace em dash and +, handle full string
    not_after = not_after.replace("+", "").replace("—", "-")
    if not_after[0]=="-":
        not_after = int(not_after[:5])
    else:
        not_after = int(not_after[:4])# Replace em dash and +, handle full string
    return not_before, not_after



In [33]:
parse_dates("—0299-01/—0200-12")

(-299, -200)

In [34]:
oga_metadata["not_before"], oga_metadata["not_after"] = zip(*oga_metadata["formatted_work_date"].apply(parse_dates))

In [35]:
oga_metadata.head(5)

Unnamed: 0,id,urn_cts,title_labels,title_from_print_edition,print_edition,author,estimated_work_date,is_temporary_work_date,date_source,date_source_link,comment_on_chronology,formatted_work_date,date_label,not_before,not_after
0,1,ogl0001.ogl001,De Epistola Pinyti ad Dionysium,De Epistola Pinyti ad Dionysium,"Pinytus, Saint, Bishop of Knossos. Reliquiae S...","Pinytus, Saint, Bishop of Knossos",2nd Century AD (101-200),No,"Kritiko Agiologio, Holy Archibisophry of Crete",http://www.iak.gr/gr/ekklisia-kritis/kritiko_a...,Church of Crete Saints' Days Catalogue. The da...,+0101-01/+0200-12,p2_2,101,200
1,2,stoa0033a.tlg028,De mundo,De mundo,"pseudo-Aristotle, De mundo, Aristotelis Opera,...",Pseudo-Aristotle,250 BC-50BC,No,Pseudo-Aristotle: De Mundo (On the Cosmos). Ca...,https://www.cambridge.org/core/books/pseudoari...,Written between the middle of the 3rd and the ...,—0249-01/—0049-12,m3_2/m2_1/m2_2/m1_1,-249,-49
2,3,stoa0033a.tlg043,De spiritu,De spiritu,"pseudo-Aristotle, De spiritu, Aristotelis Oper...",Pseudo-Aristotle,275 BC-250 BC,No,Pseud-Aristotelian De Spiritu: A New Case agai...,,Follows Jaeger's proposed chronology.,—0274-01/—0249-12,m3_1,-274,-249
3,4,stoa0121.stoa001,Breviarium historiae romanae,Breviarium historiae romanae,Eutropius. Breviarium historiae romanae. Droys...,Eutropius,364 AD-378 AD,No,"Eutropius, Livius.org, 2020",https://www.livius.org/articles/person/eutropius/,Proposes an exact date of 369 AD.,+0364-01/+0378-12,p4_2,364,378
4,5,stoa0146d.stoa001,Acta Archelai,Acta Archelai,"Hegemonius. Acta Archelai. Beeson, Charles Hen...",Hegemonius,280 AD - 350 AD,No,"Archelaos, Wikisource",https://de.wikisource.org/wiki/RE:Archelaos_40,Based on the fact that he was bishop around 28...,+0280-01/+0350-12,p4_1,280,350


In [36]:
for col in oga_metadata.columns:
    oga_metadata.rename(columns={col : "oga_" + col}, inplace=True)
oga_metadata.head(5)

Unnamed: 0,oga_id,oga_urn_cts,oga_title_labels,oga_title_from_print_edition,oga_print_edition,oga_author,oga_estimated_work_date,oga_is_temporary_work_date,oga_date_source,oga_date_source_link,oga_comment_on_chronology,oga_formatted_work_date,oga_date_label,oga_not_before,oga_not_after
0,1,ogl0001.ogl001,De Epistola Pinyti ad Dionysium,De Epistola Pinyti ad Dionysium,"Pinytus, Saint, Bishop of Knossos. Reliquiae S...","Pinytus, Saint, Bishop of Knossos",2nd Century AD (101-200),No,"Kritiko Agiologio, Holy Archibisophry of Crete",http://www.iak.gr/gr/ekklisia-kritis/kritiko_a...,Church of Crete Saints' Days Catalogue. The da...,+0101-01/+0200-12,p2_2,101,200
1,2,stoa0033a.tlg028,De mundo,De mundo,"pseudo-Aristotle, De mundo, Aristotelis Opera,...",Pseudo-Aristotle,250 BC-50BC,No,Pseudo-Aristotle: De Mundo (On the Cosmos). Ca...,https://www.cambridge.org/core/books/pseudoari...,Written between the middle of the 3rd and the ...,—0249-01/—0049-12,m3_2/m2_1/m2_2/m1_1,-249,-49
2,3,stoa0033a.tlg043,De spiritu,De spiritu,"pseudo-Aristotle, De spiritu, Aristotelis Oper...",Pseudo-Aristotle,275 BC-250 BC,No,Pseud-Aristotelian De Spiritu: A New Case agai...,,Follows Jaeger's proposed chronology.,—0274-01/—0249-12,m3_1,-274,-249
3,4,stoa0121.stoa001,Breviarium historiae romanae,Breviarium historiae romanae,Eutropius. Breviarium historiae romanae. Droys...,Eutropius,364 AD-378 AD,No,"Eutropius, Livius.org, 2020",https://www.livius.org/articles/person/eutropius/,Proposes an exact date of 369 AD.,+0364-01/+0378-12,p4_2,364,378
4,5,stoa0146d.stoa001,Acta Archelai,Acta Archelai,"Hegemonius. Acta Archelai. Beeson, Charles Hen...",Hegemonius,280 AD - 350 AD,No,"Archelaos, Wikisource",https://de.wikisource.org/wiki/RE:Archelaos_40,Based on the fact that he was bishop around 28...,+0280-01/+0350-12,p4_1,280,350


In [37]:
oga_metadata["doc_id"] = oga_metadata["oga_urn_cts"]

In [38]:
sentences_path = "/srv/data/greek/oga_sentences/"
os.listdir(sentences_path)[:10]

['tlg1264.tlg001.pickle',
 'tlg0007.tlg121.pickle',
 'pta0100.pta008.pickle',
 'tlg0527.tlg020.pickle',
 'tlg0540.tlg019.pickle',
 'tlg0026.tlg004.pickle',
 'tlg0018.tlg020.pickle',
 'tlg0540.tlg015.pickle',
 'tlg2042.tlg086.pickle',
 'pta0001.pta005.pickle']

In [39]:
doc_id = oga_metadata["doc_id"][0]
with open(os.path.join(sentences_path, doc_id + ".pickle"), "rb") as f:
    sentences_data = pickle.load(f)

In [40]:
sum([len(sent[3]) for sent in sentences_data])

109

In [41]:
def count_oga_tokens(doc_id):
    try:
        with open(os.path.join(sentences_path, doc_id + ".pickle"), "rb") as f:
            sentences_data = pickle.load(f)
        return sum([len(sent[3]) for sent in sentences_data])
    except:
        return None

In [42]:
oga_metadata["oga_tokencount"] = oga_metadata["doc_id"].apply(count_oga_tokens)

In [43]:
oga_metadata.columns

Index(['oga_id', 'oga_urn_cts', 'oga_title_labels',
       'oga_title_from_print_edition', 'oga_print_edition', 'oga_author',
       'oga_estimated_work_date', 'oga_is_temporary_work_date',
       'oga_date_source', 'oga_date_source_link', 'oga_comment_on_chronology',
       'oga_formatted_work_date', 'oga_date_label', 'oga_not_before',
       'oga_not_after', 'doc_id', 'oga_tokencount'],
      dtype='object')

In [44]:
oga_metadata = oga_metadata[['oga_id', 'oga_title_labels', 'oga_title_from_print_edition', 'oga_print_edition', 'oga_author', 'oga_date_source', 'oga_date_source_link', 'oga_comment_on_chronology', 'oga_formatted_work_date', 'oga_not_before', 'oga_not_after', 'doc_id', 'oga_tokencount']]

## Merge metadata

In [45]:
from functools import reduce

In [48]:
#LAGT3_metadata['LAGT3?'] = True
LAGT41_metadata['LAGT4-1?'] = True
glaux_metadata['GLAUX?'] = True
oga_metadata['OGA?'] = True
dataframes = [LAGT41_metadata, glaux_metadata, oga_metadata]

In [49]:
gr_metadata_merged = reduce(lambda left, right: pd.merge(left, right, on="doc_id", how="outer"), dataframes)

In [50]:
presence_columns = ["LAGT4-1?", "GLAUX?", "OGA?"]
gr_metadata_merged[presence_columns] = gr_metadata_merged[presence_columns].fillna(False).astype(bool)

  gr_metadata_merged[presence_columns] = gr_metadata_merged[presence_columns].fillna(False).astype(bool)


In [51]:
gr_metadata_merged.columns

Index(['doc_id', 'lagt4-1_author', 'lagt4-1_title', 'lagt4-1_not_before',
       'lagt4-1_not_after', 'lagt4-1_tlg_epithet', 'lagt4-1_source',
       'lagt4-1_genre', 'lagt4-1_provenience', 'lagt4-1_tokencount',
       'LAGT4-1?', 'glaux_GLAUX_TEXT_ID', 'glaux_TLG', 'glaux_STARTDATE',
       'glaux_ENDDATE', 'glaux_AUTHOR_STANDARD', 'glaux_TITLE_STANDARD',
       'glaux_GENRE_STANDARD', 'glaux_DIALECT', 'glaux_SOURCE',
       'glaux_SOURCE_LICENSE', 'glaux_SOURCE_FORMAT', 'glaux_TOKENS',
       'glaux_TM_TEXT', 'glaux_tlg_id', 'GLAUX?', 'oga_id', 'oga_title_labels',
       'oga_title_from_print_edition', 'oga_print_edition', 'oga_author',
       'oga_date_source', 'oga_date_source_link', 'oga_comment_on_chronology',
       'oga_formatted_work_date', 'oga_not_before', 'oga_not_after',
       'oga_tokencount', 'OGA?'],
      dtype='object')

In [52]:
gr_metadata_merged = gr_metadata_merged[[
    'doc_id',
    'LAGT3?',
    'LAGT4-1?',
    'GLAUX?',
    'OGA?',
    'lagt4-1_author',
    'glaux_AUTHOR_STANDARD',
    'oga_author',
    'lagt4-1_title',
    'glaux_TITLE_STANDARD',
    'oga_title_labels',
    'oga_title_from_print_edition',
    'lagt4-1_not_before',
    'lagt4-1_not_after',
    'glaux_STARTDATE',
    'glaux_ENDDATE',
    'oga_not_before',
    'oga_not_after',
    'oga_date_source',
    'oga_date_source_link',
    'oga_comment_on_chronology',
    'oga_formatted_work_date',
    'lagt3_tokencount',
    'lagt4-1_tokencount',
    'glaux_TOKENS',
    'oga_tokencount',
    'lagt4-1_source',
    'lagt4-1_tlg_epithet',
    'lagt4-1_genre',
    'lagt4-1_provenience',
    'glaux_GENRE_STANDARD',
    'glaux_DIALECT',
    'glaux_GLAUX_TEXT_ID',
    'glaux_TLG',
    'glaux_SOURCE',
    'glaux_SOURCE_FORMAT',
    'glaux_TM_TEXT',
    'oga_id',
    'oga_print_edition',
]]

KeyError: "['LAGT3?', 'lagt3_tokencount'] not in index"

In [53]:
gr_metadata_merged.sample(10)

Unnamed: 0,doc_id,lagt4-1_author,lagt4-1_title,lagt4-1_not_before,lagt4-1_not_after,lagt4-1_tlg_epithet,lagt4-1_source,lagt4-1_genre,lagt4-1_provenience,lagt4-1_tokencount,LAGT4-1?,glaux_GLAUX_TEXT_ID,glaux_TLG,glaux_STARTDATE,glaux_ENDDATE,glaux_AUTHOR_STANDARD,glaux_TITLE_STANDARD,glaux_GENRE_STANDARD,glaux_DIALECT,glaux_SOURCE,glaux_SOURCE_LICENSE,glaux_SOURCE_FORMAT,glaux_TOKENS,glaux_TM_TEXT,glaux_tlg_id,GLAUX?,oga_id,oga_title_labels,oga_title_from_print_edition,oga_print_edition,oga_author,oga_date_source,oga_date_source_link,oga_comment_on_chronology,oga_formatted_work_date,oga_not_before,oga_not_after,oga_tokencount,OGA?
1445,tlg0649.tlg003,Lesbonax (Rhetorician),Προτρεπτικὸς Β̄,101.0,200.0,[Rhetorici],glaux1,[],pagan,941.0,True,,,,,,,,,,,,,,,False,760.0,Protrepticus B,Προτρεπτικὸς Β̄,Lesbonax. Lesbonactis sophistae quae supersunt...,Lesbonax (Rhetorician),"Huitink, L. (2021). Chapter 21 Early Orators. ...",https://doi.org/10.1163/9789004498815_023,Safe date range.,+0101-01/+0200-12,101.0,200.0,951.0,True
283,tlg0010.tlg002,Isocrates,Against Callimachus,-500.0,-301.0,[Oratores],glaux1,[],pagan,4267.0,True,295.0,0010-002,-500.0,-301.0,Isocrates,In Callimachum,Oratory,Attic,Perseus,CC BY-SA 4.0,XML,4063.0,11170,tlg0010.tlg002,True,165.0,Against Callimachus,Against Callimachus,Isocrates. Isocrates with an English Translati...,Isocrates,"(2000). Isocrates I. New York, USA: University...",https://doi.org/10.7560/752375,From the speech itself this date looks more li...,—0401-01/—0401-12,-401.0,-401.0,4106.0,True
1246,tlg0551.tlg009,Appian,Λιβυκή,1.0,200.0,[Historici/-ae],glaux1,[],pagan,29561.0,True,1022.0,0551-009,1.0,200.0,Appianus,Libyca,History,,Perseus,CC BY-SA 4.0,XML,28924.0,,tlg0551.tlg009,True,670.0,Punic Wars,Λιβυκή,"Appianus. Appiani Historia romana, Volume 1. M...",Appianus of Alexandria,"Britannica, T. Editors of Encyclopaedia (2024,...",https://www.britannica.com/biography/Appian-of...,This date is based on the period of the flouri...,+0101-01/+0200-12,101.0,200.0,29082.0,True
1008,tlg0284.tlg002,Aelius Aristides,Ἀθηνᾶ,101.0,200.0,[Rhetorici],glaux1,[],pagan,2618.0,True,1236.0,0284-002,101.0,200.0,Aelius Aristides,Ἀθηνᾶ,Oratory,Attic/Koine,Perseus,CC BY-SA 4.0,XML,2534.0,11387,tlg0284.tlg002,True,1802.0,Orationes 2,Ἀθηνᾶ,"Aristides. Vol. 1. Dindorf, Wilhelm, editor. L...","Aristides, Aelius",,,temporary date of the author life,+0117-01/+0187-12,117.0,187.0,2590.0,True
841,tlg0085.tlg006,Aeschylus,Χοηφóρoι,-600.0,-401.0,[Tragici],glaux1,[],pagan,7131.0,True,123.0,0085-006,-600.0,-401.0,Aeschylus,Choephoroe,Tragedy,Attic,Perseus,CC BY-SA 4.0,XML,6589.0,5492,tlg0085.tlg006,True,512.0,Libation Bearers,Χοηφóρoι,"Aeschyli Tragoediae. Sidgwick, Arthur, editor....",Aeschylus,"Peter Burian, Jacques Bromberg, A Companion to...",https://onlinelibrary.wiley.com/doi/epub/10.10...,Secure date according to Burian.,—0457-01/—0457-12,-457.0,-457.0,6773.0,True
1733,tlg2005.tlg001,,Martyrium Pionii presbyteri et sodalium,301.0,400.0,,exprecce,[Hagiogr.],christian,4827.0,True,,,,,,,,,,,,,,,False,,,,,,,,,,,,,False
209,tlg0007.tlg079,Plutarch,Τῶν ἑπτὰ σοφῶν συμπόσιον,1.0,200.0,"[Biographi, Philosophici/-ae]",glaux1,[],pagan,10960.0,True,1134.0,0007-079,1.0,200.0,Plutarchus,Septem sapientium convivium,Philosophic Dialogue,Attic/Koine,https://el.wikisource.org,CC-BY-SA 3.0,TXT,10614.0,569,tlg0007.tlg079,True,109.0,Septem sapientium convivium,The Dinner of the Seven Wise Men,"Plutarch. Moralia, Vol. II. Babbitt, Frank Col...",Plutarch,"C.P. Jones, Towards a chronology of Plutarch's...",https://www.cambridge.org/core/journals/journa...,Approximate for most philosophical works accor...,+0096-01/+0120-12,96.0,120.0,9900.0,True
2081,tlg4034.tlg003,,,,,,,,,,False,,,,,,,,,,,,,,,False,1563.0,In Libros De Partibus Animalium Commentaria,In Libros De Partibus Animalium Commentaria,Michaelis Ephesii in libros de partibus animal...,Michael of Ephesus,"Gottschalk, H. (. (2006). Aristotle, commentat...",https://doi.org/10.1163/1574-9347_bnp_e136730,,+1101-01/+1200-12,1101.0,1200.0,47587.0,True
1275,tlg0555.tlg008,Clement of Alexandria,ὁ Προτρεπτικὸς εἰς ὑπομονὴν ἢ πρὸς τοὺς νεωστὶ...,101.0,300.0,[Theologici],glaux1,[],christian,776.0,True,1709.0,0555-008,101.0,300.0,Clemens Alexandrinus,Fragmenta,Theology,Koine,Perseus,CC BY-SA 4.0,XML,777.0,2026 / 16027,tlg0555.tlg008,True,1876.0,Exhortation to Endurance or To the Newly Bapti...,ὁ Προτρεπτικὸς εἰς ὑπομονὴν ἢ πρὸς τοὺς νεωστὶ...,"Clement of Alexandria. Butterworth, G.W., edit...",Clement of Alexandria,,,temporary date of the author life,+0180-01/+0221-12,180.0,221.0,791.0,True
1918,tlg2200.tlg00438,Libanius,Oratio 38,301.0,400.0,"[Rhetorici, Sophistae]",1Kgr,[],pagan,2732.0,True,,,,,,,,,,,,,,,False,1417.0,Oratio 38,Oratio 38,"Libanius, Oratio 38, Libanii Opera Vol 3, Orat...",Libanius,"Cribiore, R. (2013). Libanius the Sophist: Rhe...",https://www.degruyter.com/document/doi/10.7591...,,+0350-01/+0393-12,350.0,393.0,2360.0,True


# Deduplicating

In [54]:
# Extract only the necessary columns
subset_df = gr_metadata_merged[["doc_id", "glaux_TLG"]]

# Group by 'doc_id' and aggregate 'glaux_TLG' values into a list
doc_glaux_tlgs_dict = gr_metadata_merged.groupby("doc_id")["glaux_TLG"].apply(list).to_dict()
doc_glaux_tokens_dict = gr_metadata_merged.groupby("doc_id")["glaux_TOKENS"].apply(np.sum).to_dict()

In [55]:
doc_glaux_tlgs_dict["tlg0007.tlg051"] # both texts together

['0007-051a', '0007-051b']

In [56]:
doc_glaux_tokens_dict["tlg0007.tlg051"] # sums of tokens from all components

15147.0

In [57]:
gr_metadata_merged["glaux_tlg_ids"] = gr_metadata_merged["doc_id"].apply(lambda x: doc_glaux_tlgs_dict[x])
gr_metadata_merged["glaux_tokens_sum"] = gr_metadata_merged["doc_id"].apply(lambda x: doc_glaux_tokens_dict[x])

In [58]:
# Get duplicate rows based on the "doc_id" column
duplicates = gr_metadata_merged[
    gr_metadata_merged.duplicated(subset=["doc_id"], keep=False)]
# View the duplicated rows
duplicates

Unnamed: 0,doc_id,lagt4-1_author,lagt4-1_title,lagt4-1_not_before,lagt4-1_not_after,lagt4-1_tlg_epithet,lagt4-1_source,lagt4-1_genre,lagt4-1_provenience,lagt4-1_tokencount,LAGT4-1?,glaux_GLAUX_TEXT_ID,glaux_TLG,glaux_STARTDATE,glaux_ENDDATE,glaux_AUTHOR_STANDARD,glaux_TITLE_STANDARD,glaux_GENRE_STANDARD,glaux_DIALECT,glaux_SOURCE,glaux_SOURCE_LICENSE,glaux_SOURCE_FORMAT,glaux_TOKENS,glaux_TM_TEXT,glaux_tlg_id,GLAUX?,oga_id,oga_title_labels,oga_title_from_print_edition,oga_print_edition,oga_author,oga_date_source,oga_date_source_link,oga_comment_on_chronology,oga_formatted_work_date,oga_not_before,oga_not_after,oga_tokencount,OGA?,glaux_tlg_ids,glaux_tokens_sum
179,tlg0007.tlg051,Plutarch,Agis and Cleomenes,1.0,200.0,"[Biographi, Philosophici/-ae]",glaux1,[],pagan,15317.0,True,1104.0,0007-051a,1.0,200.0,Plutarchus,Agis et Cleomenes,Biography,Attic/Koine,Perseus,CC BY-SA 4.0,XML,5068.0,,tlg0007.tlg051,True,81.0,Agis and Cleomenes,Agis and Cleomenes,"Plutarch. Plutarch's Lives, Vol. X. Perrin, Be...",Plutarch,"C.P. Jones, Towards a chronology of Plutarch's...",https://www.cambridge.org/core/journals/journa...,Jones believes that the mention of Quintus Sos...,+0096-01/+0116-12,96.0,116.0,15116.0,True,"[0007-051a, 0007-051b]",15147.0
180,tlg0007.tlg051,Plutarch,Agis and Cleomenes,1.0,200.0,"[Biographi, Philosophici/-ae]",glaux1,[],pagan,15317.0,True,1105.0,0007-051b,1.0,200.0,Plutarchus,Agis et Cleomenes,Biography,Attic/Koine,Perseus,CC BY-SA 4.0,XML,10079.0,,tlg0007.tlg051,True,81.0,Agis and Cleomenes,Agis and Cleomenes,"Plutarch. Plutarch's Lives, Vol. X. Perrin, Be...",Plutarch,"C.P. Jones, Towards a chronology of Plutarch's...",https://www.cambridge.org/core/journals/journa...,Jones believes that the mention of Quintus Sos...,+0096-01/+0116-12,96.0,116.0,15116.0,True,"[0007-051a, 0007-051b]",15147.0
181,tlg0007.tlg052,Plutarch,Tiberius and Caius Gracchus,1.0,200.0,"[Biographi, Philosophici/-ae]",glaux1,[],pagan,10398.0,True,1106.0,0007-052a,1.0,200.0,Plutarchus,Tiberius et Gaius Gracchus,Biography,Attic/Koine,Perseus,CC BY-SA 4.0,XML,5629.0,,tlg0007.tlg052,True,82.0,Tiberius and Caius Gracchus,Tiberius and Caius Gracchus,"Plutarch. Plutarch's Lives, Vol. X. Perrin, Be...",Plutarch,"C.P. Jones, Towards a chronology of Plutarch's...",https://www.cambridge.org/core/journals/journa...,Jones believes that the mention of Quintus Sos...,+0096-01/+0116-12,96.0,116.0,10312.0,True,"[0007-052a, 0007-052b]",10289.0
182,tlg0007.tlg052,Plutarch,Tiberius and Caius Gracchus,1.0,200.0,"[Biographi, Philosophici/-ae]",glaux1,[],pagan,10398.0,True,1107.0,0007-052b,1.0,200.0,Plutarchus,Tiberius et Gaius Gracchus,Biography,Attic/Koine,Perseus,CC BY-SA 4.0,XML,4660.0,,tlg0007.tlg052,True,82.0,Tiberius and Caius Gracchus,Tiberius and Caius Gracchus,"Plutarch. Plutarch's Lives, Vol. X. Perrin, Be...",Plutarch,"C.P. Jones, Towards a chronology of Plutarch's...",https://www.cambridge.org/core/journals/journa...,Jones believes that the mention of Quintus Sos...,+0096-01/+0116-12,96.0,116.0,10312.0,True,"[0007-052a, 0007-052b]",10289.0
212,tlg0007.tlg082,Plutarch,Ἀποφθέγματα Λακωνικά,1.0,200.0,"[Biographi, Philosophici/-ae]",glaux1,[],pagan,3261.0,True,1137.0,0007-082,1.0,200.0,Plutarchus,Apophthegmata Laconica,Philosophy,Attic/Koine,Perseus,CC BY-SA 4.0,XML,16577.0,570 / 6523 / 6524,tlg0007.tlg082,True,112.0,Apophthegmata Laconica,Sayings of Spartans,"Plutarch. Moralia, Vol. III. Babbitt, Frank Co...",Plutarch,"C.P. Jones, Towards a chronology of Plutarch's...",https://www.cambridge.org/core/journals/journa...,Approximate for most philosophical works accor...,+0096-01/+0120-12,96.0,120.0,16123.0,True,"[0007-082, 0007-082a, 0007-082b]",19795.0
213,tlg0007.tlg082,Plutarch,Ἀποφθέγματα Λακωνικά,1.0,200.0,"[Biographi, Philosophici/-ae]",glaux1,[],pagan,3261.0,True,1138.0,0007-082a,1.0,200.0,Plutarchus,Apophthegmata Laconica,Philosophy,Attic/Koine,Perseus,CC BY-SA 4.0,XML,1843.0,,tlg0007.tlg082,True,112.0,Apophthegmata Laconica,Sayings of Spartans,"Plutarch. Moralia, Vol. III. Babbitt, Frank Co...",Plutarch,"C.P. Jones, Towards a chronology of Plutarch's...",https://www.cambridge.org/core/journals/journa...,Approximate for most philosophical works accor...,+0096-01/+0120-12,96.0,120.0,16123.0,True,"[0007-082, 0007-082a, 0007-082b]",19795.0
214,tlg0007.tlg082,Plutarch,Ἀποφθέγματα Λακωνικά,1.0,200.0,"[Biographi, Philosophici/-ae]",glaux1,[],pagan,3261.0,True,1139.0,0007-082b,1.0,200.0,Plutarchus,Apophthegmata Laconica,Philosophy,Attic/Koine,Perseus,CC BY-SA 4.0,XML,1375.0,,tlg0007.tlg082,True,112.0,Apophthegmata Laconica,Sayings of Spartans,"Plutarch. Moralia, Vol. III. Babbitt, Frank Co...",Plutarch,"C.P. Jones, Towards a chronology of Plutarch's...",https://www.cambridge.org/core/journals/journa...,Approximate for most philosophical works accor...,+0096-01/+0120-12,96.0,120.0,16123.0,True,"[0007-082, 0007-082a, 0007-082b]",19795.0
218,tlg0007.tlg084,Plutarchus,Aetia Romana et Graeca,1.0,200.0,,glaux1,[Polyhist.],pagan,22743.0,True,1141.0,0007-084a,1.0,200.0,Plutarchus,Aetia Romana et Graeca,Polyhistory,Attic/Koine,https://el.wikisource.org,CC-BY-SA 3.0,TXT,15310.0,,tlg0007.tlg084,True,,,,,,,,,,,,,False,"[0007-084a, 0007-084b]",22114.0
219,tlg0007.tlg084,Plutarchus,Aetia Romana et Graeca,1.0,200.0,,glaux1,[Polyhist.],pagan,22743.0,True,1142.0,0007-084b,1.0,200.0,Plutarchus,Aetia Romana et Graeca,Polyhistory,Attic/Koine,https://el.wikisource.org,CC-BY-SA 3.0,TXT,6804.0,,tlg0007.tlg084,True,,,,,,,,,,,,,False,"[0007-084a, 0007-084b]",22114.0
940,tlg0093.tlg010,Theophrastus Eresius,Fragmenta,-400.0,-201.0,,glaux1,"[Nat. Hist. , Phil.]",pagan,10907.0,True,650.0,0093-010a,-400.0,-201.0,Theophrastus,Fragmenta,Biology,Attic/Koine,First1K,CC BY-SA 4.0,XML,6153.0,,tlg0093.tlg010,True,,,,,,,,,,,,,False,"[0093-010a, 0093-010b]",10468.0


In [59]:
# Deduplicate based on "doc_id" and keep the first occurrence
gr_metadata_merged = gr_metadata_merged.drop_duplicates(subset=["doc_id"], keep="first")

In [60]:
sum(gr_metadata_merged["GLAUX?"])

1408

In [61]:
sum(gr_metadata_merged["OGA?"])

1911

In [62]:
# in both GLAUX and OGA
sum((gr_metadata_merged["GLAUX?"] & gr_metadata_merged["OGA?"]))

1183

In [63]:
# only in GLAUX
sum((gr_metadata_merged["GLAUX?"] & ~gr_metadata_merged["OGA?"]))

225

In [64]:
# only in OGA
sum((~gr_metadata_merged["GLAUX?"] & gr_metadata_merged["OGA?"]))

728

In [68]:
sum((gr_metadata_merged["LAGT4-1?"] & ~gr_metadata_merged["OGA?"] & ~gr_metadata_merged["GLAUX?"]))

24

In [None]:
def load_from_sentences(row):
    doc_id = row["doc_id"]
    if row["OGA?"]:
        filepath = os.path.join(path, doc_id+".pickle")

    with open(filepath, "r") as f:
        sents_data = pickle.load(f)


In [103]:


gr_metadata_merged[gr_metadata_merged["LAGT4-1?"] & ~gr_metadata_merged["OGA?"] & ~gr_metadata_merged["GLAUX?"]]

Unnamed: 0,doc_id,LAGT3?,LAGT4-1?,GLAUX?,OGA?,lagt4-1_author,glaux_AUTHOR_STANDARD,oga_author,lagt4-1_title,glaux_TITLE_STANDARD,oga_title_labels,oga_title_from_print_edition,lagt4-1_not_before,lagt4-1_not_after,glaux_STARTDATE,glaux_ENDDATE,oga_not_before,oga_not_after,oga_date_source,oga_date_source_link,oga_comment_on_chronology,oga_formatted_work_date,lagt3_tokencount,lagt4-1_tokencount,glaux_TOKENS,oga_tokencount,lagt4-1_tlg_epithet,lagt4-1_genre,glaux_GENRE_STANDARD,glaux_DIALECT,lagt4-1_provenience,glaux_GLAUX_TEXT_ID,glaux_TLG,glaux_SOURCE,glaux_SOURCE_FORMAT,glaux_TM_TEXT,oga_id,oga_print_edition,glaux_tlg_ids,glaux_tokens_sum
1068,tlg0304.tlg001,False,True,False,False,,,,Acta et martyrium Apollonii,,,,101.0,400.0,,,,,,,,,,2123.0,,,,[Hagiogr.],,,christian,,,,,,,,[nan],0.0
1094,tlg0389.tlg001,False,True,False,False,,,,Martyrdom of Peter,,,,101.0,200.0,,,,,,,,,,2834.0,,,,"[Apocryph., Hagiogr.]",,,christian,,,,,,,,[nan],0.0
1095,tlg0390.tlg001,False,True,False,False,,,,"Martyrium sanctorum Carpi, Papyli et Agathonicae",,,,101.0,200.0,,,,,,,,,,1079.0,,,,[Hagiogr.],,,christian,,,,,,,,[nan],0.0
1331,tlg0593.tlg003,False,True,False,False,Gorgias,,,Fragmenta (Fragment 11a),,,,-500.0,-301.0,,,,,,,,,,3062.0,,,,[Test.],,,pagan,,,,,,,,[nan],0.0
1449,tlg0653.tlg003,False,True,False,False,Aratus Soleus,,,Epigrammata,,,,-400.0,-201.0,,,,,,,,,,65.0,,,,[Epigr.],,,pagan,,,,,,,,[nan],0.0
1498,tlg1157.tlg003,False,True,False,False,,,,Apocalypsis Esdrae,,,,1.0,200.0,,,,,,,,,,2992.0,,,,[],,,christian,,,,,,,,[nan],0.0
1569,tlg1352.tlg001,False,True,False,False,,,,The Letter of the Churches of Vienne and Lyons,,,,101.0,200.0,,,,,,,,,,4161.0,,,,[Hagiogr.],,,christian,,,,,,,,[nan],0.0
1698,tlg1804.tlg003,False,True,False,False,Ninus,,,Fragmenta A-B (P. Berol. 6926),,,,-100.0,-1.0,,,,,,,,,,1344.0,,,,[Narr. Fict.],,,pagan,,,,,,,,[nan],0.0
1733,tlg2005.tlg001,False,True,False,False,,,,Martyrium Pionii presbyteri et sodalium,,,,301.0,400.0,,,,,,,,,,4827.0,,,,[Hagiogr.],,,christian,,,,,,,,[nan],0.0
1735,tlg2008.tlg001,False,True,False,False,,,,Martyrium Cononis,,,,401.0,,,,,,,,,,,1053.0,,,,[Hagiogr.],,,christian,,,,,,,,[nan],0.0


In [102]:
# only in OGA
sum((gr_metadata_merged["LAGT4-1?"] & ~gr_metadata_merged["OGA?"] & ~gr_metadata_merged["GLAUX?"]))

24

In [159]:
set_with_dataframe(lagt_metadata_gs.add_worksheet("gr_metadata_merged", 1, 1), gr_metadata_merged)

In [160]:
gr_metadata_merged.to_csv("../data/gr_metadata_merged.csv", index=False)

In [161]:
test = pd.read_csv("https://raw.githubusercontent.com/sdam-au/LAGT/refs/heads/master/data/gr_metadata_merged.csv")

In [162]:
test.head(5)

Unnamed: 0,doc_id,lagt4-1_author,glaux_AUTHOR_STANDARD,oga_author,lagt4-1_title,glaux_TITLE_STANDARD,oga_title_labels,oga_title_from_print_edition,lagt4-1_not_before,lagt4-1_not_after,...,glaux_GENRE_STANDARD,glaux_DIALECT,lagt4-1_provenience,glaux_GLAUX_TEXT_ID,glaux_TLG,glaux_SOURCE,glaux_SOURCE_FORMAT,glaux_TM_TEXT,oga_id,oga_print_edition
0,ggm0001.ggm001,,,Anonymous,,,Anametresis Pontou,Anametresis Pontou,,,...,,,,,,,,,1607.0,"Anonymous. Geographi graeci minores, Volume 1...."
1,ogl0001.ogl001,Pinytus,,"Pinytus, Saint, Bishop of Knossos",De Epistola Pinyti ad Dionysium,,De Epistola Pinyti ad Dionysium,De Epistola Pinyti ad Dionysium,101.0,200.0,...,,,christian,,,,,,1.0,"Pinytus, Saint, Bishop of Knossos. Reliquiae S..."
2,pta0001.pta001,,,Severian of Gabala,,,De fide et lege naturae,Severianus Gabalensis: De fide et lege naturae...,,,...,,,,,,,,,1694.0,"Vatikan, Biblioteca Apostolica Vaticana, graec..."
3,pta0001.pta002,,,Severian of Gabala,,,De paenitentia et compunctione,Severianus Gabalensis: De paenitentia et compu...,,,...,,,,,,,,,1695.0,"Jean-Paul Migne (ed.), Patrologia Graeca. Volu..."
4,pta0001.pta003,,,Severian of Gabala,,,In ascensionem domini nostri Iesu Christi et i...,Severianus Gabalensis: In ascensionem domini n...,,,...,,,,,,,,,1696.0,"Richard W. Bishop/Nathalie Rambault, Severian ..."
