In [80]:
import numpy as np
import pandas as pd
import os
from bs4 import BeautifulSoup
import json
import pickle
import re
from nltk.tokenize import word_tokenize
import gspread
from gspread_dataframe import get_as_dataframe, set_with_dataframe
from google.oauth2 import service_account # based on google-auth library

In [63]:
pd.set_option('display.max_columns', None)

In [3]:

file_data = json.load(open(os.path.expanduser("../../../ServiceAccountsKey.json")))
# (2) transform the content into crendentials object
credentials = service_account.Credentials.from_service_account_info(file_data)
# (3) specify your usage of the credentials
scoped_credentials = credentials.with_scopes(
    ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive'])
# (4) use the constrained credentials for authentication of gspread package
gc = gspread.Client(auth=scoped_credentials)

lagt_metadata_gs = gc.open_by_url(
    "https://docs.google.com/spreadsheets/d/10pGulpiwzjUozVEVstKBwtftyDisSY9h7-kl82TVs0A/edit?usp=sharing")

In [4]:
greek_data_dir = "/srv/data/greek/"
os.listdir(greek_data_dir)

['oga_graphannis',
 'oga_sentences',
 'glaux_sentences',
 'OGA',
 'glaux',
 'exprecce',
 'LAGT',
 'OGA_0-2-0.zip',
 'opera_graeca_adnotata_v0.2.0',
 'exprecce_sentences']

In [7]:
oga_sentences_dir = greek_data_dir + "oga_sentences/"
glaux_sentences_dir = greek_data_dir + "glaux_sentences/"
exprecce_sentences_dir = greek_data_dir + "exprecce_sentences/"
sentences_data = {
    "oga": {"dir" : oga_sentences_dir},
    "glaux": {"dir" : glaux_sentences_dir},
    "exprecce": {"dir", exprecce_sentences_dir}
}
paths = [oga_sentences_dir, glaux_sentences_dir, exprecce_sentences_dir]

In [8]:
for path in paths:
    fn = os.listdir(path)[5]
    with open(os.path.join(path, fn), "rb") as f:
        sents_data = pickle.load(f)
    print(sents_data[5])

('tlg0026.tlg004', 5, 'ταῦτα δὲ ἐπείθοντο κατὰ μῆνιν Ἀπόλλωνος αὐτοῖς συμβεβηκέναι , ταφέντος ἐν τῇ νήσῳ τινὸς τῶν ἐπιodd .', [('ταῦτα', 'οὗτος', 'p', (0, 5), '1_2'), ('δὲ', 'δέ', 'd', (6, 8), '1_2'), ('ἐπείθοντο', 'πείθω', 'v', (9, 18), '1_2'), ('κατὰ', 'κατά', 'r', (19, 23), '1_2'), ('μῆνιν', 'μῆνις', 'n', (24, 29), '1_2'), ('Ἀπόλλωνος', 'ἀπόλλων', 'n', (30, 39), '1_2'), ('αὐτοῖς', 'αὐτός', 'p', (40, 46), '1_2'), ('συμβεβηκέναι', 'συμβαίνω', 'v', (47, 59), '1_2'), (',', ',', 'u', (60, 61), '1_2'), ('ταφέντος', 'θάπτω', 'v', (62, 70), '1_2'), ('ἐν', 'ἐν', 'r', (71, 73), '1_2'), ('τῇ', 'ὁ', 'l', (74, 76), '1_2'), ('νήσῳ', 'νῆσος', 'n', (77, 81), '1_2'), ('τινὸς', 'τις', 'p', (82, 87), '1_2'), ('τῶν', 'ὁ', 'l', (88, 91), '1_2'), ('ἐπιodd', 'ἐπιόψ', 'v', (92, 98), '1_2'), ('.', '.', 'u', (99, 100), '1_2')], '1st1K-grc1')
('tlg0540.tlg015', 5, 'νῦν δὲ τούτων οὐδὲν ἐποίησαν, ἀλλ’ ἐν μὲν τῷ στρατοπέδῳ περιεώρων αὐτὸν ὑπὸ πάντων προπηλακιζόμενον κα- κἀν τοῖς ἱπποτοξόταις ἱππεύοντα, ἐπειδὴ δὲ

## LAGT v3.0

In [9]:
LAGT = pd.read_parquet(greek_data_dir + "LAGT/LAGT_v3-0.parquet")

In [10]:
LAGT.head(5)

Unnamed: 0,author_id,doc_id,filename,author,title,string,wordcount,source,lemmatized_sentences,lemmata_source,tlg_date,not_before,not_after,date_uncertain,tlg_epithet,provenience,lemmatacount
2,ogl0001,ogl0001.ogl001,ogl0001.ogl001.1st1K-grc1.xml,Pinytus,De Epistola Pinyti ad Dionysium,"FRAGMENTUM BEATI PINYTI, CNOSSI IN CRETA EPISC...",180,1Kgr,"[[Πινυτός, ἀντιγράφω, θαυμάζω, ἀποδέχω, Διονύσ...",grecy,,101.0,200.0,,[],christian,34
8,tlg0005,tlg0005.tlg003,tlg0005.tlg003.1st1K-grc1.xml,Theocritus,Syrinx,Οὐδενὸς εὐνάτειρα Μακροπτολέμοιο δὲ μάτηρ μαί...,77,1Kgr,"[[οὐδενός, εὐνητήρ], [μακροπτολέμοιο, μήτηρ, μ...",grecy,4-3 B.C.,-400.0,-201.0,False,[Bucolici],pagan,61
9,tlg0006,tlg0006.tlg020,tlg0006.tlg020.1st1K-grc1.xml,Euripides,Fragmenta,ποίαν σε φῶμεν γαῖαν ἐκλελοιπότα πόλει ξενοῦσθ...,17708,1Kgr,"[[φημί, γῆ, ἐκλείπω, πόλις, ξενοῦσθαι], [πάτρα...",grecy,5 B.C.,-500.0,-401.0,False,[Tragici],pagan,10277
10,tlg0007,tlg0007.tlg146,tlg0007.tlg146.1st1K-grc1.xml,Plutarch,Παροιμίαι αἷς Ἀλεξανδρεῖς ἐχρῶντο,Οἴκοι τὰ Μιλήσια: ἐπὶ τῶν ὅποι μὴ προςήκει τὴν...,2685,1Kgr,"[[Μιλήσιος], [προςήκω, τρυφή, ἐπιδείκνυμι], [Ἀ...",grecy,A.D. 1-2,1.0,200.0,False,"[Biographi, Philosophici/-ae]",pagan,1488
11,tlg0007,tlg0007.tlg147,tlg0007.tlg147.1st1K-grc1.xml,Plutarch,Ἐκλογὴ περὶ τῶν ἀδυνάτων,Κατὰ πετρῶν σπείρεις. Πλίνθον πλύνεις. Δικτύῳ ...,143,1Kgr,"[[πέτρα, σπείρω], [Πλίνθος, πλύνω, Δίκτυον, ἄν...",grecy,A.D. 1-2,1.0,200.0,False,"[Biographi, Philosophici/-ae]",pagan,125


In [11]:
LAGT.groupby("source").size()

source
1Kgr       949
perseus    761
dtype: int64

In [12]:
LAGT.groupby("lemmata_source").size()

lemmata_source
agdt         25
glaux       835
gorman        2
grecy       762
lxxmorph     55
morphgnt     27
pedalion      4
dtype: int64

In [13]:
LAGT["tokencount"] = LAGT["string"].apply(lambda x: len(word_tokenize(x)))

In [14]:
LAGT3_metadata = LAGT[['doc_id', 'lemmata_source', 'tokencount']].copy()
for col in LAGT3_metadata.columns:
    if col !="doc_id":
        LAGT3_metadata.rename(columns={col : "lagt3_" + col}, inplace=True)

## LAGT v4.1

In [16]:
LAGT = pd.read_parquet(greek_data_dir + "LAGT/LAGT_v4-1.parquet")

In [17]:
LAGT.columns

Index(['author_id', 'doc_id', 'filename', 'author', 'title', 'sentences',
       'lemmatized_sentences', 'source', 'lemmata_source', 'not_before',
       'not_after', 'tlg_epithet', 'genre', 'provenience', 'wordcount',
       'lemmatacount'],
      dtype='object')

In [18]:
LAGT["string"] = LAGT["sentences"].apply(lambda x: " ".join(x))

In [19]:
LAGT["tokencount"] = LAGT["string"].apply(lambda x: len(word_tokenize(x)))

In [20]:
LAGT.head(5)

Unnamed: 0,author_id,doc_id,filename,author,title,sentences,lemmatized_sentences,source,lemmata_source,not_before,not_after,tlg_epithet,genre,provenience,wordcount,lemmatacount,string,tokencount
0,ogl0001,ogl0001.ogl001,ogl0001.ogl001.1st1K-grc1.xml,Pinytus,De Epistola Pinyti ad Dionysium,"[, , ., . . . . ., — πρός ἥν ( ,, ), ὁ Πινυτός...","[[], [], [], [], [πινυτός, ἀντιγράφω], [θαυμάζ...",glaux1,glaux1,101.0,200.0,[],[],christian,109,34,", , . . . . . . — πρός ἥν ( , ) ὁ Πινυτός ἀντι...",108
1,tlg0005,tlg0005.tlg003,tlg0005.tlg003.1st1K-grc1.xml,Theocritus,Syrinx,[Οὐδενός εὐνάτειρα Μακροπτολέμοιο δέ μάτηρ μαί...,"[[εὐνητήρ, μακροπτολέμον, μήτηρ, μαῖα, ἀντιπέτ...",glaux1,glaux1,-400.0,-201.0,[Bucolici],[],pagan,95,59,Οὐδενός εὐνάτειρα Μακροπτολέμοιο δέ μάτηρ μαία...,94
2,tlg0006,tlg0006.tlg020,tlg0006.tlg020.1st1K-grc1.xml,Euripides,Fragmenta,[ποίαν σε φῶμεν γαῖαν ἐκλελοιπότα πόλει ξενοῦσ...,"[[φημί, γῆ, ἐκλείπω, πόλις, ξενοῦσθαι], [πάτρα...",glaux1,glaux1,-500.0,-401.0,[Tragici],[],pagan,21516,10315,ποίαν σε φῶμεν γαῖαν ἐκλελοιπότα πόλει ξενοῦσθ...,21729
3,tlg0007,tlg0007.tlg146,tlg0007.tlg146.1st1K-grc1.xml,Plutarch,Παροιμίαι αἷς Ἀλεξανδρεῖς ἐχρῶντο,[Οἴκοι τά Μιλήσια: ἐπί τῶν ὅποι μή προςήκει τή...,"[[μιλήσιος, πργοςήκω, τρυφή, ἐπιδείκνυμι], [ἀρ...",glaux1,glaux1,1.0,200.0,"[Biographi, Philosophici/-ae]",[],pagan,3206,1496,Οἴκοι τά Μιλήσια: ἐπί τῶν ὅποι μή προςήκει τήν...,3178
4,tlg0007,tlg0007.tlg147,tlg0007.tlg147.1st1K-grc1.xml,Plutarch,Ἐκλογὴ περὶ τῶν ἀδυνάτων,"[Κατά πετρῶν σπείρεις., Πλίνθον πλύνεις., Δικτ...","[[πέτρα, σπείρω], [πλίνθος, πλύνω], [δίκτυον, ...",glaux1,glaux1,1.0,200.0,"[Biographi, Philosophici/-ae]",[],pagan,195,125,Κατά πετρῶν σπείρεις. Πλίνθον πλύνεις. Δικτύῳ ...,194


In [104]:
LAGT41_metadata = LAGT[['doc_id', 'author', 'title', 'not_before',
       'not_after', 'tlg_epithet', 'source', 'genre', 'provenience', "tokencount"]].copy()
for col in LAGT41_metadata.columns:
    if col !="doc_id":
        LAGT41_metadata.rename(columns={col : "lagt4-1_" + col}, inplace=True)
LAGT41_metadata.head(5)

Unnamed: 0,doc_id,lagt4-1_author,lagt4-1_title,lagt4-1_not_before,lagt4-1_not_after,lagt4-1_tlg_epithet,lagt4-1_source,lagt4-1_genre,lagt4-1_provenience,lagt4-1_tokencount
0,ogl0001.ogl001,Pinytus,De Epistola Pinyti ad Dionysium,101.0,200.0,[],glaux1,[],christian,108
1,tlg0005.tlg003,Theocritus,Syrinx,-400.0,-201.0,[Bucolici],glaux1,[],pagan,94
2,tlg0006.tlg020,Euripides,Fragmenta,-500.0,-401.0,[Tragici],glaux1,[],pagan,21729
3,tlg0007.tlg146,Plutarch,Παροιμίαι αἷς Ἀλεξανδρεῖς ἐχρῶντο,1.0,200.0,"[Biographi, Philosophici/-ae]",glaux1,[],pagan,3178
4,tlg0007.tlg147,Plutarch,Ἐκλογὴ περὶ τῶν ἀδυνάτων,1.0,200.0,"[Biographi, Philosophici/-ae]",glaux1,[],pagan,194


## Glaux metadata

In [22]:
glaux_metadata = pd.read_csv("/srv/data/greek/glaux/metadata.txt", sep="\t")
glaux_metadata.head(5)

Unnamed: 0,GLAUX_TEXT_ID,TLG,STARTDATE,ENDDATE,AUTHOR_STANDARD,TITLE_STANDARD,GENRE_STANDARD,DIALECT,SOURCE,SOURCE_LICENSE,SOURCE_FORMAT,TOKENS,TM_TEXT
0,1,0012-001,-800,-701,Homerus,Ilias,Epic poetry,Ionic/Epic,Perseus,CC BY-SA 4.0,XML,129604,511
1,2,0012-002,-800,-701,Homerus,Odyssea,Epic poetry,Ionic/Epic,Perseus,CC BY-SA 4.0,XML,104364,512
2,3,0012-003,-800,-701,Homerus,Epigrammata,Lyric poetry,Ionic/Epic,Perseus,CC BY-SA 4.0,XML,26,12612
3,4,1351-001,-800,-701,Epigoni,Epigoni,Epic poetry,Ionic/Epic,https://sententiaeantiquae.com,,TXT,60,13805 / 15768
4,5,1547-001,-800,-701,Oedipodea,Oedipodea,Epic poetry,Ionic/Epic,https://sententiaeantiquae.com,,TXT,15,12913


In [23]:
glaux_tlg = "0012-001"
groups = re.search(r"(\d{4})\-(\d{3})", glaux_tlg).groups()

In [24]:
def tlg_parsing(glaux_tlg):
    groups = re.search(r"(\d{4})\-(\d{3})", glaux_tlg).groups()
    tlg_id = "tlg"+ groups[0] + ".tlg" + groups[1]
    return tlg_id
glaux_metadata["tlg_id"] = glaux_metadata["TLG"].apply(lambda x: tlg_parsing(x))

In [25]:
for col in glaux_metadata.columns:
    glaux_metadata.rename(columns={col : "glaux_" + col}, inplace=True)
glaux_metadata.head(5)

Unnamed: 0,glaux_GLAUX_TEXT_ID,glaux_TLG,glaux_STARTDATE,glaux_ENDDATE,glaux_AUTHOR_STANDARD,glaux_TITLE_STANDARD,glaux_GENRE_STANDARD,glaux_DIALECT,glaux_SOURCE,glaux_SOURCE_LICENSE,glaux_SOURCE_FORMAT,glaux_TOKENS,glaux_TM_TEXT,glaux_tlg_id
0,1,0012-001,-800,-701,Homerus,Ilias,Epic poetry,Ionic/Epic,Perseus,CC BY-SA 4.0,XML,129604,511,tlg0012.tlg001
1,2,0012-002,-800,-701,Homerus,Odyssea,Epic poetry,Ionic/Epic,Perseus,CC BY-SA 4.0,XML,104364,512,tlg0012.tlg002
2,3,0012-003,-800,-701,Homerus,Epigrammata,Lyric poetry,Ionic/Epic,Perseus,CC BY-SA 4.0,XML,26,12612,tlg0012.tlg003
3,4,1351-001,-800,-701,Epigoni,Epigoni,Epic poetry,Ionic/Epic,https://sententiaeantiquae.com,,TXT,60,13805 / 15768,tlg1351.tlg001
4,5,1547-001,-800,-701,Oedipodea,Oedipodea,Epic poetry,Ionic/Epic,https://sententiaeantiquae.com,,TXT,15,12913,tlg1547.tlg001


In [26]:
glaux_metadata["doc_id"] = glaux_metadata["glaux_tlg_id"]

## OGA metadata

In [27]:
# load the following file with beautiful soup
filepath = "/srv/data/greek/opera_graeca_adnotata_v0.2.0/work_chronology/texts/chronology_greek_works_plus_date_label.xml"
with open(filepath, 'r', encoding='utf-8') as file:
    soup = BeautifulSoup(file, 'xml')

In [28]:
# Extract records
records = []
for record in soup.find_all("record"):
    record_data = {}
    for field in record.find_all():
        record_data[field.name] = field.text.strip()  # Use tag name as key, text content as value
    records.append(record_data)

# Convert to DataFrame
oga_metadata = pd.DataFrame(records)

In [29]:
oga_metadata.head(5)

Unnamed: 0,id,urn_cts,title_labels,title_from_print_edition,print_edition,author,estimated_work_date,is_temporary_work_date,date_source,date_source_link,comment_on_chronology,formatted_work_date,date_label
0,1,ogl0001.ogl001,De Epistola Pinyti ad Dionysium,De Epistola Pinyti ad Dionysium,"Pinytus, Saint, Bishop of Knossos. Reliquiae S...","Pinytus, Saint, Bishop of Knossos",2nd Century AD (101-200),No,"Kritiko Agiologio, Holy Archibisophry of Crete",http://www.iak.gr/gr/ekklisia-kritis/kritiko_a...,Church of Crete Saints' Days Catalogue. The da...,+0101-01/+0200-12,p2_2
1,2,stoa0033a.tlg028,De mundo,De mundo,"pseudo-Aristotle, De mundo, Aristotelis Opera,...",Pseudo-Aristotle,250 BC-50BC,No,Pseudo-Aristotle: De Mundo (On the Cosmos). Ca...,https://www.cambridge.org/core/books/pseudoari...,Written between the middle of the 3rd and the ...,—0249-01/—0049-12,m3_2/m2_1/m2_2/m1_1
2,3,stoa0033a.tlg043,De spiritu,De spiritu,"pseudo-Aristotle, De spiritu, Aristotelis Oper...",Pseudo-Aristotle,275 BC-250 BC,No,Pseud-Aristotelian De Spiritu: A New Case agai...,,Follows Jaeger's proposed chronology.,—0274-01/—0249-12,m3_1
3,4,stoa0121.stoa001,Breviarium historiae romanae,Breviarium historiae romanae,Eutropius. Breviarium historiae romanae. Droys...,Eutropius,364 AD-378 AD,No,"Eutropius, Livius.org, 2020",https://www.livius.org/articles/person/eutropius/,Proposes an exact date of 369 AD.,+0364-01/+0378-12,p4_2
4,5,stoa0146d.stoa001,Acta Archelai,Acta Archelai,"Hegemonius. Acta Archelai. Beeson, Charles Hen...",Hegemonius,280 AD - 350 AD,No,"Archelaos, Wikisource",https://de.wikisource.org/wiki/RE:Archelaos_40,Based on the fact that he was bishop around 28...,+0280-01/+0350-12,p4_1


In [60]:
len(oga_metadata)

1911

In [30]:
oga_metadata["formatted_work_date"].tolist()[:10]

['+0101-01/+0200-12',
 '—0249-01/—0049-12',
 '—0274-01/—0249-12',
 '+0364-01/+0378-12',
 '+0280-01/+0350-12',
 '—0299-01/—0200-12',
 '—0430-01/—0410-12',
 '+0222-01/+0235-12',
 '—0299-01/—0259-12',
 '—0299-01/—0259-12']

In [31]:
def parse_dates(formatted_date):
    # Split the string into not_before and not_after parts
    not_before, not_after = formatted_date.split("/")

    # Convert not_before and not_after consistently
    not_before = not_before.replace("+", "").replace("—", "-")
    if not_before[0]=="-":
        not_before = int(not_before[:5])
    else:
        not_before = int(not_before[:4])

    # Replace em dash and +, handle full string
    not_after = not_after.replace("+", "").replace("—", "-")
    if not_after[0]=="-":
        not_after = int(not_after[:5])
    else:
        not_after = int(not_after[:4])# Replace em dash and +, handle full string
    return not_before, not_after



In [32]:
parse_dates("—0299-01/—0200-12")

(-299, -200)

In [33]:
oga_metadata["not_before"], oga_metadata["not_after"] = zip(*oga_metadata["formatted_work_date"].apply(parse_dates))

In [34]:
oga_metadata.head(5)

Unnamed: 0,id,urn_cts,title_labels,title_from_print_edition,print_edition,author,estimated_work_date,is_temporary_work_date,date_source,date_source_link,comment_on_chronology,formatted_work_date,date_label,not_before,not_after
0,1,ogl0001.ogl001,De Epistola Pinyti ad Dionysium,De Epistola Pinyti ad Dionysium,"Pinytus, Saint, Bishop of Knossos. Reliquiae S...","Pinytus, Saint, Bishop of Knossos",2nd Century AD (101-200),No,"Kritiko Agiologio, Holy Archibisophry of Crete",http://www.iak.gr/gr/ekklisia-kritis/kritiko_a...,Church of Crete Saints' Days Catalogue. The da...,+0101-01/+0200-12,p2_2,101,200
1,2,stoa0033a.tlg028,De mundo,De mundo,"pseudo-Aristotle, De mundo, Aristotelis Opera,...",Pseudo-Aristotle,250 BC-50BC,No,Pseudo-Aristotle: De Mundo (On the Cosmos). Ca...,https://www.cambridge.org/core/books/pseudoari...,Written between the middle of the 3rd and the ...,—0249-01/—0049-12,m3_2/m2_1/m2_2/m1_1,-249,-49
2,3,stoa0033a.tlg043,De spiritu,De spiritu,"pseudo-Aristotle, De spiritu, Aristotelis Oper...",Pseudo-Aristotle,275 BC-250 BC,No,Pseud-Aristotelian De Spiritu: A New Case agai...,,Follows Jaeger's proposed chronology.,—0274-01/—0249-12,m3_1,-274,-249
3,4,stoa0121.stoa001,Breviarium historiae romanae,Breviarium historiae romanae,Eutropius. Breviarium historiae romanae. Droys...,Eutropius,364 AD-378 AD,No,"Eutropius, Livius.org, 2020",https://www.livius.org/articles/person/eutropius/,Proposes an exact date of 369 AD.,+0364-01/+0378-12,p4_2,364,378
4,5,stoa0146d.stoa001,Acta Archelai,Acta Archelai,"Hegemonius. Acta Archelai. Beeson, Charles Hen...",Hegemonius,280 AD - 350 AD,No,"Archelaos, Wikisource",https://de.wikisource.org/wiki/RE:Archelaos_40,Based on the fact that he was bishop around 28...,+0280-01/+0350-12,p4_1,280,350


In [35]:
for col in oga_metadata.columns:
    oga_metadata.rename(columns={col : "oga_" + col}, inplace=True)
oga_metadata.head(5)

Unnamed: 0,oga_id,oga_urn_cts,oga_title_labels,oga_title_from_print_edition,oga_print_edition,oga_author,oga_estimated_work_date,oga_is_temporary_work_date,oga_date_source,oga_date_source_link,oga_comment_on_chronology,oga_formatted_work_date,oga_date_label,oga_not_before,oga_not_after
0,1,ogl0001.ogl001,De Epistola Pinyti ad Dionysium,De Epistola Pinyti ad Dionysium,"Pinytus, Saint, Bishop of Knossos. Reliquiae S...","Pinytus, Saint, Bishop of Knossos",2nd Century AD (101-200),No,"Kritiko Agiologio, Holy Archibisophry of Crete",http://www.iak.gr/gr/ekklisia-kritis/kritiko_a...,Church of Crete Saints' Days Catalogue. The da...,+0101-01/+0200-12,p2_2,101,200
1,2,stoa0033a.tlg028,De mundo,De mundo,"pseudo-Aristotle, De mundo, Aristotelis Opera,...",Pseudo-Aristotle,250 BC-50BC,No,Pseudo-Aristotle: De Mundo (On the Cosmos). Ca...,https://www.cambridge.org/core/books/pseudoari...,Written between the middle of the 3rd and the ...,—0249-01/—0049-12,m3_2/m2_1/m2_2/m1_1,-249,-49
2,3,stoa0033a.tlg043,De spiritu,De spiritu,"pseudo-Aristotle, De spiritu, Aristotelis Oper...",Pseudo-Aristotle,275 BC-250 BC,No,Pseud-Aristotelian De Spiritu: A New Case agai...,,Follows Jaeger's proposed chronology.,—0274-01/—0249-12,m3_1,-274,-249
3,4,stoa0121.stoa001,Breviarium historiae romanae,Breviarium historiae romanae,Eutropius. Breviarium historiae romanae. Droys...,Eutropius,364 AD-378 AD,No,"Eutropius, Livius.org, 2020",https://www.livius.org/articles/person/eutropius/,Proposes an exact date of 369 AD.,+0364-01/+0378-12,p4_2,364,378
4,5,stoa0146d.stoa001,Acta Archelai,Acta Archelai,"Hegemonius. Acta Archelai. Beeson, Charles Hen...",Hegemonius,280 AD - 350 AD,No,"Archelaos, Wikisource",https://de.wikisource.org/wiki/RE:Archelaos_40,Based on the fact that he was bishop around 28...,+0280-01/+0350-12,p4_1,280,350


In [36]:
oga_metadata["doc_id"] = oga_metadata["oga_urn_cts"]

In [37]:
sentences_path = "/srv/data/greek/oga_sentences/"
os.listdir(sentences_path)[:10]

['tlg1264.tlg001.pickle',
 'tlg0007.tlg121.pickle',
 'pta0100.pta008.pickle',
 'tlg0527.tlg020.pickle',
 'tlg0540.tlg019.pickle',
 'tlg0026.tlg004.pickle',
 'tlg0018.tlg020.pickle',
 'tlg0540.tlg015.pickle',
 'tlg2042.tlg086.pickle',
 'pta0001.pta005.pickle']

In [38]:
doc_id = oga_metadata["doc_id"][0]
with open(os.path.join(sentences_path, doc_id + ".pickle"), "rb") as f:
    sentences_data = pickle.load(f)

In [39]:
sum([len(sent[3]) for sent in sentences_data])

109

In [40]:
def count_oga_tokens(doc_id):
    try:
        with open(os.path.join(sentences_path, doc_id + ".pickle"), "rb") as f:
            sentences_data = pickle.load(f)
        return sum([len(sent[3]) for sent in sentences_data])
    except:
        return None

In [41]:
oga_metadata["oga_tokencount"] = oga_metadata["doc_id"].apply(count_oga_tokens)

In [42]:
oga_metadata.columns

Index(['oga_id', 'oga_urn_cts', 'oga_title_labels',
       'oga_title_from_print_edition', 'oga_print_edition', 'oga_author',
       'oga_estimated_work_date', 'oga_is_temporary_work_date',
       'oga_date_source', 'oga_date_source_link', 'oga_comment_on_chronology',
       'oga_formatted_work_date', 'oga_date_label', 'oga_not_before',
       'oga_not_after', 'doc_id', 'oga_tokencount'],
      dtype='object')

In [43]:
oga_metadata = oga_metadata[['oga_id', 'oga_title_labels', 'oga_title_from_print_edition', 'oga_print_edition', 'oga_author', 'oga_date_source', 'oga_date_source_link', 'oga_comment_on_chronology', 'oga_formatted_work_date', 'oga_not_before', 'oga_not_after', 'doc_id', 'oga_tokencount']]

## Merge metadata

In [105]:
from functools import reduce

In [106]:
LAGT3_metadata['LAGT3?'] = True
LAGT41_metadata['LAGT4-1?'] = True
glaux_metadata['GLAUX?'] = True
oga_metadata['OGA?'] = True
dataframes = [LAGT3_metadata, LAGT41_metadata, glaux_metadata, oga_metadata]

In [107]:
gr_metadata_merged = reduce(lambda left, right: pd.merge(left, right, on="doc_id", how="outer"), dataframes)


In [108]:
presence_columns = ["LAGT3?", "LAGT4-1?", "GLAUX?", "OGA?"]
gr_metadata_merged[presence_columns] = gr_metadata_merged[presence_columns].fillna(False).astype(bool)

  gr_metadata_merged[presence_columns] = gr_metadata_merged[presence_columns].fillna(False).astype(bool)


In [109]:
gr_metadata_merged.columns

Index(['doc_id', 'lagt3_lemmata_source', 'lagt3_tokencount', 'LAGT3?',
       'lagt4-1_author', 'lagt4-1_title', 'lagt4-1_not_before',
       'lagt4-1_not_after', 'lagt4-1_tlg_epithet', 'lagt4-1_source',
       'lagt4-1_genre', 'lagt4-1_provenience', 'lagt4-1_tokencount',
       'LAGT4-1?', 'glaux_GLAUX_TEXT_ID', 'glaux_TLG', 'glaux_STARTDATE',
       'glaux_ENDDATE', 'glaux_AUTHOR_STANDARD', 'glaux_TITLE_STANDARD',
       'glaux_GENRE_STANDARD', 'glaux_DIALECT', 'glaux_SOURCE',
       'glaux_SOURCE_LICENSE', 'glaux_SOURCE_FORMAT', 'glaux_TOKENS',
       'glaux_TM_TEXT', 'glaux_tlg_id', 'GLAUX?', 'oga_id', 'oga_title_labels',
       'oga_title_from_print_edition', 'oga_print_edition', 'oga_author',
       'oga_date_source', 'oga_date_source_link', 'oga_comment_on_chronology',
       'oga_formatted_work_date', 'oga_not_before', 'oga_not_after',
       'oga_tokencount', 'OGA?'],
      dtype='object')

In [110]:
gr_metadata_merged = gr_metadata_merged[[
    'doc_id',
    'LAGT3?',
    'LAGT4-1?',
    'GLAUX?',
    'OGA?',
    'lagt4-1_author',
    'glaux_AUTHOR_STANDARD',
    'oga_author',
    'lagt4-1_title',
    'glaux_TITLE_STANDARD',
    'oga_title_labels',
    'oga_title_from_print_edition',
    'lagt4-1_not_before',
    'lagt4-1_not_after',
    'glaux_STARTDATE',
    'glaux_ENDDATE',
    'oga_not_before',
    'oga_not_after',
    'oga_date_source',
    'oga_date_source_link',
    'oga_comment_on_chronology',
    'oga_formatted_work_date',
    'lagt3_tokencount',
    'lagt4-1_tokencount',
    'glaux_TOKENS',
    'oga_tokencount',
    'lagt4-1_source',
    'lagt4-1_tlg_epithet',
    'lagt4-1_genre',
    'lagt4-1_provenience',
    'glaux_GENRE_STANDARD',
    'glaux_DIALECT',
    'glaux_GLAUX_TEXT_ID',
    'glaux_TLG',
    'glaux_SOURCE',
    'glaux_SOURCE_FORMAT',
    'glaux_TM_TEXT',
    'oga_id',
    'oga_print_edition',
]]

In [111]:
gr_metadata_merged.sample(10)

Unnamed: 0,doc_id,LAGT3?,LAGT4-1?,GLAUX?,OGA?,lagt4-1_author,glaux_AUTHOR_STANDARD,oga_author,lagt4-1_title,glaux_TITLE_STANDARD,oga_title_labels,oga_title_from_print_edition,lagt4-1_not_before,lagt4-1_not_after,glaux_STARTDATE,glaux_ENDDATE,oga_not_before,oga_not_after,oga_date_source,oga_date_source_link,oga_comment_on_chronology,oga_formatted_work_date,lagt3_tokencount,lagt4-1_tokencount,glaux_TOKENS,oga_tokencount,lagt4-1_source,lagt4-1_tlg_epithet,lagt4-1_genre,lagt4-1_provenience,glaux_GENRE_STANDARD,glaux_DIALECT,glaux_GLAUX_TEXT_ID,glaux_TLG,glaux_SOURCE,glaux_SOURCE_FORMAT,glaux_TM_TEXT,oga_id,oga_print_edition
29,pta0001.pta028,False,False,False,True,,,Severian of Gabala,,,In theophaniam,Severianus Gabalensis: In theophaniam,,,,,400.0,409.0,,,temporary date of the author life,+0400-01/+0409-12,,,,2331.0,,,,,,,,,,,,1721.0,Daria Coșcodan (Translator)
1881,tlg2063.tlg005,False,True,True,False,Gregorius Thaumaturgus,Gregorius Thaumaturgus,,Epistula canonica,Epistula canonica,,,201.0,300.0,201.0,300.0,,,,,,,,1222.0,1205.0,,glaux1,,[Epist.],christian,Epistolography,,1857.0,2063-005,https://el.orthodoxwiki.org,TXT,15905.0,,
392,tlg0014.tlg037,True,True,True,True,Demosthenes,Demosthenes,Demosthenes,παραγραφὴ πρὸς Πανταίνετον,Contra Pantaenetum,Against Pantaenetus,παραγραφὴ πρὸς Πανταίνετον,-400.0,-301.0,-400.0,-301.0,-346.0,-346.0,"Martin, Gunther (ed.), The Oxford Handbook of ...",https://doi.org/10.1093/oxfordhb/9780198713852...,This speech was most probably composed soon af...,—0346-01/—0346-12,4383.0,4809.0,4479.0,4561.0,glaux1,[Oratores],[],pagan,Oratory,Attic,513.0,0014-037,Perseus,XML,11209.0,249.0,"Demosthenis. Orationes. Vol. II, Part 2. Renni..."
1340,tlg0614.tlg001,True,True,True,True,Babrius,Valerius Babrius,Babrius,Fabulae Aesopeae,Mythiambi Aesopici,Fabulae Aesopeae,Fabulae Aesopeae,101.0,200.0,101.0,200.0,101.0,200.0,"Britannica, T. Editors of Encyclopaedia (2024,...",https://www.britannica.com/biography/Babrius,Babrius was active during the 2nd century AD a...,+0101-01/+0200-12,102879.0,14584.0,13928.0,14394.0,glaux1,[Scriptores Fabularum],[],pagan,Lyric poetry,,1320.0,0614-001,First1K,XML,646.0,722.0,"Babrius. Babrii Fabulae Aesopeae. Schneidewin,..."
1806,tlg2042.tlg007,True,True,True,True,Origenes,Origenes,Origen,Exhortatio ad martyrium,Exhortatio ad martyrium,Exhortatio ad martyrium,Exhortatio ad martyrium,101.0,300.0,101.0,300.0,220.0,254.0,"Baltes, M. (., Lakmann, M. (., & Markschies, C...",https://doi.org/10.1163/1574-9347_bnp_e900669,Origenes wrote these works during his time in ...,+0220-01/+0254-12,14248.0,14836.0,14701.0,14838.0,glaux1,[Theologici],[],christian,Theology,Koine,1730.0,2042-007,Perseus,XML,6450.0,988.0,"Origenes. Origenes Werke, Vol 1, Koetschau, Pa..."
182,tlg0007.tlg052,True,True,True,True,Plutarch,Plutarchus,Plutarch,Tiberius and Caius Gracchus,Tiberius et Gaius Gracchus,Tiberius and Caius Gracchus,Tiberius and Caius Gracchus,1.0,200.0,1.0,200.0,96.0,116.0,"C.P. Jones, Towards a chronology of Plutarch's...",https://www.cambridge.org/core/journals/journa...,Jones believes that the mention of Quintus Sos...,+0096-01/+0116-12,10276.0,10398.0,4660.0,10312.0,glaux1,"[Biographi, Philosophici/-ae]",[],pagan,Biography,Attic/Koine,1107.0,0007-052b,Perseus,XML,,82.0,"Plutarch. Plutarch's Lives, Vol. X. Perrin, Be..."
440,tlg0018.tlg008,True,True,True,True,Philo Judaeus,Philo Judaeus,Philo Judaeus,Quod deus sit immutabilis,Quod deus sit immutabilis,Quod Deus Sit Immutabilis,Quod Deus Sit Immutabilis,-100.0,100.0,-100.0,100.0,10.0,35.0,"IEHOFF, M. R. (2018). Philo of Alexandria: An ...",https://doi.org/10.2307/j.ctt1z27jf9,Niehoff considers that Philo wrote his allegor...,+0010-01/+0035-12,19171.0,10459.0,10252.0,10887.0,glaux1,[Philosophici/-ae],[],jewish,Theology,Koine,881.0,0018-008,https://el.wikisource.org,TXT,902.0,292.0,"Philo Judaeus. Wendland, Paul, editor. Opera q..."
198,tlg0007.tlg068,True,True,True,True,Plutarch,Plutarchus,Plutarch,Πῶς δεῖ τὸν νέον ποιημάτων ἀκούειν.,Quomodo adolescens poetas audire debeat,Quomodo adolescens poetas audire debeat,How the Young Man Should Study Poetry,1.0,200.0,1.0,200.0,96.0,120.0,"C.P. Jones, Towards a chronology of Plutarch's...",https://www.cambridge.org/core/journals/journa...,Approximate for most philosophical works accor...,+0096-01/+0120-12,11789.0,12007.0,11557.0,11076.0,glaux1,"[Biographi, Philosophici/-ae]",[],pagan,Philosophy,Attic/Koine,1123.0,0007-068,https://el.wikisource.org,TXT,5760.0,98.0,"Plutarch. Moralia, Vol. I. Babbitt, Frank Cole..."
961,tlg0131.tlg001,False,True,True,False,Archimelus Epigrammaticus,Archimelus,,Epigramma,Epigramma,,,-300.0,-201.0,-300.0,-201.0,,,,,,,,51.0,48.0,,glaux1,,[Epigr.],pagan,Lyric poetry,,676.0,0131-001,Perseus,XML,7074.0,,
2094,tlg4086.tlg001,True,True,False,True,Agathodaemon,,Agathodaemon,Fragmentum alchemicum,,Fragmentum alchemicum_Alchemical Fragment,Fragmentum alchemicum,1.0,200.0,,,300.0,300.0,"Agathodaemon, Wikipedia",https://en.wikipedia.org/wiki/Agathodaemon_(al...,,+0300-01/+0300-12,17.0,17.0,,17.0,1Kgr,[Alchemistae],[],pagan,,,,,,,,1576.0,"Collection des anciens alchimistes grecs, Vol...."


# Deduplicating

In [112]:
# Extract only the necessary columns
subset_df = gr_metadata_merged[["doc_id", "glaux_TLG"]]

# Group by 'doc_id' and aggregate 'glaux_TLG' values into a list
doc_glaux_tlgs_dict = gr_metadata_merged.groupby("doc_id")["glaux_TLG"].apply(list).to_dict()
doc_glaux_tokens_dict = gr_metadata_merged.groupby("doc_id")["glaux_TOKENS"].apply(np.sum).to_dict()

In [113]:
doc_glaux_tlgs_dict["tlg0007.tlg051"] # both texts together

['0007-051a', '0007-051b']

In [114]:
doc_glaux_tokens_dict["tlg0007.tlg051"] # sums of tokens from all components

15147.0

In [85]:
gr_metadata_merged["glaux_tlg_ids"] = gr_metadata_merged["doc_id"].apply(lambda x: doc_glaux_tlgs_dict[x])
gr_metadata_merged["glaux_tokens_sum"] = gr_metadata_merged["doc_id"].apply(lambda x: doc_glaux_tokens_dict[x])

In [87]:
# Get duplicate rows based on the "doc_id" column
duplicates = gr_metadata_merged[
    gr_metadata_merged.duplicated(subset=["doc_id"], keep=False)]
# View the duplicated rows
duplicates

Unnamed: 0,doc_id,LAGT3?,LAGT4-1?,GLAUX?,OGA?,lagt4-1_author,glaux_AUTHOR_STANDARD,oga_author,lagt4-1_title,glaux_TITLE_STANDARD,oga_title_labels,oga_title_from_print_edition,lagt4-1_not_before,lagt4-1_not_after,glaux_STARTDATE,glaux_ENDDATE,oga_not_before,oga_not_after,oga_date_source,oga_date_source_link,oga_comment_on_chronology,oga_formatted_work_date,lagt3_tokencount,lagt4-1_tokencount,glaux_TOKENS,oga_tokencount,lagt4-1_tlg_epithet,lagt4-1_genre,glaux_GENRE_STANDARD,glaux_DIALECT,lagt4-1_provenience,glaux_GLAUX_TEXT_ID,glaux_TLG,glaux_SOURCE,glaux_SOURCE_FORMAT,glaux_TM_TEXT,oga_id,oga_print_edition,glaux_tlg_ids,glaux_tokens_sum
179,tlg0007.tlg051,True,True,True,True,Plutarch,Plutarchus,Plutarch,Agis and Cleomenes,Agis et Cleomenes,Agis and Cleomenes,Agis and Cleomenes,1.0,200.0,1.0,200.0,96.0,116.0,"C.P. Jones, Towards a chronology of Plutarch's...",https://www.cambridge.org/core/journals/journa...,Jones believes that the mention of Quintus Sos...,+0096-01/+0116-12,15066.0,15317.0,5068.0,15116.0,"[Biographi, Philosophici/-ae]",[],Biography,Attic/Koine,pagan,1104.0,0007-051a,Perseus,XML,,81.0,"Plutarch. Plutarch's Lives, Vol. X. Perrin, Be...","[0007-051a, 0007-051b]",15147.0
180,tlg0007.tlg051,True,True,True,True,Plutarch,Plutarchus,Plutarch,Agis and Cleomenes,Agis et Cleomenes,Agis and Cleomenes,Agis and Cleomenes,1.0,200.0,1.0,200.0,96.0,116.0,"C.P. Jones, Towards a chronology of Plutarch's...",https://www.cambridge.org/core/journals/journa...,Jones believes that the mention of Quintus Sos...,+0096-01/+0116-12,15066.0,15317.0,10079.0,15116.0,"[Biographi, Philosophici/-ae]",[],Biography,Attic/Koine,pagan,1105.0,0007-051b,Perseus,XML,,81.0,"Plutarch. Plutarch's Lives, Vol. X. Perrin, Be...","[0007-051a, 0007-051b]",15147.0
181,tlg0007.tlg052,True,True,True,True,Plutarch,Plutarchus,Plutarch,Tiberius and Caius Gracchus,Tiberius et Gaius Gracchus,Tiberius and Caius Gracchus,Tiberius and Caius Gracchus,1.0,200.0,1.0,200.0,96.0,116.0,"C.P. Jones, Towards a chronology of Plutarch's...",https://www.cambridge.org/core/journals/journa...,Jones believes that the mention of Quintus Sos...,+0096-01/+0116-12,10276.0,10398.0,5629.0,10312.0,"[Biographi, Philosophici/-ae]",[],Biography,Attic/Koine,pagan,1106.0,0007-052a,Perseus,XML,,82.0,"Plutarch. Plutarch's Lives, Vol. X. Perrin, Be...","[0007-052a, 0007-052b]",10289.0
182,tlg0007.tlg052,True,True,True,True,Plutarch,Plutarchus,Plutarch,Tiberius and Caius Gracchus,Tiberius et Gaius Gracchus,Tiberius and Caius Gracchus,Tiberius and Caius Gracchus,1.0,200.0,1.0,200.0,96.0,116.0,"C.P. Jones, Towards a chronology of Plutarch's...",https://www.cambridge.org/core/journals/journa...,Jones believes that the mention of Quintus Sos...,+0096-01/+0116-12,10276.0,10398.0,4660.0,10312.0,"[Biographi, Philosophici/-ae]",[],Biography,Attic/Koine,pagan,1107.0,0007-052b,Perseus,XML,,82.0,"Plutarch. Plutarch's Lives, Vol. X. Perrin, Be...","[0007-052a, 0007-052b]",10289.0
212,tlg0007.tlg082,True,True,True,True,Plutarch,Plutarchus,Plutarch,Ἀποφθέγματα Λακωνικά,Apophthegmata Laconica,Apophthegmata Laconica,Sayings of Spartans,1.0,200.0,1.0,200.0,96.0,120.0,"C.P. Jones, Towards a chronology of Plutarch's...",https://www.cambridge.org/core/journals/journa...,Approximate for most philosophical works accor...,+0096-01/+0120-12,15784.0,3261.0,16577.0,16123.0,"[Biographi, Philosophici/-ae]",[],Philosophy,Attic/Koine,pagan,1137.0,0007-082,Perseus,XML,570 / 6523 / 6524,112.0,"Plutarch. Moralia, Vol. III. Babbitt, Frank Co...","[0007-082, 0007-082a, 0007-082b]",19795.0
213,tlg0007.tlg082,True,True,True,True,Plutarch,Plutarchus,Plutarch,Ἀποφθέγματα Λακωνικά,Apophthegmata Laconica,Apophthegmata Laconica,Sayings of Spartans,1.0,200.0,1.0,200.0,96.0,120.0,"C.P. Jones, Towards a chronology of Plutarch's...",https://www.cambridge.org/core/journals/journa...,Approximate for most philosophical works accor...,+0096-01/+0120-12,15784.0,3261.0,1843.0,16123.0,"[Biographi, Philosophici/-ae]",[],Philosophy,Attic/Koine,pagan,1138.0,0007-082a,Perseus,XML,,112.0,"Plutarch. Moralia, Vol. III. Babbitt, Frank Co...","[0007-082, 0007-082a, 0007-082b]",19795.0
214,tlg0007.tlg082,True,True,True,True,Plutarch,Plutarchus,Plutarch,Ἀποφθέγματα Λακωνικά,Apophthegmata Laconica,Apophthegmata Laconica,Sayings of Spartans,1.0,200.0,1.0,200.0,96.0,120.0,"C.P. Jones, Towards a chronology of Plutarch's...",https://www.cambridge.org/core/journals/journa...,Approximate for most philosophical works accor...,+0096-01/+0120-12,15784.0,3261.0,1375.0,16123.0,"[Biographi, Philosophici/-ae]",[],Philosophy,Attic/Koine,pagan,1139.0,0007-082b,Perseus,XML,,112.0,"Plutarch. Moralia, Vol. III. Babbitt, Frank Co...","[0007-082, 0007-082a, 0007-082b]",19795.0
218,tlg0007.tlg084,False,True,True,False,Plutarchus,Plutarchus,,Aetia Romana et Graeca,Aetia Romana et Graeca,,,1.0,200.0,1.0,200.0,,,,,,,,22743.0,15310.0,,,[Polyhist.],Polyhistory,Attic/Koine,pagan,1141.0,0007-084a,https://el.wikisource.org,TXT,,,,"[0007-084a, 0007-084b]",22114.0
219,tlg0007.tlg084,False,True,True,False,Plutarchus,Plutarchus,,Aetia Romana et Graeca,Aetia Romana et Graeca,,,1.0,200.0,1.0,200.0,,,,,,,,22743.0,6804.0,,,[Polyhist.],Polyhistory,Attic/Koine,pagan,1142.0,0007-084b,https://el.wikisource.org,TXT,,,,"[0007-084a, 0007-084b]",22114.0
940,tlg0093.tlg010,False,True,True,False,Theophrastus Eresius,Theophrastus,,Fragmenta,Fragmenta,,,-400.0,-201.0,-400.0,-201.0,,,,,,,,10907.0,6153.0,,,"[Nat. Hist. , Phil.]",Biology,Attic/Koine,pagan,650.0,0093-010a,First1K,XML,,,,"[0093-010a, 0093-010b]",10468.0


In [88]:
# Deduplicate based on "doc_id" and keep the first occurrence
gr_metadata_merged = gr_metadata_merged.drop_duplicates(subset=["doc_id"], keep="first")

In [96]:
sum(gr_metadata_merged["GLAUX?"])

1408

In [97]:
sum(gr_metadata_merged["OGA?"])

1911

In [98]:
# in both GLAUX and OGA
sum((gr_metadata_merged["GLAUX?"] & gr_metadata_merged["OGA?"]))

1183

In [99]:
# only in GLAUX
sum((gr_metadata_merged["GLAUX?"] & ~gr_metadata_merged["OGA?"]))

225

In [100]:
# only in OGA
sum((~gr_metadata_merged["GLAUX?"] & gr_metadata_merged["OGA?"]))

728

In [103]:

gr_metadata_merged[gr_metadata_merged["LAGT4-1?"] & ~gr_metadata_merged["OGA?"] & ~gr_metadata_merged["GLAUX?"]]

Unnamed: 0,doc_id,LAGT3?,LAGT4-1?,GLAUX?,OGA?,lagt4-1_author,glaux_AUTHOR_STANDARD,oga_author,lagt4-1_title,glaux_TITLE_STANDARD,oga_title_labels,oga_title_from_print_edition,lagt4-1_not_before,lagt4-1_not_after,glaux_STARTDATE,glaux_ENDDATE,oga_not_before,oga_not_after,oga_date_source,oga_date_source_link,oga_comment_on_chronology,oga_formatted_work_date,lagt3_tokencount,lagt4-1_tokencount,glaux_TOKENS,oga_tokencount,lagt4-1_tlg_epithet,lagt4-1_genre,glaux_GENRE_STANDARD,glaux_DIALECT,lagt4-1_provenience,glaux_GLAUX_TEXT_ID,glaux_TLG,glaux_SOURCE,glaux_SOURCE_FORMAT,glaux_TM_TEXT,oga_id,oga_print_edition,glaux_tlg_ids,glaux_tokens_sum
1068,tlg0304.tlg001,False,True,False,False,,,,Acta et martyrium Apollonii,,,,101.0,400.0,,,,,,,,,,2123.0,,,,[Hagiogr.],,,christian,,,,,,,,[nan],0.0
1094,tlg0389.tlg001,False,True,False,False,,,,Martyrdom of Peter,,,,101.0,200.0,,,,,,,,,,2834.0,,,,"[Apocryph., Hagiogr.]",,,christian,,,,,,,,[nan],0.0
1095,tlg0390.tlg001,False,True,False,False,,,,"Martyrium sanctorum Carpi, Papyli et Agathonicae",,,,101.0,200.0,,,,,,,,,,1079.0,,,,[Hagiogr.],,,christian,,,,,,,,[nan],0.0
1331,tlg0593.tlg003,False,True,False,False,Gorgias,,,Fragmenta (Fragment 11a),,,,-500.0,-301.0,,,,,,,,,,3062.0,,,,[Test.],,,pagan,,,,,,,,[nan],0.0
1449,tlg0653.tlg003,False,True,False,False,Aratus Soleus,,,Epigrammata,,,,-400.0,-201.0,,,,,,,,,,65.0,,,,[Epigr.],,,pagan,,,,,,,,[nan],0.0
1498,tlg1157.tlg003,False,True,False,False,,,,Apocalypsis Esdrae,,,,1.0,200.0,,,,,,,,,,2992.0,,,,[],,,christian,,,,,,,,[nan],0.0
1569,tlg1352.tlg001,False,True,False,False,,,,The Letter of the Churches of Vienne and Lyons,,,,101.0,200.0,,,,,,,,,,4161.0,,,,[Hagiogr.],,,christian,,,,,,,,[nan],0.0
1698,tlg1804.tlg003,False,True,False,False,Ninus,,,Fragmenta A-B (P. Berol. 6926),,,,-100.0,-1.0,,,,,,,,,,1344.0,,,,[Narr. Fict.],,,pagan,,,,,,,,[nan],0.0
1733,tlg2005.tlg001,False,True,False,False,,,,Martyrium Pionii presbyteri et sodalium,,,,301.0,400.0,,,,,,,,,,4827.0,,,,[Hagiogr.],,,christian,,,,,,,,[nan],0.0
1735,tlg2008.tlg001,False,True,False,False,,,,Martyrium Cononis,,,,401.0,,,,,,,,,,,1053.0,,,,[Hagiogr.],,,christian,,,,,,,,[nan],0.0


In [102]:
# only in OGA
sum((gr_metadata_merged["LAGT4-1?"] & ~gr_metadata_merged["OGA?"] & ~gr_metadata_merged["GLAUX?"]))

24

In [159]:
set_with_dataframe(lagt_metadata_gs.add_worksheet("gr_metadata_merged", 1, 1), gr_metadata_merged)

In [160]:
gr_metadata_merged.to_csv("../data/gr_metadata_merged.csv", index=False)

In [161]:
test = pd.read_csv("https://raw.githubusercontent.com/sdam-au/LAGT/refs/heads/master/data/gr_metadata_merged.csv")

In [162]:
test.head(5)

Unnamed: 0,doc_id,lagt4-1_author,glaux_AUTHOR_STANDARD,oga_author,lagt4-1_title,glaux_TITLE_STANDARD,oga_title_labels,oga_title_from_print_edition,lagt4-1_not_before,lagt4-1_not_after,...,glaux_GENRE_STANDARD,glaux_DIALECT,lagt4-1_provenience,glaux_GLAUX_TEXT_ID,glaux_TLG,glaux_SOURCE,glaux_SOURCE_FORMAT,glaux_TM_TEXT,oga_id,oga_print_edition
0,ggm0001.ggm001,,,Anonymous,,,Anametresis Pontou,Anametresis Pontou,,,...,,,,,,,,,1607.0,"Anonymous. Geographi graeci minores, Volume 1...."
1,ogl0001.ogl001,Pinytus,,"Pinytus, Saint, Bishop of Knossos",De Epistola Pinyti ad Dionysium,,De Epistola Pinyti ad Dionysium,De Epistola Pinyti ad Dionysium,101.0,200.0,...,,,christian,,,,,,1.0,"Pinytus, Saint, Bishop of Knossos. Reliquiae S..."
2,pta0001.pta001,,,Severian of Gabala,,,De fide et lege naturae,Severianus Gabalensis: De fide et lege naturae...,,,...,,,,,,,,,1694.0,"Vatikan, Biblioteca Apostolica Vaticana, graec..."
3,pta0001.pta002,,,Severian of Gabala,,,De paenitentia et compunctione,Severianus Gabalensis: De paenitentia et compu...,,,...,,,,,,,,,1695.0,"Jean-Paul Migne (ed.), Patrologia Graeca. Volu..."
4,pta0001.pta003,,,Severian of Gabala,,,In ascensionem domini nostri Iesu Christi et i...,Severianus Gabalensis: In ascensionem domini n...,,,...,,,,,,,,,1696.0,"Richard W. Bishop/Nathalie Rambault, Severian ..."
