In [128]:
import pandas as pd
import os
from bs4 import BeautifulSoup
import json
import pickle
import re
from nltk.tokenize import word_tokenize
import gspread
from gspread_dataframe import get_as_dataframe, set_with_dataframe
from google.oauth2 import service_account # based on google-auth library

In [34]:

# (1) load the service-account key file; the context manager closes the handle
with open(os.path.expanduser("../../../ServiceAccountsKey.json")) as key_file:
    file_data = json.load(key_file)
# (2) transform the content into a credentials object
credentials = service_account.Credentials.from_service_account_info(file_data)
# (3) specify your usage of the credentials
scoped_credentials = credentials.with_scopes(
    ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive'])
# (4) use the constrained credentials for authentication of gspread package
gc = gspread.Client(auth=scoped_credentials)

# open (by URL) the spreadsheet that the merged metadata is uploaded to at the end of the notebook
lagt_metadata_gs = gc.open_by_url(
    "https://docs.google.com/spreadsheets/d/10pGulpiwzjUozVEVstKBwtftyDisSY9h7-kl82TVs0A/edit?usp=sharing")

## LAGT v3.0

In [79]:
# LAGT v3.0 is read straight from its Zenodo record.
lagt_v3_url = "https://zenodo.org/records/10684841/files/LAGT_v3-0.parquet?download=1"
LAGT = pd.read_parquet(lagt_v3_url)

In [81]:
LAGT.head(5)

Unnamed: 0,author_id,doc_id,filename,author,title,string,wordcount,source,lemmatized_sentences,lemmata_source,tlg_date,not_before,not_after,date_uncertain,tlg_epithet,provenience,lemmatacount
2,ogl0001,ogl0001.ogl001,ogl0001.ogl001.1st1K-grc1.xml,Pinytus,De Epistola Pinyti ad Dionysium,"FRAGMENTUM BEATI PINYTI, CNOSSI IN CRETA EPISC...",180,1Kgr,"[[Πινυτός, ἀντιγράφω, θαυμάζω, ἀποδέχω, Διονύσ...",grecy,,101.0,200.0,,[],christian,34
8,tlg0005,tlg0005.tlg003,tlg0005.tlg003.1st1K-grc1.xml,Theocritus,Syrinx,Οὐδενὸς εὐνάτειρα Μακροπτολέμοιο δὲ μάτηρ μαί...,77,1Kgr,"[[οὐδενός, εὐνητήρ], [μακροπτολέμοιο, μήτηρ, μ...",grecy,4-3 B.C.,-400.0,-201.0,False,[Bucolici],pagan,61
9,tlg0006,tlg0006.tlg020,tlg0006.tlg020.1st1K-grc1.xml,Euripides,Fragmenta,ποίαν σε φῶμεν γαῖαν ἐκλελοιπότα πόλει ξενοῦσθ...,17708,1Kgr,"[[φημί, γῆ, ἐκλείπω, πόλις, ξενοῦσθαι], [πάτρα...",grecy,5 B.C.,-500.0,-401.0,False,[Tragici],pagan,10277
10,tlg0007,tlg0007.tlg146,tlg0007.tlg146.1st1K-grc1.xml,Plutarch,Παροιμίαι αἷς Ἀλεξανδρεῖς ἐχρῶντο,Οἴκοι τὰ Μιλήσια: ἐπὶ τῶν ὅποι μὴ προςήκει τὴν...,2685,1Kgr,"[[Μιλήσιος], [προςήκω, τρυφή, ἐπιδείκνυμι], [Ἀ...",grecy,A.D. 1-2,1.0,200.0,False,"[Biographi, Philosophici/-ae]",pagan,1488
11,tlg0007,tlg0007.tlg147,tlg0007.tlg147.1st1K-grc1.xml,Plutarch,Ἐκλογὴ περὶ τῶν ἀδυνάτων,Κατὰ πετρῶν σπείρεις. Πλίνθον πλύνεις. Δικτύῳ ...,143,1Kgr,"[[πέτρα, σπείρω], [Πλίνθος, πλύνω, Δίκτυον, ἄν...",grecy,A.D. 1-2,1.0,200.0,False,"[Biographi, Philosophici/-ae]",pagan,125


In [87]:
LAGT.groupby("source").size()

source
1Kgr       949
perseus    761
dtype: int64

In [88]:
LAGT.groupby("lemmata_source").size()

lemmata_source
agdt         25
glaux       835
gorman        2
grecy       762
lxxmorph     55
morphgnt     27
pedalion      4
dtype: int64

In [82]:
# Token count per document: tokenize the raw text with NLTK and keep only the length.
LAGT["tokencount"] = [len(word_tokenize(text)) for text in LAGT["string"]]

In [90]:
# Keep the v3.0 lemmata source and token count per document. Every column except
# the `doc_id` key gets a "lagt3_" prefix so its origin is visible in the name.
# A single rename with a callable replaces the column-by-column in-place loop.
LAGT3_metadata = (
    LAGT[['doc_id', 'lemmata_source', 'tokencount']]
    .rename(columns=lambda col: col if col == "doc_id" else "lagt3_" + col)
)

## LAGT v4.1

In [91]:
# NOTE: this rebinds `LAGT`; the v3.0 frame loaded earlier is no longer reachable under that name.
lagt_v41_path = "../data/large_files/LAGT_v4-1.parquet"
LAGT = pd.read_parquet(lagt_v41_path)

In [92]:
LAGT.columns

Index(['author_id', 'doc_id', 'filename', 'author', 'title', 'sentences',
       'lemmatized_sentences', 'source', 'lemmata_source', 'not_before',
       'not_after', 'tlg_epithet', 'genre', 'provenience', 'wordcount',
       'lemmatacount'],
      dtype='object')

In [93]:
# Rebuild each document's running text by joining its sentences with single spaces.
LAGT["string"] = LAGT["sentences"].apply(" ".join)

In [94]:
LAGT["string"].tolist()[:3]

[', , . . . . . . — πρός ἥν ( , ) ὁ Πινυτός ἀντιγράφων, θαυμάζει μέν καί ἀποδέχεται τόν Διονύσιον· ἀντιπαρακαλεῖ δέ στεῤῥοτέρας ἤδη ποτέ μεταδιδόναι τροφῆς, τελειοτέροις γράμμασιν εἰσαῦθις τόν παῤ αὐτῷ λαόν ὑποθρέψαντα, ὡς μή διατέλους τοῖς γαλακτώδεσιν ἐνδιατρίβοντες λόγοις, τῇ νηπιώδει ἀγωγῇ λάθοιεν καταγηράσαντες. δι᾿ ἧς ἐπιστολῆς καί ἡ τοῦ Πινυτοῦ περί τήν πίστιν ὀρθοδοξία τε καί φροντίς τῆς τῶν ὑπηκόων ὡφελείας, τό, τε λόγιον καί ἡ περί τά θεῖα σύνεσις , ὡς δι᾿ ἀκριβεστάτης ἀναδείκνυται εἰκόνος. , : , . : , . . , , , . . . , , .',
 'Οὐδενός εὐνάτειρα Μακροπτολέμοιο δέ μάτηρ μαίας ἀντιπέτροιο θοόν τέκεν ἰθυντῆρα, οὐχί Κεράσταν ὅν ποτε θρέψατο ταυροπάτωρ, ἀλλ᾽ ου πειλιπές αἶθε πάρος φρένα τέρμα σάκους, οὔνομ᾽ Ὅλον, δίζων, ὅς τᾶς μέροπος πόθον κούρας γηρυγόνας ἔχε τᾶς ἀνεμώδεος, ὅς Μοίσᾳ λιγύ πᾶξεν ἰοστεφάνῳ ἕλκος, ἄγαλμα πόθοιο πυρισμαράγου, ὅς σβέσεν ἀνορέαν ἰσαυδέα παπποφόνου Τυρίας τ᾽ ἐξήλασεν. ᾧ τόδε τυφλοφόρων ἐρατόν πῆμα Πάρις θέτο Σιμιχίδας· ψυχάν ᾇ, βροτοβάμων, στήτας οἶστρε

In [95]:
# Token count per document, computed from the joined sentence text with NLTK.
LAGT["tokencount"] = [len(word_tokenize(text)) for text in LAGT["string"]]

In [96]:
LAGT.head(5)

Unnamed: 0,author_id,doc_id,filename,author,title,sentences,lemmatized_sentences,source,lemmata_source,not_before,not_after,tlg_epithet,genre,provenience,wordcount,lemmatacount,string,tokencount
0,ogl0001,ogl0001.ogl001,ogl0001.ogl001.1st1K-grc1.xml,Pinytus,De Epistola Pinyti ad Dionysium,"[, , ., . . . . ., — πρός ἥν ( ,, ), ὁ Πινυτός...","[[], [], [], [], [πινυτός, ἀντιγράφω], [θαυμάζ...",glaux1,glaux1,101.0,200.0,[],[],christian,109,34,", , . . . . . . — πρός ἥν ( , ) ὁ Πινυτός ἀντι...",108
1,tlg0005,tlg0005.tlg003,tlg0005.tlg003.1st1K-grc1.xml,Theocritus,Syrinx,[Οὐδενός εὐνάτειρα Μακροπτολέμοιο δέ μάτηρ μαί...,"[[εὐνητήρ, μακροπτολέμον, μήτηρ, μαῖα, ἀντιπέτ...",glaux1,glaux1,-400.0,-201.0,[Bucolici],[],pagan,95,59,Οὐδενός εὐνάτειρα Μακροπτολέμοιο δέ μάτηρ μαία...,94
2,tlg0006,tlg0006.tlg020,tlg0006.tlg020.1st1K-grc1.xml,Euripides,Fragmenta,[ποίαν σε φῶμεν γαῖαν ἐκλελοιπότα πόλει ξενοῦσ...,"[[φημί, γῆ, ἐκλείπω, πόλις, ξενοῦσθαι], [πάτρα...",glaux1,glaux1,-500.0,-401.0,[Tragici],[],pagan,21516,10315,ποίαν σε φῶμεν γαῖαν ἐκλελοιπότα πόλει ξενοῦσθ...,21729
3,tlg0007,tlg0007.tlg146,tlg0007.tlg146.1st1K-grc1.xml,Plutarch,Παροιμίαι αἷς Ἀλεξανδρεῖς ἐχρῶντο,[Οἴκοι τά Μιλήσια: ἐπί τῶν ὅποι μή προςήκει τή...,"[[μιλήσιος, πργοςήκω, τρυφή, ἐπιδείκνυμι], [ἀρ...",glaux1,glaux1,1.0,200.0,"[Biographi, Philosophici/-ae]",[],pagan,3206,1496,Οἴκοι τά Μιλήσια: ἐπί τῶν ὅποι μή προςήκει τήν...,3178
4,tlg0007,tlg0007.tlg147,tlg0007.tlg147.1st1K-grc1.xml,Plutarch,Ἐκλογὴ περὶ τῶν ἀδυνάτων,"[Κατά πετρῶν σπείρεις., Πλίνθον πλύνεις., Δικτ...","[[πέτρα, σπείρω], [πλίνθος, πλύνω], [δίκτυον, ...",glaux1,glaux1,1.0,200.0,"[Biographi, Philosophici/-ae]",[],pagan,195,125,Κατά πετρῶν σπείρεις. Πλίνθον πλύνεις. Δικτύῳ ...,194


In [97]:
# Descriptive v4.1 columns plus the token count. Every column except the `doc_id`
# key gets a "lagt4-1_" prefix so its origin is visible in the name.
# A single rename with a callable replaces the column-by-column in-place loop.
LAGT41_metadata = (
    LAGT[['doc_id', 'author', 'title', 'not_before',
          'not_after', 'tlg_epithet', 'genre', 'provenience', "tokencount"]]
    .rename(columns=lambda col: col if col == "doc_id" else "lagt4-1_" + col)
)
LAGT41_metadata.head(5)

Unnamed: 0,lagt4-1_doc_id,lagt4-1_author,lagt4-1_title,lagt4-1_not_before,lagt4-1_not_after,lagt4-1_tlg_epithet,lagt4-1_genre,lagt4-1_provenience,lagt4-1_tokencount
0,ogl0001.ogl001,Pinytus,De Epistola Pinyti ad Dionysium,101.0,200.0,[],[],christian,108
1,tlg0005.tlg003,Theocritus,Syrinx,-400.0,-201.0,[Bucolici],[],pagan,94
2,tlg0006.tlg020,Euripides,Fragmenta,-500.0,-401.0,[Tragici],[],pagan,21729
3,tlg0007.tlg146,Plutarch,Παροιμίαι αἷς Ἀλεξανδρεῖς ἐχρῶντο,1.0,200.0,"[Biographi, Philosophici/-ae]",[],pagan,3178
4,tlg0007.tlg147,Plutarch,Ἐκλογὴ περὶ τῶν ἀδυνάτων,1.0,200.0,"[Biographi, Philosophici/-ae]",[],pagan,194


## Glaux metadata

In [99]:
# The Glaux metadata file is tab-separated text.
glaux_metadata_path = "/srv/data/greek/glaux/metadata.txt"
glaux_metadata = pd.read_csv(glaux_metadata_path, sep="\t")
glaux_metadata.head(5)

Unnamed: 0,GLAUX_TEXT_ID,TLG,STARTDATE,ENDDATE,AUTHOR_STANDARD,TITLE_STANDARD,GENRE_STANDARD,DIALECT,SOURCE,SOURCE_LICENSE,SOURCE_FORMAT,TOKENS,TM_TEXT
0,1,0012-001,-800,-701,Homerus,Ilias,Epic poetry,Ionic/Epic,Perseus,CC BY-SA 4.0,XML,129604,511
1,2,0012-002,-800,-701,Homerus,Odyssea,Epic poetry,Ionic/Epic,Perseus,CC BY-SA 4.0,XML,104364,512
2,3,0012-003,-800,-701,Homerus,Epigrammata,Lyric poetry,Ionic/Epic,Perseus,CC BY-SA 4.0,XML,26,12612
3,4,1351-001,-800,-701,Epigoni,Epigoni,Epic poetry,Ionic/Epic,https://sententiaeantiquae.com,,TXT,60,13805 / 15768
4,5,1547-001,-800,-701,Oedipodea,Oedipodea,Epic poetry,Ionic/Epic,https://sententiaeantiquae.com,,TXT,15,12913


In [102]:
# Worked example: split a Glaux TLG reference into its four-digit and three-digit parts.
glaux_tlg = "0012-001"
groups = re.search(r"(\d{4})\-(\d{3})", glaux_tlg).groups()
groups  # bare expression so the cell actually displays the tuple

('0012', '001')

In [103]:
def tlg_parsing(glaux_tlg: str) -> str:
    """Convert a Glaux TLG reference such as "0012-001" into "tlg0012.tlg001".

    Args:
        glaux_tlg: Text containing four digits, a hyphen and three digits.

    Returns:
        The identifier "tlg<4 digits>.tlg<3 digits>" built from the first match.

    Raises:
        ValueError: If ``glaux_tlg`` is not a string or contains no such
            pattern. The message names the offending value, replacing the
            opaque AttributeError/TypeError that ``re`` would otherwise cause.
    """
    match = re.search(r"(\d{4})\-(\d{3})", glaux_tlg) if isinstance(glaux_tlg, str) else None
    if match is None:
        raise ValueError(f"cannot parse TLG reference: {glaux_tlg!r}")
    first_part, second_part = match.groups()
    return "tlg" + first_part + ".tlg" + second_part
# Derive the "tlgNNNN.tlgNNN" identifier for every Glaux row.
glaux_metadata["tlg_id"] = glaux_metadata["TLG"].apply(tlg_parsing)

In [104]:
# Prefix every Glaux column with "glaux_" so its origin is visible in the name.
# Columns that already carry the prefix are left alone, so re-running this cell
# does not produce "glaux_glaux_..." names.
glaux_metadata = glaux_metadata.rename(
    columns=lambda col: col if col.startswith("glaux_") else "glaux_" + col
)
glaux_metadata.head(5)

Unnamed: 0,glaux_GLAUX_TEXT_ID,glaux_TLG,glaux_STARTDATE,glaux_ENDDATE,glaux_AUTHOR_STANDARD,glaux_TITLE_STANDARD,glaux_GENRE_STANDARD,glaux_DIALECT,glaux_SOURCE,glaux_SOURCE_LICENSE,glaux_SOURCE_FORMAT,glaux_TOKENS,glaux_TM_TEXT,glaux_tlg_id
0,1,0012-001,-800,-701,Homerus,Ilias,Epic poetry,Ionic/Epic,Perseus,CC BY-SA 4.0,XML,129604,511,tlg0012.tlg001
1,2,0012-002,-800,-701,Homerus,Odyssea,Epic poetry,Ionic/Epic,Perseus,CC BY-SA 4.0,XML,104364,512,tlg0012.tlg002
2,3,0012-003,-800,-701,Homerus,Epigrammata,Lyric poetry,Ionic/Epic,Perseus,CC BY-SA 4.0,XML,26,12612,tlg0012.tlg003
3,4,1351-001,-800,-701,Epigoni,Epigoni,Epic poetry,Ionic/Epic,https://sententiaeantiquae.com,,TXT,60,13805 / 15768,tlg1351.tlg001
4,5,1547-001,-800,-701,Oedipodea,Oedipodea,Epic poetry,Ionic/Epic,https://sententiaeantiquae.com,,TXT,15,12913,tlg1547.tlg001


In [107]:
# Copy the TLG-derived identifier into a `doc_id` column, the key used when the tables are merged.
glaux_metadata["doc_id"] = glaux_metadata["glaux_tlg_id"]

## OGA metadata

In [47]:
# Parse the OGA work-chronology XML file into a BeautifulSoup tree.
filepath = "/srv/data/greek/opera_graeca_adnotata_v0.2.0/work_chronology/texts/chronology_greek_works_plus_date_label.xml"
with open(filepath, mode="r", encoding="utf-8") as xml_handle:
    soup = BeautifulSoup(xml_handle, features="xml")

In [48]:
# Turn every <record> element into a flat {tag name: stripped text} mapping.
records = [
    {child.name: child.text.strip() for child in entry.find_all()}
    for entry in soup.find_all("record")
]

# One row per record, one column per tag name.
oga_metadata = pd.DataFrame(records)

In [49]:
oga_metadata.head(5)

Unnamed: 0,id,urn_cts,title_labels,title_from_print_edition,print_edition,author,estimated_work_date,is_temporary_work_date,date_source,date_source_link,comment_on_chronology,formatted_work_date,date_label
0,1,ogl0001.ogl001,De Epistola Pinyti ad Dionysium,De Epistola Pinyti ad Dionysium,"Pinytus, Saint, Bishop of Knossos. Reliquiae S...","Pinytus, Saint, Bishop of Knossos",2nd Century AD (101-200),No,"Kritiko Agiologio, Holy Archibisophry of Crete",http://www.iak.gr/gr/ekklisia-kritis/kritiko_a...,Church of Crete Saints' Days Catalogue. The da...,+0101-01/+0200-12,p2_2
1,2,stoa0033a.tlg028,De mundo,De mundo,"pseudo-Aristotle, De mundo, Aristotelis Opera,...",Pseudo-Aristotle,250 BC-50BC,No,Pseudo-Aristotle: De Mundo (On the Cosmos). Ca...,https://www.cambridge.org/core/books/pseudoari...,Written between the middle of the 3rd and the ...,—0249-01/—0049-12,m3_2/m2_1/m2_2/m1_1
2,3,stoa0033a.tlg043,De spiritu,De spiritu,"pseudo-Aristotle, De spiritu, Aristotelis Oper...",Pseudo-Aristotle,275 BC-250 BC,No,Pseud-Aristotelian De Spiritu: A New Case agai...,,Follows Jaeger's proposed chronology.,—0274-01/—0249-12,m3_1
3,4,stoa0121.stoa001,Breviarium historiae romanae,Breviarium historiae romanae,Eutropius. Breviarium historiae romanae. Droys...,Eutropius,364 AD-378 AD,No,"Eutropius, Livius.org, 2020",https://www.livius.org/articles/person/eutropius/,Proposes an exact date of 369 AD.,+0364-01/+0378-12,p4_2
4,5,stoa0146d.stoa001,Acta Archelai,Acta Archelai,"Hegemonius. Acta Archelai. Beeson, Charles Hen...",Hegemonius,280 AD - 350 AD,No,"Archelaos, Wikisource",https://de.wikisource.org/wiki/RE:Archelaos_40,Based on the fact that he was bishop around 28...,+0280-01/+0350-12,p4_1


In [50]:
oga_metadata["formatted_work_date"].tolist()[:10]

['+0101-01/+0200-12',
 '—0249-01/—0049-12',
 '—0274-01/—0249-12',
 '+0364-01/+0378-12',
 '+0280-01/+0350-12',
 '—0299-01/—0200-12',
 '—0430-01/—0410-12',
 '+0222-01/+0235-12',
 '—0299-01/—0259-12',
 '—0299-01/—0259-12']

In [65]:
def _parse_year(date_part):
    """Return the signed integer year that starts one "<sign>YYYY-MM" part."""
    # Accepted sign characters: "+", ASCII hyphen, em dash (U+2014, the form seen
    # in the OGA data), en dash (U+2013) and the Unicode minus sign (U+2212).
    match = re.match(r"\s*([+\-\u2014\u2013\u2212]?)(\d+)", date_part)
    if match is None:
        raise ValueError(f"cannot parse year from date part: {date_part!r}")
    sign, digits = match.groups()
    year = int(digits)
    # Any sign other than "+" (or no sign at all) marks a negative year.
    return year if sign in ("", "+") else -year


def parse_dates(formatted_date):
    """Parse a range such as "—0299-01/—0200-12" into integer years.

    Args:
        formatted_date: Two "<sign>YYYY-MM" parts separated by "/". The sign
            may be "+", absent, or a dash/minus character; whitespace around
            the parts is tolerated.

    Returns:
        Tuple ``(not_before, not_after)`` of signed integer years; the month
        component is ignored.

    Raises:
        ValueError: If the string does not split into exactly two parts on "/"
            or a part does not start with an (optionally signed) year.
    """
    not_before, not_after = formatted_date.split("/")
    return _parse_year(not_before), _parse_year(not_after)



In [67]:
parse_dates("—0299-01/—0200-12")

(-299, -200)

In [68]:
# Parse each formatted range once, then split the (not_before, not_after) pairs
# into two columns. Unlike `zip(*...)` unpacking, this also works on a frame with no rows.
parsed_ranges = oga_metadata["formatted_work_date"].apply(parse_dates).tolist()
oga_metadata["not_before"] = [start for start, _ in parsed_ranges]
oga_metadata["not_after"] = [end for _, end in parsed_ranges]

In [69]:
oga_metadata.head(5)

Unnamed: 0,id,urn_cts,title_labels,title_from_print_edition,print_edition,author,estimated_work_date,is_temporary_work_date,date_source,date_source_link,comment_on_chronology,formatted_work_date,date_label,not_before,not_after
0,1,ogl0001.ogl001,De Epistola Pinyti ad Dionysium,De Epistola Pinyti ad Dionysium,"Pinytus, Saint, Bishop of Knossos. Reliquiae S...","Pinytus, Saint, Bishop of Knossos",2nd Century AD (101-200),No,"Kritiko Agiologio, Holy Archibisophry of Crete",http://www.iak.gr/gr/ekklisia-kritis/kritiko_a...,Church of Crete Saints' Days Catalogue. The da...,+0101-01/+0200-12,p2_2,101,200
1,2,stoa0033a.tlg028,De mundo,De mundo,"pseudo-Aristotle, De mundo, Aristotelis Opera,...",Pseudo-Aristotle,250 BC-50BC,No,Pseudo-Aristotle: De Mundo (On the Cosmos). Ca...,https://www.cambridge.org/core/books/pseudoari...,Written between the middle of the 3rd and the ...,—0249-01/—0049-12,m3_2/m2_1/m2_2/m1_1,-249,-49
2,3,stoa0033a.tlg043,De spiritu,De spiritu,"pseudo-Aristotle, De spiritu, Aristotelis Oper...",Pseudo-Aristotle,275 BC-250 BC,No,Pseud-Aristotelian De Spiritu: A New Case agai...,,Follows Jaeger's proposed chronology.,—0274-01/—0249-12,m3_1,-274,-249
3,4,stoa0121.stoa001,Breviarium historiae romanae,Breviarium historiae romanae,Eutropius. Breviarium historiae romanae. Droys...,Eutropius,364 AD-378 AD,No,"Eutropius, Livius.org, 2020",https://www.livius.org/articles/person/eutropius/,Proposes an exact date of 369 AD.,+0364-01/+0378-12,p4_2,364,378
4,5,stoa0146d.stoa001,Acta Archelai,Acta Archelai,"Hegemonius. Acta Archelai. Beeson, Charles Hen...",Hegemonius,280 AD - 350 AD,No,"Archelaos, Wikisource",https://de.wikisource.org/wiki/RE:Archelaos_40,Based on the fact that he was bishop around 28...,+0280-01/+0350-12,p4_1,280,350


In [70]:
# Prefix every OGA column with "oga_" so its origin is visible in the name.
# Columns that already carry the prefix are left alone, so re-running this cell
# does not produce "oga_oga_..." names.
oga_metadata = oga_metadata.rename(
    columns=lambda col: col if col.startswith("oga_") else "oga_" + col
)
oga_metadata.head(5)

Unnamed: 0,oga_id,oga_urn_cts,oga_title_labels,oga_title_from_print_edition,oga_print_edition,oga_author,oga_estimated_work_date,oga_is_temporary_work_date,oga_date_source,oga_date_source_link,oga_comment_on_chronology,oga_formatted_work_date,oga_date_label,oga_not_before,oga_not_after
0,1,ogl0001.ogl001,De Epistola Pinyti ad Dionysium,De Epistola Pinyti ad Dionysium,"Pinytus, Saint, Bishop of Knossos. Reliquiae S...","Pinytus, Saint, Bishop of Knossos",2nd Century AD (101-200),No,"Kritiko Agiologio, Holy Archibisophry of Crete",http://www.iak.gr/gr/ekklisia-kritis/kritiko_a...,Church of Crete Saints' Days Catalogue. The da...,+0101-01/+0200-12,p2_2,101,200
1,2,stoa0033a.tlg028,De mundo,De mundo,"pseudo-Aristotle, De mundo, Aristotelis Opera,...",Pseudo-Aristotle,250 BC-50BC,No,Pseudo-Aristotle: De Mundo (On the Cosmos). Ca...,https://www.cambridge.org/core/books/pseudoari...,Written between the middle of the 3rd and the ...,—0249-01/—0049-12,m3_2/m2_1/m2_2/m1_1,-249,-49
2,3,stoa0033a.tlg043,De spiritu,De spiritu,"pseudo-Aristotle, De spiritu, Aristotelis Oper...",Pseudo-Aristotle,275 BC-250 BC,No,Pseud-Aristotelian De Spiritu: A New Case agai...,,Follows Jaeger's proposed chronology.,—0274-01/—0249-12,m3_1,-274,-249
3,4,stoa0121.stoa001,Breviarium historiae romanae,Breviarium historiae romanae,Eutropius. Breviarium historiae romanae. Droys...,Eutropius,364 AD-378 AD,No,"Eutropius, Livius.org, 2020",https://www.livius.org/articles/person/eutropius/,Proposes an exact date of 369 AD.,+0364-01/+0378-12,p4_2,364,378
4,5,stoa0146d.stoa001,Acta Archelai,Acta Archelai,"Hegemonius. Acta Archelai. Beeson, Charles Hen...",Hegemonius,280 AD - 350 AD,No,"Archelaos, Wikisource",https://de.wikisource.org/wiki/RE:Archelaos_40,Based on the fact that he was bishop around 28...,+0280-01/+0350-12,p4_1,280,350


In [105]:
# Copy the `oga_urn_cts` value into a `doc_id` column, the key used when the tables are merged.
oga_metadata["doc_id"] = oga_metadata["oga_urn_cts"]

In [127]:
# Directory of per-document pickle files; later cells open "<doc_id>.pickle" inside it.
sentences_path = "/srv/data/greek/oga_sentences/"
# Peek at the first few file names.
os.listdir(sentences_path)[:10]

['tlg1264.tlg001.pickle',
 'tlg0007.tlg121.pickle',
 'pta0100.pta008.pickle',
 'tlg0527.tlg020.pickle',
 'tlg0540.tlg019.pickle',
 'tlg0026.tlg004.pickle',
 'tlg0018.tlg020.pickle',
 'tlg0540.tlg015.pickle',
 'tlg2042.tlg086.pickle',
 'pta0001.pta005.pickle']

In [130]:
# Load the sentence pickle of the first OGA record as a sample to inspect.
doc_id = oga_metadata["doc_id"][0]
# NOTE(review): pickle.load can execute arbitrary code; only use it on files from a trusted source.
with open(os.path.join(sentences_path, doc_id + ".pickle"), "rb") as f:
    sentences_data = pickle.load(f)

In [133]:
# Total length of element 3 across all sentence records of the sample document.
sum(len(sent[3]) for sent in sentences_data)

109

In [136]:
def count_oga_tokens(doc_id, sentences_dir=None):
    """Count the tokens stored in the sentence pickle of one document.

    Args:
        doc_id: Document identifier; the file read is "<doc_id>.pickle".
        sentences_dir: Directory containing the pickles. Defaults to the
            notebook-level ``sentences_path``.

    Returns:
        The summed length of element 3 of every sentence record (presumably the
        sentence's token list — TODO confirm), or None when the count cannot be
        computed.
    """
    import warnings  # local import keeps this cell independent of the import cell

    try:
        if sentences_dir is None:
            sentences_dir = sentences_path
        # NOTE(review): pickle.load can run arbitrary code; only point this at trusted files.
        with open(os.path.join(sentences_dir, doc_id + ".pickle"), "rb") as f:
            sentences_data = pickle.load(f)
        return sum(len(sent[3]) for sent in sentences_data)
    except FileNotFoundError:
        # Expected case: no sentence file exists for this document.
        return None
    except Exception as exc:
        # Stay best-effort (still None), but surface the problem instead of hiding it.
        # Unlike a bare `except:`, this lets KeyboardInterrupt/SystemExit propagate.
        warnings.warn(
            f"could not count tokens for {doc_id!r}: {exc!r}",
            RuntimeWarning,
            stacklevel=2,
        )
        return None

In [137]:
# Count tokens for every record; count_oga_tokens returns None when a document's pickle cannot be read.
oga_metadata["oga_tokencount"] = oga_metadata["doc_id"].apply(count_oga_tokens)

In [116]:
oga_metadata.columns

Index(['oga_id', 'oga_urn_cts', 'oga_title_labels',
       'oga_title_from_print_edition', 'oga_print_edition', 'oga_author',
       'oga_estimated_work_date', 'oga_is_temporary_work_date',
       'oga_date_source', 'oga_date_source_link', 'oga_comment_on_chronology',
       'oga_formatted_work_date', 'oga_date_label', 'oga_not_before',
       'oga_not_after', 'doc_id'],
      dtype='object')

In [138]:
# Keep only the OGA columns that go into the merged table.
oga_columns_to_keep = [
    'oga_id',
    'oga_title_labels',
    'oga_title_from_print_edition',
    'oga_print_edition',
    'oga_author',
    'oga_date_source',
    'oga_date_source_link',
    'oga_comment_on_chronology',
    'oga_formatted_work_date',
    'oga_not_before',
    'oga_not_after',
    'doc_id',
    'oga_tokencount',
]
oga_metadata = oga_metadata[oga_columns_to_keep]

## Merge metadata

In [139]:
from functools import reduce

In [140]:
# Frames to combine, in merge order; they are joined on their `doc_id` column.
dataframes = [LAGT3_metadata, LAGT41_metadata, glaux_metadata, oga_metadata]

In [141]:
def _outer_join_on_doc_id(left, right):
    """Outer-join two frames on `doc_id`, keeping documents present in either one."""
    return pd.merge(left, right, on="doc_id", how="outer")


# Fold the list left to right into one wide table.
gr_metadata_merged = reduce(_outer_join_on_doc_id, dataframes)


In [142]:
gr_metadata_merged.columns

Index(['doc_id', 'lagt3_lemmata_source', 'lagt3_tokencount', 'lagt4-1_author',
       'lagt4-1_title', 'lagt4-1_not_before', 'lagt4-1_not_after',
       'lagt4-1_tlg_epithet', 'lagt4-1_genre', 'lagt4-1_provenience',
       'lagt4-1_tokencount', 'glaux_GLAUX_TEXT_ID', 'glaux_TLG',
       'glaux_STARTDATE', 'glaux_ENDDATE', 'glaux_AUTHOR_STANDARD',
       'glaux_TITLE_STANDARD', 'glaux_GENRE_STANDARD', 'glaux_DIALECT',
       'glaux_SOURCE', 'glaux_SOURCE_LICENSE', 'glaux_SOURCE_FORMAT',
       'glaux_TOKENS', 'glaux_TM_TEXT', 'glaux_tlg_id', 'oga_id',
       'oga_title_labels', 'oga_title_from_print_edition', 'oga_print_edition',
       'oga_author', 'oga_date_source', 'oga_date_source_link',
       'oga_comment_on_chronology', 'oga_formatted_work_date',
       'oga_not_before', 'oga_not_after', 'oga_tokencount'],
      dtype='object')

In [145]:
# Column order for the published table: comparable fields from the different
# sources sit next to each other. Columns not listed here are dropped.
merged_column_order = [
    'doc_id',
    # authors
    'lagt4-1_author',
    'glaux_AUTHOR_STANDARD',
    'oga_author',
    # titles
    'lagt4-1_title',
    'glaux_TITLE_STANDARD',
    'oga_title_labels',
    'oga_title_from_print_edition',
    # dating
    'lagt4-1_not_before',
    'lagt4-1_not_after',
    'glaux_STARTDATE',
    'glaux_ENDDATE',
    'oga_not_before',
    'oga_not_after',
    'oga_date_source',
    'oga_date_source_link',
    'oga_comment_on_chronology',
    'oga_formatted_work_date',
    # token counts
    'lagt3_tokencount',
    'lagt4-1_tokencount',
    'glaux_TOKENS',
    'oga_tokencount',
    # genre, dialect, provenience
    'lagt4-1_tlg_epithet',
    'lagt4-1_genre',
    'glaux_GENRE_STANDARD',
    'glaux_DIALECT',
    'lagt4-1_provenience',
    # source-specific identifiers and edition details
    'glaux_GLAUX_TEXT_ID',
    'glaux_TLG',
    'glaux_SOURCE',
    'glaux_SOURCE_FORMAT',
    'glaux_TM_TEXT',
    'oga_id',
    'oga_print_edition',
]
gr_metadata_merged = gr_metadata_merged[merged_column_order]

In [146]:
gr_metadata_merged.sample(10)

Unnamed: 0,doc_id,lagt4-1_author,glaux_AUTHOR_STANDARD,oga_author,lagt4-1_title,glaux_TITLE_STANDARD,oga_title_labels,oga_title_from_print_edition,lagt4-1_not_before,lagt4-1_not_after,...,glaux_GENRE_STANDARD,glaux_DIALECT,lagt4-1_provenience,glaux_GLAUX_TEXT_ID,glaux_TLG,glaux_SOURCE,glaux_SOURCE_FORMAT,glaux_TM_TEXT,oga_id,oga_print_edition
150,tlg0007.tlg022,Plutarch,Plutarchus,Plutarch,Μάρκελλος,Marcellus,Marcellus,Marcellus,1.0,200.0,...,Biography,Attic/Koine,pagan,1075.0,0007-022,Perseus,XML,5775.0,52.0,"Plutarch. Plutarch's Lives, Vol. V. Perrin, Be..."
590,tlg0057.tlg016,Galen,Galenus,Galen,De uteri dissectione,De uteri dissectione,De uteri dissectione,De uteri dissectione,101.0,200.0,...,Medicine,Attic/Koine,pagan,1379.0,0057-016,First1K,XML,12536.0,1101.0,"Galen, De uteri dissectione, Claudii Galeni Op..."
1683,tlg1799.tlg007,Euclid,,Euclid,Data,,Data,Data,-300.0,-201.0,...,,,pagan,,,,,,898.0,"Euclid. Euclidis Opera Omnia, Volume 6. Menge,..."
1085,tlg0365.tlg001,Apollodorus Lyricus,Apollodorus,,Fragmenta,Fragmentum,,,-600.0,-501.0,...,Lyric poetry,Ionic/Epic,pagan,88.0,0365-001,Loeb (public domain),TXT,10797.0,,
1716,tlg2001.tlg040,Themistius,,Themistius,In libros Aristotelis de Anima paraphrasis,,In Aristotelis Libros De Anima Paraphrasis,In Aristotelis Libros De Anima Paraphrasis,301.0,400.0,...,,,pagan,,,,,,924.0,Themistius. Themistii in libros Aristotelis de...
1256,tlg0552.tlg004,Archimède,Archimedes,Archimedes,De lineis spiralibus,De lineis spiralibus,De lineis spiralibus,De lineis spiralibus,-300.0,-201.0,...,Mathematics,Doric,pagan,666.0,0552-004,First1K,XML,5366.0,680.0,"Archimedes. Archimède, Volume 2. Mugler, Charl..."
1186,tlg0537.tlg012,Epicurus,,Epicurus,Epistula Tertia ad Menoeceum,,Epistula ad Menoeceum,Epistula Tertia ad Menoeceum,-400.0,-201.0,...,,,pagan,,,,,,1295.0,Epicurus. Epicuri epistulae tres et ratae sent...
2034,tlg3129.ogl001,,,Theophylactus,,,Commentary on St. Cyril In XII Prophetas,"Quae Theopylacti, Pro D. Cyrilli Commentariis ...",,,...,,,,,,,,,1521.0,"Quae Theopylacti, Pro D. Cyrilli Commentariis ..."
899,tlg0087.tlg007,Herodianus,,Aelius Herodianus,Περὶ Ἰλιακῆς προσῳδίας,,Περὶ Ἰλιακῆς προσῳδίας,Περὶ Ἰλιακῆς προσῳδίας,101.0,200.0,...,,,pagan,,,,,,534.0,"Aelius Herodianus, Περὶ Ἰλιακῆς προσῳδίας, Gra..."
561,tlg0035.tlg005,Moschus,Moschus,Moschus,Fragmenta,Fragmenta,Fragmenta,Fragmenta,-200.0,-101.0,...,Lyric poetry,Doric,pagan,762.0,0035-005,Perseus,XML,12732.0,1083.0,"Moschus. The Greek Bucolic Poets. Edmonds, J. ..."


In [147]:
# Reuse the "gr_metadata_merged" worksheet when it already exists so this cell can
# be re-run; `add_worksheet` alone fails once a sheet with that title is present.
try:
    merged_worksheet = lagt_metadata_gs.worksheet("gr_metadata_merged")
except gspread.exceptions.WorksheetNotFound:
    merged_worksheet = lagt_metadata_gs.add_worksheet("gr_metadata_merged", 1, 1)
# resize=True makes the sheet match the frame's shape, so rows/columns left over
# from an earlier, larger upload do not linger.
set_with_dataframe(merged_worksheet, gr_metadata_merged, resize=True)

In [149]:
# Also save the merged table locally as CSV, without the index column.
gr_metadata_merged.to_csv("../data/gr_metadata_merged.csv", index=False)