In [1]:
import os
import pandas as pd
import json
import gspread
from gspread_dataframe import get_as_dataframe, set_with_dataframe
from google.oauth2 import service_account # based on google-auth library

In [2]:
# Load the LAGT dataset from its parquet file into a DataFrame.
LAGT = pd.read_parquet("../data/large_files/LAGT_grecy.parquet")

In [3]:
# Inspect three specific documents (selected by doc_id) before any metadata is changed.
LAGT[LAGT["doc_id"].isin(["tlg4163.tlg001", "tlg5451.tlg001", "tlg2679.tlg002"])]

Unnamed: 0,author_id,doc_id,filename,author,title,sentences,lemmatized_sentences,source,lemmata_source,not_before,not_after,tlg_epithet,genre,provenience,wordcount,lemmatacount
1874,tlg4163,tlg4163.tlg001,,Telegonia,Telegonia,[ἤσθιεν ἁρπαλέως κρέα τ’ ἄσπετα καὶ μέθυ ἡδύ.],"[[ἐσθίω, κρέας, ἄσπετος, μέθυ, ἡδύς]]",glaux1,glaux1,,,,,,9,5
1885,tlg2679,tlg2679.tlg002,,,,"[εὐλογητὸς εἶ, κύριε ὁ θεός, ὁ δοὺς τῷ Σολομῶν...","[[εὐλογητός, εἰμί, κύριος, θεός, δίδωμι, Σολομ...",glaux1,glaux1,,,,,,87,38
1943,tlg5451,tlg5451.tlg001,,,Passio sancti Sabae Gothi (sub auctore Athanar...,"[Μαρτύριον τοῦ ἁγίου Σάβα τοῦ Γότθου., Ἡ ἐκκλη...","[[μαρτύριος, ἅγιος, σάβας, γότθης], [ἐκκλησία,...",exprecce,grecy,301.0,400.0,,Hagiogr.,christian,1892,841


In [4]:
# (1) read the service account key file; the `with` block makes sure the
#     file handle is closed again (a bare `open(...)` would leave it open)
with open(os.path.expanduser("../../ServiceAccountsKey.json"), encoding="utf-8") as key_file:
    file_data = json.load(key_file)
# (2) transform the content into a credentials object
credentials = service_account.Credentials.from_service_account_info(file_data)
# (3) specify your usage of the credentials
scoped_credentials = credentials.with_scopes(
    ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive'])
# (4) use the constrained credentials for authentication of gspread package
gc = gspread.Client(auth=scoped_credentials)

# (5) open the target Google spreadsheet by its URL
lagt_metadata_gs = gc.open_by_url(
    "https://docs.google.com/spreadsheets/d/10pGulpiwzjUozVEVstKBwtftyDisSY9h7-kl82TVs0A/edit?usp=sharing")

In [5]:
# Fetch every row of the "additions_v1" worksheet as a list of dicts
# (one dict per row), then preview the first five records.
data = lagt_metadata_gs.worksheet("additions_v1").get_all_records()
data[:5]

[{'doc_id': 'tlg0002.tlg001',
  'doc_id_comment': '',
  'corrected_doc_id': '',
  'author': 'Theognis',
  'title': 'Elegiae',
  'not_before': -600,
  'not_after': -501,
  'provenience': 'pagan',
  'Genre1': 'Eleg.',
  'Genre2': '',
  'Genre3': ''},
 {'doc_id': 'tlg0003.tlg002',
  'doc_id_comment': '',
  'corrected_doc_id': '',
  'author': 'Thucydides Atheniensis',
  'title': 'Epigramma',
  'not_before': -500,
  'not_after': -401,
  'provenience': 'pagan',
  'Genre1': 'Epigr.',
  'Genre2': '',
  'Genre3': ''},
 {'doc_id': 'tlg0004.tlg002',
  'doc_id_comment': '',
  'corrected_doc_id': '',
  'author': 'Diogenes Laertius',
  'title': 'Epigrammata (App. Anth.)',
  'not_before': 201,
  'not_after': 300,
  'provenience': 'pagan',
  'Genre1': 'Epigr.',
  'Genre2': '',
  'Genre3': ''},
 {'doc_id': 'tlg0005.tlg005',
  'doc_id_comment': '',
  'corrected_doc_id': '',
  'author': 'Theocritus',
  'title': 'Epigrammata',
  'not_before': -400,
  'not_after': -201,
  'provenience': 'pagan',
  'Genre1'

In [6]:
# Index the worksheet records by their 'doc_id'. Each value is a copy of the
# record without the 'doc_id' key; records whose 'doc_id' is missing or empty
# are skipped. New dicts are built instead of popping the key from the rows of
# `data`, so `data` stays intact and re-running this cell gives the same
# mapping (popping would leave nothing to index on a second run).
restructured_data = {
    row["doc_id"]: {key: value for key, value in row.items() if key != "doc_id"}
    for row in data
    if row.get("doc_id")
}

In [None]:
# Directory holding the JSON files named after doc_ids; used when renaming
# those files according to our new (corrected) doc_ids.
target_path = "../data/large_files/sents_data_jsons/"

In [7]:

# Apply the spreadsheet values to LAGT, one document at a time.
for doc_id, value_dict in restructured_data.items():
    # Find the corresponding row in LAGT DataFrame
    matching_index = LAGT.index[LAGT['doc_id'] == doc_id]
    if matching_index.empty:
        # this doc_id is not in LAGT, nothing to update
        continue
    row_index = matching_index[0]
    # Update values in LAGT DataFrame wherever the value in restructured_data is not empty
    for col, new_value in value_dict.items():
        if isinstance(new_value, str):
            # drop question marks from the spreadsheet text
            new_value = new_value.replace("?", "")
        if new_value in (None, ''):
            continue
        if col == "corrected_doc_id":
            # A doc_id is text: keep it out of the integer conversion below so
            # that the path concatenation cannot fail on a non-string value.
            new_value = str(new_value)
            # rename our json files accordingly!
            old_json = target_path + doc_id + ".json"
            new_json = target_path + new_value + ".json"
            if os.path.exists(old_json):
                os.rename(old_json, new_json)
            elif not os.path.exists(new_json):
                # neither file is there: report it instead of aborting the whole update
                print("no json file found for {} (looked for {} and {})".format(
                    doc_id, old_json, new_json))
            # else: only the new file exists, i.e. it was renamed in an earlier run
            col = "doc_id"
        else:
            # Store values that convert cleanly as int (e.g. not_before / not_after);
            # anything int() rejects is kept as it is.
            try:
                new_value = int(new_value)
            except (ValueError, TypeError, OverflowError):
                pass
        LAGT.at[row_index, col] = new_value

In [80]:
# Re-inspect the same three documents after the metadata update.
LAGT[LAGT["doc_id"].isin(["tlg4163.tlg001", "tlg5451.tlg001", "tlg2679.tlg002"])]

Unnamed: 0,author_id,doc_id,filename,author,title,sentences,lemmatized_sentences,source,lemmata_source,not_before,not_after,tlg_epithet,genre,provenience,wordcount,lemmatacount,Genre1,Genre2,Genre3,doc_id_comment
1874,tlg4163,tlg4163.tlg001,,Telegonia,Telegonia,[ἤσθιεν ἁρπαλέως κρέα τ’ ἄσπετα καὶ μέθυ ἡδύ.],"[[ἐσθίω, κρέας, ἄσπετος, μέθυ, ἡδύς]]",glaux1,glaux1,-600.0,-501.0,,,pagan,9,5,,,,
1885,tlg2679,tlg2679.tlg002,,,Testamentum Salomonis (recensiones A et B) (ms...,"[εὐλογητὸς εἶ, κύριε ὁ θεός, ὁ δοὺς τῷ Σολομῶν...","[[εὐλογητός, εἰμί, κύριος, θεός, δίδωμι, Σολομ...",glaux1,glaux1,201.0,300.0,,,christian,87,38,,,,
1943,tlg5451,tlg5451.tlg001,,,Passio sancti Sabae Gothi (sub auctore Athanar...,"[Μαρτύριον τοῦ ἁγίου Σάβα τοῦ Γότθου., Ἡ ἐκκλη...","[[μαρτύριος, ἅγιος, σάβας, γότθης], [ἐκκλησία,...",exprecce,grecy,301.0,400.0,,Hagiogr.,christian,1892,841,,,,


In [8]:
# doc_ids for which a JSON file exists: each file name with its last extension removed
jsons_ids = [f.rpartition(".")[0] for f in os.listdir("../data/large_files/sents_data_jsons/")]
# Report for a few doc_ids whether the matching JSON file is there. A space now
# separates the doc_id from the status, which previously ran together.
for doc_id in ["tlg4163.tlg001", "tlg5451.tlg001", "tlg2679.tlg002", "tlg0653.tlg003", "tlg0653.tlg002"]:
    status = "not present" if doc_id not in jsons_ids else "present"
    print(doc_id + " " + status)

tlg4163.tlg001present
tlg5451.tlg001present
tlg2679.tlg002present
tlg0653.tlg003present
tlg0653.tlg002not present


In [9]:
def combine_genres(row):
    """Collect the usable values of 'Genre1', 'Genre2' and 'Genre3' into a list.

    Parameters
    ----------
    row : mapping-like (a DataFrame row or a dict)
        Values are read with ``row.get`` so that an absent genre key is
        treated as "no value" instead of raising ``KeyError``.

    Returns
    -------
    list
        The genre values in Genre1 -> Genre3 order, skipping missing values
        (None / NaN) and empty or whitespace-only strings.
    """
    genres = (row.get(key) for key in ('Genre1', 'Genre2', 'Genre3'))
    return [
        genre for genre in genres
        # pd.notna alone would let '' through, so blank strings are excluded explicitly
        if pd.notna(genre) and not (isinstance(genre, str) and not genre.strip())
    ]

# Rebuild 'genre' as a list for every row: an existing genre is wrapped in a
# one-item list, a missing one falls back to the combined Genre1-3 values.
def _genre_as_list(row):
    if pd.isna(row['genre']):
        return combine_genres(row)
    return [row['genre']]

LAGT['genre'] = LAGT.apply(_genre_as_list, axis=1)

# These columns are removed once 'genre' has been rebuilt.
LAGT = LAGT.drop(['Genre1', 'Genre2', 'Genre3', 'doc_id_comment'], axis=1)


In [10]:
# "tlg0653.tlg002" should not exist anymore;
# we should see "tlg0653.tlg003" instead...
LAGT[LAGT["doc_id"].isin(["tlg4163.tlg001", "tlg5451.tlg001", "tlg2679.tlg002", "tlg0653.tlg003", "tlg0653.tlg002"])]

Unnamed: 0,author_id,doc_id,filename,author,title,sentences,lemmatized_sentences,source,lemmata_source,not_before,not_after,tlg_epithet,genre,provenience,wordcount,lemmatacount
1778,tlg0653,tlg0653.tlg003,,Aratus Soleus,Epigrammata,"[αἰάζω Διότιμον, ὃς ἐν πέτραισι κάθηται, Γαργα...","[[αἰάζω, Διότιμος, πέτρα, κάθημαι, Γαργαρής, π...",glaux1,glaux1,-400.0,-201.0,,[Epigr.],pagan,65,31
1874,tlg4163,tlg4163.tlg001,,Telegonia,Telegonia,[ἤσθιεν ἁρπαλέως κρέα τ’ ἄσπετα καὶ μέθυ ἡδύ.],"[[ἐσθίω, κρέας, ἄσπετος, μέθυ, ἡδύς]]",glaux1,glaux1,-600.0,-501.0,,[],pagan,9,5
1885,tlg2679,tlg2679.tlg002,,,Testamentum Salomonis (recensiones A et B) (ms...,"[εὐλογητὸς εἶ, κύριε ὁ θεός, ὁ δοὺς τῷ Σολομῶν...","[[εὐλογητός, εἰμί, κύριος, θεός, δίδωμι, Σολομ...",glaux1,glaux1,201.0,300.0,,[],christian,87,38
1943,tlg5451,tlg5451.tlg001,,,Passio sancti Sabae Gothi (sub auctore Athanar...,"[Μαρτύριον τοῦ ἁγίου Σάβα τοῦ Γότθου., Ἡ ἐκκλη...","[[μαρτύριος, ἅγιος, σάβας, γότθης], [ἐκκλησία,...",exprecce,grecy,301.0,400.0,,[Hagiogr.],christian,1892,841


In [11]:
# version tag inserted into the names of the output files
version = "v4-1"

In [19]:
# write the updated LAGT DataFrame to a parquet file named after `version`
LAGT.to_parquet("../data/large_files/LAGT_{}.parquet".format(version))

In [21]:
# Export the metadata columns only (the sentence columns are left out) to a
# CSV file named after `version`, for future usage.
metadata_columns = [
    'author_id', 'doc_id', 'filename', 'author', 'title',
    'source', 'lemmata_source', 'not_before', 'not_after',
    'tlg_epithet', 'genre', 'provenience', 'wordcount', 'lemmatacount',
]
metadata_csv_path = "../data/LAGT_{}_metadata.csv".format(version)
LAGT[metadata_columns].to_csv(metadata_csv_path, index=False)