# Setup

In [1]:
### PREREQUISTIES
### (many used only in one notebook...)

import os
import pandas as pd
import numpy as np
import logging ### to monitor the code
from bs4 import BeautifulSoup
import pickle
import math
import random
import sys
import csv
import unicodedata

import nltk
from nltk.collocations import *

import networkx as nx

import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats

import xml.etree.cElementTree as ET
from urllib.request import urlopen

import gspread
from gspread_dataframe import get_as_dataframe, set_with_dataframe
#from google.colab import auth
#from oauth2client.client import GoogleCredentials
from google.oauth2 import service_account # based on google-auth library
import sddk

In [2]:
conf = sddk.configure()

sciencedata.dk username (format '123456@au.dk'): 648597@au.dk
sciencedata.dk password: ········
endpoint variable has been configured to: https://sciencedata.dk/files/


In [3]:
# to access gsheet, you need Google Service Account key json file
# I have mine located in my personal space on sciencedata.dk, so I read it from there:

# (1) read the file and parse its content
file_data = conf[0].get(conf[1] + "ServiceAccountsKey.json").json()
# (2) transform the content into crendentials object
credentials = service_account.Credentials.from_service_account_info(file_data)
# (3) specify your usage of the credentials
scoped_credentials = credentials.with_scopes(['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive'])
# (4) use the constrained credentials for authentication of gspread package
gc = gspread.Client(auth=scoped_credentials)
# (5) establish connection with spreadsheets specified by their url
PIA_data = gc.open_by_url("https://docs.google.com/spreadsheets/d/1KxOx7Be9fj3lDcEPgQhQ-Iqcn9p367-MMD6RMXe8rks/edit?usp=sharing")
PIA_overview = gc.open_by_url("https://docs.google.com/spreadsheets/d/1e94wyelg6dftQ4zxbq1xvwxWAI-BhcYXtclDW-YTnrw/edit?usp=sharing")

In [4]:
PIA_data.worksheets()

[<Worksheet 'Sheet1' id:0>,
 <Worksheet 'Aristotle_λύπη' id:1718022011>,
 <Worksheet 'Hippocrates_λύπη' id:309444251>,
 <Worksheet 'Hippocrates_ἀλγηδών' id:844140843>,
 <Worksheet 'Hippocrates_πόνος' id:1101646330>,
 <Worksheet 'Hippocrates_ὀδύνη' id:287961093>,
 <Worksheet 'Aristotle_ἀλγηδών' id:1146799700>,
 <Worksheet 'Aristotle_πόνος' id:1073775859>,
 <Worksheet 'Aristotle_ὀδύνη' id:438127122>,
 <Worksheet 'keyterms_overview' id:1994560877>,
 <Worksheet 'c_aristotelicum_OVERVIEW_NEW' id:984710941>,
 <Worksheet 'c_hippocraticum_OVERVIEW_NEW' id:1964402831>]

# Load the main dataset of ancient Greek texts

In [5]:
AGT = sddk.read_file("SDAM_root/SDAM_data/AGT/AGT_20201124.json", "df", conf)

In [6]:
AGT.head(5)

Unnamed: 0,filename,author,title,wordcount,author_id,doc_id,raw_date,date_avr,date_probs,date_manual,provenience,n_sentences,lemmatized_sentences,tlg_epithet,lemmata,lemmata_wordcount
0,tlg0001.tlg001.perseus-grc2.xml,Apollonius Rhodius,Argonautica,38822,tlg0001,tlg0001.tlg001,3 B.C.,-2.5,{'-2.5': 1},-2.5,pagan,3252,"[[ἄρχω, Φοῖβος, παλαιγενεών, κλέος, φώς, μιμνή...",Epici/-ae,"[ἄρχω, Φοῖβος, παλαιγενεών, κλέος, φώς, μιμνήσ...",23231
1,tlg0003.tlg001.perseus-grc2.xml,Thucydides,The Peloponnesian War,150118,tlg0003,tlg0003.tlg001,5 B.C.,-4.5,{'-4.5': 1},-4.5,pagan,6068,"[[θουκυδιδής, Ἀθηναῖος, συγγράφω, πόλεμος, Πελ...",Historici/-ae,"[θουκυδιδής, Ἀθηναῖος, συγγράφω, πόλεμος, Πελο...",72357
2,tlg0004.tlg001.perseus-grc1.xml,Diogenes Laertius,Lives of Eminent Philosophers,110763,tlg0004,tlg0004.tlg001,A.D. 3,2.5,{'2.5': 1},,,10245,"[[φιλοσοφία, ἔργον, ἔνειμι, φάσις, βάρβαρος, ἄ...",Biographi,"[φιλοσοφία, ἔργον, ἔνειμι, φάσις, βάρβαρος, ἄρ...",56868
3,tlg0005.tlg001.perseus-grc1.xml,Theocritus,Idylls,19200,tlg0005,tlg0005.tlg001,4-3 B.C.,-3.0,"{'-3.5': 0.5, '-2.5': 0.5}",,,1982,"[[ψιθύρισμα, πίτυς, αἰπολέ, τῆνος, παγαισί, με...",Bucolici,"[ψιθύρισμα, πίτυς, αἰπολέ, τῆνος, παγαισί, μελ...",11483
4,tlg0005.tlg002.perseus-grc1.xml,Theocritus,Epigrams,1734,tlg0005,tlg0005.tlg002,4-3 B.C.,-3.0,"{'-3.5': 0.5, '-2.5': 0.5}",,,152,"[[ῥόδον, δροσοεντά, καταπυκνός, ἐκεινά, ἕρπυλλ...",Bucolici,"[ῥόδον, δροσοεντά, καταπυκνός, ἐκεινά, ἕρπυλλο...",1053


# Data Extraction and Overview



In [7]:
### how many documents we have
len(AGT)

1457

In [8]:
### let's identify our main author of interest

c_hippocraticum = AGT[AGT["author_id"].str.startswith("tlg0627")]
len(c_hippocraticum) ### old value: 53

52

In [9]:
c_hippocraticum["wordcount"].sum() # old value: 443514, then 333446

333443

In [10]:
c_aristotelicum = AGT[AGT["author_id"].str.startswith("tlg0086")]
len(c_aristotelicum) ### originally we had 27

35

In [11]:
c_aristotelicum["wordcount"].sum() # 857024

840271

However in the case of Aristotle, we are interested only in a subselection of works associated with his name




In [12]:
# read back the manually coded data
c_aristotelicum_coded = get_as_dataframe(PIA_data.worksheet("c_aristotelicum_OVERVIEW_NEW"))
c_aristotelicum_coded.head()

Unnamed: 0,filename,include?,author,title,author_id,doc_id,wordcount,lemmata_wordcount,num_of_sents
0,tlg0086.tlg001.1st1K-grc2.xml,y,Aristotle,Aristotelis Analytica Priora et Posteriora; Ar...,tlg0086,tlg0086.tlg001,59772,12287,3384
1,tlg0086.tlg002.1st1K-grc2.xml,y,Aristotle,De anima; Aritoteles De anima,tlg0086,tlg0086.tlg002,20988,5579,1250
2,tlg0086.tlg003.perseus-grc1.xml,,Aristotle,Athenian Constitution; Machine readable text; ...,tlg0086,tlg0086.tlg003,16536,4243,817
3,tlg0086.tlg005.1st1K-grc1.xml,y,Aristotle,De caelo; Aristoteles De coelo et De generatio...,tlg0086,tlg0086.tlg005,31395,8370,1856
4,tlg0086.tlg006.1st1K-grc1.xml,y,Aristotle,Categoriae; Aristotelis Opera,tlg0086,tlg0086.tlg006,10317,2865,646


In [13]:
### we can check whether the new and old version have the same name
print(c_aristotelicum_coded["doc_id"].tolist())
print(c_aristotelicum["doc_id"].tolist())

['tlg0086.tlg001', 'tlg0086.tlg002', 'tlg0086.tlg003', 'tlg0086.tlg005', 'tlg0086.tlg006', 'tlg0086.tlg008', 'tlg0086.tlg009', 'tlg0086.tlg010', 'tlg0086.tlg013', 'tlg0086.tlg014', 'tlg0086.tlg015', 'tlg0086.tlg016', 'tlg0086.tlg017', 'tlg0086.tlg018', 'tlg0086.tlg020', 'tlg0086.tlg021', 'tlg0086.tlg022', 'tlg0086.tlg024', 'tlg0086.tlg025', 'tlg0086.tlg026', 'tlg0086.tlg029', 'tlg0086.tlg030', 'tlg0086.tlg031', 'tlg0086.tlg034', 'tlg0086.tlg035', 'tlg0086.tlg037', 'tlg0086.tlg038', 'tlg0086.tlg040', 'tlg0086.tlg041', 'tlg0086.tlg042', 'tlg0086.tlg043', 'tlg0086.tlg044', 'tlg0086.tlg045', 'tlg0086.tlg052', 'tlg0086.tlg054']
['tlg0086.tlg001', 'tlg0086.tlg002', 'tlg0086.tlg003', 'tlg0086.tlg005', 'tlg0086.tlg006', 'tlg0086.tlg008', 'tlg0086.tlg009', 'tlg0086.tlg010', 'tlg0086.tlg013', 'tlg0086.tlg014', 'tlg0086.tlg015', 'tlg0086.tlg016', 'tlg0086.tlg017', 'tlg0086.tlg018', 'tlg0086.tlg020', 'tlg0086.tlg021', 'tlg0086.tlg022', 'tlg0086.tlg024', 'tlg0086.tlg025', 'tlg0086.tlg026', 'tlg0086

In [14]:
# 1) add the "include?" column from the old overview to our new dataset 
c_aristotelicum["include?"] = c_aristotelicum_coded["include?"].tolist()
# 2) use the "include?" column for filtering the new dataset
# 3) drop "include?" column 
c_aristotelicum = c_aristotelicum[c_aristotelicum["include?"]=="y"].drop("include?", axis=1)
c_aristotelicum.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,filename,author,title,wordcount,author_id,doc_id,raw_date,date_avr,date_probs,date_manual,provenience,n_sentences,lemmatized_sentences,tlg_epithet,lemmata,lemmata_wordcount
685,tlg0086.tlg001.1st1K-grc2.xml,Aristotle,Aristotelis Analytica Priora et Posteriora,59614,tlg0086,tlg0086.tlg001,4 B.C.,-3.5,{'-3.5': 1},-3.5,pagan,3710,"[[λέγω, εἰμί, σκέψις, ἀπόδειξις, ἐπιστήμη, ἀπο...",Philosophici/-ae,"[λέγω, εἰμί, σκέψις, ἀπόδειξις, ἐπιστήμη, ἀποδ...",25547
686,tlg0086.tlg002.1st1K-grc2.xml,Aristotle,De anima,20912,tlg0086,tlg0086.tlg002,4 B.C.,-3.5,{'-3.5': 1},-3.5,pagan,1317,"[[καλός, τίμιος, εἰδησίν, ὑπολαμβάνω, ἕτερος, ...",Philosophici/-ae,"[καλός, τίμιος, εἰδησίν, ὑπολαμβάνω, ἕτερος, ἕ...",9455
688,tlg0086.tlg005.1st1K-grc1.xml,Aristotle,De caelo,30794,tlg0086,tlg0086.tlg005,4 B.C.,-3.5,{'-3.5': 1},-3.5,pagan,2276,"[[φυσεώς, ἐπιστημή, πλειστή, φαίνω, σῶμα, μεγε...",Philosophici/-ae,"[φυσεώς, ἐπιστημή, πλειστή, φαίνω, σῶμα, μεγεθ...",13662
689,tlg0086.tlg006.1st1K-grc1.xml,Aristotle,Categoriae,10316,tlg0086,tlg0086.tlg006,4 B.C.,-3.5,{'-3.5': 1},-3.5,pagan,660,"[[λέγω, ὄνομα, μόνος, κοινός, λόγος, οὐσία, ἕτ...",Philosophici/-ae,"[λέγω, ὄνομα, μόνος, κοινός, λόγος, οὐσία, ἕτε...",4775
690,tlg0086.tlg008.1st1K-grc1.xml,Aristotle,De divinatione per somnum,1194,tlg0086,tlg0086.tlg008,4 B.C.,-3.5,{'-3.5': 1},-3.5,pagan,68,"[[μαντικής, ὕπνος, γίγνομαι, λέγω, συμβαίνω, ἐ...",Philosophici/-ae,"[μαντικής, ὕπνος, γίγνομαι, λέγω, συμβαίνω, ἐν...",542


In [15]:
len(c_aristotelicum)

27

In [16]:
c_aristotelicum["wordcount"].sum()

785703

In [18]:
#set_with_dataframe(PIA_overview.add_worksheet("c_aristotelicum_OVERVIEW_NEW", rows=1, cols=1), c_aristotelicum.drop(["lemmata", "string", "lemmatized_sentences"], axis=1))

# Export the subcorpora into sciencedata

We will export the files into Vojtěch's directory on sciencedata.dk, which will be later on made public.

In [19]:
conf = sddk.configure()

sciencedata.dk username (format '123456@au.dk'): 648597@au.dk
sciencedata.dk password: ········
endpoint variable has been configured to: https://sciencedata.dk/files/


In [0]:
###publicfolder = "31b393e2afe1ee96ce81869c7efe18cb"
### sddk.write_file("https://sciencedata.dk/public" + publicfolder + "/c_hippocraticum.json", c_hippocraticum)


Type shared folder name or press Enter to skip: 
sciencedata.dk username (format '123456@au.dk'): 648597@au.dk
sciencedata.dk password: ··········
endpoint variable has been configured to: https://sciencedata.dk/files/
The path is not valid. Try different path and filename: public_data/PIA
A file with the same name ("PIA") already exists in this location.
Press Enter to overwrite it or choose different path and filename: public_data/PIA/c_hippocraticum.json
Your <class 'pandas.core.frame.DataFrame'> object has been succefully written as "https://sciencedata.dk/files/public_data/PIA/c_hippocraticum.json"


In [17]:
sddk.write_file("public_data/PIA/c_hippocraticum.json", c_hippocraticum, conf)

A file with the same name ("c_hippocraticum.json") already exists in this location.
Press Enter to overwrite it or choose different path and filename: 
Your <class 'pandas.core.frame.DataFrame'> object has been succefully written as "https://sciencedata.dk/files/public_data/PIA/c_hippocraticum.json"


In [18]:
sddk.write_file("public_data/PIA/c_aristotelicum.json", c_aristotelicum, conf)

A file with the same name ("c_aristotelicum.json") already exists in this location.
Press Enter to overwrite it or choose different path and filename: 
Your <class 'pandas.core.frame.DataFrame'> object has been succefully written as "https://sciencedata.dk/files/public_data/PIA/c_aristotelicum.json"
