In [2]:
import requests
import pandas as pd
import numpy as np
import sddk
import zipfile
import io

In [3]:
# Connect to the "SDAM_root" shared folder on sciencedata.dk. The call prompts
# interactively for the credentials (see the output below).
# `conf` is indexed later in this notebook: conf[0] is the object used to issue
# `.get(...)` requests and conf[1] is the endpoint prefix that paths are appended to.
# NOTE(review): the account name is hardcoded here — consider reading it from
# the environment or leaving it to the interactive prompt.
conf = sddk.configure("SDAM_root", "648597@au.dk")

sciencedata.dk username (format '123456@au.dk'): 648597@au.dk
sciencedata.dk password: ········
connection with shared folder established with you as its owner
endpoint variable has been configured to: https://sciencedata.dk/files/SDAM_root/


In [14]:
# Load the enriched PHI dataset from the shared folder. The "df" argument
# presumably asks sddk for a pandas DataFrame (PHI is used as one below) — confirm.
PHI = sddk.read_file("SDAM_data/PHI/PHI_enriched_2020-08-27.json", "df", conf)

In [15]:
# Unfortunately, moving the dataset between Python and R leaves the cells in most columns of the dataframe holding a LIST OF VALUES (of length 1) rather than the VALUE itself.
# If this is the case, run the last two rows of this cell (the loop over the columns) to apply one simple transformation.

def lists_to_values(list_or_value):
    """Unwrap a list-wrapped cell value and map empty values to NaN.

    Parameters
    ----------
    list_or_value : object
        A cell value. If it is a list, its first element is taken as the
        value; anything else is used as-is.

    Returns
    -------
    object
        The unwrapped value, or ``np.nan`` when the value is missing:
        an empty list, ``None``, or an empty list/dict/tuple/set/string
        (possibly wrapped in a list).

    Notes
    -----
    Falsy scalars such as ``0``, ``0.0`` and ``False`` are real data and are
    returned unchanged; only empty containers and strings count as missing.
    """
    if isinstance(list_or_value, list):
        # An empty list has no first element: treat it as missing instead of
        # raising IndexError on `[0]`.
        if not list_or_value:
            return np.nan
        value = list_or_value[0]
    else:
        value = list_or_value
    if value is None:
        return np.nan
    # Check emptiness by type rather than truthiness so that 0 / False survive.
    if isinstance(value, (list, dict, tuple, set, str)) and len(value) == 0:
        return np.nan
    return value
# Unwrap every column of PHI in place, one column at a time.
# Applying the function to the column itself avoids building a full row Series
# for every row of the frame once per column, which the previous row-wise
# `PHI.apply(..., axis=1)` did just to read a single cell.
for column in PHI.columns:
    PHI[column] = PHI[column].apply(lists_to_values)

In [16]:
# Preview the first five rows of PHI after the list-unwrapping step above.
PHI.head(5)

Unnamed: 0,URL,Book,Text,hdr1,hdr2,tildeinfo,note,lines,metadata,data,filename,PHI_ID
0,/text/1?location=1701&patt=&bookid=4&offset=0&...,IG I³,1,Regions\nAttica (IG I-III),IG I³\n1,Att. — Ath.: Akr. — stoich. 35 — c. 510-500 a....,,12.0,1\n\n\n\n5\n\n\n\n\n10\n\n,ἔδοχσεν το͂ι δέμοι· τ̣[ὸς ἐ Σ]αλαμ̣[ῖνι κλερόχ...,IG-I³.csv,1
1,/text/2?location=1701&patt=&bookid=4&offset=0&...,IG I³,2,Regions\nAttica (IG I-III),IG I³\n2,Att. — non-stoich. — c. 500 a.,,14.0,1\n\n\n\n5\n\n\n\n\n10\n\n\n\n,[․․8-9․․․]ν̣ βολ — — — — — — — — — —\n[․6-7․․]...,IG-I³.csv,2
2,/text/3?location=1701&patt=&bookid=4&offset=0&...,IG I³,3,Regions\nAttica (IG I-III),IG I³\n3,Att. — stoich. 21 — 490-480 a.,,13.0,1\n\n\n\n5\n\n\n\n\n10\n\n\n,[․]αρ[․․․․]ι ℎερακλειο[․․5․․]\n[․]αρ̣ο#⁷[․] τι...,IG-I³.csv,3
3,/text/4?location=1701&patt=&bookid=4&offset=0&...,IG I³,4,Regions\nAttica (IG I-III),IG I³\n4,Att. — stoich. 38 — 485/4 a.,,56.0,face A.1\n\n\n\n5\n\n\n\n\n10\n\n\n\n\n15\n\n\...,[․․․․․․․․․․․․․․․․․․38․․․․․․․․․․․․․․․․․․]\n[․․․...,IG-I³.csv,4
4,/text/5?location=1701&patt=&bookid=4&offset=0&...,IG I³,5,Regions\nAttica (IG I-III),IG I³\n5,Att. — c. 500 a.,,6.0,1\n\n\n\n5\n,[ἔδοχσε]ν [⋮ τε͂ι βολε͂ι] ⋮ καὶ [τ]ο͂ι δέμοι ⋮...,IG-I³.csv,5


# Pythia data

In [17]:
# Path of the Pythia plaintext archive, relative to the configured endpoint (conf[1]).
url = "SDAM_data/PHI/PHI_pythia/phi-plaintext.zip"
resp = conf[0].get(conf[1] + url)
# Fail here on an HTTP error (expired session, wrong path, ...) instead of later
# with an opaque BadZipFile when the error page is parsed as a zip archive.
# Assumes `resp` is a requests.Response (it exposes `.content` below) — confirm.
resp.raise_for_status()

In [18]:
# Open the downloaded archive directly from the response bytes held in memory.
# The ZipFile is kept open because later cells read its members.
zipped = zipfile.ZipFile(io.BytesIO(resp.content))

In [19]:
# Keep only the archive members that are .txt files and whose path does not
# start with an underscore (presumably archive metadata such as "__MACOSX/..."
# entries — confirm). `endswith` is used so that names merely containing
# ".txt" somewhere in the middle are not picked up, and `startswith` is safe
# on an empty member name.
namelist = [
    member
    for member in zipped.namelist()
    if member.endswith(".txt") and not member.startswith("_")
]
namelist[:10]

['phi-plaintext/285586.txt',
 'phi-plaintext/186989.txt',
 'phi-plaintext/180320.txt',
 'phi-plaintext/284840.txt',
 'phi-plaintext/261172.txt',
 'phi-plaintext/3644.txt',
 'phi-plaintext/249826.txt',
 'phi-plaintext/152266.txt',
 'phi-plaintext/239319.txt',
 'phi-plaintext/5235.txt']

In [20]:
### To do some matching below, we have to make sure the strings use a consistent Unicode normalization form
from unicodedata import normalize
def normalize_string(string):
    """Return `string` in Unicode NFC form, or "" if it is not a string.

    Parameters
    ----------
    string : object
        Value to normalize. Anything that is not a `str` (e.g. None, NaN,
        bytes) yields the empty string.

    Returns
    -------
    str
        The NFC-normalized text, or "" for non-string input.
    """
    try:
        return normalize("NFC", string)
    except TypeError:
        # `unicodedata.normalize` raises TypeError for non-str input. Catching
        # only that keeps the best-effort fallback without swallowing
        # KeyboardInterrupt or unrelated errors the way a bare `except` did.
        return ""

In [21]:
# Build a lookup from PHI number (the member's file name without directory
# and ".txt" suffix, kept as a string) to the NFC-normalized text of that file.
phi_pythia_dict = {}
for filename in namelist:
    basename = filename.rsplit("/", 1)[-1]
    phi_number = basename.split(".txt", 1)[0]
    raw_bytes = zipped.read(filename)
    phi_string = normalize_string(raw_bytes.decode())
    phi_pythia_dict[phi_number] = phi_string

In [22]:
def get_pythia_text_by_number(phi_number):
    """Look up the Pythia text for a PHI number.

    Parameters
    ----------
    phi_number : object
        PHI number; it is converted with `str()` because the keys of
        `phi_pythia_dict` are strings.

    Returns
    -------
    str
        The stored text, or "" when there is no entry for that number.
    """
    # `dict.get` covers the "no such text" case without a bare `except`, which
    # would also hide real problems (e.g. `phi_pythia_dict` not having been
    # built yet in this kernel) by silently returning "" for every row.
    return phi_pythia_dict.get(str(phi_number), "")

In [23]:
# Add the Pythia plaintext for each inscription, looked up by the value in the
# "Text" column; rows without a matching Pythia file get an empty string.
PHI["string_pythia"] = PHI["Text"].apply(get_pythia_text_by_number)  

In [24]:
# Preview the first ten rows, now including the new "string_pythia" column.
PHI.head(10)

Unnamed: 0,URL,Book,Text,hdr1,hdr2,tildeinfo,note,lines,metadata,data,filename,PHI_ID,string_pythia
0,/text/1?location=1701&patt=&bookid=4&offset=0&...,IG I³,1,Regions\nAttica (IG I-III),IG I³\n1,Att. — Ath.: Akr. — stoich. 35 — c. 510-500 a....,,12.0,1\n\n\n\n5\n\n\n\n\n10\n\n,ἔδοχσεν το͂ι δέμοι· τ̣[ὸς ἐ Σ]αλαμ̣[ῖνι κλερόχ...,IG-I³.csv,1,ἔδοχσεν τοι δέμοι τ[ὸς ἐ σ]αλαμ[ῖνι κλερόχ]ος ...
1,/text/2?location=1701&patt=&bookid=4&offset=0&...,IG I³,2,Regions\nAttica (IG I-III),IG I³\n2,Att. — non-stoich. — c. 500 a.,,14.0,1\n\n\n\n5\n\n\n\n\n10\n\n\n\n,[․․8-9․․․]ν̣ βολ — — — — — — — — — —\n[․6-7․․]...,IG-I³.csv,2,[--------9---]ν βολ ---------- [------7--] α ἑ...
2,/text/3?location=1701&patt=&bookid=4&offset=0&...,IG I³,3,Regions\nAttica (IG I-III),IG I³\n3,Att. — stoich. 21 — 490-480 a.,,13.0,1\n\n\n\n5\n\n\n\n\n10\n\n\n,[․]αρ[․․․․]ι ℎερακλειο[․․5․․]\n[․]αρ̣ο#⁷[․] τι...,IG-I³.csv,3,[-]αρ[----]ι ἑρακλειο[-----] [-]αρο [-] τιθένα...
3,/text/4?location=1701&patt=&bookid=4&offset=0&...,IG I³,4,Regions\nAttica (IG I-III),IG I³\n4,Att. — stoich. 38 — 485/4 a.,,56.0,face A.1\n\n\n\n5\n\n\n\n\n10\n\n\n\n\n15\n\n\...,[․․․․․․․․․․․․․․․․․․38․․․․․․․․․․․․․․․․․․]\n[․․․...,IG-I³.csv,4,[--------------------------------------] [----...
4,/text/5?location=1701&patt=&bookid=4&offset=0&...,IG I³,5,Regions\nAttica (IG I-III),IG I³\n5,Att. — c. 500 a.,,6.0,1\n\n\n\n5\n,[ἔδοχσε]ν [⋮ τε͂ι βολε͂ι] ⋮ καὶ [τ]ο͂ι δέμοι ⋮...,IG-I³.csv,5,[ἔδοχσε]ν [ τει βολει] καὶ [τ]οι δέμοι ὅτε παρ...
5,/text/6?location=1701&patt=&bookid=4&offset=0&...,IG I³,6,Regions\nAttica (IG I-III),IG I³\n6,Att. — stoich. 23/11 — ante 460 a.,,160.0,face A.BM 309.1\n\n\n\n5\n\n\n\n\n10\n\n\n\n\n...,— — — — — — — — — — — — —\n[․․․․․․15․․․․․․․] δ...,IG-I³.csv,6,------------- [---------------] δραχμεισ[ι ---...
6,/text/7?location=1701&patt=&bookid=4&offset=0&...,IG I³,7,Regions\nAttica (IG I-III),IG I³\n7,Att. — stoich. 40 — 460-450,,28.0,frg. a.1\n\n\n\n5\n\n\n\n\n10\n\n\n\n13\n\n\nf...,[ἔδοχσεν τε͂]ι βο[λ]ε͂[ι καὶ το͂ι δέμοι· ․․6․․...,IG-I³.csv,7,[ἔδοχσεν τε]ι βο[λ]ε[ι καὶ τοι δέμοι ------ ἐπ...
7,/text/8?location=1701&patt=&bookid=4&offset=0&...,IG I³,8,Regions\nAttica (IG I-III),IG I³\n8,Att. — stoich. 32 — 460-450,,26.0,frg. a.1\n\n\n\n5\n\n\n\n\n10\n\n\nfrg. b.12\n...,[․․5․․]#⁷ον ℎὰ ο[․․․․․․․․․21․․․․․․․․․․]\nα περ...,IG-I³.csv,8,[-----] ον ἃ ο[---------------------] α περὶ τ...
8,/text/9?location=1701&patt=&bookid=4&offset=0&...,IG I³,9,Regions\nAttica (IG I-III),IG I³\n9,Att. — stoich. 24 — c. 458 a.,,17.0,1\n\n\n\n5\n\n\n\n\n10\n\n\n\n\n15\n\n,[ἔδοχσεν τε͂ι βο]λε̣͂ι καὶ το͂[ι δέμ]-\n[οι· ․...,IG-I³.csv,9,[ἔδοχσεν τει βο]λει καὶ το[ι δέμοι ---ντὶς ἐπρ...
9,/text/10?location=1701&patt=&bookid=4&offset=0...,IG I³,10,Regions\nAttica (IG I-III),IG I³\n10,Att. — stoich. 22 — 469-450,,28.0,1\n\n\n\n5\n\n\n\n\n10\n\n\n\n\n15\n\n\n\n\n20...,[ἔδο]ξεν τῆι βολῆι καὶ τῶι δ[ή]-\n[μωι· Ἀ]καμα...,IG-I³.csv,10,[ἔδο]ξεν τῆι βολῆι καὶ τῶι δ[ήμωι ἀ]καμαντὶς [...


In [None]:
# Write PHI (including the added "string_pythia" column) back to the shared
# folder. The output format is presumably chosen by sddk from the ".json"
# extension — confirm.
sddk.write_file("SDAM_data/PHI/PHI_with_pythia_20201211.json", PHI, conf)