This notebook extracts additional data from the EDH XML (EpiDoc) files, because some information is missing from the data returned by the EDH API.


In [1]:
### REQUIREMENTS
import numpy as np
import math
import pandas as pd

import sys
### we make a lot of requests during the scraping, some of them with the requests package, some of them with urllib
import requests
from urllib.request import urlopen 
from urllib.parse import quote  
from bs4 import BeautifulSoup
import xml.etree.cElementTree as ET
import re

import zipfile
import io

# to avoid errors, we sometimes use time.sleep(N) before retrying a request
import time
# the input data have typically a json structure
import json
import getpass

import datetime as dt
# for simple parallel computing:
from concurrent.futures import ThreadPoolExecutor

import sddk

Now we turn to the download section of the EDH website, where we can find zip archives containing xml files with individual inscriptions. Instead of downloading them manually, we will download them directly into our Python environment.

In [3]:
# extract the download page
resp = requests.get("https://edh-www.adw.uni-heidelberg.de/data/export", headers={"User-Agent" : ""})
# Stop here on an HTTP error (4xx/5xx) rather than searching an error page for links.
resp.raise_for_status()
url_text = resp.text

In [4]:
# extract urls of individual zip archives for download
# A raw string is used because "\/" is not a valid Python string escape, and the
# pattern ends at ".zip" so a match cannot run on into whatever follows on the line.
download_urls = re.findall(r"download/edhEpidocDump_HD[\w-]+\.zip", url_text)
download_urls

['download/edhEpidocDump_HD000001-HD010000.zip',
 'download/edhEpidocDump_HD010001-HD020000.zip',
 'download/edhEpidocDump_HD020001-HD030000.zip',
 'download/edhEpidocDump_HD030001-HD040000.zip',
 'download/edhEpidocDump_HD040001-HD050000.zip',
 'download/edhEpidocDump_HD050001-HD060000.zip',
 'download/edhEpidocDump_HD060001-HD070000.zip',
 'download/edhEpidocDump_HD070001-HD082046.zip']

In [5]:
url_base = "https://edh-www.adw.uni-heidelberg.de/"

# Download the first archive only, to develop and try out the parser on it.
url = url_base + download_urls[0]
resp = requests.get(url, headers={"User-Agent" : ""})
# Fail with a clear HTTP error instead of handing an error page to ZipFile.
resp.raise_for_status()
# The archive is opened straight from memory; nothing is written to disk.
zipped = zipfile.ZipFile(io.BytesIO(resp.content))
### names of all files within the zipped directory
namelist = zipped.namelist()

In [68]:
# define function for data parsing

# Errors raised when an optional element or attribute is absent: attribute
# access on None (AttributeError), subscripting None (TypeError), or a tag
# that lacks the requested attribute (KeyError).
_MISSING_ERRORS = (AttributeError, KeyError, TypeError)


def _optional(getter, default=""):
    """Return ``getter()``, or ``default`` when the element/attribute it looks up is missing."""
    try:
        return getter()
    except _MISSING_ERRORS:
        return default


def _placename_refs(soup):
    """Return the ``ref`` attribute of every ``placename`` tag, or ``[]`` if any lookup fails."""
    try:
        return [placename["ref"] for placename in soup.find_all("placename")]
    except _MISSING_ERRORS:
        # All-or-nothing, as before: a single tag without "ref" empties the list.
        return []


def get_data_from_filename(filename, zipped):
    """Parse one XML file from an archive into a flat dict of fields.

    Parameters
    ----------
    filename : str
        Name of the archive member to read.
    zipped : zipfile.ZipFile
        Opened archive; only its ``read(name)`` method is used.

    Returns
    -------
    dict or None
        A dict with the keys ``idno_uri``, ``idno_tm``, ``placenames_refs``,
        ``text_edition``, ``origdate_text``, ``origdate_attrs``,
        ``layout_execution``, ``layout_execution_text``,
        ``support_objecttype``, ``support_objecttype_text``,
        ``support_material``, ``support_material_text``,
        ``support_decoration``, ``support_decoration_text``,
        ``keywords_term`` and ``keywords_term_text``.
        Optional fields fall back to ``""`` (``[]`` for ``placenames_refs``).
        ``None`` is returned when the file cannot be read or one of the
        required elements (idno URI/TM, edition div, origdate) is missing.

    Notes
    -----
    Only ``Exception`` subclasses are turned into ``None``; ``KeyboardInterrupt``
    and ``SystemExit`` propagate, so a long-running loop can still be stopped.
    """
    try:
        # Read first, so archive errors are separated from parsing errors.
        raw_xml = zipped.read(filename)
        # No parser is named, so BeautifulSoup chooses one itself.
        # NOTE(review): the all-lowercase tag names used below suggest an HTML
        # parser is expected - confirm before pinning a specific parser.
        soup = BeautifulSoup(raw_xml)
        xml_data = {}

        # Required identifiers: a missing tag aborts the whole record.
        xml_data["idno_uri"] = soup.find("idno", attrs={"type" : "URI"}).get_text()
        xml_data["idno_tm"] = soup.find("idno", attrs={"type" : "TM"}).get_text()

        xml_data["placenames_refs"] = _placename_refs(soup)

        # Required text: the first line of the edition div's text is dropped,
        # the remaining lines are joined with single spaces.
        text_tag = soup.find("div", attrs={"type" : "edition"})
        xml_data["text_edition"] = " ".join(text_tag.get_text().splitlines()[1:])

        # Required dating information.
        xml_data["origdate_text"] = soup.find("origdate").get_text().replace("\n", "")
        xml_data["origdate_attrs"] = soup.origdate.attrs

        # Optional fields: each falls back to "" independently of the others.
        xml_data["layout_execution"] = _optional(lambda: soup.layout.rs.attrs)
        xml_data["layout_execution_text"] = _optional(lambda: soup.layout.rs.get_text())
        xml_data["support_objecttype"] = _optional(lambda: soup.support.find("objecttype")["ref"])
        xml_data["support_objecttype_text"] = _optional(lambda: soup.support.objecttype.get_text())
        xml_data["support_material"] = _optional(lambda: soup.support.find("material")["ref"])
        xml_data["support_material_text"] = _optional(lambda: soup.support.material.get_text())
        xml_data["support_decoration"] = _optional(lambda: soup.support.rs.attrs)
        xml_data["support_decoration_text"] = _optional(lambda: soup.support.rs.get_text())
        xml_data["keywords_term"] = _optional(lambda: soup.keywords.find("term")["ref"])
        xml_data["keywords_term_text"] = _optional(lambda: soup.keywords.get_text().replace("\n", ""))
        return xml_data
    except Exception:
        # Best-effort parsing: unreadable or incomplete files yield None and
        # are filtered out by the caller.
        return None

In [69]:
# test with first ten files
edh_xml_data = []

for filename in namelist[:10]:
    edh_xml_data.append(get_data_from_filename(filename, zipped))


In [71]:
# Show the trial records as a DataFrame to check the parsed fields by eye.
pd.DataFrame(edh_xml_data)

Unnamed: 0,idno_uri,idno_tm,placenames_refs,text_edition,origdate_text,origdate_attrs,layout_execution,layout_execution_text,support_objecttype,support_objecttype_text,support_material,support_material_text,support_decoration,support_decoration_text,keywords_term,keywords_term_text
0,http://edh-www.adw.uni-heidelberg.de/edh/insch...,251193,"[http://www.trismegistos.org/place/033152, htt...",Dis Manibus Noniae Publi filiae Optatae et Cai...,71 AD – 130 AD,"{'notbefore-custom': '0071', 'notafter-custom'...","{'type': 'execution', 'ref': 'http://www.eagle...",unbestimmt,http://www.eagle-network.eu/voc/objtyp/lod/257,Tafel,,"Marmor, geädert / farbig","{'type': 'decoration', 'ref': 'http://www.eagl...",nein,http://www.eagle-network.eu/voc/typeins/lod/92,Grabinschrift
1,http://edh-www.adw.uni-heidelberg.de/edh/insch...,265631,"[http://www.trismegistos.org/place/000172, htt...",Caius Sextius Paris qui vixit annis LXX ...,51 AD – 200 AD,"{'notbefore-custom': '0051', 'notafter-custom'...","{'type': 'execution', 'ref': 'http://www.eagle...",unbestimmt,http://www.eagle-network.eu/voc/objtyp/lod/257,Tafel,http://www.eagle-network.eu/voc/material/lod/48,Marmor,"{'type': 'decoration', 'ref': 'http://www.eagl...",nein,http://www.eagle-network.eu/voc/typeins/lod/92,Grabinschrift
2,http://edh-www.adw.uni-heidelberg.de/edh/insch...,220675,"[http://www.trismegistos.org/place/025443, htt...",Publio Mummio Publi filio Galeria Sisennae Rut...,131 AD – 170 AD,"{'notbefore-custom': '0131', 'notafter-custom'...","{'type': 'execution', 'ref': 'http://www.eagle...",unbestimmt,http://www.eagle-network.eu/voc/objtyp/lod/57,Statuenbasis,http://www.eagle-network.eu/voc/material/lod/48,Marmor,"{'type': 'decoration', 'ref': 'http://www.eagl...",nein,http://www.eagle-network.eu/voc/typeins/lod/69,Ehreninschrift
3,http://edh-www.adw.uni-heidelberg.de/edh/insch...,222102,"[http://www.trismegistos.org/place/025443, htt...",AVSLLA Marci Porci Nigri serva dominae Veneri ...,151 AD – 200 AD,"{'notbefore-custom': '0151', 'notafter-custom'...","{'type': 'execution', 'ref': 'http://www.eagle...",unbestimmt,http://www.eagle-network.eu/voc/objtyp/lod/29,Altar,http://www.eagle-network.eu/voc/material/lod/60,Kalkstein,"{'type': 'decoration', 'ref': 'http://www.eagl...",nein,http://www.eagle-network.eu/voc/typeins/lod/80,Weihinschrift
4,http://edh-www.adw.uni-heidelberg.de/edh/insch...,265629,"[http://www.trismegistos.org/place/000172, htt...",libertus Successus Luci libertus Irenaeus C...,1 AD – 200 AD,"{'notbefore-custom': '0001', 'notafter-custom'...","{'type': 'execution', 'ref': 'http://www.eagle...",unbestimmt,http://www.eagle-network.eu/voc/objtyp/lod/250,Stele,http://www.eagle-network.eu/voc/material/lod/138,unbestimmt,"{'type': 'decoration', 'ref': 'http://www.eagl...",nein,http://www.eagle-network.eu/voc/typeins/lod/92,Grabinschrift
5,http://edh-www.adw.uni-heidelberg.de/edh/insch...,222924,"[http://www.trismegistos.org/place/025443, htt...",Dis Manibus sacrum Memmia Auctina annorum LXX...,71 AD – 150 AD,"{'notbefore-custom': '0071', 'notafter-custom'...","{'type': 'execution', 'ref': 'http://www.eagle...",unbestimmt,http://www.eagle-network.eu/voc/objtyp/lod/250,Stele,http://www.eagle-network.eu/voc/material/lod/60,Kalkstein,"{'type': 'decoration', 'ref': 'http://www.eagl...",nein,http://www.eagle-network.eu/voc/typeins/lod/92,Grabinschrift
6,http://edh-www.adw.uni-heidelberg.de/edh/insch...,265588,"[http://www.trismegistos.org/place/000172, htt...",Clodia Marci filia,100 BC – 51 BC,"{'notbefore-custom': '-0100', 'notafter-custom...","{'type': 'execution', 'ref': 'http://www.eagle...",unbestimmt,http://www.eagle-network.eu/voc/objtyp/lod/257,Tafel,http://www.eagle-network.eu/voc/material/lod/71,Travertin,"{'type': 'decoration', 'ref': 'http://www.eagl...",nein,http://www.eagle-network.eu/voc/typeins/lod/92,Grabinschrift
7,http://edh-www.adw.uni-heidelberg.de/edh/insch...,265611,"[http://www.trismegistos.org/place/000172, htt...",Dis Manibus Caio Satrio Xantho Cai Satri Rufi ...,101 AD – 200 AD,"{'notbefore-custom': '0101', 'notafter-custom'...","{'type': 'execution', 'ref': 'http://www.eagle...",unbestimmt,http://www.eagle-network.eu/voc/objtyp/lod/257,Tafel,http://www.eagle-network.eu/voc/material/lod/48,Marmor,"{'type': 'decoration', 'ref': 'http://www.eagl...",nein,http://www.eagle-network.eu/voc/typeins/lod/92,Grabinschrift
8,http://edh-www.adw.uni-heidelberg.de/edh/insch...,168722,[],ABCDEFX,201 AD – 300 AD,"{'notbefore-custom': '0201', 'notafter-custom'...","{'type': 'execution', 'ref': 'http://www.eagle...",unbestimmt,http://www.eagle-network.eu/voc/objtyp/lod/276,Tessera,http://www.eagle-network.eu/voc/material/lod/108,"Blei, Zinn","{'type': 'decoration', 'ref': 'http://www.eagl...",nein,http://www.eagle-network.eu/voc/typeins/lod/76,Defixio
9,http://edh-www.adw.uni-heidelberg.de/edh/insch...,244297,[],Dis Manibus Luci Asini Poli Secundus et Orphae...,101 AD – 200 AD,"{'notbefore-custom': '0101', 'notafter-custom'...","{'type': 'execution', 'ref': 'http://www.eagle...",unbestimmt,http://www.eagle-network.eu/voc/objtyp/lod/78,Urne,http://www.eagle-network.eu/voc/material/lod/138,unbestimmt,"{'type': 'decoration', 'ref': 'http://www.eagl...",nein,http://www.eagle-network.eu/voc/typeins/lod/92,Grabinschrift


In [73]:
%%time

url_base = "https://edh-www.adw.uni-heidelberg.de/"
edh_xml_data = []

# Download every archive in turn and parse each XML file it contains.
for d_url in download_urls:
    url = url_base + d_url
    print(url)
    resp = requests.get(url, headers={'User-Agent': ''})
    # Fail with a clear HTTP error instead of handing an error page to ZipFile.
    resp.raise_for_status()
    zipped = zipfile.ZipFile(io.BytesIO(resp.content))
    ### names of all files within the zipped directory
    ### (index "0" is for main directory, hence the slice)
    namelist = zipped.namelist()[1:]
    for filename in namelist:
        # get_data_from_filename returns None for files it cannot parse; those
        # placeholders are kept here and filtered out in a later cell.
        edh_xml_data.append(get_data_from_filename(filename, zipped))

https://edh-www.adw.uni-heidelberg.de/download/edhEpidocDump_HD000001-HD010000.zip
https://edh-www.adw.uni-heidelberg.de/download/edhEpidocDump_HD010001-HD020000.zip
https://edh-www.adw.uni-heidelberg.de/download/edhEpidocDump_HD020001-HD030000.zip
https://edh-www.adw.uni-heidelberg.de/download/edhEpidocDump_HD030001-HD040000.zip
https://edh-www.adw.uni-heidelberg.de/download/edhEpidocDump_HD040001-HD050000.zip
https://edh-www.adw.uni-heidelberg.de/download/edhEpidocDump_HD050001-HD060000.zip
https://edh-www.adw.uni-heidelberg.de/download/edhEpidocDump_HD060001-HD070000.zip
https://edh-www.adw.uni-heidelberg.de/download/edhEpidocDump_HD070001-HD082046.zip
CPU times: user 12min 37s, sys: 5.77 s, total: 12min 43s
Wall time: 30min 24s


In [74]:
# Count the collected entries. This total still includes the None entries
# returned for files that could not be parsed; they are filtered out in the
# next cell.
# (A previous run ended up with 81143 records.)
len(edh_xml_data)

81148

In [75]:
# Drop the None placeholders left by files that could not be parsed.
# Identity comparison is the idiomatic (and __eq__-independent) test for None.
edh_xml_data_f = [el for el in edh_xml_data if el is not None]

In [76]:
# Number of records left after the None entries have been removed.
len(edh_xml_data_f)

81143

In [77]:
# Build a DataFrame from the filtered records and preview its first five rows.
edh_xml_data_df = pd.DataFrame(data=edh_xml_data_f)
edh_xml_data_df.head(n=5)

Unnamed: 0,idno_uri,idno_tm,placenames_refs,text_edition,origdate_text,origdate_attrs,layout_execution,layout_execution_text,support_objecttype,support_objecttype_text,support_material,support_material_text,support_decoration,support_decoration_text,keywords_term,keywords_term_text
0,http://edh-www.adw.uni-heidelberg.de/edh/insch...,265631,"[http://www.trismegistos.org/place/000172, htt...",Caius Sextius Paris qui vixit annis LXX ...,51 AD – 200 AD,"{'notbefore-custom': '0051', 'notafter-custom'...","{'type': 'execution', 'ref': 'http://www.eagle...",unbestimmt,http://www.eagle-network.eu/voc/objtyp/lod/257,Tafel,http://www.eagle-network.eu/voc/material/lod/48,Marmor,"{'type': 'decoration', 'ref': 'http://www.eagl...",nein,http://www.eagle-network.eu/voc/typeins/lod/92,Grabinschrift
1,http://edh-www.adw.uni-heidelberg.de/edh/insch...,220675,"[http://www.trismegistos.org/place/025443, htt...",Publio Mummio Publi filio Galeria Sisennae Rut...,131 AD – 170 AD,"{'notbefore-custom': '0131', 'notafter-custom'...","{'type': 'execution', 'ref': 'http://www.eagle...",unbestimmt,http://www.eagle-network.eu/voc/objtyp/lod/57,Statuenbasis,http://www.eagle-network.eu/voc/material/lod/48,Marmor,"{'type': 'decoration', 'ref': 'http://www.eagl...",nein,http://www.eagle-network.eu/voc/typeins/lod/69,Ehreninschrift
2,http://edh-www.adw.uni-heidelberg.de/edh/insch...,222102,"[http://www.trismegistos.org/place/025443, htt...",AVSLLA Marci Porci Nigri serva dominae Veneri ...,151 AD – 200 AD,"{'notbefore-custom': '0151', 'notafter-custom'...","{'type': 'execution', 'ref': 'http://www.eagle...",unbestimmt,http://www.eagle-network.eu/voc/objtyp/lod/29,Altar,http://www.eagle-network.eu/voc/material/lod/60,Kalkstein,"{'type': 'decoration', 'ref': 'http://www.eagl...",nein,http://www.eagle-network.eu/voc/typeins/lod/80,Weihinschrift
3,http://edh-www.adw.uni-heidelberg.de/edh/insch...,265629,"[http://www.trismegistos.org/place/000172, htt...",libertus Successus Luci libertus Irenaeus C...,1 AD – 200 AD,"{'notbefore-custom': '0001', 'notafter-custom'...","{'type': 'execution', 'ref': 'http://www.eagle...",unbestimmt,http://www.eagle-network.eu/voc/objtyp/lod/250,Stele,http://www.eagle-network.eu/voc/material/lod/138,unbestimmt,"{'type': 'decoration', 'ref': 'http://www.eagl...",nein,http://www.eagle-network.eu/voc/typeins/lod/92,Grabinschrift
4,http://edh-www.adw.uni-heidelberg.de/edh/insch...,222924,"[http://www.trismegistos.org/place/025443, htt...",Dis Manibus sacrum Memmia Auctina annorum LXX...,71 AD – 150 AD,"{'notbefore-custom': '0071', 'notafter-custom'...","{'type': 'execution', 'ref': 'http://www.eagle...",unbestimmt,http://www.eagle-network.eu/voc/objtyp/lod/250,Stele,http://www.eagle-network.eu/voc/material/lod/60,Kalkstein,"{'type': 'decoration', 'ref': 'http://www.eagl...",nein,http://www.eagle-network.eu/voc/typeins/lod/92,Grabinschrift


In [78]:
### configure session and groupurl
# The call prompts for sciencedata.dk credentials (see the output below); the
# returned object is passed to the sddk write call in the next cell.
# NOTE(review): the arguments are presumably the shared folder and the account
# that owns it - both are hardcoded here; confirm they are meant to be public.
conf = sddk.configure("SDAM_root", "648597@au.dk")

sciencedata.dk username (format '123456@au.dk'): 648560@au.dk
sciencedata.dk password: ········
connection with shared folder established with you as its ordinary user
endpoint variable has been configured to: https://sciencedata.dk/sharingout/648597%40au.dk/SDAM_root/


In [79]:
# Write the DataFrame as JSON to the location configured in `conf`.
# NOTE(review): the date in the file name is hardcoded - presumably the day of
# the run; update it when re-running so an older export is not overwritten.
sddk.write_file("SDAM_data/EDH/edh_xml_data_2020-09-17.json", edh_xml_data_df, conf)

Your <class 'pandas.core.frame.DataFrame'> object has been succefully written as "https://sciencedata.dk/sharingout/648597%40au.dk/SDAM_root/SDAM_data/EDH/edh_xml_data_2020-09-17.json"
