This notebook extracts additional data from the EDH XML/EpiDoc files, because some information is missing from the data provided by the EDH API.


In [2]:
### REQUIREMENTS
import numpy as np
import math
import pandas as pd

import sys
### we make a lot of requests during the scraping; some of them with the requests package, some of them with urllib
import requests
from urllib.request import urlopen 
from urllib.parse import quote  
from bs4 import BeautifulSoup
import xml.etree.cElementTree as ET
import re

import zipfile
import io

# to avoid errors, we sometimes use time.sleep(N) before retrying a request
import time
# the input data have typically a json structure
import json
import getpass

import datetime as dt
# for simple parallel computing:
from concurrent.futures import ThreadPoolExecutor

import sddk

Now we turn to the download section of the EDH website, where we can find zip archives containing xml files with individual inscriptions. Instead of downloading them manually, we will download them directly into our Python environment.

In [27]:
# fetch the EDH export page, which lists the downloadable EpiDoc zip archives
resp = requests.get("https://edh-www.adw.uni-heidelberg.de/data/export")
# stop here on an HTTP error instead of searching an error page for links
resp.raise_for_status()
url_text = resp.text

In [28]:
# extract relative urls of the individual zip archives for download
# (raw string: "/" needs no escaping; the pattern ends at ".zip" so that
# nothing following the link on the same line is captured)
download_urls = re.findall(r"download/edhEpidocDump_HD\d+-HD\d+\.zip", url_text)
download_urls

['download/edhEpidocDump_HD000001-HD010000.zip',
 'download/edhEpidocDump_HD010001-HD020000.zip',
 'download/edhEpidocDump_HD020001-HD030000.zip',
 'download/edhEpidocDump_HD030001-HD040000.zip',
 'download/edhEpidocDump_HD040001-HD050000.zip',
 'download/edhEpidocDump_HD050001-HD060000.zip',
 'download/edhEpidocDump_HD060001-HD070000.zip',
 'download/edhEpidocDump_HD070001-HD082046.zip']

In [30]:
# define function for data parsing
def get_data_from_filename(filename, zipped):
    """Parse one XML file from an open zip archive into a flat record.

    Parameters
    ----------
    filename : str
        Name of the archive member to read.
    zipped : zipfile.ZipFile
        Open archive containing ``filename`` (only its ``read(name)``
        method is used).

    Returns
    -------
    list or None
        ``[idno_uri, idno_tm, placenames_refs, text, origdate_text,
        origdate_attrs, objecttype, commentary]``, or ``None`` when the
        member cannot be read or a required element is missing.
        ``placenames_refs`` is the list of ``ref`` attributes of all
        ``placename`` tags, or ``[]`` if collecting them fails.
        ``objecttype`` is ``[tag text, part of the tag markup between
        "lod/" and the closing quote of that attribute]``.

    Notes
    -----
    Only ``Exception`` is swallowed; ``KeyboardInterrupt`` and
    ``SystemExit`` propagate so a long-running loop can be stopped.
    """
    try:
        raw_xml = zipped.read(filename)
        # NOTE(review): no parser is named, so bs4 picks the best one installed;
        # consider pinning it so results do not depend on the environment.
        soup = BeautifulSoup(raw_xml)

        idno_uri = soup.find("idno", attrs={"type": "URI"}).get_text()
        idno_tm = soup.find("idno", attrs={"type": "TM"}).get_text()

        # All-or-nothing on purpose: if any placename has no "ref", the whole
        # list is emptied so that list positions never shift between records.
        try:
            placenames_refs = [placename["ref"] for placename in soup.find_all("placename")]
        except Exception:
            placenames_refs = []

        text_tag = soup.find("div", attrs={"type": "edition"})
        commentary = soup.find("div", attrs={"type": "commentary"}).get_text()
        # drop the first line of the edition text and join the rest with spaces
        text = " ".join(text_tag.get_text().splitlines()[1:])

        origdate_tag = soup.find("origdate")
        origdate_text = origdate_tag.get_text().replace("\n", "")
        origdate_attrs = origdate_tag.attrs

        objecttype_tag = soup.find("objecttype")
        objecttype = [
            objecttype_tag.get_text(),
            str(objecttype_tag).partition("lod/")[2].partition("\">")[0],
        ]

        return [idno_uri, idno_tm, placenames_refs, text, origdate_text,
                origdate_attrs, objecttype, commentary]
    except Exception:
        # best effort: unreadable or incomplete files yield no record
        return None

In [32]:
%%time

# to make a request for downloading a file, we have to send headers that specify who we are
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

url_base = "https://edh-www.adw.uni-heidelberg.de/"
edh_xml_data = []

for d_url in download_urls:
    url = url_base + d_url
    print(url)
    resp = requests.get(url, headers=headers)
    # fail with a clear HTTP error rather than a BadZipFile on an error page
    resp.raise_for_status()
    # the archive is held in memory only; the context manager closes it after parsing
    with zipfile.ZipFile(io.BytesIO(resp.content)) as zipped:
        ### names of all files within the zipped directory
        ### (directory entries end with "/" and are skipped explicitly instead
        ### of assuming that index 0 is the main directory)
        namelist = [name for name in zipped.namelist() if not name.endswith("/")]
        for filename in namelist:
            # get_data_from_filename returns None for files it cannot parse,
            # so no extra exception handling is needed here
            edh_xml_data.append(get_data_from_filename(filename, zipped))

https://edh-www.adw.uni-heidelberg.de/download/edhEpidocDump_HD000001-HD010000.zip
https://edh-www.adw.uni-heidelberg.de/download/edhEpidocDump_HD010001-HD020000.zip
https://edh-www.adw.uni-heidelberg.de/download/edhEpidocDump_HD020001-HD030000.zip
https://edh-www.adw.uni-heidelberg.de/download/edhEpidocDump_HD030001-HD040000.zip
https://edh-www.adw.uni-heidelberg.de/download/edhEpidocDump_HD040001-HD050000.zip
https://edh-www.adw.uni-heidelberg.de/download/edhEpidocDump_HD050001-HD060000.zip
https://edh-www.adw.uni-heidelberg.de/download/edhEpidocDump_HD060001-HD070000.zip
https://edh-www.adw.uni-heidelberg.de/download/edhEpidocDump_HD070001-HD082046.zip
CPU times: user 11min 22s, sys: 4.89 s, total: 11min 27s
Wall time: 28min 47s


In [40]:
# remove the None placeholders left by files that could not be parsed
edh_xml_data = [elem for elem in edh_xml_data if elem is not None]
# how many records we have
len(edh_xml_data)

81143

In [41]:
# take a look at the first two entries
# (each entry is the 8-item list returned by get_data_from_filename)
edh_xml_data[:2]

[['http://edh-www.adw.uni-heidelberg.de/edh/inschrift/HD000002',
  '265631',
  ['http://www.trismegistos.org/place/000172',
   'http://www.trismegistos.org/place/000172',
   'http://www.geonames.org/',
   'http://www.geonames.org/',
   'http://www.geonames.org/countries/it/'],
  'Caius Sextius Paris qui vixit annis LXX             ',
  '51 AD – 200 AD                            ',
  {'notbefore-custom': '0051',
   'notafter-custom': '0200',
   'datingmethod': 'http://en.wikipedia.org/wiki/Julian_calendar'},
  ['Tafel', '257'],
  '\n AE 1983: Breite: 35 cm.\n'],
 ['http://edh-www.adw.uni-heidelberg.de/edh/inschrift/HD000003',
  '220675',
  ['http://www.trismegistos.org/place/025443',
   'http://www.trismegistos.org/place/002036',
   'http://www.geonames.org/2510394',
   'http://www.geonames.org/',
   'http://www.geonames.org/countries/es/'],
  'Publio Mummio Publi filio Galeria Sisennae Rutiliano Xviro stlitibus iudicandis  ',
  '131 AD – 170 AD                            ',
  {'notbefo

In [42]:
# build a dataframe from the parsed records, one named column per record field
edh_xml_data_df = pd.DataFrame(
    data=edh_xml_data,
    columns=[
        "idno_uri",
        "idno_tm",
        "placenames_refs",
        "text",
        "origdate_text",
        "origdate_attrs",
        "objecttype",
        "commentary",
    ],
)
edh_xml_data_df.head(n=5)

Unnamed: 0,idno_uri,idno_tm,placenames_refs,text,origdate_text,origdate_attrs,objecttype,commentary
0,http://edh-www.adw.uni-heidelberg.de/edh/insch...,265631,"[http://www.trismegistos.org/place/000172, htt...",Caius Sextius Paris qui vixit annis LXX ...,51 AD – 200 AD,"{'notbefore-custom': '0051', 'notafter-custom'...","[Tafel, 257]",\n AE 1983: Breite: 35 cm.\n
1,http://edh-www.adw.uni-heidelberg.de/edh/insch...,220675,"[http://www.trismegistos.org/place/025443, htt...",Publio Mummio Publi filio Galeria Sisennae Rut...,131 AD – 170 AD,"{'notbefore-custom': '0131', 'notafter-custom'...","[Statuenbasis, 57]",\n (B): [S]isenna ist falscher Kasus; folgende...
2,http://edh-www.adw.uni-heidelberg.de/edh/insch...,222102,"[http://www.trismegistos.org/place/025443, htt...",AVSLLA Marci Porci Nigri serva dominae Veneri ...,151 AD – 200 AD,"{'notbefore-custom': '0151', 'notafter-custom'...","[Altar, 29]",\n Material: lokaler grauer Kalkstein. (B): St...
3,http://edh-www.adw.uni-heidelberg.de/edh/insch...,265629,"[http://www.trismegistos.org/place/000172, htt...",libertus Successus Luci libertus Irenaeus C...,1 AD – 200 AD,"{'notbefore-custom': '0001', 'notafter-custom'...","[Stele, 250]",\n (B): Z. 3: C(ai) l(ibertae) Tyches.\n
4,http://edh-www.adw.uni-heidelberg.de/edh/insch...,222924,"[http://www.trismegistos.org/place/025443, htt...",Dis Manibus sacrum Memmia Auctina annorum LXX...,71 AD – 150 AD,"{'notbefore-custom': '0071', 'notafter-custom'...","[Stele, 250]",\n Der Stein ist aus 2 aneinanderpassenden Fra...


In [43]:
### configure session and groupurl
# The call prompts for the sciencedata.dk username and password (see the cell output).
# NOTE(review): the second argument is an account id committed in the notebook;
# presumably it names the owner of the shared folder "SDAM_root" — confirm, and
# consider reading it from configuration instead of hard-coding it.
conf = sddk.configure("SDAM_root", "648597@au.dk")

sciencedata.dk username (format '123456@au.dk'): 648597@au.dk
sciencedata.dk password: ········
connection with shared folder established with you as its owner
endpoint variable has been configured to: https://sciencedata.dk/files/SDAM_root/


In [44]:
# Store the dataframe as a JSON file at the endpoint held in `conf`.
# NOTE(review): the date in the file name is hard-coded (2020-09-14); a re-run
# targets this same path — confirm whether that is intended.
sddk.write_file("SDAM_data/EDH/edh_xml_data_2020-09-14.json", edh_xml_data_df, conf)

Your <class 'pandas.core.frame.DataFrame'> object has been succefully written as "https://sciencedata.dk/files/SDAM_root/SDAM_data/EDH/edh_xml_data_2020-09-14.json"
