This notebook extracts additional data from the EDH XML/EpiDoc files, because some information is missing from the data available through the API.


In [2]:
### REQUIREMENTS
import numpy as np
import math
import pandas as pd

import sys
### we do a lot of requests during the scraping. Some of them with the requests package, some of them with urllib
import requests
from urllib.request import urlopen 
from urllib.parse import quote  
from bs4 import BeautifulSoup
import xml.etree.cElementTree as ET
import re

import zipfile
import io

# to avoid errors, we sometimes use time.sleep(N) before retrying a request
import time
# the input data have typically a json structure
import json
import getpass

import datetime as dt
# for simple parallel computing:
from concurrent.futures import ThreadPoolExecutor

import sddk

Now we turn to the download section of the EDH website, where we can find zip archives containing xml files with individual inscriptions. Instead of downloading them manually, we will download them directly into our Python environment.

In [27]:
# fetch the EDH export page, which lists the zipped EpiDoc dumps
# the request is sent with a browser-like User-Agent; the headers are defined
# here because this is the first cell that performs a request
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
resp = requests.get("https://edh-www.adw.uni-heidelberg.de/data/export", headers=headers)
# fail right here on an HTTP error instead of searching an error page for links
resp.raise_for_status()
url_text = resp.text

In [28]:
# extract the relative urls of the individual zip archives for download
# raw string: "\/" is not a valid escape sequence in a normal string literal;
# the pattern stops at ".zip" so that trailing markup on the same line is not captured
archive_pattern = r"download/edhEpidocDump_HD[\w-]+\.zip"
# dict.fromkeys removes repeated links while keeping the order found on the page
download_urls = list(dict.fromkeys(re.findall(archive_pattern, url_text)))
download_urls

['download/edhEpidocDump_HD000001-HD010000.zip',
 'download/edhEpidocDump_HD010001-HD020000.zip',
 'download/edhEpidocDump_HD020001-HD030000.zip',
 'download/edhEpidocDump_HD030001-HD040000.zip',
 'download/edhEpidocDump_HD040001-HD050000.zip',
 'download/edhEpidocDump_HD050001-HD060000.zip',
 'download/edhEpidocDump_HD060001-HD070000.zip',
 'download/edhEpidocDump_HD070001-HD082046.zip']

In [30]:
# define function for data parsing
def get_data_from_filename(filename, zipped):
    """Parse one XML file from an opened zip archive into a flat record.

    Parameters
    ----------
    filename : str
        Name of the archive member to read.
    zipped : object
        Opened archive; only ``zipped.read(filename)`` is used (a
        ``zipfile.ZipFile`` in this notebook).

    Returns
    -------
    list or None
        ``[idno_uri, idno_tm, placenames_refs, text, origdate_text,
        origdate_attrs, objecttype, commentary]``, or ``None`` when the file
        cannot be read or any of the required elements is missing.
    """
    try:
        raw_xml = zipped.read(filename)
        # NOTE(review): no parser is named, so bs4 chooses one itself; the
        # lowercase tag names below presumably rely on a parser that
        # lowercases tag names - confirm before pinning a parser.
        soup = BeautifulSoup(raw_xml)
        idno_uri = soup.find("idno", attrs={"type": "URI"}).get_text()
        idno_tm = soup.find("idno", attrs={"type": "TM"}).get_text()
        try:
            placenames_refs = [placename["ref"] for placename in soup.find_all("placename")]
        except Exception:
            # a single placename without "ref" discards all refs of this file
            placenames_refs = []
        text_tag = soup.find("div", attrs={"type": "edition"})
        commentary = soup.find("div", attrs={"type": "commentary"}).get_text()
        # the first line of the edition text is dropped, the rest is joined by spaces
        text = " ".join(text_tag.get_text().splitlines()[1:])
        origdate_tag = soup.find("origdate")
        origdate_text = origdate_tag.get_text().replace("\n", "")
        origdate_attrs = origdate_tag.attrs
        objecttype_tag = soup.find("objecttype")
        # second element: the part of the tag's markup between 'lod/' and '">'
        objecttype = [
            objecttype_tag.get_text(),
            str(objecttype_tag).partition("lod/")[2].partition("\">")[0],
        ]
        return [idno_uri, idno_tm, placenames_refs, text, origdate_text, origdate_attrs, objecttype, commentary]
    except Exception:
        # best effort: an unreadable or incomplete file yields None;
        # KeyboardInterrupt and SystemExit are deliberately not caught, so a
        # long-running extraction can still be interrupted
        return None

In [None]:
%%time

# to make a request for downloading a file, we have to use headers to somehow specify who we are-
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

url_base = "https://edh-www.adw.uni-heidelberg.de/"
edh_xml_data = []

for d_url in download_urls:
    url = url_base + d_url
    print(url)
    resp = requests.get(url, headers=headers)
    zipped = zipfile.ZipFile(io.BytesIO(resp.content))
    ### names of all files within the zipped directory
    namelist = zipped.namelist()[1:]
    for filename in namelist:
        try:
            edh_xml_data.append(get_data_from_filename(filename, zipped))
        except:
            pass
        ### index "0" is for main directory

https://edh-www.adw.uni-heidelberg.de/download/edhEpidocDump_HD000001-HD010000.zip
https://edh-www.adw.uni-heidelberg.de/download/edhEpidocDump_HD010001-HD020000.zip
https://edh-www.adw.uni-heidelberg.de/download/edhEpidocDump_HD020001-HD030000.zip
https://edh-www.adw.uni-heidelberg.de/download/edhEpidocDump_HD030001-HD040000.zip
https://edh-www.adw.uni-heidelberg.de/download/edhEpidocDump_HD040001-HD050000.zip


In [None]:
# number of entries collected in edh_xml_data
len(edh_xml_data)

In [None]:
# take a look at the first two entries to check their structure
edh_xml_data[:2]

In [None]:
# make a dataframe from the parsed records, one row per inscription
edh_xml_columns = ["idno_uri", "idno_tm", "placenames_refs", "text", "origdate_text", "origdate_attrs", "objecttype", "commentary"]
# get_data_from_filename returns None for files it could not parse; such
# entries do not have the eight fields named above, so they are left out
edh_xml_rows = [row for row in edh_xml_data if row is not None]
edh_xml_data_df = pd.DataFrame(edh_xml_rows, columns=edh_xml_columns)
edh_xml_data_df.head(5)

In [3]:
### configure session and groupurl
# the call prompts for the sciencedata.dk username and password (see the recorded output)
# NOTE(review): the second argument looks like the account owning the shared folder - confirm
conf = sddk.configure("SDAM_root", "648597@au.dk")

sciencedata.dk username (format '123456@au.dk'): 648597@au.dk
sciencedata.dk password: ··········
connection with shared folder established with you as its owner
endpoint variable has been configured to: https://sciencedata.dk/files/SDAM_root/


In [None]:
# save the dataframe as json through the configured sddk session
# NOTE(review): the date in the filename is hardcoded - adjust it when re-running
sddk.write_file("SDAM_data/EDH/edh_xml_data_2020-09-14.json", edh_xml_data_df, conf)


# Playground with lists&dicts

In [32]:
# load a previously saved export as a dataframe
# NOTE(review): this reads the 2020-06-23 file, not the 2020-09-14 file written earlier in the notebook
df = sddk.read_file("SDAM_data/EDH/edh_xml_data_2020-06-23.json", "df", conf)
df.head(5)

Unnamed: 0,idno_uri,idno_tm,placenames_refs,text,origdate_text,origdate_attrs,objecttype,commentary
0,http://edh-www.adw.uni-heidelberg.de/edh/insch...,214091,"[http://www.trismegistos.org/place/019860, htt...",In hoc tumulo meserecordia! Christi requiescet...,551 AD – 600 AD,"{'notbefore-custom': '0551', 'notafter-custom'...","[Tafel, 257]",\n\n
1,http://edh-www.adw.uni-heidelberg.de/edh/insch...,415668,"[http://www.trismegistos.org/place/016530, htt...",Alae Noricorum Iulius Quintus Vagdaevercusti v...,71 AD – 130 AD,"{'notbefore-custom': '0071', 'notafter-custom'...","[Tafel, 257]",\n\n
2,http://edh-www.adw.uni-heidelberg.de/edh/insch...,192497,"[http://www.trismegistos.org/place/029481, htt...",Imperator Caesar Lucius Septimius Severus Pius...,197 AD,"{'notbefore-custom': '0197', 'datingmethod': '...","[unbestimmt, 2]",\n Textwiedergabe nach Eck.\n
3,http://edh-www.adw.uni-heidelberg.de/edh/insch...,199108,"[http://www.trismegistos.org/place/015771, htt...",Saturno Meddensi Augusto sacru...,,{'datingmethod': 'http://en.wikipedia.org/wiki...,"[unbestimmt, 2]",\n\n
4,http://edh-www.adw.uni-heidelberg.de/edh/insch...,201034,"[http://www.trismegistos.org/place/015771, htt...",Lucio Naevio Quadratiano legato Augusti pro pr...,193 AD,"{'notbefore-custom': '0193', 'datingmethod': '...","[unbestimmt, 2]",\n Datierung: L. Naevius Quadratianus war 193 ...


In [42]:
# pull the origdate attributes column out of the dataframe as a plain list
origdate_list = df["origdate_attrs"].tolist()

In [49]:
# collect the "notbefore-custom" attribute of every entry, using "" where it is missing
new_list = [
    attrs["notbefore-custom"] if "notbefore-custom" in attrs.keys() else ""
    for attrs in origdate_list
]

In [50]:
# inspect the extracted values (this displays the entire list)
new_list

['0551',
 '0071',
 '0197',
 '',
 '0193',
 '0151',
 '0501',
 '0457',
 '',
 '0001',
 '',
 '0250',
 '0137',
 '0208',
 '0296',
 '0122',
 '0171',
 '0175',
 '',
 '',
 '',
 '0101',
 '',
 '0001',
 '',
 '0069',
 '0071',
 '',
 '0151',
 '',
 '0131',
 '0171',
 '0001',
 '-0027',
 '0151',
 '',
 '',
 '0301',
 '0151',
 '',
 '',
 '0171',
 '0001',
 '0101',
 '0001',
 '',
 '0001',
 '0101',
 '0300',
 '0107',
 '0101',
 '0151',
 '0001',
 '0101',
 '',
 '',
 '0301',
 '',
 '0001',
 '0151',
 '0305',
 '0198',
 '0101',
 '0180',
 '0001',
 '',
 '0238',
 '0201',
 '',
 '',
 '0101',
 '0201',
 '0101',
 '0101',
 '0151',
 '0042',
 '0031',
 '0001',
 '',
 '',
 '0001',
 '',
 '0101',
 '0198',
 '',
 '0001',
 '',
 '0151',
 '',
 '0171',
 '',
 '0150',
 '0101',
 '0198',
 '0051',
 '',
 '0001',
 '',
 '0151',
 '0140',
 '0151',
 '',
 '0071',
 '0151',
 '',
 '0001',
 '0171',
 '',
 '0001',
 '0131',
 '0070',
 '',
 '-0025',
 '0101',
 '0001',
 '',
 '',
 '0101',
 '',
 '0151',
 '0069',
 '',
 '0001',
 '0001',
 '0151',
 '',
 '',
 '',
 '',
 '011

In [38]:
def get_notbefore(row):
    """Return the "notbefore-custom" value of a row's "origdate_attrs", or "".

    Parameters
    ----------
    row : mapping-like
        Anything indexable by "origdate_attrs" (a dataframe row in this notebook).

    Returns
    -------
    The stored "notbefore-custom" value, or an empty string when the column,
    the key, or a subscriptable attribute container is missing.
    """
    try:
        return row["origdate_attrs"]["notbefore-custom"]
    except (LookupError, TypeError):
        # LookupError: missing column or key; TypeError: attrs is not subscriptable
        # (e.g. None). Other exceptions, including KeyboardInterrupt, propagate.
        return ""

In [40]:
# row-wise extraction of the "notbefore-custom" attribute into its own column
df["newcolumn"] = df.apply(get_notbefore, axis=1)