This notebook serves to get additional data from the EDH XML/epidoc files. The reason is that some information is missing in the API data.


In [0]:
### REQUIREMENTS
import numpy as np
import math
import pandas as pd

import sys
### we do a lot of requests during the scrapping. Some of them with requests package, some of them with urllib
import requests
from urllib.request import urlopen 
from urllib.parse import quote  
from bs4 import BeautifulSoup
import xml.etree.cElementTree as ET

import zipfile
import io

# to avoid errors, we sometime use time.sleep(N) before retrying a request
import time
# the input data have typically a json structure
import json
import getpass

import datetime as dt
# for simple paralel computing:
from concurrent.futures import ThreadPoolExecutor

!pip install --ignore-installed --index-url https://test.pypi.org/simple/ --no-deps sddk ### our own package under construction, always install to have up-to-date version
import sddk

Looking in indexes: https://test.pypi.org/simple/
Collecting sddk
  Downloading https://test-files.pythonhosted.org/packages/af/a4/679f4a548c03e594f7e0d408143a81541d98867e9f81d99685de3190812a/sddk-0.0.6-py3-none-any.whl
Installing collected packages: sddk
Successfully installed sddk-0.0.6


In [0]:
### configure session and groupurl
s, sciencedata_groupurl = sddk.configure_session_and_url()

sciencedata.dk username (format '123456@au.dk'): 648597@au.dk
sciencedata.dk password: ··········
sciencedata.dk group folder (ask group owner): SDAM_root
endpoint for your group (variable 'sciencedata_groupurl') has been configured to: https://sciencedata.dk/files/SDAM_root/


# EDH_dump.zip from sciencedata.dk

In [0]:
### based on this: https://stackoverflow.com/questions/9419162/download-returned-zip-file-from-url
%%time
response = s.get(sciencedata_groupurl + "SDAM_data/EDH/EDH_dump.zip")
### check the response status
print(response)

<Response [200]>
CPU times: user 555 ms, sys: 522 ms, total: 1.08 s
Wall time: 13.7 s


In [0]:
%%time
zipped = zipfile.ZipFile(io.BytesIO(response.content))

CPU times: user 1.37 s, sys: 107 ms, total: 1.48 s
Wall time: 1.48 s


In [0]:
### names of all files within the zipped directory
namelist = zipped.namelist()[1:] ### index "0" is for main directory
len(namelist)

157271

In [0]:
### it is too long, there are probably some mac hidden files included as well. 
### Let's remove them
namelist = [name for name in namelist if not "._" in name]
namelist = [name for name in namelist if  ".xml" in name]
len(namelist)

78634

In [0]:
namelist[:10]

['EDH_dump/HD046916.xml',
 'EDH_dump/HD078620.xml',
 'EDH_dump/HD000943.xml',
 'EDH_dump/HD011696.xml',
 'EDH_dump/HD031546.xml',
 'EDH_dump/HD077513.xml',
 'EDH_dump/HD029309.xml',
 'EDH_dump/HD001485.xml',
 'EDH_dump/HD036229.xml',
 'EDH_dump/HD068433.xml']

In [0]:
BeautifulSoup(zipped.read(element)

In [0]:
### look how it works in generating soups
list_of_soups = []
for filename in namelist[:1000]:
  soup = BeautifulSoup(zipped.read(filename))
  list_of_soups.append(soup)

In [0]:
### look at one soup
print(list_of_soups[0].prettify)

<bound method Tag.prettify of <?xml version="1.0" encoding="UTF-8"?><?xml-model href="http://www.stoa.org/epidoc/schema/latest/tei-epidoc.rng" schematypens="http://relaxng.org/ns/structure/1.0"?><html><body><tei xml:base="ex-epidoctemplate.xml" xml:lang="de" xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0">
<teiheader>
<filedesc>
<titlestmt>
<title>Grabinschrift auf Tafel</title>
</titlestmt>
<publicationstmt>
<authority>Epigraphische Datenbank Heidelberg</authority>
<idno type="URI">http://edh-www.adw.uni-heidelberg.de/edh/inschrift/HD046916</idno>
<idno type="TM">214091</idno><idno type="localID">HD046916</idno>
<availability>
<p>© Heidelberg Academy of Sciences and Humanities</p>
<licence target="http://creativecommons.org/licenses/by-sa/4.0/">This file is licensed under the Creative Commons Attribution-ShareAlike 4.0 license.
                    </licence>
</availability>
</publicationstmt>
<sourcedesc>
<msdesc>
<msidentifier>
<repository ref="www.trismegistos.org/">Grenobl

# Make soups and extract useful data from them

In [0]:
%%time
edh_xml_data = []
for filename in namelist:
  soup = BeautifulSoup(zipped.read(filename))
  try:
    idno_uri = soup.find("idno", attrs={"type" : "URI"}).get_text()
    idno_tm = soup.find("idno", attrs={"type" : "TM"}).get_text()
    placenames_refs = []
    try: 
      placenames = soup.find_all("placename")
      for placename in placenames:
        placenames_refs.append(placename["ref"])
    except: placenames_refs = []
    text_tag = soup.find("div", attrs={"type" : "edition"})
    commentary = soup.find("div", attrs={"type" : "commentary"}).get_text()
    text = " ".join(text_tag.get_text().splitlines()[1:])
    origdate_text = soup.find("origdate").get_text().replace("\n", "")
    objecttype = [soup.find("objecttype").get_text(), str(soup.find("objecttype")).partition("lod/")[2].partition("\">")[0]]
    edh_xml_data.append([idno_uri, idno_tm, placenames_refs, text, origdate_text, objecttype, commentary])
  except:
    pass

CPU times: user 8min 22s, sys: 113 ms, total: 8min 22s
Wall time: 8min 22s


In [0]:
%%time
edh_xml_data_df = pd.DataFrame(edh_xml_data, columns=["idno_uri", "idno_tm", "placenames_refs", "text", "origdate_text", "objecttype", "commentary"])
edh_xml_data_df.head(5)

CPU times: user 153 ms, sys: 998 µs, total: 154 ms
Wall time: 154 ms


In [0]:
len(edh_xml_data_df)

78631

In [0]:
### put your dataframe data into this folder
s.put(sciencedata_groupurl + "SDAM_data/EDH/edh_xml_data_df.json", data=edh_xml_data_df.to_json())

<Response [201]>