<a href="https://colab.research.google.com/github/sdam-au/edh_workflow/blob/master/scripts/1_2_py_EXTRACTION_edh-xml_files.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook serves to get additional data from the EDH XML/epidoc files. The reason is that some information is missing in the API data.


In [1]:
### REQUIREMENTS
import numpy as np
import math
import pandas as pd

import sys
### we do a lot of requests during the scrapping. Some of them with requests package, some of them with urllib
import requests
from urllib.request import urlopen 
from urllib.parse import quote  
from bs4 import BeautifulSoup
import xml.etree.cElementTree as ET

import zipfile
import io

# to avoid errors, we sometime use time.sleep(N) before retrying a request
import time
# the input data have typically a json structure
import json
import getpass

import datetime as dt
# for simple paralel computing:
from concurrent.futures import ThreadPoolExecutor

In [2]:
!pip install sddk --ignore-installed ### our own package under construction, always install to have up-to-date version
import sddk

Collecting sddk
  Using cached https://files.pythonhosted.org/packages/bf/96/3ae43f2d8ac06fc16ba111916970e5a1f3b96a3e41732fa3f099e2e5cd1c/sddk-2.6-py3-none-any.whl
Collecting pandas
  Using cached https://files.pythonhosted.org/packages/c0/95/cb9820560a2713384ef49060b0087dfa2591c6db6f240215c2bce1f4211c/pandas-1.0.5-cp36-cp36m-manylinux1_x86_64.whl
Collecting numpy
  Using cached https://files.pythonhosted.org/packages/93/0b/71ae818646c1a80fbe6776d41f480649523ed31243f1f34d9d7e41d70195/numpy-1.19.0-cp36-cp36m-manylinux2010_x86_64.whl
Collecting requests
  Using cached https://files.pythonhosted.org/packages/45/1e/0c169c6a5381e241ba7404532c16a21d86ab872c9bed8bdcd4c423954103/requests-2.24.0-py2.py3-none-any.whl
Collecting matplotlib
  Using cached https://files.pythonhosted.org/packages/e3/a8/bfd8e9ddac55a4a80235f7ccc286e4a08c97e6c4f035f21a27bcab7a51c8/matplotlib-3.2.2-cp36-cp36m-manylinux1_x86_64.whl
Collecting pyarrow
  Using cached https://files.pythonhosted.org/packages/ba/3f/6cac1714f

In [3]:
### configure session and groupurl
conf = sddk.configure("SDAM_root", "648597@au.dk")

sciencedata.dk username (format '123456@au.dk'): 648597@au.dk
sciencedata.dk password: ··········
connection with shared folder established with you as its owner
endpoint variable has been configured to: https://sciencedata.dk/files/SDAM_root/


# EDH_dump.zip from sciencedata.dk

In [7]:
### based on this: https://stackoverflow.com/questions/9419162/download-returned-zip-file-from-url
%%time
response = conf[0].get(conf[1] + "SDAM_data/EDH/EDH_dump.zip")
### check the response status
print(response)

<Response [200]>
CPU times: user 576 ms, sys: 425 ms, total: 1 s
Wall time: 37.1 s


In [8]:
%%time
zipped = zipfile.ZipFile(io.BytesIO(response.content))

CPU times: user 1.35 s, sys: 129 ms, total: 1.47 s
Wall time: 1.47 s


In [9]:
### names of all files within the zipped directory
namelist = zipped.namelist()[1:] ### index "0" is for main directory
len(namelist)

157271

In [10]:
### it is too long, there are probably some mac hidden files included as well. 
### Let's remove them
namelist = [name for name in namelist if not "._" in name]
namelist = [name for name in namelist if  ".xml" in name]
len(namelist)

78634

In [12]:
### show first 10 files
namelist[:10]

['EDH_dump/HD046916.xml',
 'EDH_dump/HD078620.xml',
 'EDH_dump/HD000943.xml',
 'EDH_dump/HD011696.xml',
 'EDH_dump/HD031546.xml',
 'EDH_dump/HD077513.xml',
 'EDH_dump/HD029309.xml',
 'EDH_dump/HD001485.xml',
 'EDH_dump/HD036229.xml',
 'EDH_dump/HD068433.xml']

In [18]:
### look how it works in generating soups
list_of_soups = []
for filename in namelist[:10000]:
  soup = BeautifulSoup(zipped.read(filename))
  list_of_soups.append(soup)

In [15]:
### look at one soup
print(list_of_soups[0].prettify)

<bound method Tag.prettify of <?xml version="1.0" encoding="UTF-8"?><?xml-model href="http://www.stoa.org/epidoc/schema/latest/tei-epidoc.rng" schematypens="http://relaxng.org/ns/structure/1.0"?><html><body><tei xml:base="ex-epidoctemplate.xml" xml:lang="de" xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0">
<teiheader>
<filedesc>
<titlestmt>
<title>Grabinschrift auf Tafel</title>
</titlestmt>
<publicationstmt>
<authority>Epigraphische Datenbank Heidelberg</authority>
<idno type="URI">http://edh-www.adw.uni-heidelberg.de/edh/inschrift/HD046916</idno>
<idno type="TM">214091</idno><idno type="localID">HD046916</idno>
<availability>
<p>© Heidelberg Academy of Sciences and Humanities</p>
<licence target="http://creativecommons.org/licenses/by-sa/4.0/">This file is licensed under the Creative Commons Attribution-ShareAlike 4.0 license.
                    </licence>
</availability>
</publicationstmt>
<sourcedesc>
<msdesc>
<msidentifier>
<repository ref="www.trismegistos.org/">Grenobl

# Make soups and extract useful data from them

In [21]:
%%time
edh_xml_data = []
for filename in namelist:
  soup = BeautifulSoup(zipped.read(filename))
  try:
    idno_uri = soup.find("idno", attrs={"type" : "URI"}).get_text()
    idno_tm = soup.find("idno", attrs={"type" : "TM"}).get_text()
    placenames_refs = []
    try: 
      placenames = soup.find_all("placename")
      for placename in placenames:
        placenames_refs.append(placename["ref"])
    except: placenames_refs = []
    text_tag = soup.find("div", attrs={"type" : "edition"})
    commentary = soup.find("div", attrs={"type" : "commentary"}).get_text()
    text = " ".join(text_tag.get_text().splitlines()[1:])
    origdate_text = soup.find("origdate").get_text().replace("\n", "")
    origdate_attrs = soup.origdate.attrs
    objecttype = [soup.find("objecttype").get_text(), str(soup.find("objecttype")).partition("lod/")[2].partition("\">")[0]]
    edh_xml_data.append([idno_uri, idno_tm, placenames_refs, text, origdate_text, origdate_attrs, objecttype, commentary])
  except:
    pass

CPU times: user 8min 48s, sys: 825 ms, total: 8min 49s
Wall time: 8min 49s


In [22]:
%%time
edh_xml_data_df = pd.DataFrame(edh_xml_data, columns=["idno_uri", "idno_tm", "placenames_refs", "text", "origdate_text", "origdate_attrs","objecttype", "commentary"])
edh_xml_data_df.head(5)

CPU times: user 189 ms, sys: 6 ms, total: 195 ms
Wall time: 194 ms


In [23]:
len(edh_xml_data_df) # 78631

78631

In [27]:
### put your dataframe data into this folder
sddk.write_file("SDAM_data/EDH/edh_xml_data_2020-06-23.json", edh_xml_data_df, conf)
# original
# s.put(sciencedata_groupurl + "SDAM_data/EDH/edh_xml_data_df.json", data=edh_xml_data_df.to_json())

Your <class 'pandas.core.frame.DataFrame'> object has been succefully written as "https://sciencedata.dk/files/SDAM_root/SDAM_data/EDH/edh_xml_data_2020-06-23.json"
