This notebook serves to get additional data from the EDH XML/epidoc files. The reason is that some information is missing in the API data.


In [2]:
### REQUIREMENTS
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import zipfile
import io
import sddk
from xmltodict3 import XmlTextToDict

Now we turn to the download section of the EDH website, where we can find zip archives containing xml files with individual inscriptions. Instead of downloading them manually, we will download them directly into our Python environment.

In [3]:
# extract the download page
resp = requests.get("https://edh.ub.uni-heidelberg.de/data/download", headers={"User-Agent" : ""})
url_text = resp.text

In [4]:
# extract urls of individual zip archives for download
download_urls = re.findall("download\/edhEpidocDump_HD\w+\-\w+\.zip", url_text)
download_urls

['download/edhEpidocDump_HD000001-HD010000.zip',
 'download/edhEpidocDump_HD010001-HD020000.zip',
 'download/edhEpidocDump_HD020001-HD030000.zip',
 'download/edhEpidocDump_HD030001-HD040000.zip',
 'download/edhEpidocDump_HD040001-HD050000.zip',
 'download/edhEpidocDump_HD050001-HD060000.zip',
 'download/edhEpidocDump_HD060001-HD070000.zip',
 'download/edhEpidocDump_HD070001-HD080000.zip',
 'download/edhEpidocDump_HD080001-HD082828.zip']

In [5]:
# check how many files we have

url_base = "https://edh-www.adw.uni-heidelberg.de/"

filenames = []
for d_url in download_urls:
    url = url_base + d_url
    print(url)
    resp = requests.get(url, headers={'User-Agent': ''})
    zipped = zipfile.ZipFile(io.BytesIO(resp.content))
    ### names of all files within the zipped directory
    namelist = zipped.namelist()
    namelist = [file for file in namelist if ".xml" in file]
    filenames.extend(namelist)
len(filenames)

https://edh-www.adw.uni-heidelberg.de/download/edhEpidocDump_HD000001-HD010000.zip
https://edh-www.adw.uni-heidelberg.de/download/edhEpidocDump_HD010001-HD020000.zip
https://edh-www.adw.uni-heidelberg.de/download/edhEpidocDump_HD020001-HD030000.zip
https://edh-www.adw.uni-heidelberg.de/download/edhEpidocDump_HD030001-HD040000.zip
https://edh-www.adw.uni-heidelberg.de/download/edhEpidocDump_HD040001-HD050000.zip
https://edh-www.adw.uni-heidelberg.de/download/edhEpidocDump_HD050001-HD060000.zip
https://edh-www.adw.uni-heidelberg.de/download/edhEpidocDump_HD060001-HD070000.zip
https://edh-www.adw.uni-heidelberg.de/download/edhEpidocDump_HD070001-HD080000.zip
https://edh-www.adw.uni-heidelberg.de/download/edhEpidocDump_HD080001-HD082828.zip


81883

In [6]:
# collecting filenames from first url for testing

url_base = "https://edh-www.adw.uni-heidelberg.de/"

url = url_base + download_urls[0]
resp = requests.get(url, headers={"User-Agent" : ""})
zipped = zipfile.ZipFile(io.BytesIO(resp.content))
### names of all files within the zipped directory
namelist = zipped.namelist()

In [7]:
len(namelist)

9928

In [8]:
def get_filecontent_from_filename(filename, zipped):
    try:
        return str(zipped.read(filename))
    except:
        pass

In [9]:
edh_filecontents = {}

for filename in namelist:
    edh_filecontents[filename] = get_filecontent_from_filename(filename, zipped)
# transform it into dataframe

In [10]:
#soup = BeautifulSoup(edh_filecontents['HD000001.xml'])

9928

In [126]:
test_str = str(edh_filecontents[namelist[3]])

True

In [140]:
soup = BeautifulSoup(edh_filecontents[namelist[1000]])
# if you already have the data:
#soup = BeautifulSoup(edh_filecontents['HD000001.xml'])


In [141]:
# let's try to parse it as xml
print(soup.prettify())

b'
<?xml version="1.0" encoding="UTF-8"?>
<?xml-model href="http://www.stoa.org/epidoc/schema/latest/tei-epidoc.rng" schematypens="http://relaxng.org/ns/structure/1.0"?>
<tei xmlns="http://www.tei-c.org/ns/1.0">
 \n
 <teiheader>
  \n
  <filedesc>
   \n
   <titlestmt>
    \n
    <title>
     Owner/artist inscription from Colonia Claudia Augusta? Agrippinensium &amp;ndash; K\xc3\xb6ln (Germania inferior)
    </title>
    \n
   </titlestmt>
   \n
   <publicationstmt>
    \n
    <authority>
     Epigraphische Datenbank Heidelberg
    </authority>
    \n
    <availability>
     \n
     <p>
      Heidelberger Akademie der Wissenschaften
     </p>
     \n
     <licence target="http://creativecommons.org/licenses/by-sa/4.0/">
      This file is licensed under the Creative Commons Attribution-ShareAlike 4.0 license.\n
     </licence>
     \n
    </availability>
    \n
    <idno type="localID">
     HD081005
    </idno>
    <idno type="URI">
     https://edh-www.adw.uni-heidelberg.de/test/edh/in

In [143]:
soup.rs

<rs ref="http://www.eagle-network.eu/voc/decor/lod/1000" type="deccoration">no</rs>

In [135]:
XmlTextToDict(str(soup.find("listperson")), ignore_namespace=True).get_dict()["listperson"]["person"]

ParseError: syntax error: line 1, column 0 (<string>)

In [20]:
soup.find_all("placename")

[<placename type="province">Baetica</placename>,
 <placename ref="#" type="ancient">Ipolcobulcula</placename>,
 <placename ref="https://www.geonames.org/2510769" type="country">Spain</placename>,
 <placename ref="http://www.geonames.org/2520151" type="modern">Carcabuey</placename>,
 <placename type="findspot"></placename>,
 <placename type="region">C\xc3\xb3rdoba</placename>]

In [23]:
placenames_refs = []
for placename in soup.find_all("placename"):
    try:
        print(placename)
        placenames_refs.append({"type" : placename["type"], "text" : placename.get_text(), "ref" : placename["ref"]})
    except:
        try:
            placenames_refs.append({"type" : placename["type"], "text" : placename.get_text()})
        except:
            pass

<placename type="province">Baetica</placename>
<placename ref="#" type="ancient">Ipolcobulcula</placename>
<placename ref="https://www.geonames.org/2510769" type="country">Spain</placename>
<placename ref="http://www.geonames.org/2520151" type="modern">Carcabuey</placename>
<placename type="findspot"></placename>
<placename type="region">C\xc3\xb3rdoba</placename>


In [98]:
placenames_refs = []
for placename in soup.find_all("placename"):
    placename_dict = {}
    try:
        placename_dict["type"] = placename["type"]
        try:
            placename_dict["text"] = placename.get_text()
        except:
            pass
        try:
            placename_dict["ref"] = placename["ref"]
        except:
            pass
    except:
        pass
    placenames_refs.append(placename_dict)
placenames_refs

[{'type': 'province', 'text': 'Baetica'},
 {'type': 'ancient', 'text': 'Ipolcobulcula', 'ref': '#'},
 {'type': 'country',
  'text': 'Spain',
  'ref': 'https://www.geonames.org/2510769'},
 {'type': 'modern',
  'text': 'Carcabuey',
  'ref': 'http://www.geonames.org/2520151'},
 {'type': 'findspot', 'text': ''},
 {'type': 'region', 'text': 'C\\xc3\\xb3rdoba'}]

In [24]:
placenames_refs

[{'type': 'province', 'text': 'Baetica'},
 {'type': 'ancient', 'text': 'Ipolcobulcula', 'ref': '#'},
 {'type': 'country',
  'text': 'Spain',
  'ref': 'https://www.geonames.org/2510769'},
 {'type': 'modern',
  'text': 'Carcabuey',
  'ref': 'http://www.geonames.org/2520151'},
 {'type': 'findspot', 'text': ''},
 {'type': 'region', 'text': 'C\\xc3\\xb3rdoba'}]

In [25]:
text_tag = soup.find("div", attrs={"type" : "edition"})
text_edition = ""
for ab in text_tag.find_all("ab"):
    text_edition += str(ab.get_text())
text_edition = " ".join(" ".join(text_edition.splitlines()).split())
print(text_edition.rstrip())

\n AVSLLA Marci Porci Nigri serva dominae Veneri aram posuit\n


In [145]:
soup.support.find("material")["ref"]

'http://www.eagle-network.eu/voc/material/lod/131'

In [148]:
for rs in soup.find_all("rs"):
    print(rs)
#    ["ref"]


<rs ref="http://www.eagle-network.eu/voc/decor/lod/1000" type="deccoration">no</rs>


In [146]:
soup.support.find("rs")["ref"]

'http://www.eagle-network.eu/voc/decor/lod/1000'

In [150]:
# define function for data parsing
def get_data_from_filename(filename):
    try:
        soup = BeautifulSoup(edh_filecontents[filename])
        xml_data = {}
        idno_uri = soup.find("idno", attrs={"type" : "URI"}).get_text()
        xml_data["idno_uri"] = idno_uri.rpartition("/")[2]
        try:
            xml_data["idno_tm"] = soup.find("idno", attrs={"type" : "TM"}).get_text()
        except:
            xml_data["idno_tm"] = None
        placenames_refs = []
        for placename in soup.find_all("placename"):
            placename_dict = {}
            try:
                placename_dict["type"] = placename["type"]
                try:
                    placename_dict["text"] = placename.get_text()
                except:
                    pass
                try:
                    placename_dict["ref"] = placename["ref"]
                except:
                    pass
            except:
                pass
            placenames_refs.append(placename_dict)
        xml_data["placenames_refs"] = placenames_refs
        text_tag = soup.find("div", attrs={"type" : "edition"})
        text_edition = ""
        for ab in text_tag.find_all("ab"):
            text_edition += str(ab.get_text())
        text_edition = " ".join(" ".join(text_edition.splitlines()).split())
        xml_data["text_edition"] = text_edition
        xml_data["origdate_text"] = soup.find("origdate").get_text().replace("\n", "")
        try: 
            layout_execution = soup.layout.find("rs")["ref"]
            xml_data["layout_execution"] = layout_execution.rpartition("/")[2]
        except:
            xml_data["layout_execution"] = None
        try:
            xml_data["layout_execution_text"] = soup.layout.rs.get_text()
        except:
            xml_data["layout_execution_text"] = None
        try: 
            support_objecttype = soup.support.find("objecttype")["ref"]
            xml_data["support_objecttype"] = support_objecttype.rpartition("/")[2]
        except:
            xml_data["support_objecttype"] = None
        try:
            xml_data ["support_objecttype_text"] = soup.support.objecttype.get_text()
        except:
            xml_data ["support_objecttype_text"] = None
        try: 
            support_material = soup.support.find("material")["ref"]
            xml_data["support_material"] = support_material.rpartition("/")[2]
        except:
            xml_data["support_material"] = None
        try:
            xml_data["support_material_text"] = soup.support.material.get_text()
        except:
            xml_data["support_material_text"] = None
        try: 
            support_decoration = soup.support.find("rs")["ref"]
            xml_data["support_decoration"] = support_decoration.rpartition("/")[2]
        except:
            xml_data["support_decoration"] = None
        try: 
            keywords_term = soup.keywords.find("term")["ref"]
            xml_data["keywords_term"] = keywords_term.rpartition("/")[2]
        except:
            xml_data["keywords_term"] = None
        try:
            xml_data["keywords_term_text"] = soup.keywords.get_text().replace("\n", "")
        except:
            xml_data["keywords_term_text"] = None
        try:
            xml_data["people"] = XmlTextToDict(str(soup.find("listperson")), ignore_namespace=True).get_dict()["listperson"]["person"]
        except:
            pass
        return xml_data
    except:
        pass

In [100]:
# test with first ten files within the namelist
edh_xml_data = []
edh_filecontents = {}
for filename in namelist[:10]:
    edh_filecontents[filename] = get_filecontent_from_filename(filename, zipped)
    edh_xml_data.append(get_data_from_filename(filename))
# transform it into dataframe
pd.DataFrame(edh_xml_data)

Unnamed: 0,idno_uri,idno_tm,placenames_refs,text_edition,origdate_text,layout_execution,layout_execution_text,support_objecttype,support_objecttype_text,support_material,support_material_text,support_decoration,keywords_term,keywords_term_text,people
0,HD080001,,"[{'type': 'province', 'text': 'Germania superi...",\n E\n,1 AD - 300 AD,,,,,60,Kalkstein,1000,143.0,unknown,
1,HD080002,,"[{'type': 'province', 'text': 'Germania superi...",\n IT\n,71 AD - 250 AD,,,#,tabula?,60,Kalkstein,1000,143.0,unknown,
2,HD080003,,"[{'type': 'province', 'text': 'Germania superi...",\n ME D\n,1 AD - 300 AD,,,,,60,Kalkstein,1000,143.0,unknown,
3,HD080004,,"[{'type': 'province', 'text': 'Germania superi...",\n centuria Semproni Arrunti Aquilae\n,1 AD - 50 AD,,,42,weapon,109,Bronze,1000,,owner/artist inscription,"[{'persname': {'name': {'@type': 'nomen', '#te..."
4,HD080005,,"[{'type': 'province', 'text': 'Germania superi...",\n CA\n,1 AD - 300 AD,,,257,tabula,60,Kalkstein,1000,143.0,unknown,
5,HD080006,,"[{'type': 'province', 'text': 'Germania superi...",\n M\n,71 AD - 250 AD,,,257,tabula,60,Kalkstein,1000,143.0,unknown,
6,HD080007,,"[{'type': 'province', 'text': 'Germania superi...",\n V T \n,71 AD - 250 AD,,,257,tabula,60,Kalkstein,1000,143.0,unknown,
7,HD080008,,"[{'type': 'province', 'text': 'Germania superi...",\n V\n,71 AD - 250 AD,,,#,tabula?,60,Kalkstein,1000,143.0,unknown,
8,HD080009,,"[{'type': 'province', 'text': 'Germania superi...",\n V S\n,71 AD - 250 AD,,,257,tabula,60,Kalkstein,1000,143.0,unknown,
9,HD080010,,"[{'type': 'province', 'text': 'Germania superi...",\n R\n,71 AD - 250 AD,,,257,tabula,60,Kalkstein,1000,143.0,unknown,


In [53]:
EDH_xml_cols = pd.DataFrame(pd.DataFrame(edh_xml_data).columns, columns=["columns"])
EDH_xml_cols

Unnamed: 0,columns
0,idno_uri
1,idno_tm
2,placenames_refs
3,text_edition
4,origdate_text
5,layout_execution
6,layout_execution_text
7,support_objecttype
8,support_objecttype_text
9,support_material


# Extract xml files content from the zip files as raw strings 

In [54]:
%%time
# main loop

edh_filecontents = {}

url_base = "https://edh-www.adw.uni-heidelberg.de/"

for d_url in download_urls:
    url = url_base + d_url
    print(url)
    resp = requests.get(url, headers={'User-Agent': ''})
    zipped = zipfile.ZipFile(io.BytesIO(resp.content))
    ### names of all files within the zipped directory
    namelist = zipped.namelist()
    for filename in namelist:
        try:        
            # original: edh_xml_data.append(get_data_from_filename(filename, zipped))
            edh_filecontents[filename] = get_filecontent_from_filename(filename, zipped)
        except:
            pass
        ### index "0" is for main directory

https://edh-www.adw.uni-heidelberg.de/download/edhEpidocDump_HD000001-HD010000.zip
https://edh-www.adw.uni-heidelberg.de/download/edhEpidocDump_HD010001-HD020000.zip
https://edh-www.adw.uni-heidelberg.de/download/edhEpidocDump_HD020001-HD030000.zip
https://edh-www.adw.uni-heidelberg.de/download/edhEpidocDump_HD030001-HD040000.zip
https://edh-www.adw.uni-heidelberg.de/download/edhEpidocDump_HD040001-HD050000.zip
https://edh-www.adw.uni-heidelberg.de/download/edhEpidocDump_HD050001-HD060000.zip
https://edh-www.adw.uni-heidelberg.de/download/edhEpidocDump_HD060001-HD070000.zip
https://edh-www.adw.uni-heidelberg.de/download/edhEpidocDump_HD070001-HD080000.zip
https://edh-www.adw.uni-heidelberg.de/download/edhEpidocDump_HD080001-HD082828.zip
CPU times: user 4.2 s, sys: 458 ms, total: 4.66 s
Wall time: 11 s


In [55]:
len(edh_filecontents)

81883

In [56]:
#let's connect to sciencedata and save the raw xmls there

In [57]:
s = sddk.cloudSession("sciencedata.dk", "SDAM_root/SDAM_data/EDH", "648597@au.dk")

connection with shared folder established with you as its owner
endpoint variable has been configured to: https://sciencedata.dk/files/SDAM_root/SDAM_data/EDH/


In [58]:
s.write_file("edh_raw_xmls_2022-11-02.json", edh_filecontents)

A file with the same name ("edh_raw_xmls_2022-11-02.json") already exists in this location.
Your <class 'dict'> object has been succesfully written as "https://sciencedata.dk/files/SDAM_root/SDAM_data/EDH/edh_raw_xmls_2022-11-02.json"


In [103]:
# read it back...
edh_filecontents = s.read_file("edh_raw_xmls_2022-11-02.json", "dict")

In [104]:
len(edh_filecontents)

81883

In [105]:
# lets look at what is within raw xml strings
# how many references to http://www.eagle-network.eu/ are there?
eagle_n = 0
for filename in edh_filecontents.keys():
    eagle_n = eagle_n + str(edh_filecontents[filename]).count("http://www.eagle-network.eu/")
eagle_n # in april 183726

183726

In [106]:
edh_xml_lens = []
for filename in edh_filecontents.keys():
    edh_xml_lens.append((filename, len(edh_filecontents[filename])))

In [107]:
# perhaps there are some strenge values
sorted(edh_xml_lens, key=lambda x: x[1])

[('HD016471.xml', 3458),
 ('HD026527.xml', 3459),
 ('HD015306.xml', 3472),
 ('HD051245.xml', 3475),
 ('HD051249.xml', 3475),
 ('HD051252.xml', 3477),
 ('HD051253.xml', 3477),
 ('HD051246.xml', 3488),
 ('HD051250.xml', 3488),
 ('HD019186.xml', 3491),
 ('HD019254.xml', 3495),
 ('HD033450.xml', 3496),
 ('HD007072.xml', 3497),
 ('HD051247.xml', 3501),
 ('HD019257.xml', 3502),
 ('HD007096.xml', 3503),
 ('HD007093.xml', 3508),
 ('HD007099.xml', 3508),
 ('HD007458.xml', 3508),
 ('HD073406.xml', 3509),
 ('HD010645.xml', 3512),
 ('HD029145.xml', 3513),
 ('HD029148.xml', 3513),
 ('HD050900.xml', 3514),
 ('HD051248.xml', 3514),
 ('HD014773.xml', 3516),
 ('HD012717.xml', 3517),
 ('HD022906.xml', 3517),
 ('HD031206.xml', 3517),
 ('HD076796.xml', 3517),
 ('HD017988.xml', 3522),
 ('HD048216.xml', 3522),
 ('HD015992.xml', 3523),
 ('HD048231.xml', 3525),
 ('HD048253.xml', 3525),
 ('HD017575.xml', 3527),
 ('HD051251.xml', 3527),
 ('HD011124.xml', 3528),
 ('HD017116.xml', 3529),
 ('HD048240.xml', 3529),


In [108]:
sorted(edh_xml_lens, key=lambda x: x[1])[-10:-1]

[('HD000721.xml', 41649),
 ('HD043295.xml', 42599),
 ('HD043289.xml', 44271),
 ('HD044445.xml', 46700),
 ('HD026775.xml', 51213),
 ('HD017350.xml', 53152),
 ('HD056719.xml', 54588),
 ('HD043480.xml', 68146),
 ('HD032316.xml', 102219)]

# Parse the xml data

In [109]:
edh_xml_data = []
for filename in edh_filecontents.keys():
    edh_xml_data.append(get_data_from_filename(filename))

In [110]:
# remove empty
#edh_xml_data = [elem for elem in edh_xml_data if elem != None]
# how many we have
# last time we had 81143 and then 81156
len(edh_xml_data)

81883

In [111]:
# look et invalid
[el for el in edh_xml_data if el == None]

[None]

In [112]:
# let's take valid only:
edh_xml_data = [elem for elem in edh_xml_data if elem != None]
len(edh_xml_data)

81882

In [113]:
# make a dataframe from
edh_xml_data_df = pd.DataFrame(edh_xml_data)
edh_xml_data_df.head(5)

Unnamed: 0,idno_uri,idno_tm,placenames_refs,text_edition,origdate_text,layout_execution,layout_execution_text,support_objecttype,support_objecttype_text,support_material,support_material_text,support_decoration,keywords_term,keywords_term_text,people
0,HD000001,,"[{'type': 'province', 'text': 'Latium et Campa...",\n Dis Manibus Noniae Publi filiae Optatae et ...,71 AD - 130 AD,,,257,tabula,,"Marmor, ge\xc3\xa4dert / farbig",1000,92,epitaph,"[{'persname': {'name': [{'@type': 'nomen', '#t..."
1,HD000002,,"[{'type': 'province', 'text': 'Roma'}, {'type'...",\n Caius Sextius Paris qui vixit annis LXX\n,51 AD - 200 AD,,,257,tabula,48.0,Marmor,1000,92,epitaph,"{'persname': {'name': [{'@type': 'praenomen', ..."
2,HD000003,,"[{'type': 'province', 'text': 'Baetica'}, {'ty...",\n Publio Mummio Publi filio Galeria Sisennae ...,131 AD - 170 AD,,,57,statue base,48.0,Marmor,1000,69,honorific inscription,"{'persname': {'name': [{'@type': 'praenomen', ..."
3,HD000004,,"[{'type': 'province', 'text': 'Baetica'}, {'ty...",\n AVSLLA Marci Porci Nigri serva dominae Vene...,151 AD - 200 AD,,,29,altar,60.0,Kalkstein,1000,372,votive inscription,"[{'persname': {'name': {'@type': 'cognomen', '..."
4,HD000005,,"[{'type': 'province', 'text': 'Roma'}, {'type'...",\n libertus Successus Luci libertus Irenaeus C...,1 AD - 200 AD,,,250,stele,,,1000,92,epitaph,"[{'persname': {'name': [{'@type': 'praenomen',..."


In [114]:
len(edh_xml_data_df)

81882

In [115]:
def try_to_get_placename(listofdicts, placename_type):
    try:
        placename = [d["text"] for d in listofdicts if d["type"]==placename_type][0]
    except:
        placename = ""
    return placename

In [116]:
edh_xml_data_df["province_label"] = edh_xml_data_df["placenames_refs"].apply(lambda x: try_to_get_placename(x, "province"))

In [151]:
edh_xml_data_df["province_label"].str.contains("\?", na=False).sum()

396

In [168]:
def try_to_get_pleiades(listofdicts):
    listofdicts = [d for d in listofdicts if "ref" in d.keys()]
    try:
        pleiades_id = [d["ref"] for d in listofdicts if "pleiades" in d["ref"]][0].rpartition("/")[2]
    except:
        pleiades_id = ""
    return pleiades_id

In [170]:
try_to_get_pleiades(test_listd)

'432808'

In [171]:
edh_xml_data_df["pleiades_id"] = edh_xml_data_df["placenames_refs"].apply(try_to_get_pleiades)
edh_xml_data_df.head(5)

Unnamed: 0,idno_uri,idno_tm,placenames_refs,text_edition,origdate_text,layout_execution,layout_execution_text,support_objecttype,support_objecttype_text,support_material,support_material_text,support_decoration,keywords_term,keywords_term_text,people,province_label,pleiades_id
0,HD000001,,"[{'type': 'province', 'text': 'Latium et Campa...",\n Dis Manibus Noniae Publi filiae Optatae et ...,71 AD - 130 AD,,,257,tabula,,"Marmor, ge\xc3\xa4dert / farbig",1000,92,epitaph,"[{'persname': {'name': [{'@type': 'nomen', '#t...",Latium et Campania (Regio I),432808.0
1,HD000002,,"[{'type': 'province', 'text': 'Roma'}, {'type'...",\n Caius Sextius Paris qui vixit annis LXX\n,51 AD - 200 AD,,,257,tabula,48.0,Marmor,1000,92,epitaph,"{'persname': {'name': [{'@type': 'praenomen', ...",Roma,423025.0
2,HD000003,,"[{'type': 'province', 'text': 'Baetica'}, {'ty...",\n Publio Mummio Publi filio Galeria Sisennae ...,131 AD - 170 AD,,,57,statue base,48.0,Marmor,1000,69,honorific inscription,"{'persname': {'name': [{'@type': 'praenomen', ...",Baetica,
3,HD000004,,"[{'type': 'province', 'text': 'Baetica'}, {'ty...",\n AVSLLA Marci Porci Nigri serva dominae Vene...,151 AD - 200 AD,,,29,altar,60.0,Kalkstein,1000,372,votive inscription,"[{'persname': {'name': {'@type': 'cognomen', '...",Baetica,
4,HD000005,,"[{'type': 'province', 'text': 'Roma'}, {'type'...",\n libertus Successus Luci libertus Irenaeus C...,1 AD - 200 AD,,,250,stele,,,1000,92,epitaph,"[{'persname': {'name': [{'@type': 'praenomen',...",Roma,423025.0


# Save the data locally and to sciencedata

In [172]:
edh_xml_data_df.to_json("../data/large_data/edh_xml_data.json")

In [173]:
s.write_file("edh_xml_data_2022-11-02.json", edh_xml_data_df)

A file with the same name ("edh_xml_data_2022-11-02.json") already exists in this location.
Your <class 'pandas.core.frame.DataFrame'> object has been succesfully written as "https://sciencedata.dk/files/SDAM_root/SDAM_data/EDH/edh_xml_data_2022-11-02.json"
