1. <?xml version="1.0" encoding="UTF-8"?>
2. Aliasing a library, especially one with a long name, makes it easier to call functions included in the library by giving them shorter, more readable names.


In [19]:
from pathlib import Path
try:
    from lxml import etree
    print("running with lxml.etree")
except ImportError:
    import xml.etree.ElementTree as ET
    print("running with Python's xml.etree.ElementTree")

finding_aid_path = Path('.','si676-2025-data-main/', 'data', 'xml', 'day_20221004_205435_UTC__ead.xml')
if finding_aid_path.is_file():
    tree = ET.parse(finding_aid_path)
    root = tree.getroot()
else:
    print(f"File not found: {finding_aid_path}")
    tree = None
    root = None

archDesc = tree.find('{http://ead3.archivists.org/schema/}archdesc')
for element in archDesc:
    print(type(element), element.tag, element.text)

running with Python's xml.etree.ElementTree
<class 'xml.etree.ElementTree.Element'> {http://ead3.archivists.org/schema/}did 
    
<class 'xml.etree.ElementTree.Element'> {http://ead3.archivists.org/schema/}scopecontent 
    
<class 'xml.etree.ElementTree.Element'> {http://ead3.archivists.org/schema/}bioghist 
    
<class 'xml.etree.ElementTree.Element'> {http://ead3.archivists.org/schema/}accessrestrict 
    
<class 'xml.etree.ElementTree.Element'> {http://ead3.archivists.org/schema/}userestrict 
    
<class 'xml.etree.ElementTree.Element'> {http://ead3.archivists.org/schema/}prefercite 
    
<class 'xml.etree.ElementTree.Element'> {http://ead3.archivists.org/schema/}controlaccess 
    
<class 'xml.etree.ElementTree.Element'> {http://ead3.archivists.org/schema/}dsc None


## Question 4

Admittedly, the prefixed namespaces have me stumped. From my understanding, defining a namespace provides an array of places to pull metadata attributes from (e.g., DublinCore) to make sure XML formatted metadata is consistent across resources and collections.

In [96]:

ns = {
    'ex': 'http://www.example.org/',
    'dcterms': 'http://purl.archive.org/dc/terms/',
    'rdf': 'https://www.w3.org/1999/02/22-rdf-syntax-ns#',
    'rdfs': 'https://www.w3.org/2000/01/rdf-schema#',
    'xsd': 'http://www.w3.org/2001/XMLSchema#'
}

fpath_simple_dc_record = Path('..','data','simple_dc_record.xml')

if fpath_simple_dc_record.is_file():
    print('You have already saved some DublinCore metadata!')
else:
    print('No file located')

You have already saved some DublinCore metadata!


In [94]:
metadata = etree.Element(f'{{{ns['dcterms']}}}metadata', nsmap=ns)

title = etree.SubElement(metadata, f'{{{ns['dcterms']}}}title')
title.text = "Oldsmobiles Crossing the Mackinac Bridge"

identifier = etree.SubElement(metadata, f'{{{ns['dcterms']}}}identifier')
identifier.text = "2017-03-001.007.052"

source = etree.SubElement(metadata, f'{{{ns['dcterms']}}}source')
source.text = "https://cadl.catalogaccess.com/archives/11662"

provenance = etree.SubElement(metadata, f'{{{ns['dcterms']}}}provenance')
provenance.text = "https://www.cadl.org"

provenanceStatement = etree.SubElement(metadata, f'{{{ns['dcterms']}}}provenanceStatement')
provenanceStatement.text = "Original shared by the Capital Area District Libraries (CADL)"

creators = ['Oldsmobile History Center', 'https://cadl.catalogaccess.com/people/1320']
for creator in creators:
  creator_elem = etree.SubElement(metadata, f'{{{ns['dcterms']}}}creator')
  creator_elem.text = creator

created = etree.SubElement(metadata, f'{{{ns['dcterms']}}}created')
created.text = "Unknown"

date = etree.SubElement(metadata, f'{{{ns['dcterms']}}}date')
date.text = "1960s/1970s"

subjects = ['Lansing (Mich.)', 'Ingham County (Mich.)', 'Parades & processions', 'Oldsmobile automobile', 'Mackinac Bridge', 'Bridges', 'Cars']
for subject in subjects:
  subj_elem = etree.SubElement(metadata, f'{{{ns['dcterms']}}}subject')
  subj_elem.text = subject

rights_ls = ['Copyright Not Evaluated', 'http://rightsstatements.org/vocab/CNE/1.0/']
for right in rights_ls:
  right_elem = etree.SubElement(metadata, f'{{{ns['dcterms']}}}rights')
  right_elem.text = right

In [95]:

# simple_dc_record = Path("../data/simple_dc_record.xml")
fpath_simple_dc_record = Path('.','data','simple_dc_record.xml')

# make sure parent dir exists
fpath_simple_dc_record.parent.mkdir(parents=True, exist_ok=True)

# build an ElementTree using the same module used to create `metadata`
tree = etree.ElementTree(metadata)

# write to file (use Path.open to avoid Path.write confusion)
with fpath_simple_dc_record.open('wb') as fh:
    tree.write(fh, xml_declaration=True, encoding='utf-8')

# confirm
if fpath_simple_dc_record.is_file():
    print('wrote your metadata!')

wrote your metadata!


In [101]:

import pprint


XML_metadata_object = etree.tostring(metadata, encoding='utf-8')
print(XML_metadata_object.decode(), end='')

<ns0:metadata xmlns:ns0="http://purl.archive.org/dc/terms/" nsmap="{'ex': 'http://www.example.org/', 'dcterms': 'http://purl.archive.org/dc/terms/', 'rdf': 'https://www.w3.org/1999/02/22-rdf-syntax-ns#', 'rdfs': 'https://www.w3.org/2000/01/rdf-schema#', 'xsd': 'http://www.w3.org/2001/XMLSchema#'}"><ns0:title>Oldsmobiles Crossing the Mackinac Bridge</ns0:title><ns0:identifier>2017-03-001.007.052</ns0:identifier><ns0:source>https://cadl.catalogaccess.com/archives/11662</ns0:source><ns0:provenance>https://www.cadl.org</ns0:provenance><ns0:provenanceStatement>Original shared by the Capital Area District Libraries (CADL)</ns0:provenanceStatement><ns0:creator>Oldsmobile History Center</ns0:creator><ns0:creator>https://cadl.catalogaccess.com/people/1320</ns0:creator><ns0:created>Unknown</ns0:created><ns0:date>1960s/1970s</ns0:date><ns0:subject>Lansing (Mich.)</ns0:subject><ns0:subject>Ingham County (Mich.)</ns0:subject><ns0:subject>Parades &amp; processions</ns0:subject><ns0:subject>Oldsmobil

Admittedly, I'm not sure why the dcterms portion of each xml tag is showing up as ns0. I notice that some of the attributes like creator are listed under the elements/1.1 namespace. I also wonder if some of the issues I was running into are because of the issues I was having with etree.