In [73]:
import glob
import pprint
import re
import os

from lxml import etree
from lxml.etree import XMLSyntaxError
import pandas as pd
import numpy as np


In [74]:
PATH: str = '../data/'
FILES: str = '*.xml'
NAMESPACE: dict = {'tei': 'http://www.tei-c.org/ns/1.0'}

In [75]:
def extract_metadata(file_path: str = PATH + FILES) -> (dict, dict):
    """Function for retrieving the needed metadata from the XML/TEI files."""
    person_information: dict = {}
    correspondences: dict = {}

    for file in glob.glob(file_path)[:2000]:
        try:
            # print(f"Working on {os.path.basename(file)}")
            tree = etree.parse(file)
            root = tree.getroot()

            sender_ref: str = ""
            receiver_ref: str = ""
            gnd_regex: str = 'http://d-nb.info/gnd/'

            for elem in root.findall('./tei:teiHeader//tei:correspAction[@type="sent"]//tei:persName',
                                     NAMESPACE
                                     ):

                # Building the reference list for all persons existing in the corpus.
                sender_ref = (re.sub(gnd_regex,
                                     '',
                                     str(elem.get('ref')),
                                     ).strip()
                              )

                if not sender_ref in person_information.keys():
                    person_information[sender_ref] = {'name': elem.text,
                                                      'key': elem.get('key')
                                                      }

                """Adding sender to correspondences. At the beginning, the correspondence list will be empty and the first sender can be added. After that, they will only be added if their GND-key is not already in the correspondence list.
                """
                if not correspondences:
                    correspondences[sender_ref] = {}
                else:
                    if sender_ref not in correspondences.keys():
                        correspondences[sender_ref] = {}
                    else:
                        continue


            for elem in root.findall('./tei:teiHeader//tei:correspAction[@type="received"]//tei:persName',
                                     NAMESPACE
                                     ):
                receiver_ref = (re.sub(gnd_regex,
                                       '',
                                       str(elem.get('ref'))
                                       ).strip()
                                )


            """Adding receivers to the list. This means that each person who sent a letter to someone will have that 'receiver' stored in a dictionary with the frequency of correspondance.
            """
            try:
                if not correspondences[sender_ref] or (correspondences[sender_ref] and receiver_ref not in correspondences[sender_ref].keys()):

                    correspondences[sender_ref][receiver_ref] = 1
                elif correspondences[sender_ref] and receiver_ref in correspondences[sender_ref].keys():
                    correspondences[sender_ref][receiver_ref] += 1
            except KeyError:
                # Entries with missing gnd-entries will be ignored as they can not be properly addressed.
                continue



        except XMLSyntaxError:
            # If there is a syntax error in the file, it will be ignored in the preprocessing.
            print(f"There has been a syntax error in {file}.")
            continue

    # Checking if the result is actually a set of only unique gnd-keys.
    # pprint.pprint(person_information.keys() == set(person_information.keys()))
    return correspondences, person_information




In [78]:
correspondences, person_information = extract_metadata()

for gnd_key in correspondences:
    if 'Schlegel' in person_information[gnd_key].get('name'):
        print(gnd_key, person_information[gnd_key].get('name'))

There has been a syntax error in ../data/AWS-aw-02on.xml.
There has been a syntax error in ../data/AWS-aw-05q8.xml.
There has been a syntax error in ../data/AWS-aw-0317.xml.
There has been a syntax error in ../data/AWS-aw-05di.xml.
There has been a syntax error in ../data/AWS-aw-02wy.xml.
There has been a syntax error in ../data/AWS-aw-02le.xml.
There has been a syntax error in ../data/AWS-aw-031g.xml.
There has been a syntax error in ../data/AWS-aw-05x5.xml.
118607960 August Wilhelm von Schlegel
118607987 Friedrich von Schlegel
117320536 Karl August Moritz Schlegel
100799434 Johann Carl Fürchtegott Schlegel
1019578068 Johanna Christiane Erdmuthe Schlegel
1019580852 Julie Schlegel
117321435 Sophie von Schlegel
118607995 Johann Adolf Schlegel
139545530 Karl August Schlegel
1019576790 Charlotte Schlegel
104066490 Johan F. W. Schlegel
138154856 Johann August Adolph Schlegel
