In [399]:
import glob
import re
import os

from lxml import etree
from lxml.etree import XMLSyntaxError
import pandas as pd
import numpy as np


In [400]:
PATH: str = '../data/'
FILES: str = '*.xml'
NAMESPACE: dict = {'tei': 'http://www.tei-c.org/ns/1.0'}

In [401]:
def extract_metadata(file_path: str = PATH + FILES) -> dict:
    """Function for retrieving the needed metadata from the XML/TEI files."""
    person_information: dict = {}
    correspondences: dict = {}

    for file in glob.glob(file_path)[:1000]:
        try:
            print(f"Working on {os.path.basename(file)}")
            tree = etree.parse(file)
            root = tree.getroot()

            sender_ref: str = ""
            receiver_ref: str = ""
            receiver: dict = {}
            gnd_regex: str = 'http://d-nb.info/gnd/'

            for elem in root.findall('./tei:teiHeader//tei:correspAction[@type="sent"]//tei:persName',
                                     NAMESPACE
                                     ):

                # Building the reference list for all persons existing in the corpus.
                sender_ref = (re.sub(gnd_regex,
                                     '',
                                     str(elem.get('ref')),
                                     ).strip()
                              )

                if not sender_ref in person_information.keys():
                    person_information[sender_ref] = {'name': elem.text,
                                                      'key': elem.get('key')
                                                      }

                """Adding sender to correspondences. At the beginning, the correspondence list will be empty and the first sender can be added. After that, they will only be added if their GND-key is not already in the correspondence list.
                """
                if not correspondences:
                    correspondences[sender_ref] = {}
                else:
                    if sender_ref not in correspondences.keys():
                        correspondences[sender_ref] = {}
                    else:
                        continue


            for elem in root.findall('./tei:teiHeader//tei:correspAction[@type="received"]//tei:persName',
                                     NAMESPACE
                                     ):
                receiver_ref = (re.sub(gnd_regex,
                                       '',
                                       str(elem.get('ref'))
                                       ).strip()
                                )


            """Adding receivers to the list. This means that each person who sent a letter to someone will have that 'receiver' stored in a dictionary with the frequency of correspondance.
            """
            if not correspondences[sender_ref] or\
                    (correspondences[sender_ref] and receiver_ref not in correspondences[sender_ref].keys()):
                correspondences[sender_ref][receiver_ref] = 1
            elif correspondences[sender_ref] and receiver_ref in correspondences[sender_ref].keys():
                correspondences[sender_ref][receiver_ref] += 1



        except XMLSyntaxError:
            # There has been an error with the syntax in "AWS-aw-02on.xml" which I fixed manually.
            print(f"There has been a syntax error in {file}.")
            continue


    return correspondences




In [404]:
df = pd.DataFrame.from_dict(extract_metadata())
df = df.replace(np.nan, 0)
print(df.head(40))

Working on AWS-aw-05dx.xml
Working on AWS-aw-027u.xml
Working on AWS-aw-024n.xml
Working on AWS-aw-0512.xml
Working on AWS-aw-02ae.xml
Working on AWS-aw-051s.xml
Working on AWS-aw-00ya.xml
Working on AWS-aw-0506.xml
Working on AWS-aw-053l.xml
Working on AWS-aw-050w.xml
Working on AWS-aw-00xe.xml
Working on AWS-aw-02cz.xml
Working on AWS-aw-025j.xml
Working on AWS-aw-0260.xml
Working on AWS-aw-024y.xml
Working on AWS-aw-05g5.xml
Working on AWS-aw-05do.xml
Working on AWS-aw-0248.xml
Working on AWS-aw-027b.xml
Working on AWS-aw-00yv.xml
Working on AWS-aw-051d.xml
Working on AWS-aw-02bi.xml
Working on AWS-aw-02a3.xml
Working on AWS-aw-00y7.xml
Working on AWS-aw-00xr.xml
Working on AWS-aw-02cm.xml
Working on AWS-aw-00x3.xml
Working on AWS-aw-028m.xml
Working on AWS-aw-00ub.xml
Working on AWS-aw-00v8.xml
Working on AWS-aw-00vy.xml
Working on AWS-aw-00tf.xml
Working on AWS-aw-02oy.xml
Working on AWS-aw-029i.xml
Working on AWS-aw-05hl.xml
Working on AWS-aw-05kw.xml
Working on AWS-aw-028z.xml
W