In [5]:
import glob
import pprint
import re
import os

from lxml import etree
from lxml.etree import XMLSyntaxError
import pandas as pd
import numpy as np


In [52]:
PATH: str = '../data/'
FILES: str = '*.xml'
REGISTER: str = 'register.xml'
NAMESPACE: dict = {'tei': 'http://www.tei-c.org/ns/1.0'}
GND_REGEX: str = 'http://d-nb.info/gnd/'

In [7]:
def extract_metadata(file_path: str = PATH + FILES) -> (dict, dict):
    # Function for retrieving the needed metadata from the XML/TEI files.

    # A look-up table for all relevant information about the people in the corpus.
    # person_information: dict = {}
    correspondences: dict = {}

    for file in glob.glob(file_path)[:]:
        try:
            # print(f"Working on {os.path.basename(file)}")
            tree = etree.parse(file)
            root = tree.getroot()

            sender_ref: str = ""
            receiver_ref: str = ""

            for elem in root.findall('./tei:teiHeader//tei:correspAction[@type="sent"]//tei:persName',
                                     NAMESPACE
                                     ):

                # Building the reference list for all persons existing in the corpus.
                sender_ref = (re.sub(GND_REGEX,
                                     '',
                                     str(elem.get('ref')),
                                     ).strip()
                              )

                # if not sender_ref in person_information.keys():
                #     person_information[sender_ref] = {'name': elem.text,
                #                                       'key': elem.get('key')
                #                                       }

                """Adding sender to correspondences. At the beginning, the correspondence list will be empty and the first sender can be added. After that, they will only be added if their GND-key is not already in the correspondence list.
                """
                if not correspondences:
                    correspondences[sender_ref] = {}
                else:
                    if sender_ref not in correspondences.keys():
                        correspondences[sender_ref] = {}
                    else:
                        continue


            for elem in root.findall('./tei:teiHeader//tei:correspAction[@type="received"]//tei:persName',
                                     NAMESPACE
                                     ):
                receiver_ref = (re.sub(GND_REGEX,
                                       '',
                                       str(elem.get('ref'))
                                       ).strip()
                                )


            """Adding receivers to the list. This means that each person who sent a letter to someone will have that 'receiver' stored in a dictionary with the frequency of correspondance.
            """
            try:
                if not correspondences[sender_ref] or (correspondences[sender_ref] and receiver_ref not in correspondences[sender_ref].keys()):

                    correspondences[sender_ref][receiver_ref] = 1
                elif correspondences[sender_ref] and receiver_ref in correspondences[sender_ref].keys():
                    correspondences[sender_ref][receiver_ref] += 1
            except KeyError:
                # Entries with missing gnd-entries will be ignored as they can not be properly addressed.
                continue



        except XMLSyntaxError:
            # If there is a syntax error in the file, it will be ignored in the preprocessing.
            print(f"There has been a syntax error in {file}.")
            continue

    # Checking if the result is actually a set of only unique gnd-keys.
    # pprint.pprint(person_information.keys() == set(person_information.keys()))

    return correspondences




In [36]:
correspondences_ = extract_metadata()

correspondences_df = pd.DataFrame.from_dict(correspondences_)
correspondences_df = correspondences_df.replace(np.nan, 0)

# new_directory: str = '../../data/retrieved/'
#
# if not os.path.exists(new_directory):
#     os.makedirs(new_directory)

correspondences_df.to_csv('correspondences.csv')



There has been a syntax error in ../data/AWS-aw-02on.xml.
There has been a syntax error in ../data/AWS-aw-05q8.xml.
There has been a syntax error in ../data/AWS-aw-0317.xml.
There has been a syntax error in ../data/AWS-aw-05di.xml.
There has been a syntax error in ../data/AWS-aw-02wy.xml.
There has been a syntax error in ../data/AWS-aw-02le.xml.
There has been a syntax error in ../data/AWS-aw-031g.xml.
There has been a syntax error in ../data/AWS-aw-05x5.xml.
There has been a syntax error in ../data/AWS-aw-01b6.xml.
There has been a syntax error in ../data/AWS-aw-05uo.xml.
There has been a syntax error in ../data/AWS-aw-01ml.xml.
There has been a syntax error in ../data/AWS-aw-01b5.xml.
There has been a syntax error in ../data/AWS-aw-03ln.xml.
There has been a syntax error in ../data/AWS-aw-03dw.xml.
There has been a syntax error in ../data/AWS-aw-02xm.xml.
There has been a syntax error in ../data/AWS-aw-028q.xml.
There has been a syntax error in ../data/AWS-aw-02tj.xml.
There has been

In [53]:
"""Loading data with labels into a corresponding table for later import into Gephi. For visualisation purposes, it is important to include the labels in the dataset to be able to identify the nodes in the graph.
"""

df = pd.read_csv('../data/retrieved/correspondences.csv')


_, person_information_ = extract_metadata()
for key, person in person_information_.items():
    df = df.rename(columns={key: person['name']})
    df.loc[df['Unnamed: 0'] == key, 'Unnamed: 0'] = person['name']


df.to_csv('./correspondences_with_labels.csv')




There has been a syntax error in ../data/AWS-aw-02on.xml.
There has been a syntax error in ../data/AWS-aw-05q8.xml.
There has been a syntax error in ../data/AWS-aw-0317.xml.
There has been a syntax error in ../data/AWS-aw-05di.xml.
There has been a syntax error in ../data/AWS-aw-02wy.xml.
There has been a syntax error in ../data/AWS-aw-02le.xml.
There has been a syntax error in ../data/AWS-aw-031g.xml.
There has been a syntax error in ../data/AWS-aw-05x5.xml.
There has been a syntax error in ../data/AWS-aw-01b6.xml.
There has been a syntax error in ../data/AWS-aw-05uo.xml.
There has been a syntax error in ../data/AWS-aw-01ml.xml.
There has been a syntax error in ../data/AWS-aw-01b5.xml.
There has been a syntax error in ../data/AWS-aw-03ln.xml.
There has been a syntax error in ../data/AWS-aw-03dw.xml.
There has been a syntax error in ../data/AWS-aw-02xm.xml.
There has been a syntax error in ../data/AWS-aw-028q.xml.
There has been a syntax error in ../data/AWS-aw-02tj.xml.
There has been

In [59]:
def match_register(path: str = PATH + REGISTER):
    with open(path) as file:
        tree = etree.parse(file)
        root = tree.getroot()

        for elem in root.findall('./tei:teiHeader]//tei:persName',
                                     NAMESPACE
                                     ):

                # Building the reference list for all persons existing in the corpus.
                ref = (re.sub(GND_REGEX,
                                     '',
                                     str(elem.get('ref')),
                                     ).strip()
                              )

                print(ref)

