## Parse from XML Data

In [2]:
import os
# Step one directory up from the current working directory and print the
# result. Presumably this makes the relative 'data/...' paths used later
# resolve from the project root -- confirm against the notebook's location.
os.chdir("..")
print(os.getcwd())

C:\Users\simon\OneDrive\Dokumente\Programming\glamhack2024-europeana


In [3]:
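# Lists-of-lists approach: each provenance text is wrapped in its own
# one-element list, and these lists are grouped per language.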

import os
import pandas as pd
import xml.etree.ElementTree as ET

def parse_xml_files(folder_path):
    """Collect ``<dcterms:provenance>`` texts from the XML files in a folder.

    Every file in ``folder_path`` whose name ends with ``.xml`` is parsed.
    Files that fail to parse, or that contain no ``<dcterms:provenance>``
    element, are reported on stdout and left out of the result.

    Args:
        folder_path: Directory containing the XML files.

    Returns:
        A dict mapping each XML file name to a dict of
        ``{language: [[text], [text], ...]}``. ``language`` is the value of
        the element's ``xml:lang`` attribute, or ``'no_lang'`` when the
        attribute is absent. Texts are stripped, de-duplicated per
        (language, text) pair, and kept in document order. Files are
        processed in sorted name order so the output is reproducible.
    """
    namespaces = {'dcterms': 'http://purl.org/dc/terms/'}
    lang_attr = '{http://www.w3.org/XML/1998/namespace}lang'

    # os.listdir() returns entries in arbitrary order; sort so the resulting
    # rows come out in the same order on every run and platform.
    xml_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.xml'))

    data = {}

    for xml_file in xml_files:
        file_path = os.path.join(folder_path, xml_file)
        try:
            root = ET.parse(file_path).getroot()
        except ET.ParseError:
            print(f"Skipping file due to parsing error: {xml_file}")
            continue

        provenances = root.findall(".//dcterms:provenance", namespaces=namespaces)
        if not provenances:
            print(f"No <dcterms:provenance> tags found in: {xml_file}")
            continue

        # Group by language while dropping duplicates. A "seen" set is used
        # instead of list(set(...)) so that first-occurrence (document) order
        # is preserved rather than depending on string hashing.
        grouped_texts = {}
        seen = set()
        for prov in provenances:
            lang = prov.attrib.get(lang_attr, 'no_lang')
            text = prov.text.strip() if prov.text else ''
            key = (lang, text)
            if key in seen:
                continue
            seen.add(key)
            # Each text is wrapped in its own list (lists-of-lists layout).
            grouped_texts.setdefault(lang, []).append([text])

        data[xml_file] = grouped_texts

    return data

def create_dataframe(data):
    """Build a DataFrame with one row per XML file and one column per language.

    Args:
        data: Mapping of ``{xml_file: {language: [[text], ...]}}`` as returned
            by ``parse_xml_files``.

    Returns:
        A ``pandas.DataFrame`` whose first column is ``'xml_file'``, followed
        by the language columns in sorted order, with ``'no_lang'`` always
        last. Cells hold the list of one-element text lists for that
        file/language, or ``[['NA']]`` when the file has no text in that
        language.
    """
    # Determine the unique languages across all files.
    all_langs = set()
    for groups in data.values():
        all_langs.update(groups)

    # 'no_lang' is handled separately so it always ends up as the last column.
    all_langs.discard('no_lang')

    # Sort the languages: iterating a set of strings directly would give a
    # column order that can differ from one run to the next.
    columns = sorted(all_langs) + ['no_lang']

    df_data = []
    for xml_file, groups in data.items():
        row = {'xml_file': xml_file}
        for lang in columns:
            # Fresh [['NA']] placeholder per cell when there is no content.
            row[lang] = groups.get(lang, [['NA']])
        df_data.append(row)

    return pd.DataFrame(df_data, columns=['xml_file'] + columns)

def main(folder_path, output_file):
    """Parse the XML files in ``folder_path`` and write the table to a CSV.

    Args:
        folder_path: Directory passed to ``parse_xml_files``.
        output_file: Destination path of the CSV file (written without the
            DataFrame index).
    """
    provenance_by_file = parse_xml_files(folder_path)
    table = create_dataframe(provenance_by_file)
    table.to_csv(output_file, index=False)
    print(f"Data saved to {output_file}")

# Run the extraction on the 'data/91609' folder and write the result to CSV.
main('data/91609', 'data/provenance_data_complete.csv')

No <dcterms:provenance> tags found in: SMVK_VKM_fotografi_111664.xml
No <dcterms:provenance> tags found in: SMVK_VKM_fotografi_111665.xml
No <dcterms:provenance> tags found in: SMVK_VKM_fotografi_111666.xml
No <dcterms:provenance> tags found in: SMVK_VKM_fotografi_111667.xml
No <dcterms:provenance> tags found in: SMVK_VKM_fotografi_111668.xml
No <dcterms:provenance> tags found in: SMVK_VKM_fotografi_111669.xml
No <dcterms:provenance> tags found in: SMVK_VKM_fotografi_111670.xml
No <dcterms:provenance> tags found in: SMVK_VKM_fotografi_111671.xml
No <dcterms:provenance> tags found in: SMVK_VKM_fotografi_111672.xml
No <dcterms:provenance> tags found in: SMVK_VKM_fotografi_111673.xml
No <dcterms:provenance> tags found in: SMVK_VKM_fotografi_111674.xml
No <dcterms:provenance> tags found in: SMVK_VKM_fotografi_111675.xml
No <dcterms:provenance> tags found in: SMVK_VKM_fotografi_111676.xml
No <dcterms:provenance> tags found in: SMVK_VKM_fotografi_111677.xml
No <dcterms:provenance> tags found