In [7]:
## Bibliotheken:
from bs4 import BeautifulSoup
from lxml import etree as ET
import pandas as pd
import unicodedata
from tqdm import tqdm
import polars as pl

In [3]:
#Einlesen des Dumps in ElementTree:

xml = open("ohs_03_2024.xml", "r", encoding="utf-8")
tree = ET.parse(xml) 
root = tree.getroot() 

records = list(root)
total_records = len(records)  

#Ausgabe der Gesamtzahl der in der Datei vorhandenen Records:
print("Gesamtzahl bibliographischer Einträge: ",total_records)

Gesamtzahl bibliographischer Einträge:  325358


In [4]:
#Function to get actual data: 
def parse_record(xml):
      
    ns = {"marc":"http://www.loc.gov/MARC21/slim"}
        
    #IDN:      
    idn = xml.findall("marc:controlfield[@tag = '001']", namespaces=ns)
    try:
        idn = idn[0].text
    except:
        idn = 'fail'
        
    #008:      
    c008 = xml.findall("marc:controlfield[@tag = '008']", namespaces=ns)
    try:
        c008 = c008[0].text
    except:
        c008 = 'fail'
        
        
    #Language: 
    lang = xml.findall("marc:datafield[@tag = '041']/marc:subfield[@code = 'a']", namespaces=ns)
    try:
        lang = lang[0].text
    except:
        lang = 'fail'
        
    #Creator: 
    creator = xml.findall("marc:datafield[@tag = '100']/marc:subfield[@code = 'a']", namespaces=ns)
    creator_body = xml.findall("marc:datafield[@tag = '110']/marc:subfield[@code = 'a']", namespaces=ns)
    if creator:
        creator = creator[0].text
    elif creator_body:
        creator = creator_body[0].text
    else:
        creator = "fail"    
        
        
    #Title: 
    maintitle = xml.findall("marc:datafield[@tag = '245']/marc:subfield[@code = 'a']", namespaces=ns)
    subtitle = xml.findall("marc:datafield[@tag = '245']/marc:subfield[@code = 'b']", namespaces=ns)
    if maintitle:
        maintitle = maintitle[0].text
        if subtitle: 
            title = maintitle + " : " + subtitle[0].text
        else:
            title = maintitle
    else:
        title = "fail"    
    
    
    #Place of publication: 
    place = xml.findall("marc:datafield[@tag = '264']/marc:subfield[@code = 'a']", namespaces=ns)
    diss = xml.findall("marc:datafield[@tag = '502']/marc:subfield[@code = 'a']", namespaces=ns)
    diss2 = xml.findall("marc:datafield[@tag = '502']/marc:subfield[@code = 'c']", namespaces=ns)
    if place:
        place = place[0].text
    elif diss:
        place = diss[0].text
    elif diss2:
        place = diss2[0].text
    else:
        place = "N/A"    
        
        
    #Year of publication: 
    year = xml.findall("marc:datafield[@tag = '264']/marc:subfield[@code = 'c']", namespaces=ns)
    diss = xml.findall("marc:datafield[@tag = '502']/marc:subfield[@code = 'a']", namespaces=ns)
    c008_2 = xml.findall("marc:controlfield[@tag = '008']", namespaces=ns)
    if year:
        year = year[0].text
    elif diss:
        year = diss[0].text
    elif c008_2:
        year = c008_2[0].text
        year = year[7:11] + " - 008"
    else:
        year = "0"    
    
         
    #notes: 
    notes = xml.findall("marc:datafield[@tag = '500']/marc:subfield[@code = 'a']", namespaces=ns)
    if notes:
        notes = notes[0].text
    else:
        notes = "N/A" 
    
    #diss: 
    diss0 = xml.findall("marc:datafield[@tag = '502']/marc:subfield[@code = 'a']", namespaces=ns)
    diss1 = xml.findall("marc:datafield[@tag = '502']/marc:subfield[@code = 'b']", namespaces=ns) #Diss
    diss2 = xml.findall("marc:datafield[@tag = '502']/marc:subfield[@code = 'c']", namespaces=ns) #Uni
    diss3 = xml.findall("marc:datafield[@tag = '502']/marc:subfield[@code = 'd']", namespaces=ns) #Jahr
    if diss0:
        diss0 = diss[0].text
    else:
        diss0 = "none"
        
    if diss1:
        dissdiss = diss1[0].text
    else:
        dissdiss = "none"  
        
    if diss2:
        dissuni = diss2[0].text
    else:
        dissuni = "none"   
    
    if diss3:
        dissyear = diss3[0].text
    else:
        dissyear = "none"  
         
    
    if diss0 != "none": 
        diss = diss0
    elif (dissdiss != "none") & (dissuni != "none") & (dissyear != "none"):
        diss = dissdiss + " " + dissuni + " " + dissyear
    elif (dissdiss != "none") & (dissuni != "none"):
        diss = dissdiss + " " + dissuni
    elif dissdiss != "none":
        diss = dissdiss
    else:
        diss = "not found"   
    
    #DDC: 
    ddc3 = xml.findall("marc:datafield[@tag = '082']/marc:subfield[@code = 'a']", namespaces=ns)
    if ddc3:
        ddc3 = ddc3[0].text
    else:
        ddc3 = "N/A"   
        
        
    #DDC weitere: 
    ddc1 = xml.findall("marc:datafield[@tag = '083']/marc:subfield[@code = 'a']", namespaces=ns)
    if ddc1:
        ddc1 = ddc1[0].text
    else:
        ddc1 = "N/A" 
        
        
    #DDC?: 
    ddc2 = xml.findall("marc:datafield[@tag = '083']/marc:subfield[@code = 'q']", namespaces=ns)
    if ddc2:
        ddc2 = ddc2[0].text
    else:
        ddc2 = "N/A" 
    
 
    #ddc_prio
    if ddc1: 
        ddc_prio = ddc1
    elif ddc2:
        ddc_prio = ddc2
    elif ddc3:
        ddc_prio = ddc3
    else:
        ddc_prio = "fail"
        

    #Sachgruppe: 
    sw = xml.findall("marc:datafield[@tag = '084']/marc:subfield[@code = 'a']", namespaces=ns)
    if sw:
        sw = sw[0].text
    else:
        sw = "N/A"    
        
    
    #Zusammenführen:
    gathered = {'ID':idn, 'Creator':creator, 'Title':title, 'lang':lang, 'Place':place, "Year": year, 'DDC':ddc_prio, 'DDC1':ddc1, 'DDC2':ddc2, 
                'DDC-DE?':ddc3, 'Sachgruppe':sw, 'diss':diss, 'notes':notes, '008':c008}
    return gathered

In [17]:
#extract relevant data for app from dump: 

pbar = tqdm(total=total_records)

result = []
for item in records:
    result.append(parse_record(item))
    pbar.update()


  2%|█▍                                                                         | 6297/325358 [00:28<23:50, 223.08it/s]
100%|█████████████████████████████████████████████████████████████████████████| 325358/325358 [10:07<00:00, 503.07it/s]

In [20]:
#Convert to Dataframe and save as .h5: 

#df = pl.DataFrame(result)
#df.write_parquet("data/online_diss_03-2024.parquet")
df = pd.DataFrame(result)
df.to_hdf('data/online_diss_03-2024.h5', key='df')  

MemoryError: 

In [13]:
df

ID,Creator,Title,lang,Place,Year,DDC,DDC1,DDC2,DDC-DE?,Sachgruppe,diss,notes,008
str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""010063374""","""fail""","""Veröffentlichungen aus dem Ge…","""ger""","""Berlin""","""1912-1936""","""N/A""","""N/A""","""N/A""","""610""","""N/A""","""not found""","""Digital.: Frankfurt, M. : Univ…","""991118d19121936xx u||m| ||| 0…"
"""010081208""","""fail""","""Gewässerschutz, Wasser, Abwas…","""ger""","""Aachen""","""1968-""","""550""","""550""","""DE-600""","""620""","""N/A""","""not found""","""Ersch. teils auch als CD-ROM-A…","""991118c19689999gw z||m| ||| 0…"
"""010251928""","""fail""","""Regionale Schulgeschichte : Sc…","""ger""","""Oldenburg""","""1988-""","""370""","""370""","""DE-600""","""370""","""N/A""","""not found""","""Ersch. teils auch als Online-A…","""991118c19889999gw z||m| ||| 0…"
"""010409254""","""Universität Stuttgart""","""Berichte aus dem Institut für…","""ger""","""Stuttgart""","""1982-""","""N/A""","""N/A""","""N/A""","""621.3""","""N/A""","""not found""","""N/A""","""991118c19829999gw z||m| ||| 0…"
"""010446613""","""Verein Deutscher Ingenieure""","""Fortschrittberichte VDI""","""ger""","""Düsseldorf""","""1985-""","""620""","""620""","""DE-600""","""620""","""N/A""","""not found""","""N/A""","""991118c19859999gw z||m| ||| 0…"
…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""999997297""","""Moschkau, Peter""","""Defektabhängige Transporteige…","""ger""","""Göttingen, Univ., Diss., 2009""","""Göttingen, Univ., Diss., 2009""","""N/A""","""N/A""","""N/A""","""530""","""N/A""","""Göttingen, Univ., Diss., 2009""","""N/A""","""100204s2009 gw |||||om||| 0…"
"""999997505""","""Scheler, Esther""","""Tailoring fluorene-based oligo…","""eng""","""Bayreuth, Univ., Diss., 2009""","""Bayreuth, Univ., Diss., 2009""","""540""","""540""","""DE-101""","""547.70455""","""VN 5927""","""Bayreuth, Univ., Diss., 2009""","""N/A""","""100204s2009 gw |||||om||| 0…"
"""99999767X""","""Körzdörfer, Thomas""","""Self-interaction and charge tr…","""eng""","""Bayreuth, Univ., Diss., 2009""","""Bayreuth, Univ., Diss., 2009""","""530""","""530""","""DE-101""","""537.6223""","""UM 1200""","""Bayreuth, Univ., Diss., 2009""","""Enth. 4 Sonderabdr. aus versch…","""100204s2009 gw |||||om||| 0…"
"""999999389""","""Xystrakis, Fotios""","""The drought tolerance limit …","""eng""","""Freiburg (Breisgau), Univ., Di…","""Freiburg (Breisgau), Univ., Di…","""N/A""","""N/A""","""N/A""","""630""","""N/A""","""Freiburg (Breisgau), Univ., Di…","""Dateien in unterschiedlichen F…","""100204s2009 gw |||||om||| 0…"
