In [1]:
import requests
import pandas as pd
import lxml.etree as ET
from bs4 import BeautifulSoup as soup
import unicodedata
import tqdm
from time import sleep
import time

## Laden der Daten aus Datei: 

In [2]:
# Load the MARC21-XML title data from disk, keeping it as a list of lines.
with open("pandemie_titeldaten.xml", mode="r", encoding="utf-8") as f2:
    result = list(f2)

Parsen des XML aus Datei:

In [3]:
# Glue the lines back into one string and hand it to BeautifulSoup
# (aliased as `soup`) using its "lxml" parser backend.
content = "".join(result)
xml = soup(content, features="lxml")

Heraussuchen aller Datensätze des Typs "Bibliographic" aus dem XML:

In [4]:
# Collect every <record type="Bibliographic"> element and report how many there are.
myquery = xml.find_all("record", attrs={"type": "Bibliographic"})
print(len(myquery))

11368


In [5]:
# Pretty-print the second record (index 1) to inspect the MARC21 field layout.
print(myquery[1].prettify())


<record type="Bibliographic" xmlns="http://www.loc.gov/MARC21/slim">
 <leader>
  00000nam a2200000uc 4500
 </leader>
 <controlfield tag="001">
  1317002237
 </controlfield>
 <controlfield tag="003">
  DE-101
 </controlfield>
 <controlfield tag="005">
  20240214080110.0
 </controlfield>
 <controlfield tag="007">
  cr||||||||||||
 </controlfield>
 <controlfield tag="008">
  240124s2024    gw |||||om||| 00||||ger
 </controlfield>
 <datafield ind1=" " ind2=" " tag="015">
  <subfield code="a">
   24,O02
  </subfield>
  <subfield code="2">
   dnb
  </subfield>
 </datafield>
 <datafield ind1="7" ind2=" " tag="016">
  <subfield code="2">
   DE-101
  </subfield>
  <subfield code="a">
   1317002237
  </subfield>
 </datafield>
 <datafield ind1="7" ind2=" " tag="024">
  <subfield code="2">
   urn
  </subfield>
  <subfield code="a">
   urn:nbn:de:bsz:14-qucosa2-890970
  </subfield>
 </datafield>
 <datafield ind1=" " ind2=" " tag="035">
  <subfield code="a">
   (DE-599)DNB1317002237
  </subfield>
 <

## Suche nach benötigten Informationen in den einzelnen Datensätzen: 

In [9]:
#Funktion für Titeldaten in MARC21

def parse_record_marc(item):
    """Extract a flat metadata dict from one MARC21-XML bibliographic record.

    Parameters
    ----------
    item
        One ``<record>`` element. Only ``str(item)`` is used, so any object
        whose string form is the record's XML markup is accepted.

    Returns
    -------
    dict
        Keys ``"idn"``, ``"title"``, ``"date"``, ``"lang"`` and ``"urn"``.
        A value that is absent (or whose element is empty) is ``"N/A"``.

        * ``idn``   - controlfield 001
        * ``title`` - 245 $a, followed by a space and 245 $b when $b exists
        * ``date``  - 264 $c, falling back to characters 7-10 of controlfield 008
        * ``lang``  - first 041 $a
        * ``urn``   - first 024 $a that starts with ``"urn:nbn"``

    Raises
    ------
    Whatever ``ET.fromstring`` raises when ``str(item)`` is not well-formed XML.
    """
    ns = {"marc": "http://www.loc.gov/MARC21/slim"}
    missing = "N/A"

    # NFC-normalise before parsing so decomposed characters (base letter +
    # combining mark) are stored in their composed form.
    record = ET.fromstring(unicodedata.normalize("NFC", str(item)))

    def find(path):
        """Return all elements matching ``path`` within this record."""
        return record.findall(path, namespaces=ns)

    def first_text(path):
        """Return the text of the first match, or None if absent or empty."""
        nodes = find(path)
        return nodes[0].text if nodes and nodes[0].text else None

    # idn
    idn = first_text("marc:controlfield[@tag = '001']") or missing

    # title: main title and remainder of title are deliberately joined with a
    # single space rather than a separator character.
    main_title = first_text("marc:datafield[@tag = '245']/marc:subfield[@code = 'a']")
    sub_title = first_text("marc:datafield[@tag = '245']/marc:subfield[@code = 'b']")
    if main_title is None:
        titletext = missing
    elif sub_title is None:
        titletext = main_title
    else:
        titletext = main_title + " " + sub_title

    # date: prefer the publication statement, otherwise slice the year out of
    # the fixed-length 008 field.
    date = first_text("marc:datafield[@tag = '264']/marc:subfield[@code = 'c']")
    if date is None:
        fixed_field = first_text("marc:controlfield[@tag = '008']") or ""
        date = fixed_field[7:11] or missing

    # lang
    lang = first_text("marc:datafield[@tag = '041']/marc:subfield[@code = 'a']") or missing

    # urn: a record can carry several 024 identifiers (URN, DOI, ...). Take the
    # first one that is a URN instead of letting the last 024 field decide, and
    # skip empty subfields rather than crashing on them.
    urn = next(
        (
            node.text
            for node in find("marc:datafield[@tag = '024']/marc:subfield[@code = 'a']")
            if node.text and node.text.startswith("urn:nbn")
        ),
        missing,
    )

    meta_dict = {"idn": idn, "title": titletext, "date": date, "lang": lang, "urn": urn}

    return meta_dict
    

In [10]:
# Run the MARC21 extractor over every bibliographic record, in order.
output = list(map(parse_record_marc, myquery))

In [11]:
# One row per record; the columns are the keys of the extracted dicts.
df = pd.DataFrame(data=output)
df

Unnamed: 0,idn,title,date,lang,urn
0,1285990005,"2020 Das Jahr, das die Welt veränderte Der ame...",2024,ger,
1,1317002237,Ängste der Allgemeinbevölkerung in Zeiten der ...,2024,ger,urn:nbn:de:bsz:14-qucosa2-890970
2,1319076106,Aufbau eines Key Account Managements in der To...,2024,ger,urn:nbn:de:101:1-2024021302301350289147
3,1319076114,Aufbau eines Key Account Managements in der To...,2024,ger,urn:nbn:de:101:1-2024021302302274697630
4,1315575361,Auswirkungen der Corona-Pandemie auf die Ausbi...,2024,ger,urn:nbn:de:101:1-2024011102383134159187
...,...,...,...,...,...
11363,129882589X,"Zwei Nachbarländer, eine Pandemie",2023,ger,urn:nbn:de:101:1-2023080821170040558967
11364,1232892661,Zwei Runden gehen an Corona – aber die Urologi...,2021,ger,urn:nbn:de:101:1-2021050520504432857162
11365,1307057543,Zwischen Arbeitserleichterung und De-Professio...,2023,ger,urn:nbn:de:101:1-2023102409011131884864
11366,1241532591,Zwischen Pandemie und Wahlkampf: Verhältnismäß...,2021,ger,urn:nbn:de:101:1-2021092021390437649681


In [12]:
# Persist the extracted title data as an HDF5 store under the key "df".
df.to_hdf(path_or_buf="pandemie_titeldaten.h5", key="df")