In [1]:
#Import libraries: 
import pandas as pd
from bs4 import BeautifulSoup as soup
import lxml.etree as ET
import unicodedata
from tqdm import tqdm

In [4]:
#Load the dump file into a single string:
# f.read() returns the whole text at once, so no intermediate list of lines is
# built and the name `result` stays reserved for the parsed records further down.
with open("biene_dump.xml", "r", encoding="utf-8") as f:
    content = f.read()

#Parse data to xml:
# NOTE(review): "lxml" selects BeautifulSoup's HTML parser, not its XML parser
# ("xml" / "lxml-xml"). It is kept as-is because the later cells were written
# against the markup this parser produces; verify parse_record before switching.
xml = soup(content,"lxml")

In [5]:
#Collect every <record> element whose type attribute is "Bibliographic":
records = xml.find_all("record", attrs={"type": "Bibliographic"})

#Uncomment to inspect one sample record (index 1) as a sanity check:
#print(records[1].prettify())

#Report how many records were found:
print(len(records))

2034


In [6]:
#Define function to parse each record and extract only the datafields needed: 
def parse_record(record):
    """Extract identifier, title, subtitle and date from one MARC record.

    Parameters
    ----------
    record
        Any object whose ``str()`` is the XML markup of a single record.
        The markup is NFC-normalized and parsed with ``ET.fromstring``.

    Returns
    -------
    dict
        Keys ``"idn"``, ``"title"``, ``"subtitle"`` and ``"date"``.
        A missing field is replaced by ``'fail'`` (idn), ``"unknown"``
        (title, subtitle) or ``"none"`` (date). A field that is present
        but has no text yields ``None``, except for an empty 008 fallback,
        which yields ``"none"``.
    """
    ns = {"marc": "http://www.loc.gov/MARC21/slim"}
    xml = ET.fromstring(unicodedata.normalize("NFC", str(record)))

    def first_text(path, default):
        """Return the text of the first node matching ``path``, else ``default``."""
        nodes = xml.xpath(path, namespaces=ns)
        # An explicit emptiness check replaces the former bare ``except:``,
        # which also hid unrelated errors (including KeyboardInterrupt).
        return nodes[0].text if nodes else default

    # idn: control field 001
    idn = first_text("marc:controlfield[@tag = '001']", 'fail')

    # title: field 245, subfield a
    title = first_text("marc:datafield[@tag = '245']/marc:subfield[@code = 'a']", "unknown")

    # subtitle: field 245, subfield b
    subtitle = first_text("marc:datafield[@tag = '245']/marc:subfield[@code = 'b']", "unknown")

    # date: prefer field 264 subfield c, fall back to control field 008
    date_nodes = xml.xpath("marc:datafield[@tag = '264']/marc:subfield[@code = 'c']", namespaces=ns)
    if date_nodes:
        date = date_nodes[0].text
    else:
        field_008 = first_text("marc:controlfield[@tag = '008']", None)
        # An absent or empty 008 has no text to slice; slicing None used to
        # raise TypeError. Otherwise keep only the characters at positions 7-10.
        date = field_008[7:11] if field_008 else "none"

    return {"idn": idn, "title": title, "subtitle": subtitle, "date": date}

In [7]:
#Run parse_record on every record and collect the resulting dictionaries:
result = []

# The context manager closes the progress bar as soon as the loop ends, so its
# final state is flushed with this cell instead of being left open afterwards.
with tqdm(total=len(records)) as pbar:
    for record in records:
        result.append(parse_record(record))
        pbar.update()

 99%|████████████████████████████████████████████████████████████████████████████▏| 2014/2034 [00:04<00:00, 537.49it/s]

In [8]:
#Build a dataframe with one row per parsed record and display it:
df = pd.DataFrame(data=result)
df

Unnamed: 0,idn,title,subtitle,date
0,1278699708,Biene,Blankbook,2023
1,1283945827,"Biene gib mir Honig, [Theaterstück]",unknown,2023
2,1283945835,"Biene gib mir Honig, [Theaterstück]",unknown,2023
3,1283945843,"Biene gib mir Honig, [Theaterstück]",unknown,2023
4,1283945908,"Biene gib mir Honig, [Theaterstück] - beiliege...",unknown,2023
...,...,...,...,...
2029,1232876879,Postkarte,unknown,1959-1959
2030,1235784428,Briefkarte,unknown,1900-1999
2031,1237361656,Konvolut von Programmen seines Theaterstücks B...,unknown,1973
2032,1238096719,Konvolut von Zeitungsausschnitten u. Ä. mit Be...,unknown,1960


100%|█████████████████████████████████████████████████████████████████████████████| 2034/2034 [00:20<00:00, 537.49it/s]

In [None]:
#Optional: show only the rows whose title is exactly 'Biene':
df.query(expr="title == 'Biene'")

In [None]:
#Write the dataframe (including its index column) to a UTF-8 encoded CSV file:
df.to_csv(path_or_buf="Biene_small_dataset.csv", encoding="utf-8")