In [1]:
#Import libraries: 
import pandas as pd
from bs4 import BeautifulSoup as soup
import lxml.etree as ET
import unicodedata
from tqdm import tqdm

In [2]:
#Load data from dump-file:
# Read the whole file in one call instead of building a list of lines and
# joining it again; this also avoids binding `result` to a list of lines here
# when the notebook later uses `result` for the parsed records.
with open("dataset_tutorial_eco.xml", "r", encoding="utf-8") as f:
    content = f.read()

#Parse data to xml:
xml = soup(content, features="xml")

In [3]:
#Collect every <record> element whose "type" attribute is "Bibliographic":
records = xml.find_all("record", attrs={"type": "Bibliographic"})

#To inspect one record while debugging, run:
#print(records[1].prettify())

#Report how many bibliographic records were found:
print(len(records))

2828


In [9]:
#Define function to parse each record and extract only the datafields needed:
# Sentinel stored in "date" when neither 264$c nor 008 yields a value.
_MISSING_DATE = "none"
# Characters 7-10 of controlfield 008 are the part used as the year.
_YEAR_IN_008 = slice(7, 11)


def _first_text(root, path, default):
    """Return the text of the first element matching ``path``, else ``default``.

    ``default`` is also returned when the first match is an empty element
    (its ``text`` is ``None`` or ``""``), so callers never receive ``None``.
    """
    matches = root.findall(path)
    if matches and matches[0].text:
        return matches[0].text
    return default


def parse_record(record) -> dict:
    """Extract identifier, title, subtitle and date from one record.

    Args:
        record: A record element (or string) whose ``str()`` form is
            well-formed XML with ``controlfield`` / ``datafield`` children.
            The lookup paths are un-namespaced, so elements that carry an XML
            namespace are not matched.

    Returns:
        A dict with the keys ``"idn"``, ``"title"``, ``"subtitle"`` and
        ``"date"``. Missing or empty fields are replaced by ``"fail"`` (idn),
        ``"unknown"`` (title, subtitle) and ``"none"`` (date).

    Raises:
        The XML parser's syntax error if ``str(record)`` is not well-formed.
    """
    # Normalise to NFC so composed/decomposed characters compare equal later.
    root = ET.fromstring(unicodedata.normalize("NFC", str(record)))

    idn = _first_text(root, "controlfield[@tag='001']", "fail")
    title = _first_text(root, "datafield[@tag='245']/subfield[@code='a']", "unknown")
    subtitle = _first_text(root, "datafield[@tag='245']/subfield[@code='b']", "unknown")

    # Prefer the date in 264$c; otherwise fall back to the year inside 008.
    date = _first_text(root, "datafield[@tag='264']/subfield[@code='c']", None)
    if date is None:
        field_008 = _first_text(root, "controlfield[@tag='008']", "")
        # An absent, empty or too-short 008 slices to "" -> use the sentinel.
        date = field_008[_YEAR_IN_008] or _MISSING_DATE

    return {"idn": idn, "title": title, "subtitle": subtitle, "date": date}


100%|█████████████████████████████████████████████████████████████████████████████| 2828/2828 [00:20<00:00, 270.03it/s][A

In [10]:
#Parse every record with parse_record while showing a progress bar:
# Wrapping the iterable lets tqdm take the total from len(records) and close
# the bar itself when iteration ends; a manually created bar that is never
# closed can linger as an unfinished bar in later cell output.
result = [parse_record(record) for record in tqdm(records)]

100%|█████████████████████████████████████████████████████████████████████████████| 2828/2828 [00:22<00:00, 127.85it/s]
 99%|████████████████████████████████████████████████████████████████████████████▌| 2812/2828 [00:09<00:00, 273.02it/s]

In [11]:
#Build a dataframe with one row per parsed record, then display it:
df = pd.DataFrame(data=result)
df

Unnamed: 0,idn,title,subtitle,date
0,1043718966,A cross-cultural study of motivational factors...,unknown,2010
1,1019902175,A forecast evaluation of PCA-based adaptive ...,unknown,2010
2,1012057232,A new perspective on social learning,unknown,2010
3,1009567691,A reconsideration of full-cost pricing,methodological aspects of marginalism and theo...,2010
4,1045290823,A study on the impact of mobile telecommunicat...,unknown,2010
...,...,...,...,...
2823,120766099X,Modellierung der Zahlungsschwierigkeiten vonPr...,unknown,2000
2824,961697555,Object Warehouse,Konzeption der Basis objektorientierter Manage...,2000
2825,1035450941,Phasenfeldmodellierung mehrphasiger Erstarrung,unknown,2000
2826,1206191139,Reform der gemeinsamen Agrarpolitik und EU-Int...,unknown,2000


In [13]:
#If needed: Search for a specific title in dataframe:
# The title is referenced through `@search_title` instead of being pasted into
# the query string, so titles containing quotes do not break the expression.
search_title = "Object Warehouse"
df.query("title == @search_title")

Unnamed: 0,idn,title,subtitle,date
2824,961697555,Object Warehouse,Konzeption der Basis objektorientierter Manage...,2000


In [27]:
#Write the dataframe to a UTF-8 encoded CSV file in the working directory:
df.to_csv(path_or_buf="Eco_small_dataset.csv", encoding="utf-8")