In [1]:
import pandas as pd
from bs4 import BeautifulSoup
from lxml import etree
import unicodedata

In [2]:
# Read the saved response text from disk.
with open('alldata.xml', encoding='utf8') as f:
    alldata = f.read()

# Parse the SRU text response with BeautifulSoup. The parser is named
# explicitly so bs4 does not have to guess one (which emits a
# GuessedAtParserWarning and can give different trees depending on which
# parsers are installed). 'lxml' is already a dependency of this notebook.
new_response = BeautifulSoup(alldata, 'lxml')

# Collect the individual records (elements <record type="Authority">).
gndm = new_response.find_all('record', {'type': 'Authority'})

In [14]:
def _first_text(xml, path, ns, default):
    """Return the text of the first element matching ``path``, else ``default``.

    ``default`` is also returned when the first match exists but has no text
    (``.text`` is None), so callers always get a usable value.
    """
    matches = xml.findall(path, namespaces=ns)
    if not matches or matches[0].text is None:
        return default
    return matches[0].text


# Function for extracting the contents:
def parse_record(record):
    """Extract author, ID and place from one MARC XML record.

    Args:
        record: Object whose ``str()`` is the XML of a single record using
            the MARC21 slim namespace (e.g. a BeautifulSoup tag).

    Returns:
        dict with the keys ``'Author'`` (datafield 100, subfield a),
        ``'ID'`` (controlfield 001) and ``'Wirkungsort'`` (datafield 551,
        subfield a). A missing or empty field yields ``'fail'`` for
        ``'Author'`` and ``'ID'``, and ``'N/A'`` for ``'Wirkungsort'``.

    Raises:
        Whatever ``etree.fromstring`` raises if the record is not
        well-formed XML; parse errors are not swallowed.
    """
    ns = {"marc": "http://www.loc.gov/MARC21/slim"}
    # Normalise to NFC before parsing so the extracted text is in NFC form.
    xml = etree.fromstring(unicodedata.normalize("NFC", str(record)))

    # Author:
    author = _first_text(
        xml, "marc:datafield[@tag = '100']/marc:subfield[@code = 'a']", ns, "fail"
    )

    # IDN:
    idn = _first_text(xml, "marc:controlfield[@tag = '001']", ns, 'fail')

    # Wirkungsort (place of activity); only the first 551 entry is used:
    place = _first_text(
        xml, "marc:datafield[@tag = '551']/marc:subfield[@code = 'a']", ns, 'N/A'
    )

    # Combine:
    return {'Author': author, 'ID': idn, 'Wirkungsort': place}

In [15]:
# Convert to a table: one parsed dict per record, one row per dict.
result = list(map(parse_record, gndm))
df_all = pd.DataFrame(result)
df_all

Unnamed: 0,Author,ID,Wirkungsort
0,"A., Sonay",114249253,
1,"Aarestrup, Emil",118643533,
2,"Abarbanell, Stephan",1076415040,Babelsberg
3,"Abbott, David",143858122,
4,"Abbott, David R.",1186811374,
...,...,...,...
9352,"Ibrāhīm, ʿAbd Allāh ʿAlī",1089224273,
9353,"ʿAbdullāhī, ʿAlī",1022218409,
9354,"Ibrāhīm, ʿAbdallah ʿAlī",1145657249,
9355,"Ibrāhīm, ʿAbd Allāh ʿAlī",1067938591,


In [17]:
# Keep only the rows where a Wirkungsort is present, i.e. the column does
# not hold the 'N/A' placeholder.
has_place = df_all['Wirkungsort'].ne('N/A')
df2 = df_all.loc[has_place]
df2


Unnamed: 0,Author,ID,Wirkungsort
2,"Abarbanell, Stephan",1076415040,Babelsberg
5,"Abbott, David",107937530,London
9,"Abedi, Isabel",12400346X,München
11,"Abele, Jan",1193110769,Hamburg
13,"Abeln, Reinhard",115401970,Osnabrück
...,...,...,...
9345,"Zweig, Arnold",118637452,Glogau
9346,"Zweig, Stefan-Jerzy",118829653,Krakau
9347,"Zweig, Stefan",118637479,Wien
9348,"Zweig, Stefanie",119368234,Leobschütz


In [43]:
# Open result_overview.csv, drop the 'Unnamed: 0' column and store the hit
# count as an integer, counting missing values as 0 hits.
df = (
    pd.read_csv("result_overview.csv")
    .drop(columns=['Unnamed: 0'])
    .assign(found=lambda frame: frame['found'].fillna(0).astype(int))
)
df

Unnamed: 0,query,found
0,"A., Sonay",1
1,"Aarestrup, Emil",1
2,"Abarbanell, Stephan",1
3,"Abbott, David",5
4,"Äbdülrähmanlı, Näriman",1
...,...,...
4956,"Гейне, Генріх",0
4957,"Гете, Иоганн Вольфганг",0
4958,"Гримм, Братья",0
4959,"Захер-Мазох, Леопольд фон",0


In [48]:
# Authors with more than one hit: every row whose 'found' is neither 0 nor 1.
zero_or_one = df['found'].isin([0, 1])
multiple = df.loc[~zero_or_one]
multiple


Unnamed: 0,query,found
3,"Abbott, David",5
12,"Abreu, Carlos",9
17,"Ackermann, Erich",12
18,"Ackermann, Rolf",4
21,"Adam, Gabi",3
...,...,...
4923,"Zimmermann, Katharina",14
4938,"Zöller, Martin",3
4943,"Zuckmayer, Carl",3
4949,"Zweig, Stefan",2


In [49]:
# Authors with exactly one hit.
exactly_one = df['found'].eq(1)
unique = df.loc[exactly_one]
unique

Unnamed: 0,query,found
0,"A., Sonay",1
1,"Aarestrup, Emil",1
2,"Abarbanell, Stephan",1
4,"Äbdülrähmanlı, Näriman",1
5,"Abedi, Isabel",1
...,...,...
4948,"Zweig, Arnold",1
4950,"Zweig, Stefanie",1
4951,"Zybell, Jo",1
4952,"ʿAbbās, Bahǧat",1


In [51]:
# Authors with no hit at all.
no_hit = df['found'].eq(0)
none = df.loc[no_hit]
none

Unnamed: 0,query,found
11,"Abraham, Filo M",0
13,"Acero, Irene",0
24,"Addams, Peter",0
25,"Adi Mira, Michaels",0
26,"Adib, Mu",0
...,...,...
4956,"Гейне, Генріх",0
4957,"Гете, Иоганн Вольфганг",0
4958,"Гримм, Братья",0
4959,"Захер-Мазох, Леопольд фон",0


In [53]:
# Write each of the three subsets to its own CSV file.
for _path, _frame in (
    ("none.csv", none),
    ("unique.csv", unique),
    ("multiple.csv", multiple),
):
    _frame.to_csv(_path)