# DNBLab Jupyter Notebook Tutorial

## Abfrage der SRU-Schnittstelle



### Einrichten der Arbeitsumgebung  <a class="anchor" id="Teil1"></a>

In [1]:
import requests
import pandas as pd
import lxml.etree as ET
from bs4 import BeautifulSoup as soup
import unicodedata

### Schnelle Abfrage (100 erste Treffer, nur Rückgabe der Treffermenge)  <a class="anchor" id="Teil2"></a>

In [64]:
def sru_dnb_simple(query):
    """Return the number of hits the DNB SRU interface reports for ``query``.

    A single ``searchRetrieve`` request is sent; only the reported hit count
    is read from the response, the records themselves are discarded.

    Parameters
    ----------
    query : str
        Search query, passed unchanged as the SRU ``query`` parameter.

    Returns
    -------
    str
        Text content of the response's ``numberOfRecords`` element.

    Raises
    ------
    requests.HTTPError
        If the service answers with an HTTP error status.
    IndexError
        If the response contains no ``numberOfRecords`` element.
    """
    dnb_url = "https://services.dnb.de/sru/dnb"
    parameter = {'version': '1.1', 'operation': 'searchRetrieve', 'query': query,
                 'recordSchema': 'MARC21-xml', 'maximumRecords': '100'}

    # A timeout keeps the cell from hanging forever on a stalled connection.
    r = requests.get(dnb_url, params=parameter, timeout=60)
    r.raise_for_status()

    # Name the parser explicitly: without it BeautifulSoup guesses one (and
    # warns). The lowercase lookup below relies on an HTML parser, which
    # lower-cases tag names.
    response = soup(r.content, "lxml")
    hits = response.find('numberofrecords')
    if hits is None:
        # Same exception type as the former `find_all(...)[0]`, but with context.
        raise IndexError("SRU response contains no numberOfRecords element")

    return hits.text
    
    

In [66]:
# Count the hits for a sample query.
# NOTE(review): the query mixes `and`/`or` without parentheses; if the three
# `sgt` alternatives are meant as one group they probably need to be
# parenthesised -- confirm against the CQL precedence rules.
simple_query = 'catalog=dnb.hss and diss* and sgt="300" or sgt="500" or sgt="600" and jhr="1980"'
myquery1 = sru_dnb_simple(simple_query)
print(f"{myquery1} Ergebnisse")

8044 Ergebnisse


### Vollständige Abfrage (alle Treffer, seitenweise abgerufen)  <a class="anchor" id="Teil3"></a>

In [46]:
def dnb_sru(query):
    """Fetch all bibliographic records matching ``query`` from the DNB SRU interface.

    The service is queried in pages of 100 MARC21-xml records. Further pages
    are requested via ``startRecord`` until a page comes back with fewer than
    100 bibliographic records.

    Parameters
    ----------
    query : str
        Search query, passed unchanged as the SRU ``query`` parameter.

    Returns
    -------
    list
        The ``<record type="Bibliographic">`` elements of all pages as
        BeautifulSoup tags, in the order they were returned.

    Raises
    ------
    requests.HTTPError
        If a request is answered with an HTTP error status.
    """
    base_url = "https://services.dnb.de/sru/dnb"
    page_size = 100
    params = {'recordSchema': 'MARC21-xml',
              'operation': 'searchRetrieve',
              'version': '1.1',
              'maximumRecords': str(page_size),
              'query': query
              }

    records = []
    start_record = 1
    while True:
        # A timeout keeps the cell from hanging forever on a stalled connection.
        r = requests.get(base_url, params=params, timeout=60)
        r.raise_for_status()

        # Name the parser explicitly instead of letting BeautifulSoup guess one.
        xml = soup(r.content, "lxml")
        page = xml.find_all('record', {'type': 'Bibliographic'})
        records.extend(page)

        # A short (or empty) page means there is nothing left to fetch.
        if len(page) < page_size:
            break

        # The first request is sent without startRecord; every later one
        # continues right after the records already collected.
        start_record += page_size
        params['startRecord'] = start_record

    return records



In [79]:
# Alternative example query (kept for reference, not executed):
#   dnb_sru('hss all "diss*" and jhr within "* 2020" and jhr within "1980 *"')
thesis_query = 'catalog="dnb.hss" and diss* and sgt="600" and jhr="2010"'
myquery = dnb_sru(thesis_query)
print(f"{len(myquery)} Ergebnisse")

108 Ergebnisse


### Verarbeiten der Ergebnisse  <a class="anchor" id="Teil4"></a>

In [84]:
def parse_record(record):
    """Extract a few metadata fields from one MARC21-xml record.

    Parameters
    ----------
    record
        A single ``<record>`` element; it is converted with ``str()``,
        NFC-normalised and re-parsed as XML in the MARC21 slim namespace.

    Returns
    -------
    dict
        Keys ``idn`` (controlfield 001), ``titel`` (245 $a), ``author``
        (100 $a), ``gnd`` (100 $0) and ``supervisor`` (700 $a and $e joined
        as ``"name - role"``). Missing values are replaced by the
        placeholders ``'fail'``, ``"unknown"``, ``"unknown"``, ``"none"``
        and ``"unknown"`` respectively.
    """
    ns = {"marc": "http://www.loc.gov/MARC21/slim"}
    root = ET.fromstring(unicodedata.normalize("NFC", str(record)))

    def first_text(path, default):
        # Text of the first element matching `path`, or `default` if none matches.
        node = root.find(path, namespaces=ns)
        return default if node is None else node.text

    # idn
    idn = first_text("marc:controlfield[@tag='001']", 'fail')

    # title
    titel = first_text("marc:datafield[@tag='245']/marc:subfield[@code='a']", "unknown")

    # author
    author = first_text("marc:datafield[@tag='100']/marc:subfield[@code='a']", "unknown")

    # gnd
    gnd = first_text("marc:datafield[@tag='100']/marc:subfield[@code='0']", "none")

    # supervisor (maybe): name and role must come from the SAME 700 field,
    # otherwise the role of one person could be attached to the name of
    # another. The first 700 field carrying both $a and $e is used.
    supervisor = "unknown"
    for field in root.findall("marc:datafield[@tag='700']", namespaces=ns):
        name = field.findtext("marc:subfield[@code='a']", namespaces=ns)
        role = field.findtext("marc:subfield[@code='e']", namespaces=ns)
        if name and role:
            supervisor = name + " - " + role
            break

    meta_dict = {"idn": idn, "titel": titel, "author": author, "gnd": gnd, "supervisor": supervisor}

    return meta_dict



In [85]:
# Turn every fetched record into a metadata dict and collect them in one table.
output = list(map(parse_record, myquery))
df = pd.DataFrame(data=output)
df

Unnamed: 0,idn,titel,author,gnd,supervisor
0,100822880X,A method for the comparison of transport air...,"Gologan, Corin",(DE-588)142739642,unknown
1,1007183780,A new approach to establish tactility in min...,"Kübler, Bernhard",(DE-588)142331783,unknown
2,1003606326,Abiotically catalyzed glucose fuel cells for p...,"Kerzenmacher, Sven",(DE-588)1105594211,unknown
3,1008306487,Adaptive functional modeling of neural activity,"Gürel, Tayfun",none,unknown
4,1009326414,Analog Integrated CMOS Circuits for the Readou...,"Karagounis, Michael Athanassios",(DE-588)143188321,unknown
...,...,...,...,...,...
103,1000185826,Vergleich der Materialkennwerte von Formstoffe...,"Weltschev, Margit",(DE-588)140308385,unknown
104,100513264X,Verifying and allocating real-time tasks on di...,"Masrur, Alejandro",(DE-588)141928689,unknown
105,1007432985,Web service discovery based on semantic inform...,"Schulte, Stefan",(DE-588)135768713,unknown
106,1010986694,Welchen Einfluss haben das verwendete Bypassma...,"Porath, Mark",(DE-588)143129570,unknown


In [115]:
# Export the result table as CSV without the DataFrame's row index.
df.to_csv(path_or_buf="Abfrage.csv", index=False)

In [122]:
# Export the result table as an Excel workbook without the row index.
# `encoding` is no longer passed: the keyword was deprecated in pandas 1.5
# and removed in pandas 2.0, where it raises a TypeError.
df.to_excel("Abfrage.xlsx", index=False)

### Filtern des Dataframe

In [73]:
# List every row whose author could not be extracted ("unknown" placeholder).
noauthors = df.loc[df["author"] == "unknown"]
noauthors

Unnamed: 0,idn,titel,author


In [75]:
# Print the raw MARC21-xml of the first record in the result list.
print(str(myquery[0]))

<record type="Bibliographic" xmlns="http://www.loc.gov/MARC21/slim">
<leader>00000nam a2200000uc 4500</leader>
<controlfield tag="001">1253067287</controlfield>
<controlfield tag="003">DE-101</controlfield>
<controlfield tag="005">20220308223115.0</controlfield>
<controlfield tag="007">cr||||||||||||</controlfield>
<controlfield tag="008">220308s2022    gw |||||om||| 00||||eng  </controlfield>
<datafield ind1=" " ind2=" " tag="015">
<subfield code="a">22,O04</subfield>
<subfield code="2">dnb</subfield>
</datafield>
<datafield ind1="7" ind2=" " tag="016">
<subfield code="2">DE-101</subfield>
<subfield code="a">1253067287</subfield>
</datafield>
<datafield ind1="7" ind2=" " tag="024">
<subfield code="2">urn</subfield>
<subfield code="a">urn:nbn:de:bsz:21-dspace-994892</subfield>
</datafield>
<datafield ind1=" " ind2=" " tag="035">
<subfield code="a">(DE-599)DNB1253067287</subfield>
</datafield>
<datafield ind1=" " ind2=" " tag="040">
<subfield code="a">1240</subfield>
<subfield code="b">