In [19]:
import os, time, json, re
import internetarchive as ia
import pymarc
import pandas as pd

In [3]:
# Internet Archive collection identifier; interpolated into the
# `collection:` search query further down the notebook
collection_name = 'darwinslibrary'

In [4]:
# Open the error log in append mode so messages from earlier runs are kept.
# The encoding is pinned to UTF-8: with the platform default encoding an
# exception message containing non-ASCII text could itself raise
# UnicodeEncodeError while it is being written to the log.
error_log = open('bpl-marcs-errors.log', 'a', encoding='utf-8')

In [5]:
# Define the path where the files will be saved
save_path = "marc-files"

# Create the directory if it doesn't exist.
# `exist_ok=True` replaces the separate existence check (no window between
# the check and the creation), and `makedirs` also creates any missing
# parent directories should `save_path` ever become a nested path.
os.makedirs(save_path, exist_ok=True)

In [6]:
# Build the search over every item in the collection.
# (This cell only sets up the search; the MARC records are downloaded
# by the loop in the next cell.)
items = ia.search_items(query=f'collection:{collection_name}')

In [7]:
# Download the MARC file of every item returned by the search
for item in items:
    identifier = item["identifier"]

    # Download the MARCXML record for the item straight into `save_path`
    # (no per-item sub-directory, as requested by `no_directory=True`)
    try:
        xml_files = ia.download(identifier,
                                formats=["MARC"],
                                verbose=True,
                                destdir=save_path,
                                no_directory=True)

    # If there is an error, write it to the error log and carry on with
    # the next item instead of aborting the whole run
    except Exception as e:
        error_log.write(f"Could not download {identifier} because of error: {e}\n")
        # The log handle stays open for the rest of the session, so flush
        # after each entry; otherwise buffered lines are lost if the
        # kernel is interrupted or restarted.
        error_log.flush()
        print("There was an error; writing to log.")

Abercrombie1838wa66W:
 skipping marc-files\Abercrombie1838wa66W_marc.xml, file already exists based on length and date.
Abercrombie1838wa66W_MS:
 skipping marc-files\Abercrombie1838wa66W_MS_marc.xml, file already exists based on length and date.
Agassiz1850up52I:
 skipping marc-files\Agassiz1850up52I_marc.xml, file already exists based on length and date.
Agassiz1850up52I_MS:
 skipping marc-files\Agassiz1850up52I_MS_marc.xml, file already exists based on length and date.
Agassiz2006ft69Y:
 skipping marc-files\Agassiz2006ft69Y_marc.xml, file already exists based on length and date.
Agassiz2006ft69Y_MS:
 skipping marc-files\Agassiz2006ft69Y_MS_marc.xml, file already exists based on length and date.
Barker-Webb1840yf41S:
 skipping marc-files\Barker-Webb1840yf41S_marc.xml, file already exists based on length and date.
Barker-Webb1840yf41S_MS:
 skipping marc-files\Barker-Webb1840yf41S_MS_marc.xml, file already exists based on length and date.
Bechstein1840ob74D:
 skipping marc-files\Bechste

In [8]:
# Accumulator for the per-record dictionaries that later become
# the rows of the DataFrame
data = list()

In [9]:
def get_author(record):
    """Return subfield 'a' of field '100' of ``record``.

    ``record`` is presumably a ``pymarc.Record`` (anything supporting
    ``record['100']['a']`` works). If the lookup raises, the exception is
    printed and ``None`` is returned.
    """
    try:
        value = record['100']['a']
    except Exception as err:
        print(err)
        return None
    return value

In [10]:
def get_title(record):
    """Return subfield 'a' of field '245' of ``record``.

    ``record`` is presumably a ``pymarc.Record`` (anything supporting
    ``record['245']['a']`` works). If the lookup raises, the exception is
    printed and ``None`` is returned.
    """
    try:
        value = record['245']['a']
    except Exception as err:
        print(err)
        return None
    return value

In [11]:
def get_pub_place(record):
    """Return subfield 'a' of field '260' of ``record``.

    ``record`` is presumably a ``pymarc.Record`` (anything supporting
    ``record['260']['a']`` works). If the lookup raises, the exception is
    printed and ``None`` is returned.
    """
    try:
        value = record['260']['a']
    except Exception as err:
        print(err)
        return None
    return value

In [12]:
def get_publisher(record):
    """Return subfield 'b' of field '260' of ``record``.

    ``record`` is presumably a ``pymarc.Record`` (anything supporting
    ``record['260']['b']`` works). If the lookup raises, the exception is
    printed and ``None`` is returned.
    """
    try:
        value = record['260']['b']
    except Exception as err:
        print(err)
        return None
    return value

In [13]:
def get_pub_date(record):
    """Return subfield 'c' of field '260' of ``record``.

    ``record`` is presumably a ``pymarc.Record`` (anything supporting
    ``record['260']['c']`` works). If the lookup raises, the exception is
    printed and ``None`` is returned.
    """
    try:
        value = record['260']['c']
    except Exception as err:
        print(err)
        return None
    return value

In [14]:
# Some fields like subjects can repeat. In cases like that you will want to 
# use get_fields to get all of them as pymarc.Field objects

def get_note(record):
    """Collect subfield 'a' of every '500' field of ``record``.

    Returns a list with one entry per field returned by
    ``record.get_fields('500')`` (an empty list when there are none).
    If anything raises along the way, the exception is printed and
    ``None`` is returned instead of a partial list.
    """
    collected = []
    try:
        for field in record.get_fields('500'):
            collected.append(field['a'])
    except Exception as err:
        print(err)
        return None
    return collected

In [15]:
def get_ident(lst):
    """Extract the identifier from a list of note strings.

    Scans ``lst`` in order and, for the first string starting with
    ``'Identifier'``, returns everything after its first 12 characters
    (presumably the length of the label ``'Identifier: '`` - label, colon
    and space).

    Parameters
    ----------
    lst : iterable of str, or None
        Note texts, e.g. the result of ``get_note``. ``None`` is treated
        as an empty list. Entries that are not strings (such as ``None``
        for a note without text) are skipped instead of aborting the
        search, so an identifier in a later note is still found.

    Returns
    -------
    str or None
        The identifier text, or ``None`` when no entry matches.
    """
    prefix_length = 12
    try:
        for strng in lst or ():
            # A note without text must not stop the scan
            if isinstance(strng, str) and strng.startswith('Identifier'):
                return strng[prefix_length:]
    except Exception as e:
        # Kept for backward compatibility: unusable input (e.g. a
        # non-iterable) is reported and yields None rather than raising.
        print(e)
    return None

In [16]:
def get_manuscript(record):
    """Return subfield 'o' of field '770' of ``record``.

    ``record`` is presumably a ``pymarc.Record`` (anything supporting
    ``record['770']['o']`` works); the value presumably identifies the
    related manuscript record - confirm against the catalogue data.
    If the lookup raises, the exception is printed and ``None`` is
    returned.
    """
    try:
        value = record['770']['o']
    except Exception as err:
        print(err)
        return None
    return value

In [17]:
def get_parent(record):
    """Return subfield 'o' of field '772' of ``record``.

    ``record`` is presumably a ``pymarc.Record`` (anything supporting
    ``record['772']['o']`` works); the value presumably identifies the
    parent record - confirm against the catalogue data.
    If the lookup raises, the exception is printed and ``None`` is
    returned.
    """
    try:
        value = record['772']['o']
    except Exception as err:
        print(err)
        return None
    return value

In [18]:
# Start from an empty list every time this cell runs, so that re-running
# the cell does not append a second copy of every record to `data`
data = []

# Iterate over the folder
for filename in os.listdir(save_path):

    # Construct the path to the MARCXML file in the current folder
    file_path = os.path.join(save_path, filename)

    # Skip anything that is not a regular file (e.g. a sub-directory)
    if not os.path.isfile(file_path):
        continue

    # Open the MARCXML file and parse all of its records
    with open(file_path, 'rb') as marc_file:
        records = pymarc.marcxml.parse_xml_to_array(marc_file)

        # Iterate over the MARC records in the file
        # and generate a dictionary for each record
        for record in records:
            # Read the notes once; they feed both the ID and NOTE columns
            notes = get_note(record)
            dct = {
                'ID': get_ident(notes),
                'AUTHOR': get_author(record),
                'TITLE': get_title(record),
                'NOTE': notes,
                'MANUSCRIPT': get_manuscript(record),
                'PARENT': get_parent(record),
                'PUB_PLACE': get_pub_place(record),
                'PUBLISHER': get_publisher(record),
                'PUB_DATE': get_pub_date(record)
            }
            data.append(dct)


























































































































































































































































































































































































































































































































































































































































































































































































In [20]:
# One row per parsed MARC record; columns come from the dictionary keys
df = pd.DataFrame(data=data)

In [21]:
# Leave the frame as the cell's last expression so the notebook renders it
# as a table (rich display) instead of the wrapped plain text of print()
df

                           ID              AUTHOR  \
0       Abercrombie1838wa66W.  Abercrombie, John.   
1    Abercrombie1838wa66W_MS.    Darwin, Charles.   
2           Agassiz1850up52I.     Agassiz, Louis.   
3        Agassiz1850up52I_MS.    Darwin, Charles.   
4           Agassiz2006ft69Y.     Agassiz, Louis.   
..                        ...                 ...   
310       Youatt1834ip29P_MS.    Darwin, Charles.   
311          Youatt1837zd27L.    Youatt, William.   
312       Youatt1837zd27L_MS.    Darwin, Charles.   
313          Youatt1845wt42Y.    Youatt, William.   
314       Youatt1845wt42Y_MS.    Darwin, Charles.   

                                                 TITLE  \
0    Inquiries concerning the intellectual powers a...   
1    Inquiries concerning the intellectual powers a...   
2    Lake Superior: its character, vegetation, and ...   
3    Lake Superior: its character, vegetation, and ...   
4    Contributions to the natural history of the Un...   
..             