In [45]:
import os, time, json, re
import internetarchive as ia
import pymarc
import pandas as pd
import numpy as np

In [3]:
# Internet Archive collection identifier; it is interpolated into the
# `collection:` search query further down.
collection_name = 'darwinslibrary'

In [4]:
# Create error log (append mode, so earlier runs are preserved).
# Nothing in the visible notebook closes this handle, so it is opened
# line-buffered (buffering=1): every complete log line is pushed to disk as
# soon as it is written instead of sitting in a buffer that may never be flushed.
# An explicit encoding keeps non-ASCII error messages from failing on platforms
# whose default encoding cannot represent them.
error_log = open('bpl-marcs-errors.log', 'a', buffering=1, encoding='utf-8')

In [5]:
# Define the path where the files will be saved
save_path = "marc-files"

# Create the directory if it doesn't exist. makedirs(exist_ok=True) replaces the
# separate exists()/mkdir() pair: there is no window between the check and the
# creation, and it also works if save_path is later changed to a nested path.
os.makedirs(save_path, exist_ok=True)

In [6]:
# Build the search query for the collection and run it against the
# Internet Archive. This cell only searches; the MARC records themselves are
# downloaded by the loop that iterates over `items`.
query = f'collection:{collection_name}'
items = ia.search_items(query)

In [7]:
# Download the MARC file of every item returned by the search.
for item in items:
    identifier = item["identifier"]

    # Download the MARCXML record for the item straight into save_path
    # (no_directory=True: no per-item sub-folder is created).
    try:
        ia.download(identifier,
                    formats=["MARC"],
                    verbose=True,
                    destdir=save_path,
                    no_directory=True)

    # If there is an error, write it to the error log and carry on with the
    # next item (deliberately best-effort: one bad item must not stop the run).
    except Exception as e:
        error_log.write(f"Could not download {identifier} because of error: {e}\n")
        # The log handle is never closed in the visible notebook, so flush now
        # to make sure the entry actually reaches the file.
        error_log.flush()
        print("There was an error; writing to log.")

Abercrombie1838wa66W:
 skipping marc-files\Abercrombie1838wa66W_marc.xml, file already exists based on length and date.
Abercrombie1838wa66W_MS:
 skipping marc-files\Abercrombie1838wa66W_MS_marc.xml, file already exists based on length and date.
Agassiz1850up52I:
 skipping marc-files\Agassiz1850up52I_marc.xml, file already exists based on length and date.
Agassiz1850up52I_MS:
 skipping marc-files\Agassiz1850up52I_MS_marc.xml, file already exists based on length and date.
Agassiz2006ft69Y:
 skipping marc-files\Agassiz2006ft69Y_marc.xml, file already exists based on length and date.
Agassiz2006ft69Y_MS:
 skipping marc-files\Agassiz2006ft69Y_MS_marc.xml, file already exists based on length and date.
Barker-Webb1840yf41S:
 skipping marc-files\Barker-Webb1840yf41S_marc.xml, file already exists based on length and date.
Barker-Webb1840yf41S_MS:
 skipping marc-files\Barker-Webb1840yf41S_MS_marc.xml, file already exists based on length and date.
Bechstein1840ob74D:
 skipping marc-files\Bechste

There was an error; writing to log.


NHM158552:
 skipping marc-files\NHM158552_marc.xml, file already exists based on length and date.
NHM163482:
 skipping marc-files\NHM163482_marc.xml, file already exists based on length and date.
NHM177028:
 skipping marc-files\NHM177028_marc.xml, file already exists based on length and date.
NHM19639:
 skipping marc-files\NHM19639_marc.xml, file already exists based on length and date.
NHM200556:
 skipping marc-files\NHM200556_marc.xml, file already exists based on length and date.
NHM20305:
 skipping marc-files\NHM20305_marc.xml, file already exists based on length and date.
NHM227605-vol1:
 skipping marc-files\NHM227605-vol1_marc.xml, file already exists based on length and date.
NHM227605-vol2:
 skipping marc-files\NHM227605-vol2_marc.xml, file already exists based on length and date.
NHM243352:
 skipping marc-files\NHM243352_marc.xml, file already exists based on length and date.
NHM251872:
 skipping marc-files\NHM251872_marc.xml, file already exists based on length and date.
NHM2

In [106]:
# Accumulator for the per-record dictionaries built by the parsing loop
data = list()

In [9]:
def get_author(record):
    """Return the author, read from field 100 subfield a of ``record``.

    Any error raised by the lookup (for example a missing field) is printed
    and ``None`` is returned instead.
    """
    try:
        return record['100']['a']
    except Exception as err:
        print(err)
        return None

In [10]:
def get_title(record):
    """Return the title, read from field 245 subfield a of ``record``.

    Lookup errors are printed rather than raised; in that case the function
    returns ``None``.
    """
    try:
        title_field = record['245']
        return title_field['a']
    except Exception as exc:
        print(exc)

In [11]:
def get_pub_place(record):
    """Return the place of publication (field 260 subfield a) of ``record``.

    If the lookup raises, the error is printed and ``None`` is returned.
    """
    try:
        place = record['260']['a']
    except Exception as problem:
        print(problem)
        return None
    return place

In [12]:
def get_publisher(record):
    """Return the publisher name (field 260 subfield b) of ``record``.

    Errors during the lookup are printed; the function then returns ``None``.
    """
    try:
        publication = record['260']
        return publication['b']
    except Exception as err:
        print(err)
        return None

In [13]:
def get_pub_date(record):
    """Return the publication date (field 260 subfield c) of ``record``.

    The value is returned exactly as stored. If the lookup raises, the error
    is printed and ``None`` is returned.
    """
    try:
        date = record['260']['c']
    except Exception as exc:
        print(exc)
    else:
        return date

In [14]:
# Field 500 can occur several times in one record, so a single
# record['500'] lookup is not enough: get_fields returns every occurrence.

def get_note(record):
    """Return subfield a of every 500 field of ``record`` as a list.

    The list is empty when the record has no 500 field. If anything raises
    while collecting the notes, the error is printed and ``None`` is returned.
    """
    try:
        collected = []
        for field in record.get_fields('500'):
            collected.append(field['a'])
        return collected
    except Exception as err:
        print(err)

In [15]:
def get_ident(lst):
    """Return the lower-cased identifier found in a list of note strings.

    The first entry that starts with ``'Identifier: '`` wins; everything after
    that prefix is returned in lower case.

    Parameters:
        lst: iterable of note strings (entries that are not strings are skipped).

    Returns:
        The lower-cased identifier text, or ``None`` when no entry carries the
        prefix or ``lst`` cannot be iterated (the error is printed).
    """
    # Test and slice with the same prefix, so the slice can never cut into
    # text that merely begins with the word "Identifier".
    prefix = 'Identifier: '
    try:
        for strng in lst:
            # A note without text must not abort the search for a later match.
            if isinstance(strng, str) and strng.startswith(prefix):
                return strng[len(prefix):].lower()
    except Exception as e:
        print(e)
    return None

In [16]:
def get_alt_ident(lst):
    """Return the "public number" found in a list of note strings.

    The first entry that starts with ``'Public number: '`` wins; the text after
    that prefix is returned unchanged.

    Parameters:
        lst: iterable of note strings (entries that are not strings are skipped).

    Returns:
        The text following the prefix, or ``None`` when no entry carries the
        prefix or ``lst`` cannot be iterated (the error is printed).
    """
    prefix = 'Public number: '
    try:
        for strng in lst:
            # A note without text must not abort the search for a later match.
            if isinstance(strng, str) and strng.startswith(prefix):
                # Slice by the prefix length so test and slice cannot drift apart.
                return strng[len(prefix):]
    except Exception as e:
        print(e)
    return None

In [93]:
def get_manuscript(record):
    """Return field 770 subfield o of ``record`` in lower case.

    The notebook stores this value in the MANUSCRIPT column. When the field
    or subfield is unavailable the resulting error is printed and ``None`` is
    returned.
    """
    try:
        return record['770']['o'].lower()
    except Exception as err:
        print(err)
        return None

In [18]:
def get_parent(record):
    """Return field 772 subfield o of ``record`` in lower case.

    The notebook stores this value in the PARENT column. Any error raised by
    the lookup or the lower-casing is printed, and ``None`` is returned.
    """
    try:
        linked_id = record['772']['o']
    except Exception as exc:
        print(exc)
        return None
    try:
        return linked_id.lower()
    except Exception as exc:
        print(exc)
        return None

In [107]:
# Iterate over the folder. os.listdir returns names in arbitrary order, so sort
# them to make the order of the resulting rows reproducible.
for filename in sorted(os.listdir(save_path)):

    # Construct the path to the MARCXML file in the current folder
    file_path = os.path.join(save_path, filename)

    # Skip sub-directories and stray non-XML files (e.g. OS metadata files);
    # feeding those to the XML parser would abort the whole run.
    if not (os.path.isfile(file_path) and filename.lower().endswith('.xml')):
        continue

    # Parse the MARCXML file; the handle is only needed while parsing
    with open(file_path, 'rb') as marc_file:
        records = pymarc.marcxml.parse_xml_to_array(marc_file)

    # Iterate over the MARC records in the file
    # and generate a dictionary for each record
    for record in records:
        # Read the notes once instead of three times. get_note returns None
        # when it fails and may contain empty entries; both would make
        # ' '.join raise, so normalise to a list of non-empty strings.
        notes = [note for note in (get_note(record) or []) if note]

        dct = {
            'ID': get_ident(notes),
            'ALT_ID': get_alt_ident(notes),
            'AUTHOR': get_author(record),
            'TITLE': get_title(record),
            'NOTE': ' '.join(notes),
            'MANUSCRIPT': get_manuscript(record),
            'PARENT': get_parent(record),
            'PUB_PLACE': get_pub_place(record),
            'PUBLISHER': get_publisher(record),
            'PUB_DATE': get_pub_date(record)
        }
        data.append(dct)
























































































































































































































































































































































































































































































































































































































































































































































































In [20]:
def normalize_missing_value(value):
    """Replace the cataloguing placeholders ``[s.n.]`` and ``n.d.`` with the text ``'None'``.

    Parameters:
        value: a field value; anything that is not a ``str`` (for example
            ``None``) is returned unchanged.

    Returns:
        The string with every literal ``[s.n.]`` and ``n.d.`` replaced by
        ``'None'``, or ``value`` itself when it is not a string.
    """
    if not isinstance(value, str):
        return value
    # The dots are escaped on purpose: an unescaped "." matches any character,
    # so "n.d." would also rewrite ordinary words (e.g. "Canada" -> "CaNone").
    return re.sub(r'\[s\.n\.\]|n\.d\.', 'None', value)

In [21]:
def clean_value(value):
    """Strip trailing punctuation and whitespace from a string value.

    Removes any run of ``. , ! ? : ;`` and whitespace characters at the end of
    the string.

    Parameters:
        value: a field value; anything that is not a ``str`` (for example
            ``None``) is returned unchanged.

    Returns:
        The trimmed string, or ``value`` itself when it is not a string.
    """
    # Explicit type check instead of a bare ``except:``, which would also
    # swallow KeyboardInterrupt and hide genuine programming errors.
    if not isinstance(value, str):
        return value
    return re.sub(r'[.,!?:;\s]+$', '', value)

In [108]:
# Accumulator for the cleaned copies of the record dictionaries
clean_data = list()

In [109]:
# Build a cleaned copy of every record: each value first has its
# missing-value placeholders normalised, then its trailing punctuation removed.
for raw_record in data:
    clean_data.append({
        field: clean_value(normalize_missing_value(raw_value))
        for field, raw_value in raw_record.items()
    })

In [110]:
# Preview only the first few cleaned records; printing the whole list floods
# the notebook output and bloats the saved file.
print(clean_data[:3])

[{'ID': 'abercrombie1838wa66w', 'ALT_ID': '0001', 'AUTHOR': 'Abercrombie, John', 'TITLE': 'Inquiries concerning the intellectual powers and the investigation of truth', 'NOTE': 'Location: Cambridge. Identifier: Abercrombie1838wa66W. Public number: 0001', 'MANUSCRIPT': 'abercrombie1838wa66w_ms', 'PARENT': None, 'PUB_PLACE': 'London', 'PUBLISHER': 'John Murray', 'PUB_DATE': '1838'}, {'ID': 'abercrombie1838wa66w_ms', 'ALT_ID': None, 'AUTHOR': 'Darwin, Charles', 'TITLE': 'Inquiries concerning the intellectual powers and the investigation of truth', 'NOTE': 'Identifier: Abercrombie1838wa66W_MS', 'MANUSCRIPT': None, 'PARENT': 'abercrombie1838wa66w', 'PUB_PLACE': None, 'PUBLISHER': None, 'PUB_DATE': None}, {'ID': 'agassiz1850up52i', 'ALT_ID': '0017', 'AUTHOR': 'Agassiz, Louis', 'TITLE': 'Lake Superior: its character, vegetation, and animals, compared with those of other similar regions', 'NOTE': 'Inscription. Location: Cambridge. Identifier: Agassiz1850up52I. Public number: 0017', 'MANUSCRIPT

In [111]:
# One row per cleaned record; the dictionary keys become the columns
df = pd.DataFrame(data=clean_data)

In [112]:
# Show the first five rows as a quick sanity check of the frame
print(df.head(n=5))

                        ID ALT_ID             AUTHOR  \
0     abercrombie1838wa66w   0001  Abercrombie, John   
1  abercrombie1838wa66w_ms   None    Darwin, Charles   
2         agassiz1850up52i   0017     Agassiz, Louis   
3      agassiz1850up52i_ms   None    Darwin, Charles   
4         agassiz2006ft69y   0015     Agassiz, Louis   

                                               TITLE  \
0  Inquiries concerning the intellectual powers a...   
1  Inquiries concerning the intellectual powers a...   
2  Lake Superior: its character, vegetation, and ...   
3  Lake Superior: its character, vegetation, and ...   
4  Contributions to the natural history of the Un...   

                                                NOTE               MANUSCRIPT  \
0  Location: Cambridge. Identifier: Abercrombie18...  abercrombie1838wa66w_ms   
1                Identifier: Abercrombie1838wa66W_MS                     None   
2  Inscription. Location: Cambridge. Identifier: ...      agassiz1850up52i_ms   
3 

In [27]:
%matplotlib inline

In [113]:
# Merge the two link columns into one: take MANUSCRIPT where it is present and
# fall back to PARENT otherwise (presumably a row has at most one of the two —
# verify against the data).
related = df['MANUSCRIPT'].fillna(df['PARENT'])
df['RELATED'] = related
print(df.head())

                        ID ALT_ID             AUTHOR  \
0     abercrombie1838wa66w   0001  Abercrombie, John   
1  abercrombie1838wa66w_ms   None    Darwin, Charles   
2         agassiz1850up52i   0017     Agassiz, Louis   
3      agassiz1850up52i_ms   None    Darwin, Charles   
4         agassiz2006ft69y   0015     Agassiz, Louis   

                                               TITLE  \
0  Inquiries concerning the intellectual powers a...   
1  Inquiries concerning the intellectual powers a...   
2  Lake Superior: its character, vegetation, and ...   
3  Lake Superior: its character, vegetation, and ...   
4  Contributions to the natural history of the Un...   

                                                NOTE               MANUSCRIPT  \
0  Location: Cambridge. Identifier: Abercrombie18...  abercrombie1838wa66w_ms   
1                Identifier: Abercrombie1838wa66W_MS                     None   
2  Inscription. Location: Cambridge. Identifier: ...      agassiz1850up52i_ms   
3 

In [114]:
# MANUSCRIPT and PARENT are now folded into RELATED, so drop them.
# errors='ignore' keeps this cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
df = df.drop(columns=['MANUSCRIPT', 'PARENT'], errors='ignore')
print(df.head())

                        ID ALT_ID             AUTHOR  \
0     abercrombie1838wa66w   0001  Abercrombie, John   
1  abercrombie1838wa66w_ms   None    Darwin, Charles   
2         agassiz1850up52i   0017     Agassiz, Louis   
3      agassiz1850up52i_ms   None    Darwin, Charles   
4         agassiz2006ft69y   0015     Agassiz, Louis   

                                               TITLE  \
0  Inquiries concerning the intellectual powers a...   
1  Inquiries concerning the intellectual powers a...   
2  Lake Superior: its character, vegetation, and ...   
3  Lake Superior: its character, vegetation, and ...   
4  Contributions to the natural history of the Un...   

                                                NOTE PUB_PLACE  \
0  Location: Cambridge. Identifier: Abercrombie18...    London   
1                Identifier: Abercrombie1838wa66W_MS      None   
2  Inscription. Location: Cambridge. Identifier: ...    Boston   
3                    Identifier: Agassiz1850up52I_MS      None

In [115]:
# Write the final table to CSV without the numeric row index.
# NOTE(review): the date in the file name is hard-coded — confirm it is still
# the intended output name before re-running.
df.to_csv(path_or_buf='2024-01-16_darwinslibrary.csv', index=False)