In [1]:
import csv
import json
import requests

import glob
import os
from os.path import join

In [2]:
endpoint = 'https://www.loc.gov/free-to-use'
parameters = {
    'fo' : 'json'
}

In [3]:
collection = 'birds'

In [4]:
collection_list_response = requests.get(endpoint + '/' + collection, params=parameters)

In [5]:
collection_list_response.url

'https://www.loc.gov/free-to-use/birds?fo=json'

In [6]:
collection_json = collection_list_response.json()

In [7]:
collection_json.keys()

dict_keys(['breadcrumbs', 'content', 'content_is_post', 'description', 'expert_resources', 'next', 'next_sibling', 'options', 'pages', 'portal', 'previous', 'previous_sibling', 'site_type', 'timestamp', 'title', 'type'])

In [8]:
for k in collection_json['content']['set']['items']:
    print(k)

{'image': '/static/portals/free-to-use/public-domain/birds/birds-1.jpg', 'link': '/resource/gtfy.01168/', 'title': 'Bird watcher, Central Park. Photo by Bernard Gotfryd, 1970-1980. Prints & Photographs Division'}
{'image': '/static/portals/free-to-use/public-domain/birds/birds-2.jpg', 'link': '/resource/cph.3a49143/', 'title': "The Humming-Bird ... ; the Cuckow. In: Cock Robin's Death and Funeral, 1780. Rare Book Division"}
{'image': '/static/portals/free-to-use/public-domain/birds/birds-3.jpg', 'link': '/resource/cph.3b52232/', 'title': 'Summer red bird. Tanager. Color engraving by R. Havell, after drawing by John J. Audubon. Elephant folio, 1827-1838. Rare Book Division'}
{'image': '/static/portals/free-to-use/public-domain/birds/birds-4.jpg', 'link': '/resource/jpd.02954/', 'title': 'Minowa kanasugi mikawashima. (Red-crowned crane) Color woodcut, 1857. Prints & Photographs Division'}
{'image': '/static/portals/free-to-use/public-domain/birds/birds-5.jpg', 'link': '/resource/ds.10210

In [9]:
len(collection_json['content']['set']['items'])

48

In [10]:
collection_json['content']['set']['items'][0].keys()

dict_keys(['image', 'link', 'title'])

In [11]:
collection_set_list = os.path.join('data','collection_set_list.csv')
headers = ['image','link','title']

with open(collection_set_list, 'w', encoding='utf-8', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=headers)
    writer.writeheader()
    for item in collection_json['content']['set']['items']:
        
        # clean up errant spaces in the title fields
        item['title'] = item['title'].rstrip()
        writer.writerow(item)
    print('wrote',collection_set_list)

wrote data/collection_set_list.csv


In [12]:
# update endpoint info
endpoint = 'https://www.loc.gov'
parameters = {
    'fo' : 'json'
}

In [13]:
#   NAME CHANGE FOR FOLDER

# run this cell to confirm that you have a location for the JSON files
item_metadata_directory = os.path.join('data','ftu_birds_metadata')

if os.path.isdir(item_metadata_directory):
    print(item_metadata_directory,'exists')
else:
    os.mkdir(item_metadata_directory)
    print('created',item_metadata_directory)

data/ftu_birds_metadata exists


In [14]:
item_count = 0
error_count = 0
file_count = 0

data_directory = 'data'
item_metadata_directory = 'ftu_birds_metadata'
item_metadata_file_start = 'item_metadata'
json_suffix = '.json'

collection_set_list = os.path.join('data','collection_set_list.csv')

with open(collection_set_list, 'r', encoding='utf-8', newline='') as f:
    reader = csv.DictReader(f, fieldnames=headers)
    for item in reader:
        if item['link'] == 'link':
            continue
        # these resource links could redirect to item pages, but currently don't work
        if '?' in item['link']:
            resource_ID = item['link']
            short_ID = item['link'].split('/')[2]
            item_metadata = requests.get(endpoint + resource_ID + '&fo=json')
            print('requested',item_metadata.url,item_metadata.status_code)
            if item_metadata.status_code != 200:
                print('requested',item_metadata.url,item_metadata.status_code)
                error_count += 1
                continue
            try:
                item_metadata.json()
            except: #basically this catches all of the highsmith photos with hhh in the ID
                error_count += 1
                print('no json found')
                continue
            fout = os.path.join(data_directory, item_metadata_directory, str(item_metadata_file_start + '-' + short_ID + json_suffix))
            with open(fout, 'w', encoding='utf-8') as json_file:
                json_file.write(json.dumps(item_metadata.json()['item']))
                file_count += 1
                print('wrote', fout)
            item_count += 1
        else:
            resource_ID = item['link']
            short_ID = item['link'].split('/')[2]
            item_metadata = requests.get(endpoint + resource_ID, params=parameters)
            print('requested',item_metadata.url,item_metadata.status_code)
            if item_metadata.status_code != 200:
                print('requested',item_metadata.url,item_metadata.status_code)
                error_count += 1
                continue
            try:
                item_metadata.json()
            except:
                error_count += 1
                print('no json found')
                continue
            fout = os.path.join(data_directory, item_metadata_directory, str(item_metadata_file_start + '-' + short_ID + json_suffix))
            with open(fout, 'w', encoding='utf-8') as json_file:
                json_file.write(json.dumps(item_metadata.json()['item']))
                file_count += 1
                print('wrote', fout)
            item_count += 1

print('--- mini LOG ---')
print('items requested:',item_count)
print('errors:',error_count)
print('files written:',file_count)

requested https://www.loc.gov/resource/gtfy.01168/?fo=json 200
wrote data/ftu_birds_metadata/item_metadata-gtfy.01168.json
requested https://www.loc.gov/resource/cph.3a49143/?fo=json 200
wrote data/ftu_birds_metadata/item_metadata-cph.3a49143.json
requested https://www.loc.gov/resource/cph.3b52232/?fo=json 200
wrote data/ftu_birds_metadata/item_metadata-cph.3b52232.json
requested https://www.loc.gov/resource/jpd.02954/?fo=json 200
wrote data/ftu_birds_metadata/item_metadata-jpd.02954.json
requested https://www.loc.gov/resource/ds.10210/?fo=json 200
wrote data/ftu_birds_metadata/item_metadata-ds.10210.json
requested https://www.loc.gov/resource/cph.3g05259/?fo=json 200
wrote data/ftu_birds_metadata/item_metadata-cph.3g05259.json
requested https://www.loc.gov/resource/cph.3b52391/?fo=json 200
wrote data/ftu_birds_metadata/item_metadata-cph.3b52391.json
requested https://www.loc.gov/resource/cph.3b52385/?fo=json 200
wrote data/ftu_birds_metadata/item_metadata-cph.3b52385.json
requested ht

TRANSFORMATION PART 1: Testing

In [15]:
current_loc = os.getcwd()

print(current_loc)

/Users/raynaketchum/Documents/umich/SI676/networked-services-labs


In [16]:
metadata_file_path = os.path.join('data','ftu_birds_metadata')

print(metadata_file_path)

data/ftu_birds_metadata


In [18]:
file_count = 0

for file in glob.glob('data/ftu_birds_metadata/item_metadata-*.json'):
    file_count += 1
    print(file)

print('found', file_count)

data/ftu_birds_metadata/item_metadata-ppmsca.45466.json
data/ftu_birds_metadata/item_metadata-fsa.8b22903.json
data/ftu_birds_metadata/item_metadata-hec.42940.json
data/ftu_birds_metadata/item_metadata-stereo.1s26898.json
data/ftu_birds_metadata/item_metadata-cph.3g05259.json
data/ftu_birds_metadata/item_metadata-ppmsca.26467.json
data/ftu_birds_metadata/item_metadata-cph.3b52227.json
data/ftu_birds_metadata/item_metadata-det.4a26643.json
data/ftu_birds_metadata/item_metadata-ds.12960.json
data/ftu_birds_metadata/item_metadata-fsa.8b20226.json
data/ftu_birds_metadata/item_metadata-cph.3b52381.json
data/ftu_birds_metadata/item_metadata-ihas.100004017.0.json
data/ftu_birds_metadata/item_metadata-ds.04698.json
data/ftu_birds_metadata/item_metadata-ppmsca.44352.json
data/ftu_birds_metadata/item_metadata-highsm.33347.json
data/ftu_birds_metadata/item_metadata-highsm.14976.json
data/ftu_birds_metadata/item_metadata-agc.7a00162.json
data/ftu_birds_metadata/item_metadata-gtfy.01168.json
data/f

In [19]:
list_of_item_metadata_files = list()
for file in glob.glob('data/ftu_birds_metadata/item_metadata-*.json'):
    list_of_item_metadata_files.append(file)

In [20]:
len(list_of_item_metadata_files)

48

In [21]:
list_of_item_metadata_files.sort()

for file in list_of_item_metadata_files:
    print(file)

data/ftu_birds_metadata/item_metadata-acd.2a07586.json
data/ftu_birds_metadata/item_metadata-acd.2a09222.json
data/ftu_birds_metadata/item_metadata-agc.7a00162.json
data/ftu_birds_metadata/item_metadata-cai.2a14845.json
data/ftu_birds_metadata/item_metadata-cph.3a49143.json
data/ftu_birds_metadata/item_metadata-cph.3b52227.json
data/ftu_birds_metadata/item_metadata-cph.3b52232.json
data/ftu_birds_metadata/item_metadata-cph.3b52381.json
data/ftu_birds_metadata/item_metadata-cph.3b52385.json
data/ftu_birds_metadata/item_metadata-cph.3b52391.json
data/ftu_birds_metadata/item_metadata-cph.3f05606.json
data/ftu_birds_metadata/item_metadata-cph.3g05259.json
data/ftu_birds_metadata/item_metadata-det.4a26643.json
data/ftu_birds_metadata/item_metadata-ds.04698.json
data/ftu_birds_metadata/item_metadata-ds.05086.json
data/ftu_birds_metadata/item_metadata-ds.09801.json
data/ftu_birds_metadata/item_metadata-ds.10210.json
data/ftu_birds_metadata/item_metadata-ds.12960.json
data/ftu_birds_metadata/i

In [22]:
# try first with one file, can you open the json, can you see what elements are in the json?
with open(list_of_item_metadata_files[0], 'r', encoding='utf-8') as item:
    # what are we looking at?
    print('file:',list_of_item_metadata_files[0],'\n')
    
    # load the item data
    item_data = json.load(item)
    
    for element in item_data.keys():
        print(element,':',item_data[element])

file: data/ftu_birds_metadata/item_metadata-acd.2a07586.json 

_version_ : 1709975316943863808
access_restricted : False
aka : ['https://www.loc.gov/pictures/item/2016680132/', 'http://www.loc.gov/item/2016680132/', 'http://www.loc.gov/pictures/item/2016680132/', 'https://www.loc.gov/pictures/collection/acd/item/2016680132/', 'http://www.loc.gov/pictures/collection/acd/item/2016680132/', 'http://www.loc.gov/resource/acd.2a07586/', 'http://lccn.loc.gov/2016680132', 'http://hdl.loc.gov/loc.pnp/acd.2a07586']
call_number : CD 1 - Conacher, no. 121 (B size)
campaigns : []
contributor_names : ['Conacher, John C., 1876-1947, artist', 'Life Publishing Company, copyright claimant', 'Life Publishing Company, publisher']
contributors : [{'conacher, john c.': 'https://www.loc.gov/search/?fa=contributor:conacher,+john+c.&fo=json'}, {'life publishing company': 'https://www.loc.gov/search/?fa=contributor:life+publishing+company&fo=json'}]
control_number : 
created : 2020-06-24T17:39:34Z
created_publi

In [23]:
item_data.keys()

dict_keys(['_version_', 'access_restricted', 'aka', 'call_number', 'campaigns', 'contributor_names', 'contributors', 'control_number', 'created', 'created_published', 'created_published_date', 'date', 'dates', 'description', 'digital_id', 'digitized', 'display_offsite', 'extract_timestamp', 'extract_urls', 'format', 'format_headings', 'genre', 'group', 'hassegments', 'id', 'image_url', 'index', 'item', 'language', 'languages', 'library_of_congress_control_number', 'link', 'marc', 'medium', 'medium_brief', 'mime_type', 'modified', 'notes', 'number', 'number_carrier_type', 'number_former_id', 'number_lccn', 'number_source_modified', 'online_format', 'original_format', 'other_control_numbers', 'other_formats', 'other_title', 'partof', 'related', 'repository', 'reproductions', 'resource_links', 'resources', 'rights', 'rights_advisory', 'rights_information', 'score', 'shelf_id', 'site', 'sort_date', 'source_collection', 'source_created', 'source_modified', 'subject', 'subject_headings', 'su

In [24]:
print('\ndate:',item_data['date'], type(item_data['date']))



date: 1922 <class 'str'>


In [25]:
print('\nformat:',item_data['format'][0], type(item_data['format']))


format: {'photo, print, drawing': 'https://www.loc.gov/search/?fa=original_format:photo,+print,+drawing&fo=json'} <class 'list'>


TEST: TRY IT WITH ONE EXAMPLE 

In [26]:
collection_info_csv = 'collection_items_data.csv'

headers = ['source_file', 'item_id', 'title', 'date', 'source_url', 'phys_format', 'subjects', 'rights'
]

with open(list_of_item_metadata_files[0], 'r', encoding='utf-8') as data:
    item_data = json.load(data)

    source_file = str(file)
    try:
        item_id = item_data['library_of_congress_control_number']
    except:
        item_id = item_data['url'].split('/')[-2]
    title = item_data['title']
    date = item_data['date']
    source_url = item_data['url']
    try:
        phys_format = item_data['format'][0]
    except:
        phys_format = 'Not found'
    try:
        subjects = item_data['subjects'][0]
    except:
        subjects = 'Not found'
    mime_type = item_data['mime_type']
    try:
        rights = item_data['rights_information']
    except:
        rights = 'Undetermined'
    
    
    row_dict = dict()
    # look for the item metadata, assign it to the dictionary; 
    # start with some basic elements likely (already enumerated in the headers list) :
    # source file
    row_dict['source_file'] = source_file
    # identifier
    row_dict['item_id'] = item_id
    # title
    row_dict['title'] = title
    # date
    row_dict['date'] = date
    # link
    row_dict['source_url'] = source_url
    # format
    row_dict['phys_format'] = phys_format
    # digital format
    row_dict['subjects'] = subjects
    #rights
    row_dict['rights'] = rights
    print('created row dictionary:',row_dict)

    # write to the csv
    with open(collection_info_csv, 'w', encoding='utf-8') as fout:
        writer = csv.DictWriter(fout, fieldnames=headers)
        writer.writeheader()
        writer.writerow(row_dict)
        print('wrote',collection_info_csv)

created row dictionary: {'source_file': 'data/ftu_birds_metadata/item_metadata-stereo.1s26898.json', 'item_id': '2016680132', 'title': 'The bird-lover', 'date': '1922', 'source_url': 'https://www.loc.gov/item/2016680132/', 'phys_format': {'photo, print, drawing': 'https://www.loc.gov/search/?fa=original_format:photo,+print,+drawing&fo=json'}, 'subjects': {'american': 'https://www.loc.gov/search/?fa=subject:american&fo=json'}, 'rights': 'No known restrictions on publication.'}
wrote collection_items_data.csv


Developing the structure of the CSV file 

Find a direct url to a good image file for the item. 

HEADERS!

In [28]:
# collection_info_csv = 'collection_items_data.csv'

# # set up a list for the columns in your csv; in future, this should be more automated but this works for now as you set up the crosswalk
# headers = ['source_file', 'item_id', 'title', 'date', 'source_url', 'phys_format', 'subjects', 'rights']

# # try first with one file
# with open(list_of_item_metadata_files[0], 'r', encoding='utf-8') as data:
#     # load the item data
#     item_data = json.load(data)
#     print(item_data['image_url'][3])

IndexError: list index out of range

Transformation Part 2

In [29]:
items_data_file = os.path.join(data_directory, 'collection_items_data.csv')

if os.path.isfile(items_data_file):
    os.unlink(items_data_file)
    print('removed',items_data_file)

# clear row_dict
row_dict = ()

In [30]:
from datetime import date

date_string_for_today = date.today().strftime('%Y-%m-%d') # see https://docs.python.org/3/library/datetime.html#strftime-strptime-behavior

print(date_string_for_today)

2022-12-13


In [31]:
collection_info_csv = os.path.join('data','collection_items_data.csv')
file_count = 0
items_written = 0
error_count = 0

headers = ['item_type', 'date_uploaded', 'source_file', 'item_id', 'title', 'date', 'source_url', 'phys_format', 'subjects', 'rights', "image_url" ]

for file in list_of_item_metadata_files:
    file_count += 1
    print('opening',file)
    with open(file, 'r', encoding='utf-8') as item:
        # load the item data
        try:
            item_data = json.load(item)
        except:
            print('error loading',file)
            error_count += 1
            continue

        # extract/name the data you want
        # item type
        item_type = 'Item'
        # date uplaoded
        date_uploaded = date_string_for_today
        # for checking purposes, add in the source of the info
        source_file = str(file)
        # make sure there's some unique and stable identifier
        try:
            item_id = item_data['library_of_congress_control_number']
        except:
            item_id = item_data['url'].split('/')[-2]
        title = item_data['title']
        try:
            date = item_data['date']
        except:
            date = 'n.d.'
        source_url = item_data['url']
        try:
            phys_format = item_data['format'][0]
        except:
            phys_format = 'Not found'
        try:
            subjects = item_data['subjects'][0]
        except:
            subjects = 'Not found'
        try:
            rights = item_data['rights_information']
        except:
            rights = 'Undetermined'
        try:
            image_url = item_data['image_url'][0]
        except:
            image_url = 'Not Found'

         # dictionary for the rows
        row_dict = dict()

        # look for the item metadata, assign it to the dictionary;
        # start with some basic elements likely (already enumerated in the headers list) :
        # item type
        row_dict['item_type'] = item_type
        # date uploaded
        row_dict['date_uploaded'] = date_uploaded
        # source filename
        row_dict['source_file'] = source_file
        # identifier
        row_dict['item_id'] = item_id
        # title
        row_dict['title'] = title
        # date
        row_dict['date'] = date
        # link
        row_dict['source_url'] = source_url
        # format
        row_dict['phys_format'] = phys_format
        # digital format
        row_dict['subjects'] = subjects
        #rights
        row_dict['rights'] = rights
        #image
        row_dict['image_url'] = image_url


        # write to the csv
        with open(collection_info_csv, 'a', encoding='utf-8') as fout:
            writer = csv.DictWriter(fout, fieldnames=headers)
            if items_written == 0:
                writer.writeheader()
            writer.writerow(row_dict)
            items_written += 1
            print('adding',item_id)

print('\n\n--- LOG ---')
print('wrote',collection_info_csv)
print('with',items_written,'items')
print(error_count,'errors (info not written)')


opening data/ftu_birds_metadata/item_metadata-acd.2a07586.json
adding 2016680132
opening data/ftu_birds_metadata/item_metadata-acd.2a09222.json
adding 2016681950
opening data/ftu_birds_metadata/item_metadata-agc.7a00162.json
adding 2018707354
opening data/ftu_birds_metadata/item_metadata-cai.2a14845.json
adding 2010718076
opening data/ftu_birds_metadata/item_metadata-cph.3a49143.json
adding 2006685856
opening data/ftu_birds_metadata/item_metadata-cph.3b52227.json
adding 2002718914
opening data/ftu_birds_metadata/item_metadata-cph.3b52232.json
adding 2002718919
opening data/ftu_birds_metadata/item_metadata-cph.3b52381.json
adding 2002718985
opening data/ftu_birds_metadata/item_metadata-cph.3b52385.json
adding 2002718989
opening data/ftu_birds_metadata/item_metadata-cph.3b52391.json
adding 2002718964
opening data/ftu_birds_metadata/item_metadata-cph.3f05606.json
adding 98518441
opening data/ftu_birds_metadata/item_metadata-cph.3g05259.json
adding 97515217
opening data/ftu_birds_metadata/