# to do:
- DONE: Paginate
- DONE: Figure out what's up with Mutual CSV formatting issues (avoided the issue by cutting off at 1,000 characters)
- DONE: Remove extra lines in the CSV
- DONE: Search only for collections (aka, "resources") and items within collections (aka, "archival records")
- Check to see if assumptions about notes aren't correct

In [2]:
# Import some stuff you'll use
import requests
import json
import csv

# Change these: Add your credentials

In [3]:
USER = '#'
PASS = '#'
HOST = '#' #don't end with a slash
def aspace_auth(host, username, password):
    auth = requests.post(HOST + '/users/' + username + '/login',
                        params={'password' : password})
    if auth.status_code == 200:
        token = auth.json()['session']
        headers = {'X-ArchivesSpace-Session': token}
        return(headers)
    else:
        return(False)

headers = aspace_auth(HOST, USER, PASS)
print(headers)

{'X-ArchivesSpace-Session': '51df86c569b69f53005b0a5cc41fec00b035b2ccbc662f9a316fec25a1a3803f'}


# Search for any aspace records of any type with a given query term in any notes field

In [8]:
# CHANGE YOUR SEARCH TERM AND PAGE SIZES HERE!
q = 'holdings'
LAST_PAGE = 6



In [9]:
# Set things up so the filename is based on your search term
filename = "search_for_" + q + ".csv"

# Create and open the file 
f=csv.writer(open(filename, 'w', newline=''),delimiter=',')

#Create the first row of column headers
f.writerow(['uri','title','identifier','external_id','record_type','note_with_search_term','note_type'])

78

In [10]:
# LET'S TEST!

#Reset page to "1"
page = 1
page_num = str(page)

#While page is less than whatever you entered above as the value for "LAST_PAGE"
while page <= LAST_PAGE:

    # Here is the actual search we're sending to ArchivesSpace!
    # First we set up the crazy complicated URL for an advanced query of the "notes" field.
    endpoint = '/repositories/2/search?page=' + page_num + '&type[]=archival_object&type[]=resource&aq={"query":{"field":"notes","value":"' + q + '", "jsonmodel_type":"field_query","negated":false,"literal":true}}'
    # Then we send that search up to ArchivesSpace
    results_full = requests.get(HOST + endpoint, headers=headers)
    # Make sure our python script knows to read the JSON results as python data structures.
    results_full=results_full.json()
    
    #Chris, Dolsy, and Rachel stopped here. 
    
    # The results include a bunch of facet & page information at the top, which we want to skip.
    # Below all the facet and page info, are the actual results. 
    # We'll isolate the part of the results that are really the results, excluding the facets & page info. 
    results=results_full['results']
    print(results)


    # Add all the data!
    # For each record returned by the search above . . . 
    for result in results:
        # Define all the fields that you'd like put into the CSV.
        uri = result.get('uri')
        title = result.get('title')
        identifier = result.get('identifier')
        external_id = result.get('external_id')
        record_type = result.get('types')
        aspace_json = json.loads(result.get('json'))
        # Below, we are making the assumption that each record will have the search term in only one note, 
        # and each multi-part note actually only has one subnote 
        # This returns the content of the first note that has the search term.
        # Note for the user of this data: If you don't see the search term as expected, 
        # it may be in the record but not in the spreadsheet, so check ASpace. 
        notes = aspace_json['notes']
        
        for note in notes:
            try:
                subnotes = note.get('subnotes')
                content = subnotes[0]['content']
                #cut off the note content field if it's more than 1,000 characters, and add an elipse at the end
                content = (content[:75] + '. . . [This note was cut off. Please check ASpace]') if len(content) > 1000 else content
                note_type = note['type']
            except:
                content = note['content'][0]
                #cut off the note content field if it's more than 1,000 characters, and add an elipse at the end
                content = (content[:75] + '. . . [This note was cut off. Please check ASpace]') if len(content) > 1000 else content
                note_type = note['type']
            if q in content:
                note_with_search_term = content
        f.writerow([uri,title,identifier,external_id,record_type,note_with_search_term,note_type])
    # finally, add 1 to the page so the next time the loop
    # runs it will get the next page
    page = page + 1
    page_num = str(page)
    
# When everything is done, print below the page information
last_page = results_full['last_page']
last_page = str(last_page)
print('Total pages: ' + last_page)
print('Results are saved to: ' + filename)

[{'type_enum_s': ['odd'], 'four_part_id': 'COR.0002.MS', 'last_modified_by': 'admin', 'primary_type': 'resource', 'system_mtime': '2017-10-17T12:27:24Z', 'publish': True, 'create_time': '2016-07-05T16:02:49Z', 'json': '{"lock_version":0,"title":"Constance Stuart Larrabee collection","publish":true,"restrictions":false,"ead_location":"http://library.gwu.edu/ead/XXXXXX.xml","finding_aid_title":"\\n","finding_aid_author":"Finding aid prepared by Special Collections Research Center, The George Washington University","finding_aid_language":"Finding aid written in <language encodinganalog=\\"language\\">English</language>","created_by":"admin","last_modified_by":"admin","create_time":"2016-07-05T16:02:49Z","system_mtime":"2017-10-17T12:27:24Z","user_mtime":"2016-07-05T16:02:49Z","suppressed":false,"id_0":"COR.0002.MS","language":"eng","level":"collection","finding_aid_description_rules":"dacs","finding_aid_status":"unprocessed","jsonmodel_type":"resource","external_ids":[{"external_id":"1590

# ASpace search queries that didn't work. 

In [None]:
query="q:Anything"
search = requests.get(HOST + '/search?', headers=headers, data=query)
search

In [None]:
endpoint = '/search?page=1&page_size=1&q="/repositories/2"'

results = requests.get(HOST + endpoint, headers=headers)
results