In [49]:
import base64
import hashlib
import json
import os
import uuid
from email.utils import formatdate
from pathlib import Path
from urllib.parse import parse_qs, urlencode, urlsplit

import pandas as pd
import requests
from PIL import Image

In [2]:
# Shared HTTP session; it is passed as the `session` argument to the edan_request calls below.
s = requests.Session()

In [3]:
# EDAN credentials are read from the environment rather than written into the notebook.
# os.environ[...] raises KeyError if either variable is not set.
APP_ID = os.environ['APP_ID']
APP_KEY = os.environ['APP_KEY']

In [4]:
def edan_request(session, query, app_id, app_key, start=0, rows=10, timeout=30):
    """Run one signed search request against the EDAN collections endpoint.

    The request is authenticated with four headers: the application id, a
    fresh random nonce, the request date, and a signature. The signature is
    the base64 encoding of the SHA-1 hex digest of the newline-joined
    ``nonce``, URL-encoded query string, request date and ``app_key``.

    Args:
        session: Object with a ``requests.Session``-compatible ``get`` method.
        query: Search string sent as the ``q`` parameter.
        app_id: Application id sent in the ``X-AppId`` header.
        app_key: Secret key; only used to build the signature, never sent.
        start: Offset of the first record to return.
        rows: Number of records to request.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status
            (raised by ``raise_for_status`` on a real ``requests`` response).
    """
    URL_BASE = 'http://edan.si.edu/metadata/v2.0/collections/search.htm'
    request_date = formatdate()
    nonce = str(uuid.uuid4())
    query_params = {'q': query,
                    'rows': rows,
                    'start': start}
    # The signature covers the URL-encoded form of the same parameters that
    # are handed to session.get() below.
    query_param_string = urlencode(query_params)
    string_to_sign = '\n'.join([nonce, query_param_string, request_date, app_key])

    hashed_string = hashlib.sha1(string_to_sign.encode()).hexdigest()
    # Decode to str so every header value has the same type.
    encoded_string = base64.b64encode(hashed_string.encode()).decode('ascii')
    request_headers = {'X-AppId': app_id,
                       'X-Nonce': nonce,
                       'X-RequestDate': request_date,
                       'X-AuthContent': encoded_string}

    r = session.get(URL_BASE, params=query_params, headers=request_headers,
                    timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to parse an error page as JSON.
    r.raise_for_status()
    return r.json()

In [5]:
# Smoke test: request a single record for the '2007.1.69*' query and display the raw response.
test_call = edan_request(s, '2007.1.69*', APP_ID, APP_KEY, rows=1)
test_call

{'rows': [{'id': 'edanmdm-nmaahc_2007.1.69.20.32.B',
   'title': 'Studio Portrait of a Child Sitting on a Sofa',
   'unitCode': 'NMAAHC',
   'linkedId': '0',
   'type': 'edanmdm',
   'url': 'edanmdm:nmaahc_2007.1.69.20.32.B',
   'content': {'descriptiveNonRepeating': {'title_sort': 'STUDIO PORTRAIT OF A CHILD SITTING ON A SOFA',
     'title': {'content': 'Studio Portrait of a Child Sitting on a Sofa',
      'label': 'Object Name'},
     'online_media': {'mediaCount': '1',
      'media': [{'content': 'http://ids.si.edu/ids/deliveryService?id=NMAAHC-HCA_20.32.B',
        'idsId': 'NMAAHC-HCA_20.32.B',
        'thumbnail': 'http://ids.si.edu/ids/deliveryService?id=NMAAHC-HCA_20.32.B',
        'type': 'Images'}]},
     'record_ID': 'nmaahc_2007.1.69.20.32.B',
     'data_source': 'National Museum of African American History and Culture',
     'unit_code': 'NMAAHC'},
    'indexedStructured': {'topic': ['American South',
      'Communities',
      'Segregation',
      'Children',
      'Photo

In [6]:
# Top-level keys of the search response.
test_call.keys()

dict_keys(['rows', 'facetQueries', 'facets', 'start', 'rowCount'])

In [7]:
# Number of records in this response (the test call asked for rows=1).
print(len(test_call['rows']))

1


In [8]:
# 'rowCount' as reported by the response; presumably the total number of matches for the query.
print(test_call['rowCount'])

4271


In [9]:
from tqdm import tqdm_notebook

In [10]:
STEP = 100
QUERY = '2007.1.69*'

# Ask the API how many records match rather than hard-coding the total, so the
# loop stays correct if the collection grows or the query changes.
# int() guards against the count arriving as a string.
total_rows = int(edan_request(s, QUERY, APP_ID, APP_KEY, rows=1)['rowCount'])

all_rows = []

# Page through the result set STEP records at a time.
for start in tqdm_notebook(range(0, total_rows, STEP)):
    edan_json = edan_request(s, QUERY, APP_ID, APP_KEY, start, STEP)
    all_rows.extend(edan_json['rows'])
print(len(all_rows))

HBox(children=(IntProgress(value=0, max=43), HTML(value='')))


4271


In [11]:
# Persist the raw metadata so later steps do not need to hit the API again.
metadata_path = Path('data/metadata/edan_anderson_photos.json')
# Create the output directory on first run; open() alone would raise FileNotFoundError.
metadata_path.parent.mkdir(parents=True, exist_ok=True)
with open(metadata_path, 'w', encoding='utf-8') as json_out:
    json.dump(all_rows, json_out, indent=2)

In [23]:
# Inspect the 'descriptiveNonRepeating' section of the first record; it contains the 'online_media' entries.
all_rows[0]['content']['descriptiveNonRepeating']

{'title_sort': 'STUDIO PORTRAIT OF A CHILD SITTING ON A SOFA',
 'title': {'content': 'Studio Portrait of a Child Sitting on a Sofa',
  'label': 'Object Name'},
 'online_media': {'mediaCount': '1',
  'media': [{'content': 'http://ids.si.edu/ids/deliveryService?id=NMAAHC-HCA_20.32.B',
    'idsId': 'NMAAHC-HCA_20.32.B',
    'thumbnail': 'http://ids.si.edu/ids/deliveryService?id=NMAAHC-HCA_20.32.B',
    'type': 'Images'}]},
 'record_ID': 'nmaahc_2007.1.69.20.32.B',
 'data_source': 'National Museum of African American History and Culture',
 'unit_code': 'NMAAHC'}

In [21]:
# Full key path from a record down to the 'content' value (the image URL) of its first media item.
all_rows[0]['content']['descriptiveNonRepeating']['online_media']['media'][0]['content']

'http://ids.si.edu/ids/deliveryService?id=NMAAHC-HCA_20.32.B'

In [22]:
# Collect the 'content' URL of the first media item of every record.
image_urls = [
    record['content']['descriptiveNonRepeating']['online_media']['media'][0]['content']
    for record in all_rows
]
len(image_urls)

4271

In [46]:
def download_image(url, directory='data/images', session=None, timeout=30):
    """Download one image and report its id and pixel dimensions.

    The image id is taken from the ``id`` query parameter of ``url`` and is
    used as the file name (``<id>.jpg``) inside ``directory``.

    Args:
        url: Image URL containing an ``id`` query parameter.
        directory: Folder the image is written to; created if missing.
        session: Optional object with a ``requests``-compatible ``get``
            method. When omitted, ``requests.get`` is used.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with key ``'id'``. ``'width'`` and ``'height'`` are added only
        when the response was a successful JPEG that was saved to disk; for
        any other response nothing is written and only ``'id'`` is returned.

    Raises:
        ValueError: If ``url`` has no ``id`` query parameter.
    """
    # Parse the query string properly so extra parameters (e.g. '&max=200')
    # do not end up inside the id / file name.
    image_ids = parse_qs(urlsplit(url).query).get('id')
    if not image_ids:
        raise ValueError(f'No "id" query parameter in image URL: {url!r}')
    image_id = image_ids[0]
    return_dict = {'id': image_id}

    directory = Path(directory)
    directory.mkdir(parents=True, exist_ok=True)
    filename = directory / (image_id + '.jpg')

    http = session if session is not None else requests
    r = http.get(url, timeout=timeout)

    # Compare only the media type: the header may be absent or carry
    # parameters after a ';'.
    content_type = r.headers.get('Content-Type', '').split(';')[0].strip().lower()
    if r.ok and content_type == 'image/jpeg':
        with open(filename, 'wb') as image_path:
            image_path.write(r.content)
        with Image.open(filename) as im:
            return_dict['width'], return_dict['height'] = im.size
    return return_dict

In [47]:
# Try the downloader on a single URL before looping over many.
test_result = download_image(image_urls[0])
test_result

{'id': 'NMAAHC-HCA_20.32.B', 'width': 1417, 'height': 1812}

In [48]:
# Download only the first 100 images and keep each call's result dict.
image_results = [download_image(url) for url in tqdm_notebook(image_urls[:100])]

HBox(children=(IntProgress(value=0), HTML(value='')))

In [50]:
# Build a table with one row per download result dict and preview the first rows.
image_df = pd.DataFrame(image_results)
image_df.head()

Unnamed: 0,height,id,width
0,1812,NMAAHC-HCA_20.32.B,1417
1,1782,NMAAHC-HCA_4.36.B-000001,1398
2,1794,NMAAHC-HCA_10.34.D,1447
3,1416,NMAAHC-HCA_15.38.A,1752
4,1404,NMAAHC-HCA_7.2.D,1776


In [51]:
# Put the identifier first, then write the table as a tab-separated file without the index.
column_order = ['id', 'height', 'width']
image_df = image_df[column_order]
image_df.to_csv('data/images/image_results.tsv', sep='\t', index=False)