# Data Collection - Harvard
In this notebook, I'll collect data from Harvard Museum API.  
Information can be found here: https://github.com/harvardartmuseums/api-docs

In [1]:
import pandas as pd
import numpy as np
import requests
import json

import pickle

import time
from PIL import Image

In [1]:
# getting keys from local drive
def get_keys(path):
    with open(path) as f: 
        return json.load(f)

In [3]:
# path = r"path"

In [4]:
api_key = get_keys(f"{path}/harvard_mus_api.json")["api_key"]

### Check classifications

In [5]:
url = "https://api.harvardartmuseums.org/classification"

url_params = {
    "apikey": api_key,
}

resp = requests.get(url, params = url_params)
print(resp.status_code)
# good if 200

200


Checking what type of classifications available from this API.

In [None]:
classifications = []

n = int(resp.json()['info']['pages']) # getting the page number 
        
for i in range(n):
    url_params["page"] = i
    print(f"page {i}")
    
    resp = requests.get(url, params = url_params)

    try: 
        classifications.extend(resp.json()['records']) # add it to the list
    except:
        print(f"Error on page {i+1}") # let me know if there's an error

In [22]:
classifications = pd.DataFrame(classifications)

In [29]:
classifications.sort_values(by = 'objectcount', ascending = False)[0:10]

Unnamed: 0,objectcount,name,id,lastupdate,classificationid
59,84315,Photographs,17,2020-09-15T04:29:44-0400,17
20,27702,Drawings,21,2020-09-15T04:29:44-0400,21
15,6978,Paintings,26,2020-09-15T04:29:44-0400,26
5,6978,Paintings,26,2020-09-15T04:29:44-0400,26
21,6244,Sculpture,30,2020-09-15T04:29:44-0400,30
35,6190,Vessels,57,2020-09-15T04:29:44-0400,57
49,5898,Seals,189,2020-09-15T04:29:44-0400,189
43,4880,Straus Materials,959,2020-09-15T04:29:44-0400,959
25,4800,Fragments,94,2020-09-15T04:29:44-0400,94
48,4388,Manuscripts,185,2020-09-15T04:29:44-0400,185


## Collecting Full Data
It seems like paintings alone could give us enough data for now. I'll collect all paintings data and download each images.

In [166]:
url = "https://api.harvardartmuseums.org/object"

url_params = {
    "apikey": api_key, 
    "classification": 'Paintings'
}

resp = requests.get(url, params = url_params)

resp.status_code

200

In [None]:
# getting the number of pages
n = int(resp.json()['info']['pages'])
full_db = []

for i in range(219,n):
    url_params['page'] = i
    resp = requests.get(url, params = url_params)
    try: 
        full_db.extend(resp.json()['records'])
    except:
        print(f"error on page {i+1}")
              
    pause = np.random.randint(1, 60)
    print(f'{i+1}/{n} complete. pausing for {pause} secs ...')
    time.sleep(pause) # buffer

In [197]:
df = pd.DataFrame(full_db)

In [198]:
# saving
full_df = df.drop_duplicates(subset = 'id')
filename = 'PKL/raw_data_Harvard.pkl'
full_df.to_pickle(filename)

In [6]:
# loading data
# full_df = pd.read_pickle('PKL/raw_data_Harvard.pkl')

### Downloading Images
Now I'll take the url in the dataset and download them to the local drive.

In [7]:
image_id_set = full_df[['id', 'primaryimageurl']]

In [28]:
# downloading images
def download_img(df): 
    url = str(df['primaryimageurl'])
    print(f'connecting {url}...')
    if url.startswith('http'):
        resp = requests.get(url)
        print(resp.status_code)
        
        # writing to a local drive
        fpath = f"IMAGES/HARVARD/{df['id']}.jpg"
        with open(fpath, 'wb') as fp:
            fp.write(resp.content)

        # open and resize it to smaller thumbnail
        img = Image.open(fpath)
        img.thumbnail((500, 500))
        img.save(fpath)
    else:
        print('error')
    
    # give some buffer to distribute traffic
    pause = np.random.randint(1, 60)
    print(f'pausing for {pause} secs ...')
    time.sleep(pause) # buffer
    
    return df

In [None]:
# run it
image_id_set.apply(lambda x: download_img(x), axis = 1)