# Setup

First, we import the HathiTrust Data API library. Then we load our API tokens from the `keys.py` file, which is deliberately kept out of Git because it holds the access and secret keys.

In [9]:
from hathitrust_api import DataAPI
import json
from keys import ht_keys
import os
import time

In [10]:
# Data API client built from the access/secret key pair loaded from keys.py;
# this single client object is reused for every API request made below
data_api = DataAPI(ht_keys['access'], ht_keys['secret'])

In [11]:
# Path to the JSON metadata file describing the collection whose volumes we
# want to process (presumably a HathiTrust collection export -- confirm source)
metadata_path = "554050894-1535834127.json"

In [12]:
# Load the collection metadata. JSON is UTF-8 by specification, so name the
# encoding explicitly instead of inheriting the platform default, which is
# not UTF-8 on every system and can mis-decode non-ASCII titles or authors.
with open(metadata_path, "r", encoding="utf-8") as fp:
    data = json.load(fp)

# show the top-level fields available in the metadata
data.keys()

dict_keys(['id', 'type', 'description', 'created', 'extent', 'formats', 'publisher', 'title', 'visibility', 'gathers'])

In [13]:
# 'gathers' holds one record per volume in the collection; display them to
# see which fields (title, author, htitem_id, ...) each record carries
data['gathers']

[{'title': 'The tales of Peter Parley about America. ',
  'author': 'Goodrich, Samuel G. 1793-1860.',
  'date': '1827-00-00',
  'rights': 'pd',
  'oclc': ['39945876'],
  'lccn': None,
  'isbn': None,
  'catalog_url': 'https://catalog.hathitrust.org/Record/100221263',
  'htitem_id': 'osu.32435078698222'},
 {'title': 'The tales of Peter Parley about America. ',
  'author': 'Goodrich, Samuel G. 1793-1860.',
  'date': '1830-00-00',
  'rights': 'pd',
  'oclc': ['4358983'],
  'lccn': None,
  'isbn': None,
  'catalog_url': 'https://catalog.hathitrust.org/Record/009719143',
  'htitem_id': 'hvd.32044021161005'}]

In [14]:
# Pull the HathiTrust volume identifier out of every gathered record
vol_ids = list(map(lambda record: record['htitem_id'], data['gathers']))
vol_ids

['osu.32435078698222', 'hvd.32044021161005']

In [None]:
def _image_page_numbers(sequence, feature='IMAGE_ON_PAGE'):
    """
    Collect the sequence numbers of the pages that carry a given feature.

    :param sequence: list of per-page dicts taken from the volume metadata
    :param feature: page feature to look for in each page's 'htd:pfeat'

    Note: malformed pages (not a dict, no usable 'htd:pfeat', or a missing
    or non-numeric 'pseq') are skipped one at a time, so a single bad page
    never discards the pages that were read successfully

    :rtype list of int page sequence numbers carrying the feature
    """

    # anything other than a list of pages yields no image pages
    if not isinstance(sequence, (list, tuple)):
        return []

    found = []
    for page in sequence:
        if not isinstance(page, dict):
            continue

        # pages without features may have no 'htd:pfeat' or a null one
        feats = page.get('htd:pfeat')
        if not isinstance(feats, (str, list, tuple)) or feature not in feats:
            continue

        try:
            found.append(int(page['pseq']))
        except (KeyError, TypeError, ValueError):
            # no usable sequence number, so the page cannot be requested
            continue

    return found


def ht_download(item_id, out_dir=None):
    """
    Find the pages of a HathiTrust volume flagged IMAGE_ON_PAGE and,
    optionally, download the image of each of those pages.

    :param item_id: unique HathiTrust volume identifier
    :param out_dir: destination for images; if None, no download

    Note: if supplied, out_dir is created by this function (including any
    missing parent folders), so the caller must be able to write there.
    If out_dir already exists, the volume is assumed to have been processed
    and nothing is downloaded.

    Note: a page that fails to download is reported and skipped; it does
    not stop the remaining pages.

    :rtype list of pages with IMAGE_ON_PAGE feature
    """

    # metadata from API in json format (different than HT collection metadata)
    meta = json.loads(data_api.getmeta(item_id, json=True))

    # sequence gets us each page of the PDF in order, with any
    # additional information that might be available for it
    sequence = meta['htd:seqmap'][0]['htd:seq']

    # find pages with the image feature, tolerating pages whose feature
    # list or sequence number is missing or malformed
    img_pages = _image_page_numbers(sequence)

    # track for download progress report
    total_pages = len(img_pages)

    # if out_dir is not None, then also download page images
    if out_dir:

        # return if folder already exists (reasonable inference that volume already processed)
        if os.path.isdir(out_dir):
            print("[{}] Directory already exists.".format(item_id))
            return img_pages

        # otherwise, create folder to put the images
        print("[{}] Making directory {}".format(item_id, out_dir))
        os.makedirs(out_dir)

        for i, page in enumerate(img_pages):
            try:
                print("[{}] Downloading page {} ({}/{})".format(item_id, page, i+1, total_pages))
                img = data_api.getpageimage(item_id, page)

                # store inside out_dir, named after the page sequence number
                img_out = os.path.join(out_dir, str(page) + ".jpg")

                # write out the image
                with open(img_out, 'wb') as fp:
                    fp.write(img)

            except Exception as e:
                print("[{}] Error downloading page {}: {}".format(item_id, page,e))

            # per conversation with Ryan Dubnicek @ HathiTrust
            # avoid API throttling... pause after failures too, so an
            # error is never followed by an immediate new request
            time.sleep(2)

    # return the list of image pages
    return img_pages

In [None]:
# Fetch each volume into its own folder under items/hathitrust and report
# which of its pages carry images
for item_id in vol_ids:
    target_dir = os.path.join("items", "hathitrust", item_id)
    image_pages = ht_download(item_id, out_dir=target_dir)
    print(item_id, image_pages)

[items\hathitrust\osu.32435078698222] Making directory
[osu.32435078698222] Downloading page 8 (1/31)
[osu.32435078698222] Downloading page 10 (2/31)
[osu.32435078698222] Downloading page 11 (3/31)
[osu.32435078698222] Downloading page 12 (4/31)
[osu.32435078698222] Downloading page 13 (5/31)
[osu.32435078698222] Downloading page 16 (6/31)
[osu.32435078698222] Downloading page 20 (7/31)
[osu.32435078698222] Downloading page 26 (8/31)
[osu.32435078698222] Downloading page 28 (9/31)
[osu.32435078698222] Downloading page 29 (10/31)
[osu.32435078698222] Downloading page 37 (11/31)
[osu.32435078698222] Downloading page 38 (12/31)
[osu.32435078698222] Downloading page 41 (13/31)
[osu.32435078698222] Downloading page 44 (14/31)
[osu.32435078698222] Downloading page 46 (15/31)
[osu.32435078698222] Downloading page 47 (16/31)
[osu.32435078698222] Downloading page 48 (17/31)
[osu.32435078698222] Downloading page 49 (18/31)
[osu.32435078698222] Downloading page 50 (19/31)
[osu.32435078698222] Dow