# Setup

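First, we import the HathiTrust Data API client along with a few standard-library modules. Then, we grab the API tokens from our `keys.py` file, which is deliberately kept out of Git so that the keys are never committed.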

In [19]:
from hathitrust_api import DataAPI
import json
from keys import ht_keys
import os
import time

In [6]:
# Data API client, constructed with the 'access' and 'secret' entries of the
# ht_keys mapping imported from keys.py
data_api = DataAPI(ht_keys['access'], ht_keys['secret'])

In [8]:
# JSON metadata file that is opened and parsed with json.load below
# (path is relative to the current working directory)
metadata_path = "554050894-1535834127.json"

In [10]:
# JSON text is UTF-8 by specification, so name the encoding explicitly
# instead of relying on the platform's locale default.
with open(metadata_path, "r", encoding="utf-8") as fp:
    data = json.load(fp)

# show the top-level fields of the parsed metadata
data.keys()

dict_keys(['id', 'type', 'description', 'created', 'extent', 'formats', 'publisher', 'title', 'visibility', 'gathers'])

In [11]:
# inspect the 'gathers' entry: a list of records, each carrying an 'htitem_id'
data['gathers']

[{'title': 'The tales of Peter Parley about America. ',
  'author': 'Goodrich, Samuel G. 1793-1860.',
  'date': '1827-00-00',
  'rights': 'pd',
  'oclc': ['39945876'],
  'lccn': None,
  'isbn': None,
  'catalog_url': 'https://catalog.hathitrust.org/Record/100221263',
  'htitem_id': 'osu.32435078698222'},
 {'title': 'The tales of Peter Parley about America. ',
  'author': 'Goodrich, Samuel G. 1793-1860.',
  'date': '1830-00-00',
  'rights': 'pd',
  'oclc': ['4358983'],
  'lccn': None,
  'isbn': None,
  'catalog_url': 'https://catalog.hathitrust.org/Record/009719143',
  'htitem_id': 'hvd.32044021161005'}]

In [12]:
# pull the HathiTrust item identifier out of every gathered record
vol_ids = [record['htitem_id'] for record in data['gathers']]
vol_ids

['osu.32435078698222', 'hvd.32044021161005']

In [30]:
def ht_download(item_id, out_dir=None):
    """
    Find the pages of a HathiTrust volume that carry the IMAGE_ON_PAGE
    feature and, optionally, download the image of each of those pages.

    :param item_id: unique HathiTrust volume identifier
    :param out_dir: destination folder for images; if None, no download

    Note: if supplied, out_dir must NOT exist yet. It is created here, so
    the caller needs write permission in its parent directory. A folder
    that already exists is taken to mean the volume was already processed:
    nothing is downloaded and the page list is returned immediately.

    Download errors are reported per page and do not stop the run.

    :rtype: list of int page sequence numbers with IMAGE_ON_PAGE feature
    """

    # metadata from API in json format (different than HT collection metadata)
    meta = json.loads(data_api.getmeta(item_id, json=True))

    # sequence gets us each page of the volume in order, with any
    # additional information that might be available for it
    sequence = meta['htd:seqmap'][0]['htd:seq']

    # a value that cannot be iterated at all simply yields no pages
    try:
        entries = list(sequence)
    except TypeError:
        entries = []

    # Check every page on its own so that a single page without usable
    # feature data does not throw away the pages that are fine.
    img_pages = []
    for page in entries:
        try:
            if 'IMAGE_ON_PAGE' in page['htd:pfeat']:
                img_pages.append(int(page['pseq']))
        except (KeyError, TypeError, ValueError):
            # KeyError:   page has no feature list or no sequence number
            # TypeError:  feature value is not a container (e.g. None)
            # ValueError: sequence number is not numeric
            continue

    # track for download progress report
    total_pages = len(img_pages)

    # if out_dir is not None (or empty), then also download page images
    if out_dir:

        # return if folder already exists (reasonable inference that volume
        # already processed)
        if os.path.isdir(out_dir):
            print("[{}] Directory already exists.".format(item_id))
            return img_pages

        # otherwise, create folder to put the images
        print("[{}] Making directory".format(out_dir))
        os.makedirs(out_dir)

        for position, page in enumerate(img_pages, start=1):
            try:
                print("[{}] Downloading page {} ({}/{})".format(item_id, page, position, total_pages))
                img = data_api.getpageimage(item_id, page)

                # one file per page inside out_dir, named after the page number
                img_out = os.path.join(out_dir, str(page) + ".jpg")

                # write out the image
                with open(img_out, 'wb') as fp:
                    fp.write(img)

            except Exception as e:
                # best effort: report the failure and carry on with the next page
                print("[{}] Error downloading page {}: {}".format(item_id, page,e))

            finally:
                # per conversation with Ryan Dubnicek @ HathiTrust
                # avoid API throttling... pause after failed requests too, so
                # a run of errors does not hit the API back to back
                time.sleep(2)

    # return the list of image pages
    return img_pages

In [29]:
# dest name is just the volume id: the images go into a new folder named
# after the volume, which ht_download creates before downloading
ht_download(vol_ids[0], out_dir=vol_ids[0])  

[osu.32435078698222] Making directory
[osu.32435078698222] Downloading page 8 (0/31)
[osu.32435078698222] Downloading page 10 (1/31)
[osu.32435078698222] Downloading page 11 (2/31)
[osu.32435078698222] Downloading page 12 (3/31)
[osu.32435078698222] Downloading page 13 (4/31)
[osu.32435078698222] Downloading page 16 (5/31)
[osu.32435078698222] Downloading page 20 (6/31)
[osu.32435078698222] Downloading page 26 (7/31)
[osu.32435078698222] Downloading page 28 (8/31)
[osu.32435078698222] Downloading page 29 (9/31)
[osu.32435078698222] Downloading page 37 (10/31)
[osu.32435078698222] Downloading page 38 (11/31)
[osu.32435078698222] Downloading page 41 (12/31)
[osu.32435078698222] Downloading page 44 (13/31)
[osu.32435078698222] Downloading page 46 (14/31)
[osu.32435078698222] Downloading page 47 (15/31)
[osu.32435078698222] Downloading page 48 (16/31)
[osu.32435078698222] Downloading page 49 (17/31)
[osu.32435078698222] Downloading page 50 (18/31)
[osu.32435078698222] Downloading page 54 (

[8,
 10,
 11,
 12,
 13,
 16,
 20,
 26,
 28,
 29,
 37,
 38,
 41,
 44,
 46,
 47,
 48,
 49,
 50,
 54,
 57,
 58,
 61,
 65,
 70,
 74,
 84,
 94,
 100,
 105,
 119]

In [32]:
# list the image pages of every volume without downloading anything
# (out_dir is left at its default of None)
for item_id in vol_ids:
    print(item_id, ht_download(item_id))

osu.32435078698222 [8, 10, 11, 12, 13, 16, 20, 26, 28, 29, 37, 38, 41, 44, 46, 47, 48, 49, 50, 54, 57, 58, 61, 65, 70, 74, 84, 94, 100, 105, 119]
hvd.32044021161005 [1, 2, 7, 14, 16, 17, 18, 19, 22, 27, 33, 35, 37, 44, 45, 46, 47, 48, 49, 52, 55, 57, 58, 59, 60, 61, 65, 68, 69, 71, 73, 74, 76, 80, 86, 88, 103, 128, 129, 130]
