# Setup

First, we import the library. Then, we grab the API tokens from our `keys.py` file (not in Git!).

Before running this code, check that you are in the `ml-mhl` environment.

```bash
conda env list
```

In [43]:
import gzip
import internetarchive as ia
from keys import ia_keys
import os
import requests
import sys
import xml.etree.ElementTree as ET

In [20]:
ia.configure(ia_keys['username'], ia_keys['password'])

'C:\\Users\\stephen-krewson/.ia'

In [21]:
# sample search
query = "peter parley date:[1825 TO 1830] mediatype:texts"
vol_ids = [result['identifier'] for result in ia.search_items(query)]
vol_ids

['UF00003119', 'talespeterparle00goodgoog']

In [59]:
def ia_download(item_id, out_dir=None):
    """
    :param item_id: unique Internet Archive volume identifier
    :param out_dir: destination for images; if None, no download
    
    Note: if supplied, out_dir must be an existing directory and
    the caller must have write permissions in that directory
    
    :rtype list of pages with one or more blockType=Picture in OCR data
    """

    print("[{}] Starting processing".format(item_id))
    
    # See common formats for book with:
    # ia metadata formats peterparleysmet00goodgoog
    returned_files = list(ia.get_files(item_id, formats=["Abbyy GZ"]))
    
    # make sure something got returned
    if len(returned_files) > 0:
        abbyy_file = returned_files[0].name
    else:
        print("[{}] Could not get Abbyy file".format(item_id))
        return None
    
    # download the abbyy file to CWD
    ia.download(item_id, formats=["Abbyy GZ"], ignore_existing=True, destdir=os.getcwd(), no_directory=True)
    
    # collect the pages with at least one picture block
    img_pages = []
    
    with gzip.open(abbyy_file) as fp:
        tree = ET.parse(fp)
        document = tree.getroot()
        for i, page in enumerate(document):
            for block in page:
                try:
                    if block.attrib['blockType'] == 'Picture':
                        img_pages.append(i)
                        break
                except KeyError:
                    continue
    
    # 0 is not a valid page for making GET requests to IA, yet sometimes
    # it's in the zipped Abbyy file
    img_pages = [page for page in img_pages if page > 0]
    
    # track for download progress report
    total_pages = len(img_pages)

    # OCR files are huge, so just delete once we have pagelist
    os.remove(abbyy_file)
    
    # if out_dir is not None, then also download page images
    if out_dir:
        
        # return if folder already exists (reasonable inference that volume already processed)
        if os.path.isdir(out_dir):
            print("[{}] Directory already exists.".format(item_id))
            return img_pages

        # otherwise, create folder to put the images
        print("[{}] Making directory".format(out_dir))
        os.makedirs(out_dir)
        
        # See Michael Karpeles email! saves tedious JP2 conversion
        # now we want the urls (PNG is also an option)
        urls = ["https://iiif.archivelab.org/iiif/{}${}/full/full/0/default.jpg".format(item_id, page) for page in img_pages]
        
        # no direct page download through API, DIY
        for i, page, url in zip(range(1,total_pages), img_pages, urls):
            rsp = requests.get(url, allow_redirects=True)
            if rsp.status_code == 200:
                print("[{}] Downloading page {} ({}/{})".format(item_id, page, i+1, total_pages))
                with open(os.path.join(out_dir, str(page) + ".jpg"), "wb") as fp:
                    fp.write(rsp.content)
    
    # return this just for kicks
    return img_pages

In [60]:
for item_id in vol_ids:
    img_pages = ia_download(item_id, out_dir=item_id)

[UF00003119] Starting processing
[UF00003119] Could not get Abbyy file
[talespeterparle00goodgoog] Starting processing
talespeterparle00goodgoog: d - success
[talespeterparle00goodgoog] Making directory
[talespeterparle00goodgoog] Downloading page 1 (2/43)
[talespeterparle00goodgoog] Downloading page 2 (3/43)
[talespeterparle00goodgoog] Downloading page 7 (4/43)
[talespeterparle00goodgoog] Downloading page 8 (5/43)
[talespeterparle00goodgoog] Downloading page 12 (6/43)
[talespeterparle00goodgoog] Downloading page 14 (7/43)
[talespeterparle00goodgoog] Downloading page 16 (8/43)
[talespeterparle00goodgoog] Downloading page 17 (9/43)
[talespeterparle00goodgoog] Downloading page 18 (10/43)
[talespeterparle00goodgoog] Downloading page 19 (11/43)
[talespeterparle00goodgoog] Downloading page 22 (12/43)
[talespeterparle00goodgoog] Downloading page 27 (13/43)
[talespeterparle00goodgoog] Downloading page 33 (14/43)
[talespeterparle00goodgoog] Downloading page 35 (15/43)
[talespeterparle00goodgoo