In [1]:
# First, we import the requests library. If you're using Python,
# and you want to download things, you should install requests.
# You can probably install it by running the following command
# at the command line: `pip install requests`.

import requests

# And for pretty printing...

from pprint import pprint

# The base url for the Chronicling America API is the `batches`
# collection. It contains information about all the batches of
# data in the corpus. There are between fifty and sixty pages
# of batch information; this downloads the first two and saves
# the informational urls for each one.

next_batch_page_url = 'http://chroniclingamerica.loc.gov/batches.json'
batch_urls = []
n = 0

# `n` counts the pages downloaded so far, and `MAX_BATCH_PAGES`
# caps it. To download more pages, raise the cap. To download all
# of them, set it to `None`.

MAX_BATCH_PAGES = 2

while next_batch_page_url is not None and (
        MAX_BATCH_PAGES is None or n < MAX_BATCH_PAGES):
    response = requests.get(next_batch_page_url)
    # Turn an HTTP error status into an immediate, readable exception
    # rather than a confusing failure while decoding or indexing the
    # response body.
    response.raise_for_status()
    data = response.json()
    # The last page has no `next` key, so `.get` yields `None` and
    # the loop stops.
    next_batch_page_url = data.get('next')
    new_batch_urls = [b['url'] for b in data['batches']]
    batch_urls.extend(new_batch_urls)
    n += 1

# Here's the first URL:

print(batch_urls[0])

http://chroniclingamerica.loc.gov/batches/batch_mdu_fitzgerald_ver01.json


In [2]:
# Each batch URL points at a record with detailed information
# about the newspaper issues inside that batch. Fetch the record
# for the first batch and take the first issue listed in it.

first_batch_response = requests.get(batch_urls[0])
first_batch_data = first_batch_response.json()
issues = first_batch_data['issues']
first_batch_first_issue = issues[0]

# The server answers in JSON, a format built from nested maps
# and sequences. A single issue entry looks like this:

pprint(first_batch_first_issue)


{'date_issued': '1841-08-07',
 'title': {'name': 'The Cecil Whig.',
           'url': 'http://chroniclingamerica.loc.gov/lccn/sn83016348.json'},
 'url': 'http://chroniclingamerica.loc.gov/lccn/sn83016348/1841-08-07/ed-1.json'}


In [3]:
# The publication's name sits inside the nested `title` map...

publication = first_batch_first_issue['title']
print(publication['name'])

The Cecil Whig.


In [4]:
# ...and the top-level `url` key says where the data about this
# specific issue is stored.

issue_url = first_batch_first_issue['url']
print(issue_url)

http://chroniclingamerica.loc.gov/lccn/sn83016348/1841-08-07/ed-1.json


In [5]:
# Now fetch the issue record itself. Calling `.json()` on the
# response parses the JSON-formatted text sent by the server
# into a native Python data structure.

issue_response = requests.get(first_batch_first_issue['url'])
issue_data = issue_response.json()
pprint(issue_data)

{'batch': {'name': 'batch_mdu_fitzgerald_ver01',
           'url': 'http://chroniclingamerica.loc.gov/batches/batch_mdu_fitzgerald_ver01.json'},
 'date_issued': '1841-08-07',
 'edition': 1,
 'number': '1',
 'pages': [{'sequence': 1,
            'url': 'http://chroniclingamerica.loc.gov/lccn/sn83016348/1841-08-07/ed-1/seq-1.json'},
           {'sequence': 2,
            'url': 'http://chroniclingamerica.loc.gov/lccn/sn83016348/1841-08-07/ed-1/seq-2.json'},
           {'sequence': 3,
            'url': 'http://chroniclingamerica.loc.gov/lccn/sn83016348/1841-08-07/ed-1/seq-3.json'},
           {'sequence': 4,
            'url': 'http://chroniclingamerica.loc.gov/lccn/sn83016348/1841-08-07/ed-1/seq-4.json'}],
 'title': {'name': 'The Cecil Whig.',
           'url': 'http://chroniclingamerica.loc.gov/lccn/sn83016348.json'},
 'url': 'http://chroniclingamerica.loc.gov/lccn/sn83016348/1841-08-07/ed-1.json',
 'volume': '1'}


In [6]:
# The issue record lists its pages under `pages`. Take the
# first entry (the one at index `0`) and read its `url`.
# Note that `first_page_data` holds that URL string, not the
# page record itself.

first_page_entry = issue_data['pages'][0]
first_page_data = first_page_entry['url']

# Requesting that URL gives us the information we need about
# the individual page. Despite its name, `page_ocr` is the page
# record as a whole:

page_response = requests.get(first_page_data)
page_ocr = page_response.json()
pprint(page_ocr)

{'issue': {'date_issued': '1841-08-07',
           'url': 'http://chroniclingamerica.loc.gov/lccn/sn83016348/1841-08-07/ed-1.json'},
 'jp2': 'http://chroniclingamerica.loc.gov/lccn/sn83016348/1841-08-07/ed-1/seq-1.jp2',
 'ocr': 'http://chroniclingamerica.loc.gov/lccn/sn83016348/1841-08-07/ed-1/seq-1/ocr.xml',
 'pdf': 'http://chroniclingamerica.loc.gov/lccn/sn83016348/1841-08-07/ed-1/seq-1.pdf',
 'sequence': 1,
 'text': 'http://chroniclingamerica.loc.gov/lccn/sn83016348/1841-08-07/ed-1/seq-1/ocr.txt',
 'title': {'name': 'The Cecil Whig.',
           'url': 'http://chroniclingamerica.loc.gov/lccn/sn83016348.json'}}


In [7]:
# The page record links to the page in several formats. Here we
# download the PDF version and save it to disk:

pdf_response = requests.get(page_ocr['pdf'])
# Fail loudly on an HTTP error instead of silently saving the
# server's error page under a `.pdf` name.
pdf_response.raise_for_status()
first_page_pdf = pdf_response.content
with open('first_page.pdf', 'wb') as outfile:
    outfile.write(first_page_pdf)

In [8]:
# Grab the plain-text OCR version of the page, save it to disk,
# and print it.

text_response = requests.get(page_ocr['text'])
# Fail loudly on an HTTP error instead of saving the server's
# error page as if it were the page text.
text_response.raise_for_status()
first_page_text = text_response.text

# Write the file as UTF-8 explicitly: the platform's default
# encoding may not be able to represent every character in the
# OCR output.
with open('first_page.txt', 'w', encoding='utf-8') as outfile:
    outfile.write(first_page_text)
print(first_page_text)

jj YOU. I
1| BUS linn EVERY SATURDAY MORMNG,
I ||Y V. C. RICKETTS.
In the LOG GAIIIN, next door to the
POST OFFICE.
| TERMS.
Two Dollars per annum, (payable half year
in advance, or Two Dollars and Fiurf Cents
ii*\ paid till the etui of the year.
J No Hulweription wift be rccicved for less tliain
six inonlliH, and no paper discontinued until all
arrearages arc paid, unless at the discretion of
Hhe editor.
Advertisement* of one square inserted three
Himes for O.vi: Dollar, and twkvtt five cents'
:fr eneb subsequent insertion; longer ones in pro
n Advertising eusloincrs will please mark
od..> niiiiuiseript bow iwny insertions arc re
]jKh.. If no such direction is given, the adver-
B. i.i will le continued until forbid, and char
’ All cuiiiiiiuiucatioiis to the Editor should be
.fmal puiii.
Agents kM jUio Cecil Whig.
Suliscrilier's Names, Sulisrription Money, Ad
vertisements, Osdor* for Printing, \c Kc., left
with the following gentlemen, will he promptly
'ittended to, viz :
J.*m: 11. Var