In [1]:
# First, we import the requests library. If you're using Python,
# and you want to download things, you should install requests.
# You can probably install it by running the following command
# at the command line: `pip install requests`.

import requests

# And for pretty printing...

from pprint import pprint

# The base url for the Chronicling America API is the `batches`
# collection. It contains information about all the batches of
# data in the corpus. There are between fifty and sixty pages
# of batch information; this downloads the first two and saves
# the informational urls for each one.

next_batch_page_url = 'http://chroniclingamerica.loc.gov/batches.json'
batch_urls = []
n = 0

# `n` denotes the number of pages downloaded so far. To download
# more, raise `max_batch_pages`. To download all of them, set
# `max_batch_pages` to `None`.

max_batch_pages = 2

while next_batch_page_url is not None and (
        max_batch_pages is None or n < max_batch_pages):
    # Without a timeout, a stalled connection would hang the
    # notebook indefinitely.
    batch_page_response = requests.get(next_batch_page_url, timeout=30)
    # Report an HTTP error (404, 429, 500...) as such, instead of
    # failing later with a confusing JSON decoding error.
    batch_page_response.raise_for_status()
    data = batch_page_response.json()
    # The last page has no `next` key, so `.get` returns `None`
    # and the loop ends.
    next_batch_page_url = data.get('next')
    new_batch_urls = [b['url'] for b in data['batches']]
    batch_urls.extend(new_batch_urls)
    n += 1

# Here's the first URL:

print(batch_urls[0])

http://chroniclingamerica.loc.gov/batches/ct_floyd_ver01.json


In [2]:
# The data at that location has detailed information about each
# of the newspaper issues inside the batch. We pass a timeout so
# a stalled connection cannot hang the notebook, and check the
# HTTP status so a server error is reported as such rather than
# as a confusing JSON decoding failure.

batch_response = requests.get(batch_urls[0], timeout=30)
batch_response.raise_for_status()
first_batch_data = batch_response.json()
issues = first_batch_data['issues']
first_batch_first_issue = issues[0]

# The data is stored in a format called JSON. It consists of a
# nested structure of maps and sequences. It looks like this:

pprint(first_batch_first_issue)


{'date_issued': '1924-09-02',
 'title': {'name': 'New Britain herald.',
           'url': 'http://chroniclingamerica.loc.gov/lccn/sn82014519.json'},
 'url': 'http://chroniclingamerica.loc.gov/lccn/sn82014519/1924-09-02/ed-1.json'}


In [3]:
# You can extract the name of the publication by indexing into
# the issue record's nested `title` map and reading its `name` key...

print(first_batch_first_issue['title']['name'])

New Britain herald.


In [4]:
# As well as the url where data about this specific issue is stored,
# which is kept under the issue record's top-level `url` key.

print(first_batch_first_issue['url'])

http://chroniclingamerica.loc.gov/lccn/sn82014519/1924-09-02/ed-1.json


In [5]:
# Here we download that information. The `.json()` method takes
# JSON formatted text from the server and automatically converts
# it into a native Python data structure. Before decoding, we
# check the HTTP status so that a failed request raises a clear
# error; the timeout keeps a stalled connection from hanging
# the notebook.

issue_response = requests.get(first_batch_first_issue['url'], timeout=30)
issue_response.raise_for_status()
issue_data = issue_response.json()
pprint(issue_data)

{'batch': {'name': 'ct_floyd_ver01',
           'url': 'http://chroniclingamerica.loc.gov/batches/ct_floyd_ver01.json'},
 'date_issued': '1924-09-02',
 'edition': 1,
 'number': '',
 'pages': [{'sequence': 1,
            'url': 'http://chroniclingamerica.loc.gov/lccn/sn82014519/1924-09-02/ed-1/seq-1.json'},
           {'sequence': 2,
            'url': 'http://chroniclingamerica.loc.gov/lccn/sn82014519/1924-09-02/ed-1/seq-2.json'},
           {'sequence': 3,
            'url': 'http://chroniclingamerica.loc.gov/lccn/sn82014519/1924-09-02/ed-1/seq-3.json'},
           {'sequence': 4,
            'url': 'http://chroniclingamerica.loc.gov/lccn/sn82014519/1924-09-02/ed-1/seq-4.json'},
           {'sequence': 5,
            'url': 'http://chroniclingamerica.loc.gov/lccn/sn82014519/1924-09-02/ed-1/seq-5.json'},
           {'sequence': 6,
            'url': 'http://chroniclingamerica.loc.gov/lccn/sn82014519/1924-09-02/ed-1/seq-6.json'},
           {'sequence': 7,
            'url': 'http://chr

In [6]:
# To get at the individual page data, we grab the
# pages sequence, and pick the first one (that is,
# the one at index `0`). Then we grab its URL.
# (Despite its name, `first_page_data` holds that URL string.)

first_page_data = issue_data['pages'][0]['url']

# Now we can get the information we need about the
# individual page. The result, `page_ocr`, is a map of
# metadata and links (to the PDF, the OCR text, and so on),
# not the OCR text itself. The timeout keeps a stalled
# connection from hanging the notebook, and the status
# check turns an HTTP error into a clear exception.

page_response = requests.get(first_page_data, timeout=30)
page_response.raise_for_status()
page_ocr = page_response.json()
pprint(page_ocr)

{'issue': {'date_issued': '1924-09-02',
           'url': 'http://chroniclingamerica.loc.gov/lccn/sn82014519/1924-09-02/ed-1.json'},
 'jp2': 'http://chroniclingamerica.loc.gov/lccn/sn82014519/1924-09-02/ed-1/seq-1.jp2',
 'ocr': 'http://chroniclingamerica.loc.gov/lccn/sn82014519/1924-09-02/ed-1/seq-1/ocr.xml',
 'pdf': 'http://chroniclingamerica.loc.gov/lccn/sn82014519/1924-09-02/ed-1/seq-1.pdf',
 'sequence': 1,
 'text': 'http://chroniclingamerica.loc.gov/lccn/sn82014519/1924-09-02/ed-1/seq-1/ocr.txt',
 'title': {'name': 'New Britain herald.',
           'url': 'http://chroniclingamerica.loc.gov/lccn/sn82014519.json'}}


In [7]:
# Next, we download the PDF scan of the page and save it to disk.
# A PDF is binary data, so we read the raw bytes with `.content`
# and open the output file in binary mode (`'wb'`).

pdf_response = requests.get(page_ocr['pdf'], timeout=30)
# Stop on an HTTP error; otherwise the body of the error page
# would be written into `first_page.pdf`.
pdf_response.raise_for_status()
first_page_pdf = pdf_response.content
with open('first_page.pdf', 'wb') as outfile:
    outfile.write(first_page_pdf)

In [8]:
# Download the plain-text OCR of the page, save it to disk,
# and print it.

text_response = requests.get(page_ocr['text'], timeout=30)
# Stop on an HTTP error rather than saving an error page as the text.
text_response.raise_for_status()
first_page_text = text_response.text

# Write with an explicit encoding: the platform default (for
# example cp1252 on Windows) may be unable to encode characters
# that appear in the OCR text.
with open('first_page.txt', 'w', encoding='utf-8') as outfile:
    outfile.write(first_page_text)
print(first_page_text)

IgUAd tsBSBkBSBBtfaBBSsBftrsv A tjsktebbsae assHiBBBBaaBlaatBh& e
News of the World"
By Associated Press v
Circulation
Week Ending 1 A Q C
Aug. 30th .. XU)000
3
ESTABLISHED 1870 v.,:.
NEW BRITAIN, CONNECTICUT, TUESDAY, SEPTEMBER 2, 1924. -SIXTEEN PAGES.
HIICE
ICE THREE CENTS
NEW EMIT.
SMALLER - AM 770jW HHJIrT.
SOME MUTUAL AGREEMENT
TO GUARANTEE SECURITY,
Prepared To Insist
That Disarmament
Conferences Be Held
Through League Of
Nations Only Sec
ond Session Is On.
By The Associated Press. ' . v
Clan mi -a Qt .Tk. -1 1 -
tions of Europe, especially the mem
ber of the little entente, manifested
today their determination to achieve
eome International fact which they
consider will safeguard their secur
ity when they selected Foreign Min
ister Duca, of Rumania, to preside
over the important deliberations of
the third assembly commission
which is. to probe the entire prob
lem of disarmament and security.
There seems here to be a distinct
movement, under the apparent lead
ership of France, to 