In [1]:
from time import sleep
import multiprocessing

import ujson
import requests
import newspaper
from tqdm import tqdm

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc" style="margin-top: 1em;"><ul class="toc-item"><li><span><a href="#Download-urls-from-archive" data-toc-modified-id="Download-urls-from-archive-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Download urls from archive</a></span><ul class="toc-item"><li><span><a href="#Explore-document-types" data-toc-modified-id="Explore-document-types-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Explore document types</a></span></li></ul></li><li><span><a href="#Scrape-the-content" data-toc-modified-id="Scrape-the-content-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Scrape the content</a></span></li></ul></div>

# Download urls from archive

In [14]:
url = 'https://api.nytimes.com/svc/archive/v1/%s/%s.json?api-key=29a51170349f43d9abe651b0e2331ea6'
year_from = 2000  # 1852
year_to = 2018

with open('data/nytimes/archive.json', 'w') as out_archive:
    with tqdm(total=(year_from - year_to) * 12) as progress:
        for year in range(year_from, year_to + 1):
            for month in range(1, 13):
                docs = None
                try:
                    response = requests.get(url % (year, month))
                    docs = response.json()['response']['docs']
                except Exception:
                    sleep(10)

                if docs is None:
                    response = requests.get(url % (year, month))
                    docs = response.json()['response']['docs']

                for doc in docs:
                    out_archive.write(ujson.dumps(doc) + '\n')

                progress.update()

218it [29:43,  7.66s/it]


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

## Explore document types

In [15]:
archive_entries_doc_types = {}
with open('data/nytimes/archive.json', 'r') as in_archive: 
    with tqdm() as progress:
        for line in in_archive:
            archive_entry = ujson.loads(line)
            doc_type = archive_entry['document_type']
            
            archive_entries_doc_types.setdefault(doc_type, 0)
            archive_entries_doc_types[doc_type] += 1
            
            progress.update()

2002799it [02:06, 15852.34it/s]


In [16]:
archive_entries_doc_types

{'article': 1550951, 'blogpost': 372608, 'multimedia': 79240}

# Scrape the content

In [2]:
def scrape_article(line):
    archive_entry = ujson.loads(line)
    url = archive_entry['web_url']

    if archive_entry['document_type'] != 'article':
        return None

    try:
        article = newspaper.Article(url, fetch_images=False)
        article.download()
        article.parse()
    except Exception:
        print('Something went wrong parsing url:', url)
        return None

    archive_entry['newspaper'] = {
        'title': article.title,
        'content': article.text,
        'authors': ', '.join(article.authors),
        'keywords': ', '.join(article.keywords),
        'meta_keywords': article.meta_keywords,
        'meta_description': article.meta_description,
        'tags': ', '.join(article.tags),
        'summary': article.summary
    }
    
    return archive_entry

In [5]:
with open('data/nytimes/archive_scraped.jsonl', 'w') as out_archive_scraped:
    with open('data/nytimes/archive.json', 'r') as in_archive: 
        with tqdm() as progress:
            with multiprocessing.Pool(processes=32) as pool:
                for archive_entry in pool.imap(scrape_article, in_archive, chunksize=1):
                    if archive_entry is None:
                        continue

                    out_archive_scraped.write(ujson.dumps(archive_entry) + '\n')
                    progress.update()

5it [00:08,  3.46s/it]

Article `download()` failed with 404 Client Error: Not Found for url: http://www.nytimes.com/2000/01/01/world/mideast-talks-begin-monday.html on URL https://www.nytimes.com/2000/01/01/world/mideast-talks-begin-monday.html
Something went wrong parsing url: https://www.nytimes.com/2000/01/01/world/mideast-talks-begin-monday.html


203it [00:35,  6.00it/s]

Article `download()` failed with 404 Client Error: Not Found for url: http://www.nytimes.com/2000/01/02/nyregion/the-year-2000-a-night-of-jubilation-as-times-square-welcomes-the-dawn-of-2000.html on URL https://www.nytimes.com/2000/01/02/nyregion/the-year-2000-a-night-of-jubilation-as-times-square-welcomes-the-dawn-of-2000.html
Something went wrong parsing url: https://www.nytimes.com/2000/01/02/nyregion/the-year-2000-a-night-of-jubilation-as-times-square-welcomes-the-dawn-of-2000.html


264it [00:43,  1.98it/s]

Article `download()` failed with 404 Client Error: Not Found for url: http://www.nytimes.com/2000/01/02/nyregion/neighborhood-report-winemaker-splits.html on URL https://www.nytimes.com/2000/01/02/nyregion/neighborhood-report-winemaker-splits.html
Something went wrong parsing url: https://www.nytimes.com/2000/01/02/nyregion/neighborhood-report-winemaker-splits.html


362it [00:56,  4.55it/s]

Article `download()` failed with 404 Client Error: Not Found for url: http://www.nytimes.com/2000/01/02/nyregion/a-110-million-deficit-for-this.html on URL https://www.nytimes.com/2000/01/02/nyregion/a-110-million-deficit-for-this.html
Something went wrong parsing url: https://www.nytimes.com/2000/01/02/nyregion/a-110-million-deficit-for-this.html


441it [01:05,  4.70it/s]

Article `download()` failed with 404 Client Error: Not Found for url: http://www.nytimes.com/2000/01/02/nyregion/photographer-s-journal-087289.html on URL https://www.nytimes.com/2000/01/02/nyregion/photographer-s-journal-087289.html
Something went wrong parsing url: https://www.nytimes.com/2000/01/02/nyregion/photographer-s-journal-087289.html


455it [01:07,  6.70it/s]


KeyboardInterrupt: 

In [5]:
archive_entry

{'_id': '4fd1f22e8eb7c8105d7496fb',
 'abstract': 'American teenagers feel pressure to succeed at dawn of new century; photos (M)',
 'blog': [],
 'byline': {'original': 'By Dirk Johnson',
  'person': [{'firstname': 'Dirk',
    'lastname': 'Johnson',
    'organization': '',
    'rank': 1,
    'role': 'reported'}]},
 'document_type': 'article',
 'headline': {'kicker': 'VISIONS: IDENTITY',
  'main': "A Generation's Anthem: 'Smells Like Teen Pressure'"},
 'keywords': [{'name': 'subject', 'value': 'CHILDREN AND YOUTH'}],
 'lead_paragraph': "YOUNG, hip and smart, Casey Collier seems to glide from one peak to the next, knocking out A's, winning student council elections and starring in the school play. She is an 18-year-old with a dazzling future. So why does she fret? ''I feel overwhelmed -- I feel like, 'Oh my gosh, this stress,' '' said Miss Collier, a senior at Shawnee Mission North High School in Kansas, outside Kansas City. ''The other night, I wondered, 'Is it possible to have a nervous