In [61]:
import os
import json
import time
import requests
import datetime
import dateutil
import pandas as pd
from dateutil.relativedelta import relativedelta

api_key = '4IH9pY849SPlqcApxjAyv56UzrLFUhws'



In [75]:
end_dt= datetime.date(2021,3,1)
start_dt = end_dt - relativedelta(years=2)

In [79]:
dates = [x.split(' ') for x in pd.date_range(start_dt, end_dt, freq='MS').strftime("%Y %-m").tolist()]

In [80]:
dates

[['2019', '3'],
 ['2019', '4'],
 ['2019', '5'],
 ['2019', '6'],
 ['2019', '7'],
 ['2019', '8'],
 ['2019', '9'],
 ['2019', '10'],
 ['2019', '11'],
 ['2019', '12'],
 ['2020', '1'],
 ['2020', '2'],
 ['2020', '3'],
 ['2020', '4'],
 ['2020', '5'],
 ['2020', '6'],
 ['2020', '7'],
 ['2020', '8'],
 ['2020', '9'],
 ['2020', '10'],
 ['2020', '11'],
 ['2020', '12'],
 ['2021', '1'],
 ['2021', '2'],
 ['2021', '3']]

In [81]:
def get_data(dates):
    '''Sends and parses request/response to/from NYT Archive API for given dates.'''
    total = 0
    print('Date range: ' + str(dates[0]) + ' to ' + str(dates[-1]))
    if not os.path.exists('ny_times'):
        os.mkdir('ny_times')
    for date in dates:
        response = send_request(date)
        df = parse_response(response)
        total += len(df)
        df.to_csv('ny_times/' + date[0] + '-' + date[1] + '.csv', index=False)
        print('Saving ny_times/' + date[0] + '-' + date[1] + '.csv...')
    print('Number of articles collected: ' + str(total))

In [82]:
def send_request(date):
    '''Sends a request to the NYT Archive API for given date.'''
    base_url = 'https://api.nytimes.com/svc/archive/v1/'
    url = base_url + '/' + date[0] + '/' + date[1] + '.json?api-key=' + api_key
    response = requests.get(url).json()
    time.sleep(10)
    return response


In [83]:
def is_valid(article, date):
    '''An article is only worth checking if it is in range, and has a headline.'''
    is_in_range = date > start_dt and date < end_dt
    has_abstract = type(article['abstract']) == str 
    return is_in_range and has_abstract


In [84]:
def parse_response(response):
    '''Parses and returns response as pandas data frame.'''
    data = {
        'date': [], 
        'abstract': [],  
        'keywords': [],
        'doc_type': [],
        'material_type': [],
        #'section': [],
        'news_desk':[]
        }
    
    articles = response['response']['docs'] 
    for article in articles: # For each article, make sure it falls within our date range
        date = dateutil.parser.parse(article['pub_date']).date()
        if is_valid(article, date):
            data['date'].append(date)
            data['abstract'].append(article['abstract']) 
            data['doc_type'].append(article['document_type'])
            if 'type_of_material' in article: 
                data['material_type'].append(article['type_of_material'])
            else:
                data['material_type'].append(None)
            if 'news_desk' in article: 
                data['news_desk'].append(article['news_desk'])
            else:
                data['news_desk'].append(None)
            keywords = [keyword['value'] for keyword in article['keywords'] if keyword['name'] == 'subject']
            data['keywords'].append(keywords)
    return pd.DataFrame(data) 

In [85]:
get_data(dates)

Date range: ['2019', '3'] to ['2021', '3']
Saving ny_times/2019-3.csv...
Saving ny_times/2019-4.csv...
Saving ny_times/2019-5.csv...
Saving ny_times/2019-6.csv...
Saving ny_times/2019-7.csv...
Saving ny_times/2019-8.csv...
Saving ny_times/2019-9.csv...
Saving ny_times/2019-10.csv...
Saving ny_times/2019-11.csv...
Saving ny_times/2019-12.csv...
Saving ny_times/2020-1.csv...
Saving ny_times/2020-2.csv...
Saving ny_times/2020-3.csv...
Saving ny_times/2020-4.csv...
Saving ny_times/2020-5.csv...
Saving ny_times/2020-6.csv...
Saving ny_times/2020-7.csv...
Saving ny_times/2020-8.csv...
Saving ny_times/2020-9.csv...
Saving ny_times/2020-10.csv...
Saving ny_times/2020-11.csv...
Saving ny_times/2020-12.csv...
Saving ny_times/2021-1.csv...
Saving ny_times/2021-2.csv...
Saving ny_times/2021-3.csv...
Number of articles collected: 111214


In [87]:
df = pd.read_csv('ny_times/2019-3.csv')

In [88]:
df

Unnamed: 0,date,abstract,keywords,doc_type,material_type,news_desk
0,2019-03-02,"For some players, the sense of beginnings and ...",['Baseball'],article,Op-Ed,OpEd
1,2019-03-02,The former Trump campaign chairman will be sen...,['Russian Interference in 2016 US Elections an...,article,News,Washington
2,2019-03-02,The family of Otto Warmbier spoke out after Pr...,"['United States International Relations', 'Uni...",article,News,National
3,2019-03-02,Cellphones and apps are like slot machines tha...,"['Computers and the Internet', 'Mobile Applica...",article,News,SpecialSections
4,2019-03-02,The police and Major League Baseball said they...,"['Baseball', 'Domestic Violence', 'Video Recor...",article,News,Express
...,...,...,...,...,...,...
4491,2019-03-31,Coach Tom Izzo improved to 2-11 in games again...,"['NCAA Basketball Championships (Men)', 'Baske...",article,News,Sports
4492,2019-03-31,A racist caricature from 19th-century minstrel...,"['Blacks', 'Black People', 'Discrimination', '...",article,Editorial,Editorial
4493,2019-03-31,"No corrections appeared in print on Monday, Ap...",[],article,Correction,Corrections
4494,2019-03-31,"Capitalism, he says, is slowly committing suic...","['Capitalism (Theory and Philosophy)', 'Immigr...",article,Op-Ed,OpEd
