In [1]:
from dateutil.relativedelta import relativedelta
import pandas as pd
import datetime
import dateutil.parser
import requests
import time

def send_request(begin_date, end_date, page):
    '''Sends a request to the NYT Archive API for a given date range and page.'''
    url = 'https://api.nytimes.com/svc/search/v2/articlesearch.json'
    api_key = 'X8vGMLWCXtOxkRhrgABZGj3baiaKApAn'
    query = 'tesla'
    sort = 'best'

    # Send the request to the API
    response = requests.get(url, params={'api-key': api_key, 'q': query, 'begin_date': begin_date, 'end_date': end_date, 'sort': sort, 'page': page})
    time.sleep(15)
    return response


def parse_response(response, start, end):
    '''Parses and returns response as a pandas DataFrame.'''
    data = {
        'headline': [],
        'date': [],
        'doc_type': [],
        'material_type': [],
        'section': [],
        'keywords': []
    }
    
    articles = response.json()['response']['docs']
    for article in articles:
        # Make sure the article falls within our date range
        date = dateutil.parser.parse(article['pub_date']).date()
        if start <= date <= end:
            data['date'].append(date)
            data['headline'].append(article['headline']['main'])
            if 'section' in article:
                data['section'].append(article['section_name'])
            else:
                data['section'].append(None)
            data['doc_type'].append(article['document_type'])
            if 'type_of_material' in article:
                data['material_type'].append(article['type_of_material'])
            else:
                data['material_type'].append(None)
            keywords = [keyword['value'] for keyword in article['keywords'] if keyword['name'] == 'subject']
            data['keywords'].append(keywords)
    
    return pd.DataFrame(data)


def get_data(start, end):
    '''Sends and parses requests to/from NYT Archive API for the given date range.'''
    total = 0
    print('Date range: ' + str(start) + ' to ' + str(end))
    page = 0
    while True:
        response = send_request(start, end, page)
        if response.status_code != 200:
            print('Error:', response.json()['fault']['faultstring'])
            break
        df = parse_response(response, start, end)
        if len(df) == 0:
            break
        total += len(df)
        df.to_csv(f'headlines_pages/{start}-{end}_page{page}.csv', index=False)
        print(f'Saving headlines_pages/{start}-{end}_page{page}.csv...')
        page += 1
    print('Number of articles collected:', total)


# Define the start and end dates
end = datetime.date.today()
start = datetime.date(1978, 1, 1)

# Iterate over each month and retrieve data
current_date = start
while current_date <= end:
    next_date = current_date + relativedelta(months=1)
    get_data(current_date.strftime('%Y%m%d'), next_date.strftime('%Y%m%d'))
    current_date = next_date


Date range: 19780101 to 19780201
Number of articles collected: 0
Date range: 19780201 to 19780301
Number of articles collected: 0
Date range: 19780301 to 19780401


TypeError: '<=' not supported between instances of 'str' and 'datetime.date'