In [1]:
### Import packages

import json
import os
import time
from datetime import datetime

import numpy as np
import pandas as pd
import requests

In [2]:
### 1) Review & Access the webpage

# The request URL is assembled as <base_URL1><year>/<month><base_URL2><api_key>,
# so the endpoint is kept in two halves and the year/month is spliced in between.
base_URL1 = "https://api.nytimes.com/svc/archive/v1/"
base_URL2 = ".json?api-key="

# Read the API key from the NYT_API_KEY environment variable so that a real key
# never has to be saved inside the notebook. When the variable is not set, the
# original placeholder string is used, exactly as before.
api_key = os.environ.get("NYT_API_KEY", "'''YOUR API KEY'''")


# Initialize an empty list to store metadata of the articles
# (one dict per article; re-running this cell starts again from an empty list)
nyt_articles = []


# Seconds to pause after every request. NYT's developer FAQ recommends sleeping
# 12 seconds between calls to stay under the per-minute rate limit.
REQUEST_DELAY_SECONDS = 12

# Seconds to wait on the server, so a stalled connection cannot hang the loop forever
REQUEST_TIMEOUT_SECONDS = 60


def _extract_article_metadata(article):
    """Return the subset of one article record that this notebook keeps.

    Fields that are absent from the record are stored as None instead of
    raising a KeyError, so one incomplete record cannot abort the download.

    Parameters
    ----------
    article : dict
        One element of the ``data['response']['docs']`` list.

    Returns
    -------
    dict
        Keys: title, abstract, date (``YYYY-MM-DD`` string), section, url,
        source, news_desk, type_of_material.
    """
    headline = article.get("headline") or {}
    pub_date = article.get("pub_date")

    return {
        "title": headline.get("main"),
        "abstract": article.get("abstract"),
        # Keep only the calendar date of the publication timestamp
        "date": (
            datetime.strptime(pub_date, '%Y-%m-%dT%H:%M:%S%z').strftime('%Y-%m-%d')
            if pub_date else None
        ),
        "section": article.get("section_name"),
        "url": article.get("web_url"),
        "source": article.get("source"),
        "news_desk": article.get("news_desk"),
        "type_of_material": article.get("type_of_material"),
    }


# Loop over each year & month (2018/01 ~ 2022/12)
for year in range(2018, 2023):
    for month in range(1, 13):

        # Construct a URL to access NYT's archived data
        nyt_URL = base_URL1 + str(year) + "/" + str(month) + base_URL2 + api_key

        # Make an API request. A network error or timeout counts as a failed
        # month, so the remaining months are still attempted.
        try:
            response = requests.get(nyt_URL, timeout=REQUEST_TIMEOUT_SECONDS)
            request_succeeded = response.status_code == 200
        except requests.RequestException:
            request_succeeded = False

        if request_succeeded:

            # If the request is successful, indicate the 1) year & month of publication and 2) time of access
            print(f"(a) Current search: {year}/{month}")
            print(f"(b) Time of access: {datetime.now().strftime('%Hh%M')}")


            ### 2) Parse the JSON response

            # Extract metadata about the articles
            data = response.json()
            articles = data['response']['docs']

            # Store only the relevant metadata for the current month & year
            nyt_articles.extend(_extract_article_metadata(article) for article in articles)


            # Confirm the status of the search
            print(f"(c) Download successful! Begin sleep mode: {datetime.now().strftime('%Hh%M')}\n")

        else:
            print(f"Failed to retrieve data for {year}/{month}")

        # Pause after every request, failed ones included; otherwise a
        # rate-limited month would be followed immediately by another request.
        time.sleep(REQUEST_DELAY_SECONDS)

(a) Current search: 2018/1
(b) Time of access: 20h42
(c) Download successful! Begin sleep mode: 20h42

(a) Current search: 2018/2
(b) Time of access: 20h43
(c) Download successful! Begin sleep mode: 20h43

(a) Current search: 2018/3
(b) Time of access: 20h43
(c) Download successful! Begin sleep mode: 20h43

(a) Current search: 2018/4
(b) Time of access: 20h43
(c) Download successful! Begin sleep mode: 20h43

(a) Current search: 2018/5
(b) Time of access: 20h43
(c) Download successful! Begin sleep mode: 20h43

(a) Current search: 2018/6
(b) Time of access: 20h44
(c) Download successful! Begin sleep mode: 20h44

(a) Current search: 2018/7
(b) Time of access: 20h44
(c) Download successful! Begin sleep mode: 20h44

(a) Current search: 2018/8
(b) Time of access: 20h44
(c) Download successful! Begin sleep mode: 20h44

(a) Current search: 2018/9
(b) Time of access: 20h44
(c) Download successful! Begin sleep mode: 20h44

(a) Current search: 2018/10
(b) Time of access: 20h44
(c) Download succes

In [3]:
### 3) Store metadata as a data frame

# NOTE: Because of the API call limit, I could not use NYT's 'Article Search' API,
# which supports keyword look-ups (e.g. "climate change").
# I used NYT's 'Archive' API instead, which returns the metadata of every article
# published in a given month but does not support keyword searches.
# Hence, the data must be subset to climate-related articles after the download.

# Naming the columns explicitly keeps the schema stable: even when no article
# was downloaded, the frame (and the CSV written from it) still has its header.
NYT_COLUMNS = [
    "title",
    "abstract",
    "date",
    "section",
    "url",
    "source",
    "news_desk",
    "type_of_material",
]

nyt_data = pd.DataFrame(nyt_articles, columns=NYT_COLUMNS)
nyt_data.head()

Unnamed: 0,title,abstract,date,section,url,source,news_desk,type_of_material
0,Tom Brokaw: You Can Find the Entire World Insi...,American health care is a universe of scientif...,2018-01-01,Opinion,https://www.nytimes.com/2017/12/31/opinion/tom...,The New York Times,OpEd,Op-Ed
1,New York Family of 5 Among 10 Americans Killed...,The Costa Rican government said the crash occu...,2018-01-01,World,https://www.nytimes.com/2017/12/31/world/ameri...,The New York Times,Foreign,News
2,The Wall of Love Outside a Jail,Families have turned the side of a warehouse f...,2018-01-01,New York,https://www.nytimes.com/2017/12/31/nyregion/me...,The New York Times,Metro,News
3,The Sentient-Being Diet,Making New Year resolutions as a hedge against...,2018-01-01,Opinion,https://www.nytimes.com/2017/12/31/opinion/new...,The New York Times,OpEd,Op-Ed
4,Changing the Script,Screenwriters need to stop letting male charac...,2018-01-01,Opinion,https://www.nytimes.com/2017/12/31/opinion/men...,The New York Times,Letters,Letter


In [4]:
# Persist the article metadata to NYT.csv, leaving out the DataFrame's row index
nyt_data.to_csv(path_or_buf='NYT.csv', index=False)