# The New York Times

## Data collection

In [1]:
### Import packages

import json
import os
import time
from datetime import datetime

import numpy as np
import pandas as pd
import requests

# Suppress all warnings so they do not clutter the notebook output
import warnings 
warnings.filterwarnings('ignore') 

### I. Scrape <ins>ALL</ins> articles (regardless of topic) published in <ins>2018-2022</ins>



In [2]:
### 1) Review & Access the webpage

# Decompose the base URL & define the API key.
# The key is taken from the NYT_API_KEY environment variable when it is set, so it does not
# have to be pasted into the notebook; otherwise replace the placeholder below by hand.
base_URL1 = "https://api.nytimes.com/svc/archive/v1/"
base_URL2 = ".json?api-key="
api_key = os.environ.get("NYT_API_KEY", '''YOUR API KEY''')


# Initialize an empty list to store metadata of the articles
metadata = []


# Loop over each year & month (2018/01 ~ 2022/12)
for year in range(2018, 2023):
    for month in range(1, 13):

        # Construct a URL to access NYT's archived data
        nyt_URL = base_URL1 + str(year) + "/" + str(month) + base_URL2 + api_key

        # Make an API request. The timeout keeps a stalled connection from hanging the cell forever.
        try:
            response = requests.get(nyt_URL, timeout = 60)
        except requests.RequestException as error:
            # Only the exception type is printed: the full message contains the URL (and thus the API key)
            print(f"Failed to retrieve data for {year}/{month} ({type(error).__name__})")
            response = None

        if response is not None and response.status_code == 200:

            # If the request is successful, indicate the 1) year & month of publication and 2) time of access
            print(f"(a) Current search: {year}/{month}")
            print(f"(b) Time of access: {datetime.now().strftime('%Hh%M')}")


            ### 2) Parse the JSON response

            # Extract metadata about the articles
            payload = response.json()
            articles = payload['response']['docs']

            # Store only the relevant metadata for the current month & year.
            # Fields are read with .get() so that one article with a missing field is stored
            # with None in that field instead of aborting the whole download with a KeyError.
            for article in articles:

                pub_date = article.get("pub_date")

                search_results = {}

                search_results["title"] = (article.get("headline") or {}).get("main")
                search_results["abstract"] = article.get("abstract")
                search_results["date"] = (
                    datetime.strptime(pub_date, "%Y-%m-%dT%H:%M:%S%z").strftime("%Y-%m-%d")
                    if pub_date else None
                )
                search_results["section"] = article.get("section_name")
                search_results["url"] = article.get("web_url")
                search_results["news_desk"] = article.get("news_desk")
                search_results["type_of_material"] = article.get("type_of_material")
                search_results["keywords"] = [keyword["value"] for keyword in (article.get("keywords") or [])]

                metadata.append(search_results)


            # Confirm the status of the search
            print(f"(c) Download successful! Begin sleep mode: {datetime.now().strftime('%Hh%M')}\n")

        elif response is not None:
            print(f"Failed to retrieve data for {year}/{month} (HTTP status {response.status_code})")

        # Wait 12 seconds after EVERY request (successful or not) to stay under the API's
        # per-minute rate limit; without the pause a throttled request would be followed
        # immediately by the next one, which would then fail as well.
        time.sleep(12)

(a) Current search: 2018/1
(b) Time of access: 17h11
(c) Download successful! Begin sleep mode: 17h11

(a) Current search: 2018/2
(b) Time of access: 17h11
(c) Download successful! Begin sleep mode: 17h11

(a) Current search: 2018/3
(b) Time of access: 17h11
(c) Download successful! Begin sleep mode: 17h11

(a) Current search: 2018/4
(b) Time of access: 17h11
(c) Download successful! Begin sleep mode: 17h11

(a) Current search: 2018/5
(b) Time of access: 17h11
(c) Download successful! Begin sleep mode: 17h11

(a) Current search: 2018/6
(b) Time of access: 17h12
(c) Download successful! Begin sleep mode: 17h12

(a) Current search: 2018/7
(b) Time of access: 17h12
(c) Download successful! Begin sleep mode: 17h12

(a) Current search: 2018/8
(b) Time of access: 17h12
(c) Download successful! Begin sleep mode: 17h12

(a) Current search: 2018/9
(b) Time of access: 17h12
(c) Download successful! Begin sleep mode: 17h12

(a) Current search: 2018/10
(b) Time of access: 17h13
(c) Download succes

In [3]:
### 3) Store metadata as a data frame

# NOTE: Because of the API call limit, NYT's 'Article Search' API (which supports
# look-ups by keyword, e.g. "climate change") could not be used.
# NYT's 'Archive' API was used instead: it returns the article metadata for a given month,
# but it does not support keyword searches.
# The downloaded metadata therefore still has to be subset to climate related articles below.

# One row per downloaded article, one column per metadata field
data = pd.DataFrame(data = metadata)


### II. Filter the results to only <ins>climate change</ins> related articles

In [4]:
### Create a list of keywords to filter by
### (each entry is compared against the keyword tags attached to an article; every entry is listed once)

climate_change_keywords = [

    "Global Warming", "Environment", "Temperature",

    # Emissions
    "Greenhouse Gas Emissions", "Carbon Dioxide", "Fuel Emissions (Transportation)", "Methane", "Chlorofluorocarbons",
    "Carbon Capture and Sequestration", "Carbon Caps and Emissions Trading Programs",
    "Air Conditioning",

    # Natural disasters
    "Disasters and Emergencies",
    "Floods", "Wildfires", "Hurricanes and Tropical Storms", "Drought", "Heat and Heat Waves", "Cyclones",

    # Air/water/land pollution
    "Pollution", "Water", "Water Pollution", "Air Pollution",
    "Waste Materials and Disposal", "Recycling of Waste Materials", "Conservation of Resources", "Plastics",

    # Energy/fuel
    "Alternative and Renewable Energy", "Coal", "Natural Gas", "Oil (Petroleum) and Gasoline",
    "Energy and Power", "Wind Power", "Solar Energy", "Fuel Efficiency", "Energy Efficiency",
    "Electric and Hybrid Vehicles",

    # Biodiversity
    "Reefs", "Coral",
    # NOTE(review): these two tags are upper-case while every other tag is title-case, and the
    # match against article keywords is an exact, case-sensitive comparison -- confirm the spelling
    # against the downloaded keywords.
    "ARCTIC REGIONS", "ANTARCTIC REGIONS",

    # Climate related government agencies/IOs
    "United Nations Framework Convention on Climate Change", "Intergovernmental Panel on Climate Change",
    "World Meteorological Organization", "United Nations Environment Program",
    "Environmental Protection Agency", "National Oceanic and Atmospheric Administration",
    "National Weather Service", "Federal Emergency Management Agency",

    # Environmental policy
    "Clean Air Act", "Clean Water Act"

    # Currently not used: "Fish and Other Marine Life", "Biodiversity"
]

In [5]:
### Define functions that:

## a) Keep only 'news articles' that are listed under the 'Climate' or 'Science' section
##    (and that do not come from the Obits/Photo/Video/NYTNow news desks)
def filter_by_type_and_section(df = None):
    """Build a boolean mask selecting news articles from the Climate/Science sections.

    A row is selected when its 'type_of_material' is 'News', its 'section' is
    'Climate' or 'Science', and its 'news_desk' is not one of
    'Obits', 'Photo', 'Video' or 'NYTNow'.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame to filter; it must contain the columns 'type_of_material',
        'section' and 'news_desk'. When omitted, the notebook-level
        frame `data` is used.

    Returns
    -------
    pandas.Series
        Boolean mask with one entry per row of the frame.
    """
    frame = data if df is None else df

    is_news = frame['type_of_material'] == 'News'
    in_section = frame['section'].isin(['Climate', 'Science'])
    from_excluded_desk = frame['news_desk'].isin(['Obits', 'Photo', 'Video', 'NYTNow'])

    return is_news & in_section & ~from_excluded_desk


## b) Search for results that mention "climate change" / "climate-change" in the metadata
def filter_by_keyword(metadata, df = None):
    """Flag the rows whose text in one column mentions climate change.

    The match is case-insensitive and the separator between the two words is
    optional, so "climate change", "climate-change" and "climatechange" all match.
    Missing values count as "no match".

    Parameters
    ----------
    metadata : str
        Name of the text column to search (e.g. "title", "abstract" or "url").
    df : pandas.DataFrame, optional
        Frame to search. When omitted, the notebook-level frame `data` is used.

    Returns
    -------
    pandas.Series
        Boolean mask with one entry per row of the frame.
    """
    frame = data if df is None else df
    return frame[metadata].str.contains(r"climate[- ]?change", case = False, na = False)

In [6]:
### Construct search conditions for locating articles:

### a) with 'climate change' in their title/abstract/URL; or
topic_explicit = filter_by_keyword("title") | filter_by_keyword("abstract") | filter_by_keyword("url")

### b) that do NOT mention 'climate change' in any of those fields, but are tagged with
###    at least one climate change related keyword.
###    "In none of the fields" is the negation of the whole OR above (~A & ~B & ~C);
###    negating each field separately and OR-ing them (~A | ~B | ~C) would instead mean
###    "not in all three fields at once" and would overlap with condition a).
has_climate_keyword = data['keywords'].apply(
    lambda tags: any(keyword in tags for keyword in climate_change_keywords)
)
topic_implicit = ~topic_explicit & has_climate_keyword

In [7]:
### Keep only the climate change related news articles (published in 2018-2022)
### and drop the columns describing material type, news desk and tagged keywords

# Row selector: right type/section AND (explicit mention OR climate related tag)
search_query = filter_by_type_and_section() & (topic_explicit | topic_implicit)

# Select rows and columns in one step, then renumber the rows from 0
NYT = data.loc[
    search_query,
    ['title', 'abstract', 'date', 'section', 'url']
].reset_index(drop = True)

In [8]:
### Print the number of rows and the first 5 rows of the dataset

def print_rows(data, source, period = "2018-2022"):
    """Print how many articles a dataset holds and return its first rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset with one row per article.
    source : str
        Name of the news outlet, used as the prefix of the printed line.
    period : str, optional
        Publication period mentioned in the printed line (default "2018-2022").

    Returns
    -------
    pandas.DataFrame
        The first 5 rows of `data` (`data.head()`).
    """
    print(f"{source}: {data.shape[0]} climate-related articles published in {period}")
    return data.head()

# Summarise the filtered NYT dataset; the returned preview is displayed as the cell output
print_rows(data = NYT, source = "The New York Times")

The New York Times: 2327 climate-related articles published in 2018-2022


Unnamed: 0,title,abstract,date,section,url
0,Three New Year’s Resolutions That Can Help Fig...,Here are three things you can do to help reduc...,2018-01-03,Climate,https://www.nytimes.com/2018/01/03/climate/cli...
1,Why So Cold? Climate Change May Be Part of the...,Studies suggest that one factor could be warmi...,2018-01-03,Climate,https://www.nytimes.com/2018/01/03/climate/col...
2,Trump Moves to Open Nearly All Offshore Waters...,The proposal would give the energy industry ac...,2018-01-04,Climate,https://www.nytimes.com/2018/01/04/climate/tru...
3,Global Warming’s Toll on Coral Reefs: As if Th...,"Mass bleaching of coral reefs, once virtually ...",2018-01-04,Climate,https://www.nytimes.com/2018/01/04/climate/cor...
4,2017 Set a Record for Losses From Natural Disa...,"Hurricanes, fires and floods drove insurance p...",2018-01-04,Climate,https://www.nytimes.com/2018/01/04/climate/los...


### III. Store the data as a csv file

In [9]:
# `to_csv` does not create missing folders, so make sure the output folder exists first
os.makedirs("./1_Data", exist_ok = True)

# Write the filtered articles without the row index
NYT.to_csv("./1_Data/NYT.csv", index = False)