This code goes over news releases from several public-facing agencies and randomly samples some of them. It is quite customizable, but note that each agency needs its own custom code, since every government website has its own bugs and bot-protection issues.

The code below covers news releases from NOAA, the California Air Resources Board (CARB), and the EPA.

# NOAA headlines

Parsing NOAA headlines is relatively easy: https://www.noaa.gov/news

In [16]:
import requests
from bs4 import BeautifulSoup
import random
import csv
import time

# Browser-like User-Agent sent with every NOAA request below.
# NOTE(review): presumably needed so the site does not reject the default
# python-requests agent — confirm.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/114.0.0.0 Safari/537.36"
    )
}

def extract_articles_from_page(page_num):
    """Scrape one page of the NOAA news listing.

    Args:
        page_num: Value substituted into the ``page`` query parameter of
            ``https://www.noaa.gov/news``.

    Returns:
        A list of ``(title, full_url)`` tuples, one for each teaser article
        that has both link text and an ``href``. The list is empty when the
        page contains no matching teasers.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    url = f"https://www.noaa.gov/news?page={page_num}"
    # A timeout keeps one stalled connection from hanging the whole run.
    response = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(response.text, "html.parser")

    results = []
    for article in soup.find_all("article", class_="node-teaser"):
        title_tag = article.find("div", class_="title")
        if not title_tag:
            continue
        link_tag = title_tag.find("a")
        if not link_tag:
            continue
        # .get() yields None for an <a> without href, whereas indexing the
        # tag would raise KeyError and abort the whole page.
        href = link_tag.get("href")
        if not (link_tag.text and href):
            continue
        title = link_tag.text.strip()
        # Site-relative links are made absolute; anything else is kept as-is.
        full_url = "https://www.noaa.gov" + href if href.startswith("/") else href
        results.append((title, full_url))
    return results

# Step 1: Scrape up to 30 randomly chosen listing pages.
# Page numbers are drawn from 0 .. max_pages - 1.
max_pages = 71
pages_to_sample = 30
# min() keeps random.sample from raising if max_pages is lowered below 30.
random_pages = random.sample(range(max_pages), min(pages_to_sample, max_pages))

collected = set()  # a set drops headlines that appear on more than one page
for page_num in random_pages:
    try:
        collected.update(extract_articles_from_page(page_num))
    except requests.RequestException as exc:
        # One unreachable page should not throw away everything gathered so far.
        print(f"Skipping NOAA page {page_num}: {exc}")
    time.sleep(0.2)  # be nice to NOAA's servers

# Step 2: Sample up to 50 of the collected headlines
all_articles = list(collected)
sampled = random.sample(all_articles, min(50, len(all_articles)))

# Step 3: Save to CSV (no header row; columns are source, headline, url)
with open("noaa_headlines.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    for title, url in sampled:
        writer.writerow(["NOAA", title, url])

# Report the number actually written, which can be fewer than 50.
print(f" Scraped {len(all_articles)} total headlines, saved {len(sampled)} random ones to 'noaa_headlines.csv'")

✅ Scraped 300 total headlines, saved 50 random ones to 'noaa_headlines.csv'


# Cal Air board headlines

Note: The Cal Air Board website is terribly buggy, and most news releases from page 2 onwards are not accessible.

In [30]:
import requests
from bs4 import BeautifulSoup
import csv

BASE_URL = "https://ww2.arb.ca.gov"
URL = f"{BASE_URL}/news"
# Browser-like User-Agent sent with the request below.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/114.0.0.0 Safari/537.36"
    )
}

# Fetch and parse the first news page.
# The timeout prevents an indefinite hang, and raise_for_status() stops the
# run on an HTTP error instead of silently writing an empty CSV.
r = requests.get(URL, headers=HEADERS, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.text, "html.parser")

# Inspect and update the selector if needed
links = soup.select("a[href^='/news/']")

results = []
for tag in links:
    title = tag.text.strip()
    href = tag.get("href")
    # Links with no visible text are skipped.
    if title and href and href.startswith("/news/"):
        results.append((title, BASE_URL + href))

# Deduplicate while keeping the order in which links appear on the page
# (a plain set() would scramble the rows from run to run).
unique_results = list(dict.fromkeys(results))

# Save to CSV (no header row; columns are source, headline, url)
with open("carb_headlines.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    for title, url in unique_results:
        writer.writerow(["CARB", title, url])

print(f" Saved {len(unique_results)} headlines from CARB front page to 'carb_headlines.csv'")

✅ Saved 21 headlines from CARB front page to 'carb_headlines.csv'


# EPA headlines

Note: EPA has bot protection, so we cannot fetch and parse its pages directly. Instead, we have to save the pages locally first and then parse the headlines from the saved files.

First, save 10 random pages locally.

In [50]:
import random
import time
import os
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

NUM_PAGES = 10   # how many result pages to save
MIN_PAGE = 31    # lowest page number that can be drawn (inclusive)
MAX_PAGE = 100   # upper bound, exclusive: pages are drawn from range(MIN_PAGE, MAX_PAGE); adjust if needed
BASE_URL = "https://www.epa.gov/newsreleases/search?page="
SAVE_DIR = "epa_pages"

os.makedirs(SAVE_DIR, exist_ok=True)

options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
driver = webdriver.Chrome(options=options)

page_numbers = random.sample(range(MIN_PAGE, MAX_PAGE), NUM_PAGES)

# try/finally guarantees the browser process is shut down even if a page
# load or a file write fails part-way through.
try:
    for page_num in page_numbers:
        driver.get(BASE_URL + str(page_num))
        time.sleep(3)  # fixed pause before reading the page source
        html = driver.page_source
        out_path = os.path.join(SAVE_DIR, f"epa_page{page_num}.html")
        with open(out_path, "w", encoding="utf-8") as f:
            f.write(html)
finally:
    driver.quit()

print("Saved HTML pages to:", SAVE_DIR)

✅ Saved HTML pages to: epa_pages


Next, parse the saved pages and save 50 random headlines.

In [53]:
import os
import random
import csv
from bs4 import BeautifulSoup

# Set to your local directory
SAVE_DIR = "epa_pages"

# Find all saved EPA result pages in that folder
epa_files = [
    os.path.join(SAVE_DIR, name)
    for name in os.listdir(SAVE_DIR)
    if name.startswith("epa_page") and name.endswith(".html")
]

headlines_data = []

# Parse each file for headlines and URLs
for filepath in epa_files:
    with open(filepath, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f.read(), "html.parser")
    for tag in soup.select("h3 a"):
        link = tag.get("href")
        if not link:
            # An <a> without href would make link None and crash on
            # .startswith(); there is no URL to record, so skip it.
            continue
        headline = tag.get_text(strip=True)
        # Site-relative links are made absolute; anything else is kept as-is.
        full_url = "https://www.epa.gov" + link if link.startswith("/") else link
        headlines_data.append(["EPA", headline, full_url])

# Randomly select up to 50
sampled = random.sample(headlines_data, min(50, len(headlines_data)))

# Save to CSV (no header row; columns are source, headline, url)
with open("epa_headlines.csv", "w", newline='', encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerows(sampled)

print("Done: Saved epa_headlines.csv")

Done: Saved epa_headlines.csv
