## Pulling Articles from the NYT API Using the Following Topics & Issues:

##### Politics: Partisan Divide, Foreign Policy, Elections, Immigration Policy, Education Policy
##### Environmental: Sustainability, Climate Change, Air Pollution, Recycling, Green Tech, Carbon Emissions
##### Education: Education Inequality, Remote Learning, Diversity Education, School Funding, Student Loan Debt



##### Subtopic 1: 
##### Politics: Partisan Divide, Foreign Policy, Elections, Immigration Policy, Education Policy

##### Date Range: March 1, 2025 - March 31, 2025
##### Total Articles Pulled: 500

In [25]:
import requests
import os
import json
import time
from datetime import datetime

# Your NYT API key.
# Prefer supplying the key through the NYT_API_KEY environment variable so it
# does not have to live in the notebook. The literal below is kept only as a
# fallback so existing runs keep working.
# NOTE(review): this key is visible in source -- consider rotating it and
# removing the fallback.
api_key = os.environ.get('NYT_API_KEY', 'DCoMD6VlV3W3VwdAryHp2n93HQwQWpx5')

# Define the base URL for the NYT API search endpoint
url = "https://api.nytimes.com/svc/search/v2/articlesearch.json"

# Politics topics to search (each entry is sent as one 'q' query)
politics_topics = [
    'Partisan', 'Foreign', 'Elections', 'Immigration', 'Education'
]

# Request parameters shared by every call; 'q' and 'page' are overwritten
# per request by fetch_articles_with_backoff.
params = {
    'api-key': api_key,
    'begin_date': '20250301',  # Start date (format: YYYYMMDD)
    'end_date': '20250331',    # End date (format: YYYYMMDD)
    'page': 0,                 # Start at page 0
    'sort': 'newest'           # Sort by newest articles
}

# Pagination state. These are module-level because the fetch function reads
# and updates them through `global`.
current_page = 0
total_pages = 1
all_articles = []
max_pages = 10  # Limit to 10 pages per topic
total_politics_articles = 0  # Running total of articles across all topics

# Create the output directory if it doesn't exist
output_dir = 'Output_NYT/Politics'
os.makedirs(output_dir, exist_ok=True)

# Controlled delay to avoid overloading the system
def controlled_delay(delay=12):
    """Announce and then sleep for ``delay`` seconds.

    Args:
        delay: Number of seconds to pause. Defaults to 12, the fixed spacing
            this script uses between requests (presumably chosen to stay
            under the API's per-minute request limit -- confirm against the
            NYT developer documentation).

    Returns:
        None.
    """
    print(f"Waiting for {delay} seconds before the next request...")
    time.sleep(delay)

# Function to fetch articles with rate limit handling
def fetch_articles_with_backoff(query):
    """Page through search results for ``query`` and collect the docs.

    Starting at the module-level ``current_page``, requests successive result
    pages and appends each page's ``docs`` to the module-level
    ``all_articles`` list. Paging stops when the reported hit count is
    exhausted, when ``max_pages`` pages have been fetched, when a retryable
    status (429/503) has been seen too many times in a row, or on any other
    non-200 status.

    Args:
        query: Search term sent as the ``q`` request parameter.

    Returns:
        None. Results accumulate in the global ``all_articles``; the globals
        ``current_page`` and ``total_pages`` are updated as pages arrive.

    Note:
        Exceptions raised by ``requests.get`` (including the timeout) are not
        caught here and propagate to the caller.
    """
    global current_page, total_pages, all_articles  # Pagination state shared with the caller

    # Discard the page count left over from a previous query. Without this, a
    # prior query that reported 0 hits leaves total_pages == 0 and the loop
    # below would never issue a request for this query.
    total_pages = current_page + 1
    retry_count = 0  # Consecutive retryable failures (429 and 503 share it)

    # Retryable statuses -> (log label, maximum attempts before giving up)
    retry_policy = {
        429: ("Too many requests (429)", 5),
        503: ("Service unavailable (503)", 3),
    }

    while current_page < total_pages and current_page < max_pages:
        print(f"Fetching page {current_page + 1} for query: {query}...")
        params['q'] = query
        params['page'] = current_page
        # A timeout keeps a stalled connection from hanging the run forever.
        response = requests.get(url, params=params, timeout=30)
        status = response.status_code

        if status == 200:
            body = response.json().get('response', {})
            # `or []` tolerates a present-but-null docs field.
            all_articles.extend(body.get('docs') or [])
            # Recompute the page count from the reported hits (10 per page).
            total_hits = body.get('meta', {}).get('hits', 1)
            total_pages = (total_hits + 9) // 10
            current_page += 1
            retry_count = 0  # A success ends the current failure streak
            controlled_delay()  # Add a 12-second delay between requests
        elif status in retry_policy:
            label, max_retries = retry_policy[status]
            print(f"{label}. Retrying in 12 seconds...")
            time.sleep(12)  # Sleep for 12 seconds to respect rate limit
            retry_count += 1

            if retry_count >= max_retries:
                print(f"Max retries reached for {query}. Exiting.")
                break
        else:
            print(f"Error: {status}. Exiting.")
            break

# Loop through the politics topics
for topic in politics_topics:
    # Reset ALL pagination state for this topic. total_pages must be reset as
    # well: it still holds the previous topic's page count, and if that was 0
    # (no hits) the fetch loop would not run at all for this topic.
    all_articles = []
    current_page = 0
    total_pages = 1
    fetch_articles_with_backoff(topic)

    # Save all articles to a JSON file for the topic
    # (file name is the lower-cased topic with spaces replaced by underscores)
    output_file = os.path.join(output_dir, f'{topic.replace(" ", "_").lower()}_articles.json')
    with open(output_file, 'w', encoding='utf-8') as file:
        json.dump(all_articles, file, ensure_ascii=False, indent=4)

    # Display the total number of articles retrieved for the topic
    print(f"Total number of articles retrieved for '{topic}': {len(all_articles)}")

    # Add to the total count of articles across all Politics topics
    total_politics_articles += len(all_articles)

    # Controlled delay between topics (optional if you want to slow down topic-based requests)
    controlled_delay()

# Display the total number of articles for all Politics topics
print(f"\nTotal number of articles retrieved across all Politics topics: {total_politics_articles}")


Fetching page 1 for query: Partisan...
Waiting for 12 seconds before the next request...
Fetching page 2 for query: Partisan...
Waiting for 12 seconds before the next request...
Fetching page 3 for query: Partisan...
Waiting for 12 seconds before the next request...
Fetching page 4 for query: Partisan...
Waiting for 12 seconds before the next request...
Fetching page 5 for query: Partisan...
Waiting for 12 seconds before the next request...
Fetching page 6 for query: Partisan...
Waiting for 12 seconds before the next request...
Fetching page 7 for query: Partisan...
Waiting for 12 seconds before the next request...
Fetching page 8 for query: Partisan...
Waiting for 12 seconds before the next request...
Fetching page 9 for query: Partisan...
Waiting for 12 seconds before the next request...
Fetching page 10 for query: Partisan...
Waiting for 12 seconds before the next request...
Total number of articles retrieved for 'Partisan': 100
Waiting for 12 seconds before the next request...
Fetc

##### Subtopic 2: 
##### Environmental: Sustainability, Climate Change, Air Pollution, Recycling, Green Tech, Carbon Emissions

##### Date Range: March 1, 2025 - March 31, 2025
##### Total Articles Pulled: 328

In [34]:
import requests
import os
import json
import time

# Your NYT API key.
# Prefer supplying the key through the NYT_API_KEY environment variable so it
# does not have to live in the notebook. The literal below is kept only as a
# fallback so existing runs keep working.
# NOTE(review): this key is visible in source -- consider rotating it and
# removing the fallback.
api_key = os.environ.get('NYT_API_KEY', 'DCoMD6VlV3W3VwdAryHp2n93HQwQWpx5')

# Define the base URL for the NYT API search endpoint
url = "https://api.nytimes.com/svc/search/v2/articlesearch.json"

# Environmental topics to search (each entry is sent as one 'q' query)
environmental_topics = [
    'Sustainability', 'Climate Change', 'Air Pollution', 'Recycling', 'Green Tech', 'Carbon Emissions'
]

# Request parameters shared by every call; 'q' and 'page' are overwritten
# per request by fetch_articles_with_backoff.
params = {
    'api-key': api_key,
    'begin_date': '20250301',  # Start date (format: YYYYMMDD)
    'end_date': '20250331',    # End date (format: YYYYMMDD)
    'page': 0,                 # Start at page 0
    'sort': 'newest'           # Sort by newest articles
}

# Pagination state. These are module-level because the fetch function reads
# and updates them through `global`.
current_page = 0
total_pages = 1
all_articles = []
max_pages = 10  # Limit to 10 pages per topic
total_environmental_articles = 0  # Running total of articles across all topics

# Create the output directory if it doesn't exist
output_dir = 'Output_NYT/Environmental'
os.makedirs(output_dir, exist_ok=True)

# Controlled delay to avoid overloading the system
def controlled_delay(delay=12):
    """Print a notice, then block for ``delay`` seconds.

    Args:
        delay: Pause length in seconds. Defaults to 12, the fixed spacing
            this script uses between requests (presumably chosen to stay
            under the API's per-minute request limit -- confirm against the
            NYT developer documentation).

    Returns:
        None.
    """
    print(f"Waiting for {delay} seconds before the next request...")
    time.sleep(delay)

# Function to fetch articles with rate limit handling
def fetch_articles_with_backoff(query):
    """Page through search results for ``query`` and collect the docs.

    Starting at the module-level ``current_page``, requests successive result
    pages and appends each page's ``docs`` to the module-level
    ``all_articles`` list. Paging stops when the reported hit count is
    exhausted, when ``max_pages`` pages have been fetched, when a retryable
    status (429/503) has been seen too many times in a row, or on any other
    non-200 status.

    Args:
        query: Search term sent as the ``q`` request parameter.

    Returns:
        None. Results accumulate in the global ``all_articles``; the globals
        ``current_page`` and ``total_pages`` are updated as pages arrive.

    Note:
        Exceptions raised by ``requests.get`` (including the timeout) are not
        caught here and propagate to the caller.
    """
    global current_page, total_pages, all_articles  # Pagination state shared with the caller

    # Discard the page count left over from a previous query. Without this, a
    # prior query that reported 0 hits leaves total_pages == 0 and the loop
    # below would never issue a request for this query.
    total_pages = current_page + 1
    retry_count = 0  # Consecutive retryable failures (429 and 503 share it)

    # Retryable statuses -> (log label, maximum attempts before giving up)
    retry_policy = {
        429: ("Too many requests (429)", 5),
        503: ("Service unavailable (503)", 3),
    }

    while current_page < total_pages and current_page < max_pages:
        print(f"Fetching page {current_page + 1} for query: {query}...")
        params['q'] = query
        params['page'] = current_page
        # A timeout keeps a stalled connection from hanging the run forever.
        response = requests.get(url, params=params, timeout=30)
        status = response.status_code

        if status == 200:
            body = response.json().get('response', {})
            # `or []` tolerates a present-but-null docs field.
            all_articles.extend(body.get('docs') or [])
            # Recompute the page count from the reported hits (10 per page).
            total_hits = body.get('meta', {}).get('hits', 1)
            total_pages = (total_hits + 9) // 10
            current_page += 1
            retry_count = 0  # A success ends the current failure streak
            controlled_delay()  # Add a 12-second delay between requests
        elif status in retry_policy:
            label, max_retries = retry_policy[status]
            print(f"{label}. Retrying in 12 seconds...")
            time.sleep(12)  # Sleep for 12 seconds to respect rate limit
            retry_count += 1

            if retry_count >= max_retries:
                print(f"Max retries reached for {query}. Exiting.")
                break
        else:
            print(f"Error: {status}. Exiting.")
            break

# Loop through the environmental topics
for topic in environmental_topics:
    # Reset ALL pagination state for this topic. total_pages must be reset as
    # well: it still holds the previous topic's page count, and if that was 0
    # (no hits) the fetch loop would not run at all for this topic.
    all_articles = []
    current_page = 0
    total_pages = 1
    fetch_articles_with_backoff(topic)

    # Save all articles to a JSON file for the topic
    # (file name is the lower-cased topic with spaces replaced by underscores)
    output_file = os.path.join(output_dir, f'{topic.replace(" ", "_").lower()}_articles.json')
    with open(output_file, 'w', encoding='utf-8') as file:
        json.dump(all_articles, file, ensure_ascii=False, indent=4)

    # Display the total number of articles retrieved for the topic
    print(f"Total number of articles retrieved for '{topic}': {len(all_articles)}")

    # Add to the total count of articles across all environmental topics
    total_environmental_articles += len(all_articles)

    # Controlled delay between topics (optional if you want to slow down topic-based requests)
    controlled_delay()

# Display the total number of articles for all Environmental topics
print(f"\nTotal number of articles retrieved across all Environmental topics: {total_environmental_articles}")


Fetching page 1 for query: Sustainability...
Waiting for 12 seconds before the next request...
Fetching page 2 for query: Sustainability...
Waiting for 12 seconds before the next request...
Fetching page 3 for query: Sustainability...
Waiting for 12 seconds before the next request...
Fetching page 4 for query: Sustainability...
Waiting for 12 seconds before the next request...
Fetching page 5 for query: Sustainability...
Waiting for 12 seconds before the next request...
Fetching page 6 for query: Sustainability...
Waiting for 12 seconds before the next request...
Fetching page 7 for query: Sustainability...
Waiting for 12 seconds before the next request...
Fetching page 8 for query: Sustainability...
Waiting for 12 seconds before the next request...
Fetching page 9 for query: Sustainability...
Waiting for 12 seconds before the next request...
Fetching page 10 for query: Sustainability...
Waiting for 12 seconds before the next request...
Total number of articles retrieved for 'Sustainab

##### Subtopic 3: 
##### Education: Education Inequality, Remote Learning, Diversity Education, School Funding, Student Loan Debt

##### Date Range: March 1, 2025 - March 31, 2025
##### Total Articles Pulled: 262

In [36]:
import requests
import os
import json
import time

# Your NYT API key.
# Prefer supplying the key through the NYT_API_KEY environment variable so it
# does not have to live in the notebook. The literal below is kept only as a
# fallback so existing runs keep working.
# NOTE(review): this key is visible in source -- consider rotating it and
# removing the fallback.
api_key = os.environ.get('NYT_API_KEY', 'DCoMD6VlV3W3VwdAryHp2n93HQwQWpx5')

# Define the base URL for the NYT API search endpoint
url = "https://api.nytimes.com/svc/search/v2/articlesearch.json"

# Education topics to search (each entry is sent as one 'q' query)
education_topics = [
    'Education Inequality', 'Remote Learning', 'Diversity Education', 'School Funding', 'Student Loan Debt'
]

# Request parameters shared by every call; 'q' and 'page' are overwritten
# per request by fetch_articles_with_backoff.
params = {
    'api-key': api_key,
    'begin_date': '20250301',  # Start date (format: YYYYMMDD)
    'end_date': '20250331',    # End date (format: YYYYMMDD)
    'page': 0,                 # Start at page 0
    'sort': 'newest'           # Sort by newest articles
}

# Pagination state. These are module-level because the fetch function reads
# and updates them through `global`.
current_page = 0
total_pages = 1
all_articles = []
max_pages = 10  # Limit to 10 pages per topic
total_education_articles = 0  # Running total of articles across all topics

# Create the output directory if it doesn't exist
output_dir = 'Output_NYT/Education'
os.makedirs(output_dir, exist_ok=True)

# Controlled delay to avoid overloading the system
def controlled_delay(delay=12):
    """Log the upcoming pause and sleep for ``delay`` seconds.

    Args:
        delay: How long to wait, in seconds. Defaults to 12, the fixed
            spacing this script uses between requests (presumably chosen to
            stay under the API's per-minute request limit -- confirm against
            the NYT developer documentation).

    Returns:
        None.
    """
    print(f"Waiting for {delay} seconds before the next request...")
    time.sleep(delay)

# Function to fetch articles with rate limit handling
def fetch_articles_with_backoff(query):
    """Page through search results for ``query`` and collect the docs.

    Starting at the module-level ``current_page``, requests successive result
    pages and appends each page's ``docs`` to the module-level
    ``all_articles`` list. Paging stops when the reported hit count is
    exhausted, when ``max_pages`` pages have been fetched, when a retryable
    status (429/503) has been seen too many times in a row, or on any other
    non-200 status.

    Args:
        query: Search term sent as the ``q`` request parameter.

    Returns:
        None. Results accumulate in the global ``all_articles``; the globals
        ``current_page`` and ``total_pages`` are updated as pages arrive.

    Note:
        Exceptions raised by ``requests.get`` (including the timeout) are not
        caught here and propagate to the caller.
    """
    global current_page, total_pages, all_articles  # Pagination state shared with the caller

    # Discard the page count left over from a previous query. Without this, a
    # prior query that reported 0 hits leaves total_pages == 0 and the loop
    # below would never issue a request for this query.
    total_pages = current_page + 1
    retry_count = 0  # Consecutive retryable failures (429 and 503 share it)

    # Retryable statuses -> (log label, maximum attempts before giving up)
    retry_policy = {
        429: ("Too many requests (429)", 5),
        503: ("Service unavailable (503)", 3),
    }

    while current_page < total_pages and current_page < max_pages:
        print(f"Fetching page {current_page + 1} for query: {query}...")
        params['q'] = query
        params['page'] = current_page
        # A timeout keeps a stalled connection from hanging the run forever.
        response = requests.get(url, params=params, timeout=30)
        status = response.status_code

        if status == 200:
            body = response.json().get('response', {})
            # `or []` tolerates a present-but-null docs field.
            all_articles.extend(body.get('docs') or [])
            # Recompute the page count from the reported hits (10 per page).
            total_hits = body.get('meta', {}).get('hits', 1)
            total_pages = (total_hits + 9) // 10
            current_page += 1
            retry_count = 0  # A success ends the current failure streak
            controlled_delay()  # Add a 12-second delay between requests
        elif status in retry_policy:
            label, max_retries = retry_policy[status]
            print(f"{label}. Retrying in 12 seconds...")
            time.sleep(12)  # Sleep for 12 seconds to respect rate limit
            retry_count += 1

            if retry_count >= max_retries:
                print(f"Max retries reached for {query}. Exiting.")
                break
        else:
            print(f"Error: {status}. Exiting.")
            break

# Loop through the education topics
for topic in education_topics:
    # Reset ALL pagination state for this topic. total_pages must be reset as
    # well: it still holds the previous topic's page count, and if that was 0
    # (no hits) the fetch loop would not run at all for this topic.
    all_articles = []
    current_page = 0
    total_pages = 1
    fetch_articles_with_backoff(topic)

    # Save all articles to a JSON file for the topic
    # (file name is the lower-cased topic with spaces replaced by underscores)
    output_file = os.path.join(output_dir, f'{topic.replace(" ", "_").lower()}_articles.json')
    with open(output_file, 'w', encoding='utf-8') as file:
        json.dump(all_articles, file, ensure_ascii=False, indent=4)

    # Display the total number of articles retrieved for the topic
    print(f"Total number of articles retrieved for '{topic}': {len(all_articles)}")

    # Add to the total count of articles across all education topics
    total_education_articles += len(all_articles)

    # Controlled delay between topics (optional if you want to slow down topic-based requests)
    controlled_delay()

# Display the total number of articles for all Education topics
print(f"\nTotal number of articles retrieved across all Education topics: {total_education_articles}")


Fetching page 1 for query: Education Inequality...
Waiting for 12 seconds before the next request...
Fetching page 2 for query: Education Inequality...
Waiting for 12 seconds before the next request...
Fetching page 3 for query: Education Inequality...
Waiting for 12 seconds before the next request...
Total number of articles retrieved for 'Education Inequality': 26
Waiting for 12 seconds before the next request...
Fetching page 1 for query: Remote Learning...
Waiting for 12 seconds before the next request...
Fetching page 2 for query: Remote Learning...
Waiting for 12 seconds before the next request...
Total number of articles retrieved for 'Remote Learning': 19
Waiting for 12 seconds before the next request...
Fetching page 1 for query: Diversity Education...
Waiting for 12 seconds before the next request...
Fetching page 2 for query: Diversity Education...
Waiting for 12 seconds before the next request...
Fetching page 3 for query: Diversity Education...
Waiting for 12 seconds befor