## Pulling Articles from News API (Fox News) using the following Topics & Issues:

##### Politics: Partisan Divide, Foreign Policy, Elections, Immigration Policy, Education Policy
##### Environmental: Sustainability, Climate Change, Air Pollution, Recycling, Green Tech, Carbon Emission
##### Education: Education Inequality, Remote Learning, Diversity Education, School Funding, Student Loan Debt


##### Subtopic 1: 
##### Politics: Partisan Divide, Foreign Policy, Elections, Immigration Policy, Education Policy

##### Date Range: March 1, 2025 - March 31, 2025
##### Total Articles Pulled: 

In [23]:
import requests
import os
import json
import time
from datetime import datetime, timedelta

# Your News API key.
# Prefer supplying it through the NEWS_API_KEY environment variable so the
# credential does not have to live in the notebook; the literal is only a
# fallback that keeps existing runs working.
# NOTE(review): this key is stored in plain text here - consider rotating it.
API_KEY = os.environ.get('NEWS_API_KEY') or '726a4b91967a492cb942c4b2ba03b030'

# Define the base URL for News API
BASE_URL = 'https://newsapi.org/v2/everything'

# Calculate the date range: the 25 days ending today.
# start_date is derived from end_date (rather than from a second
# datetime.today() call) so both bounds always refer to the same "today",
# even if the cell happens to run across midnight.
# NOTE(review): the notebook heading states March 1 - March 31, 2025, but this
# window is relative to the run date - confirm which one is intended.
end_date = datetime.today().strftime('%Y-%m-%d')  # Today
start_date = (
    datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=25)
).strftime('%Y-%m-%d')  # 25 days ago

# Set up headers for requests (if needed)
headers = {
    'User-Agent': 'Mozilla/5.0',  # Optional: Can help in some cases to mimic a browser request
}

# Helper: extract the error detail from a failed News API response
def _describe_api_error(response):
    """Return "code: message" from a JSON error body, or '' if unavailable."""
    try:
        body = response.json()
    except ValueError:
        # Body was not JSON (for example an HTML error page)
        return ''
    if not isinstance(body, dict):
        return ''
    return ': '.join(str(body[key]) for key in ('code', 'message') if body.get(key))


# Function to fetch articles with exponential backoff
def fetch_articles_with_backoff(keyword, max_pages=5, page_size=50, max_retries=3, timeout=30):
    """Fetch Fox News articles matching *keyword* from the News API, page by page.

    Pages are requested in order until a short page arrives, the reported
    ``totalResults`` has been collected, ``max_pages`` is reached, or an error
    stops the loop. HTTP 503 responses and network-level failures are retried
    with exponential backoff (1s, 2s, 4s, ...).

    Args:
        keyword: Search phrase sent as the ``q`` parameter.
        max_pages: Highest page number that will be requested.
        page_size: Number of articles requested per page.
        max_retries: Consecutive failed attempts allowed for one page before
            giving up.
        timeout: Seconds to wait for the server before treating the attempt as
            failed; without it a stalled connection would block indefinitely.

    Returns:
        A tuple ``(all_articles, total_articles)``: the list of article dicts
        collected so far, and the ``totalResults`` value from the last
        successful response (0 if no request succeeded). ``total_articles``
        can be larger than ``len(all_articles)`` when fetching stopped early.
    """
    current_page = 1
    total_articles = 0
    all_articles = []
    backoff_time = 1  # Seconds to wait before the next retry; doubles per failure
    retry_count = 0  # Consecutive failed attempts for the current page

    while current_page <= max_pages:
        print(f"Fetching page {current_page} for query: {keyword}...")
        params = {
            'apiKey': API_KEY,
            'q': keyword,  # Keyword to filter articles
            'from': start_date,  # Start date (YYYY-MM-DD)
            'to': end_date,  # End date (YYYY-MM-DD)
            'pageSize': page_size,  # Articles per page
            'language': 'en',  # Language of the articles
            'sortBy': 'publishedAt',  # Sort by the most recent articles
            'page': current_page,  # Pagination
            'sources': 'fox-news'  # Specific source: Fox News
        }

        try:
            response = requests.get(BASE_URL, params=params, headers=headers, timeout=timeout)
        except requests.exceptions.RequestException as exc:
            # Only the exception type is reported: the full message can embed
            # the request URL, which carries the API key in its query string.
            response = None
            failure = f"Request failed ({type(exc).__name__})"
        else:
            failure = "Service unavailable (503)" if response.status_code == 503 else None

        if failure is not None:
            # Transient failure: retry the same page unless the budget is spent.
            retry_count += 1
            if retry_count >= max_retries:
                # Give up right away instead of sleeping before an exit.
                print(f"{failure}. Max retries reached for {keyword}. Exiting.")
                break
            print(f"{failure}. Retrying in {backoff_time} seconds...")
            time.sleep(backoff_time)
            backoff_time *= 2  # Exponentially increase the backoff time
            continue

        if response.status_code == 200:
            data = response.json()
            articles = data.get('articles') or []
            all_articles.extend(articles)
            total_articles = data.get('totalResults', 0)

            # Stop on a short page, or once everything the API reports has
            # been collected, so no request is spent on a page past the end.
            collected_everything = 0 < total_articles <= len(all_articles)
            if len(articles) < page_size or collected_everything:
                print(f"Reached the last page for {keyword}. Total articles: {total_articles}")
                break

            current_page += 1
            backoff_time = 1  # Reset backoff time after a successful request
            retry_count = 0  # Reset retry count after a successful request

        elif response.status_code == 426:
            print(f"Error: 426 Upgrade Required. Please check the API version or protocol.")
            # NOTE(review): presumably a plan-level cap on how many results can
            # be paged through - the API's own message below should confirm.
            detail = _describe_api_error(response)
            if detail:
                print(f"API response: {detail}")
            break
        else:
            print(f"Error: {response.status_code}. Exiting.")
            detail = _describe_api_error(response)
            if detail:
                print(f"API response: {detail}")
            break

    return all_articles, total_articles

# Function to save articles to a JSON file
def save_articles_to_json(topic, articles):
    """Write *articles* as indented JSON into the Output_NewsApi folder.

    The file name is the topic with spaces replaced by underscores,
    lower-cased, followed by "_articles.json". The folder is created when it
    is missing, an existing file is overwritten, and a one-line summary is
    printed afterwards.
    """
    target_dir = 'Output_NewsApi'
    slug = "_".join(topic.split(" ")).lower()
    target_path = os.path.join(target_dir, slug + '_articles.json')

    os.makedirs(target_dir, exist_ok=True)
    with open(target_path, 'w', encoding='utf-8') as handle:
        # ensure_ascii=False keeps non-ASCII characters readable in the file
        json.dump(articles, handle, ensure_ascii=False, indent=4)

    print(f"Saved {len(articles)} articles to {target_path}")

# Politics search phrases; each one is sent to the fetch helper as its own query
politics_subtopics = [
    "Partisan Divide",
    "Foreign Policy",
    "Elections",
    "Immigration Policy",
    "Education Policy",
]

# Running sum of the per-subtopic totals returned by the fetch helper
total_politics_articles = 0

# Fetch, save and report every subtopic in turn
for subtopic in politics_subtopics:
    print(f"Fetching articles for: {subtopic}")
    all_articles, total_articles = fetch_articles_with_backoff(subtopic)
    save_articles_to_json(subtopic, all_articles)
    print(f"Total articles for '{subtopic}': {total_articles}")
    # Add this subtopic's total to the Politics-wide figure
    total_politics_articles = total_politics_articles + total_articles

# Report the combined figure for the whole Politics topic
print(f"Total articles for 'Politics' (all subtopics): {total_politics_articles}")


Fetching articles for: Partisan Divide
Fetching page 1 for query: Partisan Divide...
Reached the last page for Partisan Divide. Total articles: 8
Saved 8 articles to Output_NewsApi/partisan_divide_articles.json
Total articles for 'Partisan Divide': 8
Fetching articles for: Foreign Policy
Fetching page 1 for query: Foreign Policy...
Fetching page 2 for query: Foreign Policy...
Fetching page 3 for query: Foreign Policy...
Error: 426 Upgrade Required. Please check the API version or protocol.
Saved 100 articles to Output_NewsApi/foreign_policy_articles.json
Total articles for 'Foreign Policy': 145
Fetching articles for: Elections
Fetching page 1 for query: Elections...
Fetching page 2 for query: Elections...
Fetching page 3 for query: Elections...
Error: 426 Upgrade Required. Please check the API version or protocol.
Saved 100 articles to Output_NewsApi/elections_articles.json
Total articles for 'Elections': 118
Fetching articles for: Immigration Policy
Fetching page 1 for query: Immigrat

##### Subtopic 2: 
##### Environmental: Sustainability, Climate Change, Air Pollution, Recycling, Green Tech, Carbon Emission

##### Date Range: March 1, 2025 - March 31, 2025
##### Total Articles Pulled: 


In [25]:
import requests
import os
import json
import time
from datetime import datetime, timedelta

# Your News API key.
# Prefer supplying it through the NEWS_API_KEY environment variable so the
# credential does not have to live in the notebook; the literal is only a
# fallback that keeps existing runs working.
# NOTE(review): this key is stored in plain text here - consider rotating it.
API_KEY = os.environ.get('NEWS_API_KEY') or '726a4b91967a492cb942c4b2ba03b030'

# Define the base URL for News API
BASE_URL = 'https://newsapi.org/v2/everything'

# Calculate the date range: the 25 days ending today.
# start_date is derived from end_date (rather than from a second
# datetime.today() call) so both bounds always refer to the same "today",
# even if the cell happens to run across midnight.
# NOTE(review): the notebook heading states March 1 - March 31, 2025, but this
# window is relative to the run date - confirm which one is intended.
end_date = datetime.today().strftime('%Y-%m-%d')  # Today
start_date = (
    datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=25)
).strftime('%Y-%m-%d')  # 25 days ago

# Set up headers for requests (if needed)
headers = {
    'User-Agent': 'Mozilla/5.0',  # Optional: Can help in some cases to mimic a browser request
}

# Helper: extract the error detail from a failed News API response
def _describe_api_error(response):
    """Return "code: message" from a JSON error body, or '' if unavailable."""
    try:
        body = response.json()
    except ValueError:
        # Body was not JSON (for example an HTML error page)
        return ''
    if not isinstance(body, dict):
        return ''
    return ': '.join(str(body[key]) for key in ('code', 'message') if body.get(key))


# Function to fetch articles with exponential backoff
def fetch_articles_with_backoff(keyword, max_pages=5, page_size=50, max_retries=3, timeout=30):
    """Fetch Fox News articles matching *keyword* from the News API, page by page.

    Pages are requested in order until a short page arrives, the reported
    ``totalResults`` has been collected, ``max_pages`` is reached, or an error
    stops the loop. HTTP 503 responses and network-level failures are retried
    with exponential backoff (1s, 2s, 4s, ...).

    Args:
        keyword: Search phrase sent as the ``q`` parameter.
        max_pages: Highest page number that will be requested.
        page_size: Number of articles requested per page.
        max_retries: Consecutive failed attempts allowed for one page before
            giving up.
        timeout: Seconds to wait for the server before treating the attempt as
            failed; without it a stalled connection would block indefinitely.

    Returns:
        A tuple ``(all_articles, total_articles)``: the list of article dicts
        collected so far, and the ``totalResults`` value from the last
        successful response (0 if no request succeeded). ``total_articles``
        can be larger than ``len(all_articles)`` when fetching stopped early.
    """
    current_page = 1
    total_articles = 0
    all_articles = []
    backoff_time = 1  # Seconds to wait before the next retry; doubles per failure
    retry_count = 0  # Consecutive failed attempts for the current page

    while current_page <= max_pages:
        print(f"Fetching page {current_page} for query: {keyword}...")
        params = {
            'apiKey': API_KEY,
            'q': keyword,  # Keyword to filter articles
            'from': start_date,  # Start date (YYYY-MM-DD)
            'to': end_date,  # End date (YYYY-MM-DD)
            'pageSize': page_size,  # Articles per page
            'language': 'en',  # Language of the articles
            'sortBy': 'publishedAt',  # Sort by the most recent articles
            'page': current_page,  # Pagination
            'sources': 'fox-news'  # Specific source: Fox News
        }

        try:
            response = requests.get(BASE_URL, params=params, headers=headers, timeout=timeout)
        except requests.exceptions.RequestException as exc:
            # Only the exception type is reported: the full message can embed
            # the request URL, which carries the API key in its query string.
            response = None
            failure = f"Request failed ({type(exc).__name__})"
        else:
            failure = "Service unavailable (503)" if response.status_code == 503 else None

        if failure is not None:
            # Transient failure: retry the same page unless the budget is spent.
            retry_count += 1
            if retry_count >= max_retries:
                # Give up right away instead of sleeping before an exit.
                print(f"{failure}. Max retries reached for {keyword}. Exiting.")
                break
            print(f"{failure}. Retrying in {backoff_time} seconds...")
            time.sleep(backoff_time)
            backoff_time *= 2  # Exponentially increase the backoff time
            continue

        if response.status_code == 200:
            data = response.json()
            articles = data.get('articles') or []
            all_articles.extend(articles)
            total_articles = data.get('totalResults', 0)

            # Stop on a short page, or once everything the API reports has
            # been collected, so no request is spent on a page past the end.
            collected_everything = 0 < total_articles <= len(all_articles)
            if len(articles) < page_size or collected_everything:
                print(f"Reached the last page for {keyword}. Total articles: {total_articles}")
                break

            current_page += 1
            backoff_time = 1  # Reset backoff time after a successful request
            retry_count = 0  # Reset retry count after a successful request

        elif response.status_code == 426:
            print(f"Error: 426 Upgrade Required. Please check the API version or protocol.")
            # NOTE(review): presumably a plan-level cap on how many results can
            # be paged through - the API's own message below should confirm.
            detail = _describe_api_error(response)
            if detail:
                print(f"API response: {detail}")
            break
        else:
            print(f"Error: {response.status_code}. Exiting.")
            detail = _describe_api_error(response)
            if detail:
                print(f"API response: {detail}")
            break

    return all_articles, total_articles

# Function to save articles to a JSON file
def save_articles_to_json(topic, articles):
    """Write *articles* as indented JSON into the Output_NewsApi folder.

    The file name is the topic with spaces replaced by underscores,
    lower-cased, followed by "_articles.json". The folder is created when it
    is missing, an existing file is overwritten, and a one-line summary is
    printed afterwards.
    """
    target_dir = 'Output_NewsApi'
    slug = "_".join(topic.split(" ")).lower()
    target_path = os.path.join(target_dir, slug + '_articles.json')

    os.makedirs(target_dir, exist_ok=True)
    with open(target_path, 'w', encoding='utf-8') as handle:
        # ensure_ascii=False keeps non-ASCII characters readable in the file
        json.dump(articles, handle, ensure_ascii=False, indent=4)

    print(f"Saved {len(articles)} articles to {target_path}")

# Environment search phrases; each one is sent to the fetch helper as its own query
environment_subtopics = [
    "Sustainability",
    "Climate Change",
    "Air Pollution",
    "Recycling",
    "Green Tech",
    "Carbon Emission",
]

# Running sum of the per-subtopic totals returned by the fetch helper
total_environment_articles = 0

# Fetch, save and report every subtopic in turn
for subtopic in environment_subtopics:
    print(f"Fetching articles for: {subtopic}")
    all_articles, total_articles = fetch_articles_with_backoff(subtopic)
    save_articles_to_json(subtopic, all_articles)
    print(f"Total articles for '{subtopic}': {total_articles}")
    # Add this subtopic's total to the Environment-wide figure
    total_environment_articles = total_environment_articles + total_articles

# Report the combined figure for the whole Environment topic
print(f"Total articles for 'Environment' (all subtopics): {total_environment_articles}")


Fetching articles for: Sustainability
Fetching page 1 for query: Sustainability...
Reached the last page for Sustainability. Total articles: 10
Saved 10 articles to Output_NewsApi/sustainability_articles.json
Total articles for 'Sustainability': 10
Fetching articles for: Climate Change
Fetching page 1 for query: Climate Change...
Reached the last page for Climate Change. Total articles: 46
Saved 46 articles to Output_NewsApi/climate_change_articles.json
Total articles for 'Climate Change': 46
Fetching articles for: Air Pollution
Fetching page 1 for query: Air Pollution...
Reached the last page for Air Pollution. Total articles: 8
Saved 8 articles to Output_NewsApi/air_pollution_articles.json
Total articles for 'Air Pollution': 8
Fetching articles for: Recycling
Fetching page 1 for query: Recycling...
Reached the last page for Recycling. Total articles: 2
Saved 2 articles to Output_NewsApi/recycling_articles.json
Total articles for 'Recycling': 2
Fetching articles for: Green Tech
Fetchi

##### Subtopic 3: 
##### Education: Education Inequality, Remote Learning, Diversity Education, School Funding, Student Loan Debt

##### Date Range: March 1, 2025 - March 31, 2025
##### Total Articles Pulled: 

In [27]:
import requests
import os
import json
import time
from datetime import datetime, timedelta

# Your News API key.
# Prefer supplying it through the NEWS_API_KEY environment variable so the
# credential does not have to live in the notebook; the literal is only a
# fallback that keeps existing runs working.
# NOTE(review): this key is stored in plain text here - consider rotating it.
API_KEY = os.environ.get('NEWS_API_KEY') or '726a4b91967a492cb942c4b2ba03b030'

# Define the base URL for News API
BASE_URL = 'https://newsapi.org/v2/everything'

# Calculate the date range: the 25 days ending today.
# start_date is derived from end_date (rather than from a second
# datetime.today() call) so both bounds always refer to the same "today",
# even if the cell happens to run across midnight.
# NOTE(review): the notebook heading states March 1 - March 31, 2025, but this
# window is relative to the run date - confirm which one is intended.
end_date = datetime.today().strftime('%Y-%m-%d')  # Today
start_date = (
    datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=25)
).strftime('%Y-%m-%d')  # 25 days ago

# Set up headers for requests (if needed)
headers = {
    'User-Agent': 'Mozilla/5.0',  # Optional: Can help in some cases to mimic a browser request
}

# Helper: extract the error detail from a failed News API response
def _describe_api_error(response):
    """Return "code: message" from a JSON error body, or '' if unavailable."""
    try:
        body = response.json()
    except ValueError:
        # Body was not JSON (for example an HTML error page)
        return ''
    if not isinstance(body, dict):
        return ''
    return ': '.join(str(body[key]) for key in ('code', 'message') if body.get(key))


# Function to fetch articles with exponential backoff
def fetch_articles_with_backoff(keyword, max_pages=5, page_size=50, max_retries=3, timeout=30):
    """Fetch Fox News articles matching *keyword* from the News API, page by page.

    Pages are requested in order until a short page arrives, the reported
    ``totalResults`` has been collected, ``max_pages`` is reached, or an error
    stops the loop. HTTP 503 responses and network-level failures are retried
    with exponential backoff (1s, 2s, 4s, ...).

    Args:
        keyword: Search phrase sent as the ``q`` parameter.
        max_pages: Highest page number that will be requested.
        page_size: Number of articles requested per page.
        max_retries: Consecutive failed attempts allowed for one page before
            giving up.
        timeout: Seconds to wait for the server before treating the attempt as
            failed; without it a stalled connection would block indefinitely.

    Returns:
        A tuple ``(all_articles, total_articles)``: the list of article dicts
        collected so far, and the ``totalResults`` value from the last
        successful response (0 if no request succeeded). ``total_articles``
        can be larger than ``len(all_articles)`` when fetching stopped early.
    """
    current_page = 1
    total_articles = 0
    all_articles = []
    backoff_time = 1  # Seconds to wait before the next retry; doubles per failure
    retry_count = 0  # Consecutive failed attempts for the current page

    while current_page <= max_pages:
        print(f"Fetching page {current_page} for query: {keyword}...")
        params = {
            'apiKey': API_KEY,
            'q': keyword,  # Keyword to filter articles
            'from': start_date,  # Start date (YYYY-MM-DD)
            'to': end_date,  # End date (YYYY-MM-DD)
            'pageSize': page_size,  # Articles per page
            'language': 'en',  # Language of the articles
            'sortBy': 'publishedAt',  # Sort by the most recent articles
            'page': current_page,  # Pagination
            'sources': 'fox-news'  # Specific source: Fox News
        }

        try:
            response = requests.get(BASE_URL, params=params, headers=headers, timeout=timeout)
        except requests.exceptions.RequestException as exc:
            # Only the exception type is reported: the full message can embed
            # the request URL, which carries the API key in its query string.
            response = None
            failure = f"Request failed ({type(exc).__name__})"
        else:
            failure = "Service unavailable (503)" if response.status_code == 503 else None

        if failure is not None:
            # Transient failure: retry the same page unless the budget is spent.
            retry_count += 1
            if retry_count >= max_retries:
                # Give up right away instead of sleeping before an exit.
                print(f"{failure}. Max retries reached for {keyword}. Exiting.")
                break
            print(f"{failure}. Retrying in {backoff_time} seconds...")
            time.sleep(backoff_time)
            backoff_time *= 2  # Exponentially increase the backoff time
            continue

        if response.status_code == 200:
            data = response.json()
            articles = data.get('articles') or []
            all_articles.extend(articles)
            total_articles = data.get('totalResults', 0)

            # Stop on a short page, or once everything the API reports has
            # been collected, so no request is spent on a page past the end.
            collected_everything = 0 < total_articles <= len(all_articles)
            if len(articles) < page_size or collected_everything:
                print(f"Reached the last page for {keyword}. Total articles: {total_articles}")
                break

            current_page += 1
            backoff_time = 1  # Reset backoff time after a successful request
            retry_count = 0  # Reset retry count after a successful request

        elif response.status_code == 426:
            print(f"Error: 426 Upgrade Required. Please check the API version or protocol.")
            # NOTE(review): presumably a plan-level cap on how many results can
            # be paged through - the API's own message below should confirm.
            detail = _describe_api_error(response)
            if detail:
                print(f"API response: {detail}")
            break
        else:
            print(f"Error: {response.status_code}. Exiting.")
            detail = _describe_api_error(response)
            if detail:
                print(f"API response: {detail}")
            break

    return all_articles, total_articles

# Function to save articles to a JSON file
def save_articles_to_json(topic, articles):
    """Write *articles* as indented JSON into the Output_NewsApi folder.

    The file name is the topic with spaces replaced by underscores,
    lower-cased, followed by "_articles.json". The folder is created when it
    is missing, an existing file is overwritten, and a one-line summary is
    printed afterwards.
    """
    target_dir = 'Output_NewsApi'
    slug = "_".join(topic.split(" ")).lower()
    target_path = os.path.join(target_dir, slug + '_articles.json')

    os.makedirs(target_dir, exist_ok=True)
    with open(target_path, 'w', encoding='utf-8') as handle:
        # ensure_ascii=False keeps non-ASCII characters readable in the file
        json.dump(articles, handle, ensure_ascii=False, indent=4)

    print(f"Saved {len(articles)} articles to {target_path}")

# Education search phrases; each one is sent to the fetch helper as its own query
education_subtopics = [
    "Education Inequality",
    "Remote Learning",
    "Diversity Education",
    "School Funding",
    "Student Loan Debt",
]

# Running sum of the per-subtopic totals returned by the fetch helper
total_education_articles = 0

# Fetch, save and report every subtopic in turn
for subtopic in education_subtopics:
    print(f"Fetching articles for: {subtopic}")
    all_articles, total_articles = fetch_articles_with_backoff(subtopic)
    save_articles_to_json(subtopic, all_articles)
    print(f"Total articles for '{subtopic}': {total_articles}")
    # Add this subtopic's total to the Education-wide figure
    total_education_articles = total_education_articles + total_articles

# Report the combined figure for the whole Education topic
print(f"Total articles for 'Education' (all subtopics): {total_education_articles}")


Fetching articles for: Education Inequality
Fetching page 1 for query: Education Inequality...
Reached the last page for Education Inequality. Total articles: 3
Saved 3 articles to Output_NewsApi/education_inequality_articles.json
Total articles for 'Education Inequality': 3
Fetching articles for: Remote Learning
Fetching page 1 for query: Remote Learning...
Reached the last page for Remote Learning. Total articles: 1
Saved 1 articles to Output_NewsApi/remote_learning_articles.json
Total articles for 'Remote Learning': 1
Fetching articles for: Diversity Education
Fetching page 1 for query: Diversity Education...
Reached the last page for Diversity Education. Total articles: 33
Saved 33 articles to Output_NewsApi/diversity_education_articles.json
Total articles for 'Diversity Education': 33
Fetching articles for: School Funding
Fetching page 1 for query: School Funding...
Fetching page 2 for query: School Funding...
Fetching page 3 for query: School Funding...
Error: 426 Upgrade Require