## Pulling Articles from the Guardian API Using the Following Topics & Issues:

##### Politics: Partisan Divide, Foreign Policy, Elections, Immigration Policy, Education Policy
##### Environmental: Sustainability, Climate Change, Air Pollution, Recycling, Green Tech, Carbon Emission
##### Education: Education Inequality, Remote Learning, Diversity Education, School Funding, Student Loan Debt



##### Subtopic 1: 
##### Politics: Partisan Divide, Foreign Policy, Elections, Immigration Policy, Education Policy

##### Date Range: March 1, 2025 - March 31, 2025
##### Total Articles Pulled: 1653

In [5]:
import requests
import os
import json
import time

# Guardian API key. The GUARDIAN_API_KEY environment variable takes precedence
# so the credential does not have to live in the notebook.
# NOTE(review): the literal fallback is a credential committed to source; it is
# kept only so existing runs keep working. Rotate it and remove the fallback.
api_key = os.environ.get('GUARDIAN_API_KEY') or '72ce31b8-f54f-47c3-a8dd-d83f23b7c4c1'

# Base URL of the Guardian API search endpoint
url = "https://content.guardianapis.com/search"

# Politics topics to search; each entry is sent as one free-text query
politics_topics = [
    'Partisan Divide', 'Foreign Policy', 'Elections', 'Immigration Policy', 'Education Policy'
]

# Query parameters shared by every request. fetch_articles_with_backoff adds
# the per-request 'q' and 'page' keys to this same dict.
params = {
    'from-date': '2025-03-01',  # Date range: March 1, 2025 to March 31, 2025
    'to-date': '2025-03-31',
    'api-key': api_key,
    'format': 'json',
    'page-size': 200  # Maximum allowed per request
}

# Module-level pagination state, read and updated by fetch_articles_with_backoff
current_page = 1   # Next page to request
total_pages = 1    # Page count reported by the most recent successful response
all_articles = []  # Articles accumulated for the topic currently being fetched
max_pages = 10  # Limit to 10 pages
total_politics_articles = 0  # Running total of articles across all Politics topics

# Create the output directory if it doesn't exist
output_dir = 'Output_Guardian/Politics'
os.makedirs(output_dir, exist_ok=True)

# Function to fetch articles with exponential backoff
def fetch_articles_with_backoff(query):
    """Fetch the result pages for ``query`` and append them to ``all_articles``.

    Pagination state lives in the module-level ``current_page``,
    ``total_pages`` and ``all_articles``. Fetching starts at ``current_page``
    and stops after ``total_pages`` or ``max_pages``, whichever is smaller.
    ``total_pages`` is refreshed from every successful response, but its value
    on entry still gates the first request, so callers should reset it together
    with ``current_page`` before starting a new query.

    Transient failures (HTTP 429/503 and network-level ``requests`` errors) are
    retried with exponential backoff starting at 1 second; after 3 consecutive
    failed attempts on the same page the function gives up. Any other HTTP
    status, or a 200 response whose body is not JSON, stops immediately. In
    every case the articles collected so far remain in ``all_articles``.

    Args:
        query: Search string sent as the ``q`` request parameter.

    Returns:
        None. Results are accumulated in the global ``all_articles``.
    """
    global current_page, total_pages, all_articles  # Use global variables
    max_attempts = 3  # Consecutive failed attempts tolerated for one page
    request_timeout = 30  # Seconds; without it requests can block indefinitely
    retryable = {
        429: "Rate limit exceeded (429)",
        503: "Service unavailable (503)",
    }
    backoff_time = 1  # Start with 1 second
    retry_count = 0  # Consecutive transient failures for the current page

    while current_page <= total_pages and current_page <= max_pages:
        print(f"Fetching page {current_page} for query: {query}...")
        params['q'] = query
        params['page'] = current_page

        try:
            response = requests.get(url, params=params, timeout=request_timeout)
        except requests.exceptions.RequestException as exc:
            # Connection errors and timeouts are handled like a 503. Only the
            # exception type is reported: its message can contain the request
            # URL, which includes the API key.
            problem = f"Request failed ({type(exc).__name__})"
        else:
            status = response.status_code
            if status == 200:
                try:
                    data = response.json()
                except ValueError:
                    print("Error: response body is not valid JSON. Exiting.")
                    break
                payload = data.get('response', {})
                all_articles.extend(payload.get('results', []))
                total_pages = payload.get('pages', 1)
                current_page += 1
                backoff_time = 1  # Reset backoff time after a successful request
                retry_count = 0  # Reset retry count after a successful request
                continue
            if status not in retryable:
                print(f"Error: {status}. Exiting.")
                break
            problem = retryable[status]

        # Transient failure: decide whether to give up *before* sleeping, so
        # the final failure does not announce and wait for a retry that never
        # happens.
        retry_count += 1
        if retry_count >= max_attempts:
            print(f"Max retries reached for {query}. Exiting.")
            break
        print(f"{problem}. Retrying in {backoff_time} seconds...")
        time.sleep(backoff_time)
        backoff_time *= 2  # Exponentially increase the backoff time

# Loop through the politics topics
for topic in politics_topics:
    # Reset every piece of pagination state that fetch_articles_with_backoff
    # reads. total_pages must be reset as well: the fetch loop only runs while
    # `current_page <= total_pages`, so a stale value left by the previous
    # topic (e.g. 0 pages for a query with no matches) would cause this topic
    # to be skipped and an empty file to be written.
    all_articles = []
    current_page = 1
    total_pages = 1
    fetch_articles_with_backoff(topic)

    # Save all articles to a JSON file for the topic
    output_file = os.path.join(output_dir, f'{topic.replace(" ", "_").lower()}_articles.json')
    with open(output_file, 'w', encoding='utf-8') as file:
        json.dump(all_articles, file, ensure_ascii=False, indent=4)

    # Display the total number of articles retrieved for the topic
    print(f"Total number of articles retrieved for '{topic}': {len(all_articles)}")

    # Add to the total count of articles across all Politics topics
    total_politics_articles += len(all_articles)

# Display the total number of articles for all Politics topics
print(f"\nTotal number of articles retrieved across all Politics topics: {total_politics_articles}")


Fetching page 1 for query: Partisan Divide...
Total number of articles retrieved for 'Partisan Divide': 181
Fetching page 1 for query: Foreign Policy...
Fetching page 2 for query: Foreign Policy...
Fetching page 3 for query: Foreign Policy...
Total number of articles retrieved for 'Foreign Policy': 473
Fetching page 1 for query: Elections...
Fetching page 2 for query: Elections...
Total number of articles retrieved for 'Elections': 253
Fetching page 1 for query: Immigration Policy...
Fetching page 2 for query: Immigration Policy...
Fetching page 3 for query: Immigration Policy...
Total number of articles retrieved for 'Immigration Policy': 418
Fetching page 1 for query: Education Policy...
Fetching page 2 for query: Education Policy...
Total number of articles retrieved for 'Education Policy': 328

Total number of articles retrieved across all Politics topics: 1653


##### Subtopic 2: 
##### Environmental: Sustainability, Climate Change, Air Pollution, Recycling, Green Tech, Carbon Emission

##### Date Range: March 1, 2025 - March 31, 2025
##### Total Articles Pulled: 1458

In [27]:
import requests
import os
import json
import time

# Guardian API key. The GUARDIAN_API_KEY environment variable takes precedence
# so the credential does not have to live in the notebook.
# NOTE(review): the literal fallback is a credential committed to source; it is
# kept only so existing runs keep working. Rotate it and remove the fallback.
api_key = os.environ.get('GUARDIAN_API_KEY') or '72ce31b8-f54f-47c3-a8dd-d83f23b7c4c1'

# Base URL of the Guardian API search endpoint
url = "https://content.guardianapis.com/search"

# Environmental topics to search; each entry is sent as one free-text query
environmental_topics = [
    'Sustainability', 'Climate Change', 'Air Pollution', 'Recycling', 'Green Tech', 'Carbon Emission'
]

# Query parameters shared by every request. fetch_articles_with_backoff adds
# the per-request 'q' and 'page' keys to this same dict.
params = {
    'from-date': '2025-03-01',  # Date range: March 1, 2025 to March 31, 2025
    'to-date': '2025-03-31',
    'api-key': api_key,
    'format': 'json',
    'page-size': 200  # Maximum allowed per request
}

# Module-level pagination state, read and updated by fetch_articles_with_backoff
current_page = 1   # Next page to request
total_pages = 1    # Page count reported by the most recent successful response
all_articles = []  # Articles accumulated for the topic currently being fetched
max_pages = 10  # Limit to 10 pages
total_environmental_articles = 0  # Running total of articles across all Environmental topics

# Create the output directory if it doesn't exist
output_dir = 'Output_Guardian/Environmental'
os.makedirs(output_dir, exist_ok=True)

# Function to fetch articles with exponential backoff
def fetch_articles_with_backoff(query):
    """Fetch the result pages for ``query`` and append them to ``all_articles``.

    Pagination state lives in the module-level ``current_page``,
    ``total_pages`` and ``all_articles``. Fetching starts at ``current_page``
    and stops after ``total_pages`` or ``max_pages``, whichever is smaller.
    ``total_pages`` is refreshed from every successful response, but its value
    on entry still gates the first request, so callers should reset it together
    with ``current_page`` before starting a new query.

    Transient failures (HTTP 429/503 and network-level ``requests`` errors) are
    retried with exponential backoff starting at 1 second; after 3 consecutive
    failed attempts on the same page the function gives up. Any other HTTP
    status, or a 200 response whose body is not JSON, stops immediately. In
    every case the articles collected so far remain in ``all_articles``.

    Args:
        query: Search string sent as the ``q`` request parameter.

    Returns:
        None. Results are accumulated in the global ``all_articles``.
    """
    global current_page, total_pages, all_articles  # Use global variables
    max_attempts = 3  # Consecutive failed attempts tolerated for one page
    request_timeout = 30  # Seconds; without it requests can block indefinitely
    retryable = {
        429: "Rate limit exceeded (429)",
        503: "Service unavailable (503)",
    }
    backoff_time = 1  # Start with 1 second
    retry_count = 0  # Consecutive transient failures for the current page

    while current_page <= total_pages and current_page <= max_pages:
        print(f"Fetching page {current_page} for query: {query}...")
        params['q'] = query
        params['page'] = current_page

        try:
            response = requests.get(url, params=params, timeout=request_timeout)
        except requests.exceptions.RequestException as exc:
            # Connection errors and timeouts are handled like a 503. Only the
            # exception type is reported: its message can contain the request
            # URL, which includes the API key.
            problem = f"Request failed ({type(exc).__name__})"
        else:
            status = response.status_code
            if status == 200:
                try:
                    data = response.json()
                except ValueError:
                    print("Error: response body is not valid JSON. Exiting.")
                    break
                payload = data.get('response', {})
                all_articles.extend(payload.get('results', []))
                total_pages = payload.get('pages', 1)
                current_page += 1
                backoff_time = 1  # Reset backoff time after a successful request
                retry_count = 0  # Reset retry count after a successful request
                continue
            if status not in retryable:
                print(f"Error: {status}. Exiting.")
                break
            problem = retryable[status]

        # Transient failure: decide whether to give up *before* sleeping, so
        # the final failure does not announce and wait for a retry that never
        # happens.
        retry_count += 1
        if retry_count >= max_attempts:
            print(f"Max retries reached for {query}. Exiting.")
            break
        print(f"{problem}. Retrying in {backoff_time} seconds...")
        time.sleep(backoff_time)
        backoff_time *= 2  # Exponentially increase the backoff time

# Loop through the environmental topics
for topic in environmental_topics:
    # Reset every piece of pagination state that fetch_articles_with_backoff
    # reads. total_pages must be reset as well: the fetch loop only runs while
    # `current_page <= total_pages`, so a stale value left by the previous
    # topic (e.g. 0 pages for a query with no matches) would cause this topic
    # to be skipped and an empty file to be written.
    all_articles = []
    current_page = 1
    total_pages = 1
    fetch_articles_with_backoff(topic)

    # Save all articles to a JSON file for the topic
    output_file = os.path.join(output_dir, f'{topic.replace(" ", "_").lower()}_articles.json')
    with open(output_file, 'w', encoding='utf-8') as file:
        json.dump(all_articles, file, ensure_ascii=False, indent=4)

    # Display the total number of articles retrieved for the topic
    print(f"Total number of articles retrieved for '{topic}': {len(all_articles)}")

    # Add to the total count of articles across all Environmental topics
    total_environmental_articles += len(all_articles)

# Display the total number of articles for all Environmental topics
print(f"\nTotal number of articles retrieved across all Environmental topics: {total_environmental_articles}")


Fetching page 1 for query: Sustainability...
Total number of articles retrieved for 'Sustainability': 92
Fetching page 1 for query: Climate Change...
Fetching page 2 for query: Climate Change...
Fetching page 3 for query: Climate Change...
Total number of articles retrieved for 'Climate Change': 478
Fetching page 1 for query: Air Pollution...
Fetching page 2 for query: Air Pollution...
Total number of articles retrieved for 'Air Pollution': 202
Fetching page 1 for query: Recycling...
Total number of articles retrieved for 'Recycling': 35
Fetching page 1 for query: Green Tech...
Fetching page 2 for query: Green Tech...
Fetching page 3 for query: Green Tech...
Total number of articles retrieved for 'Green Tech': 477
Fetching page 1 for query: Carbon Emission...
Total number of articles retrieved for 'Carbon Emission': 174

Total number of articles retrieved across all Environmental topics: 1458


##### Subtopic 3: 
##### Education: Education Inequality, Remote Learning, Diversity Education, School Funding, Student Loan Debt

##### Date Range: March 1, 2025 - March 31, 2025
##### Total Articles Pulled: 1197

In [36]:
import requests
import os
import json
import time

# Guardian API key. The GUARDIAN_API_KEY environment variable takes precedence
# so the credential does not have to live in the notebook.
# NOTE(review): the literal fallback is a credential committed to source; it is
# kept only so existing runs keep working. Rotate it and remove the fallback.
api_key = os.environ.get('GUARDIAN_API_KEY') or '72ce31b8-f54f-47c3-a8dd-d83f23b7c4c1'

# Base URL of the Guardian API search endpoint
url = "https://content.guardianapis.com/search"

# Education topics to search; each entry is sent as one free-text query
education_topics = [
    'Education Inequality', 'Remote Learning', 'Diversity Education', 'School Funding', 'Student Loan Debt'
]

# Query parameters shared by every request. fetch_articles_with_backoff adds
# the per-request 'q' and 'page' keys to this same dict.
params = {
    'from-date': '2025-03-01',  # Date range: March 1, 2025 to March 31, 2025
    'to-date': '2025-03-31',
    'api-key': api_key,
    'format': 'json',
    'page-size': 200  # Maximum allowed per request
}

# Module-level pagination state, read and updated by fetch_articles_with_backoff
current_page = 1   # Next page to request
total_pages = 1    # Page count reported by the most recent successful response
all_articles = []  # Articles accumulated for the topic currently being fetched
max_pages = 10  # Limit to 10 pages
total_education_articles = 0  # Running total of articles across all Education topics

# Create the output directory if it doesn't exist
output_dir = 'Output_Guardian/Education'
os.makedirs(output_dir, exist_ok=True)

# Function to fetch articles with exponential backoff
def fetch_articles_with_backoff(query):
    """Fetch the result pages for ``query`` and append them to ``all_articles``.

    Pagination state lives in the module-level ``current_page``,
    ``total_pages`` and ``all_articles``. Fetching starts at ``current_page``
    and stops after ``total_pages`` or ``max_pages``, whichever is smaller.
    ``total_pages`` is refreshed from every successful response, but its value
    on entry still gates the first request, so callers should reset it together
    with ``current_page`` before starting a new query.

    Transient failures (HTTP 429/503 and network-level ``requests`` errors) are
    retried with exponential backoff starting at 1 second; after 3 consecutive
    failed attempts on the same page the function gives up. Any other HTTP
    status, or a 200 response whose body is not JSON, stops immediately. In
    every case the articles collected so far remain in ``all_articles``.

    Args:
        query: Search string sent as the ``q`` request parameter.

    Returns:
        None. Results are accumulated in the global ``all_articles``.
    """
    global current_page, total_pages, all_articles  # Use global variables
    max_attempts = 3  # Consecutive failed attempts tolerated for one page
    request_timeout = 30  # Seconds; without it requests can block indefinitely
    retryable = {
        429: "Rate limit exceeded (429)",
        503: "Service unavailable (503)",
    }
    backoff_time = 1  # Start with 1 second
    retry_count = 0  # Consecutive transient failures for the current page

    while current_page <= total_pages and current_page <= max_pages:
        print(f"Fetching page {current_page} for query: {query}...")
        params['q'] = query
        params['page'] = current_page

        try:
            response = requests.get(url, params=params, timeout=request_timeout)
        except requests.exceptions.RequestException as exc:
            # Connection errors and timeouts are handled like a 503. Only the
            # exception type is reported: its message can contain the request
            # URL, which includes the API key.
            problem = f"Request failed ({type(exc).__name__})"
        else:
            status = response.status_code
            if status == 200:
                try:
                    data = response.json()
                except ValueError:
                    print("Error: response body is not valid JSON. Exiting.")
                    break
                payload = data.get('response', {})
                all_articles.extend(payload.get('results', []))
                total_pages = payload.get('pages', 1)
                current_page += 1
                backoff_time = 1  # Reset backoff time after a successful request
                retry_count = 0  # Reset retry count after a successful request
                continue
            if status not in retryable:
                print(f"Error: {status}. Exiting.")
                break
            problem = retryable[status]

        # Transient failure: decide whether to give up *before* sleeping, so
        # the final failure does not announce and wait for a retry that never
        # happens.
        retry_count += 1
        if retry_count >= max_attempts:
            print(f"Max retries reached for {query}. Exiting.")
            break
        print(f"{problem}. Retrying in {backoff_time} seconds...")
        time.sleep(backoff_time)
        backoff_time *= 2  # Exponentially increase the backoff time

# Loop through the education topics
for topic in education_topics:
    # Reset every piece of pagination state that fetch_articles_with_backoff
    # reads. total_pages must be reset as well: the fetch loop only runs while
    # `current_page <= total_pages`, so a stale value left by the previous
    # topic (e.g. 0 pages for a query with no matches) would cause this topic
    # to be skipped and an empty file to be written.
    all_articles = []
    current_page = 1
    total_pages = 1
    fetch_articles_with_backoff(topic)

    # Save all articles to a JSON file for the topic
    output_file = os.path.join(output_dir, f'{topic.replace(" ", "_").lower()}_articles.json')
    with open(output_file, 'w', encoding='utf-8') as file:
        json.dump(all_articles, file, ensure_ascii=False, indent=4)

    # Display the total number of articles retrieved for the topic
    print(f"Total number of articles retrieved for '{topic}': {len(all_articles)}")

    # Add to the total count of articles across all Education topics
    total_education_articles += len(all_articles)

# Display the total number of articles for all Education topics
print(f"\nTotal number of articles retrieved across all Education topics: {total_education_articles}")


Fetching page 1 for query: Education Inequality...
Total number of articles retrieved for 'Education Inequality': 156
Fetching page 1 for query: Remote Learning...
Fetching page 2 for query: Remote Learning...
Total number of articles retrieved for 'Remote Learning': 215
Fetching page 1 for query: Diversity Education...
Fetching page 2 for query: Diversity Education...
Total number of articles retrieved for 'Diversity Education': 276
Fetching page 1 for query: School Funding...
Fetching page 2 for query: School Funding...
Total number of articles retrieved for 'School Funding': 380
Fetching page 1 for query: Student Loan Debt...
Total number of articles retrieved for 'Student Loan Debt': 170

Total number of articles retrieved across all Education topics: 1197
