In [9]:
import os
import time  # For delays between calls
from datetime import datetime, timedelta  # For date batching

import pandas as pd
import requests

# --- Configuration ---
# SECURITY: a key committed in source is visible to anyone who can read this
# file. Set the ALPHAVANTAGE_API_KEY environment variable instead; the literal
# below is kept only as a fallback so existing runs keep working and should be
# rotated and removed.
API_KEY = os.environ.get('ALPHAVANTAGE_API_KEY') or 'H0LTM3EKC479SBOR'
TICKER = 'AAPL'  # Worked well; change to another if needed (e.g., 'MSFT')
TOPICS = 'economy'  # Keeps relevance to S&P 500
LIMIT = 1000  # Max per docs
SORT = 'RELEVANCE'

# Date ranges for full 2024 (monthly batches to avoid rate limits)
start_year = 2024
# Zero-padded 'YYYYMM' keys, one per calendar month: '202401' ... '202412'.
months = [f"{start_year}{m:02d}" for m in range(1, 13)]

# --- Function to Fetch Data for a Date Range ---
def fetch_news(ticker, start_date, end_date):
    """Fetch NEWS_SENTIMENT articles for one ticker and one time window.

    The request also uses the module-level TOPICS, SORT, LIMIT and API_KEY
    settings.

    Args:
        ticker: Value sent as the ``tickers`` query parameter.
        start_date: Value sent as ``time_from`` (callers in this file pass
            strings shaped like ``'20240101T0000'``).
        end_date: Value sent as ``time_to``, same shape as ``start_date``.

    Returns:
        A DataFrame built from the response's ``feed`` list, or an empty
        DataFrame when the request fails, the body is not usable JSON, or
        the response carries no articles. Failures are printed, not raised.
    """
    # Let requests build the query string so every value is URL-encoded.
    params = {
        'function': 'NEWS_SENTIMENT',
        'tickers': ticker,
        'topics': TOPICS,
        'time_from': start_date,
        'time_to': end_date,
        'sort': SORT,
        'limit': LIMIT,
        'apikey': API_KEY,
    }
    try:
        # Without a timeout a stalled connection would block the whole batch.
        response = requests.get(
            'https://www.alphavantage.co/query', params=params, timeout=30
        )
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose
        # decode error is not a RequestException subclass.
        print(f"Request failed for {start_date}-{end_date}: {e}")
        return pd.DataFrame()

    feed = data.get('feed') if isinstance(data, dict) else None
    if feed:
        return pd.DataFrame(feed)

    # NOTE(review): the API appears to report throttling and bad requests
    # in-band (HTTP 200 with one of these keys) -- confirm against the API
    # docs. Surfacing the text distinguishes that from a truly empty window.
    if isinstance(data, dict):
        for key in ('Error Message', 'Note', 'Information'):
            if key in data:
                print(f"API returned {key!r} for {start_date}-{end_date}: {data[key]}")
                return pd.DataFrame()

    print(f"No data for {start_date}-{end_date}. Skipping...")
    return pd.DataFrame()  # Empty DF if no results

# --- Batch Fetch and Merge ---
# Fetch one window per entry in `months`, keep only the three columns of
# interest, then merge, de-duplicate, sort chronologically and write a CSV.
wanted_columns = ['time_published', 'title', 'summary']
all_data = []
for position, month in enumerate(months):
    start_date = f'{month}01T0000'
    # The window ends at the first instant of the following month. Adding 31
    # days to the 1st of any month always lands inside the next month, so
    # formatting the result as %Y%m yields that month.
    dt = datetime.strptime(month, '%Y%m')
    next_month = dt + timedelta(days=31)
    end_date = f'{next_month.strftime("%Y%m")}01T0000'  # Start of next month as end
    print(f"Fetching for {month} ( {start_date} to {end_date} )...")

    df_month = fetch_news(TICKER, start_date, end_date)
    if not df_month.empty:
        missing = [col for col in wanted_columns if col not in df_month.columns]
        if missing:
            # Previously such a month vanished from the output without a trace.
            print(f"Skipping {month}: response is missing columns {missing}")
        else:
            # Simplify to requested columns only
            df_simple = df_month[wanted_columns].copy()
            all_data.append(df_simple)

    # Delay to respect rate limits (adjust as needed); nothing follows the
    # last request, so there is no need to wait after it.
    if position < len(months) - 1:
        time.sleep(2)

# --- Merge and Save ---
if all_data:
    df_merged = pd.concat(all_data, ignore_index=True)
    # Consecutive windows share their boundary timestamp (one month's end is
    # the next month's start), so an article published exactly there could be
    # returned twice; drop exact repeats.
    df_merged = df_merged.drop_duplicates()
    # Optional: Sort by datetime
    df_merged['time_published'] = pd.to_datetime(df_merged['time_published'], format='%Y%m%dT%H%M%S')
    df_merged = df_merged.sort_values('time_published')

    output_filename = f'sp500_news_simple_2024_{TICKER}.csv'
    df_merged.to_csv(output_filename, index=False)

    print(f"✅ Success! {len(df_merged)} articles saved to {output_filename}")
    print("\n--- First 5 Rows ---")
    print(df_merged.head())
else:
    print("No data fetched. Check API key, ticker, or try without dates/topics.")


Fetching for 202401 ( 20240101T0000 to 20240201T0000 )...
Fetching for 202402 ( 20240201T0000 to 20240301T0000 )...
Fetching for 202403 ( 20240301T0000 to 20240401T0000 )...
Fetching for 202404 ( 20240401T0000 to 20240501T0000 )...
Fetching for 202405 ( 20240501T0000 to 20240601T0000 )...
Fetching for 202406 ( 20240601T0000 to 20240701T0000 )...
Fetching for 202407 ( 20240701T0000 to 20240801T0000 )...
Fetching for 202408 ( 20240801T0000 to 20240901T0000 )...
Fetching for 202409 ( 20240901T0000 to 20241001T0000 )...
Fetching for 202410 ( 20241001T0000 to 20241101T0000 )...
Fetching for 202411 ( 20241101T0000 to 20241201T0000 )...
Fetching for 202412 ( 20241201T0000 to 20250101T0000 )...
✅ Success! 7006 articles saved to sp500_news_simple_2024_AAPL.csv

--- First 5 Rows ---
         time_published                                              title  \
354 2024-01-06 14:12:42  Barron's Weekend Stock Picks: Abercrombie & Fi...   
683 2024-01-06 15:04:00  What That Famous Investing Quote Ab