# Exploring GDELT Data

This notebook demonstrates how to quickly fetch global event data from the GDELT API, filter it to focus on events related to "military" or "conflict," and perform some basic exploration. Later, you can extend this work by curating and integrating the events with your defense spending data.

In [None]:
# ... existing code ...

import requests
import pandas as pd
import io
import time
import random
from datetime import datetime

def fetch_gdelt_data_robust(query="Vietnam War", maxrecords=5, max_retries=5):
    """
    Fetch an article list from the GDELT DOC API, retrying with exponential backoff.

    Each attempt issues one GET request. A 200 response is parsed as CSV and
    returned immediately. Any other status code, or any exception raised while
    requesting or parsing, is reported and followed by a pause of
    ``5 * 2**attempt`` seconds plus up to 2 seconds of random jitter before the
    next attempt. No pause is taken after the final attempt, because nothing
    follows it.

    Args:
        query (str): The search term for the API.
        maxrecords (int): Maximum number of records to retrieve (keep this small).
        max_retries (int): Maximum number of attempts.

    Returns:
        pd.DataFrame: The parsed CSV data. An empty DataFrame is returned when
        the API answers 200 with an empty body (nothing to parse). None is
        returned when every attempt failed.
    """
    base_url = "https://api.gdeltproject.org/api/v2/doc/doc"

    params = {
        "query": f"{query}",
        "mode": "ArtList",
        "maxrecords": maxrecords,
        "format": "CSV",
        "timespan": "1day",  # Limit to recent data
        # Extra timestamp parameter so each query URL is slightly different
        # (intended to avoid cached responses).
        "ts": int(datetime.now().timestamp()),
    }

    # Custom headers that might help with rate limiting
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml",
        "Accept-Language": "en-US,en;q=0.9",
        "Connection": "keep-alive"
    }

    for attempt in range(max_retries):
        # Backoff delay: roughly 5, 10, 20, 40, 80 seconds, plus jitter.
        delay = 5 * (2 ** attempt) + random.uniform(0, 2)

        print(f"Attempt {attempt + 1}/{max_retries}...")

        try:
            response = requests.get(base_url, params=params, headers=headers, timeout=30)

            if response.status_code == 200:
                try:
                    df = pd.read_csv(io.StringIO(response.text))
                except pd.errors.EmptyDataError:
                    # An empty body is a valid "no rows" answer, not a
                    # transient failure, so retrying would only waste time.
                    print("✅ Request succeeded, but the response contained no rows.")
                    return pd.DataFrame()
                # Report success only once the body has actually parsed.
                print("✅ Successfully fetched data!")
                return df

            if response.status_code == 429:
                print("⚠️ Rate limited (429).")
            else:
                print(f"❌ Error: Status code {response.status_code}")

        except Exception as e:
            # Deliberately broad: this function reports failures and returns
            # None instead of raising, so the calling cell can fall back.
            print(f"❌ Error occurred: {str(e)}")

        # Only wait when another attempt will follow.
        if attempt < max_retries - 1:
            print(f"Waiting {delay:.1f} seconds before next attempt...")
            time.sleep(delay)

    print("Failed to fetch data after maximum retries")
    return None

# Try alternative approach: using the GDELT GKG (Global Knowledge Graph) API
def fetch_gdelt_gkg(query="Ukraine", maxrecords=5):
    """
    Single-shot fallback that queries a GDELT GKG endpoint and parses the reply as CSV.

    No retries are performed. Failures are printed rather than raised.

    Args:
        query (str): The search term sent as the ``query`` parameter.
        maxrecords (int): Value sent as the ``maxrecords`` parameter.

    Returns:
        pd.DataFrame: The parsed response body when the status code is 200;
        None for any other status code or when any exception occurs.

    NOTE(review): the endpoint path below is taken as-is; confirm it against
    the GDELT API documentation.
    """
    endpoint = "https://api.gdeltproject.org/api/v2/gkg"
    request_options = dict(
        params={
            "query": query,
            "format": "csv",
            "maxrecords": maxrecords,
            "timespan": "1day",
        },
        headers={
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
        },
        timeout=30,
    )

    print("Trying alternative GDELT GKG API...")

    try:
        reply = requests.get(endpoint, **request_options)

        # Guard clause: anything other than 200 is reported and abandoned.
        if reply.status_code != 200:
            print(f"❌ GKG API error: {reply.status_code}")
            return None

        print("✅ Successfully fetched data from GKG API!")
        return pd.read_csv(io.StringIO(reply.text))

    except Exception as err:
        print(f"❌ GKG API error: {str(err)}")
        return None

# Test the robust function with a very small query
print("Attempting to fetch data with robust method...")
gdelt_df = fetch_gdelt_data_robust(query="Vietnam War", maxrecords=3, max_retries=3)

# If that fails (returns None), try the alternative API.
# Note that the fallback uses a different query ("Ukraine").
if gdelt_df is None:
    print("\nTrying alternative API...")
    gdelt_df = fetch_gdelt_gkg(query="Ukraine", maxrecords=3)

# Display results if we got any data; otherwise print troubleshooting tips
if gdelt_df is not None:
    print("\nFirst few rows of the data:")
    print(gdelt_df.head(2))
    print("\nColumns in the dataset:")
    print(gdelt_df.columns.tolist())
else:
    print("\n❌ Both methods failed to retrieve data. Try again later or with different parameters.")
    print("Tips:")
    print("1. Wait at least 15 minutes before trying again")
    print("2. Try using a VPN or different network")
    print("3. Reduce the maxrecords parameter even further")
    print("4. Try a more specific query")

# ... existing code ...

Attempting to fetch data with robust method...
Attempt 1/3. Will wait 6.5s after request...
✅ Successfully fetched data!

First few rows of the data:
                                                 URL  \
0  https://baomoi.com/dan-toc-viet-nam-kien-cuong...   
1  https://www.vietnamplus.vn/trien-lam-viet-nam-...   

                                           MobileURL                 Date  \
0                                                NaN  2025-03-06 06:00:00   
1  https://www.vietnamplus.vn/trien-lam-viet-nam-...  2025-03-06 03:00:00   

                                               Title  
0  Dân tộc Việt Nam kiên cường trong lịch sử qua ...  
1  Triển lãm Việt Nam giai đoạn 1966 - 1976 qua ố...  

Columns in the dataset:
['URL', 'MobileURL', 'Date', 'Title']


In [13]:
# Display the first few rows to inspect the data structure.
# gdelt_df is None when both fetch methods failed, so guard against that
# instead of raising AttributeError; in that case the cell shows nothing.
gdelt_df.head() if gdelt_df is not None else None

Unnamed: 0,URL,MobileURL,Date,Title
0,https://baomoi.com/dan-toc-viet-nam-kien-cuong...,,2025-03-06 06:00:00,Dân tộc Việt Nam kiên cường trong lịch sử qua ...
1,https://www.vietnamplus.vn/trien-lam-viet-nam-...,https://www.vietnamplus.vn/trien-lam-viet-nam-...,2025-03-06 03:00:00,Triển lãm Việt Nam giai đoạn 1966 - 1976 qua ố...
2,https://baotintuc.vn/van-hoa/trien-lam-viet-na...,,2025-03-06 03:30:00,Triển lãm Việt Nam giai đoạn 1966 - 1976 qua ố...
