# In [None]:
from gdelt import gdelt
import pandas as pd
pd.set_option('display.max_columns', None)
from datetime import datetime, timedelta
from tqdm import tqdm # for progress bars
import concurrent.futures
import os
import calendar
import time
import requests

def fetch_day_with_retry(date_obj, gdelt_obj, retries = 3, delay = 10):
    """
    Fetches the GDELT 'events' table for a single day, retrying on timeouts.

    The query is attempted once and then re-attempted up to `retries` more
    times when the request times out. Any other exception is reported and
    ends the fetch for that day immediately.

    Args:
        date_obj (datetime): The date for which to query data.
        gdelt_obj (gdelt object): The initialised GDELT object; only its
            `Search(date_str, table='events')` method is used.
        retries (int): Number of retry attempts after the first try.
        delay (int): Delay between retries in seconds.

    Returns:
        pd.DataFrame or None: DataFrame restricted to the columns
        'Actor1CountryCode', 'Actor2CountryCode' and 'GoldsteinScale' if data
        is returned; None if the day is empty, an error occurred, or every
        attempt timed out.
    """
    date_str = date_obj.strftime('%Y-%m-%d')
    essential_columns = ['Actor1CountryCode', 'Actor2CountryCode', 'GoldsteinScale']

    # one initial attempt + `retries` retries
    for attempt in range(retries + 1):
        try:
            results = gdelt_obj.Search(date_str, table='events')
            df = pd.DataFrame(results)[essential_columns]
            return df if not df.empty else None
        except requests.exceptions.Timeout:
            # Timeout is the parent of ReadTimeout and ConnectTimeout, so both
            # kinds of transient timeout are retried
            if attempt == retries:
                # out of retries: do not announce a retry or sleep for nothing
                break
            print(f"Timeout on {date_str}, retrying ({attempt + 1}/{retries})...")
            time.sleep(delay)
        except Exception as e:
            # best effort: a non-timeout failure (including missing columns)
            # is logged and the day is skipped
            print(f"Error on {date_str}: {e}")
            return None

    print(f"Failed to fetch data for {date_str} after {retries} retries.")
    return None

# column layout of the frame returned by fetch_gdelt_data (also used for empty results)
_GDELT_MONTHLY_COLUMNS = [
    'Actor1CountryCode', 'Actor2CountryCode',
    'total_goldstein', 'num_events', 'month_start', 'source',
]


def _aggregate_month(daily_frames, month_start, version):
    """Collapse one month of daily event frames into per-country-pair totals."""
    pair_columns = ['Actor1CountryCode', 'Actor2CountryCode']

    month_df = pd.concat(daily_frames, ignore_index = True)
    # drop rows with missing essential fields
    month_df = month_df.dropna(subset = pair_columns + ['GoldsteinScale'])

    # group by country pairs and aggregate goldstein scores
    grouped = month_df.groupby(pair_columns).agg(
        total_goldstein = ('GoldsteinScale', 'sum'),
        num_events = ('GoldsteinScale', 'count'),
    ).reset_index()

    # remove rows where the actors are from the same country
    cross_country = grouped[grouped['Actor1CountryCode'] != grouped['Actor2CountryCode']]

    # assign() builds a new frame, so no columns are written onto a filtered
    # slice (which would raise SettingWithCopyWarning)
    return cross_country.assign(
        month_start = month_start.strftime('%Y-%m-%d'),
        source = f"v{version}",
    )


def fetch_gdelt_data(version, start_year, end_year, backup_folder = '../../data/raw'):
    """
    Fetches and aggregates GDELT data on a monthly basis for the given year range.
    Exports a CSV for each completed year (that produced data) as a backup.

    Every day of every month is fetched through `fetch_day_with_retry` using a
    pool of 5 worker threads; days that return nothing are skipped. Each
    month is then reduced to one row per (Actor1CountryCode,
    Actor2CountryCode) pair, excluding pairs where both codes are equal.

    Args:
        version (int): GDELT version (1 or 2).
        start_year (int): Starting year (inclusive).
        end_year (int): Ending year (inclusive).
        backup_folder (str): Path to save yearly backups; created if missing.

    Returns:
        pd.DataFrame: Monthly aggregates for all years with the columns
        'Actor1CountryCode', 'Actor2CountryCode', 'total_goldstein',
        'num_events', 'month_start' ('YYYY-MM-DD' string of the first day of
        the month) and 'source' ('v<version>'). When nothing was fetched the
        frame is empty but still carries these columns, so callers can filter
        on them safely.
    """
    # empty year range: nothing to fetch, so skip client/folder set-up entirely
    if start_year > end_year:
        return pd.DataFrame(columns = _GDELT_MONTHLY_COLUMNS)

    gdelt_obj = gdelt(version = version)
    all_years_data = []

    # ensure backup folder exists
    os.makedirs(backup_folder, exist_ok = True)

    def fetch_day(day):
        return fetch_day_with_retry(day, gdelt_obj)

    # one pool for the whole run instead of a fresh pool per month
    with concurrent.futures.ThreadPoolExecutor(max_workers = 5) as executor:
        # loop through years with progress bar
        for year in tqdm(range(start_year, end_year + 1), desc = f"Processing years (v{version})"):
            monthly_data = []

            # process month by month in the given year
            for month in range(1, 13):
                month_start = datetime(year, month, 1)
                days_in_month = calendar.monthrange(year, month)[1]
                month_dates = [month_start + timedelta(days = i) for i in range(days_in_month)]

                # fetch daily data concurrently; None marks a day with no data
                daily_frames = [
                    frame for frame in executor.map(fetch_day, month_dates)
                    if frame is not None
                ]

                if daily_frames:
                    monthly_data.append(_aggregate_month(daily_frames, month_start, version))

            if monthly_data:
                year_df = pd.concat(monthly_data, ignore_index = True)
                all_years_data.append(year_df)

                # save backup for the completed year
                backup_filename = os.path.join(backup_folder, f"gdelt_v{version}_{year}_monthly.csv")
                year_df.to_csv(backup_filename, index = False)

                # update progress
                tqdm.write(f"Completed year {year} for v{version}.")

    if not all_years_data:
        return pd.DataFrame(columns = _GDELT_MONTHLY_COLUMNS)
    return pd.concat(all_years_data, ignore_index = True)

# ------------------------------------------------------------------------------
# MAIN SCRIPT: Fetch and combine GDELT v1 and v2 data
# ------------------------------------------------------------------------------

# gdelt v1: fetch 1984-1985 and keep only months starting before 18 Feb 2015
# (v1 is treated as valid until 17 Feb 2015)
gdelt_v1_df = fetch_gdelt_data(version = 1, start_year = 1984, end_year = 1985)
gdelt_v1_df = gdelt_v1_df.loc[
    lambda frame: pd.to_datetime(frame['month_start']) < pd.Timestamp('2015-02-18')
]

# gdelt v2: fetch 2015-2024 and keep only months starting on or after
# 18 Feb 2015 (v2 is treated as valid from that date onward)
gdelt_v2_df = fetch_gdelt_data(version = 2, start_year = 2015, end_year = 2024)
gdelt_v2_df = gdelt_v2_df.loc[
    lambda frame: pd.to_datetime(frame['month_start']) >= pd.Timestamp('2015-02-18')
]

# stack both sources and order rows by month, then by country pair
combined_df = (
    pd.concat([gdelt_v1_df, gdelt_v2_df], ignore_index = True)
    .sort_values(by=['month_start', 'Actor1CountryCode', 'Actor2CountryCode'])
)

# write the combined table to disk and show a preview
combined_df.to_csv('../../data/raw/gdelt_monthly.csv', index = False)
print("Combined GDELT data saved to '../../data/raw/gdelt_monthly.csv'")
print(combined_df.head())