In [49]:
from gdelt import gdelt
import pandas as pd
pd.set_option('display.max_columns', None)
from datetime import datetime, timedelta
from tqdm import tqdm # for progress bars
import concurrent.futures
import os
import calendar
import time
import requests

def fetch_day_with_retry(date_obj, gdelt_obj, retries = 3, delay = 10):
    """
    Fetches data for a single day. Retries if there is a timeout.
    
    Args:
        date_obj (datetime): The date for which to query data.
        gdelt_obj (gdelt object): The initialised GDELT object.
        retries (int): Number of retry attempts.
        delay (int): Delay between retries in seconds.
    
    Returns:
        pd.DataFrame or None: DataFrame with essential columns if data is returned; otherwise, None.
    """
    date_str = date_obj.strftime('%Y-%m-%d')
    attempt = 0
    
    while attempt <= retries:
        try:
            results = gdelt_obj.Search(date_str, table='events')
            df = pd.DataFrame(results)[['Actor1CountryCode', 'Actor2CountryCode', 'GoldsteinScale']]
            return df if not df.empty else None
        except requests.exceptions.ReadTimeout:
            print(f"Timeout on {date_str}, retrying ({attempt + 1}/{retries})...")
            attempt += 1
            time.sleep(delay)
        except Exception as e:
            print(f"Error on {date_str}: {e}")
            return None
    
    print(f"Failed to fetch data for {date_str} after {retries} retries.")
    return None

def fetch_gdelt_data(version, start_year, end_year, backup_folder = '../../data/raw'):
    """
    Fetches and aggregates GDELT data on a monthly basis for the given year range.
    Exports a CSV for each completed year as a backup.
    
    Args:
        version (int): GDELT version (1 or 2).
        start_year (int): Starting year.
        end_year (int): Ending year.
        backup_folder (str): Path to save yearly backups.
    
    Returns:
        pd.DataFrame: Combined yearly data.
    """
    gdelt_obj = gdelt(version = version)
    all_years_data = []

    # ensure backup folder exists
    os.makedirs(backup_folder, exist_ok = True)
    
    # loop through years with progress bar
    for year in tqdm(range(start_year, end_year + 1), desc = f"Processing years (v{version})"):
        monthly_data = []
        start_date = datetime(year, 1, 1)
        
        # process month by month in the given year
        while start_date.year == year:
            # determine last day of current month
            last_day = calendar.monthrange(year, start_date.month)[1]
            end_date = datetime(year, start_date.month, last_day)

            # create list of dates for current month
            month_dates = [start_date + timedelta(days = i) for i in range((end_date - start_date).days + 1)]
            
            # use ThreadPoolExecutor to fetch daily data concurrently
            with concurrent.futures.ThreadPoolExecutor(max_workers = 5) as executor:
                month_results = list(executor.map(lambda d: fetch_day_with_retry(d, gdelt_obj), month_dates))
                # filter out none results (days with no data)
                month_results = [r for r in month_results if r is not None]
            
            if month_results:
                month_df = pd.concat(month_results, ignore_index = True)
                # drop rows with missing essential fields
                month_df = month_df.dropna(subset=['Actor1CountryCode', 'Actor2CountryCode', 'GoldsteinScale'])
                
                # group by country pairs and aggregate goldstein scores
                df_grouped = month_df.groupby(['Actor1CountryCode', 'Actor2CountryCode']).agg(
                    total_goldstein=('GoldsteinScale', 'sum'),
                    num_events=('GoldsteinScale', 'count')
                ).reset_index()
                
                # remove rows where the actors are from the same country
                df_grouped = df_grouped[df_grouped['Actor1CountryCode'] != df_grouped['Actor2CountryCode']]

                # add month_start (first day of the month) and source columns
                df_grouped['month_start'] = start_date.strftime('%Y-%m-%d')
                df_grouped['source'] = f"v{version}"
                monthly_data.append(df_grouped)
            
            # move to the next month
            start_date = end_date + timedelta(days = 1)

        if monthly_data:
            year_df = pd.concat(monthly_data, ignore_index = True)
            all_years_data.append(year_df)

            # save backup for the completed year
            backup_filename = f"{backup_folder}/gdelt_v{version}_{year}_monthly.csv"
            year_df.to_csv(backup_filename, index=False)

            # update progress
            tqdm.write(f"Completed year {year} for v{version}.")
    
    return pd.concat(all_years_data, ignore_index=True) if all_years_data else pd.DataFrame()

# ------------------------------------------------------------------------------
# MAIN SCRIPT: Fetch and combine GDELT v1 and v2 data
# ------------------------------------------------------------------------------

# fetch gdelt v1 data: 1980 to 2015 (v1 is valid until 17 Feb 2015)
gdelt_v1_df = fetch_gdelt_data(version = 1, start_year = 1984, end_year = 1985)
gdelt_v1_df = gdelt_v1_df[pd.to_datetime(gdelt_v1_df['month_start']) < pd.Timestamp('2015-02-18')]

# fetch gdelt v2 data: 2015 to 2024 (v2 is valid from 18 Feb 2015 onward)
gdelt_v2_df = fetch_gdelt_data(version = 2, start_year = 2015, end_year = 2024)
gdelt_v2_df = gdelt_v2_df[pd.to_datetime(gdelt_v2_df['month_start']) >= pd.Timestamp('2015-02-18')]

# combine the datasets
combined_df = pd.concat([gdelt_v1_df, gdelt_v2_df], ignore_index = True)
combined_df = combined_df.sort_values(by=['month_start', 'Actor1CountryCode', 'Actor2CountryCode'])

# save combined data to csv file
combined_df.to_csv('../../data/raw/gdelt_monthly.csv', index = False)
print("Combined GDELT data saved to '../../data/raw/gdelt_monthly.csv'")
print(combined_df.head())

Processing years (v1):   3%|▍               | 1/36 [21:02<12:16:36, 1262.77s/it]

Completed year 1980 for v1.


Processing years (v1):   6%|▉               | 2/36 [46:16<13:19:21, 1410.63s/it]

Completed year 1981 for v1.


Processing years (v1):   8%|█▏            | 3/36 [1:14:26<14:05:47, 1537.81s/it]

Completed year 1982 for v1.


Processing years (v1):  11%|█▌            | 4/36 [1:45:52<14:53:32, 1675.38s/it]

Completed year 1983 for v1.
Error on 1984-06-06: ("Connection broken: ConnectionResetError(54, 'Connection reset by peer')", ConnectionResetError(54, 'Connection reset by peer'))Error on 1984-06-07: ("Connection broken: ConnectionResetError(54, 'Connection reset by peer')", ConnectionResetError(54, 'Connection reset by peer'))

Error on 1984-06-09: HTTPConnectionPool(host='data.gdeltproject.org', port=80): Read timed out.Error on 1984-06-08: HTTPConnectionPool(host='data.gdeltproject.org', port=80): Read timed out.
Error on 1984-06-10: HTTPConnectionPool(host='data.gdeltproject.org', port=80): Read timed out.

Error on 1984-06-18: HTTPConnectionPool(host='data.gdeltproject.org', port=80): Read timed out.Error on 1984-06-21: HTTPConnectionPool(host='data.gdeltproject.org', port=80): Read timed out.
Error on 1984-06-19: HTTPConnectionPool(host='data.gdeltproject.org', port=80): Read timed out.
Error on 1984-06-20: HTTPConnectionPool(host='data.gdeltproject.org', port=80): Read timed out.

Processing years (v1):  14%|█▉            | 5/36 [3:58:50<33:59:55, 3948.24s/it]

Completed year 1984 for v1.
Error on 1985-01-14: HTTPConnectionPool(host='data.gdeltproject.org', port=80): Read timed out.Error on 1985-01-16: HTTPConnectionPool(host='data.gdeltproject.org', port=80): Read timed out.

Error on 1985-01-15: HTTPConnectionPool(host='data.gdeltproject.org', port=80): Read timed out.
Error on 1985-01-25: HTTPConnectionPool(host='data.gdeltproject.org', port=80): Read timed out.Error on 1985-01-22: HTTPConnectionPool(host='data.gdeltproject.org', port=80): Read timed out.
Error on 1985-01-26: HTTPConnectionPool(host='data.gdeltproject.org', port=80): Read timed out.
Error on 1985-01-24: HTTPConnectionPool(host='data.gdeltproject.org', port=80): Read timed out.

Error on 1985-01-23: HTTPConnectionPool(host='data.gdeltproject.org', port=80): Read timed out.
Error on 1985-02-05: HTTPConnectionPool(host='data.gdeltproject.org', port=80): Read timed out.
Error on 1985-02-04: HTTPConnectionPool(host='data.gdeltproject.org', port=80): Read timed out.
Error on 198

Processing years (v1):  17%|██▎           | 6/36 [6:05:00<43:09:45, 5179.53s/it]

Completed year 1985 for v1.


Processing years (v1):  19%|██▋           | 7/36 [6:44:50<34:22:40, 4267.59s/it]

Completed year 1986 for v1.


Processing years (v1):  22%|███           | 8/36 [7:25:59<28:44:20, 3695.03s/it]

Completed year 1987 for v1.


Processing years (v1):  25%|███▌          | 9/36 [8:10:46<25:20:55, 3379.82s/it]

Completed year 1988 for v1.


Processing years (v1):  28%|███▌         | 10/36 [8:51:27<22:19:02, 3090.11s/it]

Completed year 1989 for v1.


Processing years (v1):  31%|███▉         | 11/36 [9:33:17<20:13:33, 2912.55s/it]

Completed year 1990 for v1.
Error on 1991-03-05: HTTPConnectionPool(host='data.gdeltproject.org', port=80): Read timed out.Error on 1991-03-04: HTTPConnectionPool(host='data.gdeltproject.org', port=80): Read timed out.
Error on 1991-03-03: HTTPConnectionPool(host='data.gdeltproject.org', port=80): Read timed out.
Error on 1991-03-02: HTTPConnectionPool(host='data.gdeltproject.org', port=80): Read timed out.

Error on 1991-03-01: HTTPConnectionPool(host='data.gdeltproject.org', port=80): Read timed out.
Error on 1991-03-07: HTTPSConnectionPool(host='raw.githubusercontent.com', port=443): Max retries exceeded with url: /linwoodc3/gdeltPyR/master/utils/schema_csvs/GDELT_1.0_event_Column_Labels_Header_Row_Sep2016.tsv (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x378df6690>, 'Connection to raw.githubusercontent.com timed out. (connect timeout=None)'))Error on 1991-03-10: HTTPSConnectionPool(host='raw.githubusercontent.com', port=443): Max retries exceeded wi

Processing years (v1):  33%|████        | 12/36 [11:33:24<28:07:32, 4218.85s/it]

Error on 1991-12-31: HTTPSConnectionPool(host='raw.githubusercontent.com', port=443): Max retries exceeded with url: /linwoodc3/gdeltPyR/master/utils/schema_csvs/GDELT_1.0_event_Column_Labels_Header_Row_Sep2016.tsv (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x319c51b90>: Failed to resolve 'raw.githubusercontent.com' ([Errno 8] nodename nor servname provided, or not known)"))
Completed year 1991 for v1.
Error on 1992-01-01: HTTPSConnectionPool(host='raw.githubusercontent.com', port=443): Max retries exceeded with url: /linwoodc3/gdeltPyR/master/utils/schema_csvs/GDELT_1.0_event_Column_Labels_Header_Row_Sep2016.tsv (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x157b042d0>: Failed to resolve 'raw.githubusercontent.com' ([Errno 8] nodename nor servname provided, or not known)"))Error on 1992-01-05: HTTPSConnectionPool(host='raw.githubusercontent.com', port=443): Max retries exceeded with url: /linwoodc3/gdeltPyR/master/uti

Processing years (v1):  36%|████▎       | 13/36 [12:52:26<27:57:59, 4377.38s/it]

Error on 1992-12-31: HTTPSConnectionPool(host='raw.githubusercontent.com', port=443): Max retries exceeded with url: /linwoodc3/gdeltPyR/master/utils/schema_csvs/GDELT_1.0_event_Column_Labels_Header_Row_Sep2016.tsv (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x3302ff350>: Failed to resolve 'raw.githubusercontent.com' ([Errno 8] nodename nor servname provided, or not known)"))
Error on 1993-01-02: HTTPSConnectionPool(host='raw.githubusercontent.com', port=443): Max retries exceeded with url: /linwoodc3/gdeltPyR/master/utils/schema_csvs/GDELT_1.0_event_Column_Labels_Header_Row_Sep2016.tsv (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x33cb10890>: Failed to resolve 'raw.githubusercontent.com' ([Errno 8] nodename nor servname provided, or not known)"))Error on 1993-01-01: HTTPSConnectionPool(host='raw.githubusercontent.com', port=443): Max retries exceeded with url: /linwoodc3/gdeltPyR/master/utils/schema_csvs/GDELT_1.0_eve

Processing years (v1):  39%|████▋       | 14/36 [14:11:28<27:25:26, 4487.59s/it]

Error on 1993-12-31: HTTPSConnectionPool(host='raw.githubusercontent.com', port=443): Max retries exceeded with url: /linwoodc3/gdeltPyR/master/utils/schema_csvs/GDELT_1.0_event_Column_Labels_Header_Row_Sep2016.tsv (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x157b07410>: Failed to resolve 'raw.githubusercontent.com' ([Errno 8] nodename nor servname provided, or not known)"))
Error on 1994-01-01: HTTPSConnectionPool(host='raw.githubusercontent.com', port=443): Max retries exceeded with url: /linwoodc3/gdeltPyR/master/utils/schema_csvs/GDELT_1.0_event_Column_Labels_Header_Row_Sep2016.tsv (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x3368c0d90>: Failed to resolve 'raw.githubusercontent.com' ([Errno 8] nodename nor servname provided, or not known)"))
Error on 1994-01-02: HTTPSConnectionPool(host='raw.githubusercontent.com', port=443): Max retries exceeded with url: /linwoodc3/gdeltPyR/master/utils/schema_csvs/GDELT_1.0_ev

Processing years (v1):  42%|█████       | 15/36 [15:30:31<26:37:34, 4564.51s/it]

Error on 1994-12-31: HTTPSConnectionPool(host='raw.githubusercontent.com', port=443): Max retries exceeded with url: /linwoodc3/gdeltPyR/master/utils/schema_csvs/GDELT_1.0_event_Column_Labels_Header_Row_Sep2016.tsv (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x31ae20750>: Failed to resolve 'raw.githubusercontent.com' ([Errno 8] nodename nor servname provided, or not known)"))
Error on 1995-01-02: HTTPSConnectionPool(host='raw.githubusercontent.com', port=443): Max retries exceeded with url: /linwoodc3/gdeltPyR/master/utils/schema_csvs/GDELT_1.0_event_Column_Labels_Header_Row_Sep2016.tsv (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x30b3662d0>: Failed to resolve 'raw.githubusercontent.com' ([Errno 8] nodename nor servname provided, or not known)"))
Error on 1995-01-03: HTTPSConnectionPool(host='raw.githubusercontent.com', port=443): Max retries exceeded with url: /linwoodc3/gdeltPyR/master/utils/schema_csvs/GDELT_1.0_ev

Processing years (v1):  44%|█████▎      | 16/36 [16:49:33<25:39:21, 4618.09s/it]

Error on 1995-12-31: HTTPSConnectionPool(host='raw.githubusercontent.com', port=443): Max retries exceeded with url: /linwoodc3/gdeltPyR/master/utils/schema_csvs/GDELT_1.0_event_Column_Labels_Header_Row_Sep2016.tsv (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x31b990c50>: Failed to resolve 'raw.githubusercontent.com' ([Errno 8] nodename nor servname provided, or not known)"))
Error on 1996-01-01: HTTPSConnectionPool(host='raw.githubusercontent.com', port=443): Max retries exceeded with url: /linwoodc3/gdeltPyR/master/utils/schema_csvs/GDELT_1.0_event_Column_Labels_Header_Row_Sep2016.tsv (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x155d7b690>: Failed to resolve 'raw.githubusercontent.com' ([Errno 8] nodename nor servname provided, or not known)"))
Error on 1996-01-02: HTTPSConnectionPool(host='raw.githubusercontent.com', port=443): Max retries exceeded with url: /linwoodc3/gdeltPyR/master/utils/schema_csvs/GDELT_1.0_ev

Processing years (v1):  44%|█████▎      | 16/36 [17:01:34<21:16:57, 3830.89s/it]

Error on 1996-02-21: HTTPSConnectionPool(host='raw.githubusercontent.com', port=443): Max retries exceeded with url: /linwoodc3/gdeltPyR/master/utils/schema_csvs/GDELT_1.0_event_Column_Labels_Header_Row_Sep2016.tsv (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x16dae7d10>: Failed to resolve 'raw.githubusercontent.com' ([Errno 8] nodename nor servname provided, or not known)"))
Error on 1996-02-22: HTTPSConnectionPool(host='raw.githubusercontent.com', port=443): Max retries exceeded with url: /linwoodc3/gdeltPyR/master/utils/schema_csvs/GDELT_1.0_event_Column_Labels_Header_Row_Sep2016.tsv (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x312d67490>: Failed to resolve 'raw.githubusercontent.com' ([Errno 8] nodename nor servname provided, or not known)"))
Error on 1996-02-23: HTTPSConnectionPool(host='raw.githubusercontent.com', port=443): Max retries exceeded with url: /linwoodc3/gdeltPyR/master/utils/schema_csvs/GDELT_1.0_ev




KeyboardInterrupt: 