# 1. Imports

In [1]:
import json              # For safely parsing JSON responses
import os                # For creating / listing the CSV output directory
import time              # For adding delays to avoid API rate limiting
from datetime import datetime, timedelta  # For working with dates and times
from urllib.parse import quote  # For URL-encoding the GDELT search query

import pandas as pd      # For handling tabular data as DataFrames
import requests          # For making HTTP requests to the GDELT API

# 2. Fetch Data from GDELT API

In [2]:
def fetch_gdelt_articles_safe(url):
    """
    Fetch articles from a full GDELT API URL without ever raising.

    The response is validated (HTTP 200, body looks like a JSON object),
    converted to a DataFrame and restricted to English-language articles.

    Parameters
    ----------
    url : str
        Complete request URL for the GDELT API.

    Returns
    -------
    pandas.DataFrame
        Columns ``title, seendate, url, sourcecountry, language, domain``
        with ``seendate`` parsed to datetime (unparseable values become NaT).
        The frame has the same columns but zero rows when the status is not
        200, the body is not a JSON object, the response carries no
        articles, or any exception is raised along the way.
    """
    columns = ["title", "seendate", "url", "sourcecountry", "language", "domain"]

    def empty_result():
        # Same schema as a successful call so callers can concat / save it
        return pd.DataFrame(columns=columns)

    try:
        # Send a GET request to the provided URL with a 30-second timeout
        res = requests.get(url, timeout=30)

        # Anything other than 200 (OK) is logged and treated as "no data"
        if res.status_code != 200:
            print(f"⚠️ Status {res.status_code}, skipping.")
            return empty_result()

        # Strip whitespace from the response text
        text = res.text.strip()

        # A usable response is a JSON object: it starts with "{" and ends
        # with "}". Anything else is treated as incomplete or invalid.
        if not text.startswith("{") or not text.endswith("}"):
            print("⚠️ Incomplete JSON, skipping.")
            return empty_result()

        # Parse the JSON text into a Python dictionary
        data = json.loads(text)

        # A response without an "articles" key (or with an empty list) is a
        # legitimate "nothing matched" result, not an error.
        articles = data.get("articles") or []
        if not articles:
            return empty_result()

        df = pd.DataFrame(articles)

        # Guarantee every column we rely on exists, even if the API omitted it
        for col in columns:
            if col not in df.columns:
                df[col] = None

        # Convert 'seendate' column to datetime, coercing invalid values to NaT
        df["seendate"] = pd.to_datetime(df["seendate"], errors="coerce")

        # Keep only rows whose language is English
        df = df[df["language"] == "English"]

        # Restrict the DataFrame to the columns we want to keep
        return df[columns]

    except Exception as e:
        # Best-effort by design: log the problem and report "no data" so a
        # long multi-window download is not aborted by one bad response.
        print(f"❌ Error: {e}")
        return empty_result()

In [3]:
def fetch_gdelt_articles_window(query, start_dt, end_dt, maxrecords=100):
    """
    Fetch GDELT articles matching ``query`` for a single time window.

    Parameters
    ----------
    query : str
        Raw (not yet URL-encoded) search expression; it is percent-encoded
        here before being placed in the request URL.
    start_dt, end_dt : datetime
        Window bounds, formatted as ``YYYYMMDDHHMMSS`` for the
        ``startdatetime`` / ``enddatetime`` URL parameters.
    maxrecords : int, default 100
        Value sent as the ``maxrecords`` URL parameter.

    Returns
    -------
    Whatever ``fetch_gdelt_articles_safe`` returns for the built URL.
    """
    stamp_format = "%Y%m%d%H%M%S"
    start_str = start_dt.strftime(stamp_format)
    end_str = end_dt.strftime(stamp_format)

    # Percent-encode the query so characters such as '&', '#', '+' or quotes
    # cannot truncate the search or inject additional URL parameters.
    encoded_query = quote(str(query), safe="")

    url = (
        "https://api.gdeltproject.org/api/v2/doc/doc?"
        f"query={encoded_query}&mode=ArtList&maxrecords={maxrecords}"
        f"&startdatetime={start_str}&enddatetime={end_str}&format=JSON"
    )

    return fetch_gdelt_articles_safe(url)

In [4]:
def fetch_gdelt_articles_range(
    query,
    start_date,
    end_date=None,
    step_days=1,
    maxrecords=100,
    sleep_seconds=30,
):
    """
    Collect GDELT articles for a query over a date range.

    The range ``[start_date, end_date)`` is split into consecutive windows of
    ``step_days`` days; the final window is shortened so it never extends
    past ``end_date``. Each window is fetched with
    ``fetch_gdelt_articles_window`` and the results are combined.

    Parameters
    ----------
    query : str
        Search expression forwarded to ``fetch_gdelt_articles_window``.
    start_date : str
        First day of the range, formatted ``YYYY-MM-DD``.
    end_date : str or None, default None
        End of the range, formatted ``YYYY-MM-DD``. ``None`` means today.
    step_days : int or float, default 1
        Window length in days; must be positive.
    maxrecords : int, default 100
        Forwarded to ``fetch_gdelt_articles_window`` for every window.
    sleep_seconds : int or float, default 30
        Pause between two consecutive requests. No pause is made after the
        last window.

    Returns
    -------
    pandas.DataFrame
        All windows concatenated, de-duplicated on ``url`` and sorted by
        ``seendate``; an empty DataFrame when no window produced rows.

    Raises
    ------
    ValueError
        If ``step_days`` is not positive (the loop would never advance) or a
        date string does not match ``YYYY-MM-DD``.
    """
    if step_days <= 0:
        raise ValueError("step_days must be a positive number of days")

    # If end_date is not provided, default to today's date (YYYY-MM-DD)
    if end_date is None:
        end_date = datetime.today().strftime("%Y-%m-%d")

    # Convert input date strings to datetime objects
    current_dt = datetime.strptime(start_date, "%Y-%m-%d")
    end_dt = datetime.strptime(end_date, "%Y-%m-%d")
    step = timedelta(days=step_days)

    # One DataFrame per non-empty window
    all_articles = []

    while current_dt < end_dt:
        # Clamp the window so the last one stops exactly at end_dt
        next_dt = min(current_dt + step, end_dt)

        # Log which window we are fetching
        print(
            f"Fetching articles from "
            f"{current_dt.strftime('%Y-%m-%d')} "
            f"to {next_dt.strftime('%Y-%m-%d')}..."
        )

        df = fetch_gdelt_articles_window(
            query=query,
            start_dt=current_dt,
            end_dt=next_dt,
            maxrecords=maxrecords
        )

        if not df.empty:
            all_articles.append(df)
            print(f"✅ {len(df)} articles collected in this window.")
        else:
            print("⚠️ No articles found in this window.")

        # Move to the next window
        current_dt = next_dt

        # Throttle only when another request will actually follow
        if current_dt < end_dt and sleep_seconds > 0:
            time.sleep(sleep_seconds)

    if not all_articles:
        # Nothing was collected in any window
        print("⚠️ No articles collected at all.")
        return pd.DataFrame()

    # Adjacent windows share a boundary timestamp, so the same article can
    # show up twice; keep the first occurrence of every URL.
    return (
        pd.concat(all_articles, ignore_index=True)
        .drop_duplicates(subset="url")
        .sort_values("seendate")
    )


In [5]:
def month_range(start, end):
    """
    Yield one ``(month_start, month_end)`` pair per calendar month.

    Iteration begins at the first day of the month containing ``start`` and
    continues while the month's first day is not after ``end``. Each
    ``month_end`` is the first day of the following month, capped at one day
    after ``end``.
    """
    upper_bound = end + timedelta(days=1)
    month_start = start.replace(day=1)

    while not month_start > end:
        # Day 28 exists in every month and 28 + 4 days always lands in the
        # following month, so snapping back to day 1 gives its first day.
        late_in_month = month_start.replace(day=28)
        following_start = (late_in_month + timedelta(days=4)).replace(day=1)

        if following_start <= upper_bound:
            month_end = following_start
        else:
            month_end = upper_bound

        yield month_start, month_end
        month_start = following_start


# Loop from Nov 2023 → Nov 2024, writing one CSV per calendar month
start_date = datetime(2023, 11, 1) # Change this start time
end_date   = datetime(2024, 11, 1) # Change this end time (this day is still fetched)

# Create the output folder up front so a missing directory cannot fail the
# save step after a long download.
output_dir = "../data/other_data"
os.makedirs(output_dir, exist_ok=True)

# Header used when a month yields no articles
csv_columns = ["title", "seendate", "url", "sourcecountry", "language", "domain"]

for month_start, month_end in month_range(start_date, end_date):
    print(f"=== Fetching articles from {month_start.strftime('%Y-%m-%d')} to {month_end.strftime('%Y-%m-%d')} ===")
    # Format for logging + naming
    ym_label = month_start.strftime("%Y-%m")

    # NOTE(review): presumably GDELT requires every space-separated term to
    # match; confirm whether "(NVIDIA OR NVDA)" was the intended query.
    df_month = fetch_gdelt_articles_range(
        query="NVIDIA NVDA",
        start_date=month_start.strftime("%Y-%m-%d"),
        end_date=month_end.strftime("%Y-%m-%d"),
        step_days=1,
        maxrecords=250
    )

    if df_month.empty:
        # A column-less frame would be saved without a header row, which
        # pd.read_csv cannot parse later; keep the header so the file loads.
        df_month = df_month.reindex(columns=csv_columns)

    # Save CSV
    filename = f"{output_dir}/gdelt_nvda_{ym_label}.csv"
    df_month.to_csv(filename, index=False)

    print(f"✔ Saved {len(df_month)} articles → {filename}\n")


=== Fetching articles from 2023-11-01 to 2023-12-01 ===
Fetching articles from 2023-11-01 to 2023-11-02...
✅ 6 articles collected in this window.
Fetching articles from 2023-11-02 to 2023-11-03...
✅ 14 articles collected in this window.
Fetching articles from 2023-11-03 to 2023-11-04...
✅ 9 articles collected in this window.
Fetching articles from 2023-11-04 to 2023-11-05...
✅ 8 articles collected in this window.
Fetching articles from 2023-11-05 to 2023-11-06...
✅ 7 articles collected in this window.
Fetching articles from 2023-11-06 to 2023-11-07...
✅ 25 articles collected in this window.
Fetching articles from 2023-11-07 to 2023-11-08...
✅ 22 articles collected in this window.
Fetching articles from 2023-11-08 to 2023-11-09...
✅ 19 articles collected in this window.
Fetching articles from 2023-11-09 to 2023-11-10...
✅ 19 articles collected in this window.
Fetching articles from 2023-11-10 to 2023-11-11...
✅ 16 articles collected in this window.
Fetching articles from 2023-11-11 to 2

In [7]:
# Collect every per-month CSV in data/other_data/ whose name starts with
# "gdelt_nvda_". Sorting makes the merge order (and therefore the output
# file) deterministic; os.listdir returns names in arbitrary order.
csv_files = sorted(
    f for f in os.listdir("../data/other_data/")
    if f.startswith("gdelt_nvda_") and f.endswith(".csv")
)

# Load each file, skipping any that contain no parseable columns
all_dfs = []
for csv_file in csv_files:
    csv_path = os.path.join("../data/other_data/", csv_file)
    try:
        df = pd.read_csv(csv_path)
    except pd.errors.EmptyDataError:
        # A month without articles may have been saved as a header-less file
        print(f"⚠️ Skipping empty file: {csv_file}")
        continue
    all_dfs.append(df)

if not all_dfs:
    # pd.concat([]) would fail with an unhelpful "No objects to concatenate"
    raise FileNotFoundError(
        "No non-empty gdelt_nvda_*.csv files found in ../data/other_data/"
    )

# Consecutive months share a boundary timestamp, so the same article can be
# present in two files; keep the first occurrence of each URL.
df_gdelt_nvda = (
    pd.concat(all_dfs, ignore_index=True)
    .drop_duplicates(subset="url", ignore_index=True)
)

df_gdelt_nvda.to_csv("../data/NVIDIA_NewsHeadlines_20231101-Present.csv", index=False)