In [1]:
import json              # For safely parsing JSON responses
import time              # For adding delays to avoid API rate limiting
from datetime import datetime, timedelta  # For working with dates and times
from urllib.parse import quote, urlencode  # For encoding URL query parameters

import pandas as pd      # For handling tabular data as DataFrames
import requests          # For making HTTP requests to the GDELT API

# 2. Fetch Data from GDELT API

In [2]:
def fetch_gdelt_articles_safe(url):
    """
    Fetch articles from a full GDELT API URL without ever raising.

    Parameters
    ----------
    url : str
        Complete request URL; the response body is expected to be a JSON object.

    Returns
    -------
    pandas.DataFrame
        English-language articles with the columns ``title``, ``seendate``
        (converted with ``pd.to_datetime``; unparseable values become NaT),
        ``url``, ``sourcecountry``, ``language`` and ``domain``.
        An empty DataFrame with the same columns is returned when the request
        fails, the status is not 200, the body is not a complete JSON object,
        or the response contains no articles.
    """
    columns = ["title", "seendate", "url", "sourcecountry", "language", "domain"]
    # Every "nothing usable" path returns this frame so callers always see the same schema.
    empty = pd.DataFrame(columns=columns)

    try:
        # Send a GET request to the provided URL with a 30-second timeout
        res = requests.get(url, timeout=30)

        # Anything other than 200 (OK) is treated as "no data for this request"
        if res.status_code != 200:
            print(f"⚠️ Status {res.status_code}, skipping.")
            return empty

        text = res.text.strip()

        # A JSON object must start with "{" and end with "}";
        # anything else is considered incomplete or invalid.
        if not text.startswith("{") or not text.endswith("}"):
            print("⚠️ Incomplete JSON, skipping.")
            return empty

        data = json.loads(text)

        # A well-formed response may carry no "articles" key (or an empty list),
        # e.g. when nothing matched. That is an empty result, not an error.
        articles = data.get("articles") or []
        if not articles:
            return empty

        df = pd.DataFrame(articles)

        # Make sure every column we rely on exists, even if the API omitted it
        for col in columns:
            if col not in df.columns:
                df[col] = None

        # Convert 'seendate' column to datetime, coercing invalid values to NaT
        df["seendate"] = pd.to_datetime(df["seendate"], errors="coerce")

        # Keep only English-language rows (source country is NOT filtered)
        df = df[df["language"] == "English"]

        # Restrict the DataFrame to the columns we want to keep
        return df[columns]

    except Exception as e:
        # Deliberately best-effort: this is called in a long loop, so any
        # failure is logged and turned into an empty result instead of aborting.
        print(f"❌ Error: {e}")
        return empty

In [3]:
def fetch_gdelt_articles_window(query, start_dt, end_dt, maxrecords=100):
    """
    Fetch GDELT articles matching ``query`` for a single time window.

    Parameters
    ----------
    query : str
        Raw (not pre-encoded) search expression; it is percent-encoded here.
    start_dt, end_dt : datetime.datetime
        Window bounds, formatted as ``YYYYMMDDHHMMSS`` in the request.
    maxrecords : int, default 100
        Value sent as the ``maxrecords`` request parameter.

    Returns
    -------
    pandas.DataFrame
        Whatever ``fetch_gdelt_articles_safe`` returns for the built URL.
    """
    params = {
        "query": query,
        "mode": "ArtList",
        "maxrecords": maxrecords,
        "startdatetime": start_dt.strftime("%Y%m%d%H%M%S"),
        "enddatetime": end_dt.strftime("%Y%m%d%H%M%S"),
        "format": "JSON",
    }

    # Encode every value so characters such as '&', '#', '+' or '=' inside the
    # query cannot split or corrupt the query string. quote_via=quote encodes
    # spaces as %20 rather than '+'.
    url = (
        "https://api.gdeltproject.org/api/v2/doc/doc?"
        + urlencode(params, quote_via=quote)
    )

    return fetch_gdelt_articles_safe(url)

In [4]:
def fetch_gdelt_articles_range(
    query,
    start_date,
    end_date=None,
    step_days=1,
    maxrecords=100,
    sleep_seconds=60,
):
    """
    Collect GDELT articles for a query over a date range.

    The range ``[start_date, end_date)`` is split into consecutive windows of
    ``step_days`` days; the last window is shortened so it never extends past
    ``end_date``. Each window is fetched with ``fetch_gdelt_articles_window``.
    No source-country filtering is applied in this function.

    Parameters
    ----------
    query : str
        Search expression forwarded to ``fetch_gdelt_articles_window``.
    start_date : str
        First day of the range, ``YYYY-MM-DD``.
    end_date : str or None, default None
        End of the range, ``YYYY-MM-DD``. ``None`` means today's date.
    step_days : int, default 1
        Window length in days; must be positive.
    maxrecords : int, default 100
        Per-window record limit forwarded to the window fetcher.
    sleep_seconds : float, default 60
        Pause between consecutive requests, to stay under API rate limits.

    Returns
    -------
    pandas.DataFrame
        All collected rows, de-duplicated on ``url`` (first occurrence kept)
        and sorted by ``seendate``. An empty DataFrame if nothing was collected.

    Raises
    ------
    ValueError
        If ``step_days`` is not positive (the loop could never advance), or if
        a date string does not match ``YYYY-MM-DD``.
    """
    if step_days <= 0:
        raise ValueError("step_days must be a positive number of days")

    # If end_date is not provided, default to today's date (YYYY-MM-DD)
    if end_date is None:
        end_date = datetime.today().strftime("%Y-%m-%d")

    # Convert input date strings to datetime objects
    current_dt = datetime.strptime(start_date, "%Y-%m-%d")
    end_dt = datetime.strptime(end_date, "%Y-%m-%d")

    # Store all window DataFrames in a list
    all_articles = []

    # Loop over the date range in chunks of 'step_days'
    while current_dt < end_dt:
        # Clamp the window so the final chunk does not run past end_date
        next_dt = min(current_dt + timedelta(days=step_days), end_dt)

        # Log which window we are fetching
        print(
            f"Fetching articles from "
            f"{current_dt.strftime('%Y-%m-%d')} "
            f"to {next_dt.strftime('%Y-%m-%d')}..."
        )

        # Fetch articles for this window
        df = fetch_gdelt_articles_window(
            query=query,
            start_dt=current_dt,
            end_dt=next_dt,
            maxrecords=maxrecords
        )

        # If we got any articles, append them to the list
        if not df.empty:
            all_articles.append(df)
            print(f"✅ {len(df)} articles collected in this window.")
            if len(df) >= maxrecords:
                # Hitting the cap suggests the window held more matches than were returned
                print(
                    f"⚠️ Window reached maxrecords={maxrecords}; "
                    f"results may be truncated (consider a smaller step_days)."
                )
        else:
            print("⚠️ No articles found in this window.")

        # Move to the next window
        current_dt = next_dt

        # Pause only when another request will follow
        if current_dt < end_dt:
            time.sleep(sleep_seconds)

    if not all_articles:
        # If no articles were collected at all, log and return empty DataFrame
        print("⚠️ No articles collected at all.")
        return pd.DataFrame()

    # Adjacent windows share a boundary timestamp, so the same URL can appear
    # twice; keep the first occurrence, then order chronologically.
    return (
        pd.concat(all_articles, ignore_index=True)
          .drop_duplicates(subset="url")
          .sort_values("seendate")
    )


In [14]:
# Pull one year of NVIDIA coverage (2024-11-01 to 2025-11-01) in weekly
# windows, requesting up to 250 records per window.
gdelt_NVDA = fetch_gdelt_articles_range(
    "NVIDIA NVDA",
    "2024-11-01",
    "2025-11-01",
    step_days=7,
    maxrecords=250,
)

# Report the overall size, then preview the first rows
print(f"\nTotal articles collected: {len(gdelt_NVDA)}")
gdelt_NVDA.head()

Fetching articles from 2024-11-01 to 2024-11-08...
✅ 150 articles collected in this window.
Fetching articles from 2024-11-08 to 2024-11-15...
✅ 198 articles collected in this window.
Fetching articles from 2024-11-15 to 2024-11-22...
✅ 250 articles collected in this window.
Fetching articles from 2024-11-22 to 2024-11-29...
✅ 220 articles collected in this window.
Fetching articles from 2024-11-29 to 2024-12-06...
✅ 180 articles collected in this window.
Fetching articles from 2024-12-06 to 2024-12-13...
✅ 206 articles collected in this window.
Fetching articles from 2024-12-13 to 2024-12-20...
✅ 230 articles collected in this window.
Fetching articles from 2024-12-20 to 2024-12-27...
✅ 140 articles collected in this window.
Fetching articles from 2024-12-27 to 2025-01-03...
✅ 175 articles collected in this window.
Fetching articles from 2025-01-03 to 2025-01-10...
✅ 250 articles collected in this window.
Fetching articles from 2025-01-10 to 2025-01-17...
✅ 217 articles collected in t

Unnamed: 0,title,seendate,url,sourcecountry,language,domain
43,Why Nvidia stock is sinking today,2024-11-01 01:30:00+00:00,https://www.fool.com.au/2024/11/01/why-nvidia-...,Australia,English,fool.com.au
126,Stock market suffers a Halloween selloff as te...,2024-11-01 01:30:00+00:00,https://www.morningstar.com/news/marketwatch/2...,China,English,morningstar.com
145,US close : Stocks sharply lower following tech...,2024-11-01 01:45:00+00:00,https://www.sharecast.com/news/market-report-u...,United Kingdom,English,sharecast.com
30,Why Nvidia Stock Is Sinking Today | The Motley...,2024-11-01 02:15:00+00:00,https://www.fool.com/investing/2024/10/31/why-...,United States,English,fool.com
105,Billionaire Philippe Laffont of Coatue Is Dump...,2024-11-01 03:00:00+00:00,https://finance.yahoo.com/news/billionaire-phi...,United States,English,finance.yahoo.com


In [15]:
# Persist the collected articles. The row index carries no information here,
# so it is left out; otherwise pandas reads it back as an "Unnamed: 0" column.
gdelt_NVDA.to_csv('nvda1year.csv', index=False)

In [16]:
# Derive a calendar-day column from each article's timestamp
gdelt_NVDA['seendate'] = pd.to_datetime(gdelt_NVDA['seendate'])  # ensure it's a datetime
gdelt_NVDA['date'] = gdelt_NVDA['seendate'].dt.date              # extract just the date (no time)

# Unique days. Rows whose timestamp is missing (NaT) are excluded: they are not
# real days, and NaT cannot be reliably ordered against datetime.date in sorted().
unique_days = gdelt_NVDA['date'].dropna().unique()
print(f"Total unique days: {len(unique_days)}")
print("First few unique dates:", sorted(unique_days)[:10])

Total unique days: 355
First few unique dates: [datetime.date(2024, 11, 1), datetime.date(2024, 11, 2), datetime.date(2024, 11, 3), datetime.date(2024, 11, 4), datetime.date(2024, 11, 5), datetime.date(2024, 11, 6), datetime.date(2024, 11, 7), datetime.date(2024, 11, 8), datetime.date(2024, 11, 9), datetime.date(2024, 11, 10)]


In [17]:
# Number of articles seen on each calendar day, in chronological order
articles_per_day = (
    gdelt_NVDA
    .groupby('date')
    .size()
    .reset_index(name='article_count')
    .sort_values('date')
)

# Preview the first 100 days of counts
print(articles_per_day.head(100))

          date  article_count
0   2024-11-01             24
1   2024-11-02             20
2   2024-11-03             10
3   2024-11-04             31
4   2024-11-05             28
..         ...            ...
95  2025-02-04             34
96  2025-02-05             35
97  2025-02-06             41
98  2025-02-07             22
99  2025-02-08             17

[100 rows x 2 columns]
