In [113]:
ECONOMIC_EVENTCODES = {
    "021","022","023","024","025",
    "070","071","072","073","074","075","076","077",
    "081","082","083","084","085","086","087","089",
    "092","093","094","097",
    "131","132","135","136","138",
    "151","152","153","154","155",
}

In [114]:
ACTOR_TYPE_MAP = {
    "GOV": "Government",
    "PRESIDENT": "President",
    "LEG": "Legislature",
    "MIL": "Military",
    "COP": "Police",
    "BUS": "Business",
    "ECON": "Economic Organization",
    "EDU": "Education Sector",
    "HLH": "Health Sector",
    "JUD": "Judiciary",
    "MED": "Media Organization",
    "REL": "Religious Group",
    "OPP": "Opposition",
    "INT": "International Organization",
}

In [115]:
EVENTCODE_DESC = {
    # 02x – Economic policy / cooperation statements
    "021": "Statement of intent regarding economic policy",
    "022": "Agreement to pursue economic cooperation",
    "023": "Promise to change or adjust economic policies",
    "024": "Request for economic support or aid",
    "025": "Appeal for economic cooperation",

    # 07x – Trade and market-related interactions
    "070": "Statement of intent regarding trade actions",
    "071": "Offer trade concessions",
    "072": "Implement or announce trade restrictions",
    "073": "Approve trade agreements",
    "074": "Reject trade agreements",
    "075": "Engage in economic or trade negotiations",
    "076": "Announce economic cooperation or joint actions",
    "077": "Implement economic actions or agreements",

    # 08x – Economic aid and response
    "081": "Provide economic aid",
    "082": "Request economic aid",
    "083": "Condemn or criticize economic actions",
    "084": "Express dissatisfaction with economic situation or actions",
    "085": "Reduce economic assistance",
    "086": "Halt economic assistance",
    "087": "Resume or restart economic assistance",
    "089": "Provide financial or humanitarian aid",

    # 09x – Sanctions and economic pressure
    "092": "Impose economic sanctions, embargo, or boycott",
    "093": "Reduce or halt economic assistance",
    "094": "Lift or remove sanctions",
    "097": "Issue economic threats or warnings",

    # 13x – Economic relations level change
    "131": "Reduce economic relations",
    "132": "Terminate economic cooperation",
    "135": "Resume economic cooperation",
    "136": "Expand economic relations",
    "138": "Agree to enhance economic cooperation",

    # 15x – Fiscal, monetary, market interventions
    "151": "Announce fiscal or monetary policy measures",
    "152": "Change interest rates",
    "153": "Conduct market intervention",
    "154": "Make public statements regarding currency or exchange rates",
    "155": "Implement price controls or market regulations",
}

In [116]:
import pandas as pd
import requests, zipfile, os
from tqdm import tqdm
from datetime import datetime, timedelta

BASE_URL = "http://data.gdeltproject.org/gdeltv2/"
MASTER_LIST_URL = "masterfilelist.txt"

def get_available_files(BASE_URL ,MASTER_LIST_URL, days=1):
    """
    masterfilelist에서 파일명 기반 timestamp 추출 후,
    최근 days일치 export CSV만 반환
    """
    print("[INFO] masterfilelist 다운로드 중…")
    r = requests.get(BASE_URL + MASTER_LIST_URL)
    lines = r.text.strip().split("\n")

    records = []
    for line in lines:
        parts = line.split(" ")
        if len(parts) != 3:
            continue

        md5, size, fname = parts

        # export.CSV.zip 파일만 선택
        if not fname.endswith("export.CSV.zip"):
            continue

        # 파일명에서 http://data.gdeltproject.org/gdeltv2/ 지우기
        fname = fname.replace(BASE_URL, "")
        # 파일명에서 timestamp 추출
        # 예: http://data.gdeltproject.org/gdeltv2/20250226121500.export.CSV.zip → 20250226121500
        ts_str = fname.split(".")[0]

        try:
            ts = pd.to_datetime(ts_str, format="%Y%m%d%H%M%S", utc=True)
        except:
            continue

        records.append({
            "filename": fname,
            "timestamp": ts
        })

    df = pd.DataFrame(records)

    cutoff = pd.Timestamp.now(tz="UTC") - pd.Timedelta(days=days)
    df = df[df["timestamp"] >= cutoff]

    return df["filename"].tolist()


def download_gdelt_files(days=1, out_dir="gdelt_raw"):
    os.makedirs(out_dir, exist_ok=True)

    fnames = get_available_files(days)
    print(f"[INFO] 다운로드 대상 파일 수: {len(fnames)}")

    downloaded = []

    for fname in tqdm(fnames):
        url = f"http://data.gdeltproject.org/gdeltv2/{fname}"
        out_path = os.path.join(out_dir, fname)

        if os.path.exists(out_path):
            downloaded.append(out_path)
            continue

        try:
            r = requests.get(url, timeout=10)
            if r.status_code == 200:
                with open(out_path, "wb") as f:
                    f.write(r.content)
                downloaded.append(out_path)
        except:
            pass

    print(f"[INFO] 다운로드 완료: {len(downloaded)}개 파일")
    return downloaded

In [117]:
def load_gdelt_csv(zip_path):
    """GDELT CSV(61컬럼) 로드"""
    cols = [f"col_{i}" for i in range(61)]
    try:
        with zipfile.ZipFile(zip_path, "r") as z:
            csv_name = z.namelist()[0]
            df = pd.read_csv(
                z.open(csv_name),
                sep="\t",
                names=cols,
                header=None,
                low_memory=False
            )
        return df
    except:
        return None

In [118]:
def extract_economic_events(df, ECONOMIC_EVENTCODES):
    """경제 EventCode만 선택하고 datetime & actor 정리"""
    df = df[df["col_26"].astype(str).isin(ECONOMIC_EVENTCODES)]

    # 날짜 처리
    df["datetime"] = pd.to_datetime(df["col_1"], format="%Y%m%d", errors="coerce")

    # Actor1/Actor2 (간단 버전)
    df["actor1"] = df["col_5"].fillna("") + df["col_6"].fillna("")
    df["actor2"] = df["col_15"].fillna("") + df["col_16"].fillna("")

    # URL ( 뉴스 원문 링크 )
    df["url"] = df["col_60"]

    return df

In [120]:
target_events = 10000
days = 1

all_events = []

while True:
    print(f"\n### Downloading last {days} day(s)... ###")
    files = download_gdelt_files(days=days)

    temp_events = []
    for f in tqdm(files):
        df = load_gdelt_csv(f)
        if df is not None:
            econ = extract_economic_events(df)
            temp_events.append(econ)

    if len(temp_events) > 0:
        downloaded_events = pd.concat(temp_events, ignore_index=True)
        all_events.append(downloaded_events)

    total = sum(len(x) for x in all_events)
    print(f"Current total economic events: {total}")

    if total >= target_events:
        print("Target reached!")
        break

    days += 1  # 날짜 범위 확대


### Downloading last 1 day(s)... ###
[INFO] masterfilelist 다운로드 중…
[INFO] 다운로드 대상 파일 수: 97


100%|██████████| 97/97 [00:00<00:00, 229.22it/s]


[INFO] 다운로드 완료: 97개 파일


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

S

Current total economic events: 790

### Downloading last 2 day(s)... ###
[INFO] masterfilelist 다운로드 중…
[INFO] 다운로드 대상 파일 수: 193


100%|██████████| 193/193 [00:00<00:00, 94358.40it/s]


[INFO] 다운로드 완료: 193개 파일


100%|██████████| 193/193 [00:03<00:00, 55.84it/s]


Current total economic events: 1992

### Downloading last 3 day(s)... ###
[INFO] masterfilelist 다운로드 중…
[INFO] 다운로드 대상 파일 수: 289


100%|██████████| 289/289 [00:00<00:00, 140376.82it/s]


[INFO] 다운로드 완료: 289개 파일


100%|██████████| 289/289 [00:04<00:00, 57.95it/s]


Current total economic events: 3653

### Downloading last 4 day(s)... ###
[INFO] masterfilelist 다운로드 중…
[INFO] 다운로드 대상 파일 수: 385


100%|██████████| 385/385 [00:00<00:00, 156035.08it/s]


[INFO] 다운로드 완료: 385개 파일


100%|██████████| 385/385 [00:06<00:00, 57.75it/s]


Current total economic events: 5994

### Downloading last 5 day(s)... ###
[INFO] masterfilelist 다운로드 중…
[INFO] 다운로드 대상 파일 수: 481


100%|██████████| 481/481 [00:00<00:00, 166939.20it/s]


[INFO] 다운로드 완료: 481개 파일


100%|██████████| 481/481 [00:08<00:00, 57.18it/s]


Current total economic events: 8997

### Downloading last 6 day(s)... ###
[INFO] masterfilelist 다운로드 중…
[INFO] 다운로드 대상 파일 수: 577


100%|██████████| 577/577 [00:00<00:00, 161804.73it/s]


[INFO] 다운로드 완료: 577개 파일


100%|██████████| 577/577 [00:10<00:00, 55.26it/s]


Current total economic events: 12761
Target reached!


In [121]:
events = pd.concat(all_events, ignore_index=True)

In [122]:
def eventcode_to_text(code: str):
    """Convert EventCode to readable description."""
    if code is None:
        return "Unknown event"
    return EVENTCODE_DESC.get(str(code), f"Unknown action (EventCode {code})")

In [124]:
def normalize_country(country):
    if isinstance(country, str) and len(country) > 0:
        return country.upper()
    return "N/A"

In [127]:
def reduce_gdelt_row(row, is_print=False):
    """
    Reduce a full 61-column GDELT row into a compact structured dict.
    """

    # --- 날짜 ---
    try:
        date = pd.to_datetime(row["col_1"], format="%Y%m%d").strftime("%Y-%m-%d")
    except:
        date = None

    # --- Actor1 ---
    actor1 = row.get('actor1')

    # --- Actor2 ---
    actor2 = row.get('actor2')

    # --- Action ---
    event_code = str(row.get("col_26"))
    event_desc = eventcode_to_text(event_code)

    # --- 경제 영향 지표 ---
    tone = row.get("col_34")
    goldstein = row.get("col_30")

    # --- Source URL ---
    url = row.get("col_60")

    # --- Summary (natural language) ---
    if actor1 and actor2:
        actors = f"{actor1} acted toward {actor2}"
    elif actor1:
        actors = f"{actor1} made an action"
    else:
        actors = "An actor made an action"

    summary = (
        f"On {date}, {actors}: {event_desc}. "
        f"Tone={tone}, Goldstein={goldstein}. "
        f"Source: {url}"
    )

    if is_print:
        print(summary)

    # --- 최종 compact dict 반환 ---
    return {
        "date": date,
        "actor1": actor1,
        "actor2": actor2,
        "event_code": event_code,
        "event_desc": event_desc,
        "tone": tone,
        "goldstein": goldstein,
        "url": url,
    }

In [128]:
events

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,...,col_55,col_56,col_57,col_58,col_59,col_60,datetime,actor1,actor2,url
0,1276948730,20251201,202512,2025,2025.9068,BGD,DHAKA,BGD,,,...,5778,23.7231,90.4086,-2737683,20251201060000,https://www.bssnews.net/news-flash/337414,2025-12-01,BGD|DHAKA,BGDOPP|BANGLADESH,https://www.bssnews.net/news-flash/337414
1,1276948733,20251201,202512,2025,2025.9068,BGD,DHAKA,BGD,,,...,5778,23.7231,90.4086,-2737683,20251201060000,https://www.bssnews.net/news-flash/337414,2025-12-01,BGD|DHAKA,COP|LAW ENFORCEMENT AGENCIES,https://www.bssnews.net/news-flash/337414
2,1276948742,20251201,202512,2025,2025.9068,BGD,DHAKA,BGD,,,...,5778,23.7231,90.4086,-2737683,20251201060000,https://www.bssnews.net/news-flash/337414,2025-12-01,BGD|DHAKA,OPP|OUSTED PRIME MINISTER,https://www.bssnews.net/news-flash/337414
3,1276948875,20251201,202512,2025,2025.9068,DEU,GERMANY,DEU,,,...,28554,50.4333,30.5167,-1044367,20251201060000,https://www.lewrockwell.com/2025/12/no_author/...,2025-12-01,DEU|GERMANY,CAN|CANADA,https://www.lewrockwell.com/2025/12/no_author/...
4,1276948917,20251201,202512,2025,2025.9068,FRA,FRANCE,FRA,,,...,28554,50.4333,30.5167,-1044367,20251201060000,https://www.lewrockwell.com/2025/12/no_author/...,2025-12-01,FRA|FRANCE,CAN|CANADA,https://www.lewrockwell.com/2025/12/no_author/...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12756,1277135122,20251202,202512,2025,2025.9096,MIL,DEFENSE FORCE,,,,...,13277,24.0000,121,-2637939,20251202054500,https://www.opednews.com/populum/page.php?f=Ta...,2025-12-02,MIL|DEFENSE FORCE,TWN|TAIWAN,https://www.opednews.com/populum/page.php?f=Ta...
12757,1277135429,20251202,202512,2025,2025.9096,USA,UNITED STATES,USA,,,...,31959,10.5000,-66.9167,-938457,20251202054500,https://www.cnn.com/2025/12/02/politics/trump-...,2025-12-02,USA|UNITED STATES,|,https://www.cnn.com/2025/12/02/politics/trump-...
12758,1277135580,20251202,202512,2025,2025.9096,VEN,VENEZUELA,VEN,,,...,31959,10.5000,-66.9167,-938457,20251202054500,https://www.cnn.com/2025/12/02/politics/trump-...,2025-12-02,VEN|VENEZUELA,USA|UNITED STATES,https://www.cnn.com/2025/12/02/politics/trump-...
12759,1277136942,20251202,202512,2025,2025.9096,RWA,RWANDAN,RWA,,,...,21846,-10.7067,40.6328,-312482,20251202060000,https://allafrica.com/stories/202512020011.html,2025-12-02,RWA|RWANDAN,MOZ|MOZAMBIQUE,https://allafrica.com/stories/202512020011.html


In [129]:
reduced = events.apply(reduce_gdelt_row, axis=1)
df_reduced = pd.DataFrame(list(reduced))

In [130]:
df_unique = df_reduced.drop_duplicates(
    subset=["date", "actor1", "event_code", "url"],
    keep="first"
)

In [131]:
df_unique.loc[df_unique['actor2'] == '|', 'actor2'] = None

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


In [132]:
df_unique

Unnamed: 0,date,actor1,actor2,event_code,event_desc,tone,goldstein,url
0,2025-12-01,BGD|DHAKA,BGDOPP|BANGLADESH,151,Announce fiscal or monetary policy measures,-4.651163,-7.2,https://www.bssnews.net/news-flash/337414
3,2025-12-01,DEU|GERMANY,CAN|CANADA,154,Make public statements regarding currency or e...,-6.150583,-7.2,https://www.lewrockwell.com/2025/12/no_author/...
4,2025-12-01,FRA|FRANCE,CAN|CANADA,154,Make public statements regarding currency or e...,-6.150583,-7.2,https://www.lewrockwell.com/2025/12/no_author/...
5,2025-12-01,GBR|BRITAIN,CAN|CANADA,154,Make public statements regarding currency or e...,-6.150583,-7.2,https://www.lewrockwell.com/2025/12/no_author/...
6,2025-12-01,IGOWSTNAT|NATO,USA|AMERICAN,138,Agree to enhance economic cooperation,-6.178490,-7.0,https://nuclear-news.net/2025/12/01/3-a-what-d...
...,...,...,...,...,...,...,...,...
9753,2025-11-27,USABUS|AMERICAN,,138,Agree to enhance economic cooperation,-6.880428,-7.0,https://original.antiwar.com/Michelle_Ellner/2...
9754,2025-11-27,BUS|BANK,,154,Make public statements regarding currency or e...,0.154083,-7.2,https://www.moneycontrol.com/news/trends/woman...
9755,2025-11-27,GOV|PRIME MINISTER,,138,Agree to enhance economic cooperation,-4.710145,-7.0,https://www.brisbanetimes.com.au/politics/fede...
9756,2025-11-27,GOV|GOVERNMENT,MIL|DEFENSE FORCE,154,Make public statements regarding currency or e...,-6.086957,-7.2,https://www.chiangraitimes.com/politics/japan-...


In [None]:
df_reduced

In [None]:
COLUMNS_TO_DROP = [
    "actor1",
    "actor2",
    "tone",
    "goldstein",
    "url",
]

In [None]:
df_reduced_clean = df_reduced.drop(columns=COLUMNS_TO_DROP, errors="ignore")

In [None]:
df_reduced_clean