# GPT 생성 DEMO 코드

In [None]:
import pandas as pd

data = pd.read_parquet("df_filtered_2012_final.parquet")
data

In [None]:
len(data['date'].unique())

In [None]:
sampled = data.sample(n=100, random_state=42)
# convert date to datetime (YYYYMMDD) and sort sampled ascending by date
sampled['date'] = pd.to_datetime(sampled['date'], format='%Y%m%d', errors='coerce')
sampled = sampled.sort_values('date', ascending=True)
sampled['date'] = sampled['date'].dt.strftime('%Y-%m-%d')
sampled

In [None]:
from pathlib import Path
import os
from dotenv import load_dotenv
load_dotenv()
import json
import time

In [None]:
from pathlib import Path
import os
from dotenv import load_dotenv
load_dotenv()
import json
import time

def make_batch_prompt(data_frame):
    """
    rows: list of dicts, where each dict has keys:
          ['idx', 'date', 'actor1', 'actor2', 'event_code', 'event_desc']
    """
    
    # 1. 리스트에 있는 모든 사건을 하나의 문자열로 변환
    events_text = ""
    for index, row in data_frame.iterrows():
        events_text += (
            f"Index: {index} | "
            f"Date: {row['date']} | "
            f"Actors: {row['actor1']} -> {row['actor2']} | "
            f"Event: {row['event_desc']} (Code: {row['event_code']})\n"
        )

    # 2. 프롬프트 구성 (영어 번역 및 지침 포함)
    prompt = (
        f"Here is a list of sequential events:\n\n"
        f"{events_text}\n"
        "--------------------------------------------------\n"
        "Task: Identify causal relationships BETWEEN these events. "
        "Find pairs where an earlier event (Cause) logically led to a later event (Effect).\n\n"
        
        "Requirements:\n"
        "1. Identify the 'cause_idx' and 'effect_idx' from the list above.\n"
        "2. Strict Temporal Order: The Cause must happen BEFORE the Effect. Future events cannot be the cause of past events. (Check the 'Date' order).\n"
        "3. Provide a very brief 'reason' explaining the connection. (Less than 15 words)\n"
        "4. Ignore unrelated events. If no relationships are found, return [].\n\n"
        
        "Output Format (JSON list):\n"
        "[\n"
        "  {\"cause_idx\": 1, \"effect_idx\": 3, \"reason\": \"The attack in event 1 caused the retaliation in event 3\"},\n"
        "  {\"cause_idx\": 5, \"effect_idx\": 6, \"reason\": \"...\"}\n"
        "]"
    )
    
    return prompt

In [None]:
prompts = make_batch_prompt(sampled)

# save prompts to a text file (one prompt per block)
out_dir = Path(".")
prompts_file = out_dir / "sampled_prompts.txt"
with prompts_file.open("w", encoding="utf-8") as f:
    f.write(prompts)


In [None]:
# Optional: call OpenAI API if available
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
from openai import OpenAI

if OPENAI_API_KEY is None:
    print("OPENAI_API_KEY not found in environment. Skipping API calls.")
else:
    try:
        import openai
    except Exception as e:
        print("openai package not installed or failed to import:", e)
        openai = None
        
        
    client = OpenAI(
        api_key=os.environ.get("OPENAI_API_KEY")
    )

    print("Calling OpenAI API...")
    
    if openai is not None:
        openai.api_key = OPENAI_API_KEY
        # adjust model and rate settings as needed
        # MODEL = "gpt-3.5-turbo"  # or "gpt-4" if available in your account
        MODEL = "gpt-4.1-nano"  # or "gpt-4" if available in your account
        delay_between_calls = 1.0  # seconds

        results = []
        prompt_text = prompts
        
        try:
            resp = client.chat.completions.create(
                    model=MODEL,  # 또는 사용 중인 모델명
                    messages=[{"role": "user", "content": prompt_text}],
                    max_tokens=2048, # JSON이 길어질 수 있으므로 넉넉하게 잡는 것을 추천
                    temperature=0.0,
                )
                
            # 4. 응답 접근 방식 변경: 딕셔너리([])가 아니라 점(.)으로 접근
            content = resp.choices[0].message.content.strip()
        except Exception as e:
            content = f"__error__: {e}"
        # attempt to parse JSON from the reply
        parsed = None
        if not content.startswith("__error__"):
            # try to find first JSON array in the text
            try:
                # naive extraction: find first '[' and last ']' and parse
                start = content.find('[')
                end = content.rfind(']')
                if start != -1 and end != -1 and end > start:
                    json_text = content[start:end+1]
                    parsed = json.loads(json_text)
            except Exception:
                parsed = None

        results.append({
            "prompt": prompt_text,
            "response_text": content,
            "parsed_json": parsed,
        })

        # simple pacing
        time.sleep(delay_between_calls)

        # save responses to file
        out_file = out_dir / "gpt_responses.jsonl"
        with out_file.open("w", encoding="utf-8") as f:
            for r in results:
                f.write(json.dumps(r, ensure_ascii=False) + "\n")
        print(f"Saved {len(results)} responses to {out_file}")

        # also produce a pandas DataFrame view (no re-import of pandas; uses existing namespace)
        try:
            gpt_results = pd.DataFrame(results).set_index("index")
            display(gpt_results.head(20))
        except Exception:
            pass

In [None]:
# gpt_responses.jsonl 열어서 확인하기
# 기존에 쓰기용으로 열린 핸들(f)이 있으면 닫기

path = Path("results/gpt_responses.jsonl")
if not path.exists():
    print(f"{path} not found")
else:
    records = []
    with path.open("r", encoding="utf-8") as fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            try:
                records.append(json.loads(line))
            except Exception as e:
                records.append({"_raw": line, "_error": str(e)})

    print(f"Loaded {len(records)} records from {path}\n")

    if len(records) == 0:
        print("No records to display.")
    else:
        # 출력: 첫 레코드의 원본 응답과 파싱된 JSON 확인
        first = records[0]
        print("First record - response_text:")
        print(first.get("response_text", first.get("_raw")))
        print("\nFirst record - parsed_json:")
        print(first.get("parsed_json"))

        # pandas로 테이블 형태 확인 (pandas 이미 임포트되어 있으므로 재임포트 불필요)
        try:
            df_results = pd.DataFrame(records)
            display(df_results.head(20))
        except Exception as e:
            print("Failed to create/display DataFrame:", e)

# 결과 parsing

In [205]:
from pathlib import Path
import os
from dotenv import load_dotenv
load_dotenv()
import json
import time
import pandas as pd

In [206]:
MODEL = "gpt-4.1-nano"
# MODEL = "gpt-5-nano"
# PROMPT_VERSION = "v1"
# PROMPT_VERSION = "v3"
# PROMPT_VERSION = "v4"
# PROMPT_VERSION = "v5"
# PROMPT_VERSION = "v6"
PROMPT_VERSION = "v7"
# number_of_samples = 500
number_of_samples = 2000
# temperature = 0.5
temperature = 0.0
data_type = "usa"  # "2012"

In [207]:
data = pd.read_parquet(f"df_filtered_{data_type}_final.parquet")

In [208]:
# gpt_responses.jsonl 열어서 확인하기
# 기존에 쓰기용으로 열린 핸들(f)이 있으면 닫기

# file_name = f"results/gpt-5-nano_gpt_responses.jsonl"
# file_name = f"results/{MODEL}-{PROMPT_VERSION}-{number_of_samples}-{temperature}_gpt_responses.jsonl"
file_name = f"results/{data_type}-{MODEL}-{PROMPT_VERSION}-{number_of_samples}-{temperature}-100_gpt_responses.jsonl"
path = Path(file_name)
if not path.exists():
    print(f"{path} not found")
else:
    records = []
    with path.open("r", encoding="utf-8") as fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            try:
                records.append(json.loads(line))
            except Exception as e:
                records.append({"_raw": line, "_error": str(e)})

    print(f"Loaded {len(records)} records from {path}\n")

    if len(records) == 0:
        print("No records to display.")
    else:
        # 출력: 첫 레코드의 원본 응답과 파싱된 JSON 확인
        first = records[0]
        print("First record - response_text:")
        print(first.get("response_text", first.get("_raw")))
        print("\nFirst record - parsed_json:")
        print(first.get("parsed_json"))

        # pandas로 테이블 형태 확인 (pandas 이미 임포트되어 있으므로 재임포트 불필요)
        try:
            df_results = pd.DataFrame(records)
            display(df_results.head(20))
        except Exception as e:
            print("Failed to create/display DataFrame:", e)

Loaded 100 records from results/usa-gpt-4.1-nano-v7-2000-0.0-100_gpt_responses.jsonl

First record - response_text:
[
  {
    "cause_idx": 17233,
    "effect_idx": 28741,
    "reason": "ISR's appeal for economic cooperation likely prompted USA's offer of trade concessions."
  },
  {
    "cause_idx": 28741,
    "effect_idx": 153305,
    "reason": "USA's offer of trade concessions probably led MED's statement of trade intent."
  },
  {
    "cause_idx": 63253,
    "effect_idx": 76358,
    "reason": "BUS's trade statement of intent likely influenced USMED's dissatisfaction expression."
  },
  {
    "cause_idx": 78840,
    "effect_idx": 153305,
    "reason": "PSEMED's trade concessions offer probably triggered USA's trade statement of intent."
  },
  {
    "cause_idx": 153305,
    "effect_idx": 226826,
    "reason": "USA's trade statement of intent likely prompted USMED's similar statement."
  },
  {
    "cause_idx": 226826,
    "effect_idx": 231872,
    "reason": "USMED's trade statement p

Unnamed: 0,prompt,response_text,parsed_json
0,You are an expert in identifying causal relati...,"[\n {\n ""cause_idx"": 17233,\n ""effect_i...","[{'cause_idx': 17233, 'effect_idx': 28741, 're..."
1,You are an expert in identifying causal relati...,"[\n {\n ""cause_idx"": 3064,\n ""effect_id...","[{'cause_idx': 3064, 'effect_idx': 100468, 're..."
2,You are an expert in identifying causal relati...,"[\n {\n ""cause_idx"": 4504,\n ""effect_id...","[{'cause_idx': 4504, 'effect_idx': 14847, 'rea..."
3,You are an expert in identifying causal relati...,"[\n {\n ""cause_idx"": 120124,\n ""effect_...","[{'cause_idx': 120124, 'effect_idx': 124850, '..."
4,You are an expert in identifying causal relati...,"[\n {\n ""cause_idx"": 3119,\n ""effect_id...",
5,You are an expert in identifying causal relati...,"[\n {\n ""cause_idx"": 2753,\n ""effect_id...","[{'cause_idx': 2753, 'effect_idx': 4326, 'reas..."
6,You are an expert in identifying causal relati...,"[\n {\n ""cause_idx"": 38027,\n ""effect_i...","[{'cause_idx': 38027, 'effect_idx': 59461, 're..."
7,You are an expert in identifying causal relati...,"[\n {\n ""cause_idx"": 27793,\n ""effect_i...",
8,You are an expert in identifying causal relati...,"[\n {\n ""cause_idx"": 2984,\n ""effect_id...","[{'cause_idx': 2984, 'effect_idx': 60204, 'rea..."
9,You are an expert in identifying causal relati...,"[\n {\n ""cause_idx"": 10187,\n ""effect_i...","[{'cause_idx': 10187, 'effect_idx': 26626, 're..."


In [209]:
for i, result in enumerate(records):
    if result['parsed_json'] is None:
        temp_str = result['response_text']
        temp_str = temp_str.replace("\n", "")
        temp_str = temp_str.split(',  {')
        temp_str = temp_str[:-1]
        temp_str = ',  {'.join(temp_str)
        temp_str = temp_str + ']'
        temp_str = eval(temp_str)
        result['parsed_json'] = temp_str
        
df_results = pd.DataFrame(records)

In [210]:
df_results['triple_num'] = df_results['parsed_json'].apply(lambda x: len(x) if x is not None else 0)

In [211]:
ADD_REASON = True


triple_txt = []

for triples in df_results['parsed_json']:
    for triple in triples:
        triple_txt.append((triple['cause_idx'], "cause", triple['effect_idx'], triple['reason']))
    

print(len(triple_txt)) # 4696
triple_df = pd.DataFrame(triple_txt, columns=['cause_idx', 'relation', 'effect_idx', 'reason'])
triple_df = triple_df.iloc[triple_df.loc[:, ['cause_idx', 'relation', 'effect_idx']].drop_duplicates().index, :]

triple_txt = []
hallucinated = []
for triple in triple_df.itertuples():
    if triple.cause_idx not in data.index or triple.effect_idx not in data.index:
        hallucinated.append(triple.Index)
        continue
    if ADD_REASON:
        triple_txt.append(f"{triple.cause_idx}\t{triple.relation}\t{triple.effect_idx}\t{triple.reason}")
    else:
        triple_txt.append(f"{triple.cause_idx}\t{triple.relation}\t{triple.effect_idx}")
print(len(triple_txt))

from pathlib import Path
if ADD_REASON:
    path = Path(f"results/{data_type}-{MODEL}-prompt-{PROMPT_VERSION}-{number_of_samples}-{temperature}-100_triples_reason.txt")
else:
    path = Path(f"results/{data_type}-{MODEL}-prompt-{PROMPT_VERSION}-{number_of_samples}-{temperature}-100_triples.txt")
with path.open("w", encoding="utf-8") as f:
    for line in triple_txt:
        f.write(line + "\n")

2660
1849


In [212]:
triple_df = triple_df.drop(hallucinated)

In [213]:
triple_df.drop(['reason'], axis=1, inplace=True)
# group effect_idx by cause_idx into a dict (unique, sorted lists)

In [214]:
cause_to_effects = (
    triple_df.groupby("cause_idx")['effect_idx']
    .apply(lambda s: s.tolist())
)
cause_to_effects

cause_idx
2402           [46857]
2493            [4341]
2524            [3766]
2566           [18619]
2573            [8123]
               ...    
42349566    [42364508]
42583829    [42583829]
42918209    [42918274]
43413280    [43481037]
43610343    [43645272]
Name: effect_idx, Length: 1636, dtype: object

In [215]:
data_merged = data.merge(cause_to_effects, left_on=data.index, right_on="cause_idx", how='left')
data_merged

Unnamed: 0,cause_idx,date,actor1,actor2,event_code,event_desc,effect_idx
0,3,20090604,PSE,USAGOV,022,Agreement to pursue economic cooperation,
1,65,20090604,PTY,USA,022,Agreement to pursue economic cooperation,
2,67,20090604,PTY,USACVL,022,Agreement to pursue economic cooperation,
3,318,20090604,REL,USA,071,Offer trade concessions,
4,767,20090604,RUS,USA,071,Offer trade concessions,
...,...,...,...,...,...,...,...
708335,43648029,20140217,USA,VNM,138,Agree to enhance economic cooperation,
708336,43648474,20140217,VNM,USALEG,071,Offer trade concessions,
708337,43648718,20140217,ZAF,USABUS,073,Approve trade agreements,
708338,43648869,20140217,ZWE,USAMED,085,Reduce economic assistance,


In [216]:
from tqdm import tqdm
cause_to_effects_dict = cause_to_effects.to_dict()
cause_to_effects_reason_dict = {}

for k, v in tqdm(cause_to_effects_dict.items()):
    if k not in cause_to_effects_reason_dict:
        cause_to_effects_reason_dict[k] = []
    for effects in v:
        cause_to_effects_reason_dict[k].append(tuple(data_merged.loc[data_merged['cause_idx'] == effects, ['cause_idx', 'date', 'actor1', 'actor2', 'event_code', 'event_desc']].values[0]))
    

100%|██████████| 1636/1636 [00:37<00:00, 43.16it/s]


In [217]:
cause_to_effects_reason_dict_tmp = {}
for k, v in cause_to_effects_reason_dict.items():
    cause_to_effects_reason_dict_tmp[k] = tuple(v)
    
cause_to_effects_reason_dict_tmp_df = pd.DataFrame.from_dict(cause_to_effects_reason_dict_tmp, orient='index')
    

In [218]:
data_merged_merged = data_merged.merge(cause_to_effects_reason_dict_tmp_df, left_on=data_merged['cause_idx'], right_index=True, how='left')

In [219]:
data_merged_merged.drop(['key_0'], axis=1, inplace=True)

In [220]:
# 컬럼들 중에 튜플/리스트가 들어있는 컬럼만 찾아 JSON으로 직렬화
tuple_cols = [c for c in data_merged_merged.columns if data_merged_merged[c].apply(lambda x: isinstance(x, (tuple, list))).any()]

for c in tuple_cols:
    data_merged_merged[c] = data_merged_merged[c].apply(lambda x: json.dumps(x, ensure_ascii=False) if isinstance(x, (tuple, list)) else x)

output_file = f"results/df_filtered_{data_type}_final_causal_matched_all-prompt-{PROMPT_VERSION}-{MODEL}-{number_of_samples}-{temperature}"
data_merged_merged.to_parquet(f"{output_file}.parquet")

  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


In [221]:
data = pd.read_parquet(f"{output_file}.parquet")

In [222]:
cols = ['effect_idx', 0, 1, 2, 3, 4]
cols_exist = [c for c in cols if c in data.columns]

before = len(data)
if cols_exist:
    data = data.loc[~data[cols_exist].isnull().all(axis=1)].copy()
after = len(data)

print(f"Dropped {before-after} rows; remaining {after}")
data

Dropped 706704 rows; remaining 1636


Unnamed: 0,cause_idx,date,actor1,actor2,event_code,event_desc,effect_idx,0,1,2,...,24,25,26,27,28,29,30,31,32,33
10,2402,20090604,USA,AFG,072,Implement or announce trade restrictions,[46857],"[46857, ""20090607"", ""BUS"", ""USA"", ""070"", ""Stat...",,,...,,,,,,,,,,
18,2493,20090604,USA,BRA,071,Offer trade concessions,[4341],"[4341, ""20090604"", ""USAMED"", ""MOS"", ""154"", ""Ma...",,,...,,,,,,,,,,
21,2524,20090604,USA,BUS,071,Offer trade concessions,[3766],"[3766, ""20090604"", ""USAGOV"", ""USAGOV"", ""073"", ...",,,...,,,,,,,,,,
23,2566,20090604,USABUS,USA,070,Statement of intent regarding trade actions,[18619],"[18619, ""20090605"", ""KOR"", ""USAMED"", ""084"", ""E...",,,...,,,,,,,,,,
25,2573,20090604,USA,CAN,022,Agreement to pursue economic cooperation,[8123],"[8123, ""20090605"", ""CAN"", ""USA"", ""071"", ""Offer...",,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
687007,42349566,20140105,GOV,USA,070,Statement of intent regarding trade actions,[42364508],"[42364508, ""20140105"", ""USA"", ""IRQ"", ""138"", ""A...",,,...,,,,,,,,,,
691176,42583829,20140112,ARE,MNCUSA,070,Statement of intent regarding trade actions,[42583829],"[42583829, ""20140112"", ""ARE"", ""MNCUSA"", ""070"",...",,,...,,,,,,,,,,
696363,42918209,20140127,USA,JUD,081,Provide economic aid,[42918274],"[42918274, ""20140127"", ""USAJUD"", ""HLH"", ""084"",...",,,...,,,,,,,,,,
704553,43413280,20140211,EUR,USA,154,Make public statements regarding currency or e...,[43481037],"[43481037, ""20140212"", ""USAGOV"", ""IRN"", ""085"",...",,,...,,,,,,,,,,


In [223]:
data.to_parquet(f"{output_file}_all_cleaned.parquet")

In [20]:
import pandas as pd

data = pd.read_parquet(f"{output_file}_all_cleaned.parquet")
data

Unnamed: 0,cause_idx,date,actor1,actor2,event_code,event_desc,effect_idx,0,1,2,3,4,5,6,7,8,9
4,21021446,20120101,AFG,DNK,154,Make public statements regarding currency or e...,[21033405],"[21033405, ""20120101"", ""USA"", ""MED"", ""070"", ""S...",,,,,,,,,
26,21021950,20120101,BGD,BGDCVL,071,Offer trade concessions,[21052503],"[21052503, ""20120102"", ""ZMBGOV"", ""PTY"", ""071"",...",,,,,,,,,
29,21021995,20120101,BGD,USA,073,Approve trade agreements,[21061891],"[21061891, ""20120103"", ""IGOUNO"", ""SSDMIL"", ""07...",,,,,,,,,
44,21022197,20120101,BUS,EDU,071,Offer trade concessions,"[21025647, 21038483]","[21025647, ""20120101"", ""GOVMIL"", ""SAU"", ""022"",...","[21038483, ""20120102"", ""EGY"", ""GOV"", ""071"", ""O...",,,,,,,,
52,21022293,20120101,BUS,MED,070,Statement of intent regarding trade actions,"[21051599, 21036415]","[21051599, ""20120102"", ""USAMED"", ""LEG"", ""154"",...","[21036415, ""20120102"", ""CHR"", ""USA"", ""138"", ""A...",,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
207442,24793459,20120511,NGOUSAHLH,BUS,074,Reject trade agreements,[24822171],"[24822171, ""20120512"", ""USAGOV"", ""GBR"", ""071"",...",,,,,,,,,
214184,24918669,20120516,CHN,CHNSET,075,Engage in economic or trade negotiations,[24925595],"[24925595, ""20120516"", ""GOV"", ""AUSCOP"", ""152"",...",,,,,,,,,
219561,25012495,20120519,CAN,JUD,025,Appeal for economic cooperation,[25038589],"[25038589, ""20120520"", ""GBR"", ""scoGOV"", ""084"",...",,,,,,,,,
227188,25152692,20120523,USA,CUB,072,Implement or announce trade restrictions,[25188792],"[25188792, ""20120524"", ""USAJUD"", ""USA"", ""084"",...",,,,,,,,,


In [None]:
data.reset_index(inplace=True)
