In [1]:
# Mount Google Drive at /content/drive/ so that later cells can read and write files under it.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
import pandas as pd
import re
import ipywidgets as widgets
from IPython.display import display, clear_output

In [3]:
# Load the stacked cause/effect/reason table from the mounted Drive into a DataFrame.
# NOTE: the DataFrame is bound to the name `csv`, which would shadow the stdlib `csv`
# module if that module were ever imported in this notebook.
csv = pd.read_csv('/content/drive/My Drive/STAR_fellowship/stacked_cause_effect_reason.csv')

In [4]:
# Preview the first rows of the raw table.
csv.head()

Unnamed: 0,idx,cause,effect,reason,evaluation
0,0,Israel Appeal for economic cooperation to Unit...,United States Offer trade concessions to Unite...,ISR's appeal for economic cooperation likely p...,O
1,1,United States Offer trade concessions to Unite...,United States Statement of intent regarding tr...,USA's offer of trade concessions probably led ...,X
2,2,business sector Statement of intent regarding ...,health sector multinational corporation in Uni...,BUS's trade statement of intent likely influen...,O
3,3,"Palestine, State of medical sector Offer trade...",United States Statement of intent regarding tr...,PSEMED's trade concessions offer probably trig...,O
4,4,United States Statement of intent regarding tr...,United States medical sector Statement of inte...,USA's trade statement of intent likely prompte...,O


In [5]:
!pip install pycountry



In [6]:
import pandas as pd
import pycountry

# ==============================
# 1) 3글자 단위 split 함수
# ==============================
def split_triplets(s):
    """Split a value into consecutive, upper-cased three-character chunks.

    Args:
        s: Value to split. Anything ``pd.isna`` reports as missing yields an
            empty list; other values are converted with ``str`` first.

    Returns:
        list[str]: The complete three-character chunks, left to right. A
        trailing remainder of one or two characters is dropped.
    """
    if pd.isna(s):
        return []
    text = str(s).upper()
    # Only whole triplets are kept, so iterate over the number of full chunks.
    whole_chunks = len(text) // 3
    return [text[3 * k:3 * k + 3] for k in range(whole_chunks)]


# ==============================
# 2) ISO & GDELT region code setup
# ==============================
# The alpha_3 attribute of every entry in pycountry.countries.
ISO_ALPHA3_SET = {c.alpha_3 for c in pycountry.countries}

# Three-letter region codes that are accepted in addition to ISO country codes.
# The values are the display names that replace the codes in text.
GDELT_REGION_CODES = {
    "AFR": "Africa",
    "ASA": "Asia",
    "EUR": "Europe",
    "MEA": "Middle East & North Africa",
    "XAF": "Sub-Saharan Africa",
    "XAS": "East & Southeast Asia",
    "XME": "Middle East",
    "XEU": "European region",
    "SAF": "South Africa region",  # NOTE(review): CAMEO appears to use SAF for "Southern Africa" (the region, not the country) — confirm
    "WAF": "West Africa",
    "EAF": "East Africa",
}


# ==============================
# 3) Base dictionaries for identity / org / role codes
# ==============================
# Each map goes from a three-letter actor code to its display name.
# extract_country_codes() removes every key of these three maps from the set of
# country codes, so a key added here that is also an ISO alpha-3 code will stop
# being treated as a country.

# Codes that parse_actor() matches as an identity prefix.
identity_map = {
    "CHR": "Christian",
    "MOS": "Muslim",
    "JEW": "Jewish",
    "BUD": "Buddhist",
    "HIN": "Hindu",
    "PRO": "Protestant",
}

# Codes that parse_actor() matches as an organization prefix.
org_map = {
    "NGO": "Non-Governmental Organization",
    "IGO": "Inter-Governmental Organization",
    "MNC": "Multinational Corporation",
    "IMG": "Migrant Group",  # NOTE(review): the CAMEO codebook appears to define IMG as "International Militarized Group" — confirm
    "UNO": "United Nations Organization",
    "WBK": "World Bank Group",
    "ADB": "Asian Development Bank",
    "DEV": "Development Organization",
}

# Codes that parse_actor() strips as role suffixes.
role_map = {
    "GOV": "Government",
    "MIL": "Military",
    "BUS": "Business Sector",
    "CVL": "Civilian Group",
    "HLH": "Health Sector",
    "EDU": "Education Sector",
    "JUD": "Judiciary",
    "REL": "Religious Organization",
    "LAB": "Labor Union",
    "COP": "Police",
    "OPP": "Opposition Group",
    "REF": "Refugee Group",
    "AGR": "Agricultural Sector",
    "MED": "Medical Sector",  # NOTE(review): the CAMEO codebook appears to define MED as "Media", not medical — confirm
    "PTY": "Political Party",
    "HRI": "Human Rights Organization",
    "ELI": "Elite",
    "CRM": "Criminal Organization",
    "UAF": "Armed Forces",  # NOTE(review): CAMEO appears to define UAF as "Unaligned Armed Forces" — confirm
    "LEG": "Legislature",
    "RAD": "Radical Group",
    "REB": "Rebel Group",
    "SET": "Settler Group",
    "SPY": "Intelligence Agency",
}


# ==============================
# 4) 국가 코드 추출
# ==============================
def extract_country_codes(tokens):
    """Select the tokens that denote a country or region.

    A token qualifies when it is an ISO alpha-3 code or a key of
    ``GDELT_REGION_CODES``, and is not also a key of ``identity_map``,
    ``role_map`` or ``org_map``.

    Args:
        tokens: Iterable of candidate code strings.

    Returns:
        set: The qualifying tokens (duplicates collapsed).
    """
    # Codes claimed by the identity / role / org dictionaries take precedence
    # over a country or region reading.
    reserved = set(identity_map) | set(role_map) | set(org_map)
    return {
        tok
        for tok in tokens
        if (tok in ISO_ALPHA3_SET or tok in GDELT_REGION_CODES) and tok not in reserved
    }


# ==============================
# 5) Natural-language builder
# ==============================
def build_phrase(identity, roles, org, country):
    """Assemble a natural-language actor phrase from decoded components.

    Identity, role and organization text is lower-cased; the country name is
    used as given. Components are tested for truthiness, so ``None`` and
    ``""`` both count as absent.

    Args:
        identity: Identity name (e.g. "Christian") or a falsy value.
        roles: Role text (e.g. "Government") or a falsy value.
        org: Organization name or a falsy value.
        country: Country/region name or a falsy value.

    Returns:
        str | None: The phrase, or ``None`` when neither identity, org nor
        country is present (roles alone do not produce a phrase).
    """
    # An identity always wins; org is ignored, and roles are only used
    # together with a country.
    if identity:
        who = identity.lower()
        if not country:
            return f"{who} group"
        if roles:
            return f"{who} {roles.lower()} in {country}"
        return f"{who} group in {country}"

    if country:
        if org:
            located_org = f"{org.lower()} in {country}"
            return f"{roles.lower()} {located_org}" if roles else located_org
        return f"{country} {roles.lower()}" if roles else country

    # No identity and no country: only a bare organization can be phrased.
    return org.lower() if org else None


# ==============================
# 6) parse_actor()
# ==============================
def parse_actor(code):
    """Decode a concatenated actor code into a natural-language phrase.

    The code is stripped and upper-cased, then resolved in this order:

    1. Whole-code lookup in ``role_map``, ``identity_map``, ``org_map`` and
       ``country_map``.
    2. Compound parse: optional identity prefix, optional organization
       prefix, optional country/region prefix, then any number of role
       suffixes, combined by ``build_phrase``.

    A compound parse is accepted only if it consumes the entire code. If any
    characters are left over, the input is not a concatenation of known codes
    (for example an ordinary all-caps word such as "PROTEST" that merely
    starts with the identity code "PRO"), and the lower-cased original is
    returned instead of a misleading partial decode.

    Relies on the module-level ``role_map``, ``identity_map``, ``org_map``,
    ``country_map`` and ``build_phrase``. ``country_map`` is not defined next
    to this function and must exist before the first call.

    Args:
        code: Actor code string such as ``"ISR"`` or ``"USAGOV"``.

    Returns:
        str: The decoded phrase, or ``code.lower()`` (the untouched input,
        lower-cased) when the code cannot be fully decoded. Callers detect a
        failed decode by comparing the result with the lower-cased code.
    """
    original = code
    code = code.strip().upper()

    # --- Whole-code matches -------------------------------------------------
    # 0) Pure role code
    if code in role_map:
        return role_map[code].lower()

    # 1) Pure identity code
    if code in identity_map:
        return identity_map[code].lower() + " group"

    # 2) Pure organization code
    if code in org_map:
        return org_map[code].lower()

    # 3) Pure country / region code (name is returned with its original casing)
    if code in country_map:
        return country_map[code]

    # --- Compound code parsing ----------------------------------------------
    identity = None
    org = None
    country = None
    roles = []

    # Identity prefix (longest key first so a longer code wins over its prefix)
    for p in sorted(identity_map, key=lambda x: -len(x)):
        if code.startswith(p):
            identity = identity_map[p]
            code = code[len(p):]
            break

    # Organization prefix
    for p in sorted(org_map, key=lambda x: -len(x)):
        if code.startswith(p):
            org = org_map[p]
            code = code[len(p):]
            break

    # Country / region prefix
    for c in sorted(country_map, key=lambda x: -len(x)):
        if code.startswith(c):
            country = country_map[c]
            code = code[len(c):]
            break

    # Role suffixes: keep stripping until a full pass removes nothing.
    # Roles are collected from the end of the code backwards, so they are
    # listed in reverse of their order inside the code.
    changed = True
    while changed:
        changed = False
        for r in sorted(role_map, key=lambda x: -len(x)):
            if code.endswith(r):
                roles.append(role_map[r])
                code = code[:-len(r)]
                changed = True

    # Leftover characters mean the input was only partially recognised;
    # treat that as a failed decode rather than dropping the remainder.
    if code:
        return original.lower()

    # Assemble the natural-language phrase
    roles_final = ", ".join(roles) if roles else None
    phrase = build_phrase(identity, roles_final, org, country)

    if phrase:
        return phrase

    # Fallback: nothing usable was decoded
    return original.lower()

In [7]:
import re

# =========================================================
# 1. Extract country codes from the text of the current DataFrame (csv) and build the map
# =========================================================
print("Extracting country codes from dataframe text...")

# Columns to scan
target_cols = ['cause', 'effect', 'reason']

# Collect every standalone three-letter upper-case token from the text.
# NOTE(review): the pattern only matches tokens of exactly three letters, so a country
# code that occurs solely inside a longer code (e.g. the "PSE" in "PSEMED") is not
# collected here — confirm this is intended.
all_tokens = set()
for col in target_cols:
    if col in csv.columns:
        extracted = csv[col].dropna().astype(str).apply(lambda x: re.findall(r'\b[A-Z]{3}\b', x))
        for tokens in extracted:
            all_tokens.update(tokens)

# Keep only ISO / region codes, using extract_country_codes defined earlier
valid_country_codes = extract_country_codes(list(all_tokens))

# Rebuild country_map: code -> pycountry name for the codes found in the text,
# then add every GDELT region code regardless of whether it occurred in the text.
# parse_actor() reads this module-level dict at call time.
country_map = {}
for c in valid_country_codes:
    obj = pycountry.countries.get(alpha_3=c)
    if obj:
        country_map[c] = obj.name
country_map.update(GDELT_REGION_CODES)

print(f"Country Map Built: {len(country_map)} entries found.")


def replace_gdelt_codes(text):
    """Replace upper-case actor codes in a text with their decoded phrases.

    Every run of three or more upper-case ASCII letters delimited by word
    boundaries (e.g. ``ISR``, ``USAGOV``) is passed to ``parse_actor``. The
    token is substituted only when decoding succeeded.

    Args:
        text: Text to process. Missing values (per ``pd.isna``) are returned
            unchanged; other values are converted with ``str``.

    Returns:
        The text with decodable codes replaced, or the original missing value.
    """
    if pd.isna(text):
        return text

    def _decode(found):
        token = found.group()
        phrase = parse_actor(token)
        # parse_actor signals failure by returning the lower-cased input;
        # in that case leave the token exactly as it appeared.
        return token if phrase == token.lower() else phrase

    code_pattern = re.compile(r'\b[A-Z]{3,}\b')
    return code_pattern.sub(_decode, str(text))


Extracting country codes from dataframe text...
Country Map Built: 89 entries found.


In [8]:

# Decode actor codes in every target column that exists; each column is overwritten in place.
print("Replacing acronyms in text columns...")

for col in target_cols:
    if col in csv.columns:
        csv[col] = csv[col].apply(replace_gdelt_codes)


Replacing acronyms in text columns...


In [9]:
# Preview the first rows after code replacement (compare with the preview of the raw table).
csv.head()

Unnamed: 0,idx,cause,effect,reason,evaluation
0,0,Israel Appeal for economic cooperation to Unit...,United States Offer trade concessions to Unite...,Israel's appeal for economic cooperation likel...,O
1,1,United States Offer trade concessions to Unite...,United States Statement of intent regarding tr...,United States's offer of trade concessions pro...,X
2,2,business sector Statement of intent regarding ...,health sector multinational corporation in Uni...,business sector's trade statement of intent li...,O
3,3,"Palestine, State of medical sector Offer trade...",United States Statement of intent regarding tr...,"Palestine, State of medical sector's trade con...",O
4,4,United States Statement of intent regarding tr...,United States medical sector Statement of inte...,United States's trade statement of intent like...,O


In [10]:

# Save the DataFrame with decoded actor codes; the row index is not written.
save_path = '/content/drive/My Drive/STAR_fellowship/stacked_cause_effect_reason_preprocessed.csv'
csv.to_csv(save_path, index=False)

In [16]:
import pandas as pd
import re
from difflib import SequenceMatcher

# ==========================================
# 1. Load the file
# ==========================================
# Reads the same path the preprocessing save cell writes to; this rebinds `csv`.
file_path = '/content/drive/My Drive/STAR_fellowship/stacked_cause_effect_reason_preprocessed.csv'
csv = pd.read_csv(file_path)

# ==========================================
# 2. Strengthened stopword list
# ==========================================
# Words that occur often in GDELT-derived text but do not discriminate between rows,
# plus causal connectives. extract_keywords() subtracts this set from its tokens.
# Note: extract_keywords() only produces tokens of three or more letters, so the
# one- and two-letter entries below can never match there.
DOMAIN_STOPWORDS = {
    # Grammatical stopwords
    "a", "an", "the", "in", "on", "at", "to", "for", "of", "by", "with", "and", "or", "but",
    "is", "are", "was", "were", "be", "been", "this", "that", "it", "he", "she", "they", "have", "has", "had",

    # Hedging / causal words (sharing these does not mean the content is the same)
    "likely", "probably", "may", "might", "could", "would", "due", "because", "caused", "prompted", "led", "result", "resulted",

    # Generic GDELT-domain nouns (too common to be informative)
    "group", "sector", "region", "government", "military", "official", "force", "party",
    "country", "nation", "state", "organization", "international", "member", "leader",
    "civilian", "rebel", "opposition", "business", "police"
}

# ==========================================
# 3. 헬퍼 함수
# ==========================================
def get_similarity(str1, str2):
    """Return the character-level similarity ratio of two strings.

    Used by the tautology check to detect a reason that nearly copies the
    cause or effect.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        float: ``difflib.SequenceMatcher`` ratio in [0.0, 1.0]; 1.0 means the
        strings are identical (two empty strings also give 1.0).
    """
    # autojunk must be off for text: with the default heuristic, once the second
    # string has 200+ characters every frequently occurring character is ignored
    # as a match anchor, which can drive the ratio of near-identical long texts
    # toward 0 and let copied reasons slip past the tautology threshold.
    return SequenceMatcher(None, str1, str2, autojunk=False).ratio()

def extract_keywords(text):
    """Extract the set of informative lower-case words from a text.

    Words are runs of three or more ASCII letters (after lower-casing) that
    are not listed in ``DOMAIN_STOPWORDS``.

    Args:
        text: Input text. Any non-``str`` value yields an empty set.

    Returns:
        set[str]: The distinct keywords.
    """
    if not isinstance(text, str):
        return set()
    lowered = text.lower()
    return {
        word
        for word in re.findall(r'\b[a-z]{3,}\b', lowered)
        if word not in DOMAIN_STOPWORDS
    }

# ==========================================
# 4. Ultra-Strict 레이블링 로직
# ==========================================
def ultra_strict_labeling(row):
    """Assign an 'O' / 'X' evaluation label to one cause-effect-reason row.

    A label that is already present (not NaN and not the empty string) is
    returned unchanged. Otherwise the row is labelled 'X' unless the reason
    shares enough keywords with the cause and/or the effect.

    Args:
        row: Mapping-like record (for example a DataFrame row) with
            ``cause``, ``effect`` and ``reason`` entries and an optional
            ``evaluation`` entry.

    Returns:
        The existing label, or ``'O'`` / ``'X'`` as decided by the rules below.
    """
    # Keep an evaluation that has already been filled in.
    prior = row.get('evaluation')
    if pd.notna(prior) and prior != '':
        return prior

    # Normalise the three text fields; missing values become "".
    cause, effect, reason = (
        str(row[field]).strip() if pd.notna(row[field]) else ""
        for field in ('cause', 'effect', 'reason')
    )

    # [Step 1] Format check: every field must be non-empty and the reason must
    # contain at least three whitespace-separated words.
    if not (cause and effect and reason) or len(reason.split()) < 3:
        return 'X'

    # [Step 2] Tautology check: a reason whose similarity to the cause or the
    # effect exceeds 0.75 is treated as a near copy.
    if any(get_similarity(reason, source) > 0.75 for source in (cause, effect)):
        return 'X'

    # [Step 3] Keyword overlap between the reason and each side.
    cause_terms = extract_keywords(cause)
    effect_terms = extract_keywords(effect)
    reason_terms = extract_keywords(reason)
    shared_with_cause = reason_terms & cause_terms
    shared_with_effect = reason_terms & effect_terms
    shared_any = shared_with_cause | shared_with_effect

    # [Step 4] Any one of these is sufficient for 'O':
    #   Rule 1 (bridge): at least 2 keywords shared with the cause AND at
    #           least 2 shared with the effect.
    #   Rule 2 (volume): at least 4 distinct shared keywords overall.
    #   Rule 3 (long words): at least 2 shared keywords of 8+ letters.
    bridges_both = len(shared_with_cause) >= 2 and len(shared_with_effect) >= 2
    many_shared = len(shared_any) >= 4
    long_shared = sum(1 for word in shared_any if len(word) >= 8)

    # [Step 5] Everything else falls back to 'X'.
    return 'O' if (bridges_both or many_shared or long_shared >= 2) else 'X'

# ==========================================
# 5. Run and save
# ==========================================
print("Ultra-Strict 레이블링 시작...")

# Apply the labeling logic row by row; the 'evaluation' column is overwritten
# (rows that already had a non-empty label keep it, per ultra_strict_labeling).
csv['evaluation'] = csv.apply(ultra_strict_labeling, axis=1)

# Print label statistics.
# NOTE: the percentages divide by `total`, so an empty DataFrame would raise ZeroDivisionError.
stats = csv['evaluation'].value_counts()
total = len(csv)

print(f"\n=== 결과 통계 (총 {total}개) ===")
print(stats)
print(f"\n'O' 비율: {stats.get('O', 0)/total*100:.1f}%")
print(f"'X' 비율: {stats.get('X', 0)/total*100:.1f}%")

# Spot-check samples (to see whether rows were filtered as intended)
print("\n[Sample 'X' - 설명 부족/동어반복/키워드 불일치]")
print(csv[csv['evaluation'] == 'X'][['cause', 'effect', 'reason']].head(3))

print("\n[Sample 'O' - 양쪽 연결/풍부한 키워드]")
print(csv[csv['evaluation'] == 'O'][['cause', 'effect', 'reason']].head(3))

# Save the labeled table (row index not written)
save_path = '/content/drive/My Drive/STAR_fellowship/stacked_cause_effect_reason_processed1.csv'
csv.to_csv(save_path, index=False)
print(f"\n저장 완료: {save_path}")

Ultra-Strict 레이블링 시작...

=== 결과 통계 (총 1528개) ===
evaluation
O    1413
X     115
Name: count, dtype: int64

'O' 비율: 92.5%
'X' 비율: 7.5%

[Sample 'X' - 설명 부족/동어반복/키워드 불일치]
                                                cause  \
1   United States Offer trade concessions to Unite...   
29  sar Statement of intent regarding trade action...   
31  legislature Offer trade concessions to United ...   

                                               effect  \
1   United States Statement of intent regarding tr...   
29  multinational corporation in United States App...   
31  government Express dissatisfaction with econom...   

                                               reason  
1   United States's offer of trade concessions pro...  
29  US's economic cooperation agreement followed U...  
31  Leg's trade offer probably led to USGOV's diss...  

[Sample 'O' - 양쪽 연결/풍부한 키워드]
                                               cause  \
0  Israel Appeal for economic cooperation to Unit...   
2  busi