In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive/ so the CSV paths used further down
# (which all live under '/content/drive/My Drive/') can be opened by path.
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
import pandas as pd
import re
import ipywidgets as widgets
from IPython.display import display, clear_output

In [3]:
# Load the stacked cause/effect/reason table from the mounted Drive folder.
# NOTE: the name `csv` is reused by every later cell, and those cells
# overwrite its text columns in place.
csv = pd.read_csv('/content/drive/My Drive/STAR_fellowship/stacked_cause_effect_reason.csv')

In [4]:
# Preview the first rows of the table as loaded (before any code replacement).
csv.head()

Unnamed: 0,idx,cause,effect,reason,evaluation
0,0,Israel Appeal for economic cooperation to Unit...,United States Offer trade concessions to Unite...,ISR's appeal for economic cooperation likely p...,O
1,1,United States Offer trade concessions to Unite...,United States Statement of intent regarding tr...,USA's offer of trade concessions probably led ...,X
2,2,business sector Statement of intent regarding ...,health sector multinational corporation in Uni...,BUS's trade statement of intent likely influen...,O
3,3,"Palestine, State of medical sector Offer trade...",United States Statement of intent regarding tr...,PSEMED's trade concessions offer probably trig...,O
4,4,United States Statement of intent regarding tr...,United States medical sector Statement of inte...,USA's trade statement of intent likely prompte...,O


In [5]:
!pip install pycountry



In [6]:
import pandas as pd
import pycountry

# ==============================
# 1) Split a string into 3-character chunks
# ==============================
def split_triplets(s):
    """Upper-case ``s`` and cut it into consecutive 3-character pieces.

    Missing values (anything ``pd.isna`` reports as NA) give an empty list.
    Any other value is converted with ``str`` first.  A trailing remainder of
    one or two characters is discarded.
    """
    if pd.isna(s):
        return []
    text = str(s).upper()
    whole_chunks = len(text) // 3  # only complete triplets are kept
    return [text[3 * k:3 * k + 3] for k in range(whole_chunks)]


# ==============================
# 2) ISO and GDELT region code tables
# ==============================
# Every alpha-3 country code exposed by pycountry, kept as a set so that
# membership checks are cheap.
ISO_ALPHA3_SET = set(country.alpha_3 for country in pycountry.countries)

# Region codes that are accepted alongside ISO country codes, mapped to the
# label that is substituted into the text.
# NOTE(review): SAF is labelled "South Africa region" here; region code lists
# for CAMEO/GDELT appear to read SAF as "Southern Africa" -- confirm against
# the codebook before relying on this label.
GDELT_REGION_CODES = dict(
    AFR="Africa",
    ASA="Asia",
    EUR="Europe",
    MEA="Middle East & North Africa",
    XAF="Sub-Saharan Africa",
    XAS="East & Southeast Asia",
    XME="Middle East",
    XEU="European region",
    SAF="South Africa region",
    WAF="West Africa",
    EAF="East Africa",
)


# ==============================
# 3) Base dictionaries: identity / organisation / role
# ==============================
# Each table maps a 3-letter code to the English label used when an actor
# code is spelled out.  Labels are lower-cased by the functions that use them.

# Religious-identity codes.
identity_map = dict(
    CHR="Christian",
    MOS="Muslim",
    JEW="Jewish",
    BUD="Buddhist",
    HIN="Hindu",
    PRO="Protestant",
)

# Organisation-type codes.
# NOTE(review): the CAMEO actor codebook appears to define IMG as
# "International Militarized Group" rather than a migrant group -- confirm.
org_map = dict(
    NGO="Non-Governmental Organization",
    IGO="Inter-Governmental Organization",
    MNC="Multinational Corporation",
    IMG="Migrant Group",
    UNO="United Nations Organization",
    WBK="World Bank Group",
    ADB="Asian Development Bank",
    DEV="Development Organization",
)

# Role / sector codes.
# NOTE(review): the CAMEO actor codebook appears to define MED as "Media"
# rather than a medical sector -- confirm before changing, since the loaded
# data already contains the phrase "medical sector".
role_map = dict(
    GOV="Government",
    MIL="Military",
    BUS="Business Sector",
    CVL="Civilian Group",
    HLH="Health Sector",
    EDU="Education Sector",
    JUD="Judiciary",
    REL="Religious Organization",
    LAB="Labor Union",
    COP="Police",
    OPP="Opposition Group",
    REF="Refugee Group",
    AGR="Agricultural Sector",
    MED="Medical Sector",
    PTY="Political Party",
    HRI="Human Rights Organization",
    ELI="Elite",
    CRM="Criminal Organization",
    UAF="Armed Forces",
    LEG="Legislature",
    RAD="Radical Group",
    REB="Rebel Group",
    SET="Settler Group",
    SPY="Intelligence Agency",
)


# ==============================
# 4) Country code extraction
# ==============================
def extract_country_codes(tokens):
    """Return the set of tokens that are country or region codes.

    A token is kept when it is in ``ISO_ALPHA3_SET`` or is a key of
    ``GDELT_REGION_CODES``, and is not also a key of ``identity_map``,
    ``role_map`` or ``org_map``.
    """
    non_country = set(identity_map) | set(role_map) | set(org_map)
    return {
        token
        for token in tokens
        if (token in ISO_ALPHA3_SET or token in GDELT_REGION_CODES)
        and token not in non_country
    }


# ==============================
# 5) Natural-language builder
# ==============================
def build_phrase(identity, roles, org, country):
    """Combine decoded actor parts into one phrase, or return None.

    Precedence is identity, then organisation, then country:

    * identity present: "<identity> <roles> in <country>" when roles and
      country are both given, "<identity> group in <country>" with a country
      only, otherwise "<identity> group".  ``org`` is ignored.
    * organisation present: "<roles> <org> in <country>" when roles and
      country are both given, "<org> in <country>" with a country only,
      otherwise just the organisation (roles are dropped without a country).
    * country only: "<country> <roles>" when roles are given, else the
      country itself.

    Identity, roles and organisation are lower-cased; the country is used as
    passed.  None is returned when nothing but ``roles`` (or nothing at all)
    is supplied.
    """
    if identity:
        identity_label = identity.lower()
        if not country:
            return f"{identity_label} group"
        if roles:
            return f"{identity_label} {roles.lower()} in {country}"
        return f"{identity_label} group in {country}"

    if org:
        org_label = org.lower()
        if not country:
            return org_label
        if roles:
            return f"{roles.lower()} {org_label} in {country}"
        return f"{org_label} in {country}"

    if country:
        return f"{country} {roles.lower()}" if roles else country

    return None


# ==============================
# 6) parse_actor()
# ==============================
def parse_actor(code):
    """Spell out a concatenated actor code (e.g. ``USAGOV``) as an English phrase.

    The code is stripped and upper-cased, then resolved as follows:

    1. If the whole code is a key of ``role_map``, ``identity_map``,
       ``org_map`` or ``country_map`` (checked in that order) the matching
       label is returned -- lower-cased for roles/identities/organisations
       (identities get a " group" suffix), unchanged for countries.
    2. Otherwise the code is read as a compound: an optional identity prefix,
       then an optional organisation prefix, then an optional country/region
       prefix, followed by any number of role suffixes.  The recognised parts
       are passed to ``build_phrase``.

    The code is only treated as decoded when *every* character was consumed.
    If anything is left over (for example ``USAID`` -> ``USA`` + ``ID``) the
    input is returned lower-cased, exactly like any other unparseable input.
    Decoding on a partial match would silently drop the unrecognised part and
    would also rewrite ordinary capitalised words that merely start with a
    known code.

    ``country_map`` is a module-level name that must have been assigned
    before this function is called.

    Parameters
    ----------
    code : str
        Actor code; surrounding whitespace and letter case are ignored for
        matching.

    Returns
    -------
    str
        The decoded phrase, or ``code.lower()`` (the untouched input,
        lower-cased) when the code cannot be fully decoded.
    """
    original = code
    code = code.strip().upper()

    # ---- whole-code matches: role, identity, organisation, country/region ----
    if code in role_map:
        return role_map[code].lower()
    if code in identity_map:
        return identity_map[code].lower() + " group"
    if code in org_map:
        return org_map[code].lower()
    if code in country_map:
        return country_map[code]

    # ---- compound code ----
    identity = None
    org = None
    country = None
    roles = []

    # Prefixes are tried longest-first so a longer key wins over a shorter
    # key that is its prefix.

    # identity prefix
    for p in sorted(identity_map, key=lambda x: -len(x)):
        if code.startswith(p):
            identity = identity_map[p]
            code = code[len(p):]
            break

    # organisation prefix
    for p in sorted(org_map, key=lambda x: -len(x)):
        if code.startswith(p):
            org = org_map[p]
            code = code[len(p):]
            break

    # country / region prefix
    for c in sorted(country_map, key=lambda x: -len(x)):
        if code.startswith(c):
            country = country_map[c]
            code = code[len(c):]
            break

    # Role suffixes: keep peeling from the right until a full pass finds none.
    role_codes = sorted(role_map, key=lambda x: -len(x))
    changed = True
    while changed:
        changed = False
        for r in role_codes:
            if code.endswith(r):
                # Suffixes are found right-to-left; inserting at the front
                # keeps the roles in the order they appear in the code.
                roles.insert(0, role_map[r])
                code = code[:-len(r)]
                changed = True

    # Unconsumed characters mean this was not a (fully) known actor code.
    if code:
        return original.lower()

    # Assemble the natural-language phrase.
    roles_final = ", ".join(roles) if roles else None
    phrase = build_phrase(identity, roles_final, org, country)

    if phrase:
        return phrase

    # fallback
    return original.lower()

In [7]:
import re

# =========================================================
# 1. Collect country codes from the text of the dataframe (csv)
#    and build country_map from them
# =========================================================
print("Extracting country codes from dataframe text...")

# Columns whose text is scanned
target_cols = ['cause', 'effect', 'reason']

# Collect candidate 3-letter codes from every all-caps token.
# Actor codes are concatenations of 3-letter blocks (e.g. PSEMED), so a country
# code may occur only *inside* a longer token.  Tokens whose length is a
# multiple of three are therefore cut into triplets with split_triplets();
# a plain 3-letter token simply yields itself.
all_tokens = set()
for col in target_cols:
    if col in csv.columns:
        for text in csv[col].dropna().astype(str):
            for word in re.findall(r'\b[A-Z]{3,}\b', text):
                if len(word) % 3 == 0:
                    all_tokens.update(split_triplets(word))

# Keep only ISO / region codes (identity, org and role codes are removed)
valid_country_codes = extract_country_codes(all_tokens)

# Rebuild country_map; sorted() keeps the insertion order reproducible
country_map = {}
for c in sorted(valid_country_codes):
    obj = pycountry.countries.get(alpha_3=c)
    if obj:
        country_map[c] = obj.name
# Region codes are always available, whether or not they occur in the text
country_map.update(GDELT_REGION_CODES)

print(f"Country Map Built: {len(country_map)} entries found.")


def replace_gdelt_codes(text):
    """Replace decodable all-caps codes in ``text`` with their English phrase.

    Missing values (per ``pd.isna``) are returned unchanged.  Anything else is
    converted with ``str`` and every standalone run of three or more capital
    letters (e.g. ISR, USAGOV) is passed to ``parse_actor``.  ``parse_actor``
    returns the lower-cased input when it cannot decode a code, so a result
    equal to the lower-cased match is treated as "not decoded" and the
    original token is left in place.
    """
    if pd.isna(text):
        return text

    def _decode(found):
        token = found.group()
        phrase = parse_actor(token)
        # Same as the lower-cased token -> decoding failed, keep the token.
        return token if phrase == token.lower() else phrase

    return re.sub(r'\b[A-Z]{3,}\b', _decode, str(text))


Extracting country codes from dataframe text...
Country Map Built: 89 entries found.


In [8]:

print("Replacing acronyms in text columns...")

# Rewrite each target column that exists in the frame, decoding actor codes
# value by value.  This overwrites the columns of `csv` in place.
for col in target_cols:
    if col not in csv.columns:
        continue
    csv[col] = csv[col].apply(replace_gdelt_codes)


Replacing acronyms in text columns...


In [9]:
# Preview the first rows again, now that the text columns have been rewritten.
csv.head()

Unnamed: 0,idx,cause,effect,reason,evaluation
0,0,Israel Appeal for economic cooperation to Unit...,United States Offer trade concessions to Unite...,Israel's appeal for economic cooperation likel...,O
1,1,United States Offer trade concessions to Unite...,United States Statement of intent regarding tr...,United States's offer of trade concessions pro...,X
2,2,business sector Statement of intent regarding ...,health sector multinational corporation in Uni...,business sector's trade statement of intent li...,O
3,3,"Palestine, State of medical sector Offer trade...",United States Statement of intent regarding tr...,"Palestine, State of medical sector's trade con...",O
4,4,United States Statement of intent regarding tr...,United States medical sector Statement of inte...,United States's trade statement of intent like...,O


In [10]:

# Save the file: write the processed table next to the input on Drive.
# index=False leaves the DataFrame index out of the written CSV.
save_path = '/content/drive/My Drive/STAR_fellowship/stacked_cause_effect_reason_preprocessed.csv'
csv.to_csv(save_path, index=False)