In [1]:
import hashlib
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

# Register tqdm with pandas so that Series/DataFrame objects gain the
# progress_apply() method used below when fetching report texts.
tqdm.pandas()

In [2]:
def get_report_text(url):
    """Fetch a report page and return the plain text of its embedded document.

    The page at ``url`` is expected to embed the actual report in an
    ``<iframe>``; the iframe's ``src`` is resolved against
    ``https://dart.fss.or.kr`` and downloaded in a second request.

    Args:
        url: Address of the report page. Anything that is not a non-empty
            string (e.g. ``None`` or a NaN from a missing CSV cell) is
            treated as "no report".

    Returns:
        str: The visible text of the iframe document, with fragments joined
        by single spaces. An empty string is returned when the URL is
        unusable, the page has no iframe (or the iframe has no ``src``),
        either request fails or answers with an HTTP error status, or
        parsing fails. This function never raises.
    """
    # Reject missing/blank URLs up front instead of letting requests fail.
    if not isinstance(url, str) or not url.strip():
        return ''

    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        res = requests.get(url, headers=headers, timeout=10)
        # Do not mistake an HTTP error page for report content.
        res.raise_for_status()
        soup = BeautifulSoup(res.content, 'html.parser')

        # The report body lives in the document referenced by the iframe.
        iframe = soup.find('iframe')
        src = iframe.get('src') if iframe else None
        if not src:
            return ''
        # urljoin gives the same result as plain concatenation for
        # root-relative paths, and also copes with absolute or
        # non-root-relative src values.
        iframe_url = urljoin("https://dart.fss.or.kr", src)

        res = requests.get(iframe_url, headers=headers, timeout=10)
        res.raise_for_status()
        iframe_soup = BeautifulSoup(res.content, 'html.parser')

        # Extract the body text.
        return iframe_soup.get_text(separator=' ', strip=True)
    except Exception:
        # Deliberate best-effort: a single unreachable or malformed report
        # must not abort the whole scraping pass; callers treat '' as
        # "no text available".
        return ''


In [7]:
def group_events_by_similarity(texts, threshold=0.75, max_df=0.8):
    """Greedily cluster texts whose TF-IDF cosine similarity reaches a threshold.

    Texts are scanned in order. The first text that has no group yet becomes
    the seed of a new group, and every later, still ungrouped text whose
    similarity to that seed is ``>= threshold`` joins it. A text that already
    belongs to a group is never reassigned.

    Args:
        texts: Sequence of document strings (must support ``len``).
        threshold: Minimum cosine similarity to the seed for joining a group.
        max_df: ``max_df`` passed to ``TfidfVectorizer`` to drop terms that
            occur in too many documents. It is only applied when there are
            more than two texts, and is relaxed to ``1.0`` if it would leave
            no vocabulary at all.

    Returns:
        list[int]: One group id per text, in input order. Ids start at 0 and
        increase in order of each group's first member. Empty input yields
        ``[]``; a single text yields ``[0]``; if no vocabulary can be built
        (e.g. every text is blank) each text gets its own group.
    """
    n = len(texts)
    if n == 0:
        return []
    if n == 1:
        return [0]

    # stop_words stays None: the original note records that passing
    # stop_words='korean' raised an error.
    #
    # With exactly two documents a fractional max_df below 1 discards every
    # term the two share, so their similarity could never exceed 0. Pruning
    # is therefore skipped for such tiny inputs.
    max_df_candidates = [max_df, 1.0] if n > 2 else [1.0]

    tfidf = None
    for candidate in max_df_candidates:
        try:
            tfidf = TfidfVectorizer(stop_words=None, max_df=candidate).fit_transform(texts)
            break
        except ValueError:
            # Raised when no terms survive (e.g. identical documents pruned
            # by max_df, or only blank documents); retry without pruning.
            continue
    if tfidf is None:
        # No usable vocabulary at all: nothing can be called similar.
        return list(range(n))

    sim_matrix = cosine_similarity(tfidf)

    groups = [-1] * n
    group_id = 0
    for i in range(n):
        if groups[i] != -1:
            continue
        groups[i] = group_id
        for j in range(i + 1, n):
            # Only claim texts that are still ungrouped; otherwise a later
            # seed would steal members from an earlier group.
            if groups[j] == -1 and sim_matrix[i, j] >= threshold:
                groups[j] = group_id
        group_id += 1
    return groups

In [9]:
# Load the merged event table.
df = pd.read_csv('data/merged_event_final.csv')

# Download the report text for every row from the URL in its 'source' column.
df['text'] = df['source'].progress_apply(get_report_text)

# Assign an event id to each row, grouping the reports of one company at a time.
df['event_id'] = None

corp_ids = df['company_corp_id'].unique()
for corp_id in tqdm(corp_ids, desc="기업별 사건 그룹핑"):
    row_labels = df.index[df['company_corp_id'] == corp_id]

    # A company with a single report trivially has a single event.
    if len(row_labels) == 1:
        df.loc[row_labels, 'event_id'] = f"{corp_id}_evt_0"
        continue

    texts = df.loc[row_labels, 'text'].fillna('').tolist()

    # Nothing to compare when every text is blank: tag all rows with a
    # temporary placeholder event id and move on.
    if not any(t.strip() for t in texts):
        df.loc[row_labels, 'event_id'] = f"{corp_id}_evt_999"
        continue

    groups = group_events_by_similarity(texts, threshold=0.75)

    # groups is aligned with row_labels, so the ids can be written in one go.
    df.loc[row_labels, 'event_id'] = [f"{corp_id}_evt_{g}" for g in groups]

100%|██████████| 72/72 [00:04<00:00, 16.47it/s]
기업별 사건 그룹핑: 100%|██████████| 26/26 [00:00<00:00, 1368.40it/s]


In [10]:
# Persist the per-report classification.
df.to_csv('data/classified_events.csv', index=False)

# Per-company summary: keep one row per distinct (company, event) pair,
# then count how many such rows each company has.
event_counts = df.loc[:, ['company_corp_id', 'event_id']].drop_duplicates()
summary = (
    event_counts['company_corp_id']
    .value_counts()
    .reset_index()
)
summary.columns = ['company_corp_id', 'unique_event_count']
summary.to_csv('company_event_summary.csv', index=False)

print("✅ 분류 완료! 'classified_events.csv', 'company_event_summary.csv' 생성됨.")

✅ 분류 완료! 'classified_events.csv', 'company_event_summary.csv' 생성됨.
