### 실제로 사기가 얼마나 발생한가???

-코스피 200 기준

In [2]:
import pandas as pd

file_path = 'data\상장법인목록.csv'
df = pd.read_csv(file_path, encoding='cp949')

df_simplified = df[['회사명', '종목코드']].copy()

df_simplified.columns = ['company_name', 'stock_code']

df_simplified['stock_code'] = df_simplified['stock_code'].astype(str).str.zfill(6)

print(df_simplified.head())

df_simplified.to_csv('data\kospi200_company_code.csv', index=False, encoding='utf-8-sig')


  company_name stock_code
0         한화비전     489790
1    HD현대마린솔루션     443060
2         에이피알     278470
3         엘앤에프     066970
4        포스코DX     022100


  df_simplified.to_csv('data\kospi200_company_code.csv', index=False, encoding='utf-8-sig')


In [3]:
import requests
import zipfile
import io

api_key = "a3e8c0f55e2d783460b8337e65f89c08c66b9bc0"
url = f"https://opendart.fss.or.kr/api/corpCode.xml?crtfc_key={api_key}"

response = requests.get(url)

if response.status_code == 200:
    with zipfile.ZipFile(io.BytesIO(response.content)) as z:
        z.extractall("corpcode_data")
        print("압축 풀림 완료: corpcode_data 폴더 확인")
else:
    print("에러 발생:", response.text)


압축 풀림 완료: corpcode_data 폴더 확인


코스피 200 종류, 기업 고유 코드, 종목코드 연결

In [5]:
import pandas as pd
import xml.etree.ElementTree as ET

# (1) 코스피 상장법인 목록 불러오기
df_kospi = pd.read_csv("data\상장법인목록.csv", encoding="cp949")[["회사명", "종목코드"]]
df_kospi.columns = ["corp_name", "stock_code"]
df_kospi["stock_code"] = df_kospi["stock_code"].astype(str).str.zfill(6)

# (2) CORPCODE.xml 직접 경로에서 파싱
tree = ET.parse("corpcode_data\CORPCODE.xml")
root = tree.getroot()

# (3) corp_code 정보 추출
corp_info = []
for child in root:
    corp_code = child.findtext("corp_code")
    corp_name = child.findtext("corp_name")
    stock_code = child.findtext("stock_code")
    corp_info.append({"corp_name": corp_name, "corp_code": corp_code, "stock_code": stock_code})

df_corp_code = pd.DataFrame(corp_info)

# (4) 코스피 기업과 corp_code 매핑
df_merged = pd.merge(df_kospi, df_corp_code, on="stock_code", how="inner")
df_merged = df_merged[["corp_name_x", "stock_code", "corp_code"]]
df_merged.columns = ["corp_name", "stock_code", "corp_code"]

df_merged = df_merged.rename(columns={"corp_name": "company_name"})

# 확인
df_merged
df_merged.to_csv("kospi200.csv", index=False, encoding="utf-8-sig")

  tree = ET.parse("corpcode_data\CORPCODE.xml")


In [15]:
df_merged[df_merged["corp_name"] == "삼성물산"]

Unnamed: 0,corp_name,stock_code,corp_code
45,삼성물산,28260,149655


In [11]:
import pandas as pd
import requests
import time

# df_merged는 이미 만들어져 있다고 가정

# DART API 키
API_KEY = "a3e8c0f55e2d783460b8337e65f89c08c66b9bc0"

# 키워드 기반 이벤트 탐색 함수
def search_events(corp_code, api_key, keywords=["횡령", "배임"], start_date="20190101", end_date="20240630"):
    url = f"https://opendart.fss.or.kr/api/list.json"
    params = {
        "crtfc_key": api_key,
        "corp_code": corp_code,
        "bgn_de": start_date,
        "end_de": end_date,
        "page_count": 100
    }
    try:
        res = requests.get(url, params=params)
        data = res.json()

        if data["status"] != "000":
            return 0, []

        hit_links = []
        for item in data["list"]:
            title = item.get("report_nm", "")
            if any(kw in title for kw in keywords):
                rcp_no = item.get("rcept_no")
                link = f"https://dart.fss.or.kr/dsaf001/main.do?rcpNo={rcp_no}"
                hit_links.append(link)

        return len(hit_links), hit_links

    except Exception as e:
        return "에러", [str(e)]

# 전체 기업에 대해 반복 수행
results = []
for idx, row in df_merged.iterrows():
    corp_name = row["corp_name"]
    corp_code = row["corp_code"]

    count, links = search_events(corp_code, API_KEY)
    results.append({
        "기업명": corp_name,
        "이벤트 발생 횟수": count,
        "관련 링크": "; ".join(links) if links else "없음"
    })

    time.sleep(0.3)  # DART API 제한 방지

# 결과 데이터프레임
df_result = pd.DataFrame(results)

# 엑셀 또는 CSV 저장 가능
df_result.to_csv("event_result.csv", index=False, encoding="utf-8-sig")
df_result


Unnamed: 0,기업명,이벤트 발생 횟수,관련 링크
0,한화비전,0,없음
1,HD현대마린솔루션,0,없음
2,에이피알,0,없음
3,엘앤에프,0,없음
4,포스코DX,0,없음
...,...,...,...
195,대한전선,0,없음
196,한국앤컴퍼니,2,https://dart.fss.or.kr/dsaf001/main.do?rcpNo=2...
197,대한항공,0,없음
198,유한양행,0,없음


In [23]:
### 페이지 반복 코드

import pandas as pd
import requests
import time

# df_merged는 'corp_name', 'corp_code' 컬럼을 포함한 DataFrame이라고 가정
# 예: df_merged = pd.DataFrame([...])

API_KEY = "a3e8c0f55e2d783460b8337e65f89c08c66b9bc0"

def search_events(corp_code, api_key, keywords=["횡령", "배임"], start_date="20200624", end_date="20250624"):
    url = "https://opendart.fss.or.kr/api/list.json"
    page_no = 1
    hit_links = []

    while True:
        params = {
            "crtfc_key": api_key,
            "corp_code": corp_code,
            "bgn_de": start_date,
            "end_de": end_date,
            "page_no": page_no,
            "page_count": 100
        }

        try:
            res = requests.get(url, params=params, timeout=10)
            data = res.json()

            if data.get("status") != "000" or "list" not in data:
                break  # 오류 or 결과 없음

            for item in data["list"]:
                title = item.get("report_nm", "")
                if any(kw in title for kw in keywords):
                    rcp_no = item.get("rcept_no")
                    link = f"https://dart.fss.or.kr/dsaf001/main.do?rcpNo={rcp_no}"
                    hit_links.append(link)

            # 마지막 페이지 도달
            if len(data["list"]) < 100:
                break

            page_no += 1
            time.sleep(0.3)

        except Exception as e:
            return 0, [f"요청 실패: {str(e)}"]

    return len(hit_links), hit_links

# 실행 예시
results = []
for idx, row in df_merged.iterrows():
    corp_name = row["corp_name"]
    corp_code = row["corp_code"]

    count, links = search_events(corp_code, API_KEY)
    results.append({
        "기업명": corp_name,
        "이벤트 발생 횟수": count,
        "관련 링크": "; ".join(links) if links else "없음"
    })
    time.sleep(0.3)

# 결과 저장
df_result = pd.DataFrame(results)
df_result.to_csv("event_result3.csv", index=False, encoding="utf-8-sig")


In [None]:
import requests
import time

# DART 인증키
API_KEY = "a3e8c0f55e2d783460b8337e65f89c08c66b9bc0"

# 삼성전자 corp_code (예: 00126380)
CORP_CODE = "00126380"

def search_events_single(corp_code, api_key, keywords=["횡령", "배임"], start_date="20190101", end_date="20240630"):
    url = "https://opendart.fss.or.kr/api/list.json"
    page_no = 1
    hit_links = []

    while True:
        params = {
            "crtfc_key": api_key,
            "corp_code": corp_code,
            "bgn_de": start_date,
            "end_de": end_date,
            "page_no": page_no,
            "page_count": 100
        }

        try:
            res = requests.get(url, params=params, timeout=10)
            data = res.json()

            if data.get("status") != "000" or "list" not in data:
                break

            for item in data["list"]:
                title = item.get("report_nm", "")
                if any(kw in title for kw in keywords):
                    rcp_no = item.get("rcept_no")
                    link = f"https://dart.fss.or.kr/dsaf001/main.do?rcpNo={rcp_no}"
                    print(f"[HIT] {title} → {link}")
                    hit_links.append(link)

            if len(data["list"]) < 100:
                break

            page_no += 1
            time.sleep(0.2)

        except Exception as e:
            print(f"에러 발생: {str(e)}")
            break

    print(f"\n총 발견된 보고서 수: {len(hit_links)}")
    return hit_links

# 실행
search_events_single(CORP_CODE, API_KEY)


[HIT] 횡령ㆍ배임사실확인 → https://dart.fss.or.kr/dsaf001/main.do?rcpNo=20210120800650
[HIT] 횡령ㆍ배임사실확인 → https://dart.fss.or.kr/dsaf001/main.do?rcpNo=20190903800354

총 발견된 보고서 수: 2


['https://dart.fss.or.kr/dsaf001/main.do?rcpNo=20210120800650',
 'https://dart.fss.or.kr/dsaf001/main.do?rcpNo=20190903800354']

In [21]:

import requests
import time

# DART 인증키
API_KEY = "a3e8c0f55e2d783460b8337e65f89c08c66b9bc0"

# 삼성물산 corp_code 
CORP_CODE = "00149655"

def search_events_single(corp_code, api_key, keywords=["횡령", "배임"], start_date="20200101", end_date="20250630"): # end_date 설정
    url = "https://opendart.fss.or.kr/api/list.json"
    page_no = 1
    hit_links = []

    while True:
        params = {
            "crtfc_key": api_key,
            "corp_code": corp_code,
            "bgn_de": start_date,
            "end_de": end_date,
            "page_no": page_no,
            "page_count": 100
        }

        try:
            res = requests.get(url, params=params, timeout=10)
            data = res.json()

            if data.get("status") != "000" or "list" not in data:
                break

            for item in data["list"]:
                title = item.get("report_nm", "")
                if any(kw in title for kw in keywords):
                    rcp_no = item.get("rcept_no")
                    link = f"https://dart.fss.or.kr/dsaf001/main.do?rcpNo={rcp_no}"
                    print(f"[HIT] {title} → {link}")
                    hit_links.append(link)

            if len(data["list"]) < 100:
                break

            page_no += 1
            time.sleep(0.2)

        except Exception as e:
            print(f"에러 발생: {str(e)}")
            break

    print(f"\n총 발견된 보고서 수: {len(hit_links)}")
    return hit_links

# 실행
search_events_single(CORP_CODE, API_KEY)

[HIT] [기재정정]횡령ㆍ배임사실확인               → https://dart.fss.or.kr/dsaf001/main.do?rcpNo=20250207801436
[HIT] 횡령ㆍ배임사실확인               → https://dart.fss.or.kr/dsaf001/main.do?rcpNo=20250203801077
[HIT] [기재정정]횡령ㆍ배임사실확인               → https://dart.fss.or.kr/dsaf001/main.do?rcpNo=20240208801439
[HIT] 횡령ㆍ배임사실확인               → https://dart.fss.or.kr/dsaf001/main.do?rcpNo=20240205801064
[HIT] 횡령ㆍ배임혐의발생 → https://dart.fss.or.kr/dsaf001/main.do?rcpNo=20200904800573

총 발견된 보고서 수: 5


['https://dart.fss.or.kr/dsaf001/main.do?rcpNo=20250207801436',
 'https://dart.fss.or.kr/dsaf001/main.do?rcpNo=20250203801077',
 'https://dart.fss.or.kr/dsaf001/main.do?rcpNo=20240208801439',
 'https://dart.fss.or.kr/dsaf001/main.do?rcpNo=20240205801064',
 'https://dart.fss.or.kr/dsaf001/main.do?rcpNo=20200904800573']

In [11]:
import pandas as pd

df = pd.read_csv("data\event_result_data\event_result3.csv")

total_event =df["이벤트 발생 횟수"].sum()

print(total_event)

72


  df = pd.read_csv("data\event_result_data\event_result3.csv")


In [23]:
df = pd.read_csv("merged_event.csv")
df

Unnamed: 0.1,Unnamed: 0,company_name,이벤트 발생 횟수,source,stock_code,corp_code
0,0,한화비전,0,없음,489790,1867758
1,1,HD현대마린솔루션,0,없음,443060,1194689
2,2,에이피알,0,없음,278470,1190568
3,3,엘앤에프,0,없음,66970,398701
4,4,포스코DX,0,없음,22100,155212
...,...,...,...,...,...,...
195,195,대한전선,0,없음,1440,113207
196,196,한국앤컴퍼니,6,https://dart.fss.or.kr/dsaf001/main.do?rcpNo=2...,240,160047
197,197,대한항공,1,https://dart.fss.or.kr/dsaf001/main.do?rcpNo=2...,3490,113526
198,198,유한양행,0,없음,100,145109


In [26]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time

# 테스트용 데이터프레임 생성 (원래는 event_result3.csv 일부 샘플을 불러오는 방식)
df = pd.read_csv("merged_event.csv").head(50)  # 상위 3개만 테스트

# 테스트용 함수 정의
def get_titles_from_source_column(links):
    if links == "없음" or pd.isna(links):
        return "없음"

    urls = links.split("; ")
    titles = []

    for url in urls:
        try:
            headers = {
                "User-Agent": "Mozilla/5.0"
            }
            response = requests.get(url, headers=headers, timeout=5)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, "html.parser")
            title_tag = soup.find("head").find("title")
            title = title_tag.text.strip() if title_tag else "제목 없음"
            titles.append(title)
            time.sleep(0.3)  # 요청 간 딜레이
        except Exception as e:
            titles.append("오류")

    return "; ".join(titles)

# title 열 추가 (테스트용)
df["title"] = df["source"].apply(get_titles_from_source_column)

# 결과 확인
print(df[["company_name", "source", "title"]])


   company_name                                             source  \
0          한화비전                                                 없음   
1     HD현대마린솔루션                                                 없음   
2          에이피알                                                 없음   
3          엘앤에프                                                 없음   
4         포스코DX                                                 없음   
5        에코프로머티                                                 없음   
6        두산로보틱스                                                 없음   
7     이수스페셜티케미컬                                                 없음   
8           OCI                                                 없음   
9      LG에너지솔루션                                                 없음   
10        SK스퀘어                                                 없음   
11        카카오페이                                                 없음   
12      HD현대중공업                                                 없음   
13         크래프톤     

In [35]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time

# CSV 파일 로드
df = pd.read_csv("merged_event.csv")

# 제목을 저장할 리스트
titles = []

# 각 source에서 제목 추출
for idx, row in df.iterrows():
    source = row['source']
    if source == '없음' or pd.isna(source):
        titles.append("없음")
    else:
        try:
            urls = [url.strip() for url in source.split(';') if url.strip()]
            title_list = []
            for url in urls:
                res = requests.get(url, timeout=10)
                soup = BeautifulSoup(res.text, 'html.parser')
                title_tag = soup.find('head').find('title')
                title_text = title_tag.text.strip() if title_tag else "제목 없음"
                title_list.append(title_text)
                time.sleep(0.5)  # 과도한 요청 방지
            titles.append(" | ".join(title_list))
        except Exception as e:
            titles.append(f"에러: {str(e)}")

# 새로운 열 추가
df['title'] = titles
df = df.rename(columns={"이벤트 발생 횟수": "event_type"})
df = df.drop(columns="Unnamed: 0")
df = df.rename(columns={
    "corp_code": "company_corp_id"
})

upload_df = df[["company_corp_id", "event_type", "source", "title"]]

# 결과 저장
upload_df.to_csv("merged_event_with_title.csv", index=False, encoding="utf-8-sig")


In [40]:
import pandas as pd
import re



# 정규표현식으로 날짜 추출 후 datetime 형식으로 변환
date_pattern = r"(\d{4}\.\d{2}\.\d{2})$"

df["published_date"] = df["title"].str.extract(date_pattern)[0]  # 문자열 추출
df["published_date"] = pd.to_datetime(df["published_date"], format="%Y.%m.%d", errors='coerce')  # datetime 변환

# title에서 날짜 제거
df["title"] = df["title"].str.replace(date_pattern, "", regex=True).str.strip(" /")  # 뒤 공백/슬래시 제거

# 저장
df.to_csv("merged_event_updated.csv", index=False, encoding="utf-8-sig")

# 확인
print(df[["title", "published_date"]].head())


  title published_date
0    없음            NaT
1    없음            NaT
2    없음            NaT
3    없음            NaT
4    없음            NaT


In [44]:
df = pd.read_csv("merged_event_with_title.csv")
print(df.columns)

Index(['company_corp_id', 'event_type', 'source', 'title'], dtype='object')


In [46]:
import pandas as pd
import re

df = pd.read_csv("merged_event_with_title.csv")

def extract_dates(title):
    if pd.isna(title) or title == "없음":
        return ""
    dates = re.findall(r"\d{4}\.\d{2}\.\d{2}", title)
    return " | ".join(dates) if dates else ""

df["published_date"] = df["title"].apply(extract_dates)

# 저장
df.to_csv("merged_event_with_date.csv", index=False, encoding="utf-8-sig")


In [49]:
df = pd.read_csv("merged_event_with_title.csv")
df

Unnamed: 0,company_corp_id,event_type,source,title
0,1867758,0,없음,없음
1,1194689,0,없음,없음
2,1190568,0,없음,없음
3,398701,0,없음,없음
4,155212,0,없음,없음
...,...,...,...,...
195,113207,0,없음,없음
196,160047,6,https://dart.fss.or.kr/dsaf001/main.do?rcpNo=2...,한국앤컴퍼니/횡령ㆍ배임사실확인/2025.05.30 | 한국앤컴퍼니/횡령ㆍ배임사실확인...
197,113526,1,https://dart.fss.or.kr/dsaf001/main.do?rcpNo=2...,대한항공/횡령ㆍ배임사실확인/2020.11.24
198,145109,0,없음,없음


In [50]:
df["company_corp_id"] = df["company_corp_id"].astype(str).str.zfill(8)

# 확인
print(df["company_corp_id"].head())

# 저장
df.to_csv("merged_event_with_date_fixed.csv", index=False, encoding="utf-8-sig")

0    01867758
1    01194689
2    01190568
3    00398701
4    00155212
Name: company_corp_id, dtype: object


In [53]:
df = pd.read_csv("merged_event_with_date.csv")
df

Unnamed: 0,company_corp_id,event_type,source,title,published_date
0,1867758,0,없음,없음,
1,1194689,0,없음,없음,
2,1190568,0,없음,없음,
3,398701,0,없음,없음,
4,155212,0,없음,없음,
...,...,...,...,...,...
195,113207,0,없음,없음,
196,160047,6,https://dart.fss.or.kr/dsaf001/main.do?rcpNo=2...,한국앤컴퍼니/횡령ㆍ배임사실확인/2025.05.30 | 한국앤컴퍼니/횡령ㆍ배임사실확인...,2025.05.30 | 2025.05.30 | 2023.03.31 | 2023.03...
197,113526,1,https://dart.fss.or.kr/dsaf001/main.do?rcpNo=2...,대한항공/횡령ㆍ배임사실확인/2020.11.24,2020.11.24
198,145109,0,없음,없음,


In [54]:
import pandas as pd

# CSV 불러오기
df = pd.read_csv("merged_event_with_date.csv")

# 1. event_type > 0 필터링
df = df[df['event_type'] > 0].copy()

# 2. 각 행에서 source, title, published_date 분리 후 새 행으로 확장
rows = []
for _, row in df.iterrows():
    sources = str(row['source']).split(" | ")
    titles = str(row['title']).split(" | ")
    dates = str(row['published_date']).split(" | ")
    
    max_len = max(len(sources), len(titles), len(dates))
    
    for i in range(max_len):
        rows.append({
            'company_corp_id': str(row['company_corp_id']).zfill(8),
            'event_type': row['event_type'],
            'source': sources[i] if i < len(sources) else None,
            'title': titles[i] if i < len(titles) else None,
            'published_date': dates[i] if i < len(dates) else None
        })

# 새로운 DataFrame 생성
df_exploded = pd.DataFrame(rows)

# 확인
print(df_exploded.head())

# 저장
df_exploded.to_csv("exploded_event_data.csv", index=False, encoding="utf-8-sig")


  company_corp_id  event_type  \
0        01350869           2   
1        01350869           2   
2        01032486           1   
3        00877059           3   
4        00877059           3   

                                              source  \
0  https://dart.fss.or.kr/dsaf001/main.do?rcpNo=2...   
1                                               None   
2  https://dart.fss.or.kr/dsaf001/main.do?rcpNo=2...   
3  https://dart.fss.or.kr/dsaf001/main.do?rcpNo=2...   
4                                               None   

                           title published_date  
0    우리금융지주/횡령ㆍ배임혐의발생/2025.01.31     2025.01.31  
1    우리금융지주/횡령ㆍ배임혐의발생/2024.11.22     2024.11.22  
2      두산밥캣/횡령ㆍ배임혐의발생/2024.07.26     2024.07.26  
3  삼성바이오로직스/횡령ㆍ배임혐의발생/2024.02.20     2024.02.20  
4  삼성바이오로직스/횡령ㆍ배임사실확인/2024.02.14     2024.02.14  


In [58]:
import pandas as pd

# CSV 불러오기
df = pd.read_csv("merged_event_with_date.csv")

# 1. event_type > 0 필터링
df = df[df['event_type'] > 0].copy()

# 2. 행 분해
new_rows = []

for _, row in df.iterrows():
    corp_id = str(row['company_corp_id']).zfill(8)
    event_type = row['event_type']

    sources = str(row['source']).split(" | ") if pd.notna(row['source']) else []
    titles = str(row['title']).split(" | ") if pd.notna(row['title']) else []
    dates = str(row['published_date']).split(" | ") if pd.notna(row['published_date']) else []

    max_len = max(len(sources), len(titles), len(dates))

    for i in range(max_len):
        new_rows.append({
            'company_corp_id': corp_id,
            'event_type': event_type,
            'source': sources[i] if i < len(sources) else None,
            'title': titles[i] if i < len(titles) else None,
            'published_date': dates[i] if i < len(dates) else None
        })

# 3. 새 데이터프레임 만들기
df_expanded = pd.DataFrame(new_rows)
df_expanded = df_expanded.drop(columns="source")
df_expanded = df_expanded.drop(columns="event_type")
df_expanded = df_expanded.drop(columns="title")

# 4. 저장
df_expanded.to_csv("exploded_event_data.csv", index=False, encoding="utf-8-sig")


In [61]:
df = pd.read_csv("merged_event_with_title.csv")
df

Unnamed: 0,company_corp_id,event_type,source,title
0,1867758,0,없음,없음
1,1194689,0,없음,없음
2,1190568,0,없음,없음
3,398701,0,없음,없음
4,155212,0,없음,없음
...,...,...,...,...
195,113207,0,없음,없음
196,160047,6,https://dart.fss.or.kr/dsaf001/main.do?rcpNo=2...,한국앤컴퍼니/횡령ㆍ배임사실확인/2025.05.30 | 한국앤컴퍼니/횡령ㆍ배임사실확인...
197,113526,1,https://dart.fss.or.kr/dsaf001/main.do?rcpNo=2...,대한항공/횡령ㆍ배임사실확인/2020.11.24
198,145109,0,없음,없음


In [1]:
import pandas as pd

# CSV 파일 불러오기
df = pd.read_csv("merged_event_with_title.csv")

# 여러 링크와 제목이 있는 행을 분리하여 새로운 행 생성
expanded_rows = []

for _, row in df.iterrows():
    if row['source'] == '없음':
        expanded_rows.append(row)
    else:
        sources = str(row['source']).split(';')
        titles = str(row['title']).split('|')
        
        # 빈칸 제거
        sources = [s.strip() for s in sources if s.strip()]
        titles = [t.strip() for t in titles if t.strip()]
        
        # 길이 맞추기
        length = min(len(sources), len(titles))

        for i in range(length):
            new_row = row.copy()
            new_row['source'] = sources[i]
            new_row['title'] = titles[i]
            expanded_rows.append(new_row)

# 새로운 데이터프레임으로 재구성
df_expanded = pd.DataFrame(expanded_rows)

# 결과 확인 또는 저장
df_expanded.to_csv("merged_event_expanded.csv", index=False, encoding="utf-8-sig")
print("완료: merged_event_expanded.csv 저장됨.")


완료: merged_event_expanded.csv 저장됨.


In [1]:
import pandas as pd
import re

# Read the CSV file
df = pd.read_csv('merged_event_expanded_with_date.csv')

# Filter rows where event_type >= 1
df_filtered = df[df['event_type'] >= 1].copy()

# Function to remove date from title
def remove_date_from_title(title):
    if pd.isna(title) or title == '없음':
        return title
    
    # Remove date pattern (YYYY.MM.DD) from the end of title
    # Pattern matches: /YYYY.MM.DD at the end of the string
    cleaned_title = re.sub(r'/\d{4}\.\d{2}\.\d{2}$', '', title)
    
    # Also remove any trailing slashes or spaces
    cleaned_title = cleaned_title.rstrip('/ ').rstrip()
    
    return cleaned_title

# Apply the function to clean titles
df_filtered['title'] = df_filtered['title'].apply(remove_date_from_title)

# Save the cleaned data
df_filtered.to_csv('merged_event_final.csv', index=False, encoding='utf-8-sig')

print(f"Original data: {len(df)} rows")
print(f"Filtered data (event_type >= 1): {len(df_filtered)} rows")
print("Cleaned data saved to 'merged_event_final.csv'")

# Show some examples of cleaned titles
print("\nExamples of cleaned titles:")
for i, row in df_filtered.head(10).iterrows():
    if row['title'] != '없음':
        print(f"  {row['title']}")

Original data: 246 rows
Filtered data (event_type >= 1): 72 rows
Cleaned data saved to 'merged_event_final.csv'

Examples of cleaned titles:
  우리금융지주/횡령ㆍ배임혐의발생
  우리금융지주/횡령ㆍ배임혐의발생
  두산밥캣/횡령ㆍ배임혐의발생
  삼성바이오로직스/횡령ㆍ배임혐의발생
  삼성바이오로직스/횡령ㆍ배임사실확인
  삼성바이오로직스/횡령ㆍ배임혐의발생
  삼성물산/횡령ㆍ배임사실확인
  삼성물산/횡령ㆍ배임사실확인
  삼성물산/횡령ㆍ배임사실확인
  삼성물산/횡령ㆍ배임사실확인


In [7]:
df=pd.read_csv('merged_event_final.csv')
df=df.drop(columns='event_type')
df.to_csv('merged_event_final.csv',index=False)

In [9]:

# Convert company_corp_id to string and pad with leading zeros to make it 8 digits
df['company_corp_id'] = df['company_corp_id'].astype(str).str.zfill(8)
df = df.rename(columns={'published_date':'date'})

# Save the updated data
df.to_csv('merged_event_final.csv', index=False, encoding='utf-8-sig')

print("Updated company_corp_id to 8 digits with leading zeros")
print("First few rows:")
print(df[['company_corp_id', 'title']].head(10))

Updated company_corp_id to 8 digits with leading zeros
First few rows:
  company_corp_id               title
0        01350869    우리금융지주/횡령ㆍ배임혐의발생
1        01350869    우리금융지주/횡령ㆍ배임혐의발생
2        01032486      두산밥캣/횡령ㆍ배임혐의발생
3        00877059  삼성바이오로직스/횡령ㆍ배임혐의발생
4        00877059  삼성바이오로직스/횡령ㆍ배임사실확인
5        00877059  삼성바이오로직스/횡령ㆍ배임혐의발생
6        00149655      삼성물산/횡령ㆍ배임사실확인
7        00149655      삼성물산/횡령ㆍ배임사실확인
8        00149655      삼성물산/횡령ㆍ배임사실확인
9        00149655      삼성물산/횡령ㆍ배임사실확인


In [34]:
from supabase import create_client, Client
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Supabase 설정
SUPABASE_URL = "https://ijlsernhoziwwszzjwvd.supabase.co"
SUPABASE_KEY = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6ImlqbHNlcm5ob3ppd3dzenpqd3ZkIiwicm9sZSI6ImFub24iLCJpYXQiOjE3NTA3NDU5NzEsImV4cCI6MjA2NjMyMTk3MX0.V3gsXfjnhtEomSN0Rg_WZRKBq0vrDs62ulI5JQqk-lI"

supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)

# 1. 데이터 조회 (event_id와 source 컬럼)
try:
    response = supabase.table("events").select("event_id, source").execute()

    rows = response.data
    print(rows)
    if not rows:
        print("연결 성공했으나, 조회된 데이터가 없습니다.")
    else:
        print("Supabase에서 데이터 불러오기 성공")
except Exception as e:
    print(" Supabase에서 데이터 조회 실패:", e)
    rows = []

# 2. 본문 크롤링 함수
def extract_body_text(url):
    try:
        if not url.startswith("http"):
            return "invalid_url"
        headers = {"User-Agent": "Mozilla/5.0"}
        res = requests.get(url, headers=headers, timeout=5)
        soup = BeautifulSoup(res.content, "html.parser")
        body = soup.find("body")
        return body.get_text(strip=True) if body else "본문 없음"
    except Exception as e:
        return f"error: {e}"

# 3. DataFrame 만들고 본문 수집
if rows:
    df = pd.DataFrame(rows)
    df["body_text"] = df["source"].apply(extract_body_text)
    print(df.head())
    # df.to_csv("event_contents.csv", index=False)  # 저장 옵션
else:
    print("DataFrame 생성 실패: 불러온 row가 없음")


[{'event_id': '07b9e31e-e80d-485a-88f0-b8261679b2ed', 'source': 'https://dart.fss.or.kr/dsaf001/main.do?rcpNo=20250131800004'}, {'event_id': '616b5e7b-073c-469a-8bb1-fa9b63c44388', 'source': 'https://dart.fss.or.kr/dsaf001/main.do?rcpNo=20241122800595'}, {'event_id': '0a9aea87-fab8-443f-b984-18470329c31d', 'source': 'https://dart.fss.or.kr/dsaf001/main.do?rcpNo=20240726800623'}, {'event_id': 'b48891cd-10f0-458b-90e2-a953037d890f', 'source': 'https://dart.fss.or.kr/dsaf001/main.do?rcpNo=20240220800753'}, {'event_id': '546b4048-2891-4689-b453-1f76219ad4f2', 'source': 'https://dart.fss.or.kr/dsaf001/main.do?rcpNo=20240214800847'}, {'event_id': '06d06aac-53df-47f1-830f-29b39c739224', 'source': 'https://dart.fss.or.kr/dsaf001/main.do?rcpNo=20201015800469'}, {'event_id': 'dcc4bb28-c6cd-4473-b5fa-d8160ff9b93a', 'source': 'https://dart.fss.or.kr/dsaf001/main.do?rcpNo=20250207801436'}, {'event_id': '66c62be5-c103-4110-a231-5ccb39f3f6fa', 'source': 'https://dart.fss.or.kr/dsaf001/main.do?rcpNo=2