# 아산병원 상세페이지 1278건 크롤링
`amc_disease_links.csv`에 담긴 링크를 기반으로
각 질환의 **질환명 / 증상 / 진료과** 정보를 수집하여 `amc_disease_v4.csv`로 저장합니다.

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time

In [None]:
# Load the CSV of detail-page links and keep the URL column as a plain list.
df_links = pd.read_csv("amc_disease_links.csv")
links = df_links.loc[:, '상세페이지링크'].to_list()
print(f"총 링크 수: {len(links)}")

In [None]:
# def get_symptom_text(soup):
#     dts = soup.select('div.contDescription dt')
#     for dt in dts:
#         if "증상" in dt.get_text(strip=True):
#             dd = dt.find_next_sibling("dd")
#             if dd:
#                 ps = dd.find_all("p")
#                 if ps:
#                     return " ".join(p.get_text(strip=True) for p in ps)
#                 else:
#                     return dd.get_text(strip=True)  # <p> 없을 경우 직접 dd 텍스트 추출
#     return ''

In [None]:
def get_symptom_text(soup):
    """Extract the symptom description from a parsed disease detail page.

    Two sources are tried in order:

    1. The article body: for each ``<dt>`` under ``div.contDescription``
       whose text contains "증상", the following ``<dd>`` sibling is read.
       Its non-empty ``<p>`` texts are joined with single spaces; when it
       has no ``<p>`` at all, the ``<dd>`` text itself is used.
    2. The summary box beside the thumbnail: the link texts of the second
       child of the summary ``<dl>``, joined with ", ".

    An entry that yields no text in step 1 counts as "not found", so the
    search moves on (and eventually falls back to step 2) instead of
    returning an empty string early.

    Args:
        soup: Parsed page exposing ``select`` and ``select_one``
            (a ``BeautifulSoup`` object).

    Returns:
        The symptom text, or ``''`` when neither source provides any.
    """
    # First attempt: <dt> -> <dd> pairs inside the article body.
    for dt in soup.select('div.contDescription dt'):
        if "증상" not in dt.get_text(strip=True):
            continue
        dd = dt.find_next_sibling("dd")
        if not dd:
            continue
        paragraphs = dd.find_all("p")
        if paragraphs:
            # Skip empty <p> elements so they add no stray spaces.
            pieces = (p.get_text(strip=True) for p in paragraphs)
            text = " ".join(piece for piece in pieces if piece)
        else:
            text = dd.get_text(strip=True)
        if text:
            return text

    # Second attempt: summary information next to the thumbnail.
    # No try/except here: these are the same kinds of calls as above, and a
    # bare ``except`` would also swallow KeyboardInterrupt/SystemExit.
    dd = soup.select_one(
        "#content > div.healthinfoWrap.clearfix > div.regionReviewLeft > div.otherRegionBox > ul > li > div.contBox > dl > dd:nth-child(2)"
    )
    if dd:
        return ", ".join(a.get_text(strip=True) for a in dd.find_all("a"))

    return ''


In [None]:
def get_dept_text(soup):
    """Return the department names from the summary box, comma-separated.

    Scans every ``<dt>`` under ``div.otherRegionBox``; for the first one
    whose text contains "진료과" and that has a following ``<dd>`` sibling,
    the texts of the ``<a>`` elements inside that ``<dd>`` are joined with
    ", ".

    Args:
        soup: Parsed page exposing ``select`` (a ``BeautifulSoup`` object).

    Returns:
        The joined department names, or ``''`` when no such entry exists.
    """
    for label in soup.select('div.otherRegionBox dt'):
        if "진료과" not in label.get_text(strip=True):
            continue
        value = label.find_next_sibling("dd")
        if not value:
            continue
        names = [anchor.get_text(strip=True) for anchor in value.find_all("a")]
        return ", ".join(names)
    return ''

In [None]:
results = []
headers = {"User-Agent": "Mozilla/5.0"}

# CSS selectors (absolute paths into the detail-page layout).
sel_title = "#content > div.healthinfoWrap.clearfix > div.regionReviewLeft > div.otherRegionBox > ul > li > div.contBox > strong"
# NOTE: sel_symptom and sel_dept are not used by the loop below; symptom and
# department text come from get_symptom_text() / get_dept_text() instead.
sel_symptom = "#content > div.healthinfoWrap.clearfix > div.regionReviewLeft > div.contDescription > dl > dd:nth-child(6)"
sel_dept = "#content > div.healthinfoWrap.clearfix > div.regionReviewLeft > div.otherRegionBox > ul > li > div.contBox > dl > dd:nth-child(6) > a"

# Fetch every detail page and collect one record per successfully parsed page.
for i, url in enumerate(links):
    try:
        res = requests.get(url, headers=headers, timeout=10)
        # Treat HTTP error statuses (4xx/5xx) as failures instead of parsing
        # an error page into a blank record that is reported as done.
        res.raise_for_status()
        soup = BeautifulSoup(res.text, 'html.parser')

        title_tag = soup.select_one(sel_title)

        title = title_tag.get_text(strip=True) if title_tag else ''
        symptom = get_symptom_text(soup)
        dept = get_dept_text(soup)

        results.append({'병명': title, '증상': symptom, '진료과': dept})
        print(f"[{i+1}/{len(links)}] 완료: {title}")

    except Exception as e:
        # Best-effort crawl: report the failing URL and move on to the next.
        print(f"[에러] {url} - {e}")

    # Pause after every request, failed ones included, so that a run of
    # errors does not turn into back-to-back requests against the server.
    time.sleep(1)

In [None]:
# Save the collected records as CSV.
# Explicit columns keep the header row even when `results` is empty; for
# non-empty results the column order is the same as before.
# "utf-8-sig" writes a UTF-8 BOM (presumably so spreadsheet apps detect the
# encoding of the Korean text — confirm with the consumers of this file).
df = pd.DataFrame(results, columns=['병명', '증상', '진료과'])
df.to_csv("amc_disease_v4.csv", index=False, encoding="utf-8-sig")
print("✅ CSV 저장 완료")