In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm


In [2]:
def extract_symptoms(url):
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code != 200:
            return None
        soup = BeautifulSoup(response.text, "html.parser")
        symptom_title = soup.find(lambda tag: tag.name in ['strong', 'h3'] and '증상' in tag.text)
        if not symptom_title:
            return None
        symptom_texts = []
        for sibling in symptom_title.find_next_siblings():
            if sibling.name in ['h3', 'strong'] and '증상' not in sibling.text:
                break
            if sibling.name in ['p', 'ul', 'dd']:
                text = sibling.get_text(separator=" ").strip().replace("\n", " ")
                if text:
                    symptom_texts.append(text)
        return " ".join(symptom_texts).strip() if symptom_texts else None
    except Exception as e:
        print(f"Error processing {url} (증상): {e}")
        return None


In [3]:
def extract_departments(url):
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code != 200:
            return None
        soup = BeautifulSoup(response.text, "html.parser")
        dept_row = soup.find('th', string=lambda x: x and '진료과' in x)
        if not dept_row:
            return None
        td = dept_row.find_next_sibling('td')
        if not td:
            return None
        dept_list = [a.text.strip() for a in td.find_all('a') if a.text.strip()]
        return ", ".join(dept_list) if dept_list else None
    except Exception as e:
        print(f"Error processing {url} (진료과): {e}")
        return None


In [4]:
df = pd.read_csv("naver_health_terms.csv")
df.head()

Unnamed: 0,질환명,링크
0,가드네렐라 바지날리스 [Gardnerella Vaginalis],https://terms.naver.com/entry.naver?docId=6225...
1,가랑이통증 [Perineal pain],https://terms.naver.com/entry.naver?docId=6225...
2,가래 검사 [sputum examination],https://terms.naver.com/entry.naver?docId=2119...
3,"가래톳 [bubo, groin lump]",https://terms.naver.com/entry.naver?docId=9275...
4,가성근시 [pseudomyopia],https://terms.naver.com/entry.naver?docId=9272...


In [5]:
symptoms = []
departments = []

for _, row in tqdm(df.iterrows(), total=len(df)):
    url = row['링크']
    symptoms.append(extract_symptoms(url))
    departments.append(extract_departments(url))

df['증상'] = symptoms
df['진료과'] = departments

df.to_csv("naver_health_v1.csv", index=False, encoding='utf-8-sig')
print("CSV 저장 완료")


100%|██████████| 1762/1762 [16:18<00:00,  1.80it/s]

CSV 저장 완료





In [6]:
df['증상'].isna().sum()

522