# Install Requirements

In [None]:
!pip install selenium
!pip install webdriver-manager


# Code

In [None]:
import time
import json
import urllib.parse
import os

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    NoSuchElementException,
    StaleElementReferenceException,
    TimeoutException,
    InvalidSessionIdException,
    WebDriverException
)


def scroll_to_bottom(driver, pause_sec=1.5, max_scroll=30):
    """Scroll the driver's current browsing context until it stops growing.

    The page is jumped to the bottom of ``document.body``, then the function
    sleeps ``pause_sec`` seconds and re-reads the body height.  It stops as
    soon as the height is unchanged, or once ``max_scroll`` height increases
    have been seen.  At least one scroll is always issued.  Callers in this
    file switch into the ``lawService`` iframe before calling it.
    """
    height_query = "return document.body.scrollHeight"
    jump_to_end = "window.scrollTo(0, document.body.scrollHeight);"

    known_height = driver.execute_script(height_query)
    growths = 0
    while True:
        driver.execute_script(jump_to_end)
        time.sleep(pause_sec)
        current_height = driver.execute_script(height_query)

        grew = current_height != known_height
        known_height = current_height
        if grew:
            growths += 1
        # Stop when nothing new was loaded or the growth budget is used up.
        if not grew or growths >= max_scroll:
            return

def detect_sections(url):
    """Classify every ``div.pgroup`` of the law page as chapter or addendum.

    A pgroup containing ``p.gtit`` is a "chapter"; otherwise one containing
    ``p.pty3`` is a "buchik" (addendum); a pgroup with neither is also
    treated as a "chapter".

    Returns:
        ``(sections, pgroup_count)`` where ``sections`` is a list of
        ``(pgroup_index, kind)`` tuples followed by one end marker
        ``(pgroup_count, "")``.
    """
    driver_path = YOUR_CHROME_DRIVER_PATH
    opts = webdriver.ChromeOptions()
    for flag in ('headless', 'disable-gpu'):
        opts.add_argument(flag)

    browser = webdriver.Chrome(service=Service(driver_path), options=opts)
    found = []
    pgroup_count = 0
    try:
        browser.get(url)
        WebDriverWait(browser, 15).until(EC.presence_of_element_located((By.ID, 'lawService')))
        browser.switch_to.frame("lawService")
        time.sleep(1)

        # Scroll first so the whole document is present before counting.
        scroll_to_bottom(browser, pause_sec=2, max_scroll=30)
        WebDriverWait(browser, 10).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div.pgroup"))
        )
        groups = browser.find_elements(By.CSS_SELECTOR, "div.pgroup")
        pgroup_count = len(groups)

        for position, group in enumerate(groups):
            chapter_heads = group.find_elements(By.CSS_SELECTOR, "p.gtit")
            buchik_heads = group.find_elements(By.CSS_SELECTOR, "p.pty3")
            # A chapter heading wins; "buchik" only applies when it is absent.
            is_buchik = bool(buchik_heads) and not chapter_heads
            found.append((position, "buchik" if is_buchik else "chapter"))
    finally:
        browser.quit()

    # Nothing detected at all => treat the whole document as one chapter.
    sections = found or [(0, "chapter")]

    # Trailing end marker so consumers can read each section's end index.
    sections.append((pgroup_count, ""))
    return sections, pgroup_count


def expand_buchik_if_any(driver, pgroup_div):
    """Expand collapsed addenda inside a pgroup by clicking their toggle links.

    Args:
        driver: Not used by this function; kept so the signature stays
            compatible with existing callers.
        pgroup_div: Element searched for ``p.pty3 a.nun`` links whose
            ``onclick`` attribute contains ``arView(``.

    Expansion is best-effort: a link that cannot be clicked is reported and
    skipped so the remaining links are still tried.
    """
    toggles = pgroup_div.find_elements(By.CSS_SELECTOR, "p.pty3 a.nun[onclick*='arView(']")
    for toggle in toggles:
        try:
            toggle.click()
        except Exception as exc:
            # Still best-effort, but unlike a bare ``except`` this lets
            # KeyboardInterrupt / SystemExit through and says what failed.
            print(f"      -> fail to expand buchik link: {exc}")
            continue
        # Give the page a moment to render the expanded content.
        time.sleep(1)


def close_dom_dialog_if_any(driver):
    """Close the in-page ``.ui-dialog[role='dialog']`` popup if one exists.

    Returns:
        The stripped dialog text when the dialog was found and its close
        link was clicked; ``None`` when no dialog (or no close link) exists
        or when closing failed with a WebDriver error.
    """
    dialog_css = "div.ui-dialog[role='dialog']"
    closer_css = "a[onclick*='TempJoDeleLayer.hiddenTempLsLinkLayer']"
    try:
        popup = driver.find_element(By.CSS_SELECTOR, dialog_css)
        message = popup.text.strip()
        # The click may raise (e.g. element not interactable); handled below.
        popup.find_element(By.CSS_SELECTOR, closer_css).click()
        time.sleep(0.5)
    except NoSuchElementException:
        return None
    except WebDriverException as e:
        # Element not interactable, or any other driver-level failure.
        print(f"      -> fail to close DOM dialog: {e}")
        return None
    return message


class RestartSectionError(Exception):
    """Signal that the current pgroup must be re-crawled in a fresh browser.

    Raised when clicking a link or closing its DOM dialog fails; the crawler
    records the offending link so the retry skips it.
    """


class SectionCrawler:
    """Crawl the articles of one chapter section, one browser per pgroup.

    Every ``div.pgroup`` in ``[start_idx, end_idx)`` is processed in its own
    headless Chrome session.  While walking an article's links, a failed
    click or a DOM dialog that cannot be closed raises
    ``RestartSectionError``: the link is remembered in ``failed_links``, the
    browser is restarted, and the same pgroup is crawled again with that
    link skipped.

    Attributes:
        failed_links: Set of ``(pgroup_idx, article_idx, link_text)`` keys
            that must be skipped on later attempts.
        results: List of article dicts collected so far.
        current_title: Chapter title; defaults to
            ``"<SECTION_TYPE>_<section_num>"`` until a ``p.gtit`` is found.
        driver: The active WebDriver, or ``None`` between sessions.
    """

    # onclick handlers that are never followed.
    _IGNORED_ONCLICK = ("joStmdPop", "fJoHstShow", "arView", "fncArLawPop")
    # onclick handlers that open a new window or an in-page dialog.
    _POPUP_ONCLICK = ("fncLsLawPop", "fncLsPttnLinkPop")

    def __init__(self, url, section_type, section_num, law_name,
                 pgroup_list, start_idx, end_idx, skip_popup_text="조문에서 위임한 사항을 규정한 하위법령이 없습니다"):
        self.url = url
        self.section_type = section_type
        self.section_num = section_num
        self.law_name = law_name
        # Only the length of pgroup_list is used (to clamp end_idx in run()).
        self.pgroup_list = pgroup_list
        self.start_idx = start_idx
        self.end_idx = end_idx
        self.skip_popup_text = skip_popup_text

        self.failed_links = set()  # (pgroup_idx, article_idx, link_text)
        self.results = []
        self.driver = None

        # Placeholder chapter name until the real heading is read.
        self.current_title = f"{section_type.upper()}_{section_num}"

    def run(self):
        """Crawl every pgroup of the section and return the collected articles.

        A new browser is opened for each pgroup (and for each restart of the
        same pgroup) and is always closed again, whatever happens.

        Returns:
            ``self.results``: one dict per processed article.
        """
        pcount = len(self.pgroup_list)
        if self.end_idx > pcount:
            self.end_idx = pcount

        i = self.start_idx
        while i < self.end_idx:
            print(f"   -> pgroup {i} / {self.section_type}:{self.section_num} starting...")
            # Remember where this pgroup's results start so a restart can
            # discard them instead of appending the same articles twice.
            checkpoint = len(self.results)

            try:
                self.driver = self._new_driver()
                scroll_to_bottom(self.driver, pause_sec=2, max_scroll=30)
                all_pgroups = self.driver.find_elements(By.CSS_SELECTOR, "div.pgroup")
                if i >= len(all_pgroups):
                    break

                pg = all_pgroups[i]
                if i == self.start_idx:
                    # A p.gtit in the first pgroup is the chapter title.
                    gtit = pg.find_elements(By.CSS_SELECTOR, "p.gtit")
                    if gtit:
                        self.current_title = gtit[0].text.strip()

                self._process_chapter_pgroup(i, pg)
                i += 1
            except RestartSectionError:
                print("   -> Restarting the same pgroup due to link error.")
                # Articles finished before the failing link will be crawled
                # again on the retry; drop them to avoid duplicates.
                del self.results[checkpoint:]
            except WebDriverException as ex_drv:
                # Driver-level failure: give up on this pgroup and move on.
                print(f"   -> skip pgroup {i} due to WebDriver error: {ex_drv}")
                i += 1
            except Exception as ex:
                print(f"   -> skip pgroup {i} due to error: {ex}")
                i += 1
            finally:
                self._quit_driver()

        return self.results

    @staticmethod
    def _safe_quit(drv):
        """Quit ``drv`` without letting a cleanup failure escape."""
        try:
            drv.quit()
        except Exception as exc:
            print(f"   -> driver quit failed (ignored): {exc}")

    def _quit_driver(self):
        """Close the active browser session, if any, and forget it."""
        drv, self.driver = self.driver, None
        if drv is not None:
            self._safe_quit(drv)

    def _new_driver(self):
        """Start headless Chrome, load ``self.url`` and enter the law iframe.

        Returns:
            A WebDriver already switched into the ``lawService`` frame.

        Raises:
            Whatever the page load or the wait raises (e.g. a timeout); the
            browser is quit first so no Chrome process is left behind.
        """
        chrome_driver_path = YOUR_CHROME_DRIVER_PATH
        options = webdriver.ChromeOptions()
        options.add_argument('headless')
        options.add_argument('disable-gpu')
        drv = webdriver.Chrome(service=Service(chrome_driver_path), options=options)

        try:
            drv.get(self.url)
            WebDriverWait(drv, 15).until(EC.presence_of_element_located((By.ID, 'lawService')))
            drv.switch_to.frame("lawService")
            time.sleep(1)
        except BaseException:
            # The caller never receives this driver, so it must be closed here.
            self._safe_quit(drv)
            raise
        return drv

    def _process_chapter_pgroup(self, pgroup_idx, pgroup_elem):
        """Process every article paragraph (``div.lawcon p.pty1_p4``) in a pgroup."""
        article_elems = pgroup_elem.find_elements(By.CSS_SELECTOR, "div.lawcon p.pty1_p4")
        print(f"     => pgroup {pgroup_idx} article count = {len(article_elems)}")

        for art_j, art in enumerate(article_elems):
            self._process_article(pgroup_idx, art_j, art)

    def _process_article(self, pgroup_idx, art_j, art):
        """Extract one article and append its record to ``self.results``.

        Raises:
            RestartSectionError: Propagated from link handling; nothing is
                appended for this article in that case.
        """
        # An article counts as deleted if it has a "삭제" marker input or its
        # text contains that word.
        del_inputs = art.find_elements(By.XPATH, ".//input[starts-with(@id,'delJo') and @value='삭제']")
        is_deleted = bool(del_inputs) or ("삭제" in art.text)

        # Title: the <label> text if present, else the paragraph text.
        try:
            article_title = art.find_element(By.TAG_NAME, "label").text.strip()
        except NoSuchElementException:
            article_title = art.text.strip()

        if is_deleted:
            full_text = "삭제된 조항"
            related_articles = []
        else:
            # Body text: all known paragraph classes of the enclosing lawcon div.
            lawcon_div = art.find_element(By.XPATH, "./ancestor::div[@class='lawcon']")
            all_ps = lawcon_div.find_elements(
                By.CSS_SELECTOR,
                "p.pty1_p4, p.pty1_de2_1, p.pty1_de2, p.pty1_de2h, p.pty1_p2, p.pty1_p3"
            )
            full_text = "\n".join(p.text.strip() for p in all_ps).strip()
            related_articles = self._collect_related_articles(pgroup_idx, art_j, lawcon_div)

        self.results.append({
            "law_name": self.law_name,
            "chapter": self.current_title,
            "article": article_title,
            "text": full_text,
            "deleted": is_deleted,
            "keywords": [],
            "related_articles": related_articles
        })

    def _collect_related_articles(self, pgroup_idx, art_j, lawcon_div):
        """Follow the popup links of one article and return ``(link_text, popup_text)`` pairs.

        Raises:
            RestartSectionError: When a link fails; the link is recorded in
                ``self.failed_links`` before re-raising.
        """
        related_articles = []
        for link_el in lawcon_div.find_elements(By.CSS_SELECTOR, "a.link"):
            try:
                onclick_attr = link_el.get_attribute("onclick") or ""
                link_text = link_el.text.strip()
            except Exception:
                # The element cannot be read any more; skip just this link.
                continue

            key = (pgroup_idx, art_j, link_text)
            if key in self.failed_links:
                # This link already failed on an earlier attempt.
                continue
            if any(sk in onclick_attr for sk in self._IGNORED_ONCLICK):
                continue
            if not any(op in onclick_attr for op in self._POPUP_ONCLICK):
                continue

            # Opens either a new window or an in-page dialog.
            try:
                popup_res = self._click_popup_link(link_el, link_text)
            except RestartSectionError:
                self.failed_links.add(key)
                raise
            if popup_res:
                related_articles.append((link_text, popup_res))
        return related_articles

    def _click_popup_link(self, link_el, link_text):
        """Click a popup link and return the text it showed.

        Returns:
            The new window's body text or the DOM dialog's text; ``""`` when
            the popup contains ``self.skip_popup_text``, when reading a new
            window failed, or when no dialog was found.

        Raises:
            RestartSectionError: If the click fails or the DOM dialog cannot
                be closed.
        """
        main_window = self.driver.current_window_handle
        before_handles = self.driver.window_handles

        try:
            link_el.click()
            time.sleep(0.7)
        except Exception as e:
            print(f"       -> link click error for '{link_text}': {e}")
            raise RestartSectionError(f"Link click error: {e}") from e

        after_handles = self.driver.window_handles
        if len(after_handles) <= len(before_handles):
            # No new window: the link opened (at most) an in-page dialog.
            dialog_text = self._close_dom_dialog()
            if dialog_text and self.skip_popup_text in dialog_text:
                print("      -> DOM popup says no sub-regulation, skip link.")
                dialog_text = ""
            return dialog_text

        # A new window was opened: switch to it, read it, close it.
        for w in after_handles:
            if w not in before_handles:
                self.driver.switch_to.window(w)
                break
        try:
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )
            body_text = self.driver.find_element(By.TAG_NAME, "body").text.strip()
            if self.skip_popup_text in body_text:
                print("      -> new window popup says no sub-regulation, skip link.")
                body_text = ""
        except Exception as ex_popup:
            print("       -> new window pop error: ", ex_popup)
            body_text = ""
        try:
            self.driver.close()
        except Exception as ex_close:
            # Best-effort: still try to get back to the main window below.
            print(f"       -> popup window close failed (ignored): {ex_close}")
        self.driver.switch_to.window(main_window)
        self.driver.switch_to.frame("lawService")
        time.sleep(0.5)
        return body_text

    def _close_dom_dialog(self):
        """Close the in-page dialog and return its text (``""`` if there is none).

        Raises:
            RestartSectionError: If the dialog exists but closing it fails
                with a WebDriver error.
        """
        try:
            dialog = self.driver.find_element(By.CSS_SELECTOR, "div.ui-dialog[role='dialog']")
            dtext = dialog.text.strip()
            close_btn = dialog.find_element(By.CSS_SELECTOR, "a[onclick*='TempJoDeleLayer.hiddenTempLsLinkLayer']")
            close_btn.click()
            time.sleep(0.5)
            return dtext
        except NoSuchElementException:
            return ""
        except WebDriverException as e:
            print(f"      -> fail to close DOM dialog: {e}")
            raise RestartSectionError("DOM dialog close fail.") from e

def _crawl_buchik_merged(url, pgroup_list, start_idx, end_idx, buchik_index, law_name):
    """Collect one addendum (buchik) section as a single merged record.

    Walks the ``div.pgroup`` elements at indices ``start_idx`` .. ``end_idx-1``
    and joins the non-empty text of every ``p.pty3``, ``p.pty3_dep1`` and
    ``p.pty3_dep2`` paragraph with newlines.  The title is the first
    ``p.pty3`` of the first pgroup when it has text, otherwise
    ``"부칙 <buchik_index>"``.  ``pgroup_list`` is not used here.

    Returns:
        A one-element list holding the record dict, so the shape matches
        what the chapter crawler returns.
    """
    driver_path = YOUR_CHROME_DRIVER_PATH
    opts = webdriver.ChromeOptions()
    for flag in ('headless', 'disable-gpu'):
        opts.add_argument(flag)

    collected = []
    # Fallback title; replaced below if the first pgroup has a p.pty3 heading.
    heading = f"부칙 {buchik_index}"
    browser = None

    try:
        browser = webdriver.Chrome(service=Service(driver_path), options=opts)
        browser.get(url)
        WebDriverWait(browser, 15).until(
            EC.presence_of_element_located((By.ID, 'lawService'))
        )
        browser.switch_to.frame("lawService")
        time.sleep(1)

        scroll_to_bottom(browser, pause_sec=1.5, max_scroll=30)
        groups = browser.find_elements(By.CSS_SELECTOR, "div.pgroup")

        # Never walk past the pgroups that actually exist on the page.
        stop = min(end_idx, len(groups))
        for position in range(start_idx, stop):
            group = groups[position]

            if position == start_idx:
                heads = group.find_elements(By.CSS_SELECTOR, "p.pty3")
                first_head = heads[0].text.strip() if heads else ""
                if first_head:
                    heading = first_head

            paragraphs = group.find_elements(By.CSS_SELECTOR, "p.pty3, p.pty3_dep1, p.pty3_dep2")
            stripped = (paragraph.text.strip() for paragraph in paragraphs)
            collected.extend(text for text in stripped if text)

    finally:
        if browser:
            browser.quit()

    return [{
        "law_name": law_name,
        "chapter": heading,   # the addendum title doubles as the chapter
        "article": heading,   # ... and as the article name
        "text": "\n".join(collected).strip(),
        "deleted": False,
        "keywords": [],
        "related_articles": []
    }]




def single_section_with_retry(url, start_idx, end_idx, section_type, section_num, law_name, max_retry=3):
    """Crawl one section, retrying the whole section on WebDriver failures.

    Each attempt first opens a short-lived browser to list the page's
    ``div.pgroup`` elements, then delegates to ``SectionCrawler`` when
    ``section_type == "chapter"`` and to ``_crawl_buchik_merged`` otherwise.

    Args:
        url: Law page to crawl.
        start_idx, end_idx: Half-open pgroup index range of the section.
        section_type: ``"chapter"`` or anything else (treated as addendum).
        section_num: 1-based number of the section within its type.
        law_name: Name stored in every result record.
        max_retry: Maximum number of attempts.

    Returns:
        The section's records, or ``[]`` when every attempt failed or an
        unexpected (non-WebDriver) error stopped the retries.
    """
    attempts = 0
    final_data = []
    while attempts < max_retry:
        attempts += 1
        print(f"[{section_type}:{section_num}] Attempt {attempts}/{max_retry}")
        try:
            # Probe the page for its pgroup list.  Headless like every other
            # driver in this file, and always quit so a failed load/wait does
            # not leave a Chrome process behind on each retry.
            options = webdriver.ChromeOptions()
            options.add_argument('headless')
            options.add_argument('disable-gpu')
            drv = webdriver.Chrome(service=Service(YOUR_CHROME_DRIVER_PATH), options=options)
            try:
                drv.get(url)
                WebDriverWait(drv, 15).until(EC.presence_of_element_located((By.ID, 'lawService')))
                drv.switch_to.frame("lawService")
                time.sleep(1)
                scroll_to_bottom(drv, pause_sec=2, max_scroll=30)
                # NOTE: these elements belong to the session closed just
                # below; the consumers in this file only take the list's
                # length (SectionCrawler) or ignore it (_crawl_buchik_merged).
                pgroup_list = drv.find_elements(By.CSS_SELECTOR, "div.pgroup")
            finally:
                drv.quit()

            if section_type == "chapter":
                crawler = SectionCrawler(
                    url=url,
                    section_type=section_type,
                    section_num=section_num,
                    law_name=law_name,
                    pgroup_list=pgroup_list,
                    start_idx=start_idx,
                    end_idx=end_idx
                )
                final_data = crawler.run()
            else:
                # Addenda are collected as one merged record.
                final_data = _crawl_buchik_merged(
                    url, pgroup_list, start_idx, end_idx, section_num, law_name
                )
            break
        except InvalidSessionIdException as e:
            # Must precede WebDriverException, of which it is a subclass.
            print(f" -> invalid session on attempt {attempts}, retry: {e}")
            if attempts == max_retry:
                print("   => max retry reached for invalid session id. skip this section.")
        except WebDriverException as e:
            print(f" -> WebDriverException on attempt {attempts}, retry: {e}")
            if attempts == max_retry:
                print("   => max retry reached for WebDriverException. skip this section.")
        except Exception as e:
            # Anything unexpected is not retried.
            print(f" -> unexpected error on attempt {attempts}, stop. {e}")
            break
    return final_data


def crawl_law(url, start_chapter=None, end_chapter=None):
    """Crawl a whole law page and save the results as JSON files.

    The law name is the URL-decoded last path segment of ``url`` (e.g.
    ``"https://www.law.go.kr/법령/소방기본법"`` -> ``"소방기본법"``).  A folder
    of that name is created; each section is written to
    ``chapter_<n>.json`` / ``buchik_<n>.json`` inside it, and everything
    combined to ``final.json``.

    Args:
        url: Law page URL.
        start_chapter: If given, chapters numbered below it are skipped.
        end_chapter: If given, chapters numbered above it are skipped.
            Addenda are always crawled.

    Returns:
        The list of all collected records.

    Raises:
        ValueError: If no law name can be derived from ``url``.
    """
    # Use the last non-empty *path* segment so a trailing "/", a "?query" or
    # a "#fragment" can neither empty out nor leak into the folder name.
    url_path = urllib.parse.urlsplit(url).path
    law_name = urllib.parse.unquote(url_path.rstrip('/').rsplit('/', 1)[-1])
    if not law_name:
        raise ValueError(f"cannot derive a law name from url: {url!r}")

    os.makedirs(law_name, exist_ok=True)

    sections, _ = detect_sections(url)

    all_data = []
    chapter_count = 0
    buchik_count = 0

    # Each section runs from its own start index to the next entry's start;
    # the trailing end marker only supplies the last end index.
    for (start_idx, sec_type), (end_idx, _next_type) in zip(sections, sections[1:]):
        if not sec_type:
            continue

        if sec_type == "chapter":
            chapter_count += 1
            if start_chapter is not None and chapter_count < start_chapter:
                print(f"   -> skip chapter {chapter_count}, < start_chapter")
                continue
            if end_chapter is not None and chapter_count > end_chapter:
                print(f"   -> skip chapter {chapter_count}, > end_chapter")
                continue
            kind, number = "chapter", chapter_count
        else:
            buchik_count += 1
            kind, number = "buchik", buchik_count

        print(f"\n===== {kind.upper()} {number} pgroup[{start_idx}~{end_idx-1}] =====")
        partial = single_section_with_retry(url, start_idx, end_idx, kind, number, law_name)
        all_data.extend(partial)

        fname = os.path.join(law_name, f"{kind}_{number}.json")
        with open(fname, "w", encoding="utf-8") as f:
            json.dump(partial, f, ensure_ascii=False, indent=4)
        print(f" -> {fname} saved, count={len(partial)}")

    # Combined output of every crawled section.
    final_fname = os.path.join(law_name, "final.json")
    with open(final_fname, "w", encoding="utf-8") as f:
        json.dump(all_data, f, ensure_ascii=False, indent=4)
    print(f"\n[{final_fname}] => {len(all_data)} results.")

    return all_data




In [13]:

def main():
    """Crawl each configured law URL and print how many items it yielded."""
    targets = ["https://www.law.go.kr/%EB%B2%95%EB%A0%B9/%EA%B3%B5%EA%B3%B5%EA%B8%B0%EA%B4%80%EC%9D%98%EC%86%8C%EB%B0%A9%EC%95%88%EC%A0%84%EA%B4%80%EB%A6%AC%EC%97%90%EA%B4%80%ED%95%9C%EA%B7%9C%EC%A0%95"]

    # To fetch e.g. only chapter 10, pass start_chapter=10, end_chapter=10.
    for target in targets:
        items = crawl_law(target, start_chapter=None, end_chapter=None)
        print("DONE. items:", len(items))


# Start the crawl only when this code runs as the main module (script or
# notebook cell), not when it is imported.
if __name__ == "__main__":
    main()


===== CHAPTER 1 pgroup[0~0] =====
[chapter:1] Attempt 1/3
   -> pgroup 0 / chapter:1 starting...
     => pgroup 0 article count = 1
 -> 공공기관의소방안전관리에관한규정\chapter_1.json saved, count=1

===== CHAPTER 2 pgroup[1~1] =====
[chapter:2] Attempt 1/3
   -> pgroup 1 / chapter:2 starting...
     => pgroup 1 article count = 1
 -> 공공기관의소방안전관리에관한규정\chapter_2.json saved, count=1

===== CHAPTER 3 pgroup[2~2] =====
[chapter:3] Attempt 1/3
   -> pgroup 2 / chapter:3 starting...
     => pgroup 2 article count = 1
 -> 공공기관의소방안전관리에관한규정\chapter_3.json saved, count=1

===== CHAPTER 4 pgroup[3~3] =====
[chapter:4] Attempt 1/3
   -> pgroup 3 / chapter:4 starting...
     => pgroup 3 article count = 1
 -> 공공기관의소방안전관리에관한규정\chapter_4.json saved, count=1

===== CHAPTER 5 pgroup[4~4] =====
[chapter:5] Attempt 1/3
   -> pgroup 4 / chapter:5 starting...
     => pgroup 4 article count = 1
 -> 공공기관의소방안전관리에관한규정\chapter_5.json saved, count=1

===== CHAPTER 6 pgroup[5~5] =====
[chapter:6] Attempt 1/3
   -> pgroup 5 / chap