In [None]:
pip install openai requests pillow selenium


In [None]:
import time
import json
import urllib.parse
import os
import requests
import base64
import re
from PIL import Image
from io import BytesIO

# Process-wide memo of image results: keyed by the image `src` URL, valued by
# the text returned from parse_image_with_gpt (an empty string means the image
# was skipped or could not be converted).
image_cache: dict[str, str] = {}


from openai import OpenAI
# Shared OpenAI client used by parse_image_with_gpt. The key is read from the
# OPENAI_API_KEY environment variable; the literal fallback is only a
# placeholder string.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY", "YOUR_OPENAI_API_KEY")
 )

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    NoSuchElementException,
    TimeoutException,
    InvalidSessionIdException,
    WebDriverException,
    StaleElementReferenceException
)


def sanitize_filename(name: str) -> str:
    """Return *name* with file-name-unsafe characters replaced by underscores.

    The characters replaced are the backslash, slash, asterisk, question
    mark, colon, double quote, angle brackets and the pipe.
    """
    unsafe_chars = re.compile(r'[\\/*?:"<>|]')
    return unsafe_chars.sub('_', name)

def scroll_to_bottom(driver, pause_sec=1.5, max_scroll=30):
    """Scroll the window to the bottom until the page stops growing.

    Args:
        driver: Object exposing ``execute_script`` (a Selenium WebDriver).
        pause_sec: Seconds to wait after each scroll before re-measuring.
        max_scroll: Stop after this many scrolls that changed the page height.
    """
    read_height = "return document.body.scrollHeight"
    previous_height = driver.execute_script(read_height)
    growth_rounds = 0
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause_sec)
        current_height = driver.execute_script(read_height)
        height_changed = current_height != previous_height
        previous_height = current_height
        growth_rounds += 1
        # An unchanged height means nothing more was loaded; otherwise stop
        # once the scroll budget is used up.
        if not height_changed or growth_rounds >= max_scroll:
            break

def scroll_inside_div(driver, div_selector="#contentBody", pause_sec=1.5, max_scroll=30):
    """Scroll a scrollable element to its end until its content stops growing.

    Args:
        driver: Selenium WebDriver.
        div_selector: CSS selector of the element to scroll.
        pause_sec: Seconds to wait after each scroll before re-measuring.
        max_scroll: Stop after this many scrolls that changed the height.

    Prints a message and returns without scrolling when the element is absent.
    """
    try:
        container = driver.find_element(By.CSS_SELECTOR, div_selector)
    except NoSuchElementException:
        print(f"[scroll_inside_div] {div_selector} not found.")
        return

    read_height = "return arguments[0].scrollHeight;"
    previous_height = driver.execute_script(read_height, container)
    growth_rounds = 0

    while True:
        driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight;", container)
        time.sleep(pause_sec)

        current_height = driver.execute_script(read_height, container)
        height_changed = current_height != previous_height
        previous_height = current_height
        growth_rounds += 1
        # Stop when the container no longer grows or the budget is exhausted.
        if not height_changed or growth_rounds >= max_scroll:
            break

def get_page_title_via_h2(url) -> str:
    """Return the stripped text of ``div#conTop h2`` on *url*, or ``""``.

    A dedicated headless Chrome session is opened for the lookup and is
    always shut down again, including when navigation itself fails.

    Args:
        url: Page to open.

    Returns:
        The heading text, or an empty string when the heading is missing or
        reading it raised an error.

    Raises:
        Whatever ``driver.get`` raises; navigation errors are not swallowed.
    """
    chrome_driver_path = "YOUR_CHROMEDRIVER_PATH"
    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    options.add_argument('disable-gpu')

    driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)
    title_text = ""
    # The outer try/finally covers navigation as well, so a failing
    # driver.get() no longer leaves a Chrome process behind.
    try:
        driver.get(url)
        time.sleep(1)

        try:
            h2_el = driver.find_element(By.CSS_SELECTOR, "div#conTop h2")
            title_text = h2_el.text.strip()
        except NoSuchElementException:
            # No heading on this page: the caller falls back to another name.
            pass
        except Exception as e:
            print(f"[get_page_title_via_h2] Exception occurred: {e}")
    finally:
        driver.quit()

    return title_text

def _smart_scroll(driver, sel="#conScroll", pause=1.0, max_round=20):
    """Scroll the element matching *sel* to its end until its height settles.

    The element is looked up after a 1 s wait and, if missing, once more
    after a further 2 s; when it is still missing the function gives up.
    Scrolling stops after two consecutive rounds without a height change or
    after *max_round* rounds, whichever comes first.

    Args:
        driver: Selenium WebDriver.
        sel: CSS selector of the scrollable element.
        pause: Seconds to wait after each scroll.
        max_round: Maximum number of scroll rounds.
    """
    time.sleep(1)
    try:
        box = driver.find_element(By.CSS_SELECTOR, sel)
    except NoSuchElementException:
        print(f"[WARN] {_smart_scroll.__name__}: {sel} not found, retrying in 2s...")
        time.sleep(2)
        try:
            box = driver.find_element(By.CSS_SELECTOR, sel)
        except NoSuchElementException:
            print(f"[FAIL] {_smart_scroll.__name__}: {sel} still not found after retry")
            return

    read_height = "return arguments[0].scrollHeight"
    settled_height = driver.execute_script(read_height, box)
    unchanged_rounds = 0

    for _ in range(max_round):
        driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", box)
        time.sleep(pause)
        height_now = driver.execute_script(read_height, box)
        if height_now != settled_height:
            # Content grew: remember the new height and restart the count.
            settled_height = height_now
            unchanged_rounds = 0
            continue
        unchanged_rounds += 1
        if unchanged_rounds >= 2:
            break

def _looks_like_revision(seg: str) -> bool:
    seg = seg.strip()
    if seg.startswith("(") and seg.endswith(")"):
        seg = seg[1:-1]
    return bool(seg) and re.fullmatch(r"[0-9,\- ]+", seg) is not None

def extract_law_name_from_url(url: str) -> str:
    """Derive a file-system-safe name from the path of *url*.

    Path segments are percent-decoded and scanned from the end; the first one
    that ``_looks_like_revision`` rejects is used. If every segment is
    rejected the last segment is used, and a URL without path segments yields
    ``"law"``. The chosen segment is passed through ``sanitize_filename``.
    """
    raw_path = urllib.parse.urlparse(url).path
    segments = [urllib.parse.unquote(piece) for piece in raw_path.split("/") if piece]
    if not segments:
        return "law"

    chosen = next(
        (segment for segment in reversed(segments) if not _looks_like_revision(segment)),
        segments[-1],
    )
    return sanitize_filename(chosen)

def parse_image_with_gpt(image_url: str) -> str:
    """Download an image and ask the OpenAI chat model to transcribe it.

    URLs containing ``button`` (case-insensitive) are skipped. GIF images are
    re-encoded as JPEG before upload; everything else is sent as downloaded.

    Args:
        image_url: Absolute URL of the image.

    Returns:
        The model's text output, stripped, or ``""`` when the image was
        skipped, the download/conversion failed, or the model returned no
        text. This function does not raise; all errors are printed.
    """
    if "button" in image_url.lower():
        print(f"[parse_image_with_gpt] 'button' in URL => Skip: {image_url}")
        return ""

    try:
        # NOTE(review): verify=False disables TLS certificate validation for
        # the download. It is kept because removing it could break hosts with
        # incomplete certificate chains - confirm whether it is still needed.
        response = requests.get(image_url, timeout=10, verify=False)
        response.raise_for_status()
        img_data = response.content

        content_type = response.headers.get("Content-Type", "").lower()

        if "gif" in content_type or image_url.lower().endswith('.gif'):
            try:
                pil_img = Image.open(BytesIO(img_data)).convert("RGB")
                buffer = BytesIO()
                pil_img.save(buffer, format="JPEG")
                payload = buffer.getvalue()
                print(f"[parse_image_with_gpt] GIF->JPG Conversion Complete: {image_url}")
            except Exception as e:
                print(f"[parse_image_with_gpt] GIF Conversion Failed, skip: {e}")
                return ""
            mime_subtype = "jpeg"
        else:
            payload = img_data
            # The registered media type for JPEG is image/jpeg; "image/jpg"
            # is not a registered subtype, so it is no longer emitted.
            mime_subtype = "png" if "png" in content_type else "jpeg"

        encoded = base64.b64encode(payload).decode("utf-8")

        messages = [
            {
              "role": "system",
              "content": "You are an expert OCR and table-to-text conversion agent. Your job is to convert any image—especially tables or diagrams—into clean, plain text. You must answer in Korean. Do not output any explanation, disclaimer, or commentary. Just the converted text, in Korean."
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "The image below may be a table or a diagram. Please convert it into text form. Do not say anything else—only output the converted text. You must answer in Korean."
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/{mime_subtype};base64,{encoded}"
                        },
                    },
                ]
            }
        ]

        completion = client.chat.completions.create(
            model="gpt-4.1-mini",
            messages=messages
        )
        # The message content may be None; treat that as "no text" rather
        # than relying on an AttributeError being caught below.
        answer = completion.choices[0].message.content
        return answer.strip() if answer else ""

    except Exception as e:
        print(f"[parse_image_with_gpt] Image Processing Error Occurred: {e}")
        return ""
    
from selenium.common.exceptions import NoSuchFrameException

def switch_into_content_frame(driver, wait_sec: int = 10) -> bool:
    """Switch *driver* into the iframe that holds the page content.

    Waits up to *wait_sec* seconds for either an element with id
    ``contentBody`` or any ``iframe`` to appear. The known frame ids are tried
    in order; if none exists the first iframe on the page is used.

    Returns:
        ``True`` when a frame switch happened, ``False`` on timeout or when
        the page has no iframe.
    """
    def _content_or_iframe_present(d):
        return (d.find_elements(By.ID, "contentBody")
                or d.find_elements(By.TAG_NAME, "iframe"))

    try:
        WebDriverWait(driver, wait_sec).until(_content_or_iframe_present)
    except TimeoutException:
        return False

    iframes = driver.find_elements(By.TAG_NAME, "iframe")
    if not iframes:
        return False

    for known_id in ("lawService", "admRulService", "viewer", "viewFrame"):
        try:
            driver.switch_to.frame(known_id)
        except NoSuchFrameException:
            continue
        return True

    # None of the known ids matched: fall back to the first iframe found.
    driver.switch_to.frame(iframes[0])
    return True

def detect_sections(url):
    """Return ``(sections, pgroup_count)`` for *url*.

    ``detect_sections_code1`` is tried first. If it raises, or yields fewer
    than two entries, the result of ``detect_sections_code2`` is returned
    instead.
    """
    try:
        primary_sections, primary_count = detect_sections_code1(url)
        if not (primary_sections and len(primary_sections) >= 2):
            raise ValueError("code1 detect_sections results are not normal.")
    except Exception as e:
        print(f"[detect_sections] Code 1 Failed. Fallback to Code 2: {e}")
    else:
        print("[detect_sections] Sections are divided by Code 1.")
        return primary_sections, primary_count

    fallback_sections, fallback_count = detect_sections_code2(url)
    return fallback_sections, fallback_count

def detect_sections_code1(url):
    """Classify every ``div.pgroup`` on *url*, waiting for them to appear.

    A group is tagged ``"buchik"`` when it contains ``p.pty3`` but no
    ``p.gtit``; every other group is tagged ``"chapter"``.

    Returns:
        ``(sections, pgroup_count)`` where *sections* is a list of
        ``(pgroup_index, kind)`` tuples terminated by the sentinel
        ``(pgroup_count, "")``. When nothing was classified a single
        ``(0, "chapter")`` entry with a count of 1 is used.

    Raises:
        TimeoutException: when no ``div.pgroup`` shows up within 10 seconds.
    """
    chrome_driver_path = "YOUR_CHROMEDRIVER_PATH"
    options = webdriver.ChromeOptions()
    for flag in ('headless', 'disable-gpu'):
        options.add_argument(flag)

    driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)
    sections = []
    pgroup_count = 0
    try:
        driver.get(url)
        switch_into_content_frame(driver)
        time.sleep(1)
        _smart_scroll(driver, "#conScroll")
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div.pgroup"))
        )
        groups = driver.find_elements(By.CSS_SELECTOR, "div.pgroup")
        pgroup_count = len(groups)
        for position, group in enumerate(groups):
            chapter_marks = group.find_elements(By.CSS_SELECTOR, "p.gtit")
            buchik_marks = group.find_elements(By.CSS_SELECTOR, "p.pty3")
            kind = "buchik" if (buchik_marks and not chapter_marks) else "chapter"
            sections.append((position, kind))
    finally:
        driver.quit()

    if not sections:
        sections, pgroup_count = [(0, "chapter")], 1

    # Sentinel entry: gives the last real section an end index.
    sections.append((pgroup_count, ""))
    return sections, pgroup_count

def detect_sections_code2(url):
    """Classify every ``div.pgroup`` on *url* without waiting for them.

    Unlike ``detect_sections_code1`` there is no explicit wait for the groups;
    a page without any group is reported as a single ``"chapter"`` section.

    Returns:
        ``(sections, pgroup_count)`` where *sections* is a list of
        ``(pgroup_index, kind)`` tuples terminated by the sentinel
        ``(pgroup_count, "")``.
    """
    chrome_driver_path = "YOUR_CHROMEDRIVER_PATH"
    options = webdriver.ChromeOptions()
    for flag in ("headless", "disable-gpu"):
        options.add_argument(flag)

    driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)
    sections = []
    pgroup_count = 0
    try:
        driver.get(url)
        time.sleep(1)
        switch_into_content_frame(driver)
        time.sleep(1)

        _smart_scroll(driver, "#conScroll")

        groups = driver.find_elements(By.CSS_SELECTOR, "div.pgroup")
        pgroup_count = len(groups)
        print(f"[detect_sections_code2] found pgroup_count={pgroup_count}")

        if pgroup_count == 0:
            sections, pgroup_count = [(0, "chapter")], 1
        else:
            for position, group in enumerate(groups):
                chapter_marks = group.find_elements(By.CSS_SELECTOR, "p.gtit")
                buchik_marks = group.find_elements(By.CSS_SELECTOR, "p.pty3")
                kind = "buchik" if (buchik_marks and not chapter_marks) else "chapter"
                sections.append((position, kind))
    finally:
        driver.quit()

    # Sentinel entry: gives the last real section an end index.
    sections.append((pgroup_count, ""))
    return sections, pgroup_count

from selenium.common.exceptions import NoSuchFrameException

class SectionCrawler:
    """Crawl the articles of one section, one ``div.pgroup`` at a time.

    A fresh headless Chrome session is started for every pgroup index in
    ``[start_idx, end_idx)``. Each ``p.pty1_p4`` element found in the pgroup
    becomes one result dict with the keys ``law_name``, ``chapter``,
    ``article``, ``text``, ``deleted``, ``keywords`` and ``related_articles``.

    If the text collected for a single article reaches 90 % of ``page_len``,
    the results are replaced by that one entry and crawling stops (the page
    is treated as a single document).

    Args:
        url: Page to crawl.
        section_type: Section label; used for logging and the default title.
        section_num: Running number of the section.
        law_name: Name stored in every result.
        pgroup_list: Previously discovered pgroups; only its length is used.
        start_idx: First pgroup index (inclusive).
        end_idx: Last pgroup index (exclusive); clamped to ``len(pgroup_list)``.
        page_len: Length of the whole page text. ``None`` or ``0`` disables
            the single-document shortcut.
        skip_popup_text: Popup text that marks a popup as empty.
    """

    # Consecutive WebDriverException retries allowed for one pgroup before it
    # is skipped (previously the same pgroup was retried without limit).
    MAX_DRIVER_RETRIES = 3
    # Retries allowed for one link that keeps going stale.
    MAX_STALE_RETRIES = 3

    def __init__(self, url, section_type, section_num, law_name,
                 pgroup_list, start_idx, end_idx, page_len,
                 skip_popup_text="There is no subordinate legislation stipulating the matters delegated by the statute."):
        self.url = url
        self.section_type = section_type
        self.section_num = section_num
        self.law_name = law_name
        self.pgroup_list = pgroup_list
        self.start_idx = start_idx
        self.end_idx = end_idx
        self.skip_popup_text = skip_popup_text

        # Shared module-level cache: image src URL -> converted text.
        self.image_cache = image_cache

        self.page_len = page_len
        # A falsy page_len gives threshold 0, which disables the shortcut
        # instead of raising on None or matching every article on 0.
        self.page_len_threshold = int(page_len * 0.90) if page_len else 0
        self._single_saved = False

        self.driver = None
        self.failed_links = set()
        self.visited_links = set()
        self.results = []
        self.current_title = f"{section_type.upper()}_{section_num}"

    def run(self):
        """Process every pgroup of the section and return the result list."""
        pcount = len(self.pgroup_list)
        if self.end_idx > pcount:
            self.end_idx = pcount

        i = self.start_idx
        driver_failures = 0
        while i < self.end_idx and not self._single_saved:
            print(f"   -> pgroup {i} / {self.section_type}:{self.section_num} starting...")

            try:
                self.driver = self._new_driver()
                _smart_scroll(self.driver, "#conScroll")

                all_pgroups = self.driver.find_elements(By.CSS_SELECTOR, "div.pgroup")
                if i >= len(all_pgroups):
                    break

                pg = all_pgroups[i]

                if i == self.start_idx:
                    gtit = pg.find_elements(By.CSS_SELECTOR, "p.gtit")
                    if gtit:
                        self.current_title = gtit[0].text.strip()

                self._process_chapter_pgroup(i, pg)

            except WebDriverException as ex_drv:
                driver_failures += 1
                if driver_failures < self.MAX_DRIVER_RETRIES:
                    print(f"   -> WebDriverException occurred but pgroup {i} still continue: {ex_drv}")
                    # Retry the same pgroup with a fresh driver.
                    continue
                print(f"   -> skip pgroup {i} after {driver_failures} WebDriverException(s): {ex_drv}")

            except Exception as ex:
                print(f"   -> skip pgroup {i} due to error: {ex}")

            finally:
                # Runs on success, break, continue and error alike.
                self._quit_driver()

            i += 1
            driver_failures = 0

        return self.results

    def _quit_driver(self):
        """Shut down the current driver, if any, ignoring shutdown errors."""
        driver = getattr(self, "driver", None)
        self.driver = None
        if driver is None:
            return
        try:
            driver.quit()
        except Exception as exc:
            # Best effort: the session may already be dead.
            print(f"   -> driver.quit() failed (ignored): {exc}")

    def _new_driver(self):
        """Start headless Chrome, open ``self.url`` and enter the content frame.

        The driver is shut down again if any step after its creation fails,
        so a failed page load does not leak a browser process.
        """
        chrome_driver_path = "YOUR_CHROMEDRIVER_PATH"
        options = webdriver.ChromeOptions()
        options.add_argument("headless")
        options.add_argument("disable-gpu")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument("--disable-software-rasterizer")
        options.page_load_strategy = 'eager'
        drv = webdriver.Chrome(service=Service(chrome_driver_path), options=options)
        try:
            drv.set_page_load_timeout(120)
            drv.get(self.url)
            time.sleep(1)

            switch_into_content_frame(drv)
            time.sleep(1)

            _smart_scroll(drv, "#conScroll")
        except Exception:
            try:
                drv.quit()
            except Exception:
                # The original error is what matters; it is re-raised below.
                pass
            raise

        return drv

    def _process_chapter_pgroup(self, pgroup_idx, pgroup_elem):
        """Process every ``p.pty1_p4`` article element inside one pgroup."""
        try:
            WebDriverWait(self.driver, 5).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, "p.pty1_p4"))
            )
        except TimeoutException:
            print(f"     => no p.pty1_p4 in pgroup {pgroup_idx}")

        article_elems = pgroup_elem.find_elements(By.CSS_SELECTOR, "p.pty1_p4")
        print(f"     => pgroup {pgroup_idx} article count = {len(article_elems)}")

        for art_j, art in enumerate(article_elems):
            if self._single_saved:
                break
            self._process_article(pgroup_idx, art_j, art)

    def _process_article(self, pgroup_idx, art_j, art):
        """Build the result entry for one article element and store it.

        The text is taken from the enclosing ``div.pgroup``. For articles not
        flagged as deleted, image transcriptions are appended and the texts
        of ``a.link`` elements are recorded as related articles with a
        placeholder body (link bodies are not fetched here).
        """
        del_inputs = art.find_elements(By.XPATH, ".//input[starts-with(@id,'delJo') and @value='삭제']")
        is_deleted = (len(del_inputs) > 0) or ("삭제" in art.text)

        try:
            label_elem = art.find_element(By.TAG_NAME, "label")
            article_title = label_elem.text.strip()
        except NoSuchElementException:
            article_title = art.text.strip()

        pgroup_elem_parent = art.find_element(By.XPATH, "./ancestor::div[@class='pgroup']")

        if is_deleted:
            full_text = pgroup_elem_parent.text.strip()
            related_articles = []
        else:
            all_ps = pgroup_elem_parent.find_elements(
                By.CSS_SELECTOR,
                "p.pty1_p4, p.pty1_de2_1, p.pty1_de2, p.pty1_de2h, p.pty1_p2, p.pty1_p3, p.pty1_de3"
            )
            full_text = "\n".join(p.text.strip() for p in all_ps).strip()

            for img in pgroup_elem_parent.find_elements(By.TAG_NAME, "img"):
                src = (img.get_attribute("src") or "").strip()
                if not src or src.lower().endswith(".gif"):
                    continue
                if src not in self.image_cache:
                    self.image_cache[src] = parse_image_with_gpt(src)
                if self.image_cache[src]:
                    full_text += self.image_cache[src] + "\n-----------------------------\n"

            link_els = pgroup_elem_parent.find_elements(By.CSS_SELECTOR, "a.link")
            link_texts = [txt for txt in (el.text.strip() for el in link_els) if txt]
            related_articles = [(txt, "링크본문_미수집") for txt in link_texts]

        print(f"full_text len = {len(full_text)}, "
              f"threshold = {self.page_len_threshold}")

        # Shortcut: this one article already holds (almost) the whole page,
        # so store it as a single document and stop. A threshold of 0 means
        # the page length is unknown and the shortcut must not fire.
        if (not self._single_saved
                and self.page_len_threshold > 0
                and len(full_text) >= self.page_len_threshold):
            self.results = [{
                "law_name": self.law_name,
                "chapter": "",
                "article": self.law_name,
                "text": full_text,
                "deleted": False,
                "keywords": [],
                "related_articles": related_articles
            }]
            self._single_saved = True
            return

        self.results.append({
            "law_name": self.law_name,
            "chapter": self.current_title,
            "article": article_title,
            "text": full_text,
            "deleted": is_deleted,
            "keywords": [],
            "related_articles": related_articles
        })

    def _collect_links_and_popup(self, pgroup_idx, art_j, pgroup_elem_parent):
        """Click each eligible ``a.link`` in a pgroup and collect popup texts.

        Returns a list of ``(link_text, popup_text)`` tuples. Collection
        stops early when the pgroup element can no longer be queried or when
        the driver had to be re-created, because elements of the old session
        cannot be used any more.
        """
        related_articles = []
        main_window = self.driver.current_window_handle

        link_index = 0
        stale_index = -1
        stale_count = 0
        while True:
            # Reset per-iteration state so the handlers below never read a
            # value left over from a previous link (or an unbound name).
            link_text = ""
            unique_key = None
            key_added = False
            links_fetched = False
            try:
                link_els = pgroup_elem_parent.find_elements(By.CSS_SELECTOR, "a.link")
                links_fetched = True
                if link_index >= len(link_els):
                    break

                link_el = link_els[link_index]
                link_text = link_el.text.strip()
                onclick_attr = link_el.get_attribute("onclick") or ""

                outer_html = link_el.get_attribute("outerHTML")
                print(f"       [DEBUG] link_index={link_index}, text='{link_text}', onclick='{onclick_attr}'")
                print(f"       [DEBUG] outerHTML: {outer_html}")

                unique_key = (link_text, onclick_attr.strip())
                if unique_key in self.visited_links:
                    print(f"       -> link already visited: {unique_key} => skip")
                    link_index += 1
                    continue
                self.visited_links.add(unique_key)
                key_added = True

                skip_patterns = ["joStmdPop", "fJoHstShow", "arView", "fncArLawPop"]
                if any(sp in onclick_attr for sp in skip_patterns):
                    print(f"       [DEBUG] skip_patterns matched => '{onclick_attr}'")
                    link_index += 1
                    continue

                target_fns = ["fncLawPop", "fncLsLawPop", "fncLsPttnLinkPop"]
                if not any(fn in onclick_attr for fn in target_fns):
                    print(f"       [DEBUG] target_fns not matched => '{onclick_attr}'")
                    link_index += 1
                    continue

                if (pgroup_idx, art_j, link_text) in self.failed_links:
                    print(f"       [DEBUG] already in failed_links => {link_text}")
                    link_index += 1
                    continue

                self.driver.execute_script("arguments[0].scrollIntoView({block:'center'});", link_el)
                time.sleep(0.5)

                print(f"       [DEBUG] about to click link => text='{link_text}', onclick='{onclick_attr}'")

                popup_res = self._open_popup_and_get_text(link_el, link_text, main_window)
                related_articles.append((link_text, popup_res))
                link_index += 1

                time.sleep(0.5)

            except StaleElementReferenceException:
                if not links_fetched:
                    # The pgroup element itself is stale; re-querying through
                    # it can never succeed, so stop instead of looping.
                    print("       -> pgroup element is stale, stop collecting links")
                    break
                if key_added:
                    # Let the retry see this link as not yet visited.
                    self.visited_links.discard(unique_key)
                if stale_index == link_index:
                    stale_count += 1
                else:
                    stale_index, stale_count = link_index, 1
                if stale_count >= self.MAX_STALE_RETRIES:
                    print(f"       -> link index {link_index} stayed stale after {stale_count} tries, skip")
                    self.failed_links.add((pgroup_idx, art_j, link_text))
                    link_index += 1
                    continue
                print("       -> StaleElementReferenceException, trying to search link again...")
                time.sleep(1)
                continue
            except WebDriverException as wde:
                print(f"       -> WebDriverException for link '{link_text}': {wde}")
                self.failed_links.add((pgroup_idx, art_j, link_text))

                print("       -> re-initialize driver due to WebDriverException")
                self._quit_driver()
                self.driver = self._new_driver()
                # pgroup_elem_parent and main_window belong to the old
                # session; continuing would fail on every iteration.
                break
            except Exception as e:
                print(f"       [ERROR] link '{link_text}' skipped – {type(e).__name__}: {e}")
                self.failed_links.add((pgroup_idx, art_j, link_text))
                if not links_fetched:
                    # The link list itself cannot be read; nothing to skip to.
                    break
                link_index += 1
                continue

        return related_articles

    def _read_body_text(self):
        """Return the stripped ``<body>`` text of the current window.

        Waits up to 10 s for the body, then re-reads it every 2 s for up to
        15 s while it still contains ``Loading data...``.
        """
        WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )
        body_text = self.driver.find_element(By.TAG_NAME, "body").text.strip()
        wait_start = time.time()
        while 'Loading data...' in body_text and (time.time() - wait_start < 15):
            time.sleep(2)
            body_text = self.driver.find_element(By.TAG_NAME, "body").text.strip()
        return body_text

    def _open_popup_and_get_text(self, link_el, link_text, main_window_handle):
        """Click *link_el* and return the text of whatever it opens.

        After the click the driver is polled once per second for up to 60 s
        for one of: a new window, a DOM dialog, or a URL change. Returns
        ``""`` when the click fails, nothing opens, or reading fails.
        """
        old_url = self.driver.current_url
        before_handles = self.driver.window_handles

        print(f"         [DEBUG] _open_popup_and_get_text start => link_text='{link_text}', old_url='{old_url}'")
        try:
            link_el.click()
        except Exception as e:
            print(f"       -> link click exception for '{link_text}': {e}")
            return ""

        start_wait = time.time()
        popup_text = ""
        new_window_opened = False
        dom_popup_found = False
        url_changed = False

        print("         [DEBUG] waiting for new window/DOM popup/url change...")

        while time.time() - start_wait < 60:
            after_handles = self.driver.window_handles
            curr_url = self.driver.current_url

            print(f"         [DEBUG] elapsed={round(time.time()-start_wait,1)}s, handles={after_handles}, current_url={curr_url}")

            if len(after_handles) > len(before_handles):
                new_window_opened = True
                break
            if self._dom_dialog_exists():
                dom_popup_found = True
                break
            if curr_url != old_url:
                url_changed = True
                break
            time.sleep(1)

        if not (new_window_opened or dom_popup_found or url_changed):
            print(f"       -> No URL change or Popups in 60s -> skip link '{link_text}'")
            return ""

        if new_window_opened:
            new_handle = next(
                (h for h in self.driver.window_handles if h not in before_handles),
                None,
            )
            if not new_handle:
                print("       -> cannot find new_handle???")
                return ""

            self.driver.switch_to.window(new_handle)
            try:
                popup_text = self._read_body_text()
                if self.skip_popup_text in popup_text:
                    print("       -> popup has skip text => ''")
                    popup_text = ""
            except Exception as e:
                print(f"       -> popup body read error: {e}")
                popup_text = ""

            try:
                self.driver.close()
            except Exception as e:
                # Best effort: the popup may already be gone.
                print(f"       -> popup close error (ignored): {e}")
            self.driver.switch_to.window(main_window_handle)
            time.sleep(1)

            return popup_text

        if dom_popup_found:
            try:
                dtext = self._close_dom_dialog()
                if dtext and self.skip_popup_text in dtext:
                    print("       -> dom popup skip text => ''")
                    dtext = ""
                return dtext
            except Exception as e:
                print(f"       -> dom popup read error: {e}")
                return ""

        if url_changed:
            try:
                popup_text = self._read_body_text()
            except Exception as e:
                print(f"       -> url_changed read error: {e}")
                popup_text = ""

            try:
                self.driver.back()
                time.sleep(1)
            except Exception as e:
                # Best effort: report but do not lose the text already read.
                print(f"       -> navigate back error (ignored): {e}")

            return popup_text

        return ""

    def _dom_dialog_exists(self):
        """Return whether a ``div.ui-dialog[role='dialog']`` is in the DOM."""
        try:
            self.driver.find_element(By.CSS_SELECTOR, "div.ui-dialog[role='dialog']")
            return True
        except NoSuchElementException:
            return False

    def _close_dom_dialog(self):
        """Read the DOM dialog's text, click its close link and return the text.

        Returns ``""`` when the dialog or its close link is missing or the
        click fails.
        """
        try:
            dialog = self.driver.find_element(By.CSS_SELECTOR, "div.ui-dialog[role='dialog']")
            dtext = dialog.text.strip()
            close_btn = dialog.find_element(By.CSS_SELECTOR, "a[onclick*='TempJoDeleLayer.hiddenTempLsLinkLayer']")
            close_btn.click()
            time.sleep(0.5)
            return dtext
        except NoSuchElementException:
            return ""
        except WebDriverException as e:
            print(f"      -> fail to close DOM dialog: {e}")
            return ""

def _crawl_buchik_merged(url, pgroup_list, start_idx, end_idx, buchik_index, law_name):
    """Merge the text of pgroups ``[start_idx, end_idx)`` into one result.

    The non-empty texts of all ``p.pty3``, ``p.pty3_dep1`` and ``p.pty3_dep2``
    elements are joined with newlines. The title is the first ``p.pty3`` of
    the first pgroup when it has text, otherwise ``부칙 <buchik_index>``.

    Args:
        url: Page to open.
        pgroup_list: Not used by this function.
        start_idx: First pgroup index (inclusive).
        end_idx: Last pgroup index (exclusive); clamped to the pgroups found.
        buchik_index: Number used in the fallback title.
        law_name: Name stored in the result.

    Returns:
        A one-element list holding the result dict.
    """
    chrome_driver_path = "YOUR_CHROMEDRIVER_PATH"
    options = webdriver.ChromeOptions()
    for flag in ('headless', 'disable-gpu'):
        options.add_argument(flag)

    collected = []
    title_text = f"부칙 {buchik_index}"
    driver = None

    try:
        driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)
        driver.get(url)
        switch_into_content_frame(driver)

        _smart_scroll(driver, "#conScroll")

        groups = driver.find_elements(By.CSS_SELECTOR, "div.pgroup")
        last_idx = min(end_idx, len(groups))

        for idx in range(start_idx, last_idx):
            group = groups[idx]

            if idx == start_idx:
                # The first p.pty3 of the first pgroup, if any, is the title.
                headings = group.find_elements(By.CSS_SELECTOR, "p.pty3")
                heading_text = headings[0].text.strip() if headings else ""
                if heading_text:
                    title_text = heading_text

            rows = group.find_elements(By.CSS_SELECTOR, "p.pty3, p.pty3_dep1, p.pty3_dep2")
            stripped_rows = (row.text.strip() for row in rows)
            collected.extend(text for text in stripped_rows if text)
    finally:
        if driver:
            driver.quit()

    return [{
        "law_name": law_name,
        "chapter": title_text,
        "article": title_text,
        "text": "\n".join(collected).strip(),
        "deleted": False,
        "keywords": [],
        "related_articles": []
    }]

def single_section_with_retry(url, start_idx, end_idx, section_type, section_num, law_name, page_len, max_retry=3):
    """Crawl one section, retrying on WebDriver failures.

    Each attempt opens the page once to count its ``div.pgroup`` elements and
    then hands over to ``SectionCrawler`` (for ``"chapter"``) or
    ``_crawl_buchik_merged`` (for anything else).

    Returns:
        The crawled result list, or ``[]`` when every attempt failed or an
        unexpected (non-WebDriver) error stopped the retries.
    """
    final_data = []
    for attempts in range(1, max_retry + 1):
        print(f"[{section_type}:{section_num}] Attempt {attempts}/{max_retry}")
        is_last_attempt = attempts == max_retry

        driver = None
        try:
            chrome_driver_path = "YOUR_CHROMEDRIVER_PATH"
            options = webdriver.ChromeOptions()
            for flag in ("headless", "disable-gpu"):
                options.add_argument(flag)

            driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)
            driver.get(url)
            time.sleep(1)

            switch_into_content_frame(driver)
            time.sleep(1)

            _smart_scroll(driver, "#conScroll")

            pgroup_list = driver.find_elements(By.CSS_SELECTOR, "div.pgroup")
            # The probing session is closed before the actual crawl starts.
            driver.quit()

            if section_type != "chapter":
                final_data = _crawl_buchik_merged(
                    url, pgroup_list, start_idx, end_idx, section_num, law_name
                )
            else:
                final_data = SectionCrawler(
                    url=url,
                    section_type=section_type,
                    section_num=section_num,
                    law_name=law_name,
                    pgroup_list=pgroup_list,
                    start_idx=start_idx,
                    end_idx=end_idx,
                    page_len=page_len
                ).run()
            break

        except InvalidSessionIdException as e:
            print(f" -> invalid session on attempt {attempts}, retry: {e}")
            if is_last_attempt:
                print("   => max retry reached for invalid session. skip.")
        except WebDriverException as e:
            print(f" -> WebDriverException on attempt {attempts}, retry: {e}")
            if is_last_attempt:
                print("   => max retry reached for WebDriverException. skip.")
        except Exception as e:
            # Anything that is not a WebDriver problem is not retried.
            print(f" -> unexpected error on attempt {attempts}, stop. {e}")
            break
        finally:
            if driver:
                try:
                    driver.quit()
                except:
                    pass

    return final_data

def crawl_law(url, start_chapter=None, end_chapter=None):
    """Crawl one law page and write its sections to JSON files.

    A directory named after the law is created. Each chapter is written to
    ``chapter_<n>.json``, each buchik section to ``buchik_<n>.json`` and the
    combined data to ``final.json`` inside it.

    Args:
        url: Page to crawl.
        start_chapter: First chapter number to crawl (1-based); falsy = all.
        end_chapter: Last chapter number to crawl; falsy = all.
            When either bound is set, buchik sections are skipped.

    Returns:
        The list of all result dicts that were crawled.
    """
    chrome_driver_path = "YOUR_CHROMEDRIVER_PATH"
    options = webdriver.ChromeOptions()
    for flag in ('headless', 'disable-gpu'):
        options.add_argument(flag)
    # Measure the full page text once; it is handed to the chapter crawl.
    with webdriver.Chrome(service=Service(chrome_driver_path), options=options) as drv:
        drv.get(url)
        switch_into_content_frame(drv)
        _smart_scroll(drv, "#conScroll")
        page_text_len = len(drv.find_element(By.ID, "contentBody").text)

    h2_text = get_page_title_via_h2(url)
    if not h2_text:
        law_name = extract_law_name_from_url(url)
        print(f" -> URL based law_name: {law_name}")
    else:
        law_name = sanitize_filename(h2_text)
        print(f" -> h2 based law_name: {law_name}")

    os.makedirs(law_name, exist_ok=True)
    sections, pgroup_count = detect_sections(url)

    def _save(fname, payload):
        with open(fname, "w", encoding="utf-8") as f:
            json.dump(payload, f, ensure_ascii=False, indent=4)

    all_data = []
    chapter_count = 0
    buchik_count = 0
    range_requested = bool(start_chapter or end_chapter)

    # Each section ends where the next one starts; the trailing sentinel
    # entry only supplies the end index of the last real section.
    for (start_idx, sec_type), (end_idx, _) in zip(sections, sections[1:]):
        if not sec_type:
            continue

        if sec_type != "chapter":
            if range_requested:
                print(f"\n===== SKIP BUCHIK => (start_chapter={start_chapter}, end_chapter={end_chapter}) =====")
                continue

            buchik_count += 1
            print(f"\n===== BUCHIK {buchik_count} pgroup[{start_idx}~{end_idx-1}] =====")
            partial = single_section_with_retry(url, start_idx, end_idx, "buchik", buchik_count, law_name, None)
            all_data.extend(partial)

            fname = os.path.join(law_name, f"buchik_{buchik_count}.json")
            _save(fname, partial)
            print(f" -> {fname} saved, count={len(partial)}")
            continue

        chapter_count += 1

        if start_chapter and chapter_count < start_chapter:
            print(f"   -> skip chapter {chapter_count}, < start_chapter({start_chapter})")
            continue
        if end_chapter and chapter_count > end_chapter:
            print(f"   -> skip chapter {chapter_count}, > end_chapter({end_chapter})")
            continue

        print(f"\n===== CHAPTER {chapter_count} pgroup[{start_idx}~{end_idx-1}] =====")
        partial = single_section_with_retry(
            url, start_idx, end_idx,
            "chapter", chapter_count, law_name,
            page_text_len
        )
        all_data.extend(partial)

        fname = os.path.join(law_name, f"chapter_{chapter_count}.json")
        _save(fname, partial)
        print(f" -> {fname} saved, count={len(partial)}")

    final_fname = os.path.join(law_name, "final.json")
    _save(final_fname, all_data)
    print(f"\n[{final_fname}] => {len(all_data)} results.")

    return all_data


In [None]:

def main():
    """Crawl every URL in the list below, reporting success or failure each.

    A failure on one URL is printed and does not stop the remaining ones.
    """
    url_list = [

    ]

    for target_url in url_list:
        try:
            crawled = crawl_law(target_url, start_chapter=None, end_chapter=None)
            print(f"[OK]  {target_url}  →  {len(crawled)} item(s)")
        except Exception as err:
            print(f"[ERROR] {target_url}\n        {type(err).__name__}: {err}")


if __name__ == "__main__":
    main()