In [4]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException,
    NoSuchWindowException,
    InvalidSessionIdException,
    StaleElementReferenceException,
)
from urllib.parse import urljoin, urlparse
import json
import time
import threading
from queue import Queue
from threading import Lock


class CrawlerThread(threading.Thread):
    def __init__(self, url, result_queue, crawler_instance):
        super().__init__()
        self.url = url
        self.result_queue = result_queue
        self.crawler = crawler_instance
        self.driver = None
        self.is_stopped = threading.Event()

    def setup_driver(self):
        options = webdriver.ChromeOptions()
        # options.add_argument("--headless")
        self.driver = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()), options=options
        )
        self.driver.implicitly_wait(5)

    def run(self):
        try:
            self.setup_driver()
            print(f"Thread {self.name} starting crawl: {self.url}")

            self.driver.get(self.url)
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )

            # 가짜 경고창 제거
            try:
                self.driver.execute_script(
                    "document.querySelector('.fake-alert-class').remove();"
                )
            except:
                pass

            # 페이지 데이터 수집 및 탐색
            page_data = self.crawler._collect_page_data(self.driver, self.url)
            new_urls = self.crawler._explore_page(self.driver, self.url, start_index=0)

            self.result_queue.put(
                {"thread_name": self.name, "page_data": page_data, "new_urls": new_urls}
            )

        except Exception as e:
            print(f"Thread {self.name} encountered an error: {e}")
            self.result_queue.put({"thread_name": self.name, "error": str(e)})
        finally:
            if self.driver:
                self.driver.quit()
            print(f"Thread {self.name} finished")

    def stop(self):
        self.is_stopped.set()
        if self.driver:
            self.driver.quit()
        print(f"Thread {self.name} stopped")


class AdvancedWebCrawler:
    def __init__(self, base_domain="https://recruit.snowcorp.com"):
        self.base_domain = urlparse(base_domain).netloc
        self.visited_urls = set()
        self.crawled_data = []
        self.graph_nodes = set()
        self.graph_edges = []
        self.link_cache = {}
        self.max_tabs = 3
        self.result_queue = Queue()
        self.timeout = 10
        self.data_lock = Lock()
        self.thread_numbers = 5

    def _is_within_domain(self, url):
        parsed_url = urlparse(url)
        return parsed_url.netloc == self.base_domain

    def _collect_page_data(self, driver, url):
        try:
            return {
                "url": url,
                "title": driver.title,
                "text": driver.find_element(By.TAG_NAME, "body").text,
            }
        except Exception as e:
            print(f"Error collecting page data for {url}: {e}")
            return {"url": url, "title": "Unknown", "text": ""}

    def monitor_threads(self, threads, timeout=10):
        print("monitor_threads...")
        start_time = time.time()
        while any(t.is_alive() for t in threads):
            elapsed = time.time() - start_time
            for thread in threads:
                if thread.is_alive():
                    try:
                        thread.driver.title
                    except:
                        print(f"Detected stalled driver in {thread.name}")
                        thread.stop()
                if elapsed > timeout:
                    print(f"Timeout ({timeout}s) exceeded for {thread.name}")
                    thread.stop()
            time.sleep(1)

    def crawl(self, start_url, max_pages=100):
        url_queue = [start_url]
        pages_crawled = 0
        threads = []

        while url_queue and pages_crawled < max_pages:
            current_url = url_queue.pop(0)

            if current_url in self.visited_urls or not self._is_within_domain(
                current_url
            ):
                continue
            for i in range(self.thread_numbers):
                thread = CrawlerThread(current_url, self.result_queue, self)
                threads.append(thread)
                thread.start()

            self.monitor_threads(threads, timeout=10)

            while not self.result_queue.empty():
                result = self.result_queue.get()
                if "error" in result:
                    print(f"Error in {result['thread_name']}: {result['error']}")
                    continue

                with self.data_lock:
                    self.crawled_data.append(result["page_data"])
                    self.visited_urls.add(result["page_data"]["url"])
                    self.graph_nodes.add(
                        f'"{result["page_data"]["url"]}"[Page: {result["page_data"]["url"]}]'
                    )
                pages_crawled += 1

                new_urls = result["new_urls"]
                url_queue.extend(
                    [url for url in new_urls if url not in self.visited_urls]
                )

            for thread in threads:
                thread.join()

        self._save_results()
        self._save_graph()

    def _explore_page(self, driver, current_url, start_index=0):
        print(f"Crawled page: {current_url} start_index {start_index}")
        if current_url not in self.link_cache:
            try:
                WebDriverWait(driver, 10).until(
                    EC.presence_of_all_elements_located((By.TAG_NAME, "a"))
                )
                links = driver.find_elements(By.TAG_NAME, "a")
                self.link_cache[current_url] = [
                    {
                        "href": link.get_attribute("href"),
                        "onclick": link.get_attribute("onclick"),
                    }
                    for link in links
                ]
            except TimeoutException:
                print(f"Timeout waiting for links on {current_url}")
                return []

        new_urls = []
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.TAG_NAME, "a"))
            )
            links = driver.find_elements(By.TAG_NAME, "a")
        except TimeoutException:
            print(f"Timeout reloading links on {current_url}")
            return []

        for i in range(start_index, len(links)):
            try:
                link_data = links[i]  # 요소를 즉시 사용
                href = link_data.get_attribute("href")
                onclick = link_data.get_attribute("onclick")
                link_node = f'"{current_url}_link_{i}"[Link: {i}]'

                if (
                    href
                    and self._is_within_domain(href)
                    and href not in self.visited_urls
                ):
                    new_urls.append(href)
                    self.graph_edges.append(f'"{current_url}" --> |Href| "{href}"')

                if onclick:
                    try:
                        driver.execute_script(onclick, link_data)
                        time.sleep(1)
                        new_url = driver.current_url
                        if new_url != current_url and self._is_within_domain(new_url):
                            new_urls.append(new_url)
                            self.graph_edges.append(
                                f'{link_node} --> |Onclick| "{new_url}"'
                            )
                    except Exception as e:
                        print(f"Error executing onclick at {current_url}: {e}")

            except StaleElementReferenceException:
                print(f"Stale element at index {i} on {current_url}, retrying...")
                try:
                    # 요소가 stale이면 다시 조회
                    links = driver.find_elements(By.TAG_NAME, "a")
                    if i < len(links):
                        link_data = links[i]
                        href = link_data.get_attribute("href")
                        onclick = link_data.get_attribute("onclick")
                        link_node = f'"{current_url}_link_{i}"[Link: {i}]'

                        if (
                            href
                            and self._is_within_domain(href)
                            and href not in self.visited_urls
                        ):
                            new_urls.append(href)
                            self.graph_edges.append(
                                f'"{current_url}" --> |Href| "{href}"'
                            )
                    else:
                        continue
                except Exception as e:
                    print(f"Retry failed for index {i} on {current_url}: {e}")
            except Exception as e:
                print(f"Error processing link {i} on {current_url}: {e}")

        return new_urls

    def _save_results(self):
        with open("crawled_data.json", "w", encoding="utf-8") as f:
            json.dump(self.crawled_data, f, ensure_ascii=False, indent=4)
        print(f"Saved {len(self.crawled_data)} pages to crawled_data.json")

    def _save_graph(self):
        with open("crawler_graph.mmd", "w", encoding="utf-8") as f:
            f.write("graph TD\n")
            for node in self.graph_nodes:
                f.write(f"    {node}\n")
            for edge in self.graph_edges:
                f.write(f"    {edge}\n")
        print("Graph saved to crawler_graph.mmd")


def main():
    crawler = AdvancedWebCrawler(base_domain="https://recruit.snowcorp.com")
    start_url = "https://recruit.snowcorp.com/rcrt/list.do"
    crawler.crawl(start_url, max_pages=10)


if __name__ == "__main__":
    main()

monitor_threads...
Detected stalled driver in Thread-42
Thread Thread-42 stopped
Detected stalled driver in Thread-43
Thread Thread-43 stopped
Detected stalled driver in Thread-44
Thread Thread-44 stopped
Detected stalled driver in Thread-45
Thread Thread-45 stopped
Detected stalled driver in Thread-46
Thread Thread-46 stopped
Detected stalled driver in Thread-42
Thread Thread-42 stopped
Detected stalled driver in Thread-43
Thread Thread-43 stopped
Detected stalled driver in Thread-44
Thread Thread-44 stopped
Detected stalled driver in Thread-45
Thread Thread-45 stopped
Detected stalled driver in Thread-46
Thread Thread-46 stopped
Thread Thread-44 starting crawl: https://recruit.snowcorp.com/rcrt/list.do
Thread Thread-45 starting crawl: https://recruit.snowcorp.com/rcrt/list.do
Thread Thread-43 starting crawl: https://recruit.snowcorp.com/rcrt/list.do
Thread Thread-46 starting crawl: https://recruit.snowcorp.com/rcrt/list.do
Thread Thread-42 starting crawl: https://recruit.snowcorp.com

KeyboardInterrupt: 