In [84]:
from queue import Queue
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

In [None]:
class Crawler:
    """Breadth-first crawler that renders pages with Selenium and stays under a seed URL.

    Attributes:
        queue: FIFO queue of URLs still waiting to be visited.
        visited: URLs that have already been dequeued and attempted.
        seed_url: Start URL; only links whose absolute form starts with it are followed.
    """

    def __init__(self, seed_url):
        """Create a crawler whose frontier initially holds only ``seed_url``."""
        self.queue = Queue()
        self.visited = set()
        self.seed_url = seed_url
        self.queue.put(seed_url)

    def _should_enqueue(self, absolute_url):
        """Return True when ``absolute_url`` is unvisited and lies under this crawler's seed URL."""
        # Uses the instance's own seed rather than a notebook-level global, so the
        # crawler works no matter what the calling cell named its variable.
        return absolute_url not in self.visited and absolute_url.startswith(self.seed_url)

    @staticmethod
    def _snapshot(driver):
        """Parse the driver's current DOM and return ``(soup, title, text)``."""
        soup = BeautifulSoup(driver.page_source, "html.parser")
        # <title> can be absent, or present without a single string child (.string is None).
        title = soup.title.string if soup.title and soup.title.string else 'No Title'
        return soup, title, soup.get_text()

    def _capture_onclick(self, driver, script, page_url, loaded_url, data):
        """Run one onclick handler, record the resulting DOM, and return to the page if it navigated.

        Failures are reported and swallowed so that a single broken handler does not
        stop link discovery for the rest of the page.
        """
        try:
            driver.execute_script(script)
            # Give a navigation started by the handler a chance to finish loading.
            WebDriverWait(driver, 2).until(
                lambda d: d.execute_script("return document.readyState") == "complete"
            )
            # Re-parse: the soup taken before the click no longer reflects the DOM.
            _, title, text = self._snapshot(driver)
            landed_url = driver.current_url
            navigated = landed_url != loaded_url
            data.append([title, landed_url if navigated else page_url, text])
            if navigated:
                # Later handlers were written for the original page, so go back to it.
                driver.get(loaded_url)
        except Exception as e:
            print(f"Error running onclick on {page_url}: {e}")

    def crawl(self,data):
        """Visit every reachable in-scope page and append captured rows to ``data``.

        Args:
            data: Mutable list; one ``[title, url, text]`` row is appended for each
                page load and for each onclick handler that was executed.
        """
        chrome_options = Options()
        driver = webdriver.Chrome(options=chrome_options)
        # An implicit wait only configures element lookups (it does not pause for
        # page load), so it is set once here instead of on every page.
        driver.implicitly_wait(10)

        try:
            while not self.queue.empty():
                url = self.queue.get()
                if url in self.visited:
                    continue

                self.visited.add(url)
                print(f"Visiting {url}")

                # Render the page with Selenium so script-generated markup is included.
                try:
                    driver.get(url)
                    # Remember where the browser actually ended up (after redirects)
                    # so click-triggered navigation can be told apart later.
                    loaded_url = driver.current_url
                    soup, title, text = self._snapshot(driver)
                    data.append([title,url,text])

                    for link in soup.find_all("a", href=True):
                        href = link["href"]
                        if 'onclick' in link.attrs:
                            self._capture_onclick(
                                driver, link.attrs['onclick'], url, loaded_url, data
                            )
                        # Resolve relative links against the page URL.
                        absolute_url = urljoin(url, href)
                        if self._should_enqueue(absolute_url):
                            self.queue.put(absolute_url)

                except Exception as e:
                    print(f"Error crawling {url}: {e}")
        finally:
            # Always release the browser process, even on interruption.
            driver.quit()

Crawled page: https://recruit.snowcorp.com/rcrt/list.do start_index 0
total links: 96
No alert in new window
"https://recruit.snowcorp.com/rcrt/list.do_link_21"[Link: 21]
96
Executing onclick JavaScript for link 21 at https://recruit.snowcorp.com/rcrt/list.do: show('30003063')
End onclick JavaScript for link 21 at https://recruit.snowcorp.com/rcrt/list.do: show('30003063')
add new_url https://recruit.snowcorp.com/rcrt/view.do?annoId=30003063&sw=&subJobCdArr=&sysCompanyCdArr=&empTypeCdArr=&entTypeCdArr=&workAreaCdArr=
Saved 2 pages to crawled_data.json
save results
Crawled page: https://recruit.snowcorp.com/rcrt/view.do?annoId=30003063&sw=&subJobCdArr=&sysCompanyCdArr=&empTypeCdArr=&entTypeCdArr=&workAreaCdArr= start_index 0
total links: 34
No alert in new window
"https://recruit.snowcorp.com/rcrt/view.do?annoId=30003063&sw=&subJobCdArr=&sysCompanyCdArr=&empTypeCdArr=&entTypeCdArr=&workAreaCdArr=_link_27"[Link: 27]
34
Executing onclick JavaScript for link 27 at https://recruit.snowcorp.

In [None]:
# Crawl from the seed URL; `data` collects one [title, url, text] row per capture.
data= []
seed_url ="https://recruit.navercorp.com/"
crawler = Crawler(seed_url)
crawler.crawl(data)  # appends rows to `data` in place
# Tabulate the rows and write them to CSV (index=False is not passed, so the
# DataFrame index is written as the first column).
df = pd.DataFrame(data, columns=["Title", "URL", "Text"])
df.to_csv('navercorp.csv')

In [None]:
# Writes `df` to navercorp.csv; requires `df` to already exist in the kernel
# from an earlier cell.
df.to_csv("navercorp.csv")

In [3]:
from selenium.webdriver.common.action_chains import ActionChains
import threading
import time

def _explore_page(self, current_url, start_index=0):
    """Run each onclick-bearing ``<a>`` of the current page in a scratch tab and follow it.

    For every anchor from ``start_index`` onward that has an ``onclick`` attribute,
    a new tab is opened, ``current_url`` is loaded there, and the handler is executed
    with a timeout. If the tab ends up on an unvisited in-domain URL, that page is
    collected via ``self._collect_page_data``, saved, added to the graph, and then
    explored recursively.

    Args:
        current_url: URL of the page the driver is currently showing.
        start_index: Index of the first anchor to process.

    Returns:
        List of URLs newly saved by this call. Returns ``[]`` if the anchors never
        appear, and the partial list if window recovery fails.
    """
    # Imported locally: the except clauses below need this name and it is not
    # among the imports visible in the notebook.
    from selenium.common.exceptions import TimeoutException

    print(f"Crawled page: {current_url} start_index {start_index}")
    if current_url not in self.link_cache:
        try:
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_all_elements_located((By.TAG_NAME, "a"))
            )
            links = self.driver.find_elements(By.TAG_NAME, "a")
            self.link_cache[current_url] = [
                {
                    "href": link.get_attribute("href"),
                    "onclick": link.get_attribute("onclick"),
                }
                for link in links
            ]
        except TimeoutException:
            print(f"Timeout waiting for links on {current_url}")
            return []

    try:
        WebDriverWait(self.driver, 10).until(
            EC.presence_of_all_elements_located((By.TAG_NAME, "a"))
        )
        links = self.driver.find_elements(By.TAG_NAME, "a")
    except TimeoutException:
        print(f"Timeout reloading links on {current_url}")
        return []

    new_urls = []
    original_window = self.driver.current_window_handle
    # Windows that were open before this call (including the tabs of enclosing
    # recursive calls) must survive popup cleanup and error recovery.
    preexisting_handles = set(self.driver.window_handles)
    print(f"Original window handle: {original_window}")
    print("total links:", len(links))

    def execute_js_with_timeout(driver, script, element, timeout=5):
        """Run ``script`` in a worker thread and give up after ``timeout`` seconds.

        Returns ``(result, error)``; ``error`` is a ``TimeoutError`` when the script
        did not finish in time, the raised exception on failure, else ``None``.
        """
        result = [None]
        exception = [None]

        def target():
            try:
                result[0] = driver.execute_script(script, element)
            except Exception as e:
                exception[0] = e

        # Daemon thread: a script that never returns must not keep the process alive.
        thread = threading.Thread(target=target, daemon=True)
        thread.start()
        thread.join(timeout)
        if thread.is_alive():
            print(f"JavaScript execution timed out after {timeout} seconds")
            return None, TimeoutError("Script execution timed out")
        return result[0], exception[0]

    for i in range(start_index, len(links)):
        try:
            onclick = links[i].get_attribute("onclick")
            link_node = f'"{current_url}_link_{i}"[Link: {i}]'

            if not onclick:
                continue

            if len(self.driver.window_handles) >= self.max_tabs:
                print(f"Max tabs reached: {self.max_tabs}")
                continue

            # Open a scratch tab and load the same page there.
            print(f"Opening new window for link {i}")
            self.driver.execute_script("window.open('about:blank', '_blank');")
            new_window = self.driver.window_handles[-1]
            print(f"New window handle: {new_window}")
            self.driver.switch_to.window(new_window)
            self.driver.get(current_url)
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_all_elements_located((By.TAG_NAME, "a"))
            )
            # Re-locate the anchor in the scratch tab; elements from the original
            # window cannot be used here.
            link_data = self.driver.find_elements(By.TAG_NAME, "a")[i]
            print(f"Preparing to execute onclick for link {i}: {onclick}")

            # Execute the handler with a timeout so a hanging script cannot stall the crawl.
            result, error = execute_js_with_timeout(self.driver, onclick, link_data, timeout=5)
            if error:
                print(f"Error executing onclick for link {i}: {str(error)}")
                if isinstance(error, TimeoutError):
                    # On timeout, discard the scratch tab and move on.
                    self.driver.close()
                    self.driver.switch_to.window(original_window)
                    continue

            print(f"onclick executed for link {i}")
            time.sleep(1)  # allow the page to react to the handler

            # Handle an alert first: reading the URL while an alert is open makes
            # the driver raise instead of answering.
            try:
                WebDriverWait(self.driver, 3).until(EC.alert_is_present())
                self.driver.switch_to.alert.accept()
                print(f"Alert accepted on link {i}")
            except TimeoutException:
                print(f"No alert present on link {i}")
            except Exception as e:
                print(f"Error handling alert on link {i}: {str(e)}")

            new_url = self.driver.current_url
            print(f"New URL after onclick: {new_url}")

            # Close popups spawned by the handler: any handle that is neither the
            # scratch tab nor a window that existed before this call.
            current_handles = self.driver.window_handles
            print(f"Current window handles: {current_handles}")
            for handle in current_handles:
                if handle not in preexisting_handles and handle != new_window:
                    print(f"Switching to popup: {handle}")
                    self.driver.switch_to.window(handle)
                    print(f"Popup URL: {self.driver.current_url}")
                    self.driver.close()
                    print(f"Closed popup: {handle}")

            # Back to the scratch tab to read where the handler took us.
            self.driver.switch_to.window(new_window)
            new_url = self.driver.current_url

            if (
                new_url != current_url
                and new_url not in self.visited_urls
                and self._is_within_domain(new_url)
            ):
                page_data = self._collect_page_data(new_url)
                self.crawled_data.append(page_data)
                self._save_results()
                print(f"Saved new page: {new_url}")
                self.visited_urls.add(new_url)
                new_urls.append(new_url)
                self.graph_nodes.add(f'"{new_url}"[Page: {new_url}]')
                self.graph_edges.append(f'{link_node} --> |Onclick| "{new_url}"')

                recursive_urls = self._explore_page(new_url, start_index=0)
                # NOTE(review): every URL the recursive call returns was added to
                # visited_urls before being returned, so this filter appears to
                # drop all of them - confirm the intended contract with callers.
                new_urls.extend(
                    [url for url in recursive_urls if url not in self.visited_urls]
                )

            # Close the scratch tab and return to the original window. Re-select
            # the tab first in case the recursive call left focus elsewhere.
            print(f"Closing new window: {new_window}")
            self.driver.switch_to.window(new_window)
            self.driver.close()
            self.driver.switch_to.window(original_window)
            WebDriverWait(self.driver, 5).until(
                EC.presence_of_all_elements_located((By.TAG_NAME, "a"))
            )

        except Exception as e:
            print(f"Error processing link {i} at {current_url}: {str(e)}")
            try:
                current_handles = self.driver.window_handles
                print(f"Handles during recovery: {current_handles}")
                for handle in current_handles:
                    # Only close windows created during this call.
                    if handle not in preexisting_handles:
                        self.driver.switch_to.window(handle)
                        self.driver.close()
                self.driver.switch_to.window(original_window)
                WebDriverWait(self.driver, 5).until(
                    EC.presence_of_all_elements_located((By.TAG_NAME, "a"))
                )
            except Exception as recovery_error:
                print(f"Failed to recover: {str(recovery_error)}")
                return new_urls

    return new_urls

In [5]:
# NOTE(review): AdvancedWebCrawler is not defined in any cell visible here; the
# cell that defines it must be run first, otherwise this fails with NameError.
crawler = AdvancedWebCrawler(base_domain="https://recruit.snowcorp.com")
start_url = "https://recruit.snowcorp.com/rcrt/list.do"
crawler.crawl(start_url, max_pages=100)

NameError: name 'AdvancedWebCrawler' is not defined