In [84]:
from queue import Queue
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

In [None]:
class Crawler:
    """Breadth-first crawler that renders pages in Chrome through Selenium.

    Starting from ``seed_url``, each queued page is loaded in the browser,
    parsed with BeautifulSoup and recorded as a ``[title, url, text]`` row.
    Links are followed only when their absolute URL starts with the seed URL
    given to this instance.
    """

    def __init__(self, seed_url):
        """Create a crawler whose frontier initially holds only ``seed_url``.

        Args:
            seed_url: Page to start from; also the prefix a discovered link
                must have in order to be queued.
        """
        self.queue = Queue()  # FIFO frontier of URLs still to be visited
        self.visited = set()  # URLs that have already been dequeued and processed
        self.seed_url = seed_url
        self.queue.put(seed_url)

    def crawl(self, data):
        """Visit every reachable page under the seed URL and collect its content.

        Args:
            data: List that is extended in place. One ``[title, url, text]``
                list is appended for each page that loads, plus one for each
                link on that page whose ``onclick`` script ran successfully.

        Returns:
            None. Results are delivered through ``data``.

        Errors raised while processing a page are printed and the crawl moves
        on to the next queued URL.
        """
        chrome_options = Options()
        driver = webdriver.Chrome(options=chrome_options)
        try:
            # Session-wide timeout for element lookups. It is a setting, not a
            # pause, so it is configured once instead of after every page load.
            driver.implicitly_wait(10)

            while not self.queue.empty():
                url = self.queue.get()
                if url in self.visited:
                    continue

                self.visited.add(url)
                print(f"Visiting {url}")

                try:
                    self._crawl_page(driver, url, data)
                except Exception as e:
                    print(f"Error crawling {url}: {e}")
        finally:
            # Always close the browser, even if the crawl is interrupted.
            driver.quit()

    def _crawl_page(self, driver, url, data):
        """Render ``url``, record its content and queue its in-scope links."""
        # Let Selenium render the page so script-generated markup is included.
        driver.get(url)
        soup, record = self._page_record(driver, url)
        data.append(record)

        for link in soup.find_all("a", href=True):
            if 'onclick' in link.attrs:
                self._record_onclick(driver, url, link.attrs['onclick'], data)

            # Resolve relative hrefs against the page they were found on.
            absolute_url = urljoin(url, link["href"])
            # Scope the crawl with this instance's seed, not a notebook global.
            if absolute_url not in self.visited and absolute_url.startswith(self.seed_url):
                self.queue.put(absolute_url)

    def _record_onclick(self, driver, url, script, data):
        """Run one ``onclick`` script and record the DOM as it looks afterwards."""
        try:
            driver.execute_script(script)
            # Re-parse the live DOM; reusing the earlier soup would only
            # duplicate the row already stored for this page.
            _, record = self._page_record(driver, url)
            data.append(record)
        except Exception as e:
            # A broken handler must not stop link discovery for the page.
            print(f"Error running onclick on {url}: {e}")

    @staticmethod
    def _page_record(driver, url):
        """Parse the driver's current page source into ``(soup, [title, url, text])``."""
        soup = BeautifulSoup(driver.page_source, "html.parser")
        title_tag = soup.title
        title = title_tag.string if title_tag is not None else None
        if title is None:
            # Covers both a missing <title> and one without a single text node.
            title = 'No Title'
        return soup, [title, url, soup.get_text()]

In [None]:
# Crawl the NAVER recruiting site and save every collected row to a CSV file.
seed_url = "https://recruit.navercorp.com/"
data = []  # extended in place by Crawler.crawl with [title, url, text] rows

crawler = Crawler(seed_url)
crawler.crawl(data)

# One DataFrame row per collected record; the default index is written as well.
df = pd.DataFrame(
    data,
    columns=["Title", "URL", "Text"],
)
df.to_csv('navercorp.csv')

In [121]:
# Write the crawl results to disk again (same target file as the crawl cell).
df.to_csv(path_or_buf="navercorp.csv")