In [None]:
# -*- coding: UTF8 -*-

import csv
import io
import os
import re
import sys
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


# Chrome option setup
def setup_driver():
    """Create the headless Chrome WebDriver used for scraping.

    Returns:
        A ``webdriver.Chrome`` instance built from the options below. The
        caller is responsible for calling ``quit()`` on it.
    """
    chrome_options = Options()
    arguments = (
        "--headless",     # run without a visible browser window
        "--disable-gpu",  # disable GPU acceleration
        "--no-sandbox",   # disable sandbox mode
        # Block image loading. The previous "--disable-images" is not a
        # switch Chrome acts on, so images were still being fetched; the
        # Blink setting below is the form Chrome honours.
        "--blink-settings=imagesEnabled=false",
    )
    for argument in arguments:
        chrome_options.add_argument(argument)
    return webdriver.Chrome(options=chrome_options)

# Data collection

# Output column name -> DOM id of the profile section it is read from.
_SECTION_IDS = {
    "직업": "basic-section",
    "기술 스택": "techStack-section",
    "경력": "workExperiences-section",
    "프로젝트": "projects-section",
    "대외활동": "activities-section",
    "교육": "educations-section",
    "자격증": "certificates-section",
    "외국어": "foreignLanguages-section"
}

# The one column read from a single <dd> instead of a list of previews.
_JOB_COLUMN = "직업"

# Locator for the <li> entries of a hub listing page.
_ITEM_LIST_XPATH = '//*[@id="hub-container"]/section[2]/ul/li'


def _read_section(driver, section_name, section_id):
    """Return the text of one section of the open page, or "" on failure."""
    try:
        section_element = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.ID, section_id))
        )
        if section_name == _JOB_COLUMN:
            # The leading "." anchors the XPath at section_element. A bare
            # "//" is evaluated from the document root even when called on
            # an element, which would ignore the section entirely.
            job_element = section_element.find_element(
                By.XPATH, ".//div[dt='직업']/dd"
            )
            return job_element.text
        previews = section_element.find_elements(By.CLASS_NAME, 'preview-description')
        return ', '.join(preview.text for preview in previews)
    except WebDriverException:
        # Best effort: a section that is missing or unreadable becomes an
        # empty cell. Only Selenium failures are absorbed so that ordinary
        # programming errors still surface.
        return ""


def collect_data(driver, page_number):
    """Scrape every item listed on one hub page.

    Loads the listing page, then for each list item clicks it, reads the
    sections named in ``_SECTION_IDS`` and navigates back.

    Args:
        driver: Selenium WebDriver used for all navigation.
        page_number: Value substituted into the ``pageNumber`` query
            parameter of the hub URL.

    Returns:
        A list with one dict per item that could be opened. Each dict has
        every key of ``_SECTION_IDS``; a section that could not be read
        maps to "". Each dict is also printed as it is collected.
    """
    url = f"https://www.rallit.com/hub?pageNumber={page_number}"
    driver.get(url)
    time.sleep(2)

    # Count the items up front. They are re-located by index on every
    # iteration instead of reusing these element handles, because the loop
    # navigates away from the page and back again.
    item_count = len(driver.find_elements(By.XPATH, _ITEM_LIST_XPATH))
    results = []

    for i in range(1, item_count + 1):
        xpath = f'{_ITEM_LIST_XPATH}[{i}]/a/article/div'
        try:
            element = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, xpath)))
            element.click()
            time.sleep(1)

            result = {
                section_name: _read_section(driver, section_name, section_id)
                for section_name, section_id in _SECTION_IDS.items()
            }

            results.append(result)
            print(result)
            driver.back()  # return to the listing page
            time.sleep(1)

        except TimeoutException:
            # Skip this item when a wait or navigation step times out.
            continue
    return results

# Scrape listing pages 1 through 44 and gather every profile into one list.
driver = setup_driver()
result_list = []

try:
    for page_number in range(1, 45):
        results = collect_data(driver, page_number)
        result_list.extend(results)
finally:
    # Always shut the browser down, even when a page raises; otherwise the
    # headless Chrome process is left running.
    driver.quit()

print(result_list)


In [None]:
import pandas as pd

# Turn the scraped records in result_list into a DataFrame.
df = pd.DataFrame(data=result_list)
# Bare expression so the notebook shows the frame as the cell output.
df

In [16]:
# Save df to rallit.csv without writing the row index.
df.to_csv(path_or_buf='rallit.csv', index=False)