In [None]:
# %pip install selenium webdriver-manager beautifulsoup4 requests

In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import json

class JobPlanetCrawler:
    """Load one URL in Chrome and extract job-posting summaries from its JSON payload.

    Each instance owns the Selenium Chrome driver created in ``__init__``.
    Release it with ``quit()`` or by using the instance in a ``with`` block.
    """

    def __init__(self, url):
        """Remember the target URL and start a Chrome driver.

        Args:
            url: Address loaded by ``get_html``.
        """
        self.url = url
        self.driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
        self.driver.implicitly_wait(3)

    def __enter__(self):
        """Return the crawler itself so it can be used as ``with ... as crawler``."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Quit the driver when the ``with`` block ends; exceptions are not suppressed."""
        self.quit()
        return False

    def get_html(self):
        """Navigate the driver to ``self.url`` and return the page source."""
        self.driver.get(self.url)
        return self.driver.page_source

    def parse_json(self, html):
        """Parse the JSON text found in the first ``<pre>`` element of ``html``.

        Args:
            html: Page source as returned by ``get_html``.

        Returns:
            The decoded JSON value, or ``None`` (after printing a message) when
            there is no ``<pre>`` element or its text is empty.

        Raises:
            json.JSONDecodeError: If the ``<pre>`` text is not valid JSON.
        """
        soup = BeautifulSoup(html, "html.parser")
        # Look the tag up once instead of searching the document twice.
        pre_tag = soup.find("pre")
        json_text = pre_tag.text if pre_tag is not None else ""
        if json_text:
            return json.loads(json_text)
        print("❌ <pre> 태그를 찾을 수 없습니다.")
        return None

    def extract_job_data(self, data):
        """Pull the id and the ``annual.text`` value out of every recruit entry.

        Args:
            data: Decoded payload; entries are read from ``data["data"]["recruits"]``.

        Returns:
            A list of ``{"id": ..., "annual_text": ...}`` dicts, one per entry.
            Missing or ``null`` values come back as ``None``; a missing or
            ``null`` ``data``/``recruits`` level yields an empty list.
        """
        # `.get(key, {})` only covers a *missing* key; an explicit JSON null
        # would come back as None and break the chained `.get`, hence `or`.
        recruits = (data.get("data") or {}).get("recruits") or []
        return [
            {
                "id": job.get("id"),
                "annual_text": (job.get("annual") or {}).get("text"),
            }
            for job in recruits
        ]

    def crawl_jobs(self):
        """Fetch the page, parse it and return the extracted jobs.

        Returns:
            The list from ``extract_job_data``, or ``[]`` when ``parse_json``
            produced nothing usable.
        """
        html = self.get_html()
        data = self.parse_json(html)
        if data:
            return self.extract_job_data(data)
        return []

    def quit(self):
        """Shut down the Chrome driver owned by this crawler."""
        self.driver.quit()

# Accumulates the posting summaries collected from every crawled page.
data = []
# Example usage: crawl API pages 1-9 and gather the results into `data`.
if __name__ == "__main__":
    for idx in range(1, 10):
        url = f"https://www.jobplanet.co.kr/api/v3/job/postings?occupation_level1=&occupation_level2=11905,11907,11904,11906,11610,11911,11609&years_of_experience=&review_score=&job_type=&city=&education_level_id=&order_by=aggressive&page={idx}&page_size=8"
        crawler = JobPlanetCrawler(url)
        try:
            job_data = crawler.crawl_jobs()
        finally:
            # Every crawler starts its own Chrome instance, so it must be
            # closed per page; quitting only once after the loop would leave
            # all earlier browsers running.
            crawler.quit()

        if job_data:
            data.extend(job_data)
            print(json.dumps(job_data, ensure_ascii=False, indent=2))

print(data)

[
  {
    "id": 1291264,
    "annual_text": "경력"
  },
  {
    "id": 1287688,
    "annual_text": "경력"
  },
  {
    "id": 1287613,
    "annual_text": "경력"
  },
  {
    "id": 1290553,
    "annual_text": "경력"
  },
  {
    "id": 1288726,
    "annual_text": "경력"
  },
  {
    "id": 1290918,
    "annual_text": "경력"
  },
  {
    "id": 1283031,
    "annual_text": "경력"
  },
  {
    "id": 1291090,
    "annual_text": "경력"
  }
]
[
  {
    "id": 1291054,
    "annual_text": "경력"
  },
  {
    "id": 1291048,
    "annual_text": "경력"
  },
  {
    "id": 1291100,
    "annual_text": "경력"
  },
  {
    "id": 1291099,
    "annual_text": "경력"
  },
  {
    "id": 1291053,
    "annual_text": "경력"
  },
  {
    "id": 1287485,
    "annual_text": "경력"
  },
  {
    "id": 1287482,
    "annual_text": "경력"
  },
  {
    "id": 1290554,
    "annual_text": "경력"
  }
]
[
  {
    "id": 1289221,
    "annual_text": "경력"
  },
  {
    "id": 1290903,
    "annual_text": "경력"
  },
  {
    "id": 1291000,
    "annual_text": "경력"
  },
  {


In [2]:
# Sanity check: how many posting summaries ended up in `data`.
print(len(data))

72


In [6]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import re
from bs4 import BeautifulSoup
import time

In [9]:
def get_jobplanet_recruitment_text(obj: dict) -> list:
    """Collect lower-cased ASCII alphanumeric tokens from one posting's detail text.

    Opens the JobPlanet search page for the posting in a fresh Chrome driver,
    reads every ``p.recruitment-detail__txt`` element and tokenizes its text.

    Args:
        obj: Mapping with an ``"id"`` key holding the posting id.

    Returns:
        The tokens in document order, or an empty list (after printing a
        message) when the page has no matching elements.

    Raises:
        KeyError: If ``obj`` has no ``"id"`` key.
    """
    # Read the id before building the URL: reusing double quotes inside a
    # double-quoted f-string is a SyntaxError before Python 3.12. Doing this
    # first also means a bad `obj` fails before a browser is launched.
    posting_id = obj["id"]
    url = f"https://www.jobplanet.co.kr/job/search?posting_ids%5B%5D={posting_id}"
    print(url)

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    try:
        driver.implicitly_wait(50)
        driver.get(url)
        # Fixed pause after navigation; presumably lets client-side rendering
        # finish before the source is read (TODO confirm it is still needed).
        time.sleep(2)
        html = driver.page_source
    finally:
        # Always release the browser, including on the "no text" path and
        # when navigation raises.
        driver.quit()

    soup = BeautifulSoup(html, 'html.parser')
    elements = soup.select("p.recruitment-detail__txt")
    if not elements:
        print("텍스트가 존재하지 않습니다.")
        # Honor the declared `list` return type instead of falling through to None.
        return []

    # NOTE(review): `\b` treats Hangul as word characters, so an ASCII token
    # glued to Korean text (e.g. "python을") is not matched; confirm this is intended.
    return [
        word.lower()
        for elem in elements
        for word in re.findall(r'\b[a-zA-Z0-9]+\b', elem.text)
    ]

In [None]:
# Try the detail scraper on the first three collected postings only.
# `text` stays bound to the last result once the loop finishes.
for d in data[:3]:
    text=get_jobplanet_recruitment_text(d)
    print(text)


https://www.jobplanet.co.kr/job/search?posting_ids%5B%5D=1291264
['2', '3050', '2', '000', 'https', 'view', 'asiae', 'co', 'kr', 'article', '2021120809515578394', 'https', 'www', 'jobplanet', 'co', 'kr', 'contents', 'news', '2154', 'queenit', '3050', 'https', 'www', 'mk', 'co', 'kr', 'news', 'business', 'view', '2021', '10', '985379', 'mz', '3050', 'https', 'www', 'hankyung', 'com', 'life', 'article', '202111254260g', 'key', 'highlights', '20', 'ing', 'a', '2', '000', 'b', 'https', 'www', 'rapportlabs', 'kr', 'culture', 'https', 'www', 'rapportlabs', 'kr', 'product', 'https', 'www', 'rapportlabs', 'kr', 'product', 'howwework1', 'https', 'www', 'linkedin', 'com', 'company', 'rapport', 'labs', 'machine', 'learning', 'intelligence', 'squad', 'software', 'engineer', 'product', 'owner', 'product', 'designer', 'llm', 'rag', 'ai', 'a', 'to', 'python', 'sql', 'git', 'docker', 'flask', 'redis', 'ml', 'pytorch', 'huggingface', 'tensorflow', 'catboost', 'pyspark', 'polars', 'faiss', 'ml', 'airflo

In [11]:
# Token count of `text`, i.e. whatever the preceding loop cell assigned last.
print(len(text))

120
