In [None]:
%pip install selenium webdriver-manager beautifulsoup4 requests

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import json


class JobPlanetCrawler:
    """Load one URL in Chrome and pull job-posting summaries out of its JSON body.

    Every instance starts its own Chrome driver in ``__init__``; callers are
    responsible for calling ``quit()`` once they are done with the instance.
    """

    def __init__(self, url):
        """Remember the target URL and launch a Chrome driver.

        Args:
            url: Address that ``get_html`` will load.
        """
        self.url = url
        self.driver = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()))
        self.driver.implicitly_wait(3)

    def get_html(self):
        """Navigate the driver to ``self.url`` and return the page source."""
        self.driver.get(self.url)
        return self.driver.page_source

    def parse_json(self, html):
        """Decode the JSON text held in the first ``<pre>`` element of ``html``.

        Args:
            html: Page source as returned by ``get_html``.

        Returns:
            The decoded JSON value, or ``None`` (after printing a message) when
            there is no ``<pre>`` element, it is empty, or its text is not
            valid JSON.
        """
        soup = BeautifulSoup(html, "html.parser")
        # Look the tag up once instead of searching the document twice.
        pre_tag = soup.find("pre")
        json_text = pre_tag.text if pre_tag else ""
        if not json_text:
            print("❌ <pre> 태그를 찾을 수 없습니다.")
            return None
        try:
            return json.loads(json_text)
        except json.JSONDecodeError as exc:
            # Report malformed payloads the same way as a missing <pre>, so a
            # single bad page does not abort a multi-page crawl.
            print(f"❌ JSON 파싱에 실패했습니다: {exc}")
            return None

    def extract_job_data(self, data):
        """Collect the id and experience text of every recruit entry.

        Args:
            data: Decoded JSON mapping; entries are read from
                ``data["data"]["recruits"]``.

        Returns:
            A list of ``{"id": ..., "annual_text": ...}`` dicts. Missing or
            ``null`` fields yield ``None``; a missing or ``null`` ``data`` /
            ``recruits`` level yields an empty list.
        """
        # `.get(key, {})` only covers an absent key; `or` also covers an
        # explicit JSON null, which would otherwise raise AttributeError.
        recruits = (data.get("data") or {}).get("recruits") or []
        return [
            {
                "id": job.get("id"),
                "annual_text": (job.get("annual") or {}).get("text"),
            }
            for job in recruits
        ]

    def crawl_jobs(self):
        """Fetch the page, decode it and return the extracted job list.

        Returns:
            The list built by ``extract_job_data``, or ``[]`` when nothing
            could be decoded.
        """
        data = self.parse_json(self.get_html())
        if not data:
            return []
        return self.extract_job_data(data)

    def quit(self):
        """Shut down the Chrome driver owned by this instance."""
        self.driver.quit()


job_data = []
# Usage example: walk result pages 1..35 and accumulate the extracted postings.
if __name__ == "__main__":
    for i in range(1, 36):
        url = f"https://www.jobplanet.co.kr/api/v3/job/postings?occupation_level1=&occupation_level2=11905,11907,11904,11906,11610,11911,11609&years_of_experience=&review_score=&job_type=&city=&education_level_id=&order_by=aggressive&page={i}&page_size=8"
        crawler = JobPlanetCrawler(url)
        try:
            job_data.extend(crawler.crawl_jobs())
        finally:
            # Every JobPlanetCrawler starts its own Chrome process, so it has
            # to be closed per page; quitting only after the loop would leave
            # all but the last browser running.
            crawler.quit()

    # Uncomment to inspect the collected postings:
    # if job_data:
    #     print(json.dumps(job_data, ensure_ascii=False, indent=2))



[{'id': 1291264, 'annual_text': '경력'}, {'id': 1287688, 'annual_text': '경력'}, {'id': 1287613, 'annual_text': '경력'}, {'id': 1290553, 'annual_text': '경력'}, {'id': 1288726, 'annual_text': '경력'}, {'id': 1290918, 'annual_text': '경력'}, {'id': 1283031, 'annual_text': '경력'}, {'id': 1291090, 'annual_text': '경력'}, {'id': 1291054, 'annual_text': '경력'}, {'id': 1291048, 'annual_text': '경력'}, {'id': 1291100, 'annual_text': '경력'}, {'id': 1291099, 'annual_text': '경력'}, {'id': 1291053, 'annual_text': '경력'}, {'id': 1287485, 'annual_text': '경력'}, {'id': 1287482, 'annual_text': '경력'}, {'id': 1290554, 'annual_text': '경력'}]


In [6]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import re
from bs4 import BeautifulSoup
import time

In [7]:
def get_jobplanet_recruitment_text(obj: dict) -> list:
    """Open one posting page in Chrome and return its alphanumeric words.

    Args:
        obj: Mapping with an ``"id"`` entry identifying the posting.

    Returns:
        Lower-cased ASCII alphanumeric tokens taken from every
        ``p.recruitment-detail__txt`` element, in document order. An empty
        list is returned (after printing a message) when the page has no such
        element.
    """
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    try:
        driver.implicitly_wait(50)
        # Single quotes inside the replacement field keep this f-string valid
        # on interpreters older than Python 3.12.
        url = f"https://www.jobplanet.co.kr/job/search?posting_ids%5B%5D={obj['id']}"

        print(url)
        driver.get(url)
        # Fixed pause before reading the source; presumably gives the page
        # time to render its content -- TODO confirm it is still needed.
        time.sleep(2)

        html = driver.page_source
    finally:
        # Always release the browser, including on errors and on the
        # "no text" path below.
        driver.quit()

    soup = BeautifulSoup(html, 'html.parser')
    elements = soup.select("p.recruitment-detail__txt")

    if not elements:
        print("텍스트가 존재하지 않습니다.")
        # Honour the declared list return type so callers can iterate safely.
        return []

    return [
        word.lower()
        for elem in elements
        for word in re.findall(r'\b[a-zA-Z0-9]+\b', elem.text)
    ]

In [11]:
import job_list

# Keyword lists per category, taken from the project-local job_list module.
# write_excel pairs these with its sheet names by position, so the order here
# has to stay in step with that list.
entire_list = [
    job_list.FRONT_STACK_LIST,
    job_list.BACK_STACK_LIST,
    job_list.IOS_STACK_LIST,
    job_list.CROSS_STACK_LIST,
    job_list.ANDROID_STACK_LIST,
    job_list.GAME_STACK_LIST,
    job_list.SECURITY_STACK_LIST,
    job_list.CLOUD_STACK_LIST,
]

# Individual aliases bound to the very same list objects.
(
    front_list,
    back_list,
    ios_list,
    cross_list,
    android_list,
    game_list,
    security_list,
    cloud_list,
) = entire_list

In [12]:
def count_data(data_list, elist, count_total):
    """Record which tokens of ``data_list`` appear in ``elist``.

    Tokens and candidates are compared after stripping surrounding
    whitespace. Each matching token is stored once, keyed by the token as
    given and mapped to 1, no matter how often it occurs. The resulting dict
    is appended to ``count_total``; nothing is returned.

    Args:
        data_list: Iterable of token strings to check.
        elist: Iterable of candidate strings to match against.
        count_total: List that receives the per-call result dict.
    """
    matched = {}
    for token in data_list:
        if any(token.strip() == candidate.strip() for candidate in elist):
            # setdefault keeps the first entry, so repeats stay at 1.
            matched.setdefault(token, 1)
    count_total.append(matched)

In [20]:
import pandas as pd
from pathlib import Path  # 파일 존재 여부를 체크하기 위한 모듈

def write_excel(data_list, file_name='../data/excel/StackList.xlsx'):
    """Add one posting's keyword hits to the per-category sheets of a workbook.

    For each list in the module-level ``entire_list``, ``count_data`` builds a
    ``{keyword: 1}`` dict of the keywords present in ``data_list``. Each dict
    becomes a DataFrame indexed by ``stack`` with a ``count`` column and is
    added to the sheet of the same position in ``sheet_names``. Sheets that
    already exist are summed with the new counts; missing sheets (or a
    missing workbook) start from the new counts alone.

    Args:
        data_list: Tokens of one posting. ``None`` is treated as no tokens.
        file_name: Workbook path; defaults to the location the notebook has
            always used. Missing parent directories are created.
    """
    # A posting without text may be passed as None; count it as empty rather
    # than failing inside count_data.
    tokens = data_list if data_list is not None else []

    count_total = []
    for stack_list in entire_list:
        count_data(tokens, stack_list, count_total)

    print(count_total)

    sheet_names = ['FRONT_STACK_LIST','BACK_STACK_LIST','IOS_STACK_LIST', 'CROSS_STACK_LIST', 'ANDROID_STACK_LIST', 'GAME_STACK_LIST', 
                'SECURITY_STACK_LIST', 'CLOUD_STACK_LIST']
    new_frames = [
        pd.DataFrame(list(counts.items()), columns=['stack', 'count']).set_index('stack')
        for counts in count_total
    ]

    workbook = Path(file_name)
    if workbook.exists():
        # Open the workbook once and decide per sheet. A sheet that is absent
        # simply starts fresh, without discarding the totals of the sheets
        # that do exist, and genuine read errors are no longer swallowed.
        with pd.ExcelFile(workbook) as existing:
            present = set(existing.sheet_names)
            total_frames = [
                existing.parse(sheet, index_col=0).add(frame, fill_value=0)
                if sheet in present else frame
                for sheet, frame in zip(sheet_names, new_frames)
            ]
        # Append mode keeps any other sheets in the workbook untouched.
        writer_options = {'mode': 'a', 'if_sheet_exists': 'replace'}
    else:
        workbook.parent.mkdir(parents=True, exist_ok=True)
        total_frames = new_frames
        writer_options = {'mode': 'w'}

    with pd.ExcelWriter(workbook, engine='openpyxl', **writer_options) as writer:
        for sheet, frame in zip(sheet_names, total_frames):
            frame.to_excel(writer, sheet_name=sheet)

In [None]:
for d in job_data:
    text = get_jobplanet_recruitment_text(d)
    # A missing result (None) cannot be counted; skip that posting instead of
    # failing inside write_excel.
    if text is None:
        continue
    write_excel(text)

https://www.jobplanet.co.kr/job/search?posting_ids%5B%5D=1291264
[{}, {'python': 1, 'docker': 1, 'flask': 1, 'redis': 1, 'kubernetes': 1}, {}, {}, {}, {}, {}, {'docker': 1, 'kubernetes': 1, 'gcp': 1, 'aws': 1}]
https://www.jobplanet.co.kr/job/search?posting_ids%5B%5D=1287688
[{}, {'python': 1, 'django': 1, 'mysql': 1, 'dynamodb': 1, 'kubernetes': 1, 'redis': 1}, {}, {}, {}, {}, {'prometheus': 1}, {'kubernetes': 1, 'spinnaker': 1, 'datadog': 1, 'grafana': 1, 'prometheus': 1, 'loki': 1, 'sentry': 1, 'aws': 1}]
