In [13]:
import requests
from bs4 import BeautifulSoup
import json
import logging

# Logging setup: send INFO-and-above records through a stream handler,
# formatted as "<timestamp> - <level> - <message>".
logging.basicConfig(
    handlers=[logging.StreamHandler()],
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

In [76]:
def _parse_review(review):
    """Extract rating, title, status and info from one review element.

    Args:
        review: A parsed ``div.review_item`` element (any object exposing the
            BeautifulSoup ``find`` / ``get_text`` API).

    Returns:
        dict with the keys ``"review"`` (the rating as a float), ``"title"``,
        ``"status"`` and ``"info"``.

    Raises:
        AttributeError: If an expected child element is missing (``find``
            returned ``None``).
        ValueError: If the rating text cannot be converted to a float.
    """
    review_content = review.find("div", class_="review_item_inr")

    raw_rating = (
        review_content.find("div", class_="rating")
        .find("strong", class_="num")
        .get_text(strip=True)
    )
    # The rating text may carry a "Rating Score" label; strip it before parsing.
    rating = float(raw_rating.replace("Rating Score", "").strip())

    title = review_content.find("h3", class_="rvtit").get_text(strip=True)

    # Look the author block up once; both status and info are derived from it.
    auth = review_content.find("div", class_="auth")
    raw_status = auth.find("strong").get_text(strip=True)
    status = raw_status.replace("Verified User", "").strip()
    info = auth.get_text(strip=True).replace("Verified User", "").strip()

    # Removing redundant "status" from info text
    info = info.replace(status, "").strip()

    return {
        # NOTE: the key is "review" although the value is the numeric rating;
        # kept as-is so previously written JSON files keep the same schema.
        "review": rating,
        "title": title,
        "status": status,
        "info": info,
    }


def crawl_reviews(base_url, max_pages, output_file):
    """Crawl paginated review listings and save the parsed reviews as JSON.

    Pages ``1..max_pages`` are requested from ``f"{base_url}{page}"``. Every
    ``div.review_item`` on a successfully fetched page is parsed and the
    collected records are written to ``output_file`` as a JSON list.

    A page that cannot be fetched (network error, timeout, or non-200 status)
    is logged and skipped, and a review with missing or malformed fields is
    logged and skipped, so a single failure does not discard the reviews that
    were already collected.

    Args:
        base_url: URL prefix to which the page number is appended.
        max_pages: Number of pages to request, starting at page 1.
        output_file: Path of the JSON file to write (UTF-8, non-ASCII kept).

    Returns:
        None. Results are written to ``output_file``.
    """
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/109.0.0.0 Safari/537.36"
        )
    }

    all_reviews = []
    logging.info(f"Starting to crawl reviews from {base_url}")

    for page in range(1, max_pages + 1):
        url = f"{base_url}{page}"
        logging.info(f"Fetching page {page}: {url}")

        # Timeouts / connection errors must not abort the whole crawl,
        # otherwise nothing collected so far would ever reach the output file.
        try:
            response = requests.get(url, headers=headers, timeout=10)
        except requests.RequestException as exc:
            logging.error(f"Failed to fetch page {page}: {exc}")
            continue

        if response.status_code != 200:
            logging.error(f"Failed to fetch page {page}, status code: {response.status_code}")
            continue

        soup = BeautifulSoup(response.content, "html.parser")
        reviews = soup.find_all("div", class_="review_item")
        logging.info(f"Found {len(reviews)} reviews on page {page}")

        for index, review in enumerate(reviews, start=1):
            try:
                parsed = _parse_review(review)
            except AttributeError:
                logging.warning(f"Skipping a review on page {page}, review index {index}, due to missing elements.")
                continue
            except ValueError:
                # Rating text was present but not numeric.
                logging.warning(f"Skipping a review on page {page}, review index {index}, due to an unparsable rating.")
                continue

            all_reviews.append(parsed)
            logging.info(f"Processed review {index} on page {page}: {parsed['title']}")

    # Save data to JSON
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(all_reviews, f, ensure_ascii=False, indent=4)

    logging.info(f"Saved {len(all_reviews)} reviews to {output_file}")

In [77]:
# Crawl settings consumed by the crawl_reviews(...) call in the next cell.
output_file = "hyundai_reviews.json"  # JSON file the collected reviews are written to
max_pages = 40  # how many listing pages to request, starting from page 1
# Listing URL prefix; the page number is appended directly after "page=".
base_url = "https://www.teamblind.com/kr/company/현대자동차/reviews?page="

In [78]:
# Run the crawl with the settings defined in the previous cell; progress is reported via logging.
crawl_reviews(base_url=base_url, max_pages=max_pages, output_file=output_file)

2025-01-14 15:51:21,159 - INFO - Starting to crawl reviews from https://www.teamblind.com/kr/company/현대자동차/reviews?page=
2025-01-14 15:51:21,160 - INFO - Fetching page 1: https://www.teamblind.com/kr/company/현대자동차/reviews?page=1
2025-01-14 15:51:21,586 - INFO - Found 30 reviews on page 1
2025-01-14 15:51:21,587 - INFO - Processed review 1 on page 1: “연봉 속이는 최악의 회사(신입4천대/대리5천대), 7년째 매년 하락중”
2025-01-14 15:51:21,587 - INFO - Processed review 2 on page 1: “인재 멸시의 현대자동차”
2025-01-14 15:51:21,588 - INFO - Processed review 3 on page 1: “진실된 리뷰는 '추천수'로 확인하세요”
2025-01-14 15:51:21,588 - INFO - Processed review 4 on page 1: “미래가없는 회사, 꿀빠는게 답이다라고 강요하는 회사”
2025-01-14 15:51:21,589 - INFO - Processed review 5 on page 1: “회사가 잘나가도 입사후부터 연봉이 수직하강함”
2025-01-14 15:51:21,589 - INFO - Processed review 6 on page 1: “연구원들을 일회용품 쓰듯하는 회사”
2025-01-14 15:51:21,590 - INFO - Processed review 7 on page 1: “갈수록 연봉이 주는 회사”
2025-01-14 15:51:21,590 - INFO - Processed review 8 on page 1: “커리어의 종착역 ”
2025-01-14 15:51:21,5