In [1]:
! python -V

Python 3.10.18


In [2]:
import requests
from bs4 import BeautifulSoup
import csv, json, time
from urllib.parse import urljoin

BASE_URL = "http://quotes.toscrape.com/"

def crawl():
    url = BASE_URL
    results = []
    while url:
        resp = requests.get(url)
        soup = BeautifulSoup(resp.text, "html.parser")
        
        # 명언, 저자, 태그 추출
        for q in soup.select("div.quote"):
            text = q.select_one("span.text").get_text(strip=True).strip("“”")
            author = q.select_one("small.author").get_text(strip=True)
            tags = [t.get_text(strip=True) for t in q.select("a.tag")]
            results.append({"quote": text, "author": author, "tags": tags})
        
        # 다음 페이지 확인
        next_link = soup.select_one("li.next a")
        url = urljoin(url, next_link["href"]) if next_link else None
        time.sleep(0.5)  # 예의상 대기
    return results

if __name__ == "__main__":
    data = crawl()
    print(f"총 {len(data)}개 수집 완료")

    # CSV 저장
    with open("quotes.csv", "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["quote", "author", "tags"])
        for row in data:
            writer.writerow([row["quote"], row["author"], "|".join(row["tags"])])

    # JSONL 저장
    with open("quotes.jsonl", "w", encoding="utf-8") as f:
        for row in data:
            json.dump(row, f, ensure_ascii=False)
            f.write("\n")


총 100개 수집 완료


In [3]:
import requests
from bs4 import BeautifulSoup
import csv

# 사이트 URL
url = 'https://books.toscrape.com/'

# 요청 보내기
response = requests.get(url)
response.raise_for_status()  # 오류 발생 시 예외

# HTML 파싱
soup = BeautifulSoup(response.text, 'html.parser')

# 책 리스트 가져오기
books = soup.select('section > div:nth-child(2) > ol > li')

# 별점과 제목 추출
book_data = []
for book in books:
    title_tag = book.select_one('h3 > a')
    rating_tag = book.select_one('p.star-rating')
    
    if title_tag and rating_tag:
        title = title_tag['title']  # title 속성에 책 제목이 있음
        # 별점은 클래스에서 추출 (예: 'star-rating Three')
        rating_class = rating_tag.get('class', [])
        rating = rating_class[1] if len(rating_class) > 1 else 'None'
        
        book_data.append([title, rating])

# CSV 파일로 저장
with open('books.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['Title', 'Rating'])  # 헤더
    writer.writerows(book_data)

print('CSV 파일 저장 완료: books.csv')


CSV 파일 저장 완료: books.csv


In [7]:
! pip install selenium webdriver-manager

Collecting webdriver-manager
  Using cached webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting python-dotenv (from webdriver-manager)
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Using cached webdriver_manager-4.0.2-py2.py3-none-any.whl (27 kB)
Downloading python_dotenv-1.1.1-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv, webdriver-manager
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [webdriver-manager]
[1A[2KSuccessfully installed python-dotenv-1.1.1 webdriver-manager-4.0.2


In [18]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
IMDb Top 250 (chart/top) 수집 스크립트
- 대상: https://www.imdb.com/chart/top/
- 1순위: JSON-LD(ItemList) 파싱
- 2순위: 테이블 마크업 파싱(폴백)
- 출력: imdb_top250.csv, imdb_top250.jsonl
"""

import time, re, sys, csv, json
from typing import List, Dict, Optional
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup

URL = "https://www.imdb.com/chart/top/"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (training-bot; +https://example.com) Requests/2.x",
    "Accept-Language": "en-US,en;q=0.9,ko;q=0.8",
}
SLEEP = 0.5
TIMEOUT = 15
RETRIES = 3
BACKOFF = 1.7

OUT_CSV = "imdb_top250.csv"
OUT_JSONL = "imdb_top250.jsonl"

def fetch_html(url: str) -> str:
    last = None
    for i in range(1, RETRIES + 1):
        try:
            r = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
            r.raise_for_status()
            return r.text
        except Exception as e:
            last = e
            time.sleep(BACKOFF ** (i - 1))
    raise RuntimeError(f"Fetch failed: {url} ({last})")

def _to_int(s: Optional[str]) -> Optional[int]:
    if not s: return None
    m = re.search(r"\b(19|20)\d{2}\b", s) or re.search(r"\d+", s)
    return int(m.group(0)) if m else None

def _to_float(s: Optional[str]) -> Optional[float]:
    if not s: return None
    m = re.search(r"\d+(?:\.\d+)?", s)
    return float(m.group(0)) if m else None

# 1) JSON-LD 우선 파싱
def parse_jsonld(html: str, base_url: str) -> List[Dict]:
    soup = BeautifulSoup(html, "html.parser")
    items: List[Dict] = []
    for tag in soup.select('script[type="application/ld+json"]'):
        txt = tag.string or ""
        if not txt.strip():
            continue
        try:
            data = json.loads(txt)
        except Exception:
            continue
        blocks = data if isinstance(data, list) else [data]
        for b in blocks:
            if not isinstance(b, dict):
                continue
            # ItemList 혹은 CollectionPage 내부의 itemListElement
            if b.get("@type") in ("ItemList", "CollectionPage"):
                elements = b.get("itemListElement") or []
                for pos, el in enumerate(elements, start=1):
                    node = el.get("item") if isinstance(el, dict) else el
                    if not isinstance(node, dict):
                        continue
                    title = node.get("name") or ""
                    url = node.get("url") or ""
                    agg = node.get("aggregateRating") or {}
                    rating = agg.get("ratingValue")
                    year = _to_int(node.get("datePublished") or node.get("copyrightYear"))
                    if not url.startswith("http"):
                        url = urljoin(base_url, url)
                    if title:
                        items.append({
                            "rank": pos,
                            "title": title,
                            "year": year,
                            "rating": float(rating) if rating else None,
                            "title_url": url,
                            "source_page": base_url
                        })
    return items

# 2) 테이블 폴백 파싱
def parse_table(html: str, page_url: str) -> List[Dict]:
    soup = BeautifulSoup(html, "html.parser")
    rows = soup.select("table tbody tr") or soup.select("tbody tr")
    out: List[Dict] = []
    for i, tr in enumerate(rows, start=1):
        a = tr.select_one(".titleColumn a, a[href*='/title/']")
        title = a.get_text(strip=True) if a else ""
        href = a.get("href") if a else ""
        url = urljoin(page_url, href) if href else ""
        year_el = tr.select_one(".titleColumn span, span.secondaryInfo")
        year = _to_int(year_el.get_text(strip=True) if year_el else "")
        rating_el = tr.select_one(".imdbRating strong, td.ratingColumn strong, strong")
        rating = _to_float(rating_el.get_text(strip=True) if rating_el else "")
        if title:
            out.append({
                "rank": i,
                "title": title,
                "year": year,
                "rating": rating,
                "title_url": url,
                "source_page": page_url
            })
    return out

def save_csv(rows: List[Dict], path: str = OUT_CSV) -> None:
    fields = ["rank", "title", "year", "rating", "title_url", "source_page"]
    with open(path, "w", newline="", encoding="utf-8") as f:
        w = csv.writer(f)
        w.writerow(fields)
        for r in rows:
            w.writerow([r.get(k, "") for k in fields])

def save_jsonl(rows: List[Dict], path: str = OUT_JSONL) -> None:
    with open(path, "w", encoding="utf-8") as f:
        for r in rows:
            json.dump(r, f, ensure_ascii=False)
            f.write("\n")

def main():
    print("[INFO] chart/top 수집 시작:", URL)
    html = fetch_html(URL)

    # 1) JSON-LD 먼저
    data = parse_jsonld(html, URL)
    if data:
        print(f"[INFO] JSON-LD {len(data)}개")
    else:
        # 2) 폴백: 테이블 파싱
        data = parse_table(html, URL)
        print(f"[INFO] table {len(data)}개")

    if not data:
        raise RuntimeError("아이템을 찾지 못했습니다. 선택자를 다시 확인하세요.")

    # 샘플 출력
    for r in data[:10]:
        print(f"{r['rank']:>3}. {r['title']} ({r.get('year')}) ★{r.get('rating')}")

    save_csv(data, OUT_CSV)
    save_jsonl(data, OUT_JSONL)
    print(f"[INFO] 저장 완료 -> CSV: {OUT_CSV}, JSONL: {OUT_JSONL}")

if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        print("[ERROR]", e, file=sys.stderr)
        sys.exit(1)


[INFO] chart/top 수집 시작: https://www.imdb.com/chart/top/
[INFO] JSON-LD 250개
  1. The Shawshank Redemption (None) ★9.3
  2. The Godfather (None) ★9.2
  3. The Dark Knight (None) ★9.1
  4. The Godfather Part II (None) ★9.0
  5. 12 Angry Men (None) ★9.0
  6. The Lord of the Rings: The Return of the King (None) ★9.0
  7. Schindler&apos;s List (None) ★9.0
  8. Pulp Fiction (None) ★8.8
  9. The Lord of the Rings: The Fellowship of the Ring (None) ★8.9
 10. Il buono, il brutto, il cattivo (None) ★8.8
[INFO] 저장 완료 -> CSV: imdb_top250.csv, JSONL: imdb_top250.jsonl
