In [12]:
import requests, uuid, time
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, urlunparse
import pandas as pd

headers = {"User-Agent": "Mozilla/5.0"}

# ✅ Tambah kategori supaya data lebih banyak
category_urls = {
    "Nasional": "https://nasional.kompas.com/",
    "Ekonomi": "https://money.kompas.com/",
    "Tekno": "https://tekno.kompas.com/",
    "Otomotif": "https://otomotif.kompas.com/",
    "Health": "https://health.kompas.com/",
    "Edukasi": "https://edukasi.kompas.com/",
    "Bola": "https://bola.kompas.com/",
    "Entertainment": "https://entertainment.kompas.com/",
    "Lifestyle": "https://lifestyle.kompas.com/",
    "Travel": "https://travel.kompas.com/",
    "Internasional": "https://internasional.kompas.com/",
    "Properti": "https://properti.kompas.com/",
    "Sains": "https://sains.kompas.com/"
}

def canonicalize(url):
    p = urlparse(url)
    return urlunparse((p.scheme or "https", p.netloc, p.path.rstrip('/'), "", "", ""))

def first_text(soup, selectors):
    for sel in selectors:
        nodes = soup.select(sel)
        if nodes:
            return " ".join([n.get_text(strip=True) for n in nodes])
    return None

seen_urls = set()

def parse_news_detail(url, default_category):
    try:
        res = requests.get(url, headers=headers, timeout=10)
        if res.status_code != 200:
            print(f"[ERROR] status {res.status_code} for {url}")
            return None
        soup = BeautifulSoup(res.text, "html.parser")

        # canonical URL
        canon_tag = soup.select_one('link[rel="canonical"]')
        final_url = canon_tag['href'] if canon_tag and canon_tag.get('href') else res.url
        url_key = canonicalize(final_url)
        if url_key in seen_urls:
            return None
        seen_urls.add(url_key)

        # judul
        title = first_text(soup, ["h1.read__title", "h1.article__title", "h1.title"])
        if not title:
            m = soup.find('meta', property='og:title') or soup.find('meta', attrs={'name': 'title'})
            title = m['content'] if m and m.get('content') else ""

        # isi
        content = first_text(soup, [
            "div.read__content p",
            "div.article__content p",
            "div.detail_text p",
            "div.article__lead p",
            "div._article_content p"
        ])
        if not content:
            m = soup.find('meta', property='og:description') or soup.find('meta', attrs={'name': 'description'})
            content = m['content'] if m and m.get('content') else ""

        # kategori (pakai kategori input biar konsisten)
        breadcrumb = [a.get_text(strip=True) for a in soup.select("a.breadcrumb__link, span.breadcrumb__link")]
        detected_kategori = breadcrumb[-1] if breadcrumb else default_category

        return {
            "id": str(uuid.uuid4()),
            "judul": title,
            "isi": content,
            "kategori": default_category,           # konsisten
            "detected_kategori": detected_kategori, # info tambahan
            "url": final_url
        }
    except Exception as e:
        print(f"[ERROR parsing] {url}: {e}")
        return None

def crawl_category(name, base_url, max_news_per_page=None, max_pages=20):
    print(f"[CRAWL] {name} — {base_url}")
    news_list = []
    page = 1
    while True:
        if max_pages and page > max_pages:
            break
        page_url = f"{base_url}?page={page}"
        print(f"  [PAGE {page}] {page_url}")
        res = requests.get(page_url, headers=headers, timeout=10)
        if res.status_code != 200:
            print(f"   [ERROR] Can't access {page_url}")
            break
        soup = BeautifulSoup(res.text, "html.parser")

        # ambil semua link berita
        anchors = soup.select("h3.article__title a, h4.article__title a, article a")
        if not anchors:
            print("   [INFO] No links found, stop.")
            break

        # filter & canonicalize
        hrefs = []
        for a in anchors:
            href = a.get("href")
            if not href:
                continue
            full = urljoin(base_url, href)
            key = canonicalize(full)
            if key not in seen_urls and key not in hrefs:
                hrefs.append(full)

        print(f"   → Found {len(hrefs)} unique links on page {page}")

        for idx, link in enumerate(hrefs):
            if max_news_per_page and idx >= max_news_per_page:
                break
            print(f"     - Fetching {idx+1}: {link}")
            news = parse_news_detail(link, name)
            if news:
                news_list.append(news)
            time.sleep(1)

        page += 1
    return news_list

if __name__ == "__main__":
    hasil = []
    for kategori, url in category_urls.items():
        hasil.extend(crawl_category(kategori, url, max_news_per_page=None, max_pages=20))  # ambil lebih banyak

    df = pd.DataFrame(hasil)
    df = df.drop_duplicates(subset=['url'])
    df.to_csv("kompas_news_big.csv", index=False, encoding="utf-8-sig")
    print(f"[DONE] Total {len(df)} berita disimpan ke kompas_news_big.csv")


[CRAWL] Nasional — https://nasional.kompas.com/
  [PAGE 1] https://nasional.kompas.com/?page=1
   [INFO] No links found, stop.
[CRAWL] Ekonomi — https://money.kompas.com/
  [PAGE 1] https://money.kompas.com/?page=1
   → Found 1 unique links on page 1
     - Fetching 1: https://video.kompas.com/watch/1873974/cara-klaim-diskon-tambah-daya-listrik-pln-50-persen-september-2025?source=KOMPASCOM&position=money_terkini__player_1
  [PAGE 2] https://money.kompas.com/?page=2
   → Found 0 unique links on page 2
  [PAGE 3] https://money.kompas.com/?page=3
   → Found 0 unique links on page 3
  [PAGE 4] https://money.kompas.com/?page=4
   → Found 0 unique links on page 4
  [PAGE 5] https://money.kompas.com/?page=5
   → Found 0 unique links on page 5
  [PAGE 6] https://money.kompas.com/?page=6
   → Found 0 unique links on page 6
  [PAGE 7] https://money.kompas.com/?page=7
   → Found 0 unique links on page 7
  [PAGE 8] https://money.kompas.com/?page=8
   → Found 0 unique links on page 8
  [PAGE 9] htt