# Crawling Berita Kompas

In [1]:
import requests, uuid, time
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, urlunparse
import pandas as pd

# Header untuk menyamarkan request agar dianggap seperti browser biasa
headers = {"User-Agent": "Mozilla/5.0"}


# Daftar Kategori berita yang akan di crawling

In [2]:
# Tambah kategori supaya data lebih banyak
category_urls = {
    "Nasional": "https://nasional.kompas.com/",
    "Ekonomi": "https://money.kompas.com/",
    "Tekno": "https://tekno.kompas.com/",
    "Otomotif": "https://otomotif.kompas.com/",
    "Health": "https://health.kompas.com/",
    "Edukasi": "https://edukasi.kompas.com/",
    "Bola": "https://bola.kompas.com/",
    "Entertainment": "https://entertainment.kompas.com/",
    "Lifestyle": "https://lifestyle.kompas.com/",
    "Travel": "https://travel.kompas.com/",
    "Internasional": "https://internasional.kompas.com/",
    "Properti": "https://properti.kompas.com/",
    "Sains": "https://sains.kompas.com/"
}


# Fungsi Utility: Normalisasi URL & Ambil Teks

In [3]:
def canonicalize(url):
    # Normalisasi URL supaya tidak ada parameter yang bikin duplikat
    p = urlparse(url)
    return urlunparse((p.scheme or "https", p.netloc, p.path.rstrip('/'), "", "", ""))

def first_text(soup, selectors):
    # Ambil teks pertama yang cocok dari list CSS selector
    for sel in selectors:
        nodes = soup.select(sel)
        if nodes:
            return " ".join([n.get_text(strip=True) for n in nodes])
    return None


# Variabel untuk Cegah Duplikat

In [4]:
seen_urls = set()

# Fungsi Parsing Detail Berita

In [5]:
def parse_news_detail(url, default_category):
    try:
        # Request detail berita
        res = requests.get(url, headers=headers, timeout=10)
        if res.status_code != 200:
            print(f"[ERROR] status {res.status_code} for {url}")
            return None
        soup = BeautifulSoup(res.text, "html.parser")

        # Ambil canonical URL
        canon_tag = soup.select_one('link[rel="canonical"]')
        final_url = canon_tag['href'] if canon_tag and canon_tag.get('href') else res.url
        url_key = canonicalize(final_url)
        if url_key in seen_urls:
            return None
        seen_urls.add(url_key)

        # Judul berita
        title = first_text(soup, ["h1.read__title", "h1.article__title", "h1.title"])
        if not title:
            m = soup.find('meta', property='og:title') or soup.find('meta', attrs={'name': 'title'})
            title = m['content'] if m and m.get('content') else ""

        # Isi berita
        content = first_text(soup, [
            "div.read__content p",
            "div.article__content p",
            "div.detail_text p",
            "div.article__lead p",
            "div._article_content p"
        ])
        if not content:
            m = soup.find('meta', property='og:description') or soup.find('meta', attrs={'name': 'description'})
            content = m['content'] if m and m.get('content') else ""

        # Kategori (ambil breadcrumb jika ada, fallback ke default_category)
        breadcrumb = [a.get_text(strip=True) for a in soup.select("a.breadcrumb__link, span.breadcrumb__link")]
        detected_kategori = breadcrumb[-1] if breadcrumb else default_category

        return {
            "id": str(uuid.uuid4()),        # ID unik
            "judul": title,                 # Judul berita
            "isi": content,                 # Isi berita
            "kategori": default_category,   # Kategori awal (hardcode)
            "detected_kategori": detected_kategori, # Kategori dari breadcrumb
            "url": final_url                # URL canonical
        }
    except Exception as e:
        print(f"[ERROR parsing] {url}: {e}")
        return None


# Fungsi Crawling per Kategori

In [6]:
def crawl_category(name, base_url, max_news_per_page=None, max_pages=20):
    print(f"[CRAWL] {name} — {base_url}")
    news_list = []
    page = 1
    while True:
        if max_pages and page > max_pages:
            break

        # URL pagination
        page_url = f"{base_url}?page={page}"
        print(f"  [PAGE {page}] {page_url}")
        res = requests.get(page_url, headers=headers, timeout=10)
        if res.status_code != 200:
            print(f"   [ERROR] Can't access {page_url}")
            break
        soup = BeautifulSoup(res.text, "html.parser")

        # Cari semua link berita
        anchors = soup.select("h3.article__title a, h4.article__title a, article a")
        if not anchors:
            print("   [INFO] No links found, stop.")
            break

        # Filter duplikat
        hrefs = []
        for a in anchors:
            href = a.get("href")
            if not href:
                continue
            full = urljoin(base_url, href)
            key = canonicalize(full)
            if key not in seen_urls and key not in hrefs:
                hrefs.append(full)

        print(f"   → Found {len(hrefs)} unique links on page {page}")

        # Ambil detail berita satu per satu
        for idx, link in enumerate(hrefs):
            if max_news_per_page and idx >= max_news_per_page:
                break
            print(f"     - Fetching {idx+1}: {link}")
            news = parse_news_detail(link, name)
            if news:
                news_list.append(news)
            time.sleep(1)  # delay 1 detik

        page += 1
    return news_list


# Hasil Crawling

In [7]:
if __name__ == "__main__":
    hasil = []
    for kategori, url in category_urls.items():
        hasil.extend(crawl_category(kategori, url, max_news_per_page=None, max_pages=20))  

    # Simpan hasil ke DataFrame
    df = pd.DataFrame(hasil)
    df = df.drop_duplicates(subset=['url'])  # hapus duplikat
    df.to_csv("hasil_crawling_kompas.csv", index=False, encoding="utf-8-sig")
    print(f"[DONE] Total {len(df)} berita disimpan ke hasil_crawling_kompas.csv")


[CRAWL] Nasional — https://nasional.kompas.com/
  [PAGE 1] https://nasional.kompas.com/?page=1
   [INFO] No links found, stop.
[CRAWL] Ekonomi — https://money.kompas.com/
  [PAGE 1] https://money.kompas.com/?page=1


   → Found 1 unique links on page 1
     - Fetching 1: https://video.kompas.com/watch/1873974/cara-klaim-diskon-tambah-daya-listrik-pln-50-persen-september-2025?source=KOMPASCOM&position=money_terkini__player_1


  [PAGE 2] https://money.kompas.com/?page=2


   → Found 0 unique links on page 2
  [PAGE 3] https://money.kompas.com/?page=3
   → Found 0 unique links on page 3
  [PAGE 4] https://money.kompas.com/?page=4


   → Found 0 unique links on page 4
  [PAGE 5] https://money.kompas.com/?page=5


   → Found 0 unique links on page 5
  [PAGE 6] https://money.kompas.com/?page=6


   → Found 0 unique links on page 6
  [PAGE 7] https://money.kompas.com/?page=7


   → Found 0 unique links on page 7
  [PAGE 8] https://money.kompas.com/?page=8
   → Found 0 unique links on page 8
  [PAGE 9] https://money.kompas.com/?page=9


   → Found 0 unique links on page 9
  [PAGE 10] https://money.kompas.com/?page=10
   → Found 0 unique links on page 10
  [PAGE 11] https://money.kompas.com/?page=11


   → Found 0 unique links on page 11
  [PAGE 12] https://money.kompas.com/?page=12


   → Found 0 unique links on page 12
  [PAGE 13] https://money.kompas.com/?page=13
   → Found 0 unique links on page 13
  [PAGE 14] https://money.kompas.com/?page=14


   → Found 0 unique links on page 14
  [PAGE 15] https://money.kompas.com/?page=15


   → Found 0 unique links on page 15
  [PAGE 16] https://money.kompas.com/?page=16
   → Found 0 unique links on page 16
  [PAGE 17] https://money.kompas.com/?page=17


   → Found 0 unique links on page 17
  [PAGE 18] https://money.kompas.com/?page=18


   → Found 0 unique links on page 18
  [PAGE 19] https://money.kompas.com/?page=19


   → Found 0 unique links on page 19
  [PAGE 20] https://money.kompas.com/?page=20


   → Found 0 unique links on page 20
[CRAWL] Tekno — https://tekno.kompas.com/
  [PAGE 1] https://tekno.kompas.com/?page=1


   → Found 29 unique links on page 1
     - Fetching 1: https://tekno.kompas.com/read/2025/09/11/10070037/iphone-17-baru-punya-fitur-yang-sudah-ada-di-hp-android-sejak-2017


     - Fetching 2: https://tekno.kompas.com/read/2025/09/11/09234927/startup-italia-bending-spoons-caplok-vimeo-senilai-rp-227-triliun


     - Fetching 3: https://tekno.kompas.com/read/2025/09/11/09020017/oppo-a6-gt-5g-resmi-meluncur-dengan-baterai-7.000-mah-dan-ram-12-gb


     - Fetching 4: https://tekno.kompas.com/read/2025/09/10/14410087/ketika-layar-120-hz-baru-masuk-iphone-17-hp-android-sudah-8-tahun-lalu


     - Fetching 5: https://tekno.kompas.com/read/2025/09/11/08010017/foto-polaroid-gemini-ai-dipeluk-idol-k-pop-viral-ini-prompt-untuk-membuatnya


     - Fetching 6: https://tekno.kompas.com/galeri/detail/557/Unboxing.dan.Hands-on.Infinix.Hot.60.Pro.Plus.Enteng.Tipis.seperti.Tak.Bawa.HP


     - Fetching 7: https://tekno.kompas.com/galeri/detail/556/Membuka.Kotak.Kemasan.Oppo.Reno.14.Pro.5G.yang.Punya.Desain.Baru


     - Fetching 8: https://tekno.kompas.com/galeri/detail/555/Unboxing.HP.Tipis.Samsung.Galaxy.S25.Edge.di.Taipei


     - Fetching 9: http://tekno.kompas.com/read/2025/09/10/08030077/lenovo-rilis-monitor-5k-lengkung-hemat-daya-thinkvision-p40wd


     - Fetching 10: http://tekno.kompas.com/read/2025/09/09/16420027/lenovo-rilis-flicklift-edit-foto-di-laptop-tak-perlu-photoshop


KeyboardInterrupt: 