<a href="https://colab.research.google.com/github/renaldoaluska/pbagasal2025-klp11-garuda/blob/main/Scrapping%20Garuda%20Indonesia%20News%20Article/Scrapping%20Garuda%20Indonesia%20News%20Content/Scrap_Garuda_Indonesia_News_Content.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install pandas tqdm requests newspaper3k trafilatura readability-lxml justext lxml-html-clean

Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl.metadata (11 kB)
Collecting trafilatura
  Downloading trafilatura-2.0.0-py3-none-any.whl.metadata (12 kB)
Collecting readability-lxml
  Downloading readability_lxml-0.8.4.1-py3-none-any.whl.metadata (4.0 kB)
Collecting justext
  Downloading justext-3.0.2-py2.py3-none-any.whl.metadata (7.3 kB)
Collecting lxml-html-clean
  Downloading lxml_html_clean-0.4.2-py3-none-any.whl.metadata (2.4 kB)
Collecting cssselect>=0.9.2 (from newspaper3k)
  Downloading cssselect-1.3.0-py3-none-any.whl.metadata (2.6 kB)
Collecting feedparser>=5.2.1 (from newspaper3k)
  Downloading feedparser-6.0.12-py3-none-any.whl.metadata (2.7 kB)
Collecting tldextract>=2.0.1 (from newspaper3k)
  Downloading tldextract-5.3.0-py3-none-any.whl.metadata (11 kB)
Collecting feedfinder2>=0.0.4 (from newspaper3k)
  Downloading feedfinder2-0.0.4.tar.gz (3.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jieba3k>=0.35.1 (from newspaper3k)


In [7]:
# ==== (1) Install dependensi (jalankan sekali) ====
# Di Colab aktifkan baris pip berikut; di environment lokal bisa pip install lewat terminal.
# !pip install pandas tqdm requests newspaper3k trafilatura readability-lxml justext lxml-html-clean

# ==== (2) Import ====
import os
import re
import time
import math
import random
import warnings
import pandas as pd
import requests
from tqdm import tqdm

# parser utama & fallback
from newspaper import Article
import trafilatura
from readability import Document
from lxml import html, etree

warnings.filterwarnings("ignore", category=UserWarning)

# ==== (3) Configuration ====
INPUT_CSV = "data_link_berita.csv"  # change this if your input file has a different name
OUTPUT_CSV = None  # leave as None to derive the output path automatically from INPUT_CSV
LINK_COL = None     # leave as None to use the first column as the link column
TITLE_COL_NAMES = ["judul", "title", "headline"]  # candidate names for the title column
# Passed as `timeout` to requests.get in fetch_html.
TIMEOUT = 15
# Number of download attempts fetch_html makes per URL.
RETRIES = 3
SLEEP_BASE = 1.2     # base delay between requests (to avoid HTTP 429)
SLEEP_JITTER = (0.0, 0.8)  # (low, high) bounds of the extra random delay
USER_AGENTS = [
    # rotate between a few user agents to be safer
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118 Safari/537.36",
]

In [4]:
# ==== (4) Util: pilih kolom link & judul ====
def pick_link_col(df: pd.DataFrame, prefer=LINK_COL):
    """Return the name of the column that holds the article links.

    `prefer` is used when it is truthy and names an existing column of `df`;
    in every other case the first column of `df` is returned.
    """
    use_preferred = bool(prefer) and prefer in df.columns
    return prefer if use_preferred else df.columns[0]

def find_title_col(df: pd.DataFrame):
    """Locate the title column of `df`, ignoring letter case.

    The candidates in TITLE_COL_NAMES are tried in order. The column name is
    returned exactly as it appears in `df`; None means no candidate matched.
    """
    by_lowercase = {}
    for column in df.columns:
        by_lowercase[column.lower()] = column
    return next(
        (by_lowercase[candidate] for candidate in TITLE_COL_NAMES if candidate in by_lowercase),
        None,  # no title column present
    )

# ==== (5) Fungsi ambil HTML mentah dengan retry ====
def fetch_html(url: str) -> str | None:
    """Download the raw HTML of `url`, retrying on failure.

    Up to RETRIES attempts are made, each with a randomly chosen User-Agent
    and a TIMEOUT timeout. Between attempts the function sleeps for
    SLEEP_BASE * attempt plus a random jitter drawn from SLEEP_JITTER
    (a linear backoff). No sleep follows the final attempt.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded page text, or None when every attempt failed. In the
        failure case the last error is reported through a RuntimeWarning
        instead of being silently dropped.
    """
    last_err = None
    for attempt in range(1, RETRIES + 1):
        try:
            headers = {
                "User-Agent": random.choice(USER_AGENTS),
                "Accept-Language": "id,en;q=0.8",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            }
            resp = requests.get(url, headers=headers, timeout=TIMEOUT)
            if resp.status_code == 429:
                # Rate limited: remember why and fall through to the backoff below.
                last_err = "HTTP 429 (rate limited)"
            else:
                resp.raise_for_status()
                # When the server sends no charset, requests assumes ISO-8859-1
                # for text/* responses, which garbles UTF-8 pages. Use the
                # encoding detected from the body in that case.
                if not resp.encoding or resp.encoding.lower() == "iso-8859-1":
                    resp.encoding = resp.apparent_encoding
                return resp.text
        except Exception as e:
            # Best effort: any failure (network, HTTP status, bad URL) is retried.
            last_err = e
        if attempt < RETRIES:
            time.sleep(SLEEP_BASE * attempt + random.uniform(*SLEEP_JITTER))
    warnings.warn(
        f"fetch_html: giving up on {url!r}: {last_err!r}",
        RuntimeWarning,
        stacklevel=2,
    )
    return None

# ==== (6) Ekstraksi judul & konten via beberapa metode ====
def parse_with_newspaper(url: str):
    """Download and parse `url` with newspaper's Article.

    Returns a (title, text) pair of stripped strings, where a missing field
    becomes an empty string. If anything raises, (None, None) is returned.
    """
    try:
        article = Article(url, keep_article_html=False, fetch_images=False)
        article.download()
        article.parse()
        fields = (article.title, article.text)
        return tuple((value or "").strip() for value in fields)
    except Exception:
        return None, None

def parse_with_trafilatura(url: str, html_text: str | None = None):
    """Extract (title, text) from a page using trafilatura.

    Args:
        url: Page address; only fetched when `html_text` is empty or None.
        html_text: Already-downloaded HTML, to avoid a second request.

    Returns:
        (title, text) as stripped strings (empty when not found), or
        (None, None) when nothing could be downloaded or text extraction
        raised.
    """
    try:
        downloaded = html_text or trafilatura.fetch_url(url)
        if not downloaded:
            return None, None
        text = trafilatura.extract(downloaded, include_comments=False, include_tables=False)
    except Exception:
        return None, None

    # The title is only a nice-to-have: a failure while reading metadata must
    # not discard body text that was already extracted successfully.
    try:
        meta = trafilatura.metadata.extract_metadata(downloaded)
        title_txt = (meta.title if meta and meta.title else "").strip()
    except Exception:
        title_txt = ""
    return title_txt, (text or "").strip()

def parse_with_readability(html_text: str):
    """Extract (title, text) from raw HTML using readability's Document.

    The summary HTML is reduced to plain text by joining every non-blank
    text node with a newline. Returns (None, None) if anything raises.
    """
    try:
        document = Document(html_text)
        headline = (document.short_title() or "").strip()
        fragment = html.fromstring(document.summary())
        # Strip the markup: keep only the non-blank text nodes.
        pieces = []
        for node_text in fragment.xpath("//text()"):
            if node_text and node_text.strip():
                pieces.append(node_text.strip())
        return headline, "\n".join(pieces).strip()
    except Exception:
        return None, None

def clean_text(txt: str) -> str:
    """Normalize whitespace in extracted article text.

    - Trailing whitespace at the end of a line is removed.
    - Runs of three or more newlines shrink to one blank line.
    - Runs of spaces/tabs shrink to a single space.
    - Leading and trailing whitespace is stripped.

    Args:
        txt: Text to clean; None or an empty string is accepted.

    Returns:
        The cleaned text, or "" for falsy input.
    """
    if not txt:
        return ""
    # Match only non-newline whitespace before a line break. Using `\s+` here
    # would also swallow the newlines themselves and flatten every paragraph
    # break into a single "\n", leaving the next rule with nothing to match.
    txt = re.sub(r"[^\S\n]+\n", "\n", txt)
    txt = re.sub(r"\n{3,}", "\n\n", txt)
    txt = re.sub(r"[ \t]{2,}", " ", txt)
    return txt.strip()

def extract_article(url: str, min_words: int = 60):
    """Extract the title and body text of the article at `url`.

    Extractors are tried in order: newspaper3k, then (on one shared download
    of the page) trafilatura and readability. The first body with at least
    `min_words` whitespace-separated words wins. The title comes from the
    earliest extractor that produced a non-empty one.

    If no extractor reaches `min_words`, the body with the most words among
    all attempts is returned (possibly empty) rather than only newspaper3k's
    result, so partial content from the fallbacks is not thrown away.

    Args:
        url: Article address.
        min_words: Minimum word count for a body to be accepted outright.

    Returns:
        (title, text), both passed through clean_text.
    """
    def _long_enough(body) -> bool:
        # True when `body` is non-empty and has at least `min_words` words.
        return bool(body) and len(body.split()) >= min_words

    # 1) newspaper3k
    title, text = parse_with_newspaper(url)
    if _long_enough(text):
        return clean_text(title), clean_text(text)

    # Every (title, text) attempt, in priority order, kept for the final fallback.
    candidates = [(title, text)]

    # 2) download the HTML once and share it between the fallback extractors
    html_text = fetch_html(url)
    if html_text:
        # 2a) trafilatura
        t2_title, t2_text = parse_with_trafilatura(url, html_text)
        candidates.append((t2_title, t2_text))
        if _long_enough(t2_text):
            return clean_text(title or t2_title), clean_text(t2_text)

        # 2b) readability
        r_title, r_text = parse_with_readability(html_text)
        candidates.append((r_title, r_text))
        if _long_enough(r_text):
            return clean_text(title or t2_title or r_title), clean_text(r_text)

    # Nothing met the threshold: return the best of what was found.
    # max() keeps the earliest candidate on ties, so newspaper3k stays preferred.
    best_title = next((t for t, _ in candidates if t), "")
    best_text = max((b or "" for _, b in candidates), key=lambda b: len(b.split()))
    return clean_text(best_title), clean_text(best_text)

# ==== (7) Proses CSV ====
# Load the list of links and work out which columns hold the link and the title.
df = pd.read_csv(INPUT_CSV)
link_col = pick_link_col(df)
title_col = find_title_col(df)

# Prepare the result columns.
if title_col is None:
    title_col = "judul"
    if title_col not in df.columns:
        df[title_col] = ""
# A title column that is entirely empty in the CSV is read as float64 (NaN);
# cast it to object so scraped string titles can be written into it.
df[title_col] = df[title_col].astype(object)

df["konten"] = ""

# Surrounding whitespace would make the "http" prefix check reject a valid link.
urls = df[link_col].astype(str).str.strip().tolist()

# Fetch every article. Rows are addressed by index label rather than assuming
# that labels equal positions.
for row_label, url in tqdm(zip(df.index, urls), total=len(urls), desc="Mengambil konten artikel"):
    # Skips missing values too: NaN becomes the string "nan" after astype(str).
    if not url.startswith("http"):
        continue
    title, content = extract_article(url)
    # Fill in the title only when the CSV did not already provide one.
    existing_title = df.at[row_label, title_col]
    if not isinstance(existing_title, str) or not existing_title.strip():
        df.at[row_label, title_col] = title
    df.at[row_label, "konten"] = content
    # Short pause between articles (to avoid HTTP 429).
    time.sleep(SLEEP_BASE + random.uniform(*SLEEP_JITTER))

# ==== (8) Letakkan 'konten' tepat di sebelah 'judul' ====
def move_column_next_to(df: pd.DataFrame, col_to_move: str, target_col: str, after=True):
    """Return `df` with `col_to_move` placed directly beside `target_col`.

    Args:
        df: Source frame; it is not modified.
        col_to_move: Name of the column to relocate.
        target_col: Name of the column to place it next to.
        after: Place the column after `target_col` when true, before it otherwise.

    Returns:
        A frame with the reordered columns. `df` itself is returned unchanged
        when either column is missing or when both names are the same column.
    """
    cols = list(df.columns)
    # Moving a column next to itself is a no-op. Without this guard the column
    # is removed below and the lookup of `target_col` raises ValueError.
    if col_to_move == target_col or col_to_move not in cols or target_col not in cols:
        return df
    cols.remove(col_to_move)
    idx = cols.index(target_col) + (1 if after else 0)
    cols.insert(idx, col_to_move)
    return df[cols]

df = move_column_next_to(df, "konten", title_col, after=True)

# ==== (9) Save ====
if OUTPUT_CSV is None:
    base, ext = os.path.splitext(INPUT_CSV)
    # The default output is a CSV, so give it a .csv name. Writing CSV bytes
    # into a file named *.xlsx yields a file that Excel refuses to open.
    OUTPUT_CSV = f"{base}_with_content.csv"

# Pick the writer from the extension so the file content always matches its name.
if os.path.splitext(OUTPUT_CSV)[1].lower() == ".xlsx":
    df.to_excel(OUTPUT_CSV, index=False)
else:
    df.to_csv(OUTPUT_CSV, index=False)
print(f"Selesai! Tersimpan: {OUTPUT_CSV}")

Mengambil konten artikel: 100%|██████████| 614/614 [42:42<00:00,  4.17s/it]

Selesai! Tersimpan: data_link_berita_with_content.csv



