In [35]:
# %pip install requests
# %pip install beautifulsoup4
# %pip install pandas lxml


In [36]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
from datetime import datetime
import time

In [37]:

# Listing page for articles tagged "politik" on tribunnews.com.
BASE_URL = "https://www.tribunnews.com/tag/politik"

# Headers sent with every HTTP request; the User-Agent imitates a desktop browser.
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}


In [38]:
def get_article_content(url):
    """Download one article page and return its body text as a single string.

    The page is fetched with the module-level ``HEADERS`` and a 10 second
    timeout, then parsed with the ``lxml`` parser. The article body is taken
    from the first matching container selector, and its ``<p>`` texts are
    filtered and joined with single spaces.

    Args:
        url: Address of the article page to fetch.

    Returns:
        The joined paragraph text, or ``None`` when the response status is not
        200, no body container is found, no paragraph survives filtering, or
        any exception is raised (the exception is printed, not re-raised).
    """
    # Candidate containers for the article body, in priority order.
    body_selectors = (
        "div.read__content",
        "div#article-content",
        "div.side-article",
        "article",
    )
    # Lower-cased prefixes of credit lines (reporter / editor / writer).
    credit_prefixes = ("reporter", "editor", "penulis")

    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        if response.status_code != 200:
            return None

        page = BeautifulSoup(response.text, "lxml")

        # Lazily try each selector and stop at the first one that matches.
        container = next(
            (node for node in map(page.select_one, body_selectors) if node),
            None,
        )
        if container is None:
            return None

        paragraph_texts = (
            para.get_text(" ", strip=True) for para in container.find_all("p")
        )
        # Keep paragraphs of at least 30 characters that are neither
        # "Baca juga" ("read also") cross-links nor credit lines.
        kept = [
            text
            for text in paragraph_texts
            if len(text) >= 30
            and "Baca juga" not in text
            and not text.lower().startswith(credit_prefixes)
        ]

        return " ".join(kept) if kept else None

    except Exception as e:
        # Best-effort scraping: report the problem and let the caller skip this article.
        print("Error article:", e)
        return None


In [44]:
# Crawl the tag listing pages, fetch each linked article, and save the results to CSV.
print("Scraping Tribunnews Politik...")
MAX_PAGE = 10  # number of listing pages to crawl
data = []
seen_urls = set()  # article URLs already handled, so repeats across pages are skipped

for page in range(1, MAX_PAGE + 1):
    url = f"{BASE_URL}?page={page}"
    print("Scraping page:", page)

    try:
        # A timeout keeps a stalled connection from hanging the whole run;
        # 10 seconds matches the limit used in get_article_content.
        r = requests.get(url, headers=HEADERS, timeout=10)
        r.raise_for_status()
    except requests.RequestException as e:
        # One bad listing page must not abort the crawl and lose `data`.
        print("Skip page:", e)
        continue

    soup = BeautifulSoup(r.text, "lxml")

    # Each listing entry is an <li class="ptb15"> holding an <h3><a> title link.
    articles = soup.select("li.ptb15")

    for art in articles:
        try:
            title_tag = art.select_one("h3 a")
            if not title_tag:
                continue

            title = title_tag.get_text(strip=True)
            link = title_tag["href"]

            if link in seen_urls:
                continue
            seen_urls.add(link)

            print("Scraping:", title)

            content = get_article_content(link)
            # Pause after every article request, including ones that returned
            # nothing, so failed fetches do not hit the server back to back.
            time.sleep(2)
            if not content:
                continue

            data.append({
                "title": title,
                "url": link,
                "content": content
            })
        except Exception as e:
            print("Skip article:", e)

print("Total articles scraped:", len(data))
# Explicit columns keep the CSV header present even when nothing was scraped.
df = pd.DataFrame(data, columns=["title", "url", "content"])
df.to_csv(
    "tribunnews_politik_full.csv",
    index=False,
    encoding="utf-8-sig"
)

print("Berhasil disimpan ke tribunnews_politik_full.csv")
print(df.head())


Scraping Tribunnews Politik...
Scraping page: 1
Scraping: Tebar Pesona di Tengah Banjir, Masih Efektifkah Politik Pencitraan?
Scraping: PSI Puji Prabowo Akui Keberhasilan Jokowi: Padahal Musuh Politik 2 Kali Pilpres
Scraping: Nahdlatul Ulama Diharapkan Tetap Menjadi Rumah Besar Umat, Bukan Panggung Politik
Scraping: Said Abdullah Sebut Regenerasi Pemilih Muda Jadi Tantangan Masa Depan Politik
Scraping: Ketua PSI Jabar: Bakal Ada Badai Politik Usai Hadirnya Jokowi
Scraping: Al Araf Ingatkan Penyimpangan Peran TNI: Jangan Sampai Kalah Perang karena Tentara Sibuk Urus Pangan
Scraping: Pernyataan Prabowo Tak Takut Dikendalikan Jokowi Dinilai Jadi Strategi untuk Menenangkan Publik
Scraping: Realisme Politik dan Penegakan Hukum Pemilu
Scraping: Partai Perindo Akan Menggelar Rakernas di Ancol Jakarta, Dorong Politik Inklusif
Scraping: Parpol Diminta Ciptakan Ekosistem Politik, Bukan Sekadar Mobilisasi Masyarakat Setiap 5 Tahun
Scraping: Hari Sumpah Pemuda, Megawati Soekarnoputri Minta Anak Mu

In [45]:
# Report how many article records are currently held in `data`.
scraped_total = len(data)
print("Total articles scraped:", scraped_total)

Total articles scraped: 199


In [41]:
# df = pd.DataFrame(data)
# df.to_csv(
#     "tribunnews_politik_full2.csv",
#     index=False,
#     encoding="utf-8-sig"
# )

# print("Berhasil disimpan ke tribunnews_politik_full2.csv")
# print(df.head())

In [42]:
# title_tag = articles[0].select_one("h3 a")
# print(title_tag)

# test_url = title_tag["href"]
# print(test_url)
# print(get_article_content(test_url)[:500])

In [43]:


# for i, art in enumerate(articles, start=1):
#     try:
#         # title_tag = art.select_one("h3 a")
#         if not title_tag:
#             continue

#         title = title_tag.get_text(strip=True)
#         link = title_tag["href"]

#         date_tag = art.select_one(".article__date")
#         date = date_tag.get_text(strip=True) if date_tag else None

#         print(f"[{i}] Scraping:", title)

#         content = get_article_content(link)
#         if content is None:
#             print("Konten kosong, skip")
#             continue

#         data.append({
#             "source": "tribunnews",
#             "title": title,
#             "date": date,
#             "url": link,
#             "content": content,
#             "scraped_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
#         })

#         time.sleep(2)

#     except Exception as e:
#         print("Skip article:", e)


# df = pd.DataFrame(data)
# df.to_csv(
#     "tribunnews_politik_full.csv",
#     index=False,
#     encoding="utf-8-sig"
# )

# print("Berhasil disimpan ke tribunnews_politik_full.csv")
# print(df.head())
