<a href="https://colab.research.google.com/github/qoyyimil/Project-A-PBA/blob/main/Scrapping_UBER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pandas openpyxl requests beautifulsoup4 lxml



In [2]:
import os
import time
import random
from urllib.parse import urlparse

import pandas as pd
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException

In [3]:
# CONFIGURATION
# Input workbook, the sheet and column holding the article links, and the output file.
EXCEL_PATH = "/content/Link Berita - Kelompok 6.xlsx"
SHEET_NAME = "UBER"
URL_COLUMN = "Link"
OUTPUT_CSV = "/content/hasil_scrape_UBER.csv"

# Browser-like request headers so servers are less likely to block the scraper.
HEADERS = {
    "User-Agent": " ".join((
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/120.0.0.0 Safari/537.36",
    )),
    "Accept-Language": "id,en;q=0.9",
}

In [4]:
# FUNGSI UTILITAS
def is_valid_url(url: str) -> bool:
    """Return True when ``url`` is a string holding an absolute http(s) URL.

    Non-string values (e.g. NaN from an empty spreadsheet cell) and strings
    that ``urlparse`` cannot parse are reported as invalid instead of raising.

    Args:
        url: Candidate value; surrounding whitespace is ignored.

    Returns:
        True if the stripped value starts with ``http://`` or ``https://``
        (in any letter case) and has a non-empty network location.
    """
    if not isinstance(url, str):
        return False
    candidate = url.strip()
    # URL schemes are case-insensitive, so compare the prefix in lowercase.
    if not candidate.lower().startswith(("http://", "https://")):
        return False
    try:
        parsed = urlparse(candidate)
    except ValueError:
        # urlparse raises ValueError for malformed netlocs such as an unbalanced "[".
        return False
    return bool(parsed.scheme) and bool(parsed.netloc)

def extract_title(soup: BeautifulSoup) -> str:
    """Return the best available title for a parsed page.

    Sources are tried in priority order: ``og:title`` meta, ``twitter:title``
    meta, the first ``<h1>``, then ``<title>``. A source whose text is empty
    or whitespace-only is skipped so the next one can be tried.

    Args:
        soup: Parsed HTML document.

    Returns:
        The stripped title text, or "Judul tidak ditemukan" when no source
        yields any text.
    """
    # Meta tags first; strip before testing so blank content falls through.
    for attrs in ({"property": "og:title"}, {"name": "twitter:title"}):
        meta = soup.find("meta", attrs=attrs)
        content = (meta.get("content") or "").strip() if meta else ""
        if content:
            return content

    # Then visible heading / document title.
    for tag_name in ("h1", "title"):
        tag = soup.find(tag_name)
        if tag:
            text = tag.get_text(strip=True)
            if text:
                return text

    return "Judul tidak ditemukan"

def extract_text(soup: BeautifulSoup) -> str:
    """Return the article body as a single space-joined string.

    The text of every non-empty ``<p>`` element is collected. When no
    paragraph yields text, the ``description`` meta tag and then the
    ``og:description`` meta tag are tried; a blank value is skipped so the
    next fallback can be used.

    Args:
        soup: Parsed HTML document.

    Returns:
        The extracted text, or an empty string when nothing was found.
    """
    # Extract each paragraph's text once and keep only the non-empty ones.
    texts = []
    for paragraph in soup.find_all("p"):
        paragraph_text = paragraph.get_text(separator=" ", strip=True)
        if paragraph_text:
            texts.append(paragraph_text)

    article_text = " ".join(texts).strip()
    if article_text:
        return article_text

    # Fallback: meta descriptions, used only when the page has no paragraph text.
    for attrs in ({"name": "description"}, {"property": "og:description"}):
        meta = soup.find("meta", attrs=attrs)
        content = (meta.get("content") or "").strip() if meta else ""
        if content:
            return content

    return ""

def scrape_article(url: str, timeout: int = 20):
    """Download ``url`` and extract its title and body text.

    Returns a 4-tuple ``(title, text, status_code, error_msg)``:

    * success: ``(title, text, 200, "")``
    * page parsed but no body text: ``(title, "", 200, "Empty article text")``
    * non-200 response: ``(None, None, status_code, "Status code <n>")``
    * any exception: ``(None, None, None, "<kind>: <message>")``

    Exceptions are reported through ``error_msg`` rather than propagated.
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=timeout)
        code = response.status_code

        if code == 200:
            document = BeautifulSoup(response.content, "html.parser")
            headline = extract_title(document)
            body = extract_text(document)

            if body:
                return headline, body, code, ""
            return headline, "", code, "Empty article text"

        # Anything other than 200 is reported as a failure without parsing.
        return None, None, code, f"Status code {code}"
    except RequestException as exc:
        return None, None, None, f"RequestException: {exc}"
    except Exception as exc:
        return None, None, None, f"Exception: {exc}"

In [5]:
# MAIN
def _collect_unique_urls(df):
    """Return one record per distinct valid URL found in ``df[URL_COLUMN]``.

    Rows whose URL fails ``is_valid_url`` are skipped, and only the first row
    for each stripped URL is kept, in sheet order. Each record carries the
    row's "No" and "PJ" values when those columns exist, otherwise ``None``.
    """
    # Column presence does not change per row, so check it once.
    has_no = "No" in df.columns
    has_pj = "PJ" in df.columns

    seen = set()
    records = []
    for _, row in df.iterrows():
        raw_url = row[URL_COLUMN]
        if not is_valid_url(raw_url):
            continue
        url = raw_url.strip()
        if url in seen:
            continue
        seen.add(url)
        records.append({
            "No": row["No"] if has_no else None,
            "PJ": row["PJ"] if has_pj else None,
            "URL": url,
        })
    return records


def main():
    """Read article URLs from the Excel sheet, scrape each one, and save a CSV.

    Steps: load ``SHEET_NAME`` from ``EXCEL_PATH``, check that ``URL_COLUMN``
    exists, collect the distinct valid URLs, scrape them one at a time with a
    random pause between requests, and write the results to ``OUTPUT_CSV``.

    Problems with the input are printed and end the run early. If the scrape
    loop is interrupted from the keyboard, the results gathered so far are
    still written to ``OUTPUT_CSV``.
    """
    # 1) Read the Excel sheet
    try:
        df = pd.read_excel(EXCEL_PATH, sheet_name=SHEET_NAME)
    except FileNotFoundError:
        print(f"File tidak ditemukan: {EXCEL_PATH}")
        return
    except ValueError as e:
        # Usually raised when the sheet does not exist
        print(f"Gagal membuka sheet '{SHEET_NAME}': {e}")
        return
    except Exception as e:
        print(f"Error membaca Excel: {e}")
        return

    # 2) Validate the URL column
    if URL_COLUMN not in df.columns:
        print(f"Kolom '{URL_COLUMN}' tidak ditemukan di sheet '{SHEET_NAME}'. Kolom tersedia: {list(df.columns)}")
        return

    # 3) Collect valid URLs (with optional No/PJ columns), dropping duplicates
    urls = _collect_unique_urls(df)
    if not urls:
        # Report the configured column name rather than a hard-coded one.
        print(f"Tidak ada URL valid pada kolom '{URL_COLUMN}'.")
        return

    total = len(urls)
    print(f"Total URL valid untuk di-scrape: {total}")
    results = []

    # 4) Scrape one URL at a time
    try:
        for i, item in enumerate(urls, start=1):
            url = item["URL"]
            print(f"[{i}/{total}] Scraping: {url}")

            title, text, status, err = scrape_article(url)
            results.append({
                "No": item["No"],
                "PJ": item["PJ"],
                "Original_URL": url,
                "Judul_Artikel_Scraped": title if title else "",
                "Teks_Artikel_Scraped": text if text else "",
                "Status_Code": status if status is not None else "",
                "Error": err
            })

            # Polite random pause between requests; not needed after the last one
            if i < total:
                time.sleep(random.uniform(1.0, 2.5))
    except KeyboardInterrupt:
        # Keep what was already scraped instead of losing the whole run.
        print(f"Dihentikan oleh pengguna setelah {len(results)}/{total} URL; menyimpan hasil sementara.")

    if not results:
        # Interrupted before the first article finished: leave any existing output file alone.
        print("Tidak ada hasil untuk disimpan.")
        return

    # 5) Save to CSV
    out_df = pd.DataFrame(results)
    out_df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")
    print(f"Selesai. Hasil disimpan ke: {OUTPUT_CSV}")

# Run the scraper only when this code executes as the top-level module,
# not when it is imported from elsewhere.
if __name__ == "__main__":
    main()

Total URL valid untuk di-scrape: 174
[1/174] Scraping: https://edition.cnn.com/2025/09/11/tech/us-sues-uber-alleging-disability-discrimination
[2/174] Scraping: https://edition.cnn.com/2025/03/04/business/self-driving-uber-waymo-texas
[3/174] Scraping: https://edition.cnn.com/2025/05/14/tech/uber-route-share-cheaper-carpool-service
[4/174] Scraping: https://edition.cnn.com/2025/07/24/tech/uber-women-preferences-us-launch
[5/174] Scraping: https://edition.cnn.com/2023/05/17/tech/uber-teen-accounts
[6/174] Scraping: https://edition.cnn.com/2024/05/15/tech/uber-shuttle-airport-concert-annual-product-update
[7/174] Scraping: https://edition.cnn.com/2022/02/16/tech/uber-ratings-update
[8/174] Scraping: https://edition.cnn.com/2024/03/18/business/uber-australia-taxi-drivers-compensation
[9/174] Scraping: https://edition.cnn.com/2024/10/02/business/uber-eats-accident-lawsuit
[10/174] Scraping: https://www.cnn.com/2021/03/16/tech/uber-uk-vacation-pensions-drivers
[11/174] Scraping: https://www