# Tahap 1

In [3]:
# Install the third-party packages this notebook imports; pip skips any that are already satisfied.
!pip install pandas requests beautifulsoup4 pdfminer.six lxml aiohttp

Collecting pdfminer.six
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Downloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m40.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pdfminer.six
Successfully installed pdfminer.six-20250506


In [1]:
# Mount Google Drive so the folders under /content/drive/MyDrive used by create_path() are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import asyncio
import aiohttp
import io
import os
import re
import time
import urllib.request
import requests
import random
from datetime import date
import pandas as pd
from bs4 import BeautifulSoup
from pdfminer.high_level import extract_text

In [7]:
def create_path(folder_name):
    """Return the Google Drive folder for ``folder_name``, creating it when missing.

    Args:
        folder_name: Folder name (or relative path) below ``/content/drive/MyDrive``.

    Returns:
        The joined folder path as a string.
    """
    path = os.path.join('/content/drive/MyDrive', folder_name)
    # exist_ok=True replaces the separate exists() check, which could race with
    # another creator between the check and the makedirs() call.
    os.makedirs(path, exist_ok=True)
    return path

def get_random_user_agent():
    """Pick one browser User-Agent string at random from a fixed pool of four."""
    chrome_windows = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    chrome_macos = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"
    firefox_windows = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0"
    firefox_linux = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0"

    # Same order as before so a seeded random generator yields the same pick.
    pool = (chrome_windows, chrome_macos, firefox_windows, firefox_linux)
    return random.choice(pool)

async def open_page_async(link, headers=None):
    """Fetch ``link`` and return it parsed with BeautifulSoup, retrying on failure.

    The page is requested with aiohttp up to three times. If every attempt
    fails, one last attempt is made with ``requests``; that call is blocking,
    so it runs in a worker thread to keep the event loop free for other tasks.

    Args:
        link: URL of the page to download.
        headers: Optional request headers. When omitted (or empty), a header
            dict holding a random User-Agent is built once and reused for
            every attempt, including the fallback.

    Returns:
        A ``BeautifulSoup`` document parsed with the ``lxml`` parser, or
        ``None`` when both the aiohttp attempts and the fallback fail.
    """
    max_retries = 3
    headers = headers or {"User-Agent": get_random_user_agent()}
    # aiohttp expects a ClientTimeout object; a bare number is deprecated.
    timeout = aiohttp.ClientTimeout(total=300)

    for attempt in range(1, max_retries + 1):
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(link, headers=headers, timeout=timeout) as response:
                    if response.status == 503:
                        # Raised explicitly so the log line carries a fixed message.
                        raise aiohttp.ClientResponseError(
                            response.request_info,
                            response.history,
                            status=503,
                            message="Service Temporarily Unavailable",
                            headers=response.headers
                        )
                    response.raise_for_status()
                    text = await response.text()
                    return BeautifulSoup(text, "lxml")
        except (aiohttp.ClientError, asyncio.TimeoutError) as e:
            print(f"Percobaan {attempt}/{max_retries} gagal untuk {link}: {e}")
            # Look at the status attribute rather than the message text: the
            # message embeds the URL, which may itself contain "503".
            is_unavailable = getattr(e, "status", None) == 503
            await asyncio.sleep(3 if is_unavailable else 2)

    print(f"Gagal mengakses {link} setelah {max_retries} percobaan. Mencoba menggunakan requests...")
    try:
        # requests.get blocks; run it in a thread so other coroutines keep running.
        response = await asyncio.to_thread(requests.get, link, headers=headers, timeout=20)
        response.raise_for_status()
        return BeautifulSoup(response.text, "lxml")
    except requests.RequestException as e:
        print(f"Fallback ke requests juga gagal untuk {link}: {e}")
        return None

def get_detail(soup, keyword):
    """Return the stripped text of the tag that follows the ``<td>`` labelled ``keyword``.

    The label is matched as a substring of the cell text, and the first
    matching ``<td>`` in document order is used, so a short keyword can also
    match a cell whose text merely contains it.

    Args:
        soup: Parsed element to search (a BeautifulSoup tag or document).
        keyword: Label text to look for inside a ``<td>``.

    Returns:
        The text of the tag found by ``find_next()`` after the label cell,
        stripped of surrounding whitespace, or ``""`` when the label or the
        following tag is missing.
    """
    try:
        label_cell = soup.find(lambda tag: tag.name == "td" and keyword in tag.text)
        if label_cell is None:
            return ""
        value_tag = label_cell.find_next()
        if value_tag is None:
            return ""
        return value_tag.get_text().strip()
    except (AttributeError, TypeError):
        # Malformed input (e.g. soup is None) stays best-effort, but unlike a
        # bare ``except:`` this no longer swallows KeyboardInterrupt/SystemExit.
        return ""

def get_pdf(url, path_pdf, timeout=60):
    """Download ``url`` into ``path_pdf`` and return its bytes and file name.

    Args:
        url: Address of the file to download.
        path_pdf: Existing directory the file is written into.
        timeout: Seconds to wait on the connection before giving up. Without
            a timeout a stalled server would block the caller indefinitely.

    Returns:
        ``(io.BytesIO, file_name)`` holding the downloaded content and the
        name it was saved under, or ``(None, None)`` when anything fails
        (the error is printed, not raised).
    """
    try:
        headers = {"User-Agent": get_random_user_agent()}
        request = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(request, timeout=timeout) as file:
            file_content = file.read()

        # The name is the last URL segment with query characters made file-system safe.
        file_name = os.path.basename(url).replace("/", " ").replace("?", "_").replace("=", "_")
        full_path = os.path.join(path_pdf, file_name)
        with open(full_path, "wb") as out_file:
            out_file.write(file_content)
        print(f"PDF berhasil diunduh: {full_path}")
        return io.BytesIO(file_content), file_name
    except Exception as e:
        print(f"Gagal mengunduh PDF dari {url}: {e}")
        return None, None

def clean_text(text):
    """Remove the fixed Mahkamah Agung disclaimer lines from extracted text.

    Each boilerplate line (including its trailing newline) is deleted wherever
    it occurs; everything else is returned untouched.
    """
    boilerplate_lines = (
        "M a h ka m a h A g u n g R e p u blik In d o n esia\n",
        "Disclaimer\n",
        "Kepaniteraan Mahkamah Agung Republik Indonesia berusaha untuk selalu mencantumkan informasi paling kini dan akurat sebagai bentuk komitmen Mahkamah Agung untuk pelayanan publik, transparansi dan akuntabilitas\n",
        "pelaksanaan fungsi peradilan. Namun dalam hal-hal tertentu masih dimungkinkan terjadi permasalahan teknis terkait dengan akurasi dan keterkinian informasi yang kami sajikan, hal mana akan terus kami perbaiki dari waktu kewaktu.\n",
        "Dalam hal Anda menemukan inakurasi informasi yang termuat pada situs ini atau informasi yang seharusnya ada, namun belum tersedia, maka harap segera hubungi Kepaniteraan Mahkamah Agung RI melalui :\n",
        "Email : kepaniteraan@mahkamahagung.go.id    Telp : 021-384 3348 (ext.318)\n",
    )
    # Applied one after another, in this order.
    for line in boilerplate_lines:
        text = text.replace(line, "")
    return text

def is_url_already_scraped(url, destination):
    """Tell whether ``url`` is already listed in the ``link`` column of a CSV.

    Args:
        url: Link to look for.
        destination: Path of the CSV file to inspect.

    Returns:
        ``True`` when the file exists and its ``link`` column contains
        ``url``; ``False`` when the file is missing, empty, has no ``link``
        column, or does not list the URL.
    """
    if not os.path.isfile(destination):
        return False
    try:
        # Read the header only first, so a file without a "link" column is
        # reported as "not scraped" instead of raising KeyError.
        header = pd.read_csv(destination, nrows=0)
        if "link" not in header.columns:
            return False
        # Load just the one column; the other columns (e.g. the PDF text) can
        # be large and are not needed for the lookup.
        links = pd.read_csv(destination, usecols=["link"])["link"]
    except pd.errors.EmptyDataError:
        return False
    return bool(links.eq(url).any())


In [8]:
async def extract_data(link, keyword_url, path_output, path_pdf, today, download_pdf=True, extract_pdf_text=False):
    """Scrape one decision page and append it as a row to the day's CSV file.

    The CSV is ``putusan_ma_<keyword>_<today>.csv`` inside ``path_output``.
    When ``keyword_url`` is a URL (starts with ``https``) the keyword part of
    the file name is the fixed string ``narkotika``.

    Args:
        link: URL of the decision detail page.
        keyword_url: Search keyword, or the search URL the run started from.
        path_output: Directory that holds the CSV file.
        path_pdf: Directory PDFs are saved into, or ``None``.
        today: Date string used in the CSV file name.
        download_pdf: Download the linked PDF into ``path_pdf``.
        extract_pdf_text: Extract and clean the PDF text. Text is only
            extracted from a PDF that was downloaded in this call, so this
            has no effect unless ``download_pdf`` is true and ``path_pdf``
            is set.

    Returns:
        ``None``. The function returns early, without writing, when the link
        is already in the CSV, the page cannot be fetched, or the page has no
        ``table`` element with class ``table``.
    """
    keyword_url_clean = keyword_url.replace("/", " ").replace("+", "_")
    if keyword_url.startswith("https"):
        keyword_url_clean = "narkotika"
    destination = os.path.join(path_output, f"putusan_ma_{keyword_url_clean}_{today}.csv")

    if is_url_already_scraped(link, destination):
        print(f"Melewati URL duplikat: {link}")
        return

    headers = {"User-Agent": get_random_user_agent()}
    soup = await open_page_async(link, headers=headers)
    if not soup:
        print(f"Gagal memproses halaman untuk {link}")
        return
    table = soup.find("table", {"class": "table"})
    if not table:
        print(f"Tabel tidak ditemukan di {link}")
        return

    # The title is taken from the <h2> and then removed from the table so its
    # text cannot be matched by the label lookups below.
    heading = table.find("h2")
    judul = heading.text if heading else ""
    if heading:
        heading.decompose()

    # (CSV column, label shown on the page). One list drives both the lookups
    # and the column order, so values and headers cannot drift apart.
    detail_fields = (
        ("nomor", "Nomor"),
        ("tingkat_proses", "Tingkat Proses"),
        ("klasifikasi", "Klasifikasi"),
        ("kata_kunci", "Kata Kunci"),
        ("tahun", "Tahun"),
        ("tanggal_register", "Tanggal Register"),
        ("lembaga_peradilan", "Lembaga Peradilan"),
        ("jenis_lembaga_peradilan", "Jenis Lembaga Peradilan"),
        ("hakim_ketua", "Hakim Ketua"),
        ("hakim_anggota", "Hakim Anggota"),
        ("panitera", "Panitera"),
        ("amar", "Amar"),
        ("amar_lainnya", "Amar Lainnya"),
        ("catatan_amar", "Catatan Amar"),
        ("tanggal_musyawarah", "Tanggal Musyawarah"),
        ("tanggal_dibacakan", "Tanggal Dibacakan"),
        ("kaidah", "Kaidah"),
        ("status", "Status"),
        ("abstrak", "Abstrak"),
    )
    record = {"judul": judul}
    for column, label in detail_fields:
        record[column] = get_detail(table, label)

    link_pdf, file_name_pdf, text_pdf = "", "", ""
    if download_pdf or extract_pdf_text:
        try:
            pdf_tag = soup.find("a", href=re.compile(r"/pdf/"))
            if pdf_tag:
                link_pdf = pdf_tag["href"]
                print(f"Menemukan link PDF: {link_pdf}")
                file_pdf = None
                if download_pdf and path_pdf:
                    # get_pdf blocks on network and disk I/O; a worker thread
                    # keeps the event loop free for the other scraping tasks.
                    file_pdf, downloaded_name = await asyncio.to_thread(get_pdf, link_pdf, path_pdf)
                    # get_pdf returns None for the name on failure; keep the column a string.
                    file_name_pdf = downloaded_name or ""
                if extract_pdf_text and file_pdf:
                    # PDF parsing is CPU-heavy and synchronous, so it is offloaded as well.
                    text_pdf = await asyncio.to_thread(extract_text, file_pdf)
                    text_pdf = clean_text(text_pdf)
            else:
                print(f"Link PDF tidak ditemukan di {link}")
        except Exception as e:
            print(f"Gagal memproses PDF untuk {link}: {e}")
            link_pdf, file_name_pdf, text_pdf = "", "", ""

    record["link"] = link
    record["link_pdf"] = link_pdf
    record["file_name_pdf"] = file_name_pdf
    record["text_pdf"] = text_pdf
    result = pd.DataFrame([record], columns=list(record))

    # No await between the existence check and the write, so concurrent tasks
    # on the same event loop cannot both write the header.
    if not os.path.isfile(destination):
        result.to_csv(destination, header=True, index=False)
        print(f"CSV baru dibuat: {destination}")
    else:
        result.to_csv(destination, mode="a", header=False, index=False)
        print(f"Data ditambahkan ke CSV: {destination}")


In [9]:
async def run_process(keyword_url, page, sort_date, path_output, path_pdf, today, download_pdf=True, extract_pdf_text=False):
    """Scrape every decision linked from one search-result page.

    Args:
        keyword_url: Search keyword, or a full search URL (starts with
            ``https``) to which ``&page=<page>`` is appended.
        page: Result page number to fetch.
        sort_date: Append the ``obf=TANGGAL_PUTUS&obm=desc`` ordering
            parameters to the request.
        path_output: Directory for the CSV output (passed to ``extract_data``).
        path_pdf: Directory for downloaded PDFs, or ``None``.
        today: Date string used in the CSV file name.
        download_pdf: Forwarded to ``extract_data``.
        extract_pdf_text: Forwarded to ``extract_data``.

    Returns:
        ``None``. Returns early when the result page cannot be fetched.
    """
    if keyword_url.startswith("https"):
        page_url = f"{keyword_url}&page={page}"
    else:
        page_url = f"https://putusan3.mahkamahagung.go.id/search.html?q={keyword_url}&page={page}"
    if sort_date:
        page_url = f"{page_url}&obf=TANGGAL_PUTUS&obm=desc"

    headers = {"User-Agent": get_random_user_agent()}
    soup = await open_page_async(page_url, headers=headers)
    if not soup:
        print(f"Gagal memproses halaman {page}")
        return
    anchors = soup.find_all("a", {"href": re.compile("/direktori/putusan")})

    # Drop repeated hrefs (order kept): two concurrent tasks for the same URL
    # would both pass the "already scraped" check and write duplicate rows.
    detail_urls = list(dict.fromkeys(anchor["href"] for anchor in anchors))

    tasks = []
    for detail_url in detail_urls:
        # create_task starts the request right away. Appending a bare
        # coroutine would start nothing until gather(), so the pause below
        # would not space the requests out at all.
        tasks.append(
            asyncio.create_task(
                extract_data(detail_url, keyword_url, path_output, path_pdf, today, download_pdf, extract_pdf_text)
            )
        )
        await asyncio.sleep(0.5)  # Small delay between requests within a page
    await asyncio.gather(*tasks)

async def run_scraper(keyword="narkotika", url=None, sort_date=True, download_pdf=True, extract_pdf_text=False, max_pages=None):
    """Scrape all result pages for a keyword (or a search URL) into CSV and PDFs.

    Args:
        keyword: Search term used when ``url`` is not given.
        url: Full search URL; takes precedence over ``keyword``.
        sort_date: Forwarded to ``run_process``.
        download_pdf: Download each decision's PDF into the ``PDF`` folder.
        extract_pdf_text: Forwarded to ``run_process``.
        max_pages: Upper bound on the number of result pages; ``None`` (or 0)
            means every page reported by the site's pagination.

    Returns:
        ``None``. Returns early when neither ``keyword`` nor ``url`` is given
        or when the first page cannot be fetched.
    """
    if not keyword and not url:
        print("Harap masukkan kata kunci atau URL")
        return

    path_output = create_path("CSV")
    path_pdf = create_path("PDF") if download_pdf else None
    today = date.today().strftime("%Y-%m-%d")

    link = url if url else f"https://putusan3.mahkamahagung.go.id/search.html?q={keyword}&page=1"

    headers = {"User-Agent": get_random_user_agent()}
    soup = await open_page_async(link, headers=headers)
    if not soup:
        print("Gagal mengakses halaman awal. Silakan periksa koneksi atau coba lagi nanti.")
        return
    try:
        last_page = int(soup.find_all("a", {"class": "page-link"})[-1].get("data-ci-pagination-page"))
    except (IndexError, ValueError, TypeError):
        # TypeError: the last page link has no data-ci-pagination-page
        # attribute, so .get() returned None and int(None) failed.
        print("Paginasi tidak ditemukan atau data halaman tidak valid. Hanya memproses 1 halaman.")
        last_page = 1

    if max_pages:
        last_page = min(last_page, max_pages)

    if url:
        print(f"Melakukan scraping dengan URL: {url} - estimasi {20 * last_page} data - {last_page} halaman")
    else:
        print(f"Melakukan scraping dengan kata kunci: {keyword} - estimasi {20 * last_page} data - {last_page} halaman")

    keyword_url = url if url else keyword
    tasks = []
    for page in range(1, last_page + 1):
        # create_task starts the page immediately. With bare coroutines every
        # page would start together at gather(), after all the sleeps, and hit
        # the server at once.
        tasks.append(
            asyncio.create_task(
                run_process(keyword_url, page, sort_date, path_output, path_pdf, today, download_pdf, extract_pdf_text)
            )
        )
        await asyncio.sleep(2)  # Delay between pages to avoid being blocked by the server
    await asyncio.gather(*tasks)
    print(f"Scraping selesai. Data disimpan ke {path_output}, PDF disimpan ke {path_pdf}")

async def scrape_specific_url(url, download_pdf=True, extract_pdf_text=False):
    """Scrape a single decision page given its ``https://`` URL.

    The URL is passed to ``extract_data`` both as the page link and as the
    keyword. Output goes to the ``CSV`` folder and, when ``download_pdf`` is
    true, PDFs go to the ``PDF`` folder. An empty or non-``https://`` URL
    only prints a message.
    """
    looks_valid = bool(url) and url.startswith("https://")
    if not looks_valid:
        print("Harap masukkan URL yang valid")
        return

    csv_dir = create_path("CSV")
    pdf_dir = None
    if download_pdf:
        pdf_dir = create_path("PDF")
    run_date = date.today().strftime("%Y-%m-%d")

    await extract_data(url, url, csv_dir, pdf_dir, run_date, download_pdf, extract_pdf_text)
    print(f"Scraping selesai untuk {url}. Data disimpan ke {csv_dir}")


In [10]:
# Run the scraper for the 'narkotika' keyword.
# max_pages=7 caps the run at the first 7 result pages; lower it for a quicker test run.
# extract_pdf_text=False skips the slow PDF text extraction.
await run_scraper(keyword="narkotika", max_pages=7, extract_pdf_text=False)

Melakukan scraping dengan kata kunci: narkotika - estimasi 140 data - 7 halaman
Percobaan 1/3 gagal untuk https://putusan3.mahkamahagung.go.id/search.html?q=narkotika&page=4&obf=TANGGAL_PUTUS&obm=desc: 503, message='Service Temporarily Unavailable', url='https://putusan3.mahkamahagung.go.id/search.html?q=narkotika&page=4&obf=TANGGAL_PUTUS&obm=desc'
Percobaan 2/3 gagal untuk https://putusan3.mahkamahagung.go.id/search.html?q=narkotika&page=4&obf=TANGGAL_PUTUS&obm=desc: 503, message='Service Temporarily Unavailable', url='https://putusan3.mahkamahagung.go.id/search.html?q=narkotika&page=4&obf=TANGGAL_PUTUS&obm=desc'
Melewati URL duplikat: https://putusan3.mahkamahagung.go.id/direktori/putusan/zaf04bed03bcd238b54f303933363139.html
Melewati URL duplikat: https://putusan3.mahkamahagung.go.id/direktori/putusan/zaf04bf2fd3c3448b20a313031393035.html
Melewati URL duplikat: https://putusan3.mahkamahagung.go.id/direktori/putusan/zaf04bf0d8f46a62abd8313030333435.html
Melewati URL duplikat: https:/