# Tahap 1

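Scraping: mengumpulkan tautan putusan kategori senjata api dari Direktori Putusan Mahkamah Agung, lalu mengunduh berkas PDF-nya.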

In [1]:
!pip install beautifulsoup4 requests



In [2]:
from bs4 import BeautifulSoup
import requests

base_url = "https://putusan3.mahkamahagung.go.id"
kategori_url = "https://putusan3.mahkamahagung.go.id/direktori/index/kategori/senjata-api-2/page/{}"

# Collect the detail-page URL of every decision linked from category pages
# 1 and 2, de-duplicated and kept in order of first appearance.
detail_links = []
seen_links = set()  # O(1) membership test; the list alone preserves the order

for page in range(1, 3):
    try:
        # A timeout keeps a stalled connection from hanging the cell forever.
        r = requests.get(kategori_url.format(page), timeout=30)
        r.raise_for_status()
    except requests.RequestException as e:
        # Skip this page rather than scanning an error document for links.
        print(f"Gagal mengambil halaman {page}: {e}")
        continue

    soup = BeautifulSoup(r.text, 'html.parser')

    for a in soup.select("a[href*='/direktori/putusan/']"):
        href = a.get('href')
        # Site-relative hrefs get the host prepended; anything else is kept as-is.
        full_url = base_url + href if href.startswith("/") else href
        if full_url not in seen_links:
            seen_links.add(full_url)
            detail_links.append(full_url)

print("Contoh link detail:")
# Slicing (instead of indexing 0..2) avoids an IndexError when fewer than
# three links were collected.
for link in detail_links[:3]:
    print(link)


Contoh link detail:
https://putusan3.mahkamahagung.go.id/direktori/putusan/zaf0519a15da042e8757313435373438.html
https://putusan3.mahkamahagung.go.id/direktori/putusan/zaf05199b3401894bd17313435353032.html
https://putusan3.mahkamahagung.go.id/direktori/putusan/zaf05182b8ff4b5e8d1e313231303333.html


In [3]:
import os
import re

os.makedirs("data/pdf", exist_ok=True)

# For each decision page: build the download URL, fetch the PDF and store it
# as data/pdf/case_NNN.pdf. Failures are reported per item and never stop the loop.
for i, detail_url in enumerate(detail_links[:30]):  # first 30 only
    try:
        # Timeout + status check: a stalled or failed request is reported by
        # the except branch below instead of hanging or being parsed as a page.
        res = requests.get(detail_url, timeout=30)
        res.raise_for_status()
        html = res.text

        # HASH: the file-name part of the detail URL
        match_hash = re.search(r'/putusan/([a-z0-9]+)\.html', detail_url)
        hash_val = match_hash.group(1) if match_hash else None

        # FILE ID: first "download_file/<id>/pdf" reference found in the page HTML
        match_file_id = re.search(r'download_file/([a-z0-9]+)/pdf', html)
        file_id = match_file_id.group(1) if match_file_id else None

        if not (file_id and hash_val):
            print(f"[{i+1}] Gagal ekstrak ID dari: {detail_url}")
            continue

        pdf_url = f"https://putusan3.mahkamahagung.go.id/direktori/download_file/{file_id}/pdf/{hash_val}"
        print(f"[{i+1}] Downloading: {pdf_url}")

        pdf_res = requests.get(pdf_url, timeout=60)
        pdf_res.raise_for_status()

        # Do not save an HTML error/redirect page under a .pdf name: a PDF
        # carries the "%PDF" marker near the start of the file.
        if b"%PDF" not in pdf_res.content[:1024]:
            print(f"[{i+1}] Respons bukan PDF, dilewati: {pdf_url}")
            continue

        with open(f"data/pdf/case_{i+1:03}.pdf", "wb") as f:
            f.write(pdf_res.content)

    except Exception as e:
        # Best-effort batch download: report the failure and move on.
        print(f"[{i+1}] Error: {e}")


[1] Downloading: https://putusan3.mahkamahagung.go.id/direktori/download_file/4c1e5f101fa9228811e1ca53e9afd189/pdf/zaf0519a15da042e8757313435373438
[2] Downloading: https://putusan3.mahkamahagung.go.id/direktori/download_file/07ffc9555d65e5031e8d3a449b84274d/pdf/zaf05199b3401894bd17313435353032
[3] Downloading: https://putusan3.mahkamahagung.go.id/direktori/download_file/81a2c9ba844e5d13754daf48cf557c4c/pdf/zaf05182b8ff4b5e8d1e313231303333
[4] Downloading: https://putusan3.mahkamahagung.go.id/direktori/download_file/c2e047fa5e4dd5319c6fceed9b37c87c/pdf/zaf051829ef75dbebb39313230393530
[5] Downloading: https://putusan3.mahkamahagung.go.id/direktori/download_file/5cdfb945e97b415c4c2ca95513a479e7/pdf/zaf051829c352d72a7b3313230393435
[6] Downloading: https://putusan3.mahkamahagung.go.id/direktori/download_file/e12bb1e4c9bfa3f17a65ce9f5a5ce17c/pdf/zaf0516948e3fb80a2a8303930383238
[7] Downloading: https://putusan3.mahkamahagung.go.id/direktori/download_file/56b61cc11e132640b26cedc351341cb0/p

Cleaning to TXT: mengekstrak teks dari setiap PDF, membersihkannya, lalu menyimpannya sebagai berkas .txt.

In [4]:
pip install pymupdf tqdm

Collecting pymupdf
  Downloading pymupdf-1.26.1-cp39-abi3-win_amd64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.1-cp39-abi3-win_amd64.whl (18.5 MB)
   ---------------------------------------- 0.0/18.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/18.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/18.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/18.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/18.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/18.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/18.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/18.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/18.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/18.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/18.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/18.5 MB ? eta -:--:--
   ------

In [5]:
import os
import fitz  # PyMuPDF
import re
from tqdm.notebook import tqdm
from datetime import datetime

# Locations used by the PDF -> text cleaning stage
input_folder, output_folder, log_folder = 'data/pdf', 'data/raw', 'logs'
log_path = os.path.join(log_folder, 'cleaning.log')

# Create the log and output directories when they do not exist yet
os.makedirs(log_folder, exist_ok=True)
os.makedirs(output_folder, exist_ok=True)

# Log lines collected during processing
log_entries = []

def bersihkan_teks(teks):
    """Normalize raw extracted text.

    Steps, in order: drop "halaman <number>" markers (case-insensitive),
    collapse every run of two or more whitespace characters into one space,
    lowercase, remove every character that is not a word character,
    whitespace or one of ``/ : . , -``, and strip both ends.

    Args:
        teks: Text to clean.

    Returns:
        The cleaned text.
    """
    tanpa_halaman = re.sub(r'halaman\s+\d+\s*', '', teks, flags=re.IGNORECASE)
    spasi_rapi = re.sub(r'\s{2,}', ' ', tanpa_halaman)
    huruf_kecil = spasi_rapi.lower()

    # Punctuation kept on purpose: / : . , -
    hasil = re.sub(r'[^\w\s/:.,-]', '', huruf_kecil)
    return hasil.strip()

def ekstrak_teks_pdf(filepath):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        filepath: Path of the PDF file to open with PyMuPDF.

    Returns:
        One string holding the output of ``page.get_text()`` for each page.
    """
    # The context manager closes the document even when a page fails to
    # extract; join() avoids rebuilding the string once per page.
    with fitz.open(filepath) as doc:
        return "".join(page.get_text() for page in doc)

def simpan_log(log_entries):
    """Append each entry of `log_entries` to the file at `log_path`, one per line.

    Args:
        log_entries: Iterable of log message strings.
    """
    with open(log_path, 'a', encoding='utf-8') as log_file:
        log_file.writelines(entry + '\n' for entry in log_entries)

# Process every PDF in the input folder. Sorted so the processing order (and
# therefore the log) is reproducible; os.listdir() gives no ordering guarantee.
pdf_files = sorted(f for f in os.listdir(input_folder) if f.lower().endswith('.pdf'))

# Minimum share of the raw text length that must survive cleaning for the
# result to be written.
MIN_RASIO_TEKS = 0.8

for filename in tqdm(pdf_files, desc="Memproses PDF"):
    file_path = os.path.join(input_folder, filename)
    try:
        teks_asli = ekstrak_teks_pdf(file_path)
        panjang_awal = len(teks_asli)

        # No extractable text: log it explicitly. Without this guard an empty
        # .txt would be written and the ratio below would divide by zero.
        if panjang_awal == 0:
            log_entries.append(f"[{datetime.now()}] GAGAL: {filename} → tidak ada teks yang bisa diekstrak")
            continue

        teks_bersih = bersihkan_teks(teks_asli)
        panjang_bersih = len(teks_bersih)

        if panjang_bersih >= MIN_RASIO_TEKS * panjang_awal:
            # Name the output after its source PDF so case_NNN.txt always
            # corresponds to case_NNN.pdf, whatever the listing order.
            output_filename = os.path.splitext(filename)[0] + '.txt'
            with open(os.path.join(output_folder, output_filename), 'w', encoding='utf-8') as f:
                f.write(teks_bersih)

            log_entries.append(f"[{datetime.now()}] Sukses: {filename} → {output_filename} ({panjang_bersih/panjang_awal:.2%})")
        else:
            log_entries.append(f"[{datetime.now()}] GAGAL: {filename} → isi terlalu sedikit ({panjang_bersih/panjang_awal:.2%})")

    except Exception as e:
        # Keep going with the remaining files; the failure is recorded in the log.
        log_entries.append(f"[{datetime.now()}] ERROR saat memproses {filename}: {str(e)}")

# Write the collected log lines
simpan_log(log_entries)

print("✅ Selesai: File .txt disimpan di /data/raw/, log di /logs/cleaning.log")


Memproses PDF:   0%|          | 0/30 [00:00<?, ?it/s]

✅ Selesai: File .txt disimpan di /data/raw/, log di /logs/cleaning.log


# Tahap 2

In [6]:
!pip install openpyxl




In [7]:
import os
import pandas as pd
import re
from tqdm.notebook import tqdm

# Locations used by the metadata extraction stage
output_csv = 'data/processed/cases.csv'
input_folder = 'data/raw'

# Make sure the folder that will hold the CSV exists
os.makedirs(os.path.dirname(output_csv), exist_ok=True)

# Fungsi bantu regex (sederhana, bisa kamu sesuaikan kalau format teks MA cukup konsisten)
def ekstrak_no_perkara(teks):
    """Return the token that follows the first "nomor" in `teks`, or None.

    The token is the run of word characters, ``/``, ``.`` and ``-`` found
    after "nomor" and any colons/whitespace (case-insensitive).

    Args:
        teks: Text to search.

    Returns:
        The token without trailing ``.``, ``/`` or ``-``; None when "nomor"
        is not found or nothing is left after trimming.
    """
    match = re.search(r'nomor[:\s]+([\w/.\-]+)', teks, re.IGNORECASE)
    if not match:
        return None
    # The character class accepts '.', '/' and '-', so sentence punctuation
    # directly after the number is captured too; trim it off the end.
    nomor = match.group(1).rstrip('./-')
    return nomor or None

def ekstrak_tanggal(teks):
    """Return the first "<day> <word> <4-digit year>" date that follows "tanggal".

    An optional "dibacakan" between "tanggal" and the date is skipped.
    Matching is case-insensitive.

    Args:
        teks: Text to search.

    Returns:
        The matched date text, or None when no such date is found.
    """
    pola = r'tanggal\s+(?:dibacakan\s+)?(\d{1,2}\s+\w+\s+\d{4})'
    hasil = re.search(pola, teks, re.IGNORECASE)
    if hasil is None:
        return None
    return hasil.group(1).strip()

def ekstrak_pasal(teks):
    """Return the distinct "pasal <number> ..." mentions found in `teks`.

    Each mention runs from "pasal <number>" up to the next newline, ``.``,
    ``,`` or ``;`` (case-insensitive).

    Args:
        teks: Text to search.

    Returns:
        The distinct mentions joined with "; " in order of first appearance,
        or None when there is none.
    """
    pasal = re.findall(r'pasal\s+\d+\s+[^\n.,;]*', teks, re.IGNORECASE)
    # Strip each match so trailing whitespace does not create false
    # "duplicates", then de-duplicate with a dict: unlike set(), it keeps
    # first-appearance order, so the result is the same on every run.
    unik = list(dict.fromkeys(p.strip() for p in pasal))
    return "; ".join(unik) if unik else None

def ekstrak_pihak(teks):
    """Return "terdakwa: <name>" using the name that follows "nama lengkap".

    The name consists of letters, spaces, dots and apostrophes. It ends at
    the first character outside that set (including a line break), at the
    "tempat lahir" label, or at the end of the text.

    Args:
        teks: Text to search (matching is case-insensitive).

    Returns:
        "terdakwa: <name>", or None when no name is found.
    """
    # The name class deliberately excludes newlines and the lookahead stops
    # at "tempat lahir": otherwise the capture runs on into the next field.
    # NOTE(review): assumes "tempat lahir" is the label that follows the
    # name when both are on one line; confirm against the source documents.
    pola = r"nama lengkap\s*[:\s]+([a-z .']+?)(?=\s*(?:tempat\s+lahir\b|[^a-z .']|$))"
    match = re.search(pola, teks, re.IGNORECASE)
    nama = match.group(1).strip() if match else None
    return f"terdakwa: {nama}" if nama else None

def ringkasan_fakta(teks):
    """Return a fact summary taken from `teks`.

    The summary is the shortest span that starts at "menimbang", contains
    "terbukti" and ends just before a following "putusan" (case-insensitive,
    may span several lines). When no such span exists, the first 1000
    characters of `teks` are returned instead.

    Args:
        teks: Text to search.

    Returns:
        The stripped span, or the 1000-character fallback.
    """
    pola = r'(menimbang.*?terbukti.*?)putusan'
    hasil = re.search(pola, teks, re.IGNORECASE | re.DOTALL)
    if hasil:
        return hasil.group(1).strip()
    # Fallback: no matching section found
    return teks[:1000]

# Build one metadata row per cleaned text file, in sorted file-name order
rows = []
txt_files = sorted(f for f in os.listdir(input_folder) if f.endswith('.txt'))

for filename in tqdm(txt_files, desc="Ekstraksi metadata"):
    path = os.path.join(input_folder, filename)
    with open(path, 'r', encoding='utf-8') as f:
        teks = f.read()

    row = {
        # splitext removes only the extension; replace('.txt', '') would also
        # delete a '.txt' occurring in the middle of a file name.
        'case_id': os.path.splitext(filename)[0],
        'no_perkara': ekstrak_no_perkara(teks),
        'tanggal': ekstrak_tanggal(teks),
        'ringkasan_fakta': ringkasan_fakta(teks),
        'pasal': ekstrak_pasal(teks),
        'pihak': ekstrak_pihak(teks),
        'text_full': teks
    }
    rows.append(row)

# Save as CSV
df = pd.DataFrame(rows)
df.to_csv(output_csv, index=False, encoding='utf-8')
print(f"✅ Berhasil disimpan ke: {output_csv}")

# Save as Excel too
# NOTE(review): Excel caps a cell at 32,767 characters, so long `text_full`
# values may not survive the .xlsx export intact - confirm; the CSV is the
# safer source for the full text.
output_excel = 'data/processed/cases.xlsx'
df.to_excel(output_excel, index=False)
print(f"📘 Excel disimpan di: {output_excel}")


Ekstraksi metadata:   0%|          | 0/30 [00:00<?, ?it/s]

✅ Berhasil disimpan ke: data/processed/cases.csv
📘 Excel disimpan di: data/processed/cases.xlsx


In [8]:
# Per-document size: number of whitespace-separated tokens and number of characters
teks_penuh = df['text_full']
df['word_count'] = teks_penuh.map(lambda teks: len(teks.split()))
df['char_count'] = teks_penuh.map(len)