In [1]:
import pypdf
import re

# Extract PDF to TXT

In [9]:
import os
import re
import pypdf

# ';', ':' or '.' that is not directly next to a digit, so list markers such as
# "1." and decimals such as "1.5" are not treated as the end of a sentence.
_SENTENCE_END = re.compile(r'(?<!\d)[;:.](?!\d)')

# Characters that may belong to a dotted abbreviation such as "S.H." or "Jl.".
_TOKEN_CHAR = re.compile(r'[\w.]')

# Abbreviations whose trailing dot must not be read as a sentence end.
_ABBREVIATIONS = frozenset({
    "No.", "Lab.", "Jl.", "Prof.", "Dr.", "S.H.", "M.H.", "Rp.", "Yth.",
    "Tgl.", "S.K.", "S.Pd.", "S.T.", "S.E.", "S.Kom.", "S.Ag.",
    "S.Kes.", "S.Psi.", "S.Si.", "S.TP.", "S.Pi.",
    "M.Si.", "M.Kn.", "M.Pd.", "M.A.", "M.Hum.",
    "RT.", "RW.", "rt.", "rw.", "dsn.", "dsa.", "ds.", "dusun.",
    "Kec.", "Kel.", "Kab.", "Kota.", "Prov.",
    "Kediri.",
    "kec.", "kel.", "kota.", "kab.", "prov.",
})

# Identity labels that appear as "1. Nama lengkap", "2. Tempat lahir", ...;
# the position in this tuple (starting at 1) is the number that gets removed.
_IDENTITY_LABELS = (
    "Nama lengkap",
    "Tempat lahir",
    "Umur/Tanggal lahir",
    "Jenis kelamin",
    "Kebangsaan",
    "Tempat tinggal",
    "Agama",
    "Pekerjaan",
)


def _extract_text(pdf_path):
    """Return the text of every page of pdf_path, each page followed by a newline."""
    reader = pypdf.PdfReader(pdf_path)
    parts = []
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:  # pages without extractable text are skipped
            parts.append(page_text + "\n")
    return "".join(parts)


def _split_lines(text):
    """Split text into lines that keep their newline, like reading a text file line by line."""
    # Text-mode file reads translate CR and CRLF to LF; do the same here so the
    # in-memory pipeline sees the same lines a write/read round trip would.
    normalized = text.replace("\r\n", "\n").replace("\r", "\n")
    return re.findall(r'[^\n]*\n|[^\n]+', normalized)


def _clean_text(text):
    """Remove page furniture and normalise whitespace in the extracted text."""
    # Drop everything from the court banner up to the site address.
    # NOTE: the dots in the address are unescaped, so each matches any character.
    text = re.sub(
        r"Mahkamah Agung Republik Indonesia.*?putusan.mahkamahagung.go.id",
        "",
        text,
        flags=re.DOTALL
    )
    text = re.sub(r"Disclaimer\s*Kepaniteraan", "", text, flags=re.DOTALL)
    text = re.sub(r"Email : kepaniteraan@mahkamahagung\.go\.id\s+Telp : 021-384 3348 \(ext\.318\)", "", text)
    # Page footer: "Halaman X dari Y Putusan Nomor ..." up to the end of the line.
    text = re.sub(r"Halaman\s*\d+\s*dari\s*\d+\s*Putusan\s*Nomor\s*.+", "", text)

    # Strip the numbering in front of the identity labels.
    for number, label in enumerate(_IDENTITY_LABELS, start=1):
        text = text.replace(f"{number}. {label}", label)

    # Remove whitespace in front of the spaced-out "P U T U S A N" heading.
    text = re.sub(r"^\s*\n*(?=P\s+U\s+T\s+U\s+S\s+A\s+N)", "", text, flags=re.MULTILINE)

    # Put the opening sentence ("Pengadilan Negeri ... berikut dalam perkara
    # Terdakwa") on a single line by collapsing all whitespace inside it.
    opening = re.search(
        r"(Pengadilan\s+Negeri\s+.+?berikut\s+dalam\s+perkara\s+Terdakwa\s*:?)",
        text,
        flags=re.IGNORECASE | re.DOTALL
    )
    if opening:
        text = text.replace(opening.group(1), " ".join(opening.group(1).split()))

    # Tighten "label   : value" in the defendant identity section.
    text = re.sub(r'\s{2,}:\s*', ': ', text)
    # Join a line break followed by whitespace into one space (e.g. wrapped addresses).
    text = re.sub(r'\n\s+', ' ', text)
    # Collapse runs of two or more whitespace characters not directly after a colon.
    text = re.sub(r'(?<!:)\s{2,}', ' ', text)

    # Put "Pengadilan Negeri tersebut" (with or without ';') on its own line and
    # start the text that follows it on a new line.
    text = re.sub(
        r"(Pengadilan\s+Negeri\s+tersebut;?)(\s*)([^.\n]+)",
        r"\n\1\n\3",
        text,
        flags=re.IGNORECASE
    )
    return text


def _merge_numbered_items(lines):
    """Join wrapped lines between a "Pekerjaan:" line and "Pengadilan Negeri tersebut"."""
    merged = []
    inside_block = False
    pending = ""

    def flush():
        nonlocal pending
        if pending:
            merged.append(pending.strip() + '\n')
            pending = ""

    for line in lines:
        stripped = line.strip()

        # A "Pekerjaan:" line opens the block.
        if re.match(r'^Pekerjaan\s*:', stripped, flags=re.IGNORECASE):
            flush()  # keep any unfinished item in front of this line
            inside_block = True
            merged.append(line)
            continue

        # "Pengadilan Negeri tersebut" closes the block.
        if inside_block and re.match(r'^Pengadilan\s+Negeri\s+tersebut;?', stripped, flags=re.IGNORECASE):
            flush()
            merged.append(line)
            inside_block = False
            continue

        if not inside_block:
            merged.append(line)
            continue

        if re.match(r'^\d+\.\s', stripped):
            # "<number>. " starts a new item.
            flush()
            pending = stripped
            if pending.endswith(';'):
                flush()
        else:
            # Continuation line: keep collecting until ';' or '.' shows up.
            # (The original test `";" or "." in striped` was always true.)
            pending += " " + stripped
            if ";" in stripped or "." in stripped:
                flush()

    flush()
    return merged


def _merge_dash_bullets(lines):
    """Join wrapped "- " bullets between "Pengadilan Negeri tersebut;" and "Setelah mendengar"."""
    merged = []
    inside_block = False
    pending = ""  # non-empty exactly while a bullet is still being collected

    def flush():
        nonlocal pending
        if pending:
            merged.append(pending.strip() + '\n')
            pending = ""

    for line in lines:
        stripped = line.strip()

        # The block opens at "Pengadilan Negeri tersebut;" (semicolon required here).
        if stripped.lower().startswith("pengadilan negeri tersebut;"):
            flush()
            inside_block = True
            merged.append(line)
            continue

        if not inside_block:
            merged.append(line)
            continue

        # The block closes at a line starting with "Setelah mendengar".
        if re.match(r'^Setelah mendengar', stripped, flags=re.IGNORECASE):
            flush()
            inside_block = False
            merged.append(line)
            continue

        if stripped.startswith("- "):
            flush()
            pending = stripped
        elif pending:
            pending += " " + stripped
        else:
            # Lines are written straight to the result, so nothing is lost when
            # the closing "Setelah mendengar" line never appears.
            merged.append(line)
            continue

        if stripped.endswith((";", ".")):
            flush()

    flush()
    return merged


def _has_sentence_end(stripped):
    """Return True if the line holds a terminator that is not part of a known abbreviation."""
    for hit in _SENTENCE_END.finditer(stripped):
        if hit.group() != ".":
            return True
        dot = hit.start()
        # Expand to the whole dotted token around this dot, e.g. "S.H." or "Jl.".
        left = dot
        while left > 0 and _TOKEN_CHAR.match(stripped[left - 1]):
            left -= 1
        right = dot + 1
        while right < len(stripped) and _TOKEN_CHAR.match(stripped[right]):
            right += 1
        token = stripped[left:right]
        prefix = stripped[left:dot + 1]
        if token not in _ABBREVIATIONS and prefix not in _ABBREVIATIONS:
            return True
    return False


def _merge_sentences(lines):
    """Join wrapped lines into sentences and list items after "Pengadilan Negeri tersebut;"."""
    merged = []
    pending = ""
    started = False
    list_mode = None  # "ordered", "unordered" or None

    def flush():
        nonlocal pending
        if pending:
            merged.append(pending.strip() + '\n')
            pending = ""

    for line in lines:
        stripped = line.strip()

        # Everything up to and including the marker line is copied unchanged.
        if not started:
            if stripped == "Pengadilan Negeri tersebut;":
                started = True
            merged.append(line)
            continue

        if re.match(r'^\d+\.', stripped):
            # Numbered item: "1.", "2.", ...
            flush()
            pending = stripped
            list_mode = "ordered"
        elif list_mode == "ordered":
            pending += " " + stripped
        elif re.match(r'^[-\uf0b7]\s+', stripped):
            # Bullet item introduced by '-' or the U+F0B7 glyph.
            flush()
            pending = stripped
            list_mode = "unordered"
        elif list_mode == "unordered":
            pending += " " + stripped
        elif re.match(r'^Ad\.\d+\.', stripped, re.IGNORECASE):
            # Headings such as "Ad.1." always stand on their own line.
            flush()
            merged.append(stripped + '\n')
            continue
        else:
            # Plain text: collect until a real sentence end is seen.
            pending += " " + stripped
            if _has_sentence_end(stripped):
                flush()
            continue

        # List items end at the first terminator on the current line.
        if _SENTENCE_END.search(stripped):
            flush()
            list_mode = None

    flush()
    return merged


def final_output(pdf_folder="pdf/", txt_folder="txt/"):
    """
    Convert every PDF in pdf_folder to text and write three files per PDF:
    - txt_folder/analysis/"analysis <name>.txt": the cleaned text
    - txt_folder/raw/"raw <name>.txt": the text after list items were joined
    - txt_folder/final/"final <name>.txt": the text after sentences were joined

    Args:
        pdf_folder: Folder that is scanned for files ending in ".pdf"
            (case-insensitive).
        txt_folder: Folder that receives the analysis/, raw/ and final/
            sub-folders; they are created when missing.

    A PDF that cannot be opened or whose text cannot be extracted is reported
    with a "Skipping ..." message and does not stop the remaining files.
    """
    raw_folder = os.path.join(txt_folder, "raw")
    final_folder = os.path.join(txt_folder, "final")
    analysis_folder = os.path.join(txt_folder, "analysis")
    for folder in (analysis_folder, raw_folder, final_folder):
        os.makedirs(folder, exist_ok=True)

    # Sorted so files are processed and reported in a stable order.
    for filename in sorted(os.listdir(pdf_folder)):
        if not filename.lower().endswith(".pdf"):
            continue

        pdf_path = os.path.join(pdf_folder, filename)
        base = os.path.splitext(filename)[0]
        raw_txt_path = os.path.join(raw_folder, "raw " + base + ".txt")
        final_txt_path = os.path.join(final_folder, "final " + base + ".txt")
        analysis_txt_path = os.path.join(analysis_folder, "analysis " + base + ".txt")

        # Opening and page extraction are both guarded so one bad PDF is skipped.
        try:
            text = _extract_text(pdf_path)
        except Exception as e:
            print(f"Skipping {filename}: cannot read PDF ({e})")
            continue

        # Stage 1: cleaned text.
        text = _clean_text(text)
        with open(analysis_txt_path, 'w', encoding='utf-8') as f:
            f.write(text)

        # Stage 2: numbered items after the identity block, then dash bullets.
        lines = _merge_numbered_items(_split_lines(text))
        lines = _merge_dash_bullets(lines)
        with open(raw_txt_path, "w", encoding="utf-8") as f:
            f.writelines(lines)

        # Stage 3: sentences and list items after the marker line.
        lines = _merge_sentences(lines)
        with open(final_txt_path, "w", encoding="utf-8") as f:
            f.writelines(lines)
        print(f"Processed {filename} into {raw_txt_path} and {final_txt_path}")

    print(f"Processed PDFs from '{pdf_folder}' into '{raw_folder}' and '{final_folder}'.")

In [None]:
# Script entry point: convert the PDFs in "pdf/" and write the results under "txt/".
if __name__ == "__main__":
    final_output(pdf_folder="pdf/", txt_folder="txt/")