In [1]:
# Mount Google Drive at /content/drive so later cells can use paths under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
 !pip install pdfplumber pandas

Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.7-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [6]:
# Simple Case Representation System with minimal dependencies
# First, install required packages
# pip install pdfplumber pandas

import os
import pandas as pd
import json
import re
from datetime import datetime
from typing import Dict, List, Optional
import pdfplumber
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

class SimpleCaseRepresentationSystem:
    def __init__(self, pdf_path: str, csv_path: str):
        """
        Initialize Simple Case Representation System

        Args:
            pdf_path: Path to PDF files directory
            csv_path: Path to save processed CSV files
        """
        self.pdf_path = pdf_path
        self.csv_path = csv_path
        self.cases_data = []

        # Indonesian stopwords (manual list)
        self.stopwords_id = {
            'dan', 'atau', 'yang', 'di', 'ke', 'dari', 'pada', 'dalam', 'untuk',
            'dengan', 'oleh', 'bahwa', 'adalah', 'akan', 'telah', 'dapat', 'bisa',
            'ini', 'itu', 'sini', 'situ', 'sana', 'mereka', 'kami', 'kita', 'saya',
            'anda', 'dia', 'ia', 'nya', 'mu', 'ku', 'se', 'an', 'ter', 'ber', 'per',
            'a', 'the', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'
        }

        # Create output directory if not exists
        os.makedirs(csv_path, exist_ok=True)
        print(f"Initialized Case Representation System")
        print(f"PDF Path: {pdf_path}")
        print(f"CSV Path: {csv_path}")

    def extract_text_from_pdf(self, pdf_file_path: str) -> str:
        """
        Extract text from PDF file using pdfplumber

        Args:
            pdf_file_path: Path to PDF file

        Returns:
            Extracted text content
        """
        try:
            text = ""
            with pdfplumber.open(pdf_file_path) as pdf:
                for page_num, page in enumerate(pdf.pages):
                    try:
                        page_text = page.extract_text()
                        if page_text:
                            text += page_text + "\n"
                    except Exception as e:
                        print(f"Warning: Error extracting page {page_num+1} from {pdf_file_path}: {str(e)}")
                        continue
            return text
        except Exception as e:
            print(f"Error extracting text from {pdf_file_path}: {str(e)}")
            return ""

    def clean_text(self, text: str) -> str:
        """
        Clean and normalize text

        Args:
            text: Raw text

        Returns:
            Cleaned text
        """
        # Remove extra whitespace and normalize
        text = re.sub(r'\s+', ' ', text)
        text = text.strip()
        return text

    def extract_metadata(self, text: str, filename: str) -> Dict:
        """
        Extract metadata from court decision text

        Args:
            text: Full text of court decision
            filename: PDF filename

        Returns:
            Dictionary containing extracted metadata
        """
        metadata = {
            'case_id': filename.replace('.pdf', ''),
            'no_perkara': '',
            'tanggal': '',
            'jenis_perkara': '',
            'pasal': '',
            'pihak': '',
            'penggugat': '',
            'tergugat': '',
            'filename': filename
        }

        # Clean text for better pattern matching
        clean_text = self.clean_text(text)

        # Extract case number (Nomor Perkara) - Multiple patterns
        no_perkara_patterns = [
            r'(?:Nomor|No\.?|Perkara\s+Nomor)\s*:?\s*(\d+\/[A-Za-z\.\s]+\/\d{4})',
            r'(\d+\/[A-Za-z]+[\/\.][A-Za-z]*\/\d{4})',
            r'Nomor\s*:\s*([^\n]+)',
        ]

        for pattern in no_perkara_patterns:
            match = re.search(pattern, clean_text, re.IGNORECASE)
            if match:
                metadata['no_perkara'] = match.group(1).strip()
                break

        # Extract date - Multiple patterns
        date_patterns = [
            r'(\d{1,2})\s+(\w+)\s+(\d{4})',  # 15 Januari 2023
            r'(\d{1,2})[-/](\d{1,2})[-/](\d{4})',  # 15/01/2023
            r'(\d{4})[-/](\d{1,2})[-/](\d{1,2})',  # 2023/01/15
            r'tanggal\s+(\d{1,2})\s+(\w+)\s+(\d{4})',  # tanggal 15 Januari 2023
        ]

        for pattern in date_patterns:
            match = re.search(pattern, clean_text, re.IGNORECASE)
            if match:
                metadata['tanggal'] = match.group(0)
                break

        # Extract case type
        case_type_patterns = [
            r'(Perkara\s+Perdata)',
            r'(Perkara\s+Pidana)',
            r'(Pdt\.G[^/]*)',
            r'(Pid[^/]*)',
            r'(Perdata)',
            r'(Pidana)',
            r'(Tata\s+Usaha\s+Negara|TUN)',
            r'(Agama)',
        ]

        for pattern in case_type_patterns:
            match = re.search(pattern, clean_text, re.IGNORECASE)
            if match:
                metadata['jenis_perkara'] = match.group(1)
                break

        # Extract articles (Pasal)
        pasal_patterns = [
            r'Pasal\s+(\d+(?:\s+[a-zA-Z]+)*(?:\s+dan\s+\d+)*)',
            r'ps\.?\s+(\d+)',
            r'pasal\s+(\d+[^\n,\.]{0,50})'
        ]

        pasal_matches = []
        for pattern in pasal_patterns:
            matches = re.findall(pattern, clean_text, re.IGNORECASE)
            pasal_matches.extend(matches)

        if pasal_matches:
            # Remove duplicates and limit to first 5
            unique_pasal = list(set(pasal_matches[:5]))
            metadata['pasal'] = ', '.join(unique_pasal)

        # Extract parties
        pihak_patterns = [
            r'(Penggugat|Pemohon|Terdakwa)\s*:?\s*([^\n;]+?)(?=\n|;|Melawan|vs|Tergugat|Termohon)',
            r'(Tergugat|Termohon|Jaksa)\s*:?\s*([^\n;]+?)(?=\n|;|$)',
        ]

        parties = []
        for pattern in pihak_patterns:
            matches = re.findall(pattern, clean_text, re.IGNORECASE)
            for match in matches:
                role, name = match
                clean_name = name.strip()
                if len(clean_name) > 3:  # Minimum name length
                    parties.append(f"{role}: {clean_name}")

                    if role.lower() in ['penggugat', 'pemohon', 'terdakwa']:
                        metadata['penggugat'] = clean_name
                    elif role.lower() in ['tergugat', 'termohon']:
                        metadata['tergugat'] = clean_name

        metadata['pihak'] = ' | '.join(parties[:4])  # Limit to 4 parties

        return metadata

    def extract_key_content(self, text: str) -> Dict:
        """
        Extract key content from court decision

        Args:
            text: Full text of court decision

        Returns:
            Dictionary containing key content
        """
        content = {
            'ringkasan_fakta': '',
            'argumen_hukum': '',
            'putusan': '',
            'barang_bukti': ''
        }

        clean_text = self.clean_text(text)

        # Extract facts summary
        fakta_patterns = [
            r'(?:DUDUK\s+PERKARA|FAKTA[^:]*):?\s*([^A-Z\n]{100,800})',
            r'(?:Bahwa\s+pada\s+|Menimbang\s+bahwa)[^:]*([^A-Z\n]{100,600})',
            r'(?:KRONOLOGI|PERISTIWA)[^:]*:?\s*([^A-Z\n]{100,600})'
        ]

        for pattern in fakta_patterns:
            match = re.search(pattern, clean_text, re.IGNORECASE | re.DOTALL)
            if match:
                facts = match.group(1).strip()
                # Clean up facts
                facts = re.sub(r'\s+', ' ', facts)
                content['ringkasan_fakta'] = facts[:500]  # Limit length
                break

        # Extract legal arguments
        argumen_patterns = [
            r'(?:PERTIMBANGAN\s+HUKUM|MENIMBANG)[^:]*:?\s*([^A-Z\n]{100,800})',
            r'(?:Menimbang|Mempertimbangkan)[^:]*([^A-Z\n]{100,600})',
            r'(?:DASAR\s+HUKUM)[^:]*:?\s*([^A-Z\n]{100,600})'
        ]

        for pattern in argumen_patterns:
            match = re.search(pattern, clean_text, re.IGNORECASE | re.DOTALL)
            if match:
                argument = match.group(1).strip()
                argument = re.sub(r'\s+', ' ', argument)
                content['argumen_hukum'] = argument[:500]
                break

        # Extract decision
        putusan_patterns = [
            r'(?:MENGADILI|MEMUTUSKAN|PUTUSAN)[^:]*:?\s*([^A-Z\n]{50,400})',
            r'(?:MENYATAKAN|MENGHUKUM)[^:]*:?\s*([^A-Z\n]{50,400})',
        ]

        for pattern in putusan_patterns:
            match = re.search(pattern, clean_text, re.IGNORECASE | re.DOTALL)
            if match:
                decision = match.group(1).strip()
                decision = re.sub(r'\s+', ' ', decision)
                content['putusan'] = decision[:300]
                break

        # Extract evidence
        bukti_patterns = [
            r'(?:BARANG\s+BUKTI|ALAT\s+BUKTI)[^:]*:?\s*([^A-Z\n]{50,300})',
            r'(?:BUKTI[^:]*):?\s*([^A-Z\n]{50,200})',
        ]

        for pattern in bukti_patterns:
            match = re.search(pattern, clean_text, re.IGNORECASE | re.DOTALL)
            if match:
                evidence = match.group(1).strip()
                evidence = re.sub(r'\s+', ' ', evidence)
                content['barang_bukti'] = evidence[:200]
                break

        return content

    def simple_tokenize(self, text: str) -> List[str]:
        """
        Simple tokenization without NLTK

        Args:
            text: Input text

        Returns:
            List of tokens
        """
        # Convert to lowercase and split
        text = text.lower()
        # Remove punctuation and split
        tokens = re.findall(r'\b[a-zA-Z]+\b', text)
        # Remove stopwords
        clean_tokens = [token for token in tokens
                       if token not in self.stopwords_id and len(token) > 2]
        return clean_tokens

    def feature_engineering(self, text: str) -> Dict:
        """
        Perform simple feature engineering

        Args:
            text: Full text of court decision

        Returns:
            Dictionary containing engineered features
        """
        features = {}

        try:
            # Basic text statistics
            features['text_length'] = len(text)
            words = text.split()
            features['word_count'] = len(words)

            # Simple tokenization
            clean_tokens = self.simple_tokenize(text)
            features['clean_word_count'] = len(clean_tokens)

            # Bag of words (top 15 most common words)
            if clean_tokens:
                word_freq = Counter(clean_tokens)
                top_words = dict(word_freq.most_common(15))
                features['top_words'] = json.dumps(top_words)
            else:
                features['top_words'] = '{}'

            # Legal term frequency
            legal_terms = ['pasal', 'hukum', 'putusan', 'dakwaan', 'bukti',
                          'saksi', 'terdakwa', 'jaksa', 'hakim', 'pengadilan',
                          'perdata', 'pidana', 'penggugat', 'tergugat']

            legal_term_count = {}
            text_lower = text.lower()
            for term in legal_terms:
                count = text_lower.count(term)
                legal_term_count[term] = count

            features['legal_terms'] = json.dumps(legal_term_count)

            # Sentence count (approximation)
            sentences = re.split(r'[.!?]+', text)
            features['sentence_count'] = len([s for s in sentences if len(s.strip()) > 10])

        except Exception as e:
            print(f"Error in feature engineering: {str(e)}")
            # Default values
            features.update({
                'text_length': len(text),
                'word_count': len(text.split()),
                'clean_word_count': 0,
                'top_words': '{}',
                'legal_terms': '{}',
                'sentence_count': 0
            })

        return features

    def create_simple_qa_pairs(self, metadata: Dict, content: Dict) -> List[Dict]:
        """
        Create simple QA pairs from the case data

        Args:
            metadata: Case metadata
            content: Case content

        Returns:
            List of QA pairs
        """
        qa_pairs = []

        # QA pairs for metadata
        if metadata.get('no_perkara'):
            qa_pairs.append({
                'question': 'Apa nomor perkara ini?',
                'answer': metadata['no_perkara']
            })

        if metadata.get('tanggal'):
            qa_pairs.append({
                'question': 'Kapan tanggal putusan ini?',
                'answer': metadata['tanggal']
            })

        if metadata.get('jenis_perkara'):
            qa_pairs.append({
                'question': 'Apa jenis perkara ini?',
                'answer': metadata['jenis_perkara']
            })

        if metadata.get('pasal'):
            qa_pairs.append({
                'question': 'Pasal apa yang terlibat dalam perkara ini?',
                'answer': metadata['pasal']
            })

        if metadata.get('penggugat'):
            qa_pairs.append({
                'question': 'Siapa penggugat dalam perkara ini?',
                'answer': metadata['penggugat']
            })

        # QA pairs for content
        if content.get('ringkasan_fakta'):
            qa_pairs.append({
                'question': 'Apa ringkasan fakta perkara ini?',
                'answer': content['ringkasan_fakta'][:200]
            })

        if content.get('putusan'):
            qa_pairs.append({
                'question': 'Apa putusan dalam perkara ini?',
                'answer': content['putusan'][:200]
            })

        return qa_pairs

    def process_single_case(self, pdf_file_path: str) -> Optional[Dict]:
        """
        Process a single case file

        Args:
            pdf_file_path: Path to PDF file

        Returns:
            Dictionary containing all processed case data or None if failed
        """
        filename = os.path.basename(pdf_file_path)

        try:
            # Extract text
            text = self.extract_text_from_pdf(pdf_file_path)
            if not text or len(text) < 100:
                print(f"  ⚠️  Warning: No text or insufficient text extracted from {filename}")
                return None

            # Extract metadata
            metadata = self.extract_metadata(text, filename)

            # Extract key content
            content = self.extract_key_content(text)

            # Feature engineering
            features = self.feature_engineering(text)

            # Create QA pairs
            qa_pairs = self.create_simple_qa_pairs(metadata, content)

            # Combine all data
            case_data = {
                **metadata,
                **content,
                **features,
                'text_full': text[:1500],  # Limit full text to first 1500 chars
                'qa_pairs': json.dumps(qa_pairs, ensure_ascii=False),
                'processing_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            }

            return case_data

        except Exception as e:
            print(f"  ❌ Error processing {filename}: {str(e)}")
            return None

    def process_all_cases(self) -> None:
        """
        Process all PDF files in the directory
        """
        print(f"🔍 Checking PDF directory: {self.pdf_path}")

        if not os.path.exists(self.pdf_path):
            print(f"❌ PDF directory not found: {self.pdf_path}")
            return

        # Get all PDF files
        all_files = os.listdir(self.pdf_path)
        pdf_files = [f for f in all_files if f.lower().endswith('.pdf')]

        if not pdf_files:
            print(f"❌ No PDF files found in {self.pdf_path}")
            print(f"   Files found: {all_files[:10]}")  # Show first 10 files
            return

        print(f"📁 Found {len(pdf_files)} PDF files to process")
        print("─" * 50)

        successful_cases = 0
        failed_cases = 0

        for i, filename in enumerate(pdf_files, 1):
            pdf_file_path = os.path.join(self.pdf_path, filename)

            print(f"📄 Processing {i}/{len(pdf_files)}: {filename}")

            case_data = self.process_single_case(pdf_file_path)
            if case_data:
                self.cases_data.append(case_data)
                successful_cases += 1
                print(f"  ✅ Success")
            else:
                failed_cases += 1
                print(f"  ❌ Failed")

            # Progress update every 10 files
            if i % 10 == 0:
                print(f"📊 Progress: {i}/{len(pdf_files)} files processed")

        print("─" * 50)
        print(f"🎯 Processing Complete!")
        print(f"   ✅ Successful: {successful_cases}")
        print(f"   ❌ Failed: {failed_cases}")
        print(f"   📊 Total: {len(pdf_files)}")

    def save_results(self) -> None:
        """
        Save processed cases to both CSV and JSON formats
        """
        if not self.cases_data:
            print("❌ No data to save")
            return

        print("💾 Saving results...")

        # Save to CSV
        try:
            df = pd.DataFrame(self.cases_data)
            csv_file_path = os.path.join(self.csv_path, "cases.csv")
            df.to_csv(csv_file_path, index=False, encoding='utf-8')
            print(f"✅ CSV saved: {csv_file_path}")
            print(f"   📊 Shape: {df.shape}")
            print(f"   📝 Columns: {len(df.columns)}")
        except Exception as e:
            print(f"❌ Error saving CSV: {str(e)}")

        # Save to JSON
        try:
            json_file_path = os.path.join(self.csv_path, "cases.json")
            with open(json_file_path, 'w', encoding='utf-8') as f:
                json.dump(self.cases_data, f, ensure_ascii=False, indent=2)
            print(f"✅ JSON saved: {json_file_path}")
        except Exception as e:
            print(f"❌ Error saving JSON: {str(e)}")

    def print_sample_data(self, n: int = 2) -> None:
        """
        Print sample processed data

        Args:
            n: Number of samples to show
        """
        if not self.cases_data:
            print("❌ No data available")
            return

        print(f"\n📋 Sample Data (showing first {min(n, len(self.cases_data))} cases):")
        print("=" * 80)

        for i, case in enumerate(self.cases_data[:n]):
            print(f"\n🔍 Case {i+1}: {case.get('filename', 'Unknown')}")
            print(f"   📄 Case ID: {case.get('case_id', 'N/A')}")
            print(f"   📋 No Perkara: {case.get('no_perkara', 'N/A')}")
            print(f"   📅 Tanggal: {case.get('tanggal', 'N/A')}")
            print(f"   ⚖️  Jenis: {case.get('jenis_perkara', 'N/A')}")
            print(f"   📜 Pasal: {case.get('pasal', 'N/A')[:100]}...")
            print(f"   👥 Pihak: {case.get('pihak', 'N/A')[:100]}...")
            print(f"   📊 Words: {case.get('word_count', 0)}")

            if case.get('ringkasan_fakta'):
                print(f"   📝 Fakta: {case['ringkasan_fakta'][:150]}...")

    def get_summary_statistics(self) -> None:
        """
        Print comprehensive summary statistics
        """
        if not self.cases_data:
            print("❌ No data processed yet")
            return

        df = pd.DataFrame(self.cases_data)

        print("\n" + "=" * 60)
        print("📊 CASE REPRESENTATION SUMMARY REPORT")
        print("=" * 60)

        # Basic statistics
        print(f"📁 Total cases processed: {len(df)}")
        print(f"📄 Average text length: {df['text_length'].mean():.0f} characters")
        print(f"📝 Average word count: {df['word_count'].mean():.0f} words")
        print(f"🔤 Average clean word count: {df['clean_word_count'].mean():.0f} words")

        # Data completeness
        print(f"\n📋 Data Completeness:")
        print(f"   📋 Cases with case numbers: {df['no_perkara'].notna().sum()} ({df['no_perkara'].notna().sum()/len(df)*100:.1f}%)")
        print(f"   📅 Cases with dates: {df['tanggal'].notna().sum()} ({df['tanggal'].notna().sum()/len(df)*100:.1f}%)")
        print(f"   ⚖️  Cases with case types: {df['jenis_perkara'].notna().sum()} ({df['jenis_perkara'].notna().sum()/len(df)*100:.1f}%)")
        print(f"   📜 Cases with articles: {df['pasal'].notna().sum()} ({df['pasal'].notna().sum()/len(df)*100:.1f}%)")
        print(f"   👥 Cases with parties: {df['pihak'].notna().sum()} ({df['pihak'].notna().sum()/len(df)*100:.1f}%)")
        print(f"   📝 Cases with facts: {df['ringkasan_fakta'].notna().sum()} ({df['ringkasan_fakta'].notna().sum()/len(df)*100:.1f}%)")
        print(f"   ⚖️  Cases with decisions: {df['putusan'].notna().sum()} ({df['putusan'].notna().sum()/len(df)*100:.1f}%)")

        # Case type distribution
        if df['jenis_perkara'].notna().sum() > 0:
            print(f"\n⚖️  Case Type Distribution:")
            case_types = df['jenis_perkara'].value_counts()
            for case_type, count in case_types.head(10).items():
                print(f"   📁 {case_type}: {count} cases ({count/len(df)*100:.1f}%)")

        # Text statistics
        print(f"\n📊 Text Statistics:")
        print(f"   📄 Min text length: {df['text_length'].min():.0f} characters")
        print(f"   📄 Max text length: {df['text_length'].max():.0f} characters")
        print(f"   📝 Min word count: {df['word_count'].min():.0f} words")
        print(f"   📝 Max word count: {df['word_count'].max():.0f} words")

        print("\n" + "=" * 60)

# Helper functions for easy usage
def create_case_system(pdf_directory: str, output_directory: str = "./output") -> SimpleCaseRepresentationSystem:
    """
    Convenience factory for SimpleCaseRepresentationSystem.

    Args:
        pdf_directory: Path to directory containing PDF files
        output_directory: Path to save processed results

    Returns:
        A freshly constructed SimpleCaseRepresentationSystem
    """
    system = SimpleCaseRepresentationSystem(
        pdf_path=pdf_directory,
        csv_path=output_directory,
    )
    return system

def process_legal_documents(pdf_directory: str, output_directory: str = "./output",
                          show_samples: bool = True, sample_count: int = 2) -> SimpleCaseRepresentationSystem:
    """
    End-to-end pipeline: build the system, process every PDF, save the
    results, print the summary report and optionally a few sample cases.

    Args:
        pdf_directory: Path to directory containing PDF files
        output_directory: Path to save processed results
        show_samples: Whether to show sample processed data
        sample_count: Number of samples to show

    Returns:
        The SimpleCaseRepresentationSystem instance holding the processed data
    """
    system = create_case_system(pdf_directory, output_directory)

    # Extract -> persist -> report, in that order
    system.process_all_cases()
    system.save_results()
    system.get_summary_statistics()

    # Samples are only shown when requested and when there is data to show
    wants_samples = show_samples and system.cases_data
    if wants_samples:
        system.print_sample_data(sample_count)

    return system

# Example usage
if __name__ == "__main__":
    # Example 1: basic usage.
    # Input and output folders; change both to your own directories.
    pdf_path = "/content/drive/MyDrive/PENALARAN KOMPUTER FIX/PDF"
    output_path = "/content/drive/MyDrive/PENALARAN KOMPUTER FIX"

    print("🚀 Starting Case Representation System")
    print("=" * 50)

    # Run the complete pipeline and show three sample cases
    system = process_legal_documents(
        pdf_path,
        output_path,
        show_samples=True,
        sample_count=3,
    )

    print("\n🎉 Processing completed!")
    print("Files saved:")
    print(f"  📄 CSV: {output_path}/cases.csv")
    print(f"  📄 JSON: {output_path}/cases.json")

    # Example 2: advanced usage with custom processing (not executed).
    #
    #     # Initialize system manually
    #     system = SimpleCaseRepresentationSystem(pdf_path, output_path)
    #
    #     # Process specific file
    #     single_case = system.process_single_case("/path/to/specific/file.pdf")
    #     if single_case:
    #         print("Single case processed successfully!")
    #         print(f"Case ID: {single_case.get('case_id', 'N/A')}")
    #
    #     # Process all files
    #     system.process_all_cases()
    #
    #     # Get detailed statistics
    #     system.get_summary_statistics()
    #
    #     # Save results
    #     system.save_results()
    #
    #     # Show sample data
    #     system.print_sample_data(n=5)

🚀 Starting Case Representation System
Initialized Case Representation System
PDF Path: /content/drive/MyDrive/PENALARAN KOMPUTER FIX/PDF
CSV Path: /content/drive/MyDrive/PENALARAN KOMPUTER FIX
🔍 Checking PDF directory: /content/drive/MyDrive/PENALARAN KOMPUTER FIX/PDF
📁 Found 117 PDF files to process
──────────────────────────────────────────────────
📄 Processing 1/117: zaf04bfe9b9d478480e1313134323135.pdf
  ✅ Success
📄 Processing 2/117: zaf04be947f6488ea6c5303930393335.pdf
  ✅ Success
📄 Processing 3/117: zaf04bf54310144c9b17313033353231.pdf
  ✅ Success
📄 Processing 4/117: zaf04bf612a8aa8482bc313034313039.pdf
  ✅ Success
📄 Processing 5/117: zaf04bf96390c604a4ee313130343533.pdf
  ✅ Success
📄 Processing 6/117: zaf04bf90032b946ae2f313130323037.pdf
  ✅ Success
📄 Processing 7/117: zaf04bfec6df42b29505313134333237.pdf
  ✅ Success
📄 Processing 8/117: zaf04bf353acb24e813d313032313330.pdf
  ✅ Success
📄 Processing 9/117: zaf04b3a846431a0874a313231383334.pdf
  ✅ Success
📄 Processing 10/117: zaf04