### Import Libraries

In [38]:
### Import Libraries - Updated with latest LangChain packages
import re
import os
import textwrap
from typing import List, Dict, Any

# Document loading and processing
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document

# Vector store and embeddings
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

# LLM integration
from langchain_openai import ChatOpenAI
from langchain_community.llms import Ollama

# Retrieval and chains
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.retrievers import ContextualCompressionRetriever
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

#### Setup API Key

In [None]:
# Provide the OpenAI API key without embedding it in the notebook.
# SECURITY: the key that used to be hardcoded on this line has been exposed
# to anyone with access to the notebook and should be revoked and rotated.
if not os.environ.get("OPENAI_API_KEY"):
    # Local import so this cell only needs `os` from the import cell.
    from getpass import getpass

    # Prompt interactively; the value is not echoed or stored in the notebook.
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

#### Ekstraksi Teks dari PDF

In [None]:
def load_pdf(pdf_path):
    """Load a PDF and return the text of all pages as one string.

    Pages are read with PyPDFLoader and concatenated in the order returned,
    each page's text followed by a newline.
    """
    page_docs = PyPDFLoader(pdf_path).load()

    # One newline-terminated piece per page, joined into the full text
    return "".join(page_doc.page_content + "\n" for page_doc in page_docs)

In [None]:
def preprocess_text(text):
    """Clean UU text and mark BAB/Pasal headings so they can be detected.

    The substitutions are applied in order: drop the repeated
    "PRESIDEN REPUBLIK INDONESIA" header, drop "- <number> -" page markers,
    collapse every whitespace run (including line breaks) into one space, and
    finally insert a newline in front of each "BAB <roman>" and "Pasal <n>".
    """
    substitutions = (
        # Page header and page-number footer
        (r'PRESIDEN\s+REPUBLIK\s+INDONESIA', ''),
        (r'-\s*\d+\s*-', ''),
        # Whitespace normalisation
        (r'\s+', ' '),
        # Put BAB and Pasal on their own line
        (r'(?<!\n)(BAB\s+[IVXLCDM]+)', r'\n\1'),
        (r'(?<!\n)(Pasal\s+\d+)', r'\n\1'),
    )

    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

In [None]:
# Try load_pdf and preprocess_text on the sample path
pdf_path = "../data/UU Nomor 13 Tahun 2003.pdf"
try:
    raw_text = load_pdf(pdf_path)
except (FileNotFoundError, ValueError):
    # PyPDFLoader rejects a path that is neither an existing file nor a URL
    # with ValueError, so FileNotFoundError alone would not catch a missing PDF.
    print(f"File tidak ditemukan: {pdf_path}")
else:
    preprocessed_text = preprocess_text(raw_text)
    # Show the total length plus the first and last 100 characters as a sample
    print(f"Berhasil memuat PDF! Total karakter: {len(preprocessed_text)}")
    print(f"Sample awal teks:\n{preprocessed_text[:100]}...")
    print(f"Sample akhir teks:\n...{preprocessed_text[-100:]}")

#### Fungsi Ekstraksi Struktur UU

In [39]:
def chunk_uu_with_recursive_splitter(text: str, min_words: int = 400, max_words: int = 800, overlap: int = 50):
    """
    Split UU text into pasal-level Documents, sub-splitting pasal that are too long.

    The text is preprocessed, cut in front of every "Pasal <n>" heading, and any
    segment with more than ``max_words`` words is divided further with
    RecursiveCharacterTextSplitter. Each resulting Document carries BAB and
    pasal metadata.

    Args:
        text (str): UU text to be split
        min_words (int): Kept for interface compatibility; not enforced
            (short pasal stay as their own chunk)
        max_words (int): Word count above which a pasal segment is sub-split
        overlap (int): Approximate number of overlapping words between sub-chunks

    Returns:
        List[Document]: List of document chunks
    """
    text = preprocess_text(text)

    # Cut in front of every "Pasal <n>" heading (zero-width split keeps the heading)
    pasal_texts = re.split(r'(?=\nPasal\s+\d+\s+)', text)

    # The splitter counts characters, so word limits are converted with the
    # estimate 1 word = 6 characters.
    chars_per_word = 6
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=max_words * chars_per_word,
        chunk_overlap=overlap * chars_per_word,
        length_function=len,
        separators=["\n\n", "\n", ". ", " ", ""]
    )

    # The title is a run of ALL-CAPS words; the (?![a-z]) guards stop it from
    # swallowing the first capital of a following mixed-case word such as
    # "Bagian" or "Pasal".
    bab_pattern = re.compile(
        r'BAB\s+([IVXLCDM]+)\s+([A-Z]+(?![a-z])(?:[\s,/.&-]+[A-Z]+(?![a-z]))*)'
    )
    pasal_pattern = re.compile(r'Pasal\s+(\d+)')

    documents = []
    current_bab = "BAB I"
    current_bab_title = "KETENTUAN UMUM"  # Default title

    for i, pasal_text in enumerate(pasal_texts):
        stripped = pasal_text.strip()
        if not stripped:
            continue

        pasal_match = pasal_pattern.search(pasal_text)
        bab_match = bab_pattern.search(pasal_text)

        pending_bab = None
        if bab_match:
            found_bab = (f"BAB {bab_match.group(1)}", bab_match.group(2).strip())
            if pasal_match and pasal_match.start() < bab_match.start():
                # The BAB header trails this pasal's text: because segments are
                # cut in front of "Pasal", the header of the NEXT chapter ends
                # up at the end of the current segment. Apply it only after
                # this pasal has been labelled.
                pending_bab = found_bab
            else:
                current_bab, current_bab_title = found_bab
                # A segment that is only a BAB header produces no document
                if len(stripped) < 100 and not pasal_match:
                    continue

        pasal_nomor = f"Pasal {pasal_match.group(1)}" if pasal_match else f"Bagian {i}"
        word_count = len(pasal_text.split())

        if word_count > max_words:
            # Pasal is too long: sub-split and number the parts
            chunks = text_splitter.create_documents([pasal_text])
            total_chunks = len(chunks)
            for j, chunk in enumerate(chunks, start=1):
                chunk.metadata = {
                    "bab_nomor": current_bab,
                    "bab_judul": current_bab_title,
                    "pasal_nomor": pasal_nomor,
                    "chunk": f"{j}/{total_chunks}",
                    "source": "UU No. 13 Tahun 2003",
                    "word_count": len(chunk.page_content.split()),
                    "full_reference": f"{current_bab} {current_bab_title} - {pasal_nomor} (Bagian {j}/{total_chunks})"
                }
                documents.append(chunk)
        else:
            # Pasal fits in a single chunk
            documents.append(
                Document(
                    page_content=pasal_text,
                    metadata={
                        "bab_nomor": current_bab,
                        "bab_judul": current_bab_title,
                        "pasal_nomor": pasal_nomor,
                        "chunk": "1/1",
                        "source": "UU No. 13 Tahun 2003",
                        "word_count": word_count,
                        "full_reference": f"{current_bab} {current_bab_title} - {pasal_nomor}"
                    }
                )
            )

        if pending_bab is not None:
            current_bab, current_bab_title = pending_bab

    return documents


In [None]:
def _make_uu_document(content, bab_nomor, bab_judul, pasal_nomor, ayat_nomor=None, point=None):
    """Build one Document with BAB/pasal metadata, plus ayat/point when given."""
    metadata = {
        "bab_nomor": f"BAB {bab_nomor}",
        "bab_judul": bab_judul,
        "pasal_nomor": f"Pasal {pasal_nomor}",
    }
    full_reference = f"BAB {bab_nomor} {bab_judul} - Pasal {pasal_nomor}"
    if ayat_nomor is not None:
        metadata["ayat_nomor"] = ayat_nomor
        full_reference += f" Ayat ({ayat_nomor})"
    if point is not None:
        metadata["point"] = point
        full_reference += f" Point {point}"
    metadata["source"] = "UU No. 13 Tahun 2003"
    metadata["full_reference"] = full_reference
    return Document(page_content=content, metadata=metadata)


def process_uu_ketenagakerjaan(text):
    """Parse UU text into Documents at BAB / pasal / ayat / point granularity.

    The text is whitespace-normalised, split into BAB sections, then into
    pasal, then into numbered ayat "(n)", and finally into lettered points
    "a." where present. The smallest unit found becomes one Document.

    If fewer than 5 documents result, the structure is considered undetected
    and the text is split with a plain RecursiveCharacterTextSplitter instead.

    NOTE(review): in-text cross references such as "Pasal 5" or "ayat (1)"
    match the same patterns as real headings and will start a new unit; text
    that precedes the first ayat or first point of a unit is not emitted.

    Args:
        text: Raw UU text.

    Returns:
        List of Document objects.
    """
    documents = []

    # Normalise whitespace, then put every BAB and Pasal on its own line
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'(?<!\n)BAB\s+([IVXLCDM]+)', r'\nBAB \1', text)
    text = re.sub(r'(?<!\n)Pasal\s+(\d+)', r'\nPasal \1', text)

    # Structure patterns.
    # The BAB title is a run of ALL-CAPS words; the (?![a-z]) guards keep it
    # from consuming the "P" of the following "Pasal", which would otherwise
    # hide the first pasal of every BAB from pasal_pattern.
    bab_pattern = r'BAB\s+([IVXLCDM]+)\s*[-—]*\s*([A-Z]+(?![a-z])(?:[\s,/.&-]+[A-Z]+(?![a-z]))*)'
    pasal_pattern = r'Pasal\s+(\d+)\s*'
    ayat_pattern = r'\((\d+)\)\s*(.*?)(?=\(\d+\)|Pasal\s+\d+|BAB\s+[IVXLCDM]+|$)'
    # \b requires the label to be a stand-alone letter, so the last letter of
    # a sentence ("... pekerja.") is not mistaken for a list point.
    point_pattern = r'\b([a-z])\.\s+(.*?)(?=\b[a-z]\.\s|$)'

    bab_matches = list(re.finditer(bab_pattern, text, re.DOTALL))

    for i, bab_match in enumerate(bab_matches):
        bab_nomor = bab_match.group(1).strip()
        bab_judul = bab_match.group(2).strip()

        # A BAB runs until the next BAB header, or to the end of the text
        bab_start = bab_match.end()
        bab_end = bab_matches[i + 1].start() if i + 1 < len(bab_matches) else len(text)
        bab_content = text[bab_start:bab_end]

        pasal_matches = list(re.finditer(pasal_pattern, bab_content))

        for j, pasal_match in enumerate(pasal_matches):
            pasal_nomor = pasal_match.group(1).strip()

            # Offsets here are relative to bab_content, not to the full text
            pasal_start = pasal_match.end()
            if j + 1 < len(pasal_matches):
                pasal_end = pasal_matches[j + 1].start()
            else:
                pasal_end = len(bab_content)
            pasal_content = bab_content[pasal_start:pasal_end].strip()

            # A pasal is either a list of numbered ayat or direct content
            ayat_matches = list(re.finditer(ayat_pattern, pasal_content, re.DOTALL))
            if ayat_matches:
                units = [(m.group(1).strip(), m.group(2).strip()) for m in ayat_matches]
            else:
                units = [(None, pasal_content)]

            for ayat_nomor, unit_content in units:
                point_matches = list(re.finditer(point_pattern, unit_content))
                if point_matches:
                    # One document per lettered point
                    for point_match in point_matches:
                        documents.append(
                            _make_uu_document(
                                point_match.group(2).strip(),
                                bab_nomor,
                                bab_judul,
                                pasal_nomor,
                                ayat_nomor=ayat_nomor,
                                point=point_match.group(1).strip(),
                            )
                        )
                else:
                    # Ayat (or whole pasal) without points
                    documents.append(
                        _make_uu_document(
                            unit_content,
                            bab_nomor,
                            bab_judul,
                            pasal_nomor,
                            ayat_nomor=ayat_nomor,
                        )
                    )

    # Fallback when (almost) no structure was detected: standard text splitter
    if len(documents) < 5:
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            separators=["\n\n", "\n", " ", ""]
        )
        documents = text_splitter.create_documents([text])

    return documents

In [None]:
def debug_find_bab(text):
    """Print the line count of *text*, then the repr of each line containing "BAB"."""
    all_lines = text.splitlines()
    print(len(all_lines))

    # repr() makes stray whitespace around the heading visible
    bab_lines = [line for line in all_lines if "BAB" in line]
    for bab_line in bab_lines:
        print(repr(bab_line))
# Inspect how BAB headings appear in the preprocessed text
debug_find_bab(preprocessed_text)


In [None]:
# Chunk the preprocessed text and report how many documents were produced
documents = chunk_uu_with_recursive_splitter(preprocessed_text)
print(f"Total dokumen yang dihasilkan: {len(documents)}")

# NOTE(review): the disabled alternative below calls parse_uu_ketenagakerjaan,
# while the parser defined in this notebook is process_uu_ketenagakerjaan, and
# it prints len(documents) rather than len(documents1) — fix before enabling.
# documents1 = parse_uu_ketenagakerjaan(preprocessed_text)
# print(f"Total dokumen yang dihasilkan: {len(documents)}")


In [None]:
# Preview up to the first 100 documents: first 100 characters of content,
# content length in characters, and metadata
for i, doc in enumerate(documents[:100]):
    print(f"\nDokumen #{i+1}:")
    print(f"Content: {doc.page_content[:100]}...")
    print(f"panjang konten: {len(doc.page_content)}")
    print(f"Metadata: {doc.metadata}")


In [40]:
def create_vector_store(documents, persist_dir: str = "./chroma_db") -> Chroma:
    """
    Embed documents into a Chroma vector store and persist it to disk.

    Args:
        documents: List of processed documents
        persist_dir: Directory the vector store is persisted to

    Returns:
        Chroma: Vector store instance
    """
    # Documents are embedded with OpenAI's text-embedding-3-small model
    vector_store = Chroma.from_documents(
        documents=documents,
        embedding=OpenAIEmbeddings(model="text-embedding-3-small"),
        persist_directory=persist_dir,
    )

    # Write the store to persist_dir
    vector_store.persist()
    print(f"Vector store saved to {persist_dir}")

    return vector_store

# Keep the store in `db`: the retriever and chain cells further down read that name.
db = create_vector_store(documents, persist_dir="./chroma_db")

Vector store saved to ./chroma_db


<langchain_community.vectorstores.chroma.Chroma at 0x273dd603790>

### Template Prompt untuk RAG

In [None]:
# Prompt template for the RAG chains. {context} is filled with the retrieved
# chunks (including their references) and {question} with the user's query.
prompt_template = """
        Kamu adalah asisten hukum yang ahli tentang regulasi ketenagakerjaan di Indonesia, khususnya UU No. 13 Tahun 2003.
        Berdasarkan konteks berikut, jawablah pertanyaan dengan relevan, akurat, dan jelas.
        Sertakan nomor pasal jika tersedia dalam konteks.
        Jika informasi tidak tersedia dalam konteks, cukup jawab bahwa kamu tidak menemukan informasi yang relevan dalam regulasi.
        Jika ada informasi yang tidak relevan, abaikan informasi tersebut.
        Informasi di konteks merupakan bagian dari UU No. 13 Tahun 2003.
        KONTEKS:
        {context}


        PERTANYAAN:
        {question}

        JAWABAN:
        """

In [None]:
# Show the raw prompt template for inspection
print("Template prompt untuk RAG:")
print(prompt_template)

## 8. Format Referensi Pasal untuk Output

In [None]:
def format_reference(source_doc):
    """Build a bracketed pasal reference string from a document's metadata.

    When 'pasal_nomor', 'bab_nomor' and 'bab_judul' are all present the
    reference names the pasal and BAB, followed by " Ayat (n)" if the metadata
    has an 'ayat_nomor' other than "semua". Otherwise a generic reference to
    the law is returned.
    """
    meta = source_doc.metadata

    # Fallback for incomplete metadata
    required_keys = ('pasal_nomor', 'bab_nomor', 'bab_judul')
    if not all(key in meta for key in required_keys):
        return "[UU No. 13 Tahun 2003]"

    parts = [f"[{meta['pasal_nomor']} UU No. 13 Tahun 2003 {meta['bab_nomor']} - {meta['bab_judul']}]"]
    if 'ayat_nomor' in meta and meta['ayat_nomor'] != "semua":
        parts.append(f" Ayat ({meta['ayat_nomor']})")
    return "".join(parts)

In [None]:
# Try format_reference on two sample documents.
# Indexing documents[10] assumes at least 11 documents were produced.
print("Contoh format referensi:")
print(format_reference(documents[0]))
print(format_reference(documents[10]))

## 9. Setup QA System

In [None]:
def build_rag_chain(vectorstore, model_name="gpt-3.5-turbo", temperature=0):
    """
    Assemble an LCEL RAG chain backed by an OpenAI chat model.

    The chain retrieves the 7 most similar chunks, formats them with
    ``format_docs_with_references``, fills ``prompt_template`` and returns the
    model output as a string. Both of those module-level names are looked up
    when this function runs, so they must be defined before it is called.

    Args:
        vectorstore: Vector store to use for retrieval
        model_name: Name of the model to use
        temperature: Temperature setting for the model

    Returns:
        Chain: Query processing chain
    """
    chat_model = ChatOpenAI(temperature=temperature, model_name=model_name)
    doc_retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 7})
    rag_prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

    # The incoming question is passed through unchanged and also drives retrieval
    chain_inputs = {
        "context": doc_retriever | format_docs_with_references,
        "question": RunnablePassthrough(),
    }

    return chain_inputs | rag_prompt | chat_model | StrOutputParser()

In [None]:
def build_ollama_rag_chain(vectorstore, model_name="mistral"):
    """
    Assemble an LCEL RAG chain backed by an Ollama model at temperature 0.

    The chain retrieves the 7 most similar chunks, formats them with
    ``format_docs_with_references``, fills ``prompt_template`` and returns the
    model output as a string. Both of those module-level names are looked up
    when this function runs, so they must be defined before it is called.

    Args:
        vectorstore: Vector store to use for retrieval
        model_name: Name of the Ollama model to use

    Returns:
        Chain: Query processing chain
    """
    ollama_model = Ollama(model=model_name, temperature=0)
    doc_retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 7})
    rag_prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

    # The incoming question is passed through unchanged and also drives retrieval
    chain_inputs = {
        "context": doc_retriever | format_docs_with_references,
        "question": RunnablePassthrough(),
    }

    return chain_inputs | rag_prompt | ollama_model | StrOutputParser()

In [None]:
# Build RAG chains: one OpenAI-backed and two Ollama-backed (mistral, llama3.2).
# Requires `db` (the vector store) and `format_docs_with_references` to be
# defined already; the latter is defined in a later cell of this notebook.
openai_chain = build_rag_chain(db)
mistral_chain = build_ollama_rag_chain(db, model_name="mistral")
llama_chain = build_ollama_rag_chain(db, model_name="llama3.2")

In [None]:
from ipywidgets import widgets
from IPython.display import display, clear_output

def ask_question(b):
    """Button callback: answer the question typed in the `text` widget.

    Output is written to the `out` widget. The argument `b` (the clicked
    button) is not used.
    """
    with out:
        clear_output()
        query = text.value.strip()

        if not query:
            print("Please enter a question")
            return

        print("Searching for an answer...")

        # Choose which chain to use (OpenAI or Ollama)
        answer = mistral_chain.invoke(query)  # Change to openai_chain if preferred

        # Display the answer wrapped at 100 columns
        wrapped_answer = textwrap.fill(answer, width=100)
        print(f"Answer:\n{wrapped_answer}\n")

# Question input widget (read by ask_question through the global name `text`)
text = widgets.Textarea(
    value='',
    placeholder='Enter your question about UU Ketenagakerjaan',
    description='Question:',
    disabled=False,
    layout=widgets.Layout(width='80%', height='80px')
)

# Button that triggers the answer search when clicked
button = widgets.Button(description="Ask")
button.on_click(ask_question)

# Output area (ask_question prints into it through the global name `out`)
out = widgets.Output()

# Display all widgets
display(text, button, out)

In [None]:
# PromptTemplate built from prompt_template; used by the LLMChain-based chains below
custom_prompt_template = PromptTemplate(
    input_variables=["context", "question"],
    template= prompt_template
)


In [None]:
from langchain.chains import RetrievalQA
from langchain.chains.qa_with_sources import load_qa_with_sources_chain

# Initialise the OpenAI chat model and a similarity retriever over `db`
# that returns the 7 most similar chunks
llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo")
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 7})

# Helper that embeds each chunk's reference into the context string
def format_docs_with_references(docs):
    """Render documents as "[reference]" + content blocks separated by blank lines.

    The reference comes from metadata key "full_reference"; documents without
    it are labelled "Tidak diketahui". An empty input yields an empty string.
    """
    blocks = (
        f"[{doc.metadata.get('full_reference', 'Tidak diketahui')}]\n{doc.page_content}\n\n"
        for doc in docs
    )
    return "".join(blocks)



# Sample query used by the retrieval cells below
query = "bagaimana ketentuan mengenai jam kerja lembur dalam UU No. 13 Tahun 2003?"
# result = db.similarity_search(query, k=5)


In [None]:
# Initialise the Mistral and Llama 3.2 models via Ollama, both at temperature 0
llm_mistral = Ollama(model="mistral", temperature=0)
llm_llama = Ollama(model="llama3.2", temperature=0)

In [None]:
# LLMChain is used below but is not imported by any of the visible import
# cells, so it is imported here to keep this cell runnable.
from langchain.chains import LLMChain

# OpenAI GPT chain
qa_chain_openai = LLMChain(llm=llm, prompt=custom_prompt_template)

# Mistral (Ollama) chain
qa_chain_mistral = LLMChain(llm=llm_mistral, prompt=custom_prompt_template)

# Llama (Ollama) chain
qa_chain_llama = LLMChain(llm=llm_llama, prompt=custom_prompt_template)


In [None]:
# Retrieve the chunks for the sample query and render them as prompt context.
# NOTE(review): get_relevant_documents is reportedly deprecated in recent
# LangChain releases in favour of retriever.invoke — confirm against the
# installed version.
docs = retriever.get_relevant_documents(query)
formatted_context = format_docs_with_references(docs)

In [None]:
# # Jalankan GPT
# response_openai = qa_chain_openai.run({
#     "context": formatted_context,
#     "question": query
# })



In [None]:
# # Jalankan Mistral
# response_mistral = qa_chain_mistral.run({
#     "context": formatted_context,
#     "question": query
# })


In [None]:
# print("=== Jawaban dari GPT-3.5 (OpenAI) ===")
# print(response_openai)

# print("\n=== Jawaban dari Mistral (Ollama) ===")
# print(response_mistral)

# print("\n=== Referensi yang digunakan ===")
# for doc in docs:
#     print(f"- {doc.metadata['full_reference']}")


## 15. Widget Interaktif untuk Query

In [None]:
# Interactive question widget for use inside Jupyter Notebook (requires ipywidgets)
from ipywidgets import widgets
from IPython.display import display, clear_output
import textwrap  # pastikan sudah mengimpor textwrap jika digunakan

# Main handler for questions entered by the user
def ask_question(b):
    """Button callback: retrieve context, ask the Mistral chain, and show the result.

    Reads the question from the `text` widget and prints into the `out`
    widget: the wrapped answer, the wrapped context that was sent to the
    model, and one reference line per retrieved document. The argument `b`
    (the clicked button) is not used.
    """
    with out:
        clear_output()
        query = text.value.strip()

        if query == "":
            print("Mohon masukkan pertanyaan")
            return

        print("Mencari jawaban...")

        # Fetch the relevant documents from the retriever
        docs = retriever.get_relevant_documents(query)

        # Render the documents as context together with their references
        formatted_context = format_docs_with_references(docs)

        # Run the Mistral (Ollama) chain with the context and the question
        response_mistral = qa_chain_mistral.run({
            "context": formatted_context,
            "question": query
        })

        # Show the answer
        print(f"Jawaban dari Mistral (Ollama):\n{textwrap.fill(response_mistral, width=100)}\n")

        # Show the references of the documents that were used
        print("\n=== Referensi yang digunakan ===")
        print(f"{textwrap.fill(formatted_context, width=100)}\n")
        for doc in docs:
            # Same default as format_docs_with_references, so a document
            # without "full_reference" metadata does not raise KeyError.
            reference = doc.metadata.get("full_reference", "Tidak diketahui")
            wrapped_content = textwrap.fill(reference, width=100)
            print(f"- {wrapped_content}")

# Question input widget (read by ask_question through the global name `text`)
text = widgets.Textarea(
    value='',
    placeholder='Masukkan pertanyaan Anda tentang UU Ketenagakerjaan',
    description='Pertanyaan:',
    disabled=False,
    layout=widgets.Layout(width='80%', height='80px')
)

# Button that triggers the answer search when clicked
button = widgets.Button(description="Tanya")
button.on_click(ask_question)

# Output area (ask_question prints into it through the global name `out`)
out = widgets.Output()

# Display all widgets
display(text, button, out)
