In [23]:
# MODEL
# Provider / model identifiers for the embedding model and the LLM.
# get_embeddings() uses EMBEDDING_PROVIDER and EMBEDDING_MODEL_ID as defaults.
EMBEDDING_PROVIDER="aws"
EMBEDDING_MODEL_ID="amazon.titan-embed-text-v2:0"
LLM_PROVIDER="aws"
LLM_ID = "apac.amazon.nova-lite-v1:0"
MAX_TOKEN = 1500
TEMPERATURE = 0.0
STREAMING = True

# DATABASE
# Redis URL for the parent doc store, Chroma persistence directory and
# collection name, and the on-disk locations of the pickled BM25 index and the
# stopword list.
REDIS_URL="redis://localhost:6379"
PERSIST_DIR="./data/chroma_dbs"
COLLECTION_NAME="parent-child-chunk-v1"
BM25_INDEX_PATH = "./data/bm25-index/bm25-index-v1.pkl"
STOPWORDS_PATH = "./data/stopwords.txt"

# RETRIEVER
# NOTE(review): these values are not read by any cell visible in this part of
# the notebook; presumably consumed by a hybrid (BM25 + vector) retriever
# defined elsewhere - confirm.
RETRIEVAL_STRATEGY: str = "hybrid"
MAX_RESULTS: int = 2
BM25_SEARCH_K: int = 5
VECTOR_SEARCH_K: int = 80
BM25_WEIGHT: float = 0.3
VECTOR_WEIGHT: float = 0.7
RRF_CONSTANT: int = 5
SCORE_THRESHOLD: float = 0.01

# PROMPT
PROMPT_VERSION = "v1"
PROMPT_DIR = "./prompts"

# SPLITTER
# Chunk size / overlap handed to ParentChildSplitter by run_ingestion_pipeline().
PARENT_CHUNK_SIZE=2400
PARENT_CHUNK_OVERLAP=260
CHILD_CHUNK_SIZE=300
CHILD_CHUNK_OVERLAP=60

## Loader

In [24]:
import os
import glob
from typing import List
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_core.documents import Document

class PDFFileLoader:
    """Load a single PDF file into LangChain documents via ``PyMuPDFLoader``."""

    def load(self, source: str) -> List[Document]:
        """Read the PDF at ``source`` and tag every returned document.

        Args:
            source: Path of the PDF file to read.

        Returns:
            The documents produced by ``PyMuPDFLoader.load()``, each with
            ``source_type`` set to ``"pdf"`` and ``file_path`` set to ``source``.

        Raises:
            RuntimeError: If loading or tagging fails; the original exception
                is chained as the cause.
        """
        try:
            loaded = PyMuPDFLoader(file_path=source).load()
            for item in loaded:
                item.metadata.update({"source_type": "pdf", "file_path": source})
        except Exception as e:
            raise RuntimeError(f"Gagal memuat PDF {source}: {str(e)}") from e
        return loaded

class PDFDirectoryLoader:
    """Load every PDF found in a directory by delegating to ``PDFFileLoader``."""

    def __init__(self):
        # Loader used for each individual file.
        self.single_loader = PDFFileLoader()

    def load(self, source: str, recursive: bool = True) -> List[Document]:
        """Load all PDF files below ``source``.

        Args:
            source: Directory to scan.
            recursive: When True (default) sub-directories are scanned as well;
                when False only files directly inside ``source`` are loaded.

        Returns:
            The concatenated documents of every PDF, in sorted path order.
            An empty list when no PDF is found.

        Raises:
            RuntimeError: Propagated from ``PDFFileLoader.load`` when a file
                cannot be read.
        """
        # "**" only descends into sub-directories when glob is called with
        # recursive=True; it also matches files directly inside ``source``.
        if recursive:
            search_pattern = os.path.join(source, "**", "*.pdf")
        else:
            search_pattern = os.path.join(source, "*.pdf")
        # glob returns paths in arbitrary order; sort for reproducible ingestion.
        pdf_files = sorted(glob.glob(search_pattern, recursive=recursive))

        documents: List[Document] = []
        for file_path in pdf_files:
            # Skip hidden files. Only the file name is inspected so that "..",
            # "." or dot-directories inside ``source`` itself do not silently
            # exclude every file.
            if os.path.basename(file_path).startswith("."):
                continue
            documents.extend(self.single_loader.load(file_path))
        return documents

## Splitter

In [25]:
import re
import uuid
from collections import defaultdict
from typing import List, Dict, Tuple
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

class ParentChildSplitter:
    """Split documents into large "parent" chunks and small "child" chunks.

    Documents that share a source are merged, cut into parent chunks, cleaned,
    and each cleaned parent is cut again into child chunks that reference the
    parent through the ``parent_id`` metadata key.
    """

    def __init__(self, parent_chunk_size: int, parent_chunk_overlap: int, child_chunk_size: int, child_chunk_overlap: int):
        """Create the parent and child ``RecursiveCharacterTextSplitter`` instances.

        Args:
            parent_chunk_size: ``chunk_size`` of the parent splitter.
            parent_chunk_overlap: ``chunk_overlap`` of the parent splitter.
            child_chunk_size: ``chunk_size`` of the child splitter.
            child_chunk_overlap: ``chunk_overlap`` of the child splitter.
        """
        self.parent_splitter = RecursiveCharacterTextSplitter(chunk_size=parent_chunk_size,
                                                            chunk_overlap=parent_chunk_overlap,
                                                            separators=["\n", ". ", ", ", " ", ""])
        self.child_splitter = RecursiveCharacterTextSplitter(chunk_size=child_chunk_size,
                                                            chunk_overlap=child_chunk_overlap,
                                                            separators=["\n\n", "\n", ". ", ", ", " ", ""])

    def _preprocess_text(self, chunk_text: str) -> Tuple[str, List[str]]:
        """Clean one chunk and collect the URLs it contained.

        Args:
            chunk_text: Raw text of a parent chunk.

        Returns:
            ``(clean_text, urls)`` where ``clean_text`` is the stripped, cleaned
            text and ``urls`` lists every ``http(s)://`` match found in the raw
            text, in order of appearance.
        """
        # Collect the URLs before they are replaced.
        urls = re.findall(r'https?://\S+', chunk_text)
        # Replace every URL with the [LINK] placeholder.
        text_clean = re.sub(r'https?://\S+', '[LINK]', chunk_text)
        # Drop every character that is not a word character, whitespace, or
        # one of: [ ] , . : ( ) % = + - /
        text_clean = re.sub(r'[^\w\s\[\],.:()%=+\-/]', '', text_clean)
        # Collapse runs of two or more newlines into a single newline.
        text_clean = re.sub(r'\n{2,}', '\n', text_clean)
        # Collapse runs of two or more spaces into a single space.
        text_clean = re.sub(r'[ ]{2,}', ' ', text_clean)
        # Collapse blank lines that still contain whitespace into one newline.
        text_clean = re.sub(r'\n\s*\n+', '\n', text_clean)
        return text_clean.strip(), urls

    def split_documents(self, documents: List[Document]) -> Dict[str, List[Document]]:
        """Produce parent and child chunks for ``documents``.

        Args:
            documents: Loaded documents; those sharing a source are merged
                before splitting.

        Returns:
            ``{"parents": [...], "children": [...]}``. Each parent carries
            ``doc_id``, ``type="parent"``, ``source`` and ``urls`` metadata;
            each child carries ``parent_id``, ``type="child"`` and ``source``.
        """
        # Group documents by source. Fall back to "source" and finally to a
        # placeholder so the "source" metadata written below is never None.
        docs_by_source = defaultdict(list)
        for doc in documents:
            source_key = doc.metadata.get("file_path") or doc.metadata.get("source") or "unknown"
            docs_by_source[source_key].append(doc)
        all_parent_docs = []
        all_child_docs = []
        for source_name, doc_group in docs_by_source.items():
            # Merge the text of every document of this source.
            combined_text = "\n".join([d.page_content for d in doc_group])
            # Base metadata comes from the first document of the group.
            base_metadata = doc_group[0].metadata.copy() if doc_group else {}
            combined_doc = Document(page_content=combined_text, metadata=base_metadata)
            # Parent chunks are cut from the raw (uncleaned) merged text.
            parent_chunks = self.parent_splitter.split_documents([combined_doc])
            for p_doc in parent_chunks:
                clean_text, specific_urls = self._preprocess_text(p_doc.page_content)
                # A chunk can become empty once unwanted characters are removed;
                # such a parent has no content and no children, so skip it.
                if not clean_text:
                    continue
                parent_id = str(uuid.uuid4())
                parent_meta = p_doc.metadata.copy()
                parent_meta.update({
                    "doc_id": parent_id,
                    "type": "parent",
                    "source": source_name,
                    "urls": specific_urls,
                })
                final_parent_doc = Document(page_content=clean_text, metadata=parent_meta)
                all_parent_docs.append(final_parent_doc)
                # Child chunks are cut from the cleaned parent text.
                child_texts = self.child_splitter.split_text(clean_text)
                for c_text in child_texts:
                    child_meta = {
                        "parent_id": parent_id,
                        "type": "child",
                        "source": source_name,
                    }
                    child_doc = Document(page_content=c_text, metadata=child_meta)
                    all_child_docs.append(child_doc)

        return {"parents": all_parent_docs,
                "children": all_child_docs}

## Embedding

In [26]:
import os
from functools import lru_cache
from langchain_core.embeddings import Embeddings
from langchain_aws import BedrockEmbeddings

@lru_cache(maxsize=1)
def get_embeddings(provider: str = EMBEDDING_PROVIDER, model_id: str = EMBEDDING_MODEL_ID) -> Embeddings:
    """Return the embedding client for ``provider``.

    The result is memoised by ``lru_cache(maxsize=1)``. The cache key is the
    argument list as passed, so ``get_embeddings()`` and an explicit
    ``get_embeddings("aws", ...)`` call occupy different cache entries.

    Args:
        provider: Embedding provider name; only ``"aws"`` is supported.
        model_id: Model identifier passed to ``BedrockEmbeddings``.

    Returns:
        A ``BedrockEmbeddings`` instance using the ``AWS_REGION`` environment
        variable as region.

    Raises:
        ValueError: If ``provider`` is not supported.
    """
    if provider == "aws":
        return BedrockEmbeddings(model_id=model_id,
                                region_name=os.getenv("AWS_REGION"))
    # Name the rejected provider so a misconfiguration is easy to diagnose.
    raise ValueError(f"Provider embedding tidak didukung: {provider!r}")


## Document Serialization

In [27]:
from typing import Any
from langchain_core.documents import Document
from langchain_core.load import dumps, loads

def encode(doc: Document) -> bytes:
    """Serialize ``doc`` with LangChain ``dumps`` and return it as UTF-8 bytes."""
    serialized = dumps(doc)
    return serialized.encode("utf-8")

def decode(data: bytes) -> Document:
    """Rebuild a document from its serialized form.

    Args:
        data: UTF-8 bytes produced by ``encode``. A ``str`` holding the same
            serialized text is accepted as well.

    Returns:
        The object reconstructed by LangChain ``loads``.
    """
    # Decode only when bytes were received; a str is already the text that
    # ``loads`` expects, so there is no need to encode it and decode it again.
    text = data if isinstance(data, str) else data.decode("utf-8")
    return loads(text)

def encode_key(key: Any) -> str:
    """Return ``key`` as text: bytes are UTF-8 decoded, anything else goes through ``str()``."""
    return key.decode("utf-8") if isinstance(key, bytes) else str(key)

## VectorStore

In [28]:
from langchain_community.vectorstores import Chroma
from langchain_core.embeddings import Embeddings

class VectorStore:
    """Factory for the Chroma vector store used by this notebook."""

    @staticmethod
    def get_vector_store(embedding_model: Embeddings, collection_name: str = COLLECTION_NAME, persist_directory: str = PERSIST_DIR):
        """Build a ``Chroma`` instance.

        Args:
            embedding_model: Passed to Chroma as ``embedding_function``.
            collection_name: Name of the Chroma collection.
            persist_directory: Directory passed to Chroma as ``persist_directory``.

        Returns:
            A ``Chroma`` store whose collection metadata sets
            ``"hnsw:space"`` to ``"cosine"``.
        """
        chroma_options = {
            "embedding_function": embedding_model,
            "persist_directory": persist_directory,
            "collection_name": collection_name,
            "collection_metadata": {"hnsw:space": "cosine"},
        }
        return Chroma(**chroma_options)

## Docstore

In [29]:
import logging
from langchain_community.storage import RedisStore
from langchain.storage import EncoderBackedStore

logger = logging.getLogger(__name__)

class DocStore:
    """Factory for the Redis-backed store that holds parent documents."""

    @staticmethod
    def get_doc_store(collection_name: str = COLLECTION_NAME, redis_url: str = REDIS_URL):
        """Build the document store for ``collection_name``.

        Args:
            collection_name: Used to derive the Redis namespace
                ``docstore_<collection_name>``.
            redis_url: URL passed to ``RedisStore``.

        Returns:
            An ``EncoderBackedStore`` wrapping ``RedisStore`` that encodes keys
            with ``encode_key`` and (de)serializes values with ``encode`` /
            ``decode``.

        Raises:
            Exception: Whatever store construction raises; it is logged and
                re-raised unchanged.
        """
        namespace = f"docstore_{collection_name}"
        try:
            logger.info(f"Connecting to Redis DocStore {redis_url}, Namespace: {namespace}")
            raw_store = RedisStore(redis_url=redis_url, namespace=namespace)
            return EncoderBackedStore(store=raw_store,
                                    key_encoder=encode_key,
                                    value_serializer=encode,
                                    value_deserializer=decode)
        except Exception as e:
            logger.error(f"Gagal koneksi ke Redis {e}")
            # Bare raise keeps the original traceback intact.
            raise

## Text Preprocessing

In [30]:
import os
import re
import logging
from typing import Set, List

logger = logging.getLogger(__name__)

def load_stopwords(path: str) -> Set[str]:
    """Read a stopword list with one word per line.

    Args:
        path: Location of the stopword file.

    Returns:
        The lower-cased, whitespace-stripped, non-empty lines as a set. An
        empty set is returned when the file does not exist or cannot be read
        (the problem is logged, not raised).
    """
    try:
        if os.path.exists(path):
            # "utf-8-sig" reads plain UTF-8 unchanged but drops a leading BOM,
            # which would otherwise stay glued to the first stopword and keep
            # it from ever matching a token.
            with open(path, "r", encoding="utf-8-sig") as f:
                stopwords = {line.strip().lower() for line in f if line.strip()}
            logger.info("Stopwords berhasil di load")
            return stopwords
        else:
            logger.warning(f"Stopword file tidak ditemukan {path}")
            return set()
    except Exception as e:
        # Best effort: indexing can still proceed without stopword filtering.
        logger.error(f"Gagal load stopwords: {e}")
        return set()

def preprocess_text(text: str, stopwords: Set[str]) -> List[str]:
    """Tokenize ``text`` for keyword search.

    The text is lower-cased and split into runs of ``a-z`` / ``0-9`` that may
    be joined by single hyphens (``e-ktp`` stays one token).

    Args:
        text: Input text.
        stopwords: Tokens to drop; when empty no filtering takes place.

    Returns:
        The remaining tokens in order of appearance.
    """
    words = [m.group(0) for m in re.finditer(r"[a-z0-9]+(?:-[a-z0-9]+)*", text.lower())]
    if not stopwords:
        return words
    return [word for word in words if word not in stopwords]

## Parent-Child Indexer

In [31]:
import uuid
import logging
import pickle
from langchain_community.retrievers import BM25Retriever
from typing import List, Dict
from functools import partial
from langchain_core.documents import Document

logger = logging.getLogger(__name__)

class ParentChildIndexer:
    """Store parent chunks in the Redis doc store and child chunks in Chroma."""

    def __init__(self):
        """Open the vector store and doc store for ``COLLECTION_NAME``."""
        self.embedding_model = get_embeddings()
        self.collection_name = COLLECTION_NAME
        self.vector_store = VectorStore.get_vector_store(
            embedding_model=self.embedding_model,
            collection_name=self.collection_name
        )
        self.doc_store = DocStore.get_doc_store(collection_name=self.collection_name)

    def index_documents(self, split_result: Dict[str, List[Document]]):
        """Persist the output of the parent/child splitter.

        Args:
            split_result: Mapping with optional ``"parents"`` and
                ``"children"`` document lists.

        Returns:
            ``{"Parents indexed": <int>, "Children indexed": <int>}`` where the
            children count covers only the chunks actually written, i.e. those
            that carry a ``parent_id``.

        Raises:
            Exception: Any store error is logged and re-raised.
        """
        parent_docs = split_result.get("parents", [])
        child_docs = split_result.get("children", [])
        logger.info(f"Indexing {len(parent_docs)} Parents & {len(child_docs)} Child")
        # Store parents in Redis, keyed by doc_id.
        if parent_docs:
            try:
                parent_key_value_pairs = []
                for doc in parent_docs:
                    # Generate an id for parents that arrive without one.
                    doc_id = doc.metadata.get("doc_id") or str(uuid.uuid4())
                    doc.metadata["doc_id"] = doc_id
                    parent_key_value_pairs.append((doc_id, doc))
                self.doc_store.mset(parent_key_value_pairs)
                logger.info(f"Berhasil menyimpan {len(parent_docs)} Parent Chunk ke Redis.")
            except Exception as e:
                logger.error(f"Error simpan Parent Chunk ke Redis: {e}")
                raise
        # Store children in Chroma; a child without parent_id could never be
        # resolved back to its parent, so it is left out.
        valid_children = [d for d in child_docs if "parent_id" in d.metadata]
        if valid_children:
            try:
                self.vector_store.add_documents(valid_children)
                logger.info(f"Berhasil menyimpan {len(valid_children)} Child Chunk ke ChromaDB.")
            except Exception as e:
                logger.error(f"Error simpan Children Chunk ke Chroma: {e}")
                raise
        # Report what was written, not what was received.
        return {"Parents indexed": len(parent_docs), "Children indexed": len(valid_children)}

class BM25Indexer:
    """Build a BM25 retriever from the child chunks stored in Chroma and pickle it."""

    def __init__(self):
        """Open the Chroma collection ``COLLECTION_NAME``."""
        self.vector_store = VectorStore.get_vector_store(
            collection_name=COLLECTION_NAME,
            embedding_model=get_embeddings()
        )

    def build_and_save_index(self):
        """Fetch every ``type == "child"`` chunk, build BM25 and save it.

        The retriever is pickled to ``BM25_INDEX_PATH``. Nothing is written
        when the collection holds no child chunks.

        Raises:
            Exception: Errors from the Chroma fetch or from writing the pickle
                are logged and re-raised.
        """
        import os  # local import: this cell does not import os itself

        # 1. Load stopwords
        stopwords = load_stopwords(STOPWORDS_PATH)
        # 2. Fetch child documents from the vector store
        try:
            result = self.vector_store.get(
                where={"type": "child"},
                include=["documents", "metadatas"]
            )
        except Exception as e:
            logger.error(f"Gagal fetch data dari Chroma: {e}")
            raise
        raw_docs = result.get("documents") or []
        if not raw_docs:
            logger.warning("Tidak ditemukan dokumen 'child' di Chroma")
            return
        # Guard against a missing metadata list or missing individual entries.
        raw_metadatas = result.get("metadatas") or [None] * len(raw_docs)
        logger.info(f"Indexing {len(raw_docs)} dokumen")
        # 3. Convert to LangChain documents
        documents = [
            Document(page_content=text, metadata=meta or {})
            for text, meta in zip(raw_docs, raw_metadatas)
        ]
        # 4. Build the BM25 retriever with the shared tokenizer
        bm25_retriever = BM25Retriever.from_documents(documents, preprocess_func=partial(preprocess_text, stopwords=stopwords))
        # 5. Save the index, creating the target directory on first run
        try:
            index_dir = os.path.dirname(BM25_INDEX_PATH)
            if index_dir:
                os.makedirs(index_dir, exist_ok=True)
            with open(BM25_INDEX_PATH, "wb") as f:
                pickle.dump(bm25_retriever, f)
            logger.info(f"BM25 Index berhasil disimpan di: {BM25_INDEX_PATH}")
        except Exception as e:
            logger.error(f"Gagal menyimpan file pickle: {e}")
            raise

## Ingestion Pipeline

In [32]:
import logging

logger = logging.getLogger(__name__)

def run_ingestion_pipeline(folder_path: str):
    """Run load -> split -> index -> BM25 build for the PDFs in ``folder_path``.

    Args:
        folder_path: Directory handed to ``PDFDirectoryLoader``.

    The pipeline stops right after loading when no document was found, so an
    empty or mistyped folder does not report a successful ingestion.
    """
    print("Loading Data")
    loader = PDFDirectoryLoader()
    raw_docs = loader.load(folder_path)
    if not raw_docs:
        # Nothing to split or index; do not open store connections for it.
        print(f"Tidak ada dokumen yang dimuat dari {folder_path}. Ingestion dihentikan.")
        return
    print("Splitting Dokumen")
    splitter = ParentChildSplitter(
        parent_chunk_size=PARENT_CHUNK_SIZE,
        parent_chunk_overlap=PARENT_CHUNK_OVERLAP,
        child_chunk_size=CHILD_CHUNK_SIZE,
        child_chunk_overlap=CHILD_CHUNK_OVERLAP
    )
    split_result = splitter.split_documents(raw_docs)
    print("Indexing Document Chunks")
    vector_indexer = ParentChildIndexer()
    result = vector_indexer.index_documents(split_result)
    print(f"{result}")
    # BM25 is built from what is stored in Chroma, so it runs after indexing.
    print("Building BM25 Index")
    keyword_indexer = BM25Indexer()
    keyword_indexer.build_and_save_index()
    print("Ingestion DONE")

In [33]:
# Run the full ingestion pipeline on the "MajaAI_Data" folder.
run_ingestion_pipeline("MajaAI_Data")

Loading Data
Splitting Dokumen
Indexing Document Chunks
{'Parents indexed': 90, 'Children indexed': 631}
Building BM25 Index
Ingestion DONE


## Langflow-Compatible Version

### Loader

In [21]:
import os
import glob
import logging
from typing import List
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_core.documents import Document

logger = logging.getLogger(__name__)

class LangflowCompatibleLoader:
    """Directory PDF loader that sets both ``source`` and ``file_path`` metadata."""

    def __init__(self, recursive: bool = True):
        """Remember whether sub-directories should be scanned."""
        self.recursive = recursive

    def load(self, directory_path: str) -> List[Document]:
        """Load every PDF below ``directory_path``.

        Files that fail to load are logged and skipped.

        Args:
            directory_path: Directory to scan.

        Returns:
            All documents that loaded successfully; an empty list when no PDF
            file matched.
        """
        # The "**" pattern only recurses when glob is called with recursive=True.
        if self.recursive:
            search_pattern = os.path.join(directory_path, "**/*.pdf")
        else:
            search_pattern = os.path.join(directory_path, "*.pdf")
        pdf_files = glob.glob(search_pattern, recursive=self.recursive)

        if not pdf_files:
            logger.warning(f"Tidak ditemukan file PDF di: {directory_path}")
            return []

        logger.info(f"Ditemukan {len(pdf_files)} file PDF.")

        documents = []
        for file_path in pdf_files:
            try:
                # Plain PyMuPDFLoader; no pre-cleaning of the extracted text.
                loaded = PyMuPDFLoader(file_path).load()
                for item in loaded:
                    # Both keys point at the file so either can be used downstream.
                    item.metadata.update({"source": file_path, "file_path": file_path})
                documents.extend(loaded)
            except Exception as e:
                logger.error(f"Error loading {file_path}: {e}")

        logger.info(f"Total {len(documents)} halaman berhasil dimuat.")
        return documents

### Splitter

In [22]:
import re
import uuid
from collections import defaultdict
from typing import List, Dict, Tuple
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
import logging

logger = logging.getLogger(__name__)

class LangflowCompatibleSplitter:
    """Parent/child splitter that groups by ``source`` and skips empty chunks.

    Documents sharing a source are merged, cut into raw parent chunks, cleaned,
    and each cleaned parent is cut into child chunks linked via ``parent_id``.
    """

    def __init__(self, parent_chunk_size: int, parent_chunk_overlap: int, child_chunk_size: int, child_chunk_overlap: int):
        """Create the parent and child ``RecursiveCharacterTextSplitter`` instances."""
        parent_separators = ["\n", ". ", ", ", " ", ""]
        # The child splitter keeps "\n\n" as its first separator even though
        # the cleaned text it receives no longer contains blank lines.
        child_separators = ["\n\n", "\n", ". ", ", ", " ", ""]
        self.parent_splitter = RecursiveCharacterTextSplitter(
            chunk_size=parent_chunk_size,
            chunk_overlap=parent_chunk_overlap,
            separators=parent_separators,
        )
        self.child_splitter = RecursiveCharacterTextSplitter(
            chunk_size=child_chunk_size,
            chunk_overlap=child_chunk_overlap,
            separators=child_separators,
        )

    def _preprocess_text(self, chunk_text: str) -> Tuple[str, List[str]]:
        """Clean one chunk and return ``(clean_text, urls)``.

        ``urls`` holds every ``http(s)://`` match of the raw text. The cleaned
        text has URLs replaced by ``[LINK]``, unwanted characters removed and
        repeated newlines / spaces collapsed.
        """
        url_pattern = r'https?://\S+'
        urls = re.findall(url_pattern, chunk_text)

        # Substitutions are applied in this exact order.
        substitutions = (
            (url_pattern, '[LINK]'),               # URL -> placeholder
            (r'[^\w\s\[\],.:()%=+\-/]', ''),       # drop characters outside the allowed set
            (r'\n{2,}', '\n'),                     # repeated newlines -> one
            (r'[ ]{2,}', ' '),                     # repeated spaces -> one
            (r'\n\s*\n+', '\n'),                   # blank lines holding whitespace -> one newline
        )
        cleaned = chunk_text
        for pattern, replacement in substitutions:
            cleaned = re.sub(pattern, replacement, cleaned)

        return cleaned.strip(), urls

    def split_documents(self, documents: List[Document]) -> Dict[str, List[Document]]:
        """Produce parent and child chunks for ``documents``.

        Returns:
            ``{"parents": [...], "children": [...]}``. Parents carry ``doc_id``,
            ``type="parent"``, ``source`` and ``urls``; children carry
            ``parent_id``, ``type="child"`` and ``source``.
        """
        # Group by "source", falling back to "file_path".
        grouped = defaultdict(list)
        for doc in documents:
            key = doc.metadata.get("source") or doc.metadata.get("file_path")
            grouped[key].append(doc)

        parents = []
        children = []

        logger.info(f"Processing split for {len(grouped)} unique sources")

        for source_name, group in grouped.items():
            # Merge all documents of one source; metadata comes from the first.
            merged = Document(
                page_content="\n".join(item.page_content for item in group),
                metadata=group[0].metadata.copy() if group else {},
            )

            # Parents are cut from the raw text and cleaned afterwards.
            for raw_parent in self.parent_splitter.split_documents([merged]):
                clean_text, specific_urls = self._preprocess_text(raw_parent.page_content)

                # Nothing left after cleaning: skip this chunk.
                if not clean_text:
                    continue

                parent_id = str(uuid.uuid4())
                parent_meta = {
                    **raw_parent.metadata,
                    "doc_id": parent_id,
                    "type": "parent",
                    "source": source_name,
                    "urls": specific_urls,
                }
                parents.append(Document(page_content=clean_text, metadata=parent_meta))

                # Children are cut from the already-cleaned parent text;
                # whitespace-only pieces are dropped.
                children.extend(
                    Document(
                        page_content=piece,
                        metadata={
                            "parent_id": parent_id,
                            "type": "child",
                            "source": source_name,
                        },
                    )
                    for piece in self.child_splitter.split_text(clean_text)
                    if piece.strip()
                )

        return {"parents": parents, "children": children}

### Store

In [23]:
import logging
import uuid
import time
import chromadb
from typing import List, Dict, Any, Optional, Tuple
from langchain_core.documents import Document
from langchain_core.runnables import RunnableSerializable, RunnableConfig
from langchain_community.vectorstores import Chroma
from langchain_community.storage import RedisStore
from langchain.storage import EncoderBackedStore
from langchain_core.load import dumps, loads
from langchain_core.embeddings import Embeddings
from pydantic import PrivateAttr

logger = logging.getLogger(__name__)

# --- HELPER: DOCUMENT ENCODER (Sama Persis) ---
class DocumentEncoder:
    """Convert documents to and from UTF-8 encoded LangChain serialization."""

    def encode(self, doc: Document) -> bytes:
        """Serialize ``doc`` with ``dumps`` and return the UTF-8 bytes."""
        serialized = dumps(doc)
        return serialized.encode("utf-8")

    def decode(self, data: bytes) -> Document:
        """Decode UTF-8 ``data`` and rebuild the object with ``loads``."""
        text = data.decode("utf-8")
        return loads(text)

# --- HELPER: FACTORY STORE ---
class StoreFactory:
    """Factory that creates the Redis doc store and the Chroma vector store consistently."""

    @staticmethod
    def get_doc_store(redis_url: str, collection_name: str):
        """Build the Redis-backed document store.

        Args:
            redis_url: URL passed to ``RedisStore``.
            collection_name: Used to derive the namespace
                ``docstore_<collection_name>``.

        Returns:
            An ``EncoderBackedStore`` whose values are (de)serialized by
            ``DocumentEncoder`` and whose keys are converted to ``str``.
        """
        docstore_namespace = f"docstore_{collection_name}"
        redis_raw = RedisStore(redis_url=redis_url, namespace=docstore_namespace)
        encoder = DocumentEncoder()

        # Keys may arrive as bytes or as any other object; always hand a str
        # to the underlying store.
        def encode_key(key):
            return str(key.decode("utf-8") if isinstance(key, bytes) else key)

        return EncoderBackedStore(
            store=redis_raw,
            key_encoder=encode_key,
            value_serializer=encoder.encode,
            value_deserializer=encoder.decode,
        )

    @staticmethod
    def get_vector_store(embedding: Embeddings, collection_name: str,
                         chroma_host: str = "localhost", chroma_port: int = 8000):
        """Build a ``Chroma`` store backed by a Chroma HTTP server.

        Args:
            embedding: Passed to Chroma as ``embedding_function``.
            collection_name: Name of the Chroma collection.
            chroma_host: Host of the Chroma server (default ``"localhost"``).
            chroma_port: Port of the Chroma server (default ``8000``).

        Returns:
            A ``Chroma`` instance whose collection metadata sets
            ``"hnsw:space"`` to ``"cosine"``.
        """
        client = chromadb.HttpClient(host=chroma_host, port=chroma_port)
        return Chroma(
            client=client,
            embedding_function=embedding,
            collection_name=collection_name,
            collection_metadata={"hnsw:space": "cosine"}
        )

In [24]:
class LangflowCompatibleIndexer:
    """Write parents to Redis and children to Chroma with consistent metadata.

    Guarantees ``doc_id`` on every parent and child, ``type="child"`` on every
    child, and drops children that lack a ``parent_id``.
    """

    def __init__(self, embedding: Embeddings, redis_url: str, collection_name: str = "langflow"):
        """Open the doc store and vector store for ``collection_name``."""
        self.doc_store = StoreFactory.get_doc_store(redis_url, collection_name)
        self.vector_store = StoreFactory.get_vector_store(embedding, collection_name)

    def index(self, parents: List[Document], children: List[Document]):
        """Store ``parents`` and ``children``.

        The metadata of the passed documents is updated in place.

        Returns:
            ``{"status": "Success", "parents": <int>, "children": <int>}`` with
            the number of parents and of retained children.

        Raises:
            Exception: Any store error is logged and re-raised.
        """
        logger.info(f"Mulai Indexing: {len(parents)} Parents, {len(children)} Children")

        # 1. Parents: make sure each one has a doc_id.
        for parent in parents:
            if "doc_id" not in parent.metadata:
                parent.metadata["doc_id"] = str(uuid.uuid4())
        processed_parents = list(parents)

        # 2. Children: ensure doc_id, force type="child" (the search filter
        #    relies on it) and keep only those linked to a parent.
        processed_children = []
        for child in children:
            meta = child.metadata
            if "doc_id" not in meta:
                meta["doc_id"] = str(uuid.uuid4())
            meta["type"] = "child"
            if "parent_id" not in meta:
                logger.warning(f"Child dibuang (missing parent_id): {child.page_content[:30]}...")
                continue
            processed_children.append(child)

        # 3. Parents go to Redis as (doc_id, document) pairs.
        if processed_parents:
            try:
                parent_key_val = [(parent.metadata["doc_id"], parent) for parent in processed_parents]
                self.doc_store.mset(parent_key_val)
                logger.info(f"Sukses menyimpan {len(parent_key_val)} Parent Docs ke Redis")
            except Exception as e:
                logger.error(f"Gagal simpan Redis: {e}")
                raise e

        # 4. Children go to Chroma.
        if processed_children:
            try:
                self.vector_store.add_documents(processed_children)
                logger.info(f"Sukses menyimpan {len(processed_children)} Child Chunks ke Chroma")
            except Exception as e:
                logger.error(f"Gagal simpan Chroma: {e}")
                raise e

        return {
            "status": "Success",
            "parents": len(processed_parents),
            "children": len(processed_children),
        }

In [36]:
# --- CONFIGURATION ---
SOURCE_DIRECTORY = "MajaAI_Data"  # Folder that holds the PDF files
# NOTE(review): this rebinds the REDIS_URL name from the config cell at the top
# of the notebook (same value there).
REDIS_URL = "redis://localhost:6379"
# CHROMA_DIR = "./data/parent-child-chunk-langflow-compatible"
COLLECTION = "parent-child-chunk-langflow-v5"
AWS_MODEL_ID = "amazon.titan-embed-text-v2:0"

# Set up logging; INFO level makes the pipeline's progress messages visible.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def run_indexing_pipeline():
    """Load, split and index the PDFs in ``SOURCE_DIRECTORY``.

    Steps: verify the source folder, initialise embedding / loader / splitter /
    indexer, load the PDFs, split them into parent and child chunks, and store
    them through ``LangflowCompatibleIndexer``. The function returns early when
    the folder had to be created or when no document was loaded. An indexing
    failure is logged (with traceback) and not re-raised.
    """
    # Cheap check first: no embedding client or store connection is created
    # when there is nothing to index.
    if not os.path.exists(SOURCE_DIRECTORY):
        os.makedirs(SOURCE_DIRECTORY)
        logger.warning(f"Folder '{SOURCE_DIRECTORY}' baru saja dibuat. Masukkan file PDF ke dalamnya lalu jalankan ulang.")
        return

    # 1. INITIALISE COMPONENTS
    logger.info("--- 1. Menginisialisasi Komponen ---")

    embedding_model = get_embeddings("aws", AWS_MODEL_ID)
    pdf_loader = LangflowCompatibleLoader()
    splitter = LangflowCompatibleSplitter(
        parent_chunk_size=2400,
        parent_chunk_overlap=260,
        child_chunk_size=300,
        child_chunk_overlap=60
    )
    indexer = LangflowCompatibleIndexer(
        embedding=embedding_model,
        redis_url=REDIS_URL,
        collection_name=COLLECTION
    )

    # 2. LOAD DOCUMENTS
    logger.info(f"--- 2. Memuat Dokumen dari {SOURCE_DIRECTORY} ---")

    raw_documents = pdf_loader.load(SOURCE_DIRECTORY)

    if not raw_documents:
        logger.warning("Tidak ada dokumen yang dimuat. Stopping pipeline.")
        return

    logger.info(f"Berhasil memuat {len(raw_documents)} halaman raw dari PDF.")

    # 3. SPLITTING (raw documents -> parent & child chunks)
    logger.info("--- 3. Melakukan Splitting Parent-Child ---")

    # The splitter groups documents by metadata["source"] (falling back to
    # metadata["file_path"]) so that pages of one file are merged before they
    # are split.
    split_result = splitter.split_documents(raw_documents)

    parent_docs = split_result["parents"]
    child_docs = split_result["children"]

    logger.info(f"Splitting Selesai. Parents: {len(parent_docs)}, Children: {len(child_docs)}")

    # 4. INDEXING (store in Chroma and Redis)
    logger.info("--- 4. Menyimpan ke Vector DB & Redis ---")

    try:
        status = indexer.index(parent_docs, child_docs)
        logger.info(f"Indexing Sukses! Status: {status}")
    except Exception as e:
        # Keep the notebook running, but record the traceback for diagnosis.
        logger.exception(f"Indexing Gagal: {e}")

# Run the Langflow-compatible indexing pipeline with the configuration above.
run_indexing_pipeline()

INFO:__main__:--- 1. Menginisialisasi Komponen ---
INFO:chromadb.telemetry.product.posthog:Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
INFO:httpx:HTTP Request: GET http://localhost:8000/api/v2/auth/identity "HTTP/1.1 200 OK"
INFO:chromadb.telemetry.product.posthog:Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
INFO:httpx:HTTP Request: GET http://localhost:8000/api/v2/tenants/default_tenant "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET http://localhost:8000/api/v2/tenants/default_tenant/databases/default_database "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:8000/api/v2/tenants/default_tenant/databases/default_database/collections "HTTP/1.1 200 OK"
INFO:__main__:--- 2. Memuat Dokumen dari MajaAI_Data ---
INFO:__main__:Ditemukan 45 file PDF.
INFO:__main__:Total 72 halaman berhasil dimuat.
INFO:__main__:Berhasil memuat 72 halaman raw da

### Retriever

In [37]:
class LangflowCompatibleRetriever(RunnableSerializable):
    """Parent/child retriever: search child chunks, return their parent documents.

    Child chunks are searched in Chroma, de-duplicated per ``parent_id`` (the
    first, i.e. best-ranked, child wins) and the matching parents are fetched
    from the Redis doc store.
    """
    collection_name: str = "parent-child-chunk-langflow-v5"
    redis_url: str = "redis://localhost:6379"
    max_results: int = 2
    # Minimum relevance score passed to the vector search.
    score_threshold: float = 0.01
    # The vector search requests max_results * candidate_multiplier children so
    # that de-duplication by parent still leaves enough distinct parents.
    candidate_multiplier: int = 2

    # Internal handles, created in __init__.
    _doc_store: Any = PrivateAttr()
    _vector_store: Any = PrivateAttr()
    _embedding: Embeddings = PrivateAttr()

    def __init__(self, embedding: Embeddings, **kwargs):
        """Store ``embedding`` and open the doc store and vector store.

        Args:
            embedding: Embedding model used by the vector store.
            **kwargs: Field values (``collection_name``, ``redis_url``,
                ``max_results``, ``score_threshold``, ``candidate_multiplier``).
        """
        super().__init__(**kwargs)
        self._embedding = embedding
        self._doc_store = StoreFactory.get_doc_store(self.redis_url, self.collection_name)
        self._vector_store = StoreFactory.get_vector_store(self._embedding, self.collection_name)

    def invoke(self, query: str, config: Optional[RunnableConfig] = None, **kwargs) -> List[Document]:
        """Return up to ``max_results`` parent documents relevant to ``query``.

        Each returned parent gets two extra metadata keys:
        ``retrieval_score`` (score of its best child, rounded to 2 decimals)
        and ``matched_child_content`` (text of that child).

        Returns:
            Parents sorted by ``retrieval_score`` descending. An empty list is
            returned when the search yields nothing or when the vector search
            or the Redis fetch raises (the error is logged).
        """
        # 1. SEARCH CHILD CHUNKS
        k_candidates = self.max_results * self.candidate_multiplier

        try:
            child_results = self._vector_store.similarity_search_with_relevance_scores(
                query=query,
                k=k_candidates,
                filter={"type": "child"},  # relies on the indexer tagging children
                score_threshold=self.score_threshold
            )
        except Exception as e:
            logger.error(f"Vector Search Error: {e}")
            return []

        if not child_results:
            logger.warning("No results found in vector store.")
            return []

        # 2. DE-DUPLICATE: keep only the first child seen for every parent_id.
        parent_id_map = {}
        unique_parent_ids = []

        for child, score in child_results:
            pid = child.metadata.get("parent_id")
            if not pid:
                continue
            if pid not in parent_id_map:
                parent_id_map[pid] = {
                    "score": score,
                    "matched_child_content": child.page_content
                }
                unique_parent_ids.append(pid)

        # 3. FETCH PARENTS FROM REDIS
        if not unique_parent_ids:
            return []

        try:
            parent_docs = self._doc_store.mget(unique_parent_ids)
        except Exception as e:
            logger.error(f"Redis Fetch Error: {e}")
            return []

        # 4. FORMAT & RANK; parents missing from Redis are skipped.
        final_results = []
        for pid, p_doc in zip(unique_parent_ids, parent_docs):
            if p_doc:
                info = parent_id_map[pid]
                p_doc.metadata["retrieval_score"] = round(info["score"], 2)
                p_doc.metadata["matched_child_content"] = info["matched_child_content"]
                final_results.append(p_doc)

        final_results.sort(key=lambda x: x.metadata.get("retrieval_score", 0), reverse=True)

        return final_results[:self.max_results]

In [38]:
# --- 3. SEARCHING ---
# Build the retriever against the collection and Redis instance configured in
# the indexing cell (COLLECTION, REDIS_URL).
retriever = LangflowCompatibleRetriever(
    embedding=get_embeddings("aws", "amazon.titan-embed-text-v2:0"),
    redis_url=REDIS_URL,
    collection_name=COLLECTION,
    max_results=2
)

INFO:chromadb.telemetry.product.posthog:Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
INFO:httpx:HTTP Request: GET http://localhost:8000/api/v2/auth/identity "HTTP/1.1 200 OK"
INFO:chromadb.telemetry.product.posthog:Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
INFO:httpx:HTTP Request: GET http://localhost:8000/api/v2/tenants/default_tenant "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET http://localhost:8000/api/v2/tenants/default_tenant/databases/default_database "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:8000/api/v2/tenants/default_tenant/databases/default_database/collections "HTTP/1.1 200 OK"


In [84]:
# Manual check: run one query and print, for every returned parent, its score,
# source file, and the first 100 characters of the parent and matched child.
query = "Apa saja layanan sosial yang tersedia di Majadigi?"
results = retriever.invoke(query)

for doc in results:
    print(f"Score: {doc.metadata['retrieval_score']}")
    print(f"File Path: {doc.metadata['file_path']}")
    print(f"Parent Content: {doc.page_content[:100]}...")
    print(f"Matched Child: {doc.metadata['matched_child_content'][:100]}...")
    print("-" * 20)

INFO:httpx:HTTP Request: POST http://localhost:8000/api/v2/tenants/default_tenant/databases/default_database/collections/9f434249-4421-41a2-bdb4-e9390ee93b1e/query "HTTP/1.1 200 OK"


Score: 0.85
File Path: MajaAI_Data\031_Daftar layanan publik kategori sosial.pdf
Parent Content: Kategori / topik
: Daftar layanan publik kategori sosial 
Pertanyaan utama / intent 
: Apa saja laya...
Matched Child: Kategori / topik
: Daftar layanan publik kategori sosial 
Pertanyaan utama / intent 
: Apa saja laya...
--------------------
Score: 0.68
File Path: MajaAI_Data\034_Daftar layanan Pemerintah dan desa.pdf
Parent Content: Layanan pemerintah dan desa adalah pelayanan publik yang diberikan oleh pemerintah daerah tingkat 
k...
Matched Child: di Jawa Timur. Pelayanannya meliputi pengurusan administrasi kependudukan, retribusi daerah, dan 
be...
--------------------


### RAGAs Evaluation

In [41]:
import pandas as pd
import requests
import json
from tqdm.auto import tqdm
from ragas import evaluate
from ragas.metrics import LLMContextRecall, ContextPrecision
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from datasets import Dataset
from langchain_aws import BedrockEmbeddings, BedrockLLM
from langchain_aws.chat_models import ChatBedrock
from operator import itemgetter
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from langchain_core.output_parsers import StrOutputParser
import time
from langchain_core.runnables import RunnableLambda

  from .autonotebook import tqdm as notebook_tqdm


In [42]:
from typing import List
from langchain_core.documents import Document

def format_docs_to_string(docs: List[Document]) -> str:
    """Concatenate the page content of the given documents, one per line.

    Args:
        docs: Documents whose ``page_content`` should be combined.

    Returns:
        The ``page_content`` values joined with a single newline, in input
        order. An empty input yields an empty string.
    """
    contents = [document.page_content for document in docs]
    return "\n".join(contents)

def extract_urls_from_docs(docs: List[Document]) -> List[str]:
    """Collect the distinct URLs recorded in the documents' metadata.

    Reads ``metadata["urls"]`` from every document. A bare string is treated
    as a single URL; a missing key or an explicit ``None`` contributes
    nothing.

    Args:
        docs: Documents whose metadata may carry a ``"urls"`` entry.

    Returns:
        The unique URLs across all documents, sorted ascending.
    """
    unique_urls = set()
    for doc in docs:
        urls = doc.metadata.get("urls")
        if urls is None:
            # Key absent or stored as None: iterating None would raise a
            # TypeError, so there is simply nothing to add for this document.
            continue
        if isinstance(urls, str):
            # A lone string must not be iterated character by character.
            urls = [urls]
        unique_urls.update(urls)
    return sorted(unique_urls)

In [43]:
def time_retriever(retriever):
    """Wrap a retriever so every call also reports how long retrieval took.

    Args:
        retriever: Object exposing ``invoke(query)``.

    Returns:
        A ``RunnableLambda`` that, when invoked, returns a dict with
        ``"docs"`` (whatever ``retriever.invoke`` returned) and
        ``"retriever_latency"`` (elapsed seconds as a float).
    """
    def _run(query):
        # perf_counter() is monotonic and high resolution, so the measured
        # interval cannot be distorted by system clock adjustments the way a
        # time.time() difference can.
        start = time.perf_counter()
        docs = retriever.invoke(query)
        elapsed = time.perf_counter() - start
        return {
            "docs": docs,
            "retriever_latency": elapsed
        }
    return RunnableLambda(_run)

# Retriever wrapped so each call also reports its latency.
timed_retriever = time_retriever(retriever)

# Pull the question text out of the input dict and hand it to the timed
# retriever; the result is {"docs": ..., "retriever_latency": ...}.
retrieval = (
    itemgetter("question")
    | timed_retriever
)

# Retrieval-only evaluation chain. Its output has exactly two keys:
#   "context"           -> the retrieved documents (not a formatted string)
#   "retriever_latency" -> seconds spent inside the retriever
# The former intermediate step that formatted the docs and extracted URLs was
# dropped: both values were discarded by this final mapping, so it only added
# work and extra ways for the chain to fail.
RAG_retriever_eval = (
    retrieval
    | RunnableParallel({
        "context": itemgetter("docs"),
        "retriever_latency": itemgetter("retriever_latency"),
    })
)

In [44]:
# Load the QnA evaluation set; the evaluation loop reads its "Pertanyaan"
# (question) and "Jawaban" (reference answer) columns.
df_eval = pd.read_csv("weekly backlog - QnA Maja.AI (2).csv")
total_data = len(df_eval)  # 60 rows when this notebook was last run
# Judge LLM handed to ragas.evaluate() for the context metrics.
# NOTE(review): the recorded output shows a warning raised on this line —
# confirm whether the wrapper is deprecated in the installed ragas version.
llm_ragas = LangchainLLMWrapper(ChatBedrock(model='apac.amazon.nova-pro-v1:0', temperature=0))

  llm_ragas = LangchainLLMWrapper(ChatBedrock(model='apac.amazon.nova-pro-v1:0', temperature=0))


In [45]:
def get_answer_context(question:str):
    """Run the retrieval-only chain for one question.

    Args:
        question: The user question to retrieve context for.

    Returns:
        A ``(contexts, latency)`` tuple: the ``page_content`` of every
        retrieved document, and the ``retriever_latency`` value reported by
        the chain.
    """
    rag_output = RAG_retriever_eval.invoke({"question": question})
    contexts = []
    for retrieved_doc in rag_output["context"]:
        contexts.append(retrieved_doc.page_content)
    return contexts, rag_output["retriever_latency"]

In [46]:
# Accumulators are kept index-aligned: entry i of every list belongs to the
# same question, so a skipped question must be skipped in all of them.
questions_list = []
contexts_list = []
references_list = []
latency_list = []
df_to_eval = df_eval.iloc[0:60]

for row in tqdm(df_to_eval.itertuples(), total=len(df_to_eval), desc=f"Loading {len(df_to_eval)} Data"):
    question = row.Pertanyaan
    ground_truth = row.Jawaban
    try:
        context, latency = get_answer_context(question)
    except Exception as exc:
        # get_answer_context raises on failure instead of returning None, so
        # without this guard a single failed retrieval aborts the whole run
        # and the skip path is never reached.
        print(f"Melewatkan pertanyaan karena error API: '{question}' ({exc})")
        continue
    if context is None:
        print(f"Melewatkan pertanyaan karena error API: '{question}'")
        continue
    questions_list.append(question)
    contexts_list.append(context)
    references_list.append(ground_truth)
    latency_list.append(latency)

if not questions_list:
    print("\nTidak ada data yang berhasil diproses dari API.")
else:
    # Column names follow what ragas.evaluate() is given in this notebook.
    data = {
        "question": questions_list,
        "contexts": contexts_list,
        "reference": references_list 
    }
    dataset = Dataset.from_dict(data)
    print(f"Evaluate RAGAs pada {len(dataset)} data")
    results = evaluate(
        dataset=dataset,
        metrics=[LLMContextRecall(), ContextPrecision()],
        llm=llm_ragas
    )
    df_result = results.to_pandas()
    # Attach the retrieval latency per question. Values that cannot be
    # converted become None; only conversion errors are caught so that
    # interrupts and unrelated failures still surface.
    latency_list_float = []
    for x in latency_list:
        try:
            latency_list_float.append(float(x))
        except (TypeError, ValueError):
            latency_list_float.append(None)
    df_result["latency"] = latency_list_float
    print("--- Hasil Evaluasi RAGAs ---")
    display(df_result)

Loading 60 Data:   0%|          | 0/60 [00:00<?, ?it/s]INFO:httpx:HTTP Request: POST http://localhost:8000/api/v2/tenants/default_tenant/databases/default_database/collections/9f434249-4421-41a2-bdb4-e9390ee93b1e/query "HTTP/1.1 200 OK"
Loading 60 Data:   2%|▏         | 1/60 [00:00<00:53,  1.10it/s]INFO:httpx:HTTP Request: POST http://localhost:8000/api/v2/tenants/default_tenant/databases/default_database/collections/9f434249-4421-41a2-bdb4-e9390ee93b1e/query "HTTP/1.1 200 OK"
Loading 60 Data:   3%|▎         | 2/60 [00:01<00:31,  1.82it/s]INFO:httpx:HTTP Request: POST http://localhost:8000/api/v2/tenants/default_tenant/databases/default_database/collections/9f434249-4421-41a2-bdb4-e9390ee93b1e/query "HTTP/1.1 200 OK"
Loading 60 Data:   5%|▌         | 3/60 [00:01<00:24,  2.35it/s]INFO:httpx:HTTP Request: POST http://localhost:8000/api/v2/tenants/default_tenant/databases/default_database/collections/9f434249-4421-41a2-bdb4-e9390ee93b1e/query "HTTP/1.1 200 OK"
Loading 60 Data:   7%|▋     

Evaluate RAGAs pada 60 data


Evaluating:   0%|          | 0/120 [00:00<?, ?it/s]INFO:langchain_aws.chat_models.bedrock_converse:Using Bedrock Converse API to generate response
INFO:langchain_aws.chat_models.bedrock_converse:Using Bedrock Converse API to generate response
INFO:langchain_aws.chat_models.bedrock_converse:Using Bedrock Converse API to generate response
INFO:langchain_aws.chat_models.bedrock_converse:Using Bedrock Converse API to generate response
INFO:langchain_aws.chat_models.bedrock_converse:Using Bedrock Converse API to generate response
INFO:langchain_aws.chat_models.bedrock_converse:Using Bedrock Converse API to generate response
INFO:langchain_aws.chat_models.bedrock_converse:Using Bedrock Converse API to generate response
INFO:langchain_aws.chat_models.bedrock_converse:Using Bedrock Converse API to generate response
INFO:langchain_aws.chat_models.bedrock_converse:Using Bedrock Converse API to generate response
INFO:langchain_aws.chat_models.bedrock_converse:Using Bedrock Converse API to generat

--- Hasil Evaluasi RAGAs ---


Unnamed: 0,user_input,retrieved_contexts,reference,context_recall,context_precision,latency
0,Apa perbedaan pengurusan e-KTP online dan offl...,[Kategori\n: Dokumen Kependudukan KTP \nPertan...,Pengurusan offline dilakukan di kantor Dukcapi...,1.0,1.0,0.884073
1,Di mana saja perekaman e-KTP dapat dilakukan s...,[Kategori\n: Dokumen Kependudukan KTP \nPertan...,Perekaman e-KTP secara offline dapat dilakukan...,1.0,1.0,0.281677
2,Apa saja persyaratan umum dan dokumen yang har...,[Kategori\n: Dokumen Kependudukan KTP \nPertan...,"Sebelum mengurus e-KTP, pastikan Anda telah be...",1.0,1.0,0.264497
3,Kapan seseorang wajib mengajukan penerbitan Ka...,[Kategori\n:Kartu Keluarga (KK) \nPertanyaan u...,Seseorang wajib mengajukan penerbitan KK baru ...,0.666667,1.0,0.241582
4,Bagaimana cara mengurus Kartu Keluarga (KK) baru?,[Kategori\n:Kartu Keluarga (KK) \nPertanyaan u...,"Untuk mengurus KK baru, pemohon perlu menyiapk...",1.0,1.0,0.227664
5,Apa saja syarat untuk membuat KK baru karena h...,[Kategori\n:Kartu Keluarga (KK) \nPertanyaan u...,"Jika KK hilang, diperlukan surat keterangan hi...",1.0,1.0,0.280595
6,Apa itu Kartu Identitas Anak (KIA)?,[Kategori\n: Kartu Identitas Anak (KIA) \nPert...,Kartu Identitas Anak (KIA) adalah kartu identi...,1.0,1.0,0.313684
7,Apa syarat membuat KIA baru?,[Kategori\n: Kartu Identitas Anak (KIA) \nPert...,Syarat pembuatan KIA meliputi Kartu Keluarga (...,1.0,1.0,0.226729
8,Di mana mengurus KIA?,[Kategori\n: Kartu Identitas Anak (KIA) \nPert...,Pengurusan KIA bisa dilakukan secara tatap muk...,1.0,1.0,0.250192
9,Apa itu Akta Kelahiran?,[Kategori\n: Akta Kelahiran \nPertanyaan utama...,Akta Kelahiran adalah dokumen pencatatan sipil...,1.0,1.0,1.865977


In [47]:
# Average each per-question column of the evaluation table, then report the
# three means in the order recall, precision, latency.
recall_mean, precision_mean, latency_mean = (
    df_result[column].mean()
    for column in ("context_recall", "context_precision", "latency")
)
print(f"rata-rata Recall: {recall_mean}")
print(f"rata-rata Precision: {precision_mean}")
print(f"rata-rata Latency: {latency_mean}")

rata-rata Recall: 0.9174145299145299
rata-rata Precision: 0.9666666666016667
rata-rata Latency: 0.3166940768559774


In [86]:
# Persist the per-question evaluation table (metrics plus latency) to CSV;
# index=False leaves the DataFrame index out of the file.
df_result.to_csv("Parent-Child-Vector-Search.csv", index=False)