In [None]:
# For LlamaIndex
!pip install llama-parse llama-index llama-index-embeddings-openai nltk

# For LangChain
!pip install langchain langchain-text-splitters langchain-community langchain-openai tiktoken
!pip install pypdf pymupdf pdfplumber unstructured


Collecting pymupdf
  Using cached pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting pdfplumber
  Using cached pdfplumber-0.11.9-py3-none-any.whl.metadata (43 kB)
Collecting unstructured
  Using cached unstructured-0.18.26-py3-none-any.whl.metadata (25 kB)
Collecting pdfminer.six==20251230 (from pdfplumber)
  Using cached pdfminer_six-20251230-py3-none-any.whl.metadata (4.3 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Using cached pypdfium2-5.3.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (67 kB)
Collecting python-magic (from unstructured)
  Using cached python_magic-0.4.27-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting emoji (from unstructured)
  Using cached emoji-2.15.0-py3-none-any.whl.metadata (5.7 kB)
Collecting python-iso639 (from unstructured)
  Using cached python_iso639-2025.11.16-py3-none-any.whl.metadata (15 kB)
Collecting langdetect (from unstructured)
  Using cached langdetect-1.0.9.tar.gz (981 kB)
  Preparing

In [None]:
!pip install python-dotenv



In [None]:
from pprint import pprint

In [None]:
from dotenv import load_dotenv
import os

load_dotenv()  # loads .env into environment variables

# Read the LlamaCloud API key from the environment (None when it is not set).
# NOTE(review): secret_key is not referenced by the cells visible here; presumably
# LlamaParse picks up LLAMA_CLOUD_API_KEY from the environment itself — confirm.
secret_key = os.getenv("LLAMA_CLOUD_API_KEY")


**PDF Document Parsing Using LlamaParse**

In [None]:
import os
from llama_parse import LlamaParse

# Initialize parser: results are requested as markdown, with progress messages enabled.
parser = LlamaParse(
    result_type="markdown",  # or "text"
    verbose=True
)

# Parse PDF
# NOTE(review): the path is a hard-coded absolute location under /content; adjust it
# when running somewhere the file lives elsewhere.
documents = parser.load_data("/content/sample-local-pdf.pdf")
# Cell output: number of parsed documents, the container type, and the element type.
len(documents), type(documents), type(documents[0])



Started parsing the file under job_id b5d5d9c8-a3dd-487e-b0a5-3d14b9cf8c69


(3, list, llama_index.core.schema.Document)

In [None]:
# Pretty-print every field of the first parsed document as a plain dict.
pprint(dict(documents[0]))

{'audio_resource': None,
 'embedding': None,
 'excluded_embed_metadata_keys': [],
 'excluded_llm_metadata_keys': [],
 'id_': '09942564-dfa8-42c8-bfcc-a49cc4983fdb',
 'image_resource': None,
 'metadata': {},
 'metadata_separator': '\n',
 'metadata_template': '{key}: {value}',
 'relationships': {},
 'text_resource': MediaResource(embeddings=None, data=None, text='Sample PDF\nCreated for testing PDFObject\n\nThis PDF is three pages long. Three long pages. Or three short pages if you’re optimistic. Is it the same as saying “three long minutes”, knowing that all minutes are the same duration, and one cannot possibly be longer than the other? If these pages are all the same size, can one possibly be longer than the other?\n\nI digress. Here’s some Latin. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Integer nec odio. Praesent libero. Sed cursus ante dapibus diam. Sed nisi. Nulla quis sem at nibh elementum imperdiet. Duis sagittis ipsum. Praesent mauris. Fusce nec tellus sed augue 

**LlamaIndex Chunking Techniques**

LlamaIndex uses Node Parsers that convert Documents into Node objects, where each node is a chunk inheriting metadata from the parent document.

1. **SentenceSplitter (Basic & Recommended)**

In [None]:
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import Document

# The two splitters differ only in chunk size, so the common options are declared once.
shared_splitter_options = dict(
    chunk_overlap=20,  # Overlap between chunks
    paragraph_separator="\n\n\n",
    secondary_chunking_regex="[^,.;。]+[,.;。]?",
)

# chunk_size is the target number of tokens per chunk: 1024 (coarse) vs. 256 (fine).
splitter1 = SentenceSplitter(chunk_size=1024, **shared_splitter_options)
splitter2 = SentenceSplitter(chunk_size=256, **shared_splitter_options)

In [None]:
# Display the splitter's repr to check the configuration it ended up with.
splitter1

SentenceSplitter(include_metadata=True, include_prev_next_rel=True, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x7a3971caea20>, id_func=<function default_id_func at 0x7a397c980fe0>, chunk_size=1024, chunk_overlap=20, separator=' ', paragraph_separator='\n\n\n', secondary_chunking_regex='[^,.;。]+[,.;。]?')

In [None]:
# Chunk the same parsed documents with both splitters so the results can be compared.
nodes_with_splitter1 = splitter1.get_nodes_from_documents(documents)
nodes_with_splitter2 = splitter2.get_nodes_from_documents(documents)

# Report the container type and the node count for each splitter.
print(f"Nodes with Splitter 1: \nType: {type(nodes_with_splitter1)}, \nNum_of_nodes: {len(nodes_with_splitter1)}", end="\n\n")
print(f"Nodes with Splitter 2: \nType: {type(nodes_with_splitter2)}, \nNum_of_nodes: {len(nodes_with_splitter2)}")


Nodes with Splitter 1: 
Type: <class 'list'>, 
Num_of_nodes: 4

Nodes with Splitter 2: 
Type: <class 'list'>, 
Num_of_nodes: 15


In [None]:
# Pretty-print every field of the first node produced by splitter1.
pprint(dict(nodes_with_splitter1[0]))

{'embedding': None,
 'end_char_idx': 2970,
 'excluded_embed_metadata_keys': [],
 'excluded_llm_metadata_keys': [],
 'id_': '36900db6-1d83-4ed6-b871-6b6870b270c5',
 'metadata': {},
 'metadata_separator': '\n',
 'metadata_seperator': '\n',
 'metadata_template': '{key}: {value}',
 'mimetype': 'text/plain',
 'relationships': {<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='09942564-dfa8-42c8-bfcc-a49cc4983fdb', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='92477fef8bfaa85668c659571b7bd25684c419d76f41bcbb48bb49adecc8082c')},
 'start_char_idx': 0,
 'text': 'Sample PDF\n'
         'Created for testing PDFObject\n'
         '\n'
         'This PDF is three pages long. Three long pages. Or three short pages '
         'if you’re optimistic. Is it the same as saying “three long minutes”, '
         'knowing that all minutes are the same duration, and one cannot '
         'possibly be longer than the other? If these pages are all the same '
         'size, can one possibly be

In [None]:
# Pretty-print every field of the first node produced by splitter2.
pprint(dict(nodes_with_splitter2[0]))

{'embedding': None,
 'end_char_idx': 869,
 'excluded_embed_metadata_keys': [],
 'excluded_llm_metadata_keys': [],
 'id_': '87c12de1-381a-4c0c-baf2-d2b635660a07',
 'metadata': {},
 'metadata_separator': '\n',
 'metadata_seperator': '\n',
 'metadata_template': '{key}: {value}',
 'mimetype': 'text/plain',
 'relationships': {<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='09942564-dfa8-42c8-bfcc-a49cc4983fdb', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='92477fef8bfaa85668c659571b7bd25684c419d76f41bcbb48bb49adecc8082c'),
                   <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='94893b92-e9f1-4058-ad76-8c803c7b63d9', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='f9030d151e905b83458cb3ceab0b58d6aa9a6abe2e261037f8a3ee32a9ef378a')},
 'start_char_idx': 0,
 'text': 'Sample PDF\n'
         'Created for testing PDFObject\n'
         '\n'
         'This PDF is three pages long. Three long pages. Or three short pages '
         'if you’re optimistic. Is it

In [None]:
# Pretty-print the second splitter2 node; as a middle chunk its relationships
# include both a PREVIOUS and a NEXT entry.
pprint(dict(nodes_with_splitter2[1]))

{'embedding': None,
 'end_char_idx': 1512,
 'excluded_embed_metadata_keys': [],
 'excluded_llm_metadata_keys': [],
 'id_': '94893b92-e9f1-4058-ad76-8c803c7b63d9',
 'metadata': {},
 'metadata_separator': '\n',
 'metadata_seperator': '\n',
 'metadata_template': '{key}: {value}',
 'mimetype': 'text/plain',
 'relationships': {<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='09942564-dfa8-42c8-bfcc-a49cc4983fdb', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='92477fef8bfaa85668c659571b7bd25684c419d76f41bcbb48bb49adecc8082c'),
                   <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='87c12de1-381a-4c0c-baf2-d2b635660a07', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='c7e0d9657a10dc7ff95928b556c33f5939e31edcacde7c7b26acee50e3359fab'),
                   <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='9531154a-162a-4776-8bf1-491d37954bb4', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='03e1cf68d62801060897ccaf9b7246811bef3776dfca4c65d00

In [None]:
# Inspect nodes : nodes_with_splitter1
print(f"Total nodes created: {len(nodes_with_splitter1)}")
print("\n=== First Node ===")
print(f"Node ID: {nodes_with_splitter1[0].node_id}")
print(f"Text length: {len(nodes_with_splitter1[0].text)} chars")
print(f"Metadata: {nodes_with_splitter1[0].metadata}")
print(f"\nContent preview:\n{nodes_with_splitter1[0].text[:300]}")

# Analyze relationships
print("\n=== Node Relationships ===")
print(f"Source doc: {nodes_with_splitter1[0].source_node}")
if len(nodes_with_splitter1) > 1:
    # relationships is keyed by NodeRelationship members, so looking up the plain
    # string 'next' always misses; next_node resolves the NEXT entry (or None).
    print(f"Next node: {nodes_with_splitter1[0].next_node}")

Total nodes created: 4

=== First Node ===
Node ID: 815cc067-83df-4a99-9de2-c8a3b1679844
Text length: 2970 chars
Metadata: {}

Content preview:
Sample PDF
Created for testing PDFObject

This PDF is three pages long. Three long pages. Or three short pages if you’re optimistic. Is it the same as saying “three long minutes”, knowing that all minutes are the same duration, and one cannot possibly be longer than the other? If these pages are all

=== Node Relationships ===
Source doc: node_id='a138ae6d-a5a8-4b52-9e9d-4d3e68f58f92' node_type=<ObjectType.DOCUMENT: '4'> metadata={} hash='92477fef8bfaa85668c659571b7bd25684c419d76f41bcbb48bb49adecc8082c'
Next node: None


In [None]:
# Inspect nodes : nodes_with_splitter2
print(f"Total nodes created: {len(nodes_with_splitter2)}")
print("\n=== First Node ===")
print(f"Node ID: {nodes_with_splitter2[0].node_id}")
print(f"Text length: {len(nodes_with_splitter2[0].text)} chars")
print(f"Metadata: {nodes_with_splitter2[0].metadata}")
print(f"\nContent preview:\n{nodes_with_splitter2[0].text[:300]}")

# Analyze relationships
print("\n=== Node Relationships ===")
print(f"Source doc: {nodes_with_splitter2[0].source_node}")
if len(nodes_with_splitter2) > 1:
    # relationships is keyed by NodeRelationship members, so looking up the plain
    # string 'next' always misses; next_node resolves the NEXT entry (or None).
    print(f"Next node: {nodes_with_splitter2[0].next_node}")

Total nodes created: 15

=== First Node ===
Node ID: 5833dd49-f87d-429f-866b-1e81975d89c4
Text length: 869 chars
Metadata: {}

Content preview:
Sample PDF
Created for testing PDFObject

This PDF is three pages long. Three long pages. Or three short pages if you’re optimistic. Is it the same as saying “three long minutes”, knowing that all minutes are the same duration, and one cannot possibly be longer than the other? If these pages are all

=== Node Relationships ===
Source doc: node_id='a138ae6d-a5a8-4b52-9e9d-4d3e68f58f92' node_type=<ObjectType.DOCUMENT: '4'> metadata={} hash='92477fef8bfaa85668c659571b7bd25684c419d76f41bcbb48bb49adecc8082c'
Next node: None


**Mixed Content PDF Hybrid Splitter**

In [None]:
import os
from llama_parse import LlamaParse

# Initialize parser: results are requested as markdown, with progress messages enabled.
# NOTE(review): this rebinds `parser` and `documents`, which the SentenceSplitter
# cells above also read; re-running those cells after this one chunks this PDF instead.
parser = LlamaParse(
    result_type="markdown",  # or "text"
    verbose=True
)

# Parse PDF
# NOTE(review): the path is a hard-coded absolute location under /content; adjust it
# when running somewhere the file lives elsewhere.
documents = parser.load_data("/content/mixed_document_10_page.pdf")
# Cell output: number of parsed documents, the container type, and the element type.
len(documents), type(documents), type(documents[0])

Started parsing the file under job_id 69c52dc2-7f9d-4f13-9a24-591822b8dff8


(10, list, llama_index.core.schema.Document)

In [None]:
import re
from typing import List
from collections import Counter
from llama_index.core import Document
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.text_splitter import TokenTextSplitter, SentenceSplitter

# ---------- Optional import: HAS_CODE_SPLITTER records whether CodeSplitter could be imported ----------
# Probe for CodeSplitter; any failure while importing it simply disables code-aware splitting.
try:
    from llama_index.core.node_parser import CodeSplitter
except Exception:
    HAS_CODE_SPLITTER = False
else:
    HAS_CODE_SPLITTER = True

# ---------- TableTextSplitter: splits a table into row-based chunks, repeating the header in each ----------
class TableTextSplitter:
    """Split a pipe- or whitespace-delimited table into row-based markdown chunks.

    Every chunk is rendered as a markdown table that repeats the header row, so
    each chunk can be read on its own.

    Args:
        rows_per_chunk: Maximum number of data rows per chunk (clamped to >= 1).
        row_overlap: Number of rows shared between consecutive chunks
            (clamped to >= 0).
    """

    # A markdown header-delimiter cell such as ``---``, ``:---`` or ``:---:``.
    _SEPARATOR_CELL_RE = re.compile(r":?-+:?")

    def __init__(self, rows_per_chunk: int = 20, row_overlap: int = 0):
        self.rows_per_chunk = max(1, int(rows_per_chunk))
        self.row_overlap = max(0, int(row_overlap))

    @classmethod
    def _is_separator_row(cls, cells):
        """Return True when every cell of the row is a markdown header delimiter."""
        return bool(cells) and all(cls._SEPARATOR_CELL_RE.fullmatch(c) for c in cells)

    def _parse_table_lines(self, lines):
        """Parse table lines into ``{"header": [...], "rows": [[...], ...]}``.

        If any line contains ``|`` the lines are treated as pipe-delimited.
        Otherwise columns are split on runs of two or more whitespace characters
        and only lines with the most common column count are kept. Returns None
        when there is no non-blank line to parse in the whitespace case.
        """
        if any("|" in ln for ln in lines):
            def split_pipe(ln):
                s = ln.strip()
                # Drop one optional leading and trailing border pipe.
                if s.startswith("|"):
                    s = s[1:]
                if s.endswith("|"):
                    s = s[:-1]
                return [c.strip() for c in s.split("|")]

            parsed = [split_pipe(ln) for ln in lines if ln.strip() != ""]
            # A markdown table carries a delimiter row right after its header. It is
            # layout, not data: _rows_to_markdown re-creates it for every chunk, so
            # keeping it would emit a bogus "| --- | --- |" data row in each chunk.
            if len(parsed) > 1 and self._is_separator_row(parsed[1]):
                del parsed[1]
            header = parsed[0] if parsed else []
            return {"header": header, "rows": parsed[1:]}

        def split_ws(ln):
            return [p.strip() for p in re.split(r"\s{2,}", ln) if p.strip() != ""]

        parsed = [split_ws(ln) for ln in lines if ln.strip() != ""]
        if not parsed:
            return None
        # Keep only the lines that agree with the most common column count.
        mode_count = Counter(len(p) for p in parsed).most_common(1)[0][0]
        filtered = [p for p in parsed if len(p) == mode_count]
        return {"header": filtered[0], "rows": filtered[1:]}

    def _rows_to_markdown(self, header, rows):
        """Render ``header`` and ``rows`` as a markdown table string.

        Rows are padded with empty cells or truncated so that every row has
        exactly as many cells as the header.
        """
        def sanitize(cell):
            return str(cell).replace("\n", " ").strip()

        width = len(header)
        out = [
            "| " + " | ".join(sanitize(h) for h in header) + " |",
            "| " + " | ".join(["---"] * width) + " |",
        ]
        for r in rows:
            cells = ([sanitize(c) for c in r] + [""] * width)[:width]
            out.append("| " + " | ".join(cells) + " |")
        return "\n".join(out)

    def split_text(self, text: str):
        """Split table ``text`` into a list of markdown table strings.

        Returns ``[text.strip()]`` when the text cannot be parsed as a table, and
        a single header-only table when there are no data rows.
        """
        lines = [ln.rstrip() for ln in text.splitlines() if ln.strip() != ""]
        parsed = self._parse_table_lines(lines)
        if parsed is None:
            return [text.strip()]
        header, rows = parsed["header"], parsed["rows"]
        if not rows:
            return [self._rows_to_markdown(header, [])]
        chunks = []
        total_rows = len(rows)
        # Advance by the non-overlapping part of a chunk, but always by at least one row.
        step = max(1, self.rows_per_chunk - self.row_overlap)
        for start in range(0, total_rows, step):
            end = min(start + self.rows_per_chunk, total_rows)
            chunks.append(self._rows_to_markdown(header, rows[start:end]))
            # Stop once the last row is covered, so no trailing chunk made only of
            # overlap rows is emitted.
            if end == total_rows:
                break
        return chunks

# ---------- Heuristics for detection ----------
# Substrings whose presence marks a block as code (compared case-insensitively).
CODE_KEYWORDS = ["def ", "class ", "import ", "from ", "#include", "console.log", "printf(", "println(", "System.out."]
# Characters counted as "code symbols" for the symbol-density check.
CODE_SYMBOLS_RE = r"[{};=><>#\(\)\[\]]"

def is_code_block(text: str, min_lines_for_code=2, symbol_ratio_threshold=0.03) -> bool:
    """Heuristically decide whether ``text`` looks like source code.

    The checks run in order and the first hit wins:
      1. the text contains a ``` fence;
      2. with at least ``min_lines_for_code`` non-blank lines, about a third or
         more of them start with four spaces or a tab;
      3. any CODE_KEYWORDS entry occurs in the text (case-insensitive);
      4. the share of CODE_SYMBOLS_RE characters exceeds ``symbol_ratio_threshold``;
      5. about a quarter or more of the non-blank lines end with ``;``.

    NOTE(review): keywords are matched as substrings anywhere in the text, so
    ordinary prose containing e.g. "from " or "class " is classified as code too.

    Args:
        text: Block of text to classify.
        min_lines_for_code: Minimum non-blank line count for the indentation check.
        symbol_ratio_threshold: Symbol-to-character ratio above which text is code.

    Returns:
        True when any heuristic fires; False otherwise, including for empty or
        whitespace-only input.
    """
    if not text or not text.strip():
        return False
    if "```" in text:
        return True
    lines = [ln for ln in text.splitlines() if ln.strip() != ""]
    if len(lines) >= min_lines_for_code:
        indented = sum(1 for ln in lines if ln.startswith(("    ", "\t")))
        if indented >= max(1, len(lines) // 3):
            return True
    lower = text.lower()
    # The text is lower-cased, so the keywords must be as well; otherwise
    # mixed-case entries such as "System.out." could never match.
    if any(kw.lower() in lower for kw in CODE_KEYWORDS):
        return True
    symbols = len(re.findall(CODE_SYMBOLS_RE, text))
    # text is non-empty here (guarded above), so the division is safe.
    if symbols / len(text) > symbol_ratio_threshold:
        return True
    ends_semi = sum(1 for ln in lines if ln.strip().endswith(";"))
    return ends_semi >= max(1, len(lines) // 4)

def is_table_block(text: str, min_rows=2, min_cols=2, consistency_threshold=0.6) -> bool:
    """Heuristically decide whether ``text`` is laid out as a table.

    Columns are counted per non-blank line, splitting on ``|`` when any line
    contains a pipe and on runs of two or more whitespace characters otherwise.
    The text counts as a table when the most common column count is at least
    ``min_cols`` and is shared by at least ``consistency_threshold`` of the lines.

    Args:
        text: Block of text to classify.
        min_rows: Minimum number of non-blank lines required.
        min_cols: Minimum number of columns a table row must have.
        consistency_threshold: Required fraction of lines having the dominant
            column count.

    Returns:
        True when the block looks like a table, False otherwise.
    """
    if not text or not text.strip():
        return False

    rows = [raw.rstrip() for raw in text.splitlines() if raw.strip() != ""]
    if len(rows) < min_rows:
        return False

    # Pick the delimiter once for the whole block.
    delimiter = r"\|" if any("|" in row for row in rows) else r"\s{2,}"
    widths = []
    for row in rows:
        cells = [cell for cell in re.split(delimiter, row) if cell.strip() != ""]
        widths.append(len(cells))

    # At least one line must be wide enough to be a table row.
    if not any(width >= min_cols for width in widths):
        return False

    dominant_width, occurrences = Counter(widths).most_common(1)[0]
    return occurrences / len(rows) >= consistency_threshold and dominant_width >= min_cols
# ---------- Main dispatcher: uses CodeSplitter when available ----------
def _base_block_metadata(doc, block_index: int) -> dict:
    """Build the metadata entries shared by every chunk cut from one block of ``doc``."""
    doc_metadata = getattr(doc, "metadata", None)
    return {
        "orig_doc_id": getattr(doc, "id_", None) or getattr(doc, "doc_id", None),
        "page": doc_metadata.get("page") if doc_metadata else None,
        "block_index": block_index,
    }


def _tag_nodes(nodes, splitter_name: str):
    """Record ``splitter_used`` in each node's metadata and return the nodes."""
    for node in nodes:
        node.metadata = node.metadata or {}
        node.metadata["splitter_used"] = splitter_name
    return nodes


def hybrid_chunk_documents_with_codesplitter(
        documents: List[Document],
        paragraph_regex: str = r"\n{2,}",
        text_chunk_size: int = 128,
        text_overlap: int = 16,
        code_chunk_lines: int = 30,
        code_overlap_lines: int = 10,
        code_max_chars: int = 1500,
        table_rows_per_chunk: int = 20,
        table_row_overlap: int = 0,
):
    """Chunk documents block by block, picking a splitter per block type.

    Each document's text is split into blocks with ``paragraph_regex``; every
    non-empty block is classified with ``is_table_block`` / ``is_code_block``
    (table detection takes precedence) and dispatched:

      * table -> ``TableTextSplitter`` (label "table");
      * code  -> ``CodeSplitter`` when it is available and succeeds (label "code"),
        otherwise a ``TokenTextSplitter`` fallback (label "code_fallback");
      * text  -> ``SentenceSplitter`` (label "sentence"), or ``TokenTextSplitter``
        (label "token") if the sentence splitter cannot be constructed.

    Every node's metadata carries ``orig_doc_id``, ``page``, ``block_index``, the
    detection flags, ``splitter`` and ``splitter_used``.

    NOTE(review): presumably the metadata-aware splitters count this bookkeeping
    metadata against ``chunk_size`` — verify that the default ``text_chunk_size``
    leaves enough room for the text itself.

    Args:
        documents: Documents to chunk.
        paragraph_regex: Regex used to break a document's text into blocks.
        text_chunk_size: Chunk size for prose blocks.
        text_overlap: Chunk overlap for prose blocks.
        code_chunk_lines: ``chunk_lines`` passed to CodeSplitter.
        code_overlap_lines: ``chunk_lines_overlap`` passed to CodeSplitter.
        code_max_chars: ``max_chars`` passed to CodeSplitter.
        table_rows_per_chunk: Rows per chunk for table blocks.
        table_row_overlap: Row overlap between consecutive table chunks.

    Returns:
        A flat list with the nodes of all documents, in document and block order.
    """
    import logging  # local import: keeps this cell independent of the notebook's import cell

    logger = logging.getLogger(__name__)
    all_nodes = []

    # Splitters are created once and reused for every block.
    try:
        text_splitter = SentenceSplitter(chunk_size=text_chunk_size, chunk_overlap=text_overlap)
    except Exception:
        text_splitter = TokenTextSplitter(chunk_size=text_chunk_size, chunk_overlap=text_overlap)
    text_splitter_name = "sentence" if isinstance(text_splitter, SentenceSplitter) else "token"

    table_splitter = TableTextSplitter(rows_per_chunk=table_rows_per_chunk, row_overlap=table_row_overlap)
    # Table chunks are already cut by rows; this splitter only turns each chunk into
    # nodes, with a chunk size large enough that a chunk is normally kept whole.
    table_node_builder = TokenTextSplitter(chunk_size=10000, chunk_overlap=0)
    # Used for code blocks that CodeSplitter cannot handle.
    code_fallback_splitter = TokenTextSplitter(chunk_size=400, chunk_overlap=20)

    code_splitter = None
    if HAS_CODE_SPLITTER:
        try:
            code_splitter = CodeSplitter(language="python",
                                         chunk_lines=code_chunk_lines,
                                         chunk_lines_overlap=code_overlap_lines,
                                         max_chars=code_max_chars)
        except Exception as exc:
            # Best effort: keep going without code-aware splitting, but say so.
            logger.warning("CodeSplitter could not be created (%s); using the token fallback for code blocks.", exc)
            code_splitter = None

    for doc in documents:
        # Robustly extract text from whatever document-like object was passed in.
        if hasattr(doc, "get_text"):
            text = doc.get_text()
        elif hasattr(doc, "text"):
            text = doc.text
        else:
            text = str(doc)

        # Coarse break into blocks (paragraphs / blank lines).
        for b_idx, block in enumerate(re.split(paragraph_regex, text)):
            block = block.strip()
            if not block:
                continue

            base_metadata = _base_block_metadata(doc, b_idx)
            detected_table = is_table_block(block)
            detected_code = False if detected_table else is_code_block(block)

            # CASE 1: Table
            if detected_table:
                for ci, ct in enumerate(table_splitter.split_text(block)):
                    tmp_doc = Document(text=ct, metadata={
                        **base_metadata,
                        "detected_table": True,
                        "splitter": "table",
                        "chunk_index": ci,
                    })
                    nodes = table_node_builder.get_nodes_from_documents([tmp_doc])
                    all_nodes.extend(_tag_nodes(nodes, "table"))
                continue

            # CASE 2: Code, handled by CodeSplitter when possible
            if detected_code and code_splitter is not None:
                tmp_doc = Document(text=block, metadata={
                    **base_metadata,
                    "detected_code": True,
                    "splitter": "code",
                })
                try:
                    code_nodes = code_splitter.get_nodes_from_documents([tmp_doc])
                except Exception as exc:
                    logger.warning("CodeSplitter failed on block %d (%s); using the token fallback.", b_idx, exc)
                else:
                    all_nodes.extend(_tag_nodes(code_nodes, "code"))
                    continue

            # CASE 3: Text, or code fallback. A code block only gets here when
            # CodeSplitter is missing or failed, so it always takes the token
            # fallback instead of being sentence-split and labelled as prose.
            if detected_code:
                splitter_name = "code_fallback"
                splitter_for_node = code_fallback_splitter
            else:
                splitter_name = text_splitter_name
                splitter_for_node = text_splitter

            tmp_doc = Document(text=block, metadata={
                **base_metadata,
                "detected_code": bool(detected_code),
                "detected_table": bool(detected_table),
                "splitter": splitter_name,
            })
            nodes = splitter_for_node.get_nodes_from_documents([tmp_doc])
            all_nodes.extend(_tag_nodes(nodes, splitter_name))

    return all_nodes

# ----------------- Example usage -----------------
# nodes = hybrid_chunk_documents_with_codesplitter(documents)
# print("nodes:", len(nodes))
# print("sample node metadata:", nodes[0].metadata)


In [None]:
# Chunk the parsed documents with the hybrid dispatcher, using its default settings.
nodes = hybrid_chunk_documents_with_codesplitter(documents)

In [None]:
# Total number of nodes produced across all documents.
len(nodes)

46

In [None]:
# Print every node's fields as a dict, with a divider line after each node.
# NOTE(review): this dumps all nodes; slice `nodes` to keep the output short.
for nodeob in nodes:
    pprint(dict(nodeob))
    print("\n=====================================================\n")

{'embedding': None,
 'end_char_idx': 29,
 'excluded_embed_metadata_keys': [],
 'excluded_llm_metadata_keys': [],
 'id_': 'e3cc8ec4-fcfe-49a2-a529-0b09b555b81f',
 'metadata': {'block_index': 0,
              'detected_code': False,
              'detected_table': False,
              'orig_doc_id': '07d3ec08-b4c7-48d7-8dcc-3b3c58f0d4fe',
              'page': None,
              'splitter': 'sentence',
              'splitter_used': 'sentence'},
 'metadata_separator': '\n',
 'metadata_seperator': '\n',
 'metadata_template': '{key}: {value}',
 'mimetype': 'text/plain',
 'relationships': {<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='5f650162-9e05-401e-bec9-791cf3225ce6', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'orig_doc_id': '07d3ec08-b4c7-48d7-8dcc-3b3c58f0d4fe', 'page': None, 'block_index': 0, 'detected_code': False, 'detected_table': False, 'splitter': 'sentence'}, hash='57c3699890d31d53be16654720ecd67e94157d82b387bf4cc13d11056a478615')},
 'start_char_idx': 0,
 'tex