### Read data

In [None]:
from docling.document_converter import DocumentConverter
import os 

# Folder that holds the input files, and the PDF file name inside it.
DATA_DIR = '../data/'
source = 'Disposizione Di Bonifico.pdf'
pdf_path = os.path.join(DATA_DIR, source)

# Convert the PDF with docling and print its Markdown rendering.
converter = DocumentConverter()
result = converter.convert(pdf_path)
markdown_text = result.document.export_to_markdown()
print(markdown_text)

### Define Template

In [None]:
from pydantic import BaseModel, Field
from typing import List
from datetime import datetime

# Schema for a single invoice line: the item name and its price.
# The Italian docstring and field descriptions are kept verbatim because
# pydantic publishes them as `description` entries in the model's JSON schema.
class LineItem(BaseModel):
    """Una linea di un oggetto in una fattura."""

    # Name of the item.
    item_name: str = Field(
        description="Il nome dell'articolo",
    )
    # Price of the item.
    price: float = Field(
        description="Il prezzo dell'articolo",
    )


# Schema for the data extracted from one invoice.
# The Italian docstring and field descriptions are kept verbatim because
# pydantic publishes them as `description` entries in the model's JSON schema.
class Invoice(BaseModel):
    """Una rappresentazione di informazione di una fattura."""

    # Unique invoice identifier, often an alphanumeric code.
    invoice_id: str = Field(
        description="Un identificativo univoco per questa fattura. Spesso un codice alfanumerico.",
    )
    # Date on which the invoice was created.
    date: datetime = Field(
        description="La data in cui la fattura è stata creata.",
    )
    # Total price of the invoice.
    total_price: float = Field(
        description="Il prezzo totale della fattura.",
    )
    # VAT (IVA) applied to the invoice.
    iva: float = Field(
        description="L'IVA applicata alla fattura.",
    )
    # Every line item included in the invoice.
    line_items: list[LineItem] = Field(
        description="Una lista di tutti gli articoli compresi nella fattura",
    )


In [None]:
# Schema for a yes/no answer plus its detailed explanation.
# NOTE(review): "dicotomic" in the docstring is a typo for "dichotomous"; the
# text is left untouched because pydantic emits the docstring in the JSON schema.
class Answer(BaseModel):
    """A generic structured answer to a question which requires dicotomic response (yes or no)."""

    # Boolean verdict: True for "yes", False otherwise.
    short_response: bool = Field(
        description="True if the answer to the question is yes, False otherwise",
    )
    # Free-text, detailed version of the answer.
    response: str = Field(
        description="The long and detailed answer to the question.",
    )

In [None]:
from llama_index.llms.ollama import Ollama

In [None]:
from llama_index.llms.ollama import Ollama
from llama_index.core.program import FunctionCallingProgram

# Initialize the Ollama LLM used by the extraction and query cells.
#
# The context size is set through `context_window`: `num_ctx` is not a
# constructor argument of the LlamaIndex `Ollama` wrapper, which instead sends
# `context_window` to the server as the `num_ctx` option.
#
# NOTE(review): `format` does not look like a declared `Ollama` argument either;
# `json_mode=True` appears to be the documented switch. It is left as-is because
# enabling JSON mode changes how every response is decoded — confirm first.
llm = Ollama(
    model="phi4-mini",
    request_timeout=120.0,
    format='json',
    temperature=0.01,
    context_window=32000,
)

# Prompt template for the (currently disabled) function-calling program below;
# `{document}` is replaced with the document text.
prompt_template_str = (
    "Extract the invoice details from the following document:\n\n"
    "{document}\n\n"
)

# Create the FunctionCallingProgram (disabled)
#program = FunctionCallingProgram.from_defaults(
#    output_cls=Invoice,
#    prompt_template_str=prompt_template_str,
#    llm=llm,
#    verbose=True
#)


In [None]:
#program(document=result.document.export_to_markdown())

In [None]:
#source = 'E00324337228-IT4453739-z80K5gyUNjnHM3sO8hfJk0XP.pdf'  # document per local path or URL
#converter = DocumentConverter()
#result = converter.convert(source)
#print(result.document.export_to_markdown())

#program(document=result.document.export_to_markdown())

### Example with Structured_Predict (using function)

In [None]:
from llama_index.core.prompts import PromptTemplate

# Italian extraction prompt; `{text}` is filled with the document's Markdown.
prompt = PromptTemplate("Estrai i dati della fattura dal seguente testo:{text}")

# Ask the LLM for an `Invoice` built from the converted document. The call is
# the cell's last expression, so the notebook displays its result.
invoice_markdown = result.document.export_to_markdown()
llm.structured_predict(Invoice, prompt=prompt, text=invoice_markdown)

### Example combining RAG

In [None]:
from llama_index.core import Document

# Wrap the converted Markdown in a LlamaIndex Document and record the path of
# the file it came from as `source` metadata.
document = Document(
    text=result.document.export_to_markdown(),
    metadata={'source': os.path.join(DATA_DIR, source)},
)

In [None]:
# Bare expression: lets the notebook display the `document` object.
document

In [None]:
from llama_index.core.node_parser import MarkdownNodeParser

# Split the document into nodes with the Markdown parser, keeping metadata and
# previous/next relationships on each node.
markdown_parser = MarkdownNodeParser(
    include_metadata=True,
    include_prev_next_rel=True,
)
nodes = markdown_parser.get_nodes_from_documents([document])

In [None]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings


# Load the multilingual E5 (small) embedding model and register it as the
# embedding model in the global LlamaIndex `Settings`.
embed_model = HuggingFaceEmbedding(
    model_name='intfloat/multilingual-e5-small',
)
Settings.embed_model = embed_model



In [None]:
from llama_index.core import VectorStoreIndex

# Build a vector index over the parsed nodes using the embedding model above.
index = VectorStoreIndex(
    nodes=nodes,
    embed_model=embed_model,
)

In [None]:
# Retrieve the 5 most similar nodes for the Italian query "Beneficiario fattura"
# (invoice beneficiary); the notebook displays the retrieved nodes.
retriever = index.as_retriever(similarity_top_k=5, embed_model=embed_model)
retriever.retrieve("Beneficiario fattura")

In [None]:
# Query engine over the index: top-5 retrieval, "tree_summarize" synthesis, and
# an LLM wrapped so that its output is structured as an `Invoice`.
invoice_llm = llm.as_structured_llm(Invoice)
query_engine = index.as_query_engine(
    similarity_top_k=5,
    llm=invoice_llm,
    response_mode="tree_summarize",
)

In [None]:
# Run the engine currently bound to `query_engine` with the text "sberbank";
# as the cell's last expression, the response is displayed by the notebook.
query_engine.query("sberbank")

In [None]:
# get document and parse it to markdown

# get markdown and build index

# from index build query engine (with reranker if possible)

# Option 1: Ask questions (aka checklist) <- eventually to be combined with feature extraction

# Option 2: Combine with Pydantic and extract structured output (aka datafeed)

# Option 3: Combine Retriever and Pydantic Template as "tools" for the workflow

In [None]:
# Rebind `query_engine` to a variant whose LLM output is structured as an
# `Answer` (yes/no verdict plus explanation); retrieval settings are unchanged.
answer_llm = llm.as_structured_llm(Answer)
query_engine = index.as_query_engine(
    similarity_top_k=5,
    llm=answer_llm,
    response_mode="tree_summarize",
)

In [None]:
# Ask, in Italian: "Is the account jointly held? Answer yes or no."
query_engine.query("Il conto è cointestato? Rispondi con sì o no.")

In [None]:
# Display the JSON schema pydantic generates for the `Invoice` model.
Invoice.model_json_schema()

# 1. Decomposing the Problem

You need to:

* ✅ Process text that exceeds the context window.
* ✅ Extract structured information using a Pydantic schema.
* ✅ Use RAG to retrieve relevant sections.
* ✅ Utilize an agent to manage the flow.



# 2. Suggested Approach
Here’s a modular pipeline for handling the task:

## A. Chunking & Indexing (Preprocessing Step)
Break the long text into semantic chunks (e.g., using LangChain's RecursiveCharacterTextSplitter).
Store these in a vector database (like FAISS, Chroma, Weaviate, etc.) for retrieval.
## B. Agent as an Orchestrator
The agent's role is to:
* Interpret the query (e.g., "Extract company details" → determines which Pydantic model to use).
* Retrieve relevant chunks from the vector store using RAG.
* Pass the chunks to the LLM for structured extraction using the Pydantic class.
* Aggregate results across multiple LLM calls (if necessary).
## C. Query’s Role in the Process
* The query defines what needs to be extracted from the long text.
* It helps the agent filter and retrieve relevant chunks.

Example queries:

* "Extract all person names and their affiliations."
* "Find product descriptions and their pricing information."
* "Summarize legal clauses about termination conditions."


https://docs.llamaindex.ai/en/stable/module_guides/querying/structured_outputs/
https://docs.llamaindex.ai/en/stable/understanding/extraction/lower_level/