### Read data

In [1]:
from docling.document_converter import DocumentConverter
import os 

# Folder and file name of the sample PDF (a bank-transfer confirmation).
DATA_DIR = '../data/'
source = 'Disposizione Di Bonifico.pdf'

# Convert the PDF with Docling and print its Markdown rendering.
converter = DocumentConverter()
pdf_path = os.path.join(DATA_DIR, source)
result = converter.convert(pdf_path)
print(result.document.export_to_markdown())

  from .autonotebook import tqdm as notebook_tqdm


<!-- image -->

## Eseguito Bonifico Europeo Unico in data 20.11.2024 20.11.2024 21:38

Internet Banking

Vi confermiamo il Vostro ordine di Bonifico Europeo Unico in data 20.11.2024

Numero ordine

INTER20241120BOSBE350192748

Ordinante De Maio Raul - Pantaleo Rossella Filiale ROMA-TUSCOLANA

N. C/C

1000/00014233

Dati dell'operazione

Beneficiario

EDIL FIORINI SNC

Indirizzo

Localit

-

Paese

-

Nazione sede legale o residenza beneficiario

ITALIA

Banca

INTESA SANPAOLO SPA Cerveteri VIA ARMANDO DIAZ 10

IBAN

IT51L0306939030000000003129

BIC

BCITITMMXXX

C.F. o P.IVA beneficiario 04223531007

C.F. fruitore della detrazione

DMERLA91S09H501U

Debitore Effettivo

-

Creditore Effettivo

-

Identificativo bonifico

Tipologia

-

Tipologia di spese

Spese su beni di proprieta'

Descrizione

RECUPERO PATRIMONIO EDILIZIO - ART. 16-BIS TUIR

FATTURA nr. 48/2024 del 01/10/ 2024-PROT. CILA N CI/2024/1130 67-ACCONTO LAVORI MAN.

STRAORD

INARIA VIA VESTRICIO SPURINNA 57 00175 ROMA

Impo

### Define Template

In [2]:
from pydantic import BaseModel, Field
from typing import List
from datetime import datetime

class LineItem(BaseModel):
    """Una linea di un oggetto in una fattura."""
    # Schema for a single invoice line: an item name and its price.
    # The docstring and the Field descriptions are deliberately left in
    # Italian: Pydantic exports them as `description` entries of the JSON
    # schema (see the `Invoice.model_json_schema()` output in this notebook),
    # so they are runtime text rather than plain comments.

    item_name: str = Field(description="Il nome dell'articolo")
    price: float = Field(description="Il prezzo dell'articolo")


class Invoice(BaseModel):
    """Una rappresentazione di informazione di una fattura."""
    # Target schema for structured extraction: invoice id, creation date,
    # total price, VAT ("iva") and the list of line items.
    # The docstring and Field descriptions stay in Italian because Pydantic
    # exports them as `description` entries of the JSON schema (see the
    # `Invoice.model_json_schema()` output in this notebook).
    # NOTE(review): the outputs recorded in this notebook contain implausible
    # values for `date` (year 110, year 2011) and `iva` (1001423330000092.0).
    # The schema itself places no constraints on these fields; consider
    # validating them downstream.

    invoice_id: str = Field(
        description="Un identificativo univoco per questa fattura. Spesso un codice alfanumerico."
    )
    date: datetime = Field(description="La data in cui la fattura è stata creata.")
    total_price: float = Field(description="Il prezzo totale della fattura.")
    iva: float = Field(description="L'IVA applicata alla fattura.")
    line_items: list[LineItem] = Field(
        description="Una lista di tutti gli articoli compresi nella fattura"
    )


In [29]:
class Answer(BaseModel):
    """A generic structured answer to a question which requires dicotomic response (yes or no)."""
    # Schema for yes/no questions: a boolean verdict plus a free-text answer.
    # NOTE(review): "dicotomic" in the docstring is presumably meant as
    # "dichotomous". It is left unchanged here because Pydantic exports the
    # docstring as the schema description, so editing it alters runtime text.
    short_response : bool = Field(description="True if the answer to the question is yes, False otherwise")
    response: str = Field(description="The long and detailed answer to the question.")

In [30]:
from llama_index.llms.ollama import Ollama

In [31]:
from llama_index.llms.ollama import Ollama
from llama_index.core.program import FunctionCallingProgram

# Initialize the Ollama LLM (local `phi4-mini` model, very low temperature).
# NOTE(review): the earlier keywords `format='json'` and `num_ctx=32000` are not
# declared constructor parameters of the LlamaIndex `Ollama` wrapper, so they
# appear to have been dropped silently — confirm against the installed
# llama-index-llms-ollama version. `json_mode=True` and `context_window=32000`
# are the wrapper's documented parameters for requesting JSON output and for
# sizing the context window (`num_ctx`).
llm = Ollama(
    model="phi4-mini",
    request_timeout=120.0,
    json_mode=True,
    temperature=0.01,
    context_window=32000,
)

# Prompt template with a single `{document}` placeholder; it is only
# referenced by the commented-out FunctionCallingProgram code.
prompt_template_str = "Extract the invoice details from the following document:\n\n{document}\n\n"

# Create the LLMPydanticProgram
#program = FunctionCallingProgram.from_defaults(
#    output_cls=Invoice,
#    prompt_template_str=prompt_template_str,
#    llm=llm,
#    verbose=True
#)


In [32]:
#program(document=result.document.export_to_markdown())

In [33]:
#source = 'E00324337228-IT4453739-z80K5gyUNjnHM3sO8hfJk0XP.pdf'  # document per local path or URL
#converter = DocumentConverter()
#result = converter.convert(source)
#print(result.document.export_to_markdown())

#program(document=result.document.export_to_markdown())

### Example with Structured_Predict (using function)

In [34]:
from llama_index.core.prompts import PromptTemplate

# Italian instruction prompt; `{text}` is filled with the document Markdown.
prompt = PromptTemplate("Estrai i dati della fattura dal seguente testo:{text}")

# Ask the LLM for an `Invoice` instance; being the last expression of the
# cell, the result is displayed.
document_markdown = result.document.export_to_markdown()
llm.structured_predict(Invoice, prompt=prompt, text=document_markdown)

Invoice(invoice_id='48/2024', date=datetime.datetime(110, 4, 24, 0, 0, tzinfo=TzInfo(+02:00)), total_price=7700.0, iva=1001423330000092.0, line_items=[LineItem(item_name='RECUPERO PATRIMONIO EDILIZIO - ART. 16-BIS TUIR', price=7700.0)])

### Example combining RAG

In [35]:
from llama_index.core import Document

# Wrap the Docling Markdown in a LlamaIndex Document and record the path of
# the originating file as metadata.
source_path = os.path.join(DATA_DIR, source)
document = Document(
    text=result.document.export_to_markdown(),
    metadata={'source': source_path},
)

In [36]:
# Display the Document repr (id, metadata and the full Markdown text).
document

Document(id_='126ba574-e331-4a94-8521-5ae7393abe8d', embedding=None, metadata={'source': '../data/Disposizione Di Bonifico.pdf'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='<!-- image -->\n\n## Eseguito Bonifico Europeo Unico in data 20.11.2024 20.11.2024 21:38\n\nInternet Banking\n\nVi confermiamo il Vostro ordine di Bonifico Europeo Unico in data 20.11.2024\n\nNumero ordine\n\nINTER20241120BOSBE350192748\n\nOrdinante De Maio Raul - Pantaleo Rossella Filiale ROMA-TUSCOLANA\n\nN. C/C\n\n1000/00014233\n\nDati dell\'operazione\n\nBeneficiario\n\nEDIL FIORINI SNC\n\nIndirizzo\n\nLocalit\n\n-\n\nPaese\n\n-\n\nNazione sede legale o residenza beneficiario\n\nITALIA\n\nBanca\n\nINTESA SANPAOLO SPA Cerveteri VIA ARMANDO DIAZ 10\n\nIBAN\n\nIT51L0306939030000000003129\n\nBIC\n\nBCITITMMXXX\n\nC.F. o P.IVA beneficiario 04223531007\n\nC.F

In [37]:
from llama_index.core.node_parser import MarkdownNodeParser

# Split the document into nodes with the Markdown parser, keeping metadata
# and previous/next relationships on each node.
markdown_parser = MarkdownNodeParser(include_metadata=True, include_prev_next_rel=True)
nodes = markdown_parser.get_nodes_from_documents([document])

In [38]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings


# Load the embedding model and also register it on the global LlamaIndex Settings.
Settings.embed_model = embed_model = HuggingFaceEmbedding(
    model_name='intfloat/multilingual-e5-small'
)



In [39]:
from llama_index.core import VectorStoreIndex

# Build a vector index over the parsed nodes using the embedding model above.
index = VectorStoreIndex(
    nodes,
    embed_model=embed_model,
)

In [40]:
# Retrieve the five nodes most similar to the query and display them with
# their scores.
top_k_retriever = index.as_retriever(similarity_top_k=5, embed_model=embed_model)
top_k_retriever.retrieve("Beneficiario fattura")

[NodeWithScore(node=TextNode(id_='e0a120c3-252e-4896-83fe-83b78a3bbc14', embedding=None, metadata={'source': '../data/Disposizione Di Bonifico.pdf', 'header_path': '/'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='126ba574-e331-4a94-8521-5ae7393abe8d', node_type='4', metadata={'source': '../data/Disposizione Di Bonifico.pdf'}, hash='35aed61174b229224b6ff2e00b0fe0071348720cfbccbdf0ff6bc7726cfba554'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='21b0ee65-dbde-43a9-bb1a-e966c1901232', node_type='1', metadata={'header_path': '/'}, hash='c66663a6feb02f619517f95d8239e581488efcbd544137584b9e66917ffa800f')}, metadata_template='{key}: {value}', metadata_separator='\n', text='<!-- image -->', mimetype='text/plain', start_char_idx=0, end_char_idx=14, metadata_seperator='\n', text_template='{metadata_str}\n\n{content}'), score=0.8721792632671911),
 NodeWithScore(node=TextNode(id_='21b0ee65-dbde-43

In [41]:
# Query engine whose LLM is wrapped to return `Invoice` objects; answers are
# synthesized from the top 5 retrieved nodes in "tree_summarize" mode.
invoice_llm = llm.as_structured_llm(Invoice)
query_engine = index.as_query_engine(
    llm=invoice_llm,
    response_mode="tree_summarize",
    similarity_top_k=5,
)

In [42]:
# Run a query through the Invoice-structured engine and display the response.
# NOTE(review): "sberbank" does not occur in the visible part of the parsed
# document, yet the recorded output is still a populated Invoice — presumably
# a smoke test; confirm the intended query.
query_engine.query("sberbank")

PydanticResponse(response=Invoice(invoice_id='0306968849728104480320003200IT', date=datetime.datetime(2011, 4, 24, 0, 38, 44, tzinfo=TzInfo(+01:00)), total_price=7701.2, iva=1.2, line_items=[LineItem(item_name='RECUPERO PATRIMONIO EDILIZIO - ART. 16-BIS TUIR', price=7700.0), LineItem(item_name='Commissione per Bonifico Europeo Unico (EUR)', price=1.2)]), source_nodes=[NodeWithScore(node=TextNode(id_='e0a120c3-252e-4896-83fe-83b78a3bbc14', embedding=None, metadata={'source': '../data/Disposizione Di Bonifico.pdf', 'header_path': '/'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='126ba574-e331-4a94-8521-5ae7393abe8d', node_type='4', metadata={'source': '../data/Disposizione Di Bonifico.pdf'}, hash='35aed61174b229224b6ff2e00b0fe0071348720cfbccbdf0ff6bc7726cfba554'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='21b0ee65-dbde-43a9-bb1a-e966c1901232', node_type='1', metadata={'header_path': '

In [43]:
# get document and parse it to markdown

# get markdown and build index

# from index build query engine (with reranker if possible)

# Option 1: Ask questions (aka checklist) <- eventually to be combined with feature extraction

# Option 2: Combine with Pydantic and extract structured output (aka datafeed)

# Option 3: Combine Retriever and Pydantic Template as "tools" for the workflow

In [27]:
# Query engine whose LLM is wrapped to return `Answer` objects (yes/no plus
# explanation). NOTE: this rebinds `query_engine`, replacing the
# Invoice-structured engine defined earlier; the execution counts of these
# cells (27/28) are also out of sequence with the rest of the notebook.
answer_llm = llm.as_structured_llm(Answer)
query_engine = index.as_query_engine(
    llm=answer_llm,
    response_mode="tree_summarize",
    similarity_top_k=5,
)

In [28]:
# Ask a yes/no question in Italian ("Is the account jointly held? Answer yes
# or no.") and display the structured response.
query_engine.query("Il conto è cointestato? Rispondi con sì o no.")

PydanticResponse(response=Answer(short_response=True, response='Sì'), source_nodes=[NodeWithScore(node=TextNode(id_='bcdbcfb1-d81e-4b98-b79f-8f2030ad680b', embedding=None, metadata={'source': '../data/Disposizione Di Bonifico.pdf', 'header_path': '/'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='ff90139f-9383-4ab8-a7c7-8cb17cec169e', node_type='4', metadata={'source': '../data/Disposizione Di Bonifico.pdf'}, hash='35aed61174b229224b6ff2e00b0fe0071348720cfbccbdf0ff6bc7726cfba554'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='8e0f6c35-784d-4ebc-807b-84d0fe6ddb18', node_type='1', metadata={'header_path': '/'}, hash='c66663a6feb02f619517f95d8239e581488efcbd544137584b9e66917ffa800f')}, metadata_template='{key}: {value}', metadata_separator='\n', text='<!-- image -->', mimetype='text/plain', start_char_idx=0, end_char_idx=14, metadata_seperator='\n', text_template='{metadata_str}\n\n{conten

In [85]:
# Display the JSON schema Pydantic derives from `Invoice`; the class
# docstrings and Field descriptions show up as `description` entries.
Invoice.model_json_schema()

{'$defs': {'LineItem': {'description': 'Una linea di un oggetto in una fattura.',
   'properties': {'item_name': {'description': "Il nome dell'articolo",
     'title': 'Item Name',
     'type': 'string'},
    'price': {'description': "Il prezzo dell'articolo",
     'title': 'Price',
     'type': 'number'}},
   'required': ['item_name', 'price'],
   'title': 'LineItem',
   'type': 'object'}},
 'description': 'Una rappresentazione di informazione di una fattura.',
 'properties': {'invoice_id': {'description': 'Un identificativo univoco per questa fattura. Spesso un codice alfanumerico.',
   'title': 'Invoice Id',
   'type': 'string'},
  'date': {'description': 'La data in cui la fattura è stata creata.',
   'format': 'date-time',
   'title': 'Date',
   'type': 'string'},
  'total_price': {'description': 'Il prezzo totale della fattura.',
   'title': 'Total Price',
   'type': 'number'},
  'iva': {'description': "L'IVA applicata alla fattura.",
   'title': 'Iva',
   'type': 'number'},
  'l

# 1. Decomposing the Problem

You need to:

* ✅ Process text that exceeds the context window.
* ✅ Extract structured information using a Pydantic schema.
* ✅ Use RAG to retrieve relevant sections.
* ✅ Utilize an agent to manage the flow.



# 2. Suggested Approach
Here’s a modular pipeline for handling the task:

## A. Chunking & Indexing (Preprocessing Step)
Break the long text into semantic chunks (e.g., using LangChain's RecursiveCharacterTextSplitter).
Store these in a vector database (like FAISS, Chroma, Weaviate, etc.) for retrieval.
## B. Agent as an Orchestrator
The agent's role is to:
* Interpret the query (e.g., "Extract company details" → determines which Pydantic model to use).
* Retrieve relevant chunks from the vector store using RAG.
* Pass the chunks to the LLM for structured extraction using the Pydantic class.
* Aggregate results across multiple LLM calls (if necessary).
## C. Query’s Role in the Process
* The query defines what needs to be extracted from the long text.
* It helps the agent filter and retrieve relevant chunks.

Example queries:

* "Extract all person names and their affiliations."
* "Find product descriptions and their pricing information."
* "Summarize legal clauses about termination conditions."


References:

* https://docs.llamaindex.ai/en/stable/module_guides/querying/structured_outputs/
* https://docs.llamaindex.ai/en/stable/understanding/extraction/lower_level/