In [6]:
# define llm
from llama_index.llms.ollama import Ollama

llm = Ollama(
    model="phi4-mini",
    base_url="http://localhost:11434", 
    request_timeout=360.0,
    temperature=0.01
    )

response = llm.complete("What is the capital of France?")
print(response)

The capital of France is Paris. It serves as a major European city and cultural center, known for its history dating back to Roman times when it was called Lutetia.


### Instruction 2 (More difficult with at least 5 more constraints)


In [7]:
from pathlib import Path

all_files_gen = Path("./data/").rglob("*")
all_files = [f.resolve() for f in all_files_gen]
all_pdf_files = [f for f in all_files if f.suffix.lower() == ".pdf"]
len(all_pdf_files)


3

In [8]:
from llama_index.core import Document
from docling.document_converter import DocumentConverter

converter = DocumentConverter()

In [9]:
doc = converter.convert(all_pdf_files[0])

In [10]:
from docling.chunking import HybridChunker

chunker = HybridChunker()
chunk_iter = chunker.chunk(dl_doc=doc.document)

Token indices sequence length is longer than the specified maximum sequence length for this model (557 > 512). Running this sequence through the model will result in indexing errors


In [11]:
from transformers import AutoTokenizer

from docling.chunking import HybridChunker,HierarchicalChunker

EMBED_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"
MAX_TOKENS = 500  # set to a small number for illustrative purposes

tokenizer = AutoTokenizer.from_pretrained(EMBED_MODEL_ID)

chunker = HierarchicalChunker(
    tokenizer=tokenizer,  # instance or model name, defaults to "sentence-transformers/all-MiniLM-L6-v2"
    max_tokens=MAX_TOKENS,  # optional, by default derived from `tokenizer`
    merge_peers=True,  # optional, defaults to True
)
chunk_iter = chunker.chunk(dl_doc=doc.document)
chunks = list(chunk_iter)

In [12]:
for i, chunk in enumerate(chunks):
    print(f"=== {i} ===")
    txt_tokens = len(tokenizer.tokenize(chunk.text))
    print(f"chunk.text ({txt_tokens} tokens):\n{repr(chunk.text)}")

    ser_txt = chunker.serialize(chunk=chunk)
    ser_tokens = len(tokenizer.tokenize(ser_txt))
    print(f"chunker.serialize(chunk) ({ser_tokens} tokens):\n{repr(ser_txt)}")

    print()

=== 0 ===
chunk.text (2 tokens):
'Internet Banking'
chunker.serialize(chunk) (30 tokens):
'Eseguito Bonifico Europeo Unico in data 20.11.2024 20.11.2024 21:38\nInternet Banking'

=== 1 ===
chunk.text (27 tokens):
'Vi confermiamo il Vostro ordine di Bonifico Europeo Unico in data 20.11.2024'
chunker.serialize(chunk) (55 tokens):
'Eseguito Bonifico Europeo Unico in data 20.11.2024 20.11.2024 21:38\nVi confermiamo il Vostro ordine di Bonifico Europeo Unico in data 20.11.2024'

=== 2 ===
chunk.text (5 tokens):
'Numero ordine'
chunker.serialize(chunk) (33 tokens):
'Eseguito Bonifico Europeo Unico in data 20.11.2024 20.11.2024 21:38\nNumero ordine'

=== 3 ===
chunk.text (12 tokens):
'INTER20241120BOSBE350192748'
chunker.serialize(chunk) (40 tokens):
'Eseguito Bonifico Europeo Unico in data 20.11.2024 20.11.2024 21:38\nINTER20241120BOSBE350192748'

=== 4 ===
chunk.text (21 tokens):
'Ordinante De Maio Raul - Pantaleo Rossella Filiale ROMA-TUSCOLANA'
chunker.serialize(chunk) (49 tokens):
'Esegu

In [25]:
from llama_index.embeddings.ollama import OllamaEmbedding
ollama_embedding = OllamaEmbedding(
    model_name="nomic-embed-text:latest",
    base_url="http://localhost:11434",
    ollama_additional_kwargs={"mirostat": 0},
)

In [27]:
embed_dim = len(ollama_embedding.get_text_embedding("hi"))


In [19]:
doc.document.json()

/var/folders/sj/w2mxmxh96gn0y5qrl52g4qxh0000gn/T/ipykernel_10440/2301761976.py:1: PydanticDeprecatedSince20: The `json` method is deprecated; use `model_dump_json` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  doc.document.json()


'{"schema_name":"DoclingDocument","version":"1.1.0","name":"bonifico","origin":{"mimetype":"application/pdf","binary_hash":1676812655808121847,"filename":"bonifico.pdf","uri":null},"furniture":{"self_ref":"#/furniture","parent":null,"children":[],"content_layer":"furniture","name":"_root_","label":"unspecified"},"body":{"self_ref":"#/body","parent":null,"children":[{"cref":"#/pictures/0"},{"cref":"#/groups/0"},{"cref":"#/texts/51"},{"cref":"#/texts/52"}],"content_layer":"body","name":"_root_","label":"unspecified"},"groups":[{"self_ref":"#/groups/0","parent":{"cref":"#/body"},"children":[{"cref":"#/texts/0"},{"cref":"#/texts/1"},{"cref":"#/texts/2"},{"cref":"#/texts/3"},{"cref":"#/texts/4"},{"cref":"#/texts/5"},{"cref":"#/texts/6"},{"cref":"#/texts/7"},{"cref":"#/texts/8"},{"cref":"#/texts/9"},{"cref":"#/texts/10"},{"cref":"#/texts/11"},{"cref":"#/texts/12"},{"cref":"#/texts/13"},{"cref":"#/texts/14"},{"cref":"#/texts/15"},{"cref":"#/texts/16"},{"cref":"#/texts/17"},{"cref":"#/texts/18