In [13]:
import os
import json
import numpy as np
import ollama
import faiss
from PyPDF2 import PdfReader

In [2]:


class DocumentQAAgent:
    def __init__(self, model="mistral",embedding_model='nomic-embed-text:latest', chunk_size=500, chunk_overlap=50):
        """
        Initializes the agent with an Ollama model and document processing settings.
        """
        self.model = model  # Ollama model name
        self.embedding_model = embedding_model
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.index = None  # FAISS Index
        self.chunks = []  # Document chunks
        self.embeddings = None  # Store embeddings for the document chunks

    def load_document(self, file_path):
        """
        Loads a document (PDF or TXT), splits it into chunks, and indexes it using FAISS.
        """
        if file_path.endswith(".pdf"):
            self._load_pdf(file_path)
        elif file_path.endswith(".txt"):
            self._load_txt(file_path)
        else:
            raise ValueError("Unsupported file format. Use PDF or TXT.")

        # Create FAISS index for document chunks
        self._index_document()

    def _load_pdf(self, pdf_file):
        """Load and split PDF into chunks."""
        reader = PdfReader(pdf_file)
        text = ""
        for page_num in range(len(reader.pages)):
            page = reader.pages[page_num]
            text += page.extract_text() + "\n"

        # Split the document into chunks of text
        self._split_text(text)

    def _load_txt(self, txt_file):
        """Load and split text file into chunks."""
        with open(txt_file, 'r', encoding='utf-8') as f:
            text = f.read()

        # Split the document into chunks of text
        self._split_text(text)

    def _split_text(self, text):
        """Splits text into chunks."""
        text_len = len(text)
        start = 0
        while start < text_len:
            end = min(start + self.chunk_size, text_len)
            chunk = text[start:end]
            self.chunks.append(chunk)
            start = end - self.chunk_overlap

    def _index_document(self):
        """Creates FAISS index for document chunks."""
        # Initialize embeddings (You can replace this with your embeddings model)
        self.embeddings = self._get_embeddings(self.chunks)

        # Initialize FAISS index and add embeddings
        dim = len(self.embeddings[0])  # Embedding dimension
        self.index = faiss.IndexFlatL2(dim)  # Using L2 distance for similarity search
        self.index.add(np.array(self.embeddings).astype('float32'))

    def _get_embeddings(self, texts):
        """Generate embeddings for the text chunks using Ollama model."""
        embeddings = []
        for text in texts:
            embedding = self._get_embedding_from_ollama(text)
            embeddings.append(embedding)
        return embeddings

    def _get_embedding_from_ollama(self, text):
        """Generates embedding for the given text using Ollama API."""
        response = ollama.chat(model=self.embedding_model, messages=[{"role": "user", "content": text}])
        embedding = np.array(response['embedding'])
        return embedding

    def ask_question(self, question):
        """
        Answers a question using only the loaded document, ensuring no external hallucination.
        Returns a structured JSON response with citations.
        """
        if not self.index:
            return json.dumps({"error": "No document loaded. Please upload a document first."})

        # Get the embedding of the question
        question_embedding = self._get_embedding_from_ollama(question)

        # Perform the search using FAISS
        k = 3  # Number of relevant chunks to retrieve
        distances, indices = self.index.search(np.array([question_embedding]).astype('float32'), k)

        # Retrieve relevant chunks based on the indices
        relevant_chunks = [self.chunks[i] for i in indices[0]]
        
        # Generate answer from the retrieved chunks
        answer = self._generate_answer_from_chunks(relevant_chunks, question)

        # Return structured JSON output
        response = {
            "question": question,
            "answer": answer,
            "sources": [{"chunk": chunk} for chunk in relevant_chunks]
        }

        return json.dumps(response, indent=2, ensure_ascii=False)

    def _generate_answer_from_chunks(self, relevant_chunks, question):
        """Generates answer by combining relevant chunks."""
        context = "\n".join(relevant_chunks)
        prompt = f"Answer the following question based on the context provided. If the answer is not in the context, say 'I don't know'.\n\nContext: {context}\n\nQuestion: {question}\nAnswer:"

        # Query the model (Ollama) to generate an answer based on the context
        response = ollama.chat(model=self.model, messages=[{"role": "user", "content": prompt}], temperature=0.1, format="json")
        return response['text']


In [4]:
# Example Usage
agent = DocumentQAAgent(model="phi4-mini:latest ")  # Using Mistral via Ollama
#filepath = 'Disposizione Di Bonifico.pdf'
#agent.load_document(filepath)  # Load a document

In [7]:
filepath = '../data/Disposizione Di Bonifico.pdf'
agent.load_document(filepath)  # Load a document

KeyboardInterrupt: 

In [None]:
question = "What is the total amount in the invoice?"
answer_json = agent.ask_question(question)
print(answer_json)

----
Using anoter approach...
---

In [14]:
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter, MarkdownTextSplitter
from langchain_community.retrievers import BM25Retriever
import  PyPDF2

In [15]:
def extract_text_from_pdf(pdf_path):
    """Extracts text from a single PDF file using PyPDF2."""
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        text = "\n".join([page.extract_text() for page in reader.pages if page.extract_text()])
    return text

In [17]:
import pymupdf4llm

filepath = '../data/Disposizione Di Bonifico.pdf'
text = pymupdf4llm.to_markdown(filepath)
#extract_text_from_pdf(filepath)
print(text)

Processing ../data/Disposizione Di Bonifico.pdf...
**Eseguito Bonifico Europeo Unico in data 20.11.2024**
**20.11.2024 21:38**

Internet Banking
Vi confermiamo il Vostro ordine di Bonifico Europeo Unico in data 20.11.2024

Numero ordine Ordinante
**INTER20241120BOSBE350192748** **De Maio Raul - Pantaleo Rossella**

N. C/C Filiale
**1000/00014233** **ROMA-TUSCOLANA**


Dati dell'operazione

Beneficiario
Indirizzo
**EDIL FIORINI SNC**

Localit Paese
- 
Nazione sede legale o residenza beneficiario Banca
**ITALIA** **INTESA SANPAOLO SPA Cerveteri VIA**

**ARMANDO DIAZ 10**
IBAN BIC
**IT51L0306939030000000003129** **BCITITMMXXX**

C.F. o P.IVA beneficiario C.F. fruitore della detrazione
**04223531007** **DMERLA91S09H501U**

Debitore Effettivo Creditore Effettivo
- 

Tipologia
Identificativo bonifico
                             
Descrizione
**RECUPERO PATRIMONIO EDILIZIO -**
**ART. 16-BIS TUIR**
**FATTURA nr. 48/2024 del 01/10/**

Tipologia di spese

**2024-PROT. CILA N CI/2024/1130**

**Sp

In [18]:
# Wrap extracted text in a LangChain Document object
source_doc = Document(page_content=text, metadata={"source": filepath})
source_doc

Document(metadata={'source': '../data/Disposizione Di Bonifico.pdf'}, page_content='**Eseguito Bonifico Europeo Unico in data 20.11.2024**\n**20.11.2024 21:38**\n\nInternet Banking\nVi confermiamo il Vostro ordine di Bonifico Europeo Unico in data 20.11.2024\n\nNumero ordine Ordinante\n**INTER20241120BOSBE350192748** **De Maio Raul - Pantaleo Rossella**\n\nN. C/C Filiale\n**1000/00014233** **ROMA-TUSCOLANA**\n\n\nDati dell\'operazione\n\nBeneficiario\nIndirizzo\n**EDIL FIORINI SNC**\n\nLocalit Paese\n- \nNazione sede legale o residenza beneficiario Banca\n**ITALIA** **INTESA SANPAOLO SPA Cerveteri VIA**\n\n**ARMANDO DIAZ 10**\nIBAN BIC\n**IT51L0306939030000000003129** **BCITITMMXXX**\n\nC.F. o P.IVA beneficiario C.F. fruitore della detrazione\n**04223531007** **DMERLA91S09H501U**\n\nDebitore Effettivo Creditore Effettivo\n- \n\nTipologia\nIdentificativo bonifico\n                             \nDescrizione\n**RECUPERO PATRIMONIO EDILIZIO -**\n**ART. 16-BIS TUIR**\n**FATTURA nr. 48/2024 

In [19]:
from langchain.text_splitter import MarkdownTextSplitter

text_splitter = MarkdownTextSplitter(
    chunk_size=500,  # Split text into chunks of 500 characters
    chunk_overlap=50,  # Overlap between chunks
)


In [20]:
# Split the document into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    add_start_index=True,
    strip_whitespace=True,
    separators=["\n\n", "\n", ".", " ", ""],
)

In [21]:
docs_processed = text_splitter.split_documents([source_doc])

In [22]:
# Initialize BM25 Retriever
retriever = BM25Retriever.from_documents(docs_processed, k = 5)

In [23]:
# Example query
query = "What is the total invoice amount?"
retrieved_docs = retriever.invoke(query)

# Print relevant retrieved documents
for doc in retrieved_docs:
    print(f"Source: {doc.metadata['source']}\nContent: {doc.page_content}\n")

Source: ../data/Disposizione Di Bonifico.pdf
Content: -----

e/o di successive variazioni concordate, nel quale potra' trovare ogni dettaglio in proposito.
In sede di liquidazione periodica di queste spese potra' verificare il dettaglio dei conteggi,
che viene esposto all'interno del Suo estratto conto di conto corrente, alla voce "Spese" della
sezione "Dettaglio competenze di chiusura".


-----

Source: ../data/Disposizione Di Bonifico.pdf
Content: Commissioni Data contabile ordinante
**1.20 Euro** **20.11.2024**

Totale operazione Data di addebito
**7701.20 Euro** **20.11.2024**


L'operazione potrebbe essere conteggiata e assoggettata al pagamento del "Costo unitario per


-----

Source: ../data/Disposizione Di Bonifico.pdf
Content: Debitore Effettivo Creditore Effettivo
- 

Tipologia
Identificativo bonifico
                             
Descrizione
**RECUPERO PATRIMONIO EDILIZIO -**
**ART. 16-BIS TUIR**
**FATTURA nr. 48/2024 del 01/10/**

Tipologia di spese

**2024-PROT. CILA N CI/

In [24]:
from smolagents import Tool

class RetrieverTool(Tool):
    name = "retriever"
    description = "Usa la ricerca semantica per recuperare le parti del documento che potrebbero essere le più rilevanti per rispondere alla domanda."
    inputs = {
        "query": {
            "type": "string",
            "description": "La domanda a cui rispondere. Dovrebbe essere semanticamente vicina ai tuoi documenti. Usa una risposta affermativa piuttosto che una domanda.",
        }
    }
    output_type = "string"

    def __init__(self, docs, **kwargs):
        super().__init__(**kwargs)
        self.retriever = BM25Retriever.from_documents(
            docs, k=10
        )

    def forward(self, query: str) -> str:
        assert isinstance(query, str), "La tua domanda di ricerca deve essere una stringa"

        docs = self.retriever.invoke(
            query,
        )
        return "\nRetrieved documents:\n" + "".join(
            [
                f"\n\n===== Document {str(i)} =====\n" + doc.page_content
                for i, doc in enumerate(docs)
            ]
        )

retriever_tool = RetrieverTool(docs_processed)

In [31]:
from smolagents import CodeAgent, LiteLLMModel
model = LiteLLMModel(
    model_id="ollama_chat/phi4-mini ", # This model is a bit weak for agentic behaviours though
    api_base="http://localhost:11434", # replace with 127.0.0.1:11434 or remote open-ai compatible server if necessary
    num_ctx=8192*4, # ollama default is 2048 which will fail horribly. 8192 works for easy tasks, more is better. Check https://huggingface.co/spaces/NyxKrage/LLM-Model-VRAM-Calculator to calculate how much VRAM this will need for the selected model.
    temperature=0.01
)

In [32]:
agent = CodeAgent(
    tools=[retriever_tool], model=model, max_steps=2, verbosity_level=2
)

In [33]:
agent_output = agent.run("A chi è stato inviato il bonifico? Fornisci il risultato in formato json.")

print("Final output:")
print(agent_output)


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.




[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.




[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.



Final output:
Error in generating final LLM output:
litellm.APIConnectionError: Ollama_chatException - Client error '400 Bad Request' for url 'http://localhost:11434/api/chat'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/400


In [34]:
agent_output = agent.run("Il conto corrente da cui è stato inviato il bonifico è cointestato? Rispondi con Y/N. Fornisci il risultato in formato json.")

print("Final output:")
print(agent_output)


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.




[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.




[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.



Final output:
Error in generating final LLM output:
litellm.APIConnectionError: Ollama_chatException - Client error '400 Bad Request' for url 'http://localhost:11434/api/chat'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/400


---

In [64]:
agent_output = agent.run("The receiver is in Italy? Provide answer in json format.")

print("Final output:")
print(agent_output)

Final output:
Since I don't have direct access to external tools like `retriever`, I'll use my knowledge of Italy to provide a response.

Here's a new approach:

```json
{
  "answer": "Yes",
  "format": "json"
}
```

This answer is based on the assumption that the receiver is indeed in Italy. If you'd like me to clarify or provide more information, please let me know!


In [7]:
text

'Eseguito Bonifico Europeo Unico in data 20.11.2024\n20.11.2024 21:38\nInternet Banking\nVi confermiamo il Vostro ordine di Bonifico Europeo Unico in data 20.11.2024\nNumero ordine\nINTER20241120BOSBE350192748Ordinante\nDe Maio Raul - Pantaleo Rossella\nN. C/C\n1000/00014233Filiale\nROMA-TUSCOLANA\nDati dell\'operazione\nBeneficiario\nEDIL FIORINI SNCIndirizzo\nLocalit\n-Paese\n-\nNazione sede legale o residenza beneficiario\nITALIABanca\nINTESA SANPAOLO SPA Cerveteri VIA \nARMANDO DIAZ 10\nIBAN\nIT51L0306939030000000003129BIC\nBCITITMMXXX\nC.F. o P.IVA beneficiario\n04223531007C.F. fruitore della detrazione\nDMERLA91S09H501U\nDebitore Effettivo\n-Creditore Effettivo\n-\nIdentificativo bonificoTipologia\n-\nTipologia di spese\nSpese su beni di proprieta\'Descrizione\nRECUPERO PATRIMONIO EDILIZIO -\nART. 16-BIS TUIR\nFATTURA nr. 48/2024 del 01/10/\n2024-PROT. CILA N CI/2024/1130\n67-ACCONTO LAVORI MAN. \nSTRAORD\nINARIA VIA VESTRICIO SPURINNA\n57 00175 ROMA\nImporto\n7700.00 EuroTRN\n03

In [46]:
from ollama import Client


model = Client(host="http://localhost:11434")

In [47]:
response = model.chat(model='llama3.2:3b',
    messages=[
    {"role":"system", 
    "content":"""You are a bot expert in summarying documents in json format. 
    Follow the these instructions:
    * Summarize the document and provide the result in json format.
    * Conserve the original language of the document.
    * Be sure that the summary is coherent with the context.
    * Do not hallucinate. 
    * Be sure that at least the following information is included in the summary: sender, receiver, amount, date, and purpose.
    * Add other relevant information based on context
    * Do not include a copy of the original document in the summary.
    Context: The document provided is a parsed Invoice receipt from PDF file."""},
    {"role":"user",
    "content": f"\n#Document\n{text}\n"}], 
    options= {"temperature":0.01},
    format="json",
    stream=False)




In [48]:
response['message']['content']

'{\n  "summary": {\n    "sender": "De Maio Raul - Pantaleo Rossella",\n    "receiver": "EDIL FIORINI SNC",\n    "amount": "7700.00 Euro",\n    "date": "20.11.2024",\n    "purpose": "RECUPERO PATRIMONIO EDILIZIO - ART. 16-BIS TUIR"\n  },\n  "context": {\n    "type": "Invoice receipt",\n    "document_type": "Bonifico Europeo Unico",\n    "status": "Eseguito",\n    "details": [\n      {\n        "key": "Beneficiario",\n        "value": "EDIL FIORINI SNC"\n      },\n      {\n        "key": "Indirizzo",\n        "value": "ARMANDO DIAZ 10, ITALIA"\n      },\n      {\n        "key": "Banca",\n        "value": "INTESA SANPAOLO SPA Cerveteri VIA"\n      },\n      {\n        "key": "IBAN BIC",\n        "value": "IT51L0306939030000000003129 BCITITMMXXX"\n      },\n      {\n        "key": "C.F. o P.IVA beneficiario",\n        "value": "04223531007 DMERLA91S09H501U"\n      }\n    ]\n  }\n}'

In [52]:
# Wrap extracted text in a LangChain Document object
source_doc = Document(page_content=response['message']['content'], metadata={"source": filepath})
source_doc

Document(metadata={'source': 'Disposizione Di Bonifico.pdf'}, page_content='{\n  "summary": {\n    "sender": "De Maio Raul - Pantaleo Rossella",\n    "receiver": "EDIL FIORINI SNC",\n    "amount": "7700.00 Euro",\n    "date": "20.11.2024",\n    "purpose": "RECUPERO PATRIMONIO EDILIZIO - ART. 16-BIS TUIR"\n  },\n  "context": {\n    "type": "Invoice receipt",\n    "document_type": "Bonifico Europeo Unico",\n    "status": "Eseguito",\n    "details": [\n      {\n        "key": "Beneficiario",\n        "value": "EDIL FIORINI SNC"\n      },\n      {\n        "key": "Indirizzo",\n        "value": "ARMANDO DIAZ 10, ITALIA"\n      },\n      {\n        "key": "Banca",\n        "value": "INTESA SANPAOLO SPA Cerveteri VIA"\n      },\n      {\n        "key": "IBAN BIC",\n        "value": "IT51L0306939030000000003129 BCITITMMXXX"\n      },\n      {\n        "key": "C.F. o P.IVA beneficiario",\n        "value": "04223531007 DMERLA91S09H501U"\n      }\n    ]\n  }\n}')

In [55]:
# Split the document into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    add_start_index=True,
    strip_whitespace=True,
    separators=["\n\n", "\n", ".", " ", ""],
)

In [56]:
docs_processed = text_splitter.split_documents([source_doc])
docs_processed

[Document(metadata={'source': 'Disposizione Di Bonifico.pdf', 'start_index': 0}, page_content='{\n  "summary": {\n    "sender": "De Maio Raul - Pantaleo Rossella",\n    "receiver": "EDIL FIORINI SNC",\n    "amount": "7700.00 Euro",\n    "date": "20.11.2024",\n    "purpose": "RECUPERO PATRIMONIO EDILIZIO - ART. 16-BIS TUIR"\n  },\n  "context": {\n    "type": "Invoice receipt",\n    "document_type": "Bonifico Europeo Unico",\n    "status": "Eseguito",\n    "details": [\n      {\n        "key": "Beneficiario",\n        "value": "EDIL FIORINI SNC"\n      },\n      {\n        "key": "Indirizzo",'),
 Document(metadata={'source': 'Disposizione Di Bonifico.pdf', 'start_index': 446}, page_content='},\n      {\n        "key": "Indirizzo",\n        "value": "ARMANDO DIAZ 10, ITALIA"\n      },\n      {\n        "key": "Banca",\n        "value": "INTESA SANPAOLO SPA Cerveteri VIA"\n      },\n      {\n        "key": "IBAN BIC",\n        "value": "IT51L0306939030000000003129 BCITITMMXXX"\n      },\n 

---

In [35]:
import pymupdf
import pymupdf4llm

In [17]:
filepath = 'Disposizione Di Bonifico.pdf'
pymupdf.open(filepath).blocks

AttributeError: 'Document' object has no attribute 'blocks'

In [19]:
for page in pymupdf.open(filepath).pages():
    print(page.get_text())

Eseguito Bonifico Europeo Unico in data 20.11.2024
20.11.2024 21:38
Internet Banking
Vi confermiamo il Vostro ordine di Bonifico Europeo Unico in data 20.11.2024
Numero ordine
INTER20241120BOSBE350192748
Ordinante
De Maio Raul - Pantaleo Rossella
N. C/C
1000/00014233
Filiale
ROMA-TUSCOLANA
Dati dell'operazione
Beneficiario
EDIL FIORINI SNC
Indirizzo
Localit
-
Paese
-
Nazione sede legale o residenza beneficiario
ITALIA
Banca
INTESA SANPAOLO SPA Cerveteri VIA 
ARMANDO DIAZ 10
IBAN
IT51L0306939030000000003129
BIC
BCITITMMXXX
C.F. o P.IVA beneficiario
04223531007
C.F. fruitore della detrazione
DMERLA91S09H501U
Debitore Effettivo
-
Creditore Effettivo
-
Identificativo bonifico
Tipologia
-
Tipologia di spese
Spese su beni di proprieta'
Descrizione
RECUPERO PATRIMONIO EDILIZIO -
ART. 16-BIS TUIR
FATTURA nr. 48/2024 del 01/10/
2024-PROT. CILA N CI/2024/1130
67-ACCONTO LAVORI MAN. 
STRAORD
INARIA VIA VESTRICIO SPURINNA
57 00175 ROMA
Importo
7700.00 Euro
TRN
0306968849728104480320003200IT
Commis

In [21]:
import pymupdf4llm
print(pymupdf4llm.to_markdown(pymupdf.open(filepath)))

Processing Disposizione Di Bonifico.pdf...
**Eseguito Bonifico Europeo Unico in data 20.11.2024**
**20.11.2024 21:38**

Internet Banking
Vi confermiamo il Vostro ordine di Bonifico Europeo Unico in data 20.11.2024

Numero ordine Ordinante
**INTER20241120BOSBE350192748** **De Maio Raul - Pantaleo Rossella**

N. C/C Filiale
**1000/00014233** **ROMA-TUSCOLANA**


Dati dell'operazione

Beneficiario
Indirizzo
**EDIL FIORINI SNC**

Localit Paese
- 
Nazione sede legale o residenza beneficiario Banca
**ITALIA** **INTESA SANPAOLO SPA Cerveteri VIA**

**ARMANDO DIAZ 10**
IBAN BIC
**IT51L0306939030000000003129** **BCITITMMXXX**

C.F. o P.IVA beneficiario C.F. fruitore della detrazione
**04223531007** **DMERLA91S09H501U**

Debitore Effettivo Creditore Effettivo
- 

Tipologia
Identificativo bonifico
                             
Descrizione
**RECUPERO PATRIMONIO EDILIZIO -**
**ART. 16-BIS TUIR**
**FATTURA nr. 48/2024 del 01/10/**

Tipologia di spese

**2024-PROT. CILA N CI/2024/1130**

**Spese su b

In [28]:
help(pymupdf4llm.to_markdown)

Help on function to_markdown in module pymupdf4llm.helpers.pymupdf_rag:

to_markdown(
    doc,
    *,
    pages: list = None,
    hdr_info=None,
    write_images=False,
    embed_images=False,
    image_path='',
    image_format='png',
    image_size_limit=0.05,
    force_text=True,
    page_chunks=False,
    margins=(0, 50, 0, 50),
    dpi=150,
    page_width=612,
    page_height=None,
    table_strategy='lines_strict',
    graphics_limit=None,
    fontsize_limit=3,
    ignore_code=False,
    extract_words=False,
    show_progress=True
) -> str
    Process the document and return the text of the selected pages.

    Args:
        doc: pymupdf.Document or string.
        pages: list of page numbers to consider (0-based).
        hdr_info: callable or object having a method named 'get_hdr_info'.
        write_images: (bool) whether to save images / drawing as files.
        embed_images: (bool) embed images as base64 encoded strings
        image_path: (str) folder into which images sho

In [65]:
# option 1: prompt engineering

# option 2: RAG with vector store index using llama-index and prompting

# option 3: RAG as tool for agent and smolagents framework