# Multimodal RAG

This notebook shows how to build a RAG system that leverages the image and text capabilities of the SambaNova multimodal models.

In [1]:
import os
import sys
import glob

# Resolve the kit directory (parent of the working directory) and the repository
# root (parent of the kit), then make both importable.
current_dir = os.getcwd()
kit_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
repo_dir = os.path.abspath(os.path.join(kit_dir, os.pardir))

for extra_path in (kit_dir, repo_dir):
    sys.path.append(extra_path)

from dotenv import load_dotenv

load_dotenv(os.path.join(repo_dir, '.env'), override=True)

import base64
from pathlib import Path
from urllib.parse import urlparse
from urllib.request import urlopen

from pprint import pprint

## Multimodal call

In [2]:
from langchain_sambanova import ChatSambaNova

# Vision-language chat model used for every image+text request in this notebook.
# The API key is read from the environment rather than hard-coded.
lvlm = ChatSambaNova(
    model="Llama-4-Maverick-17B-128E-Instruct",
    base_url="https://api.sambanova.ai/v1",
    api_key=os.getenv("SAMBANOVA_API_KEY"),
)

### Utils

In [3]:
def image_to_base64(image_path_or_url: str) -> str:
    """
    Convert a local image file or HTTP/HTTPS image URL to a base64 data URI.

    For URLs the MIME type is taken from the response's Content-Type header.
    For local files it is looked up from the file name with the standard
    ``mimetypes`` registry, so ``.jpg`` yields ``image/jpeg`` rather than the
    invalid ``image/jpg``. If the registry has no image type for the extension,
    ``image/<extension>`` is used, and ``image/jpeg`` when there is no extension.

    :param image_path_or_url: Filesystem path or URL to the image.
    :return: Base64 data URI string (e.g., data:image/png;base64,...)
    :raises OSError: If the local file cannot be opened or read.
    """
    import mimetypes  # local import keeps this cell runnable on its own

    parsed = urlparse(image_path_or_url)

    if parsed.scheme in ("http", "https"):
        with urlopen(image_path_or_url) as resp:
            image_binary = resp.read()
            content_type = resp.headers.get_content_type() or "image/jpeg"
    else:
        with open(image_path_or_url, "rb") as image_file:
            image_binary = image_file.read()

        # Guess from the bare file name so drive letters or directory names
        # cannot interfere with the lookup.
        guessed_type, _ = mimetypes.guess_type(Path(image_path_or_url).name)
        if guessed_type and guessed_type.startswith("image/"):
            content_type = guessed_type
        else:
            # Unknown to the registry: fall back to the raw extension (or jpeg).
            ext = Path(image_path_or_url).suffix.lower().lstrip(".")
            content_type = f"image/{ext or 'jpeg'}"

    base64_image = base64.b64encode(image_binary).decode()
    return f"data:{content_type};base64,{base64_image}"

### QA Call

In [4]:
prompt = 'how many birds could you find at 4pm:'
image_path = os.path.join(kit_dir, 'data', 'sample_docs', 'sample.png')

# A single user turn carrying both the question text and the sample image
# (sent inline as a base64 data URI).
qa_message = {
    "role": "user",
    "content": [
        {"type": "text", "text": prompt},
        {"type": "image_url", "image_url": {"url": image_to_base64(image_path)}},
    ],
}
lvlm.invoke(input=[qa_message])

AIMessage(content='To determine the number of birds at 4pm, we need to look at the chart and find the data point corresponding to 4pm on the x-axis.\n\nThe chart shows a scatter plot with time on the x-axis and the number of birds on the y-axis. The data points are scattered across the graph, representing the number of birds at different times of the day.\n\nLooking at the x-axis, we can see that 4pm is the last data point on the right-hand side of the graph. The corresponding y-value for this data point is approximately 10.\n\nTherefore, according to the chart, there are approximately 10 birds at 4pm.\n\n**Answer:** 10', additional_kwargs={}, response_metadata={'token_usage': {'acceptance_rate': None, 'completion_tokens': 139, 'completion_tokens_after_first_per_sec': 636.1935177531963, 'completion_tokens_after_first_per_sec_first_ten': 642.0501093167123, 'completion_tokens_after_first_per_sec_graph': 642.0501093167123, 'completion_tokens_per_sec': 266.59654590882985, 'end_time': 17653

### Summary call

In [5]:
prompt = 'A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the humans question. USER: <image>\nDescribe the image in detail. Be specific about graphs, such as bar plots, scatter plots, or others. ASSISTANT:'

# Same sample image as the QA call, this time with a description-style prompt.
summary_message = {
    "role": "user",
    "content": [
        {"type": "text", "text": prompt},
        {"type": "image_url", "image_url": {"url": image_to_base64(image_path)}},
    ],
}
lvlm.invoke(input=[summary_message])

AIMessage(content='The image presents a scatter plot titled "Number of Birds on a Tree vs Time of the day" in blue text at the top. The graph is set against a white background and features a logo in the upper-right corner, which includes a rocket ship and the text "cuemath THE MATH EXPERT" in black and orange.\n\n**Graph Details:**\n\n*   The x-axis is labeled "Time of the day" in blue text and is marked with orange labels indicating times from 6 AM to 4 PM.\n*   The y-axis is labeled "Number of Birds" in blue text and is marked with numbers from 10 to 40.\n*   The graph contains seven orange data points, each representing the number of birds on a tree at a specific time of day.\n*   The data points are scattered throughout the graph, with no discernible pattern or trend.\n\n**Data Points:**\n\n*   6 AM: approximately 18 birds\n*   8 AM: approximately 38 birds\n*   10 AM: approximately 20 birds\n*   12 PM: approximately 30 birds\n*   2 PM: approximately 42 birds\n*   4 PM: approximatel

## Doc Extraction

### Unstructured PDF extraction

In [31]:
from unstructured.partition.pdf import partition_pdf

# Source PDF, and the directory where extracted image blocks are saved
# (the PDF path without its extension).
file_path = os.path.join(kit_dir, 'data', 'sample_docs', 'invoicesample.pdf')
output_path = os.path.splitext(file_path)[0]

# Ensure poppler binaries are visible to Python.
# The Homebrew directory is machine-specific, so it is only used when it actually
# exists, and it is prepended at most once so that re-running this cell does not
# keep growing PATH.
homebrew_bin = "/opt/homebrew/bin"
if os.path.isdir(homebrew_bin):
    current_path = os.environ.get("PATH", "")
    if homebrew_bin not in current_path.split(os.pathsep):
        os.environ["PATH"] = (
            homebrew_bin + os.pathsep + current_path if current_path else homebrew_bin
        )
    os.environ["POPPLER_PATH"] = homebrew_bin

# Get elements
raw_pdf_elements = partition_pdf(
    filename=file_path,
    extract_images_in_pdf=True,
    strategy='hi_res',
    hi_res_model_name='yolox',
    # Use layout model (YOLOX) to get bounding boxes (for tables) and find titles
    # Titles are any sub-section of the document
    infer_table_structure=True,
    chunking_strategy='by_title',
    max_characters=1000,
    new_after_n_chars=800,
    combine_text_under_n_chars=500,
    extract_image_block_output_dir=output_path,
)

Cannot set gray non-stroke color because /'Pat1' is an invalid float value
Cannot set gray non-stroke color because /'Pat2' is an invalid float value
Cannot set gray non-stroke color because /'Pat3' is an invalid float value
Cannot set gray non-stroke color because /'Pat4' is an invalid float value
Cannot set gray non-stroke color because /'Pat5' is an invalid float value
Cannot set gray non-stroke color because /'Pat6' is an invalid float value
Cannot set gray non-stroke color because /'Pat7' is an invalid float value
Cannot set gray non-stroke color because /'Pat8' is an invalid float value
Cannot set gray non-stroke color because /'Pat9' is an invalid float value
Cannot set gray non-stroke color because /'Pat10' is an invalid float value
Cannot set gray non-stroke color because /'Pat11' is an invalid float value
Cannot set gray non-stroke color because /'Pat12' is an invalid float value
Cannot set gray non-stroke color because /'Pat13' is an invalid float value
Cannot set gray non-s



Cannot set gray non-stroke color because /'Pat1' is an invalid float value
Cannot set gray non-stroke color because /'Pat2' is an invalid float value
Cannot set gray non-stroke color because /'Pat3' is an invalid float value
Cannot set gray non-stroke color because /'Pat4' is an invalid float value
Cannot set gray non-stroke color because /'Pat5' is an invalid float value
Cannot set gray non-stroke color because /'Pat6' is an invalid float value
Cannot set gray non-stroke color because /'Pat7' is an invalid float value
Cannot set gray non-stroke color because /'Pat8' is an invalid float value
Cannot set gray non-stroke color because /'Pat9' is an invalid float value
Cannot set gray non-stroke color because /'Pat10' is an invalid float value
Cannot set gray non-stroke color because /'Pat11' is an invalid float value
Cannot set gray non-stroke color because /'Pat12' is an invalid float value
Cannot set gray non-stroke color because /'Pat13' is an invalid float value
Cannot set gray non-s

### View Elements

In [28]:
# Dump every extracted element: index banner, Python type, metadata and text.
for i, element in enumerate(raw_pdf_elements):
    report_lines = (
        f'\033[95m ELEMENT {i}\033[00m',
        f'TYPE: {type(element)}',
        f'META: {element.metadata.to_dict()}',
        f'TEXT: {element.text}',
        '\n\n##########\n',
    )
    print(*report_lines, sep='\n')

[95m ELEMENT 0[00m
TYPE: <class 'unstructured.documents.elements.CompositeElement'>
META: {'filetype': 'application/pdf', 'languages': ['eng', 'fra'], 'last_modified': '2024-10-08T16:44:41', 'page_number': 1, 'text_as_html': '<table><tbody><tr><td>Apple</td><td>$5.00</td><td>1</td><td>$5.00</td></tr><tr><td>Orange</td><td>$1.99</td><td>2</td><td>$3.98</td></tr><tr><td>Watermelon</td><td>$1.69</td><td>3</td><td>$5.07</td></tr><tr><td>Mango</td><td>$9.56</td><td>2</td><td>$19.12</td></tr><tr><td>Peach</td><td>$2.99</td><td>1</td><td>$2.99</td></tr></tbody></table>', 'orig_elements': 'eJzNWWtv3LgV/SvENAWSYqjh+xEECwTdYmtsY6fwpEURBANKpMbaaKRZPeL17va/95KSH1nPFmsDY/iDId8zvCJ5D++L+vjLItRhF5phU/nFa7TgLBgbyoBdnnMsFC2wY5LioiiVoN5ZrvRiiRa7MDjvBgc6vyyKtu181bgh9Emu3VU7DpuLUG0vBkAY5xZ0Zviy8sMFoFRJDui+rZoh6n38aC3L5BJRITLxaYmuZUZVJqNMJdEZOwBMGoAs+qt+CLu4k/fVT6E+37siLP4LP/gwhGKo2mZT1K7vN/uuzWEYyZjiUsGAsqrDcLUPSff9u0VacLMd3Tbt6uMiNNvFp4T2w2bX+qqsQrIZI0xgSjAxa6peC/Fa0Ki9B81NM+7y0MXdxkUM4adoj8Xb9fpvp+uTs

In [29]:
# Count how many extracted elements there are of each Python type
category_counts = {}
for element in raw_pdf_elements:
    category = str(type(element))
    category_counts[category] = category_counts.get(category, 0) + 1

# Unique_categories will have unique elements
# TableChunk if Table > max chars set above
unique_categories = set(category_counts)
category_counts

{"<class 'unstructured.documents.elements.CompositeElement'>": 1}

In [30]:
from langchain_classic.schema import Document


# Categorize by type: table elements keep their HTML rendering as content,
# composite elements keep their plain text; any other element type is skipped.
categorized_elements = []
for element in raw_pdf_elements:
    type_name = str(type(element))
    if 'unstructured.documents.elements.Table' in type_name:
        doc_type, content = 'table', element.metadata.text_as_html
    elif 'unstructured.documents.elements.CompositeElement' in type_name:
        doc_type, content = 'text', str(element)
    else:
        continue
    meta = element.metadata.to_dict()
    meta['type'] = doc_type
    categorized_elements.append(Document(page_content=content, metadata=meta))

# Tables
table_docs = [doc for doc in categorized_elements if doc.metadata['type'] == 'table']
print(len(table_docs))

# Text
text_docs = [doc for doc in categorized_elements if doc.metadata['type'] == 'text']
print(len(text_docs))

0
1


### Text and table summaries

In [19]:
import yaml
from langchain_sambanova import ChatSambaNova
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import load_prompt
from langchain_classic.prompts import ChatPromptTemplate, load_prompt

In [20]:
def load_chat_prompt(path: str) -> ChatPromptTemplate:
    """Load a chat prompt from a YAML file.

    The YAML document must be a mapping with a ``template`` entry that is either
    a single string (used as one ``human`` message) or a list of mappings with
    ``role`` and ``content`` keys. An optional ``_type`` entry is discarded and
    every remaining key is forwarded to ``ChatPromptTemplate`` as a keyword
    argument.

    :param path: Path to the YAML prompt file.
    :return: The chat prompt template built from the file.
    :raises ValueError: If the file is not a YAML mapping, has no (or an empty)
        ``template``, or the template is neither a string nor a list.
    """

    with open(path, 'r', encoding='utf-8') as file:
        config = yaml.safe_load(file)

    # An empty file loads as None and a scalar/list document is not a prompt config.
    if not isinstance(config, dict):
        msg = f"Can't load chat prompt: {path} does not contain a YAML mapping"
        raise ValueError(msg)

    # `_type` is optional metadata; a missing key must not abort the load.
    config.pop('_type', None)

    template = config.pop('template', None)

    if not template:
        msg = "Can't load chat prompt without template"
        raise ValueError(msg)

    if isinstance(template, str):
        messages = [('human', template)]
    elif isinstance(template, list):
        messages = [(item['role'], item['content']) for item in template]
    else:
        # Previously this silently produced a prompt with no messages.
        msg = f"Can't load chat prompt: unsupported template type {type(template).__name__}"
        raise ValueError(msg)

    return ChatPromptTemplate(messages=messages, **config)

In [21]:
# Prompt templates for summarizing text chunks and tables
prompts_dir = os.path.join(kit_dir, 'prompts')
text_prompt = load_chat_prompt(os.path.join(prompts_dir, 'text_summary.yaml'))
table_prompt = load_chat_prompt(os.path.join(prompts_dir, 'table_summary.yaml'))

# Text-only model used for the summaries
model = ChatSambaNova(
    model='Meta-Llama-3.1-8B-Instruct',
    base_url="https://api.sambanova.ai/v1",
    api_key=os.getenv("SAMBANOVA_API_KEY"),
)


def _build_summary_chain(summary_prompt):
    """Chain: raw input -> {'element': input} -> prompt -> model -> plain string."""
    return {'element': lambda x: x} | summary_prompt | model | StrOutputParser()


text_summarize_chain = _build_summary_chain(text_prompt)
table_summarize_chain = _build_summary_chain(table_prompt)

### Text Summaries

In [33]:
# Apply to text
# Summarize every non-empty text chunk. `text_summaries` is always defined (an
# empty list when there is nothing to summarize), and it is shown as the cell's
# last top-level expression; an expression nested inside `if` is never displayed.
texts = [i.page_content for i in text_docs if i.page_content != '']
text_summaries = text_summarize_chain.batch(texts, {'max_concurrency': 1}) if texts else []
text_summaries

### Table summaries

In [32]:
# Apply to tables
# Summarize every extracted table. `table_summaries` is always defined (an empty
# list when no tables were found), and it is shown as the cell's last top-level
# expression; an expression nested inside `if` is never displayed.
tables = [i.page_content for i in table_docs]
table_summaries = table_summarize_chain.batch(tables, {'max_concurrency': 1}) if tables else []
table_summaries

### Image summary

In [36]:
prompt = 'Describe the image in detail. Be specific about graphs include name of axis, labels, legends and important numerical information'

# Images extracted from the PDF. glob returns matches in arbitrary order, so each
# group is sorted to keep summaries and documents in a reproducible order
# (JPEGs first, then PNGs, as before).
image_paths = sorted(glob.glob(os.path.join(output_path, '*.jpg')))
image_paths.extend(sorted(glob.glob(os.path.join(output_path, '*.png'))))

image_summaries = []
image_docs = []

# A dedicated loop variable avoids overwriting the `image_path` set by the
# earlier single-image cells.
for extracted_image_path in image_paths:
    result = lvlm.invoke(
        input=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {"url": image_to_base64(extracted_image_path)},
                    },
                ],
            }
        ]
    )
    image_summaries.append(result.content)
    # The summary is stored as the document content; directory and file name are
    # kept in the metadata so the raw image can be located again later.
    image_docs.append(
        Document(
            page_content=result.content,
            metadata={
                'type': 'image',
                'file_directory': os.path.dirname(extracted_image_path),
                'filename': os.path.basename(extracted_image_path),
            },
        )
    )

In [37]:
# Display the summaries generated for the extracted images
image_summaries

['The image presents a logo for "Sunny Farm" in Victoria, Australia, featuring a stylized design that incorporates various elements to convey a sense of freshness and quality.\n\n* **Logo Background**\n\t+ The logo is set against a white background.\n* **Gold Circle**\n\t+ A gold circle serves as the primary shape of the logo.\n\t+ The circle is divided into two sections: the top half features a yellow sunburst with green trees on either side, while the bottom half depicts rolling green hills.\n\t+ The text "AUSTRALIA FRESH PRODUCE" is written in white letters along the top edge of the circle.\n\t+ The text "VICTORIA" is displayed at the bottom of the circle.\n* **Gold Banner**\n\t+ A gold banner is positioned across the center of the circle.\n\t+ The banner features the text "SUNNY FARM" in white letters.\n* **Color Scheme**\n\t+ The dominant colors used in the logo are gold, green, yellow, and white.\n* **Overall Design**\n\t+ The logo effectively conveys a sense of freshness and qua

In [38]:
# Display the Document objects that wrap each image summary and its file location
image_docs

[Document(metadata={'type': 'image', 'file_directory': '/Users/jorgep/Documents/ask_public_own/ai-starter-kit-snova/multimodal_knowledge_retriever/data/sample_docs/invoicesample', 'filename': 'figure-1-1.jpg'}, page_content='The image presents a logo for "Sunny Farm" in Victoria, Australia, featuring a stylized design that incorporates various elements to convey a sense of freshness and quality.\n\n* **Logo Background**\n\t+ The logo is set against a white background.\n* **Gold Circle**\n\t+ A gold circle serves as the primary shape of the logo.\n\t+ The circle is divided into two sections: the top half features a yellow sunburst with green trees on either side, while the bottom half depicts rolling green hills.\n\t+ The text "AUSTRALIA FRESH PRODUCE" is written in white letters along the top edge of the circle.\n\t+ The text "VICTORIA" is displayed at the bottom of the circle.\n* **Gold Banner**\n\t+ A gold banner is positioned across the center of the circle.\n\t+ The banner features

### Add to vectorstore

In [39]:
import uuid

from langchain_classic.retrievers.multi_vector import MultiVectorRetriever
from langchain_classic.storage import InMemoryByteStore
from langchain_sambanova import SambaNovaEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document

# Embedding model used to index the summaries
summary_embeddings = SambaNovaEmbeddings(model='E5-Mistral-7B-Instruct')

# The vectorstore to use to index the child chunks
vectorstore = Chroma(collection_name='summaries', embedding_function=summary_embeddings)

# The storage layer for the parent documents
store = InMemoryByteStore()
id_key = 'doc_id'

# The retriever (empty to start); each query returns the top 2 matches
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key=id_key,
    search_kwargs={'k': 2},
)

  vectorstore = Chroma(


In [40]:
# Each summary is indexed in the vectorstore and tagged (under `id_key`) with the
# id of the parent document stored in the docstore.

# Add texts
if texts:
    # `texts` (and therefore `text_summaries`) skips documents with empty content,
    # so the summaries must be paired with that same filtered list; otherwise a
    # summary could point at the wrong parent document.
    summarized_text_docs = [doc for doc in text_docs if doc.page_content != '']
    doc_ids = [str(uuid.uuid4()) for _ in summarized_text_docs]
    summary_texts = [
        Document(page_content=summary, metadata={id_key: doc_id})
        for doc_id, summary in zip(doc_ids, text_summaries)
    ]
    retriever.vectorstore.add_documents(summary_texts)
    retriever.docstore.mset(list(zip(doc_ids, summarized_text_docs)))

# Add tables
if tables:
    table_ids = [str(uuid.uuid4()) for _ in table_docs]
    summary_tables = [
        Document(page_content=summary, metadata={id_key: table_id})
        for table_id, summary in zip(table_ids, table_summaries)
    ]
    retriever.vectorstore.add_documents(summary_tables)
    retriever.docstore.mset(list(zip(table_ids, table_docs)))

# Add images
if image_summaries:
    img_ids = [str(uuid.uuid4()) for _ in image_summaries]
    summary_img = [
        Document(page_content=summary, metadata={id_key: img_id})
        for img_id, summary in zip(img_ids, image_summaries)
    ]
    retriever.vectorstore.add_documents(summary_img)
    retriever.docstore.mset(list(zip(img_ids, image_docs)))  # Store the image summary as the raw document

In [41]:
# Sanity check: query the retriever with a question about the invoice total
retriever.invoke('what is the final price in the invoice?')

[Document(metadata={'filetype': 'application/pdf', 'languages': ['eng', 'fra'], 'last_modified': '2024-10-08T16:44:41', 'page_number': 1, 'text_as_html': '<table><tbody><tr><td>Apple</td><td>$5.00</td><td>1</td><td>$5.00</td></tr><tr><td>Orange</td><td>$1.99</td><td>2</td><td>$3.98</td></tr><tr><td>Watermelon</td><td>$1.69</td><td>3</td><td>$5.07</td></tr><tr><td>Mango</td><td>$9.56</td><td>2</td><td>$19.12</td></tr><tr><td>Peach</td><td>$2.99</td><td>1</td><td>$2.99</td></tr></tbody></table>', 'orig_elements': 'eJzNWftv3LgR/leIbQokxVLLlygyCA4IesXVuMZO4U2LIggWFDVa66KV9vSIz3fX/71DSn7kvFecDazhH7za+ZYjkvNxXvTHXxZQww6aYVMVi9dkkapSiFQqqkwqqbJWUVfkkrrUOlZaZrQqFkuy2MHgCjc41Pll4du2K6rGDdBHuXZX7ThsLqDaXgyICCkt6szwZVUMF4hynUpE923VDEHv40drRZIuCVcqUZ+W5FoWXCdpkHnKskQcACYNRBb9VT/ALuzkffUT1Od752HxX/yhgAH8ULXNxteu7zf7rs1xGEuElqnGAWVVw3C1h6j7/t0iLrjZjm4bd/VxAc128Smi/bDZtUVVVhBtJphQlDPKzJrr10q9Vjxo71Fz04y7HLqw27CIAX4K9li8Xa//dro+OTsl67Mw9HredTXUcbm/pUUarcvSFdQIg9zoUiAjGaNZxnLunMmlLo9GC8/SJEOrpyzh0eqTrN

In [42]:
# Sanity check: query the retriever with a question about the company logo
retriever.invoke('what is the logo of the company')

[Document(metadata={'type': 'image', 'file_directory': '/Users/jorgep/Documents/ask_public_own/ai-starter-kit-snova/multimodal_knowledge_retriever/data/sample_docs/invoicesample', 'filename': 'figure-1-1.jpg'}, page_content='The image presents a logo for "Sunny Farm" in Victoria, Australia, featuring a stylized design that incorporates various elements to convey a sense of freshness and quality.\n\n* **Logo Background**\n\t+ The logo is set against a white background.\n* **Gold Circle**\n\t+ A gold circle serves as the primary shape of the logo.\n\t+ The circle is divided into two sections: the top half features a yellow sunburst with green trees on either side, while the bottom half depicts rolling green hills.\n\t+ The text "AUSTRALIA FRESH PRODUCE" is written in white letters along the top edge of the circle.\n\t+ The text "VICTORIA" is displayed at the bottom of the circle.\n* **Gold Banner**\n\t+ A gold banner is positioned across the center of the circle.\n\t+ The banner features

## Retrieval with raw text, raw tables and image summaries

In [44]:
from langchain_classic.chains import RetrievalQA

# Custom QA prompt for answering over the retrieved documents
qa_prompt_path = os.path.join(kit_dir, 'prompts', 'knowledge_retriever_custom_qa_prompt.yaml')
prompt = load_chat_prompt(qa_prompt_path)

# Retrieval QA chain: takes a 'question', returns an 'answer' plus the source documents
chain = RetrievalQA.from_llm(
    llm=model,
    retriever=retriever,
    return_source_documents=True,
    input_key='question',
    output_key='answer',
)
# Replace the chain's default prompt with the custom one loaded above
chain.combine_documents_chain.llm_chain.prompt = prompt

In [45]:
# Ask the QA chain about the invoice total; the result holds the answer and its source documents
chain.invoke({'question': 'what is the final price in the invoice?'})

{'question': 'what is the final price in the invoice?',
 'answer': 'The final price in the invoice is $39.60.',
 'source_documents': [Document(metadata={'filetype': 'application/pdf', 'languages': ['eng', 'fra'], 'last_modified': '2024-10-08T16:44:41', 'page_number': 1, 'text_as_html': '<table><tbody><tr><td>Apple</td><td>$5.00</td><td>1</td><td>$5.00</td></tr><tr><td>Orange</td><td>$1.99</td><td>2</td><td>$3.98</td></tr><tr><td>Watermelon</td><td>$1.69</td><td>3</td><td>$5.07</td></tr><tr><td>Mango</td><td>$9.56</td><td>2</td><td>$19.12</td></tr><tr><td>Peach</td><td>$2.99</td><td>1</td><td>$2.99</td></tr></tbody></table>', 'orig_elements': 'eJzNWftv3LgR/leIbQokxVLLlygyCA4IesXVuMZO4U2LIggWFDVa66KV9vSIz3fX/71DSn7kvFecDazhH7za+ZYjkvNxXvTHXxZQww6aYVMVi9dkkapSiFQqqkwqqbJWUVfkkrrUOlZaZrQqFkuy2MHgCjc41Pll4du2K6rGDdBHuXZX7ThsLqDaXgyICCkt6szwZVUMF4hynUpE923VDEHv40drRZIuCVcqUZ+W5FoWXCdpkHnKskQcACYNRBb9VT/ALuzkffUT1Od752HxX/yhgAH8ULXNxteu7zf7rs1xGEuElqnGAWVVw3C1h6j7/t0iLrjZjm4bd/VxAc128Smi/bDZt

In [46]:
# Ask the QA chain about the logo, passing the input under its 'question' key
# like the other chain calls in this notebook.
chain.invoke({'question': 'what is the logo of the company'})

{'question': 'what is the logo of the company',
 'answer': 'The logo of the company is described in the first context provided. It features a gold circle with a yellow sunburst and green trees on the top half, and rolling green hills on the bottom half. The text "SUNNY FARM" is displayed on a gold banner across the center of the circle, with the text "AUSTRALIA FRESH PRODUCE" at the top edge of the circle and "VICTORIA" at the bottom. The logo is set against a white background and features a color scheme of gold, green, yellow, and white.',
 'source_documents': [Document(metadata={'type': 'image', 'file_directory': '/Users/jorgep/Documents/ask_public_own/ai-starter-kit-snova/multimodal_knowledge_retriever/data/sample_docs/invoicesample', 'filename': 'figure-1-1.jpg'}, page_content='The image presents a logo for "Sunny Farm" in Victoria, Australia, featuring a stylized design that incorporates various elements to convey a sense of freshness and quality.\n\n* **Logo Background**\n\t+ The

## Retrieval with raw text, raw tables and raw images

In [47]:
# Question reused by the remaining cells of this section
query = 'what is the logo of the company?'

In [48]:
# Documents the retriever returns for the query
retriever.invoke(query)

[Document(metadata={'type': 'image', 'file_directory': '/Users/jorgep/Documents/ask_public_own/ai-starter-kit-snova/multimodal_knowledge_retriever/data/sample_docs/invoicesample', 'filename': 'figure-1-1.jpg'}, page_content='The image presents a logo for "Sunny Farm" in Victoria, Australia, featuring a stylized design that incorporates various elements to convey a sense of freshness and quality.\n\n* **Logo Background**\n\t+ The logo is set against a white background.\n* **Gold Circle**\n\t+ A gold circle serves as the primary shape of the logo.\n\t+ The circle is divided into two sections: the top half features a yellow sunburst with green trees on either side, while the bottom half depicts rolling green hills.\n\t+ The text "AUSTRALIA FRESH PRODUCE" is written in white letters along the top edge of the circle.\n\t+ The text "VICTORIA" is displayed at the bottom of the circle.\n* **Gold Banner**\n\t+ A gold banner is positioned across the center of the circle.\n\t+ The banner features

In [49]:
# Baseline answer from the QA chain, which only sees the image summaries (not the raw images)
chain.invoke({'question': query})

{'question': 'what is the logo of the company?',
 'answer': 'The logo of the company is described in the first context. It is a stylized design that features a gold circle divided into two sections, with a yellow sunburst and green trees on the top half, and rolling green hills on the bottom half. The text "AUSTRALIA FRESH PRODUCE" is written along the top edge of the circle, "VICTORIA" is displayed at the bottom of the circle, and "SUNNY FARM" is positioned in a gold banner across the center of the circle.',
 'source_documents': [Document(metadata={'type': 'image', 'file_directory': '/Users/jorgep/Documents/ask_public_own/ai-starter-kit-snova/multimodal_knowledge_retriever/data/sample_docs/invoicesample', 'filename': 'figure-1-1.jpg'}, page_content='The image presents a logo for "Sunny Farm" in Victoria, Australia, featuring a stylized design that incorporates various elements to convey a sense of freshness and quality.\n\n* **Logo Background**\n\t+ The logo is set against a white bac

### Filter image results

In [50]:
def get_retrieved_images(retriever, query):
    """Return only the image documents retrieved for a query.

    :param retriever: Object whose ``invoke(query)`` returns documents that
        expose a ``metadata`` dict.
    :param query: Question passed to the retriever.
    :return: The retrieved documents whose metadata ``type`` is ``'image'``, in
        retrieval order. Documents without a ``type`` entry are not images.
    """
    results = retriever.invoke(query)
    # .get() keeps a document that lacks a 'type' entry from raising KeyError
    return [result for result in results if result.metadata.get('type') == 'image']

In [51]:
# Keep only the image documents retrieved for the query, then display them
retrieved_images = get_retrieved_images(retriever, query)
retrieved_images

[Document(metadata={'type': 'image', 'file_directory': '/Users/jorgep/Documents/ask_public_own/ai-starter-kit-snova/multimodal_knowledge_retriever/data/sample_docs/invoicesample', 'filename': 'figure-1-1.jpg'}, page_content='The image presents a logo for "Sunny Farm" in Victoria, Australia, featuring a stylized design that incorporates various elements to convey a sense of freshness and quality.\n\n* **Logo Background**\n\t+ The logo is set against a white background.\n* **Gold Circle**\n\t+ A gold circle serves as the primary shape of the logo.\n\t+ The circle is divided into two sections: the top half features a yellow sunburst with green trees on either side, while the bottom half depicts rolling green hills.\n\t+ The text "AUSTRALIA FRESH PRODUCE" is written in white letters along the top edge of the circle.\n\t+ The text "VICTORIA" is displayed at the bottom of the circle.\n* **Gold Banner**\n\t+ A gold banner is positioned across the center of the circle.\n\t+ The banner features

### Generate response over retrieved raw images 

In [63]:
def get_image_answers(retrieved_image_docs, query):
    """Ask the vision model the query once per retrieved image.

    The question is rendered with the 'multimodal-qa.yaml' prompt, and each image
    is located through the document's 'file_directory' and 'filename' metadata.

    :param retrieved_image_docs: Image documents carrying that metadata.
    :param query: Question to ask about every image.
    :return: List with the content of the model's reply for each document, in order.
    """
    qa_template = load_chat_prompt(os.path.join(kit_dir, 'prompts', 'multimodal-qa.yaml'))
    question_text = qa_template.format(question=query)

    def _answer_for(doc):
        # Rebuild the image location from the metadata saved with the summary
        image_file = os.path.join(doc.metadata['file_directory'], doc.metadata['filename'])
        message = {
            "role": "user",
            "content": [
                {"type": "text", "text": question_text},
                {"type": "image_url", "image_url": {"url": image_to_base64(image_file)}},
            ],
        }
        return lvlm.invoke(input=[message]).content

    return [_answer_for(doc) for doc in retrieved_image_docs]

In [64]:
# Answer the query directly from each retrieved raw image, then display the answers
image_answers = get_image_answers(retrieved_images, query)
image_answers

['The logo of the company is a circular emblem featuring a stylized illustration of a sunrise over rolling hills, with two trees on either side. The logo is surrounded by a gold border with the words "AUSTRALIA FRESH PRODUCE" written in white text at the top and "VICTORIA" at the bottom. A gold banner across the center of the logo reads "SUNNY FARM". The overall design is meant to evoke a sense of freshness, quality, and connection to the natural environment, which is fitting for a company that produces fresh produce. The logo effectively communicates the company\'s brand identity and values. The logo is for "Sunny Farm".']

In [65]:
def get_retrieved_docs(retriever, query):
    """Return the non-image documents retrieved for a query.

    :param retriever: Object whose ``invoke(query)`` returns documents that
        expose a ``metadata`` dict.
    :param query: Question passed to the retriever.
    :return: The retrieved documents whose metadata ``type`` is not ``'image'``,
        in retrieval order. Documents without a ``type`` entry are kept.
    """
    results = retriever.invoke(query)
    # .get() keeps a document that lacks a 'type' entry from raising KeyError
    return [result for result in results if result.metadata.get('type') != 'image']

In [66]:
# Keep only the non-image documents retrieved for the query, then display them
context_docs = get_retrieved_docs(retriever, query)
context_docs

[Document(metadata={'filetype': 'application/pdf', 'languages': ['eng', 'fra'], 'last_modified': '2024-10-08T16:44:41', 'page_number': 1, 'text_as_html': '<table><tbody><tr><td>Apple</td><td>$5.00</td><td>1</td><td>$5.00</td></tr><tr><td>Orange</td><td>$1.99</td><td>2</td><td>$3.98</td></tr><tr><td>Watermelon</td><td>$1.69</td><td>3</td><td>$5.07</td></tr><tr><td>Mango</td><td>$9.56</td><td>2</td><td>$19.12</td></tr><tr><td>Peach</td><td>$2.99</td><td>1</td><td>$2.99</td></tr></tbody></table>', 'orig_elements': 'eJzNWftv3LgR/leIbQokxVLLlygyCA4IesXVuMZO4U2LIggWFDVa66KV9vSIz3fX/71DSn7kvFecDazhH7za+ZYjkvNxXvTHXxZQww6aYVMVi9dkkapSiFQqqkwqqbJWUVfkkrrUOlZaZrQqFkuy2MHgCjc41Pll4du2K6rGDdBHuXZX7ThsLqDaXgyICCkt6szwZVUMF4hynUpE923VDEHv40drRZIuCVcqUZ+W5FoWXCdpkHnKskQcACYNRBb9VT/ALuzkffUT1Od752HxX/yhgAH8ULXNxteu7zf7rs1xGEuElqnGAWVVw3C1h6j7/t0iLrjZjm4bd/VxAc128Smi/bDZtUVVVhBtJphQlDPKzJrr10q9Vjxo71Fz04y7HLqw27CIAX4K9li8Xa//dro+OTsl67Mw9HredTXUcbm/pUUarcvSFdQIg9zoUiAjGaNZxnLunMmlLo9GC8/SJEOrpyzh0eqTrN

In [67]:
prompt = load_chat_prompt(os.path.join(kit_dir, 'prompts', 'knowledge_retriever_custom_qa_prompt.yaml'))
text_contexts = [doc.page_content for doc in context_docs]
# Context = answers obtained from the raw images, followed by the retrieved text
# and table content. Joining a single list avoids dangling blank-line separators
# when either part is empty.
full_context = '\n\n'.join([*image_answers, *text_contexts])
formated_prompt = prompt.format(context=full_context, question=query)
model.invoke(formated_prompt)

AIMessage(content='The logo of the company is a circular emblem featuring a stylized illustration of a sunrise over rolling hills, with two trees on either side. It is surrounded by a gold border with the words "AUSTRALIA FRESH PRODUCE" written in white text at the top and "VICTORIA" at the bottom, and a gold banner across the center of the logo reads "SUNNY FARM".', additional_kwargs={}, response_metadata={'token_usage': {'acceptance_rate': None, 'completion_tokens': 84, 'completion_tokens_after_first_per_sec': 835.8758169620776, 'completion_tokens_after_first_per_sec_first_ten': 840.8707851056018, 'completion_tokens_after_first_per_sec_graph': 840.8707851056018, 'completion_tokens_per_sec': 654.8800469893716, 'end_time': 1765324419.8764029, 'is_last_response': True, 'prompt_tokens': 497, 'prompt_tokens_details': {'cached_tokens': 0}, 'start_time': 1765324419.748135, 'time_to_first_token': 0.028970718383789062, 'total_latency': 0.12826776504516602, 'total_tokens': 581, 'total_tokens_p

This example workflow is consolidated in the provided [multimodal rag src module](../src/multimodal.py). For a usage example, please refer to the [multimodal rag notebook](./3_multimodal_rag_usage.ipynb).