# Machina Experiments: Multimodal Agentic RAG

In [1]:
# Import packages

import csv
import json
import os
import re
from pprint import pprint
from typing import Literal

import marqo as mq
import pandas as pd
from dotenv import load_dotenv
from IPython.display import display, HTML
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_openai import ChatOpenAI
from openai import OpenAI
from pydantic import BaseModel, Field

  from pydantic.v1.fields import FieldInfo as FieldInfoV1


## Marqo Retriever + OpenRouter Agents

In [2]:
# Load environment variables from the project-local .env file
load_dotenv(dotenv_path="config/.env")

# Verify that the API keys read by the LLM clients further down are available.
# A key that is present but empty (e.g. `QWEN_API_KEY=` in the .env file) is as
# unusable as a missing one, so it is reported as missing too.
required_vars = ["DEEPSEEK_API_KEY", "QWEN_API_KEY"]

missing = [var for var in required_vars if not os.environ.get(var, "").strip()]

if missing:
    print(f"Missing environment variables: {', '.join(missing)}")
else:
    print("Environment variables loaded successfully.")

Environment variables loaded successfully.


### User Query

In [19]:
### ---> ### Enter user query here ### <--- ###

# Natural-language question; the cells below pass it to the router agent,
# the reranking prompt and the answer-generation chat.
user_query = "Are there images of dragons?"

### Router Agent Tests
- The agent decides whether the image index or the text index is more relevant for answering the user's question
- The agent reformulates the user's question, adapting it for semantic search in the multimodal indices

In [20]:
# OpenRouter client
### Load LLM for deciding if text or image index are more relevant to answer the user question

llm = ChatOpenAI(
    model_name="tngtech/deepseek-r1t-chimera:free",   # any OpenRouter model
    openai_api_key=os.environ["DEEPSEEK_API_KEY"],
    openai_api_base="https://openrouter.ai/api/v1",
    temperature=0.2,
)

# Schema of the routing decision the LLM has to return.
# (Documented with comments rather than a class docstring so that the schema
# sent to the model keeps exactly the original descriptions.)
class RoutingOutput(BaseModel):
    # Restricted to the two values the reranking code compares against; any other
    # value now fails validation instead of silently disabling the router boost.
    target_index: Literal["text", "image"] = Field(description="either 'text' or 'image'")
    # Reformulated query that is sent to the Marqo indices
    retrieval_query: str

# LLM wrapper whose replies are parsed into RoutingOutput instances
structured_llm = llm.with_structured_output(RoutingOutput)

In [21]:
### Router agent

def router_agent(user_query: str) -> "RoutingOutput":
    """Choose the index to query and rewrite the user question for retrieval.

    Sends the routing instructions plus the raw user question to
    ``structured_llm`` and returns its parsed reply.

    Args:
        user_query: The question exactly as entered by the user.

    Returns:
        The routing decision produced by ``structured_llm`` (a
        ``RoutingOutput`` carrying ``target_index`` and ``retrieval_query``).

    Raises:
        ValueError: If ``structured_llm`` returned ``None``, i.e. no
            structured routing decision could be obtained from the model.
    """
    system_prompt = """
You are a Retrieval Router Agent for a RAG system using two Marqo indexes:
1) A TEXT index
2) An IMAGE index

Your task:
- Decide which index is the best match for the user's query.
- Output a JSON object with:
    {
      "target_index": "text" or "image",
      "retrieval_query": "the optimized retrieval query"
    }

Decision rules:
- If the user asks about images, pictures, visual similarity, appearance, color, shapes → choose "image".
- If the user asks about text meaning, authors, facts, historical explanations, interpretations → choose "text".
- If ambiguous, choose "text".

For retrieval_query:
- Remove chit-chat.
- Extract the true intent.
- Expand with useful semantic keywords.
- Keep it concise.

Return ONLY JSON inside <json>...</json>.
Do not output anything outside these tags.
Do not show your reasoning or thought process.
"""

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_query},
    ]

    result = structured_llm.invoke(messages)

    # Fail here, next to the cause, rather than with an AttributeError in a
    # later cell that reads `decision.retrieval_query`.
    if result is None:
        raise ValueError(
            "Router agent returned no structured output; cannot choose an index."
        )
    return result

### Decision by router agent
# `decision` is read again by later cells: its retrieval_query drives the Marqo
# searches and its target_index drives the rerank boost and the answer prompt.
decision = router_agent(user_query)
print(decision)

target_index='image' retrieval_query='dragons'


In [22]:
# Marqo Client: Get Indexes

# Address of the Marqo server. It can be overridden with the MARQO_URL
# environment variable (e.g. in config/.env) so the notebook is not tied to a
# single deployment; the previously hard-coded address stays the default.
MARQO_URL = os.environ.get("MARQO_URL", "http://92.112.48.13:8882")
marqoClient = mq.Client(url=MARQO_URL)

# List the indexes available on the server
pprint(marqoClient.get_indexes())

{'results': [{'indexName': 'camerarius_testIndex_full-texts2'},
             {'indexName': 'onit_testIndex_text-chunks1'},
             {'indexName': 'camerarius_testIndex_full-texts'},
             {'indexName': 'onit_testIndex_images1'},
             {'indexName': 'onit_testIndex_text-chunks2'},
             {'indexName': 'camerarius_testIndex_full-images5'},
             {'indexName': 'camerarius_testIndex_full-images6'}]}


In [23]:
### Set Indices
# Names of the two Marqo indexes queried below; the trailing notes record the
# embedding model each index was built with.
textIndex = "camerarius_testIndex_full-texts2" # indexed with "flax-sentence-embeddings/all_datasets_v4_mpnet-base" model
imageIndex = "camerarius_testIndex_full-images6" # indexed with "open_clip/ViT-L-14/laion2b_s32b_b82k" model

# Retrieval query as reformulated by the router agent
query = decision.retrieval_query

# Search results: the same query is run against the image index first and the
# text index second, keeping at most 5 hits from each
results_images, results_text = (
    marqoClient.index(index_name).search(q=query, limit=5)
    for index_name in (imageIndex, textIndex)
)

In [9]:
#pprint(results_text)

In [10]:
#pprint(results_images)

## Merging and reranking of results (Step 1)
- The retrieval results from the image and text indices are combined and reranked:
  1. Earlier retrieval results from each index are weighted higher than later ones (positional weight)
  2. If both the image and the text page of a pair are retrieved, the pair is ranked higher (bonus)
  3. A router boost is added to the modality that the router judged more relevant for answering the user's question

In [24]:
## Combine and rerank multimodal retrieval results from Marqo query ##

def combine_and_rerank(results_text, results_images, target_index=None):
    """Merge text and image search results per page pair and rerank them.

    Hits are grouped by their ``viewer_url`` (used as ``pair_id``). Each
    modality's score is multiplied by a positional weight, the modality
    preferred by the router receives an additive boost, and pairs found in
    both indices receive a bonus.

    Args:
        results_text: Search response of the text index; its ``"hits"`` list
            is read, each hit providing ``"viewer_url"`` and ``"_score"``.
        results_images: Search response of the image index, same structure.
        target_index: Modality to boost, ``'text'`` or ``'image'``. When
            omitted, ``decision.target_index`` from the notebook namespace is
            used, which is what the function did before this parameter existed.

    Returns:
        A list of dicts sorted by ``combined_score`` (descending). Each dict
        has the keys ``pair_id``, ``text_hit``, ``image_hit``, ``text_score``,
        ``image_score``, ``combined_score`` and ``rank`` (1 = best).
    """
    if target_index is None:
        # Backward-compatible default: router decision of the current session
        target_index = decision.target_index

    # Positional weights for the first five results of an index: earlier
    # results get more weight than later ones; any further result gets the
    # fallback weight.
    decay_weights = [1.0, 0.9, 0.8, 0.7, 0.6]
    default_weight = 0.5

    # Score mixing parameters
    w_text = 0.4
    w_image = 0.4
    bonus = 0.3          # added when both modalities were retrieved for a pair
    router_boost = 0.1   # added to the modality preferred by the router

    combined = {}
    pos_weight = {"text": {}, "image": {}}

    def _merge_hits(hits, modality):
        """Add one modality's hits to `combined` and record their positional weights."""
        for position, hit in enumerate(hits):
            pair_id = hit["viewer_url"]
            entry = combined.setdefault(pair_id, {
                "pair_id": pair_id,
                "text_hit": None,
                "image_hit": None,
                "text_score": 0,
                "image_score": 0,
            })
            # If the same pair shows up more than once in one result list, keep
            # the first occurrence (hits are assumed to arrive best-first, as
            # the positional weights presuppose) instead of letting a
            # lower-ranked duplicate overwrite its score and weight.
            if entry[f"{modality}_hit"] is not None:
                continue
            entry[f"{modality}_hit"] = hit
            entry[f"{modality}_score"] = hit["_score"]
            pos_weight[modality][pair_id] = (
                decay_weights[position] if position < len(decay_weights) else default_weight
            )

    # --- 1./2. Combine retrieval results (text first, then images) ----
    _merge_hits(results_text["hits"], "text")
    _merge_hits(results_images["hits"], "image")

    # --- 3. Compute combined ranking score ---------------------------
    for pair_id, entry in combined.items():
        # Apply positional weight (a missing modality has score 0)
        ts_weighted = entry["text_score"] * pos_weight["text"].get(pair_id, 1.0)
        is_weighted = entry["image_score"] * pos_weight["image"].get(pair_id, 1.0)

        # Apply router-based boost
        if target_index == 'text':
            ts_weighted += router_boost
        elif target_index == 'image':
            is_weighted += router_boost

        # Base score = stronger modality, plus a weighted sum of both
        base = max(ts_weighted, is_weighted)
        combined_score = base + w_text * ts_weighted + w_image * is_weighted

        # Bonus if both modalities were retrieved for this pair
        if entry["text_hit"] is not None and entry["image_hit"] is not None:
            combined_score += bonus

        entry["combined_score"] = combined_score

    # --- 4. Sort in descending order and assign ranks -----------------
    reranked = sorted(combined.values(), key=lambda x: x["combined_score"], reverse=True)

    for i, entry in enumerate(reranked, start=1):
        entry['rank'] = i  # rank 1 = top, 2 = second, etc.

    return reranked

# Merge and rerank the hits of both indices; the boosted modality comes from
# the router decision stored in `decision`.
combined_results = combine_and_rerank(results_text, results_images)

In [25]:
#print(combined_results)

## Reranking of results (Step 2)
- In addition, the retrieved results are passed to a multimodal reasoning agent for another reranking based on the multimodal content

In [27]:
## Rerank with Qwen 2.5 VL Agent

def format_for_reranking(entry):
    """Render one combined result as the text snippet shown to the reranking LLM.

    The snippet is labelled with the entry's current ``rank`` as "Pair ID".
    The page text and the image URL are left empty when the corresponding
    hit is missing.
    """
    page_text = ""
    if entry["text_hit"]:
        page_text = entry["text_hit"]["text_page"]

    image_url = ""
    if entry["image_hit"]:
        image_url = entry["image_hit"]["image_url"]

    return f"Pair ID: {entry['rank']}\nText: {page_text}\nImage URL: {image_url}\n"

system_prompt = """
You are a multimodal reasoning agent.
Rank the following text-image pairs according to relevance to the query.

Output ONLY a JSON array of pair_ids in descending order of relevance.
Do not include any text, explanation, or comments.
Example:
["12", "7", "3"]
"""

### Reformat context text input for LLM
pairs_text = "\n\n".join(format_for_reranking(e) for e in combined_results)
#print(pairs_text)

### Build the multimodal user message
# Images are sent as `image_url` content parts of the user message (the
# documented way to pass images to a chat-completions model), each preceded by
# a short label so the model can tie the picture to its Pair ID. The previous
# `extra_body={"attachments": ...}` field is not part of the chat-completions
# request format.
user_content = [
    {"type": "text", "text": f"Query: {user_query}\n\nPairs:\n{pairs_text}"}
]
for e in combined_results:
    if e["image_hit"]:
        user_content.append({"type": "text", "text": f"Image for Pair ID {e['rank']}:"})
        user_content.append(
            {"type": "image_url", "image_url": {"url": e["image_hit"]["image_url"]}}
        )

messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_content},
]

### Set client
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.environ["QWEN_API_KEY"],
)

completion = client.chat.completions.create(
    model="qwen/qwen2.5-vl-32b-instruct:free",
    temperature=0.0,
    messages=messages,
    #stream=True,
    extra_headers={
        "HTTP-Referer": "<YOUR_SITE_URL>",
        "X-Title": "<YOUR_SITE_NAME>",
    },
)

### Extract the JSON output
raw_output = (completion.choices[0].message.content or "").strip()
print(raw_output)

# The model sometimes wraps the array in a Markdown fence (``` or ```json) or
# adds stray text, so pick out the bracketed array instead of stripping
# backticks (which would leave the "json" language tag behind).
array_match = re.search(r"\[.*\]", raw_output, flags=re.DOTALL)
try:
    parsed_ranking = json.loads(array_match.group(0)) if array_match else []
except json.JSONDecodeError:
    parsed_ranking = []

if not isinstance(parsed_ranking, list):
    parsed_ranking = []
if not parsed_ranking:
    # An empty list makes the next cell keep the original ranking
    print("Warning: no usable ranking in the model output; keeping the original order.")

# Normalise ids to strings: the next cell looks them up by str(rank), and the
# model may answer with numbers instead of quoted strings.
ranked_pair_ids = [str(pair_id) for pair_id in parsed_ranking]

["2", "1", "3", "5", "7", "9", "4", "6", "8", "10"]


In [28]:
# Reorder according to LLM ranking
if ranked_pair_ids:  # only rerank if LLM returned something
    # 1. Build a lookup dict keyed by the rank that was shown to the LLM as "Pair ID"
    rank_lookup = {str(entry["rank"]): entry for entry in combined_results}

    # 2. Follow the LLM order, ignoring unknown ids and repeated ids
    ordered_keys = []
    seen_keys = set()
    for r in ranked_pair_ids:
        key = str(r)  # tolerate numeric ids
        if key in rank_lookup and key not in seen_keys:
            ordered_keys.append(key)
            seen_keys.add(key)

    # Pairs the LLM left out are not dropped: they follow in their original order
    ordered_keys.extend(key for key in rank_lookup if key not in seen_keys)

    # 3. Assign the new ranks on shallow copies, so `combined_results` keeps the
    #    ranks the LLM answer refers to and re-running this cell gives the same result
    reranked_final = [
        {**rank_lookup[key], "rank": i}
        for i, key in enumerate(ordered_keys, start=1)
    ]

    # Now reranked_final is ordered according to LLM output
    for e in reranked_final:
        print(e["pair_id"], e["rank"])

else:
    # If LLM returned nothing, keep original ranking
    reranked_final = combined_results

#reranked_final
    
#reranked_final

https://www.digitale-sammlungen.de/de/view/bsb10575861?page=782,783 1
https://www.digitale-sammlungen.de/de/view/bsb10575861?page=788,789 2
https://www.digitale-sammlungen.de/de/view/bsb10575861?page=784,785 3
https://www.digitale-sammlungen.de/de/view/bsb10575861?page=584,585 4
https://www.digitale-sammlungen.de/de/view/bsb10575861?page=458,459 5
https://www.digitale-sammlungen.de/de/view/bsb10575861?page=424,425 6
https://www.digitale-sammlungen.de/de/view/bsb10575861?page=388,389 7
https://www.digitale-sammlungen.de/de/view/bsb10575861?page=736,737 8
https://www.digitale-sammlungen.de/de/view/bsb10575861?page=698,699 9
https://www.digitale-sammlungen.de/de/view/bsb10575861?page=680,681 10


## Simple preview of reranked retrieval results
Run cell below to get a simple preview of the results

In [29]:
## Create preview of reranked results ##

import pandas as pd
from IPython.display import HTML, display
import re

def extract_id_and_page(iiif_url: str):
    """
    Split a IIIF image URL into its document identifier and page string.

    The URL must contain a path segment of the form
    ``/iiif/image/v2/<ID>_<PAGE>/``, for example
    https://.../iiif/image/v2/bsb10575861_00289/full/full/0/default.jpg

    Returns ``(ID, PAGE)`` as strings (PAGE keeps its leading zeros), or
    ``(None, None)`` when the URL has no such segment.
    """
    found = re.search(r"/iiif/image/v2/([^_/]+)_(\d+)/", iiif_url)
    if found is None:
        return None, None
    doc_id, page_str = found.groups()
    return doc_id, page_str


def build_dual_preview(img_url: str, source: str):
    """
    Build the HTML preview for a hit: the hit's own page plus one neighbour page.

    Args:
        img_url: IIIF image URL of the hit. Empty/None yields "(no image)";
            a URL that ``extract_id_and_page`` cannot parse is shown as a
            single thumbnail.
        source: 'text' adds the previous page as second image, 'image' adds
            the next page; any other value shows the main page only.

    Returns:
        An HTML string with one or two <img> tags. When two pages are shown,
        the even-numbered page comes first.
    """
    if not img_url:
        return "(no image)"

    doc_id, page_str = extract_id_and_page(img_url)

    if not doc_id:
        return f'<img src="{img_url}" width="150"/>'

    page_num = int(page_str)

    # Determine second preview page
    if source == "text":           # text_hit: show previous page
        second_page = page_num - 1
    elif source == "image":        # image_hit: show next page
        second_page = page_num + 1
    else:
        second_page = None

    # Construct IIIF URLs
    main_img = f'https://api.digitale-sammlungen.de/iiif/image/v2/{doc_id}_{page_str}/full/full/0/default.jpg'

    if second_page is None or second_page <= 0:
        return f'<img src="{main_img}" width="150"/>'

    # Preserve the zero-padding width of the original page string instead of
    # assuming it is always five digits.
    second_page_str = f"{second_page:0{len(page_str)}d}"
    second_img = f'https://api.digitale-sammlungen.de/iiif/image/v2/{doc_id}_{second_page_str}/full/full/0/default.jpg'

    # Sort by page number: even pages FIRST
    pages_sorted = sorted(
        [(page_num, main_img), (second_page, second_img)],
        key=lambda x: (x[0] % 2, x[0]),
    )

    # Build HTML in sorted order
    return "".join(
        f'<img src="{url}" width="800" style="margin-right:5px;"/>'
        for _, url in pages_sorted
    )

def construct_text_preview(hit, max_len=400):
    """Return a short text preview for a search hit.

    Uses the hit's ``text_page`` field, falling back to ``text_chunk``; text
    longer than ``max_len`` characters is cut and suffixed with "...".
    A missing hit, or a hit without text, gives an empty string.
    """
    if not hit:
        return ""

    preview = ""
    for field in ("text_page", "text_chunk"):
        candidate = hit.get(field)
        if candidate:
            preview = candidate
            break

    if len(preview) <= max_len:
        return preview
    return preview[:max_len] + "..."

def preview_combined_results(reranked):
    """Display the reranked results as an HTML table and return them as a DataFrame.

    Args:
        reranked: List of combined result entries; each entry provides
            ``pair_id`` (viewer URL), ``text_hit``, ``image_hit``,
            ``text_score``, ``image_score`` and ``combined_score``.

    Returns:
        A DataFrame with one row per entry and the columns "Image Preview",
        "Combined Score", "Text Score", "Image Score", "Page" and
        "Text Preview". The same table is displayed as unescaped HTML.
    """
    rows = []

    # Iterate over the argument (not a notebook global), so the function
    # previews whatever list it is given.
    for entry in reranked:
        viewer_url = entry["pair_id"]
        text_hit = entry["text_hit"]
        image_hit = entry["image_hit"]
        hit = text_hit or image_hit

        # Build preview values
        text_preview = construct_text_preview(hit)

        viewer_link = f'<a href="{viewer_url}" target="_blank">Link</a>'

        # Take the image URL from the hit that matches the `source` passed on,
        # because the neighbour page is chosen relative to that hit's page.
        if image_hit:
            img_html = build_dual_preview(image_hit.get("image_url"), source="image")
        elif text_hit:
            img_html = build_dual_preview(text_hit.get("image_url"), source="text")
        else:
            img_html = "(no image)"

        page = hit.get("page") if hit else None

        rows.append({
            "Image Preview": img_html + viewer_link,
            "Combined Score": entry["combined_score"],
            "Text Score": entry["text_score"],
            "Image Score": entry["image_score"],
            "Page": page,
            "Text Preview": text_preview,
        })

    df = pd.DataFrame(rows)

    # Display as HTML (escape=False keeps the <img>/<a> markup intact)
    html = df.to_html(escape=False, index=False)
    display(HTML(html))

    return df

# Show the preview table for the LLM-reranked results and keep the returned DataFrame
preview_df = preview_combined_results(reranked_final)

Image Preview,Combined Score,Text Score,Image Score,Page,Text Preview
Link,0.935452,0.639609,0.0,783,"Joach. Camerarii Symbolorum\n153\nLXXVI.\nEferendum huc est Gracum Proverbium o φις eν\nμὴ φάγῃ ὅφιν. δρ ἄκων οὐ γενήσεται: Serpens ni\nN\nedat serpentem, draco non fiet. Et hoc quidem Icone\nab Aegyptiis Regem, qui multos alienos debellasset & con¬\nsumsisset significatum fuisse scribit Pierius, nec minus recte\nnos de tyrannu usurpare possumus qui aliorum exitio accres¬\ncere student. Pertinet huc insigni..."
Link,0.948332,0.0,0.57738,788,Basis
Link,0.866007,0.0,0.576196,784,"Et Emblematum Centur. IV. 134 LXXVII. VICTORUTERQUE CADIT¬ Victor uterque cadit. prohquam victoria acerba est, Cum trahut in trae eps una ruina duos. De"
Link,0.778513,0.0,0.570101,584,Omni¬
Link,0.698156,0.0,0.569547,458,Et Emblematum Centur. III. 36 XVIII. DIVERSA AB ALIIS VIR¬ TUTE VALEMUS. Passer ut ova fovet flatu vegetante marinus. Sicanimat mentes gratia diapias. Horatius
Link,0.617132,0.0,0.568014,424,"Ee Emblematum Centur. III. CUIQUE SUUM. Laeva tenet fulmen, sed olivae dexteraramum, Vi pace & bello sim memor officii. Cym"
Link,0.8368,0.632381,0.0,388,"Et Enblematum Centur. II.\nLXXXVII.\nLATET ABDITA.\nNucleus arridet? Spinosa putaminarum pes\nNon vult felices absque labore Deus,\nSciurus"
Link,0.742653,0.627369,0.0,737,"Joach. Camerarii Symbolotum\n107\nLIII.\nAcibus nocturno temporo accensis & pisces elioi, & in¬\nprimis paguros, astacos, & cancros, expenitiß. etiam\ncavernis extrahi, capique cum Oppianus, tum Olaus\nitem M. inter recentiores, scribunt: nec fallit experien¬\ntia. Plato quidem in Sophist: hoc της θηρευτικῆς τεχνης\nμυστήριον vοcaι θήραν πυρευτικὴν: Quam prolixè ad¬\nmodum describit Q. Smyrn. lib. VII. ubi..."
Link,0.653865,0.626392,0.0,698,Ararns
Link,0.564912,0.624895,0.0,680,lanus


# RAG Component

## Generative AI using OpenRouter API
- The text and image context is reformatted for input into the LLM
- System prompt with instructions on how to formulate the answers is passed to the model
- Chat history (user query & previous LLM answer) is passed to the model
- Image and text context is passed to the model

In [30]:
# Format context for LLM input

# Text context: the page text of every result that has a text hit, one per line
text_context = "Text pages:\n" + "".join(
    f'{entry["text_hit"].get("text_page", "")}\n'
    for entry in reranked_final
    if entry.get("text_hit")
)

# Image context: one image reference per result
image_context = []
for entry in reranked_final:
    text_hit = entry.get("text_hit")
    if text_hit:
        # For a text hit, reference the scan of the preceding page (page - 1)
        # NOTE(review): presumably the emblem image paired with the text page; confirm
        page = int(text_hit.get("page")) - 1
        img_url = f'https://api.digitale-sammlungen.de/iiif/image/v2/bsb10575861_{page:05d}/full/full/0/default.jpg'
    elif entry.get("image_hit"):
        # Image-only result: use the URL stored with the image hit
        img_url = entry["image_hit"].get("image_url")
    else:
        continue
    image_context.append({"type": "image/jpeg", "url": img_url})

print(decision.target_index)

image


In [31]:
#print(text_context)
#print(image_context)

In [None]:
from openai import OpenAI

### Response generation based on retrieval results ###

## https://openrouter.ai/qwen/qwen2.5-vl-32b-instruct:free ##

# Persona and answering rules for the generating model; the router's preferred
# modality is interpolated into the prompt.
system_prompt = f"""You are the Machina Emblematica – the mysterious curator of Symbola et 
Emblemata (1590) by Joachim Camerarius the Younger. You are part librarian, 
part adventuring scholar: a charming, multilingual nerd with a fondness 
for mysteries, theatrics, metaphors, forgotten languages, and the occasional pun.

When you answer, there's a hint of light-hearted pulp adventure novel in your voice. 
Think Indiana Jones or Flynn Carson! You like to quote original passages from the 
Symbola. Include a translation if you do. But you also explain, teach, point out meaning 
and intention. You like to involve visitors in a conversation, keep them engaged, draw
them deeper into the mysteries of the Symbola. You enjoy the thought of them leaving 
more knowledgeable than they arrived.

Primary modality: {decision.target_index}.
Limit your response to no more than 200 words total. That’s about one or two 
paragraphs. Keep it tight and elegant. Speak only in prose. Do not describe 
physical gestures, facial expressions, or actions (e.g., "smiles" or "opens 
book"). You are a voice, not a body.

Summarizing from the content below, please provide an answer to the 
following question.
Rules:
- If the primary modality is 'image', use the images via the image_url to generate the answer.
- If the primary modality is 'text', use the text context provided instead.
- Use the other modality only to supplement the primary one.
- Output a concise answer.
- Take into account our previous conversation.
- Avoid repetitive opening sentences that you have used in the previous chat history.
- Don't start with "Ah", or "Marvellous" or the likes.
- Answer in the language of the question.
- Add a summary of the context documents that you see."""

# Initialize chatHistory only if it does NOT exist yet
if "chatHistory" not in globals():
    chatHistory = []

# The current question is sent after the stored history but only recorded in
# chatHistory once the answer has arrived, so a failed request or a re-run of
# this cell does not leave dangling or duplicated user turns behind.
current_turn = {"role": "user", "content": user_query}

# One `image_url` content part per context image: each part carries a single
# {"url": ...} object, not the whole list of image descriptors.
image_parts = [
    {"type": "image_url", "image_url": {"url": img["url"]}}
    for img in image_context
    if img.get("url")
]

client = OpenAI(
  base_url="https://openrouter.ai/api/v1",
  api_key=os.environ["QWEN_API_KEY"],
)

stream = client.chat.completions.create(
    model="qwen/qwen2.5-vl-32b-instruct:free",
    messages=[
        {
            "role": "system", 
            "content": system_prompt,
        },
        *chatHistory,
        current_turn,
        {
            "role": "user",
            "content": [
                *image_parts,
                {"type": "text", "text": "\n\nText context:\n" + text_context},
            ],
        },
    ],
    stream=True,
    extra_headers={
        "HTTP-Referer": "<YOUR_SITE_URL>",
        "X-Title": "<YOUR_SITE_NAME>",
    },
)

model_response = ""
# Iterate over the streaming events, printing the answer as it arrives
for chunk in stream:
    # each `chunk` is a ChatCompletionChunk object
    if chunk.choices:
        delta = chunk.choices[0].delta
        if delta and delta.content:       # content is accessed as an attribute
            print(delta.content, end="", flush=True)
            model_response += delta.content  

# Record the completed exchange for follow-up questions
chatHistory.append(current_turn)
chatHistory.append({"role": "assistant", "content": model_response})

Ah, dear seeker after the great and terrible dragons. Let me spin for you a tapestry of lore and legend, woven from the threads of time itself! Among the Symbola et Emblemata's rich tapestry, dragons are indeed present, though their appearances often serve as metaphorical and philosophical muses rather than fantastical beasts of legend alone. Now, if you have your eyes peeled like an eagle, you would notice that even our esteemed author, Joachim Camerarius the Younger, might weave in these grand animals as symbols of power, tyranny, or hidden truths.

In emblem LXXVI of the Symbola, the serpent and its kin—the dragon—symbolize rulers and tyrants who ascend through the destruction of others, hinting at a profound moral tale. And in emblem LXXXVII, the idea of light unveiling hidden truths (латет абдита—“hidden things lie concealed") reminds us that dragons, like all great mysteries, are often less real creatures and more mirrors reflecting human ambitions and fears.

So, you see, dragon