In [3]:
import fitz
from langchain_core.documents import Document
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
import numpy as np
from langchain.chat_models import init_chat_model
from langchain.prompts import PromptTemplate
from langchain.schema.messages import HumanMessage
from sklearn.metrics.pairwise import cosine_similarity
import os
import base64
import io
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from langchain_groq import ChatGroq
from dotenv import load_dotenv
load_dotenv()
os.environ["HF_TOKEN"]=os.getenv("HF_TOKEN")
groq=os.getenv("GROQ_API_KEY")
clip_model=CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor=CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
llm=ChatGroq(model="meta-llama/llama-4-scout-17b-16e-instruct",api_key=groq)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


## Embeddings

In [None]:
def embed_img(image_data):
    """Embedding the image using CLIP"""
    if isinstance(image_data,str):
        image=Image.open(image_data).convert('RGB')
    else:
        image=image_data
    
    input=clip_processor(images=image,return_tensors='pt')
    
    with torch.no_grad():
        features=clip_model.get_image_features(**input)
        features = features/features.norm(dim=-1,keepdim=True)
        return features.squeeze().numpy()
def embed_text(text):
    """Embedding the image using CLIP"""
    input=clip_processor(
        text=text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=77
    )
    with torch.no_grad():
        features=clip_model.get_text_features(**input)
        features=features/features.norm(dim=-1,keepdim=True)
        return features.squeeze().numpy()

In [50]:
pdf_path="input.pdf"
doc=fitz.open(pdf_path)
all_docs=[]
all_embeddings=[]
image_data_store={}

splitter=RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=30)

In [51]:
for i,page in enumerate(doc):
    text=page.get_text()
    if text.strip():
        temp_doc=Document(page_content=text,metadata={"page":i,"type":"text"})
        text_chunks=splitter.split_documents([temp_doc])
        
        for chunk in text_chunks:
            embedding=embed_text(chunk.page_content)
            all_embeddings.append(embedding)
            all_docs.append(chunk)
    
    for img_index,img in enumerate(page.get_images(full=True)):
        try:
            xref=img[0]
            base_image=doc.extract_image(xref)
            image_bytes=base_image["image"]
            #Conerting to PIL
            pil_image=Image.open(io.BytesIO(image_bytes)).convert("RGB")
            
            #Create unique identifier
            image_id=f"page-{i}-img-{img_index}"
            #Store image as base64
            buffered=io.BytesIO()
            pil_image.save(buffered,format="PNG")
            img_base64=base64.b64encode(buffered.getvalue()).decode()
            image_data_store[image_id]=img_base64
            #Embed using CLIP
            embedding=embed_img(pil_image)
            all_embeddings.append(embedding)
            #Doc for Image
            image_doc=Document(
                page_content=f"[Image: {image_id}]",
                metadata={"page":i,"type":"image","image_id":image_id}
            )    
            all_docs.append(image_doc)
        except Exception as e:
            print(f"Error processing image{img_index} on page {i}:{e}")
            continue
doc.close()

In [52]:
embeddins_array=np.array(all_embeddings)

vector_store=FAISS.from_embeddings(
    text_embeddings=[(doc.page_content,emb) for doc,emb in zip(all_docs,embeddins_array)],
    embedding=None,
    metadatas=[doc.metadata for doc in all_docs]
)

`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


In [53]:
vector_store

<langchain_community.vectorstores.faiss.FAISS at 0x253a79c8070>

In [54]:
def retrieve_modal(query,k=5):
    
    query_embed=embed_text(query)
    
    results=vector_store.similarity_search_by_vector(
        embedding=query_embed,
        k=k
    )
    return results

In [55]:
def create_message(query,retrieved_docs):
    content=[]
    content.append({
        "type":"text",
        "text":f"Question: {query}\n\nContext:"
    })
    
    text_docs=[doc for doc in retrieved_docs if doc.metadata.get("type")=="text"]
    image_docs=[doc for doc in retrieved_docs if doc.metadata.get("type")=="image"]
    
    if text_docs:
        text_context="\n\n".join([
            f"[Page{doc.metadata['page']}]: {doc.page_content}"
            for doc in text_docs
        ])
        content.append({
            "type":"text",
            "text": f"Text experts:\n {text_context}\n"
        })
    for doc in image_docs:
        image_id=doc.metadata.get("image_id")
        if image_id and image_id in image_data_store:
            content.append({
                "type":"text",
                "text":f"\n[Image from page {doc.metadata['page']}]:\n"
            })
            content.append({
                "type":"image_url",
                "image_url":{
                    "url":f"data:image/png;base64,{image_data_store[image_id]}"
                }
            })
            
    content.append({
        "type":"text",
        "text":"\n\n Please answer the question based on the context provided text and images"
    })
    
    return HumanMessage(content=content)

In [56]:
def multimodal_rag_pipeline(query):
    context_docs=retrieve_modal(query,k=5)
    
    message =create_message(query,context_docs)
    
    response=llm.invoke([message])
    
    print(f"\nRetrieved {len(context_docs)} documents:")
    for doc in context_docs:
        doc_type=doc.metadata.get("type","unknown")
        page=doc.metadata.get('page','?')
        if doc_type =="text":
            preview=doc.page_content[:100]+"..." if len(doc.page_content)>100 else doc.page_content
            print(f" -Text from page{page}: {preview}")
        else:
            print(f" -imager from page{page}")
    print("\n")
    
    return response.content

In [57]:
queries = [
        "What is the aim of this guidance?",
        "what does the bubble chart means in Fig. 10",
        "What dooes the table Anscombe’s quartet means",
        "what does the bubble chart means in Fig. 10"
    ]
    
for query in queries:
    print(f"\nQuery: {query}")
    print("-" * 50)
    answer = multimodal_rag_pipeline(query)
    print(f"Answer: {answer}")
    print("=" * 70)


Query: What is the aim of this guidance?
--------------------------------------------------

Retrieved 5 documents:
 -Text from page35: 1990
2000
2010
65
70
75
80
85
 -Text from page25: men (low income countries)
0
20
40
60
80
100
60
65
70
75
80
85
90
 -Text from page38: 15
20
25
30
35
40
2006
2011
2016
5
10
15
20
25
30
35
40
 -Text from page13: to get insight into correlations and outliers.
 -Text from page4: Contents
ACKNOWLEDGEMENTS..............................................................................


Answer: Based on the context provided, I would say that the aim of this guidance is likely to:

**Get insight into correlations and outliers.**

This is directly stated on Page 13 of the context text. 

However, I also noticed that there is a specific section titled "AIM OF THIS GUIDANCE" on Page 4 (according to the Contents section), which likely provides a more detailed and explicit statement of the guidance's aim. Unfortunately, the text of that section is not provided in