Multi-Modal RAG with Images and Documents

In [1]:
# System Dependencies: Poppler, Tesseract and libmagic installed globally (in bash)

In [2]:
# Ensuring the .venv has the necessary python bridges
%pip install -Uq "unstructured[all-docs]"            # will install all the needed dependencies and handle any doc like pdf, csv, ppt etc.
%pip install -Uq langchain langchain-community 
%pip install -Uq langchain-qdrant                    # The local vector store
%pip install -Uq langchain-huggingface
%pip install -Uq sentence-transformers 
%pip install -Uq qdrant-client
%pip install -Uq langchain-ollama

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [3]:
# Importing libraries
import json
from typing import List

# Unstructured for document parsing
from unstructured.partition.pdf import partition_pdf           # instead of pdf, we can type csv, ppt etc.
from unstructured.chunking.title import chunk_by_title

# Langchain components
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from langchain_core.messages import HumanMessage
from langchain_ollama import ChatOllama 

[0;93m2026-01-15 12:41:08.786683511 [W:onnxruntime:Default, device_discovery.cc:164 DiscoverDevicesForPlatform] GPU device discovery failed: device_discovery.cc:89 ReadFileContents Failed to open file: "/sys/class/drm/card0/device/vendor"[m


In [4]:
# STEP 1. PARTITIONING THE PDF DOCUMENT INTO ATOMIC ELEMENTS - using the unstructured library
def partition_document(file_path: str):
    """Extract atomic elements from the PDF using unstructured"""
    print(f"Partitioning the PDF document: {file_path}")

    elements = partition_pdf(
        filename=file_path,                     # path to my PDF file
        startegy="hi_res",                      # the most accurate (but slower) processing method of extraction
        infer_table_structure=True,             # keeps tables as structured HTML, not jumbled text
        extract_image_block_types=["Image"],    # grabs the images found in the PDF
        extract_image_block_to_payload=True     # Stores/converts the images as 'image_base64' data that we can actually use - because that is how images are transferred on the internet
    )

    print(f"Extracted {len(elements)} atomic elements")
    return elements

# Testing with my PDF file
file_path = "/home/ruba/Desktop/Multi-Modal RAG/docs/sensors-24-03064.pdf"   # copy path of where the PDF is stored 
elements = partition_document(file_path)

Partitioning the PDF document: /home/ruba/Desktop/Multi-Modal RAG/docs/sensors-24-03064.pdf


The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


Extracted 678 atomic elements


In [5]:
# Verifying the quantity of atomic elements
# len(elements)

# Printing elements
# elements

In [6]:
# All types of different atomic elements we see from unstructured
set([str(type(el)) for el in elements])

{"<class 'unstructured.documents.elements.FigureCaption'>",
 "<class 'unstructured.documents.elements.Formula'>",
 "<class 'unstructured.documents.elements.Header'>",
 "<class 'unstructured.documents.elements.Image'>",
 "<class 'unstructured.documents.elements.ListItem'>",
 "<class 'unstructured.documents.elements.NarrativeText'>",
 "<class 'unstructured.documents.elements.Table'>",
 "<class 'unstructured.documents.elements.Text'>",
 "<class 'unstructured.documents.elements.Title'>"}

In [7]:
# Selecting one random element out of the 678 elements
elements[232].to_dict()         # .to_dict() prints all the info 

{'type': 'NarrativeText',
 'element_id': '297786a844c4de05ec169354a953c893',
 'text': 'Figure 7. Overview of the different reviewed UAV sensors.',
 'metadata': {'detection_class_prob': 0.8115609884262085,
  'is_extracted': 'true',
  'coordinates': {'points': ((np.float64(460.86962890625),
     np.float64(1475.6962844444445)),
    (np.float64(460.86962890625), np.float64(1501.0014577777777)),
    (np.float64(1112.1025390625), np.float64(1501.0014577777777)),
    (np.float64(1112.1025390625), np.float64(1475.6962844444445))),
   'system': 'PixelSpace',
   'layout_width': 1654,
   'layout_height': 2339},
  'last_modified': '2026-01-12T13:40:10',
  'filetype': 'application/pdf',
  'languages': ['eng'],
  'page_number': 21,
  'file_directory': '/home/ruba/Desktop/Multi-Modal RAG/docs',
  'filename': 'sensors-24-03064.pdf'}}

In [8]:
# Checking the image atomic elements data (how it looks like)
# Gathering all the images
images = [element for element in elements if element.category =='Image']
print(f"Found {len(images)} images")

# Printing the first image
images[0].to_dict()

# 'image_base64' shown in the output below is the accurate raw image (use this), rather than 'text' in the ouutput (jumbled up, not reliable) - OCR jumbles up the text in the image
# view the 'image_base64' (copy) on codebeautify.org (paste), and the actual image will appear

Found 19 images


{'type': 'Image',
 'element_id': '0fd0590c5ffaf709d366ddb00857ebf7',
 'text': 'i | SENSOrs',
 'metadata': {'detection_class_prob': 0.7279917597770691,
  'coordinates': {'points': ((np.float64(89.6802749633789),
     np.float64(134.87904357910156)),
    (np.float64(89.6802749633789), np.float64(216.10633850097656)),
    (np.float64(438.9463806152344), np.float64(216.10633850097656)),
    (np.float64(438.9463806152344), np.float64(134.87904357910156))),
   'system': 'PixelSpace',
   'layout_width': 1654,
   'layout_height': 2339},
  'last_modified': '2026-01-12T13:40:10',
  'filetype': 'application/pdf',
  'languages': ['eng'],
  'page_number': 1,
  'image_base64': '/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/2wBDAQkJCQwLDBgNDRgyIRwhMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjL/wAARCABRAV0DASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1F

In [9]:
# Checing for table atomic elements 
# Gathering all tables
tables = [element for element in elements if element.category == 'Table']
print(f"Found {len(tables)} tables")

# Printing the first table
tables[0].to_dict()

# Use 'text_as_html' attribute (more reliable), rather than 'text' (jumbled up, not reliable) - OCR jumbles up the text in the table
# Use jsfiddle.net to view text_as_html table

Found 8 tables


{'type': 'Table',
 'element_id': '0aafe84105e107fc419c3d93c0d05b7c',
 'text': 'Advantages ✓ Informative scene data ✓ Anti-jamming ability ✓ Relatively high accuracy Vision-Based Navigation for UAVs Disadvantages Challenges Field of Application X Complex environment structures reflect complexities in the navigation algorithm Real-time processing requirements Agriculture X Performance is impacted by adverse weather conditions Integration with image-based sensing modalities Surveillance X Vulnerable to visual illusions Power consumption Environmental monitoring',
 'metadata': {'detection_class_prob': 0.8665180802345276,
  'is_extracted': 'true',
  'coordinates': {'points': ((np.float64(114.76978302001953),
     np.float64(1114.007568359375)),
    (np.float64(114.76978302001953), np.float64(1390.7633056640625)),
    (np.float64(1549.9464111328125), np.float64(1390.7633056640625)),
    (np.float64(1549.9464111328125), np.float64(1114.007568359375))),
   'system': 'PixelSpace',
   'layout_wi

In [10]:
# STEP 2: CLUBBING ATOMIC ELEMENTS TOGETHER BY TITLE - Chunking by title
def create_chunks_by_title(elements):
    """Create intelligent chunks using the title-based startegy"""   # chunking by title prompt
    print("Creating Chunks by Title...")

    chunks = chunk_by_title(
        elements,                        # the parsed (partitioned) PDF atomic elements from the previous step
        max_characters=3000,             # Hard limit - never exceed 3000 characters per chunk!!!
        new_after_n_chars=2400,          # Try to start a new chunk after 2400 characters
        combine_text_under_n_chars=500   # Merge tiny chunks under 500 characters with neighbours
    )

    print(f"Created {len(chunks)} chunks")
    return chunks

# Creating chunks
chunks = create_chunks_by_title(elements)

Creating Chunks by Title...
Created 75 chunks


In [11]:
# Viewing the chunks created
chunks

# Checking the types of chunks that are created
set([str(type(chunk)) for chunk in chunks])

{"<class 'unstructured.documents.elements.CompositeElement'>"}

In [12]:
# Printing at one particular chunk to check its properties
chunks[5].to_dict()

{'type': 'CompositeElement',
 'element_id': '0c07bbfa-587f-4a59-839f-c030b9728311',
 'text': '2 of 42\n\nSensors 2024, 24, 3064\n\nbetween a UAV and the payload (i.e., suspended loads), whereas in [47], the state-of-\n\nthe-art focused mainly on drone detection and classification techniques. From another perspective, the review in [48] identifies current gaps in the application of UAVs for the creation of 3D models in the contexts of urban planning and historic monuments preservation. The review in [49] took into consideration the diverse possible applications of drones in healthcare applications, whereas in [50], the main focus was on the study of marine mammals (i.e., individual estimation, body conditions and biometrics, behavioral patterns, etc.). On the other hand, the review in [51] examined the drone-integrated Geographic Information System (GIS) in different fields, differently from the work in [52], which focused on UAVs’ potential to advance climate change research and monito

In [13]:
# Checking all the atomic elements (Title, Narrative text, Text, Image, Table, Header etc.) in the chunk - using 'metadata' and 'orig_elements'
# For better understanding selecting a chunk with image and table in the PDF

# Checking atomic elements in a chunk with an Image
chunks[1].metadata.orig_elements  

[<unstructured.documents.elements.Title at 0x7164a02f12a0>,
 <unstructured.documents.elements.NarrativeText at 0x7164a02f1660>,
 <unstructured.documents.elements.NarrativeText at 0x7164a02f1840>,
 <unstructured.documents.elements.NarrativeText at 0x7164a02f1a20>,
 <unstructured.documents.elements.Image at 0x7164a02f1c00>,
 <unstructured.documents.elements.NarrativeText at 0x7164a02f1de0>]

In [14]:
# Checking atomic elements in a chunk with a Table
chunks[14].metadata.orig_elements

[<unstructured.documents.elements.NarrativeText at 0x7164a0148fa0>,
 <unstructured.documents.elements.Table at 0x7164a0149660>,
 <unstructured.documents.elements.ListItem at 0x7164a0149d50>,
 <unstructured.documents.elements.NarrativeText at 0x7164a0149ff0>,
 <unstructured.documents.elements.NarrativeText at 0x7164a014a1a0>,
 <unstructured.documents.elements.NarrativeText at 0x7164a014a380>,
 <unstructured.documents.elements.Header at 0x7164a0149300>,
 <unstructured.documents.elements.Header at 0x7164a0148d00>,
 <unstructured.documents.elements.ListItem at 0x7164a014a7a0>,
 <unstructured.documents.elements.ListItem at 0x7164a014ac80>]

In [15]:
# To further check the contents/properties of a specific atomic element within the chunk (eg. chunk 14) we use the following code:

chunks[14].metadata.orig_elements[1].to_dict() # will print the info of the 2nd atomic element in chunk 14 i.e. Table

{'type': 'Table',
 'element_id': '0aafe84105e107fc419c3d93c0d05b7c',
 'text': 'Advantages ✓ Informative scene data ✓ Anti-jamming ability ✓ Relatively high accuracy Vision-Based Navigation for UAVs Disadvantages Challenges Field of Application X Complex environment structures reflect complexities in the navigation algorithm Real-time processing requirements Agriculture X Performance is impacted by adverse weather conditions Integration with image-based sensing modalities Surveillance X Vulnerable to visual illusions Power consumption Environmental monitoring',
 'metadata': {'detection_class_prob': 0.8665180802345276,
  'is_extracted': 'true',
  'coordinates': {'points': ((np.float64(114.76978302001953),
     np.float64(1114.007568359375)),
    (np.float64(114.76978302001953), np.float64(1390.7633056640625)),
    (np.float64(1549.9464111328125), np.float64(1390.7633056640625)),
    (np.float64(1549.9464111328125), np.float64(1114.007568359375))),
   'system': 'PixelSpace',
   'layout_wi

In [None]:
# STEP 3. SUMMARIZING THE CHUNKS AND CONVERTING IT INTO LANGCHAIN DOCUMENTS
# Substep 1: Separate content types - for every single chunk, we will separate all the tables, images and text 
def separate_content_types(chunk):
    """Analyze what types of content are in a chunk"""
    content_data= {
        'text': chunk.text,
        'tables': [],
        'images': [],
        'types': ['text']
    }
    
    # Checking for tables ang images in original elements
    if hasattr(chunk, 'metadata') and hasattr(chunk.metadata, 'orig_elements'):
        for element in chunk.metadata.orig_elements:
            element_type = type(element).__name__

            # Handling tables
            if element_type == 'Table':
                content_data['types'].append('table')
                table_html= getattr(element.metadata, 'text_as_html', element.text)
                content_data['tables'].append(table_html)

            # Handling images
            elif element_type == 'Image':
                if hasattr(element, 'metadata') and hasattr(element.metadata, 'image_base64'):
                    content_data['types'].append('image')
                    content_data['images'].append(element.metadata.image_base64)

    content_data['types'] = list(set(content_data['types']))
    return content_data

# As there are tables and images involved - we will have to summarize the text
def create_ai_enhanced_summary(text: str, tables: List[str], images: List[str]) -> str:
    """Create AI-enhanced summary for mixed content using Llama 3.2 Vision"""

    try: 
        # 1. Initializing LLM
        llm = ChatOllama(model="llama3.2-vision", temperature=0)     # llama 3.2-version model also converts the images into text (11B)

        # 2. Creating the base prompt
        prompt = f"""You are creating a searchable description for document content retrieval. 

        Content to analyze:
        Text content:
        {text}
        \n"""

        # 3. Adding tables if present
        if tables: 
            prompt += "TABLES:\n"  
            for i, table in enumerate(tables):
                prompt += f"Table {i+1}:\n{table}\n\n"

                # 4. Adding the Task Instructions to the prompt
                prompt += """ 
                Your TASK:
                Generate a comprehensive, searchable description that covers:
                1. Key facts, numbers, and data points from text and tables.
                2. Main topics and concepts discussed.
                3. Questions this content could answer.
                4. Visual content analysis (charts, diagrams, patterns in images).
                5. Alternative search terms users might use.

                Make it detailed and searchable - prioritize findability over brevity.

                SEARCHABLE DESCRIPTION:"""

        # 5. Building message content starting with the text/table prompt
        message_content = [{"type": "text", "text": prompt}]

        # 6. Adding images to the message - appending the images that are in image_base64
        for image_base64 in images:
            message_content.append({
                "type": "image_url", 
                "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}  # message_content has 2 items in the List i.e. type and image_url that will be provided in the HumanMessage
            })

        # 7. Send to AI and get response
        message = HumanMessage(content=message_content)
        response = llm.invoke([message])               # The LLM will look at all the text that includes the base 64 images and tables, then it will give us a summary

        return response.content

    except Exception as e: 
        print(f"AI summary failed: {e}")
        # Fallback to simple summary
        summary = f"{text[:300]}..."
        if tables: 
            summary += f" [Contains {len(tables)} table(s)]"
            if images:
                summary += f" [Contains {len(images)} images(s)]"
                return summary
            
def summarize_chunks(chunks):
    """Process all chunks with AI summaries"""
    print("Processing chunks with AI summaries...")

    langchain_documents = []
    total_chunks = len(chunks)

    for i, chunk in enumerate(chunks):
        current_chunk = i + 1
        print(f" Processing Chunk {current_chunk}/{total_chunks}")


        # Analyze chunk content
        content_data = separate_content_types(chunk)

        # Debugging prints
        print(f" Types found: {content_data['types']}")
        print(f" Tables: {len(content_data['tables'])}, Images: {len(content_data['images'])}")

        # Creating AI-enhanced summary if the chunk has tables or images
        if content_data['tables'] or content_data['images']:
            print(f" Creating AI Summary for mixed content...")
            try: 
                enhanced_content = create_ai_enhanced_summary(
                    content_data['text'],
                    content_data['tables'],
                    content_data['images'],
                )

                print(f"  AI summary generated successfully")
                print(f" Enhanced content preview: {enhanced_content[:200]}...")
            except Exception as e:
                print(f" AI summary failed: {e}")
                enhanced_content = content_data['text']
        else:
            print(f" using raw text(no tables/images")
            enhanced_content = content_data['text']

    # Creating LangChain Document with rich metadata -LangChain Document contains page_content and metadata
    doc = Document(
        page_content=enhanced_content,
        metadata={
            "original_content": json.dumps({           # converting the entire content into json
                "raw_text": content_data['text'],           # raw text
                "tables_html": content_data['tables'],      # raw tables
                "images_base64": content_data['images']     # raw images
            })
        }
    )

    langchain_documents.append(doc)      # appending all the documents 

    print(f" Processed {len(langchain_documents)} chunks")
    return langchain_documents           # returning a list of 75 LangChain documents (as we had 75 chunks)

# Processing chunks with AI
processed_chunks = summarize_chunks(chunks)                                        
    

Processing chunks with AI summaries...
 Processing Chunk 1/75
 Types found: ['image', 'text']
 Tables: 0, Images: 2
 Creating AI Summary for mixed content...
AI summary failed: this model only supports one image while more than one image requested (status code: 500)
  AI summary generated successfully
 AI summary failed: 'NoneType' object is not subscriptable
 Processing Chunk 2/75
 Types found: ['image', 'text']
 Tables: 0, Images: 1
 Creating AI Summary for mixed content...
  AI summary generated successfully
 Enhanced content preview: The image displays a Creative Commons (CC) license icon, which is a symbol used to indicate that a work is licensed under a Creative Commons license. The icon is a white circle with a black border, co...
 Processing Chunk 3/75
 Types found: ['text']
 Tables: 0, Images: 0
 using raw text(no tables/images
 Processing Chunk 4/75
 Types found: ['text']
 Tables: 0, Images: 0
 using raw text(no tables/images
 Processing Chunk 5/75
 Types found: ['text']
 Tab