In [3]:
import base64
import hashlib
import io

from dotenv import load_dotenv
from langchain.retrievers import MultiVectorRetriever
from langchain.schema import HumanMessage, AIMessage
from langchain.storage import InMemoryStore
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from PIL import Image
# Load variables from a local .env file into the environment before any client
# is constructed (presumably this is where OPENAI_API_KEY comes from - confirm).
load_dotenv()

True

In [4]:
# Initialize the embedding model used to vectorize document text and queries.
# No arguments are passed, so the model name and credentials come from the
# library defaults / environment (NOTE(review): confirm the intended model).
embeddings = OpenAIEmbeddings()

In [5]:
# Initialize the LLM.
# "gpt-4-vision-preview" has been retired by OpenAI and requests to it fail;
# "gpt-4o" is a generally available chat model that accepts image inputs.
# max_tokens caps the length of each generated answer.
llm = ChatOpenAI(model="gpt-4o", max_tokens=1024)

In [6]:
# Build the FAISS index that backs text retrieval, seeded with three
# sample sentences embedded via `embeddings`.
text_vectorstore = FAISS.from_texts(
    texts=[
        "This is a red car",
        "This is a blue house",
        "This is a green tree",
    ],
    embedding=embeddings,
)

In [7]:
# Function to process images
def process_image(image_path):
    """Load an image from disk and return it re-encoded as raw bytes.

    Args:
        image_path: Path of the image file to read.

    Returns:
        bytes: The image serialized in the same format it was stored in
        (as reported by ``image.format``). The bytes are NOT base64-encoded;
        callers embedding them in a data URL must encode them first.
    """
    # Image.open() is lazy and keeps the underlying file open; the context
    # manager guarantees the handle is released even if encoding fails.
    with Image.open(image_path) as image:
        byte_stream = io.BytesIO()
        image.save(byte_stream, format=image.format)
    return byte_stream.getvalue()

In [8]:
# Create the retriever used by the query function.
# It is wired to the text vector store for similarity search and to an
# in-memory docstore; `id_key` names the metadata field that links the two.
# NOTE(review): only text is embedded here - images are not searched directly,
# they can only ride along as metadata on stored documents.
# NOTE(review): vector-store entries that carry no `doc_id` metadata (such as
# texts seeded directly into the index) presumably cannot be resolved through
# the docstore and would never be returned - confirm.
id_key = "doc_id"
docstore = InMemoryStore()
multimodal_retriever = MultiVectorRetriever(
    vectorstore=text_vectorstore,
    docstore=docstore,
    id_key=id_key,
)

In [9]:
# Function to add documents
def add_document(text, image_path=None):
    """Index a text document, optionally paired with an image.

    The text is embedded into the retriever's vector store, and the full
    document (including the image bytes, when given) is saved in the
    retriever's docstore under the same id, so a vector hit can be resolved
    back to the complete document.

    Args:
        text: Document text to embed and search over.
        image_path: Optional path of an image to attach. Its bytes are stored
            under ``metadata["image"]`` of the docstore document.

    Returns:
        str: The id under which the document was stored.
    """
    # Built-in hash() is salted per process for strings, so ids derived from
    # it change on every kernel restart; a content digest is stable.
    doc_id = hashlib.sha256(text.encode("utf-8")).hexdigest()
    doc = Document(page_content=text, metadata={id_key: doc_id})

    if image_path:
        # Load the image before touching either store so a bad path does not
        # leave a half-registered document behind.
        doc.metadata["image"] = process_image(image_path)

    # MultiVectorRetriever does not provide add_documents(); it is populated
    # through its two stores. The vector-store entry carries only the text and
    # the linking id, keeping the (potentially large) image bytes out of the
    # index metadata.
    multimodal_retriever.vectorstore.add_documents(
        [Document(page_content=text, metadata={id_key: doc_id})]
    )
    multimodal_retriever.docstore.mset([(doc_id, doc)])
    return doc_id

In [10]:
# Function to query the system
def _image_content_part(image_bytes):
    """Wrap raw image bytes as an ``image_url`` message part (base64 data URL)."""
    # Sniff the container from its magic bytes so the declared MIME type
    # matches the payload; default to JPEG for anything unrecognized.
    if image_bytes.startswith(b"\x89PNG\r\n\x1a\n"):
        mime = "image/png"
    elif image_bytes[:6] in (b"GIF87a", b"GIF89a"):
        mime = "image/gif"
    elif image_bytes[:4] == b"RIFF" and image_bytes[8:12] == b"WEBP":
        mime = "image/webp"
    else:
        mime = "image/jpeg"
    # A data URL needs base64 text; interpolating bytes directly would embed
    # their Python repr (b'...') instead of the image.
    encoded = base64.b64encode(image_bytes).decode("ascii")
    return {
        "type": "image_url",
        "image_url": {"url": f"data:{mime};base64,{encoded}"},
    }


def query_system(query, image_path=None):
    """Answer a query using retrieved documents and optional images.

    Args:
        query: The user's question; also used as the retrieval query.
        image_path: Optional path of an image to send along with the question.

    Returns:
        The ``content`` of the LLM response.
    """
    # The user's own image (if any) goes first, followed by images attached
    # to retrieved documents.
    image_parts = []
    if image_path:
        image_parts.append(_image_content_part(process_image(image_path)))

    # Get relevant documents (invoke() replaces the deprecated
    # get_relevant_documents()).
    docs = multimodal_retriever.invoke(query)

    # Add context from retrieved documents
    context = "Retrieved context:\n"
    for doc in docs:
        context += doc.page_content + "\n"
        doc_image = doc.metadata.get("image")
        if doc_image:
            image_parts.append(_image_content_part(doc_image))

    prompt = context + "\nQuery: " + query

    # Only switch to multi-part content when there is at least one image, so
    # text-only queries still send a plain string. Building the list here also
    # avoids appending to a str when only retrieved documents carry images.
    if image_parts:
        content = [{"type": "text", "text": prompt}, *image_parts]
    else:
        content = prompt

    # Get response from LLM
    response = llm.invoke([HumanMessage(content=content)])
    return response.content

In [None]:
# Example usage:
# The calls below are kept inside a string literal so this cell performs no
# work when run. To try them, move the lines out of the string; the image
# files named in it (car.jpg, house.jpg, query_image.jpg) must then exist.
'''
add_document("This is a red sports car", "car.jpg")
add_document("This is a modern house", "house.jpg")

# Query with text only
response = query_system("What color is the car?")

# Query with text and image
response = query_system("What's in this image?", "query_image.jpg")
'''