In [1]:
# Declare all libraries this program will be using
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.messages import HumanMessage
from langchain_core.documents import Document

from langchain_community.chat_models import ChatOllama
from langchain_community.llms import Ollama
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.storage import RedisStore
from langchain_community.utilities.redis import get_client

from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.vectorstores import Chroma

from IPython.display import HTML, display, Markdown
from PIL import Image
from io import BytesIO
import pandas as pd
import re
import uuid
import base64
import time
import os

In [2]:
# This section collects the names of the PDF files found in the 'datasheet' folder
path = "./datasheet/"
# Named `dir_entries` rather than `dir` so the builtin dir() is not shadowed
# for the remainder of the kernel session.
dir_entries = os.listdir(path)
docs = []

# Keep only the PDF files; every other directory entry is ignored
for file in dir_entries:
    if file.endswith('.pdf'):
        print(f"Processing: {file}")
        docs.append(file)

def process_pdf(doc):
    """
    Load one PDF through UnstructuredPDFLoader and tag every element with its source file.

    doc: path of the PDF file to load.
    Returns the list of loaded elements, each carrying metadata['document_name']
    set to the base name of `doc`. On any exception the error is printed and an
    empty list is returned instead of propagating the failure.
    """
    # Loader configuration, kept apart from the call so the options read as one table
    loader_options = dict(
        strategy='hi_res',
        extract_images_in_pdf=True,
        infer_table_structure=True,
        chunking_strategy="by_title",
        max_characters=4000,
        new_after_n_chars=4000,
        combine_text_under_n_chars=2000,
        mode='elements',
        image_output_dir_path='./figures',
    )
    try:
        elements = UnstructuredPDFLoader(file_path=doc, **loader_options).load()
        source_name = os.path.basename(doc)
        for element in elements:
            element.metadata['document_name'] = source_name
    except Exception as e:
        print(f"Error processing {doc}: {e}")
        return []
    return elements

# Run every listed PDF through process_pdf and pool the resulting elements
all_data = []
for doc in docs:
    # os.path.join builds a valid path whether or not `path` ends with a separator
    pathdoc = os.path.join(path, doc)
    data = process_pdf(pathdoc)
    all_data.extend(data)

Processing: poweredge-r740-spec-sheet.pdf


Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# Initialize the models: one text LLM and one embedding model, both served by Ollama
LLM_MODEL_NAME = "llama3.1:8b"
EMBEDDING_MODEL_NAME = "nomic-embed-text"
chatgpt = Ollama(model=LLM_MODEL_NAME)
embeddings = OllamaEmbeddings(model=EMBEDDING_MODEL_NAME)

In [4]:
# placeholders for textual data and tables

docs = []
tables = []

# Iterate over `all_data` (the elements of every processed PDF). `data` only
# holds the elements of the last PDF handled by the ingestion loop, and is
# undefined when no PDF was found.
for doc in all_data:
    # .get() so an element without a 'category' entry is skipped instead of raising KeyError
    category = doc.metadata.get('category')
    if category == 'Table':
        tables.append(doc)
    elif category == 'CompositeElement':
        docs.append(doc)

In [5]:
# Summarise every text chunk and every table with the LLM and keep the results
# in text_summaries / table_summaries

# Instruction template; {element} is filled with the raw text chunk or table
prompt_text = """
You are an assistant tasked with summarizing tables and text particularly for semantic retrieval.
These summaries will be embedded and used to retrieve the raw text or table elements
Give a detailed summary of the table or text below that is well optimized for retrieval.
For any tables also add in a one line description of what the table is about besides the summary.
Do not add additional words like Summary: etc.

Table or text chunk:
{element}
"""
prompt = ChatPromptTemplate.from_template(prompt_text)

# Pipeline: raw element -> prompt -> LLM -> plain string
summarize_chain = (
    {"element": RunnablePassthrough()}
    | prompt
    | chatgpt
    | StrOutputParser()
)

# Start from empty result lists
text_summaries = []
table_summaries = []

# Raw contents that will be summarised
text_docs = [text_element.page_content for text_element in docs]
table_docs = [table_element.page_content for table_element in tables]

# Summarise in batches, at most 5 requests in flight at a time
batch_config = {"max_concurrency": 5}
text_summaries = summarize_chain.batch(text_docs, batch_config)
table_summaries = summarize_chain.batch(table_docs, batch_config)

len(text_summaries), len(table_summaries)

(3, 1)

In [6]:
# various functions to process image

# read an image file and return its content as base64 text
def encode_image(image_path):
    """Return the bytes of the file at image_path as a base64 string (UTF-8 decoded)"""
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode("utf-8")

# ask the vision model to summarize one base64-encoded image using the given prompt
def image_summarize(img_base64, prompt):
    """
    Send the prompt and the image to the llava-llama3 chat model and return the reply text.

    img_base64: base64 string of the image; it is sent as a JPEG data URL.
    prompt: instruction text sent alongside the image.
    """
    chat = ChatOllama(model="llava-llama3", temperature=0)

    # A single human message carrying two parts: the instruction and the image
    text_part = {"type": "text", "text": prompt}
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
    }
    reply = chat.invoke([HumanMessage(content=[text_part, image_part])])
    return reply.content

# summarize every .jpg in a folder; returns (base64 strings, summaries) as two parallel lists
def generate_img_summaries(path):
    """
    Encode and summarize each .jpg file found in `path`, in sorted file-name order.

    path: directory holding the .jpg files extracted by Unstructured
    Returns (img_base64_list, image_summaries); entry i of both lists belongs
    to the same image.
    """

    # Instruction given to the vision model for every image
    prompt = """You are an assistant tasked with summarizing images for retrieval.
                Remember these images could potentially contain graphs, charts or tables also.
                These summaries will be embedded and used to retrieve the raw image for question answering.
                Give a detailed summary of the image that is well optimized for retrieval.
                Do not add additional words like Summary: etc.
             """

    img_base64_list = []
    image_summaries = []

    # Only .jpg files are considered; anything else in the folder is skipped
    jpg_names = [name for name in sorted(os.listdir(path)) if name.endswith(".jpg")]
    for jpg_name in jpg_names:
        base64_image = encode_image(os.path.join(path, jpg_name))
        img_base64_list.append(base64_image)
        image_summaries.append(image_summarize(base64_image, prompt))

    return img_base64_list, image_summaries


# Summarize the extracted figures and report how long it took
IMG_PATH = './figures'
start_time = time.time()
img_results = generate_img_summaries(IMG_PATH)
imgs_base64, image_summaries = img_results
elapsed_seconds = time.time() - start_time
print(f"--- {elapsed_seconds} seconds ---")

--- 13.602229833602905 seconds ---


In [7]:
# initialize an instance of vector store (persisted under ./privatedb)
# optional: add "collection_metadata": {"hnsw:space": "cosine"} to the settings below
chroma_settings = {
    "collection_name": "private_rag",
    "embedding_function": embeddings,
    "persist_directory": "./privatedb",
}
chroma_db = Chroma(**chroma_settings)

  chroma_db = Chroma(


In [8]:
# initialize the document store - it holds the raw images, text and tables
# (a file store, in-memory store, or any other DB-backed store can be used instead)
REDIS_URL = 'redis://localhost:6379'
client = get_client(REDIS_URL)
redis_store = RedisStore(client=client)

In [9]:
# Add summaries to vector store and raw contents to document store

import uuid


# from langchain_chroma import Chroma
from langchain_core.documents import Document
# from langchain_openai import OpenAIEmbeddings


def create_multi_vector_retriever(
    docstore, vectorstore, text_summaries, texts, table_summaries, tables, image_summaries, images
):
    """
    Create retriever that indexes summaries, but returns raw images or texts

    docstore: key-value store receiving the raw contents (texts, tables, base64 images).
    vectorstore: vector store receiving the summaries as Documents.
    text_summaries / texts, table_summaries / tables, image_summaries / images:
        parallel sequences; entry i of a summary list describes entry i of the
        matching content list. A pair whose summary list is empty is skipped.

    Returns the MultiVectorRetriever built over the two stores.
    Raises ValueError when a non-empty summary list and its content list differ in length.
    """
    id_key = "doc_id"

    # Create the multi-vector retriever
    retriever = MultiVectorRetriever(
        vectorstore=vectorstore,
        docstore=docstore,
        id_key=id_key,
    )

    # Helper function to add documents to the vectorstore and docstore
    def add_documents(retriever, doc_summaries, doc_contents):
        # Each summary is linked to its raw content through a shared id, so the
        # two sequences must line up one-to-one. Without this check a mismatch
        # either raised IndexError or silently stored unreachable raw documents.
        if len(doc_summaries) != len(doc_contents):
            raise ValueError(
                f"Got {len(doc_summaries)} summaries for {len(doc_contents)} contents; "
                "each summary needs exactly one matching content"
            )
        doc_ids = [str(uuid.uuid4()) for _ in doc_contents]
        summary_docs = [
            Document(page_content=summary, metadata={id_key: doc_id})
            for doc_id, summary in zip(doc_ids, doc_summaries)
        ]
        retriever.vectorstore.add_documents(summary_docs)
        retriever.docstore.mset(list(zip(doc_ids, doc_contents)))

    # NOTE: when the vector store and document store were already populated and
    # persisted by a previous run, the additions below are not needed and only
    # `return retriever` is required.
    # Add texts, tables, and images; a pair with no summaries is skipped
    for summaries, contents in (
        (text_summaries, texts),
        (table_summaries, tables),
        (image_summaries, images),
    ):
        if summaries:
            add_documents(retriever, summaries, contents)

    return retriever

# Build the retriever and load the summaries and raw contents into the two stores
retriever_multi_vector = create_multi_vector_retriever(
    docstore=redis_store,
    vectorstore=chroma_db,
    text_summaries=text_summaries,
    texts=text_docs,
    table_summaries=table_summaries,
    tables=table_docs,
    image_summaries=image_summaries,
    images=imgs_base64,
)

In [10]:
# 4 image manipulation functions

# Debugging aid only: renders a base64 image in the notebook
def plt_img_base64(img_base64):
    """Display a base64 encoded string as an image"""
    # base64 text -> raw bytes -> in-memory file -> PIL image
    decoded_bytes = base64.b64decode(img_base64)
    picture = Image.open(BytesIO(decoded_bytes))
    display(picture)

# cheap syntactic test: does the string consist only of base64 characters?
def looks_like_base64(sb):
    """Return True when sb is one or more base64 alphabet characters followed by at most two '='"""
    base64_pattern = "^[A-Za-z0-9+/]+[=]{0,2}$"
    if re.match(base64_pattern, sb):
        return True
    return False

# check if the base64 string belong to an image
def is_image_data(b64data):
    """
    Check if the base64 data is an image by looking at the start of the data

    b64data: base64 encoded string (or bytes).
    Returns True when the decoded data starts with a JPEG, PNG or GIF signature,
    or is a RIFF container whose form type is WEBP. Returns False otherwise,
    including when the input cannot be base64-decoded.
    """
    # Signatures that identify the format from the leading bytes alone
    image_signatures = {
        b"\xff\xd8\xff": "jpg",
        b"\x89\x50\x4e\x47\x0d\x0a\x1a\x0a": "png",
        b"\x47\x49\x46\x38": "gif",
    }
    try:
        # 12 bytes are needed: a WebP file is 'RIFF' + 4-byte size + 'WEBP'
        header = base64.b64decode(b64data)[:12]
    except Exception:
        return False

    if any(header.startswith(signature) for signature in image_signatures):
        return True

    # 'RIFF' alone is shared with non-image containers such as WAV and AVI,
    # so the form type at bytes 8-11 must also say 'WEBP'.
    return header[:4] == b"\x52\x49\x46\x46" and header[8:12] == b"WEBP"

# split base64-encoded images and texts
def split_image_text_types(docs):
    """
    Separate retrieved items into base64 images and plain texts.

    docs: iterable of retrieved items; each may be a Document, bytes, or str.
    Returns {"images": [...], "texts": [...]} where both lists hold str values.
    An item counts as an image when it looks like base64 and decodes to a
    known image signature; everything else is treated as text.
    """
    b64_images = []
    texts = []
    for doc in docs:
        # Unwrap a Document to its content first
        content = doc.page_content if isinstance(doc, Document) else doc
        # Only byte payloads need decoding. Document.page_content is already a
        # str, and calling .decode() on a str raises AttributeError.
        if isinstance(content, (bytes, bytearray)):
            content = content.decode('utf-8')
        if looks_like_base64(content) and is_image_data(content):
            b64_images.append(content)
        else:
            texts.append(content)
    return {"images": b64_images, "texts": texts}

In [10]:
# This section of the code is used if the vector store and document store were populated previously.
# None of the code prior to this section has to be executed if all the documents were loaded previously.
# Hence, the libraries are declared here, assuming the first cell has not been run.

from langchain.vectorstores import Chroma
import chromadb
from langchain_community.embeddings import OllamaEmbeddings 
from langchain_community.storage import RedisStore # document store
from langchain_community.utilities.redis import get_client
from langchain.retrievers.multi_vector import MultiVectorRetriever

# Reopen the Chroma collection that the ingestion cells persisted on disk.
# The path must match the persist_directory ("./privatedb") used when the
# collection was created. PersistentClient() without a path opens its own
# default location instead, so the retriever would see an empty collection.
persistent_client = chromadb.PersistentClient(path="./privatedb")
embeddings = OllamaEmbeddings(model="nomic-embed-text")

vectorstore = Chroma(
    client=persistent_client,
    collection_name="private_rag",
    embedding_function=embeddings,
)

# Metadata key linking each summary in the vector store to its raw document
id_key = "doc_id"

client = get_client('redis://localhost:6379')
docstore = RedisStore(client=client) # you can use filestore, memorystory, any other DB store also

# Create the multi-vector retriever
retriever_multi_vector = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=docstore,
    id_key=id_key,
)

def multimodal_rag_qa(query):
    """
    Run the multimodal RAG chain on `query` and return only the generated answer.

    NOTE: `multimodal_rag_w_sources` is defined in a later cell, which also
    defines this same function again; this copy only works after that cell ran.
    """
    chain_output = multimodal_rag_w_sources.invoke({'input': query})
    answer = chain_output['answer']
    return answer

In [11]:
# The libraries are called again as there is a possibility that the first cell is not run
# as in the case where internal documents (PDF) are not required to load again

from operator import itemgetter
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_core.messages import HumanMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

# Builds the single prompt message sent to the model from the retrieved context and the question
def multimodal_prompt_function(data_dict):
    """
    Create a multimodal prompt with both text and image context.

    data_dict must provide:
      - data_dict["context"]["texts"]: list of text/table strings, joined with newlines
      - data_dict["context"]["images"]: list of base64 image strings (may be empty)
      - data_dict["question"]: the minimum specification to be matched

    Every image is written into the prompt as an "Image: data:image/jpeg;base64,..."
    line, followed by the instruction text containing the question and the
    joined texts. Returns a list holding one HumanMessage with that content.
    """
    context = data_dict["context"]
    formatted_texts = "\n".join(context["texts"])

    # One line per image; stays an empty string when there are no images
    image_lines = "".join(
        f"Image: data:image/jpeg;base64,{image}\n" for image in (context["images"] or [])
    )

    # Instruction text with the question and the textual context filled in
    text_message = f"""
        You are an pre-sales specialist well-versed with the company products based on the datasheet and brochure available and
        are competent in matching the requirements from potential user or clients with the available product specification. 
        You will be given Context documents which will be a mix of text, tables, and images usually of charts or graphs.
        Use this information to match the specification for provided Minimum specification with the relevant specification offered in Context documents.
        Your answer should specify a value or text extracted as a piece of information from Context document only. 
        DO NOT include preamble. 
        DO NOT give me the sources where you obtain your information or conclusion. 
        DO NOT ask further questions.
        DO NOT make up answers, use the provided context documents below and answer the question to the best of your ability. 
        Be as brief, complete and precise in your output as possible. 

        #Example: 
        Minimum specification: 'Weight: Maximum 3kg'
        Output: 'Minimum weight starts at 1.21kg'
        
        Let's start:

        Minimum specification:
        {data_dict['question']}

        Context documents:
        {formatted_texts}

        Output:
    """

    # Image lines first, then the instruction text
    return [HumanMessage(content=image_lines + text_message)]


# RAG chain: pick context and question from the input dict, build the prompt,
# call the LLM and return its reply as a string
multimodal_rag = (
    {"context": itemgetter('context'), "question": itemgetter('input')}
    | RunnableLambda(multimodal_prompt_function)
    | chatgpt
    | StrOutputParser()
)

# Retrieval step: take the query from 'input', fetch the matching raw elements
# and separate them into images and texts
retrieve_docs = (
    itemgetter('input')
    | retriever_multi_vector
    | RunnableLambda(split_image_text_types)
)

# Two chained `.assign` calls extend the input dict step by step: first the
# "context" key (filled by the retrieval step), then the "answer" key (filled
# by the RAG chain). The output therefore carries the retrieved context next
# to the answer generated by the LLM.
with_context = RunnablePassthrough.assign(context=retrieve_docs)
multimodal_rag_w_sources = with_context.assign(answer=multimodal_rag)

# entry point used for inference with the multimodal RAG chain
def multimodal_rag_qa(query):
    """Invoke multimodal_rag_w_sources with {'input': query} and return its 'answer' value"""
    chain_output = multimodal_rag_w_sources.invoke({'input': query})
    answer = chain_output['answer']
    return answer

In [17]:
# Declare the additional libraries the ensuing code will be using,
# assuming the first cell was not run because the documents were
# loaded from the vector store and document store persisted earlier

from langchain_community.llms import Ollama
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain_community.chat_models import ChatOllama
from langchain_community.embeddings import OllamaEmbeddings
from langchain.vectorstores import Chroma
import pandas as pd
import ollama
import time

In [15]:
# This is for basic RAG to load, split and embed the documents

def createstorefromdoc(path):
    """
    Build the basic-RAG vector store from every PDF found in `path`.

    The PDFs are loaded with PyPDFDirectoryLoader, split into chunks of 1000
    characters with an overlap of 100, embedded with the nomic-embed-text
    Ollama model and written to a Chroma store in the 'basicrag_db' directory.

    path: directory containing the PDF files.
    Returns the elapsed time in seconds (also printed).
    """
    start_time = time.time()
    loader = PyPDFDirectoryLoader(path)
    documents = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    splits = text_splitter.split_documents(documents)

    embeddings = OllamaEmbeddings(model="nomic-embed-text")
    persist_directory = 'basicrag_db'
    # No explicit vectorstore.persist() call: on Chroma 0.4+ (which this notebook
    # already needs for chromadb.PersistentClient) a store created with a
    # persist_directory is written to disk automatically, and persist() is
    # deprecated and only emits a warning.
    Chroma.from_documents(documents=splits,
                          embedding=embeddings,
                          persist_directory=persist_directory)
    elapsed = time.time() - start_time
    print(f"--- {elapsed} seconds ---")

    return elapsed

In [18]:
# Build the basic-RAG vector store from the datasheet folder.
# Skip this cell when the same vector store was already persisted by an earlier run.
createstorefromdoc(path='./datasheet')

--- 20.075923919677734 seconds ---


  vectorstore.persist()


20.075923919677734

In [21]:
# Reopens the basic-RAG vector store that was persisted earlier

def loadvectorstore():
    """Return the Chroma store in 'basicrag_db', opened with nomic-embed-text Ollama embeddings"""
    return Chroma(
        embedding_function=OllamaEmbeddings(model="nomic-embed-text"),
        persist_directory='basicrag_db',
    )

In [19]:
# This function provide the inference for basic RAG
# It accepts two parameters, vector store and the input
# In this case, it will be a cell from excel file

# Joins the text of the retrieved chunks into one context string
def format_docs(docs):
    """Return the page_content of every item in docs, separated by a blank line"""
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)

# Basic RAG: retrieve the closest chunks and let the LLM answer from them
def basic_rag(vectorstore, question):
    """
    Answer `question` using the 5 most similar chunks of `vectorstore` as context.

    vectorstore: vector store exposing as_retriever().
    question: the minimum-specification text to be matched.
    Returns the text of the reply from the llama3.1:8b model called through ollama.chat.
    """
    # System instructions; identical for every question
    system_prompt = """
        You are an pre-sales specialist well-versed with the company products based on the datasheet and brochure available and
        are competent in matching the requirements from potential user or clients with the available product specification. 
        You will be given Context documents which will be a mix of text, tables, and images usually of charts or graphs.
        Use this information to match the specification for provided Minimum specification with the relevant specification offered in Context documents.
        Your answer should specify a value or text extracted as a piece of information from Context document only. 
        DO NOT include preamble. 
        DO NOT give me the sources where you obtain your information or conclusion. 
        DO NOT ask further questions.
        DO NOT make up answers, use the provided context documents below and answer the question to the best of your ability. 
        Be as brief, complete and precise in your output as possible. 

        """

    # Retrieve the 5 nearest chunks and merge them into one context string
    top_chunks = vectorstore.as_retriever(search_kwargs={"k": 5}).invoke(question)
    formatted_context = format_docs(top_chunks)

    # User turn: the question followed by the retrieved context
    user_prompt = f"Minimum specification: {question}\n\nContext documents: {formatted_context}"

    chat_messages = [
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': user_prompt},
    ]
    response = ollama.chat(model='llama3.1:8b', messages=chat_messages)

    return response['message']['content']

In [22]:
# This is the final section where the inference output from multimodal and basic RAG
# is written into separate columns of the Excel file for comparison

# Load the Excel file
df = pd.read_excel('Sample-TenderDoc.xlsx')

# placeholders for the responses from multimodal and basic RAG
mm_responses = []
basic_responses = []
vstore = loadvectorstore()

# Generate one response per entry of the 'Minimum Specification' column.
# Only that column is read: the previous per-row lookup of 'Expected Response'
# was never used and made the loop fail on sheets without that column.
for min_spec in df['Minimum Specification']:
    query = f"Based on the datasheet, what is the closest specification that could meet or exceed this technical requirement: {min_spec} ?"

    # Generate basic RAG response
    basic_responses.append(basic_rag(vstore, query))

    # Generate multimodal RAG response
    mm_responses.append(multimodal_rag_qa(query))

# Add the new columns to the DataFrame
df['Multimodal RAG'] = mm_responses
df['Basic RAG'] = basic_responses

# Save the updated DataFrame to a new Excel file
df.to_excel('ProcessedFile.xlsx', index=False)


In [24]:
# The following is the inference code for multimodal RAG only, kept as a reference

import pandas as pd

# Load the Excel file
df = pd.read_excel('Sample-TenderDoc.xlsx')

# Generate one multimodal response per entry of the 'Minimum Specification' column.
# Only that column is read; the 'Expected Response' value was looked up but never used.
generated_responses = []

for min_spec in df['Minimum Specification']:
    query = f"Based on the datasheet, what is the closest specification that could meet or exceed this technical requirement: {min_spec} ?"

    # Generate response
    generated_responses.append(multimodal_rag_qa(query))

# Add the new column to the DataFrame
df['Multimodal RAG'] = generated_responses

# Save to a separate file: writing to 'ProcessedFile.xlsx' would overwrite the
# multimodal-vs-basic comparison produced by the previous cell and lose its
# 'Basic RAG' column.
df.to_excel('ProcessedFile-MultimodalOnly.xlsx', index=False)


Generated response for row 0: Intel 

Processor type
Generated response for row 1: 2nd Generation Intel® Xeon Scalable processors up to 28 cores per processor
Generated response for row 2: Up to two 2nd Generation Intel Xeon Scalable processors
Generated response for row 3: Internal Controllers: PERC H330, H730P, H740P, HBA330
Generated response for row 4: Up to 128TB
Generated response for row 5: Up to 24 DDR4 DIMM slots
Generated response for row 6: This is a detailed spec sheet for the PowerEdge R740 server from Dell EMC. Here are some of the key features and specifications:

**Key Features**

* Persistent Memory NVDIMM-N can increase database performance by 10x
* Wipe all data from storage media quickly and securely with System Erase
* Support for up to three 300W or six 150W GPUs

**Specifications**

* Max 128TB SATA HDD storage
* Optional DVD-ROM, DVD+RW drive
* Titanium 750W power supply (platinum 495W, 750W also available)
* Hot plug power supplies with full redundancy (750W 24