In [4]:
# code updated on 3 October 2024

from langchain_community.document_loaders import UnstructuredPDFLoader

doc = './HP-datasheet.pdf'

# Partition the PDF into elements with Unstructured (takes 1 min on Colab).
loader = UnstructuredPDFLoader(
    file_path=doc,
    mode='elements',
    strategy='hi_res',
    # figure and table extraction
    extract_images_in_pdf=True,
    image_output_dir_path='./figures',
    infer_table_structure=True,
    # section-based chunking
    chunking_strategy="by_title",
    max_characters=4000,  # max size of chunks
    new_after_n_chars=4000,  # preferred size of chunks
    combine_text_under_n_chars=2000,  # smaller chunks < 2000 chars will be combined into a larger chunk
)
data = loader.load()

Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
from IPython.display import HTML, display, Markdown

In [6]:


from langchain_community.llms import Ollama

# Initialize the model
# NOTE: despite its name, `chatgpt` is a llama3.1:8b model served through Ollama, not an OpenAI model.
chatgpt = Ollama(model="llama3.1:8b")


In [7]:
# Split the loaded elements into text chunks (`docs`) and tables (`tables`)
# according to the 'category' entry of each element's metadata.
docs = []
tables = []

# The loop variable is named `element` (not `doc`) so that it does not overwrite
# the module-level `doc` PDF path defined in the loader cell.
for element in data:
    # .get() so that an element without a 'category' entry is skipped instead of raising KeyError
    category = element.metadata.get('category')
    if category == 'Table':
        tables.append(element)
    elif category == 'CompositeElement':
        docs.append(element)

In [8]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough

# Prompt used to turn a raw text chunk or table into a retrieval-oriented summary
prompt_text = """
You are an assistant tasked with summarizing tables and text particularly for semantic retrieval.
These summaries will be embedded and used to retrieve the raw text or table elements
Give a detailed summary of the table or text below that is well optimized for retrieval.
For any tables also add in a one line description of what the table is about besides the summary.
Do not add additional words like Summary: etc.

Table or text chunk:
{element}
"""
prompt = ChatPromptTemplate.from_template(prompt_text)

# Summary chain: the input string is placed under the "element" key, rendered
# into the prompt, sent to the LLM, and the reply is returned as text.
summarize_chain = (
    {"element": RunnablePassthrough()}
    | prompt
    | chatgpt
    | StrOutputParser()  # extracts the response as text and returns it as a string
)

# Raw contents to summarize
text_docs = [doc.page_content for doc in docs]
table_docs = [table.page_content for table in tables]

# The summaries are bound only from the batch results. No empty-list placeholders
# are created first: if a batch call failed, such placeholders would remain and the
# ingestion step (which skips empty summary lists) would silently index nothing.
text_summaries = summarize_chain.batch(text_docs, {"max_concurrency": 5})
table_summaries = summarize_chain.batch(table_docs, {"max_concurrency": 5})

len(text_summaries), len(table_summaries)

(9, 2)

In [9]:
import base64
import os
import time

from langchain_core.messages import HumanMessage
from langchain_community.chat_models import ChatOllama

def encode_image(image_path):
    """Read the file at ``image_path`` in binary mode and return its content as a base64 text string."""
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")


def image_summarize(img_base64, prompt):
    """
    Ask the ``llava-llama3`` chat model (via ChatOllama, temperature 0) to summarize one image.

    img_base64: base64-encoded image data; it is sent as a ``data:image/jpeg`` URL
    prompt: instruction text sent alongside the image
    Returns the ``content`` of the model's reply.
    """
    # The request is a single human message with two parts: the instruction and the image.
    text_part = {"type": "text", "text": prompt}
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
    }
    request = HumanMessage(content=[text_part, image_part])

    vision_model = ChatOllama(model="llava-llama3", temperature=0)
    reply = vision_model.invoke([request])
    return reply.content


def generate_img_summaries(path):
    """
    Generate summaries and base64 encoded strings for images
    path: Path to list of .jpg files extracted by Unstructured

    Returns a tuple ``(img_base64_list, image_summaries)`` of two parallel lists,
    ordered by sorted file name. Both lists are empty when ``path`` is not an
    existing directory or contains no JPEG files.
    """

    # Prompt
    prompt = """You are an assistant tasked with summarizing images for retrieval.
                Remember these images could potentially contain graphs, charts or tables also.
                These summaries will be embedded and used to retrieve the raw image for question answering.
                Give a detailed summary of the image that is well optimized for retrieval.
                Do not add additional words like Summary: etc.
             """

    # Store base64 encoded images
    img_base64_list = []

    # Store image summaries
    image_summaries = []

    # The directory may be missing (e.g. when no figures were written); treat
    # that as "no images" rather than failing with FileNotFoundError.
    if not os.path.isdir(path):
        return img_base64_list, image_summaries

    # Apply to images
    for img_file in sorted(os.listdir(path)):
        # Case-insensitive match, accepting both common JPEG extensions
        if not img_file.lower().endswith((".jpg", ".jpeg")):
            continue
        img_path = os.path.join(path, img_file)
        base64_image = encode_image(img_path)
        img_base64_list.append(base64_image)
        image_summaries.append(image_summarize(base64_image, prompt))

    return img_base64_list, image_summaries


# Summarize every extracted figure and report how long it took
IMG_PATH = './figures'

start_time = time.time()
imgs_base64, image_summaries = generate_img_summaries(IMG_PATH)
elapsed_seconds = time.time() - start_time
print(f"--- {elapsed_seconds} seconds ---")

--- 44.594937801361084 seconds ---


In [10]:
from langchain_community.embeddings import OllamaEmbeddings

# Embedding model (nomic-embed-text via Ollama); passed to Chroma as its embedding function
embeddings = OllamaEmbeddings(model="nomic-embed-text")

In [4]:
# Code from Langchain for calling chromadb
# Re-opens the already persisted "private_rag" collection.
# Do not run on the first pass; use it instead of the cell that creates the store
# once the store has been persisted.
from langchain.vectorstores import Chroma
import chromadb

# The client must point at the same directory that the store-creating cell passes
# as `persist_directory` ("./privatedb"). A bare PersistentClient() falls back to
# chromadb's default path and would open a different, empty store.
persistent_client = chromadb.PersistentClient(path="./privatedb")
collection = persistent_client.get_or_create_collection("private_rag")

chroma_db = Chroma(
    client=persistent_client,
    collection_name="private_rag",
    embedding_function=embeddings,
)

  chroma_db = Chroma(


In [11]:
# Run this cell on the first pass only: it creates the Chroma store that is populated afterwards.
# DO NOT run it on the second or any subsequent pass if the store has already been persisted.

from langchain.vectorstores import Chroma

# Collection "private_rag", persisted under ./privatedb
chroma_db = Chroma(
    collection_name="private_rag",
    embedding_function=embeddings,
    persist_directory="./privatedb",
    # collection_metadata={"hnsw:space": "cosine"},
)

  chroma_db = Chroma(


In [12]:
# Run this code for the first time

import uuid

from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain_community.storage import RedisStore
from langchain_community.utilities.redis import get_client
# from langchain_chroma import Chroma
from langchain_core.documents import Document
# from langchain_openai import OpenAIEmbeddings


def create_multi_vector_retriever(
    docstore, vectorstore, text_summaries, texts, table_summaries, tables, image_summaries, images
):
    """
    Create retriever that indexes summaries, but returns raw images or texts

    Each summary is added to ``vectorstore`` with a ``doc_id`` metadata value, and
    the matching raw content is written to ``docstore`` under that same id.

    docstore / vectorstore: the stores handed to MultiVectorRetriever
    text_summaries, texts: summaries and their raw text chunks (same length, same order)
    table_summaries, tables: summaries and their raw tables (same length, same order)
    image_summaries, images: summaries and their raw images (same length, same order)

    Returns the MultiVectorRetriever.
    Raises ValueError when a summary list and its content list differ in length.

    A category whose summary list is empty is skipped, so passing empty lists
    returns a retriever over the existing stores without adding anything.
    """

    id_key = "doc_id"

    # Create the multi-vector retriever
    retriever = MultiVectorRetriever(
        vectorstore=vectorstore,
        docstore=docstore,
        id_key=id_key,
    )

    # Helper function to add documents to the vectorstore and docstore
    def add_documents(retriever, doc_summaries, doc_contents):
        # Summaries and contents are paired by position; a length mismatch would
        # link summaries to the wrong content (or leave content without a summary).
        if len(doc_summaries) != len(doc_contents):
            raise ValueError(
                f"Got {len(doc_summaries)} summaries for {len(doc_contents)} contents; "
                "both lists must have the same length"
            )
        # Ids are derived from position + content instead of being random, and are
        # also used as the vector-store ids. Calling this function again with the
        # same content then reuses the same ids, so stores that upsert by id (such
        # as Chroma) overwrite the earlier entries instead of accumulating duplicates.
        doc_ids = [
            str(uuid.uuid5(uuid.NAMESPACE_OID, f"{position}:{content}"))
            for position, content in enumerate(doc_contents)
        ]
        summary_docs = [
            Document(page_content=summary, metadata={id_key: doc_id})
            for doc_id, summary in zip(doc_ids, doc_summaries)
        ]
        retriever.vectorstore.add_documents(summary_docs, ids=doc_ids)
        retriever.docstore.mset(list(zip(doc_ids, doc_contents)))

    # Add texts, tables, and images; each category is skipped when its summary list is empty
    if text_summaries:
        add_documents(retriever, text_summaries, texts)
    if table_summaries:
        add_documents(retriever, table_summaries, tables)
    if image_summaries:
        add_documents(retriever, image_summaries, images)

    return retriever


# Storage layer for the raw images, text and tables: Redis at localhost:6379
# (you can use filestore, memorystore, or any other DB store also)
client = get_client('redis://localhost:6379')
redis_store = RedisStore(client=client)

# Build the retriever; arguments are passed by keyword to make the pairing explicit
retriever_multi_vector = create_multi_vector_retriever(
    docstore=redis_store,
    vectorstore=chroma_db,
    text_summaries=text_summaries,
    texts=text_docs,
    table_summaries=table_summaries,
    tables=table_docs,
    image_summaries=image_summaries,
    images=imgs_base64,
)

In [13]:
# Multimodal RAG

from IPython.display import HTML, display, Image
from PIL import Image
import base64
from io import BytesIO

def plt_img_base64(img_base64):
    """Display a base64 encoded string as an image"""
    decoded_bytes = base64.b64decode(img_base64)
    # `Image` is PIL's Image here: the `from PIL import Image` line in this cell
    # comes after, and therefore replaces, the Image imported from IPython.display.
    picture = Image.open(BytesIO(decoded_bytes))
    display(picture)

In [14]:
import re
import base64

def looks_like_base64(sb):
    """Return True when ``sb`` is made of base64 alphabet characters followed by at most two '=' signs."""
    found = re.match("^[A-Za-z0-9+/]+[=]{0,2}$", sb)
    if found is None:
        return False
    return True


def is_image_data(b64data):
    """
    Check if the base64 data is an image by looking at the start of the data

    Recognizes JPEG, PNG, GIF and WebP by their leading signature bytes.
    Returns True for a recognized image signature, False otherwise, including
    when ``b64data`` cannot be base64-decoded.
    """
    try:
        # 12 bytes are enough for the longest check (WebP: "RIFF" + 4 size bytes + "WEBP")
        header = base64.b64decode(b64data)[:12]
    except (ValueError, TypeError):
        # binascii.Error (a ValueError subclass) for malformed base64,
        # TypeError for inputs that are not str / bytes-like
        return False

    # RIFF is a generic container (also used by WAV and AVI); it is WebP only
    # when the form type at bytes 8-11 says so.
    if header[:4] == b"\x52\x49\x46\x46":
        return header[8:12] == b"WEBP"

    fixed_signatures = (
        b"\xff\xd8\xff",  # jpg
        b"\x89\x50\x4e\x47\x0d\x0a\x1a\x0a",  # png
        b"\x47\x49\x46\x38",  # gif
    )
    return header.startswith(fixed_signatures)


def split_image_text_types(docs):
    """
    Split base64-encoded images and texts

    docs: retrieved items; each may be a Document, a bytes value, or a str.
    Returns ``{"images": [...], "texts": [...]}`` where every entry is a str.
    An item goes to "images" when it both looks like base64 and decodes to a
    recognized image signature; everything else goes to "texts".
    """
    b64_images = []
    texts = []
    for doc in docs:
        # Unwrap Document objects to their page_content
        content = doc.page_content if isinstance(doc, Document) else doc
        # Only byte values need decoding; str values (e.g. Document.page_content)
        # have no .decode() and are used as they are.
        if isinstance(content, (bytes, bytearray)):
            content = content.decode('utf-8')
        if looks_like_base64(content) and is_image_data(content):
            b64_images.append(content)
        else:
            texts.append(content)
    return {"images": b64_images, "texts": texts}

In [15]:
import uuid

from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain_community.storage import RedisStore
from langchain_community.utilities.redis import get_client
# from langchain_chroma import Chroma
from langchain_core.documents import Document
# from langchain_openai import OpenAIEmbeddings


def create_multi_vector_retriever(
    docstore, vectorstore, text_summaries, texts, table_summaries, tables, image_summaries, images
):
    """
    Create retriever that indexes summaries, but returns raw images or texts

    Each summary is added to ``vectorstore`` with a ``doc_id`` metadata value, and
    the matching raw content is written to ``docstore`` under that same id.

    docstore / vectorstore: the stores handed to MultiVectorRetriever
    text_summaries, texts: summaries and their raw text chunks (same length, same order)
    table_summaries, tables: summaries and their raw tables (same length, same order)
    image_summaries, images: summaries and their raw images (same length, same order)

    Returns the MultiVectorRetriever.
    Raises ValueError when a summary list and its content list differ in length.

    A category whose summary list is empty is skipped, so passing empty lists
    returns a retriever over the existing stores without adding anything.
    """

    id_key = "doc_id"

    # Create the multi-vector retriever
    retriever = MultiVectorRetriever(
        vectorstore=vectorstore,
        docstore=docstore,
        id_key=id_key,
    )

    # Helper function to add documents to the vectorstore and docstore
    def add_documents(retriever, doc_summaries, doc_contents):
        # Summaries and contents are paired by position; a length mismatch would
        # link summaries to the wrong content (or leave content without a summary).
        if len(doc_summaries) != len(doc_contents):
            raise ValueError(
                f"Got {len(doc_summaries)} summaries for {len(doc_contents)} contents; "
                "both lists must have the same length"
            )
        # Ids are derived from position + content instead of being random, and are
        # also used as the vector-store ids. Calling this function again with the
        # same content then reuses the same ids, so stores that upsert by id (such
        # as Chroma) overwrite the earlier entries instead of accumulating duplicates.
        doc_ids = [
            str(uuid.uuid5(uuid.NAMESPACE_OID, f"{position}:{content}"))
            for position, content in enumerate(doc_contents)
        ]
        summary_docs = [
            Document(page_content=summary, metadata={id_key: doc_id})
            for doc_id, summary in zip(doc_ids, doc_summaries)
        ]
        retriever.vectorstore.add_documents(summary_docs, ids=doc_ids)
        retriever.docstore.mset(list(zip(doc_ids, doc_contents)))

    # Add texts, tables, and images; each category is skipped when its summary list is empty
    if text_summaries:
        add_documents(retriever, text_summaries, texts)
    if table_summaries:
        add_documents(retriever, table_summaries, tables)
    if image_summaries:
        add_documents(retriever, image_summaries, images)

    return retriever


# Storage layer for the raw images, text and tables: Redis at localhost:6379
# (you can use filestore, memorystore, or any other DB store also)
# NOTE(review): this cell repeats the retriever construction of the earlier
# "first time" cell - verify that running both in one session does not index
# the same content twice.
client = get_client('redis://localhost:6379')
redis_store = RedisStore(client=client)

# Build the retriever; arguments are passed by keyword to make the pairing explicit
retriever_multi_vector = create_multi_vector_retriever(
    docstore=redis_store,
    vectorstore=chroma_db,
    text_summaries=text_summaries,
    texts=text_docs,
    table_summaries=table_summaries,
    tables=table_docs,
    image_summaries=image_summaries,
    images=imgs_base64,
)

In [36]:
from operator import itemgetter
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_core.messages import HumanMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

def multimodal_prompt_function(data_dict):
    """
    Build the prompt message for the answering model from retrieved context.

    data_dict must provide:
      - data_dict["context"]["texts"]: text / table strings, joined with newlines
      - data_dict["context"]["images"]: base64 image strings (may be empty)
      - data_dict["question"]: the user question

    Returns a one-element list holding a HumanMessage whose content is a single
    string: one "Image: data:image/jpeg;base64,..." line per image, followed by
    the instruction text with the question and the joined context.

    NOTE(review): images are inlined as text inside the message string; whether
    the downstream model can make use of them in that form is not established
    here - confirm.
    """
    context = data_dict["context"]
    formatted_texts = "\n".join(context["texts"])

    # One line per retrieved image; nothing is added when there are no images
    image_lines = "".join(
        f"Image: data:image/jpeg;base64,{image}\n" for image in (context["images"] or ())
    )

    # Instructions, few-shot examples, question and context
    text_message = f"""
        You are an pre-sales specialist well-versed with the company products based on the datasheet and brochure available and
        are competent in matching the requirements from potential user or clients with the available product specification. 
        You will be given context information below which will be a mix of text, tables, and images usually of charts or graphs.
        Use this information to provide answers related to the question which should be related to minimum specification required by the user.
        Your answer should specify a value or text extracted from given document only. 
        DO NOT include preamble or your own analysis. 
        DO NOT give me the sources where you obtain your information or conclusion. 
        DO NOT make up answers, use the provided context documents below and answer the question to the best of your ability. 

        #Example 1: 
        User question: 'Weight: Maximum 3kg'
        Your answer: 'Minimum weight starts at 1.21kg'
        
        #Example 2:
        User question: 'Accessories: laptop bag'
        answer: 'Accessories: not provided'

        #Example 3:
        User question: 'Application Software: Microsoft Office'
        answer: 'Application Software: not provided'

        Let's start:

        User question:
        {data_dict['question']}

        Context documents:
        {formatted_texts}

        Answer:
    """

    # Image lines first, then the instruction text, as one message
    return [HumanMessage(content=image_lines + text_message)]


# RAG chain: pick "context" and "input" from the incoming dict, build the
# prompt message, call the LLM and return its reply as a string
multimodal_rag = (
    {"context": itemgetter('context'), "question": itemgetter('input')}
    | RunnableLambda(multimodal_prompt_function)
    | chatgpt
    | StrOutputParser()
)

# Retrieval chain: pass the input query to the retriever, then split the
# returned elements into images and texts
retrieve_docs = (
    itemgetter('input')
    | retriever_multi_vector
    | RunnableLambda(split_image_text_types)
)

# Chained `.assign` calls take the input dict and successively add the keys
# "context" and "answer", each computed at runtime by the given runnable.
# The output therefore carries the retrieved context alongside the answer
# generated by the model.
multimodal_rag_w_sources = RunnablePassthrough.assign(context=retrieve_docs).assign(
    answer=multimodal_rag
)



In [55]:
## Superseded by the function of the same name defined further below.
## The later function specifically produces JSON output.

def multimodal_rag_qa(query):
    """
    Run the multimodal RAG chain for ``query``, print the answer and return it.

    query: the question text, passed to the chain under the 'input' key
    Returns the chain's 'answer' value. It is returned as well as printed
    because callers such as the evaluation loop use the result as a string.
    """
    response = multimodal_rag_w_sources.invoke({'input': query})
    answer = response['answer']
    print(answer)
    return answer

In [22]:
# Judge prompt template. Placeholders: {instruction}, {response}, {reference_answer}.
# It asks for written feedback followed by "[RESULT]" and an integer score from 1 to 3.
EVALUATION_PROMPT = """You are a fair evaluator language model.

You will be given an instruction, a response to evaluate, a reference answer that gets a score of 3, and a score rubric representing a evaluation criteria are given.
1. Write a detailed feedback that assess the quality of the response strictly based on the given score rubric, not evaluating in general.
2. After writing a feedback, write a score that is an integer between 1 and 3. You should refer to the score rubric.
3. The output format should look as follows: \"Feedback: {{write a feedback for criteria}} [RESULT] {{an integer number between 1 and 3}}\"
4. Please do not generate any other opening, closing, and explanations. Be sure to include [RESULT] in your output.
5. Do not score conciseness: a correct answer that covers the question should receive max score, even if it contains additional useless information.

The instruction to evaluate:
{instruction}

Response to evaluate:
{response}

Reference Answer (Score 3):
{reference_answer}

Score Rubrics:
[Is the response complete, accurate, and factual based on the reference answer?]
Score 1: The response is completely incomplete, inaccurate, and/or not factual.
Score 2: The response is somewhat complete, accurate, and/or factual.
Score 3: The response is completely complete, accurate, and/or factual.

Feedback:"""

In [41]:
from langchain_core.prompts import ChatPromptTemplate
# Prompt template built from the module-level EVALUATION_PROMPT string
eval_prompt = ChatPromptTemplate.from_template(EVALUATION_PROMPT)

from langchain_community.llms import Ollama

# Initialize the model
# Judge model (llama3.1:8b via Ollama) that evaluate_response() calls to score an answer
evaluation_llm = Ollama(model="llama3.1:8b")


In [53]:
import pandas as pd
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI

# Load the Excel file
# The evaluation loop reads its 'Minimum Specification' and 'Expected Response' columns
df = pd.read_excel('Sample-TenderDoc.xlsx')

# Define the evaluation function
def evaluate_response(instruction, response, reference_answer):
    """
    Ask the judge model to score ``response`` against ``reference_answer``.

    instruction: the question / instruction that was answered
    response: the generated answer to evaluate
    reference_answer: the answer that would deserve the top score of 3

    Returns the score as an int between 1 and 3.
    Raises ValueError when the model output contains no standalone 1, 2 or 3.
    """
    import re  # local import so the function does not depend on another cell's imports

    EVALUATION_PROMPT = """You are a fair evaluator language model.

    You will be given an instruction, a response to evaluate, a reference answer that gets a score of 3, and a score rubric representing a evaluation criteria are given.
    1. Write a score that is an integer between 1 and 3. You should refer to the score rubric.
    2. The output format should look as follows: "{{an integer number between 1 and 3}}"
    3. Please do not generate any other opening, closing, and explanations.
    4. Failure to give a numeric value in your output is unacceptable and will result in termination.
     
    The instruction to evaluate:
    {instruction}

    Response to evaluate:
    {response}

    Reference Answer (Score 3):
    {reference_answer}

    Score Rubrics:
    [Is the response complete, accurate, and factual based on the reference answer?]
    Score 1: The response is completely incomplete, inaccurate, and/or not factual.
    Score 2: The response is somewhat complete, accurate, and/or factual.
    Score 3: The response is completely complete, accurate, and/or factual.
    """

    eval_prompt = ChatPromptTemplate.from_template(EVALUATION_PROMPT)
    prompt = eval_prompt.format(instruction=instruction, response=response, reference_answer=reference_answer)

    # .invoke() is the supported call style; calling the LLM object directly is deprecated
    evaluation = evaluation_llm.invoke(prompt)
    parsed_output = StrOutputParser().parse(evaluation).strip()

    # Extract the score from the parsed output.
    # The first standalone digit 1-3 is taken, wherever it appears, so replies
    # such as '"3"', '3.' or 'Score: 2' are accepted. A digit that is part of a
    # longer number (e.g. '13') does not count.
    score_match = re.search(r"(?<!\d)[1-3](?!\d)", parsed_output)
    if score_match is None:
        raise ValueError(f"Invalid score value: {parsed_output}")

    return int(score_match.group())

# For every row of the tender sheet: ask the RAG chain for the closest matching
# specification, then have the judge model score the answer against the expected one.
# NOTE(review): relies on multimodal_rag_qa() returning the answer text - confirm.
generated_responses = []
evaluation_scores = []

for index, row in df.iterrows():
    min_spec, expected_response = row['Minimum Specification'], row['Expected Response']
    query = (
        "Based on the datasheet, what is the closest specification that could meet or exceed "
        f"this technical requirement: {min_spec} ?"
    )

    # Generate the answer and drop the leading label when the model echoes one
    generated_response = multimodal_rag_qa(query)
    prefix = 'Generated response for:'
    if generated_response.startswith(prefix):
        generated_response = generated_response[len(prefix):].strip()
    generated_responses.append(generated_response)

    # Score the answer; a reply without a usable score is recorded as None
    try:
        score = evaluate_response(query, generated_response, expected_response)
    except ValueError as e:
        print(f"Error evaluating response for row {index}: {e}")
        score = None  # or some default value
    evaluation_scores.append(score)

# Attach the results as new columns
df['Generated Response'] = generated_responses
df['Evaluation Score'] = evaluation_scores

# Write the augmented sheet to a new Excel file
df.to_excel('Updated-Sample-TenderDoc.xlsx', index=False)
