In [1]:
from unstructured.partition.pdf import partition_pdf

output_path = "./"
file_path = "p2.pdf"
unstructured_model_name = "yolox"  # yolox | layout_v1.1.0 | detectron2_onnx

# Partition the PDF into elements. Table elements get an HTML rendering in
# metadata.text_as_html, which the indexing cells further down rely on.
chunks = partition_pdf(
    filename=file_path,
    strategy="hi_res",  # layout-model based partitioning | alternatives: fast, auto, ocr_only
    hi_res_model_name=unstructured_model_name,
    infer_table_structure=True,  # Ensure tables are parsed
    extract_image_block_types=["Table"],  # Save a cropped image for every detected table
    # `image_output_dir_path` is not a current partition_pdf parameter and would be
    # silently swallowed by **kwargs; the supported option is the one below.
    extract_image_block_output_dir=output_path,  # Directory the table images are written to
    # extract_image_block_to_payload=True,  # Include base64 payload for API usage
    # chunking_strategy="by_title",  # Try by_title; switch to by_section or by_page if needed
    # max_characters=2000,  # Adjust based on content structure
    # combine_text_under_n_chars=500,  # Merge smaller text blocks
)

process_file_with_model: <function process_file_with_model at 0x17ada16c0>


In [2]:
# import json

# from unstructured.staging.base import elements_to_json

# filename = f"{file_path.split(".")[0]}_json"
# elements_to_json(chunks, filename=f"{filename}.json")


# def process_json_file(input_filename):
#     # Read the JSON file
#     with open(input_filename, "r") as file:
#         data = json.load(file)

#     # Iterate over the JSON data and extract required table elements
#     extracted_elements = []
#     for entry in data:
#         if entry["type"] == "Table":
#             extracted_elements.append(entry["metadata"]["text_as_html"])

#     # Write the extracted elements to the output file
#     with open("./output.txt", "w") as output_file:
#         for element in extracted_elements:
#             output_file.write(element + "\n\n")  # Adding two newlines for separation


# process_json_file(f"{filename}.json")  # Takes a while for the .txt

In [8]:
# Bucket the partitioned elements by the textual form of their class.
# NOTE(review): matching is done on substrings of str(type(...)), so only classes whose
# name contains "Table", "CompositeElement" or "Text" are kept — confirm that dropping
# every other element type is intended.
tables = []
texts = []

types = []
for chunk in chunks:
    types.append(str(type(chunk)))

    # The three checks are independent on purpose: an element is appended once per match.
    if "Table" in types[-1]:
        tables.append(chunk)
    if "CompositeElement" in types[-1]:
        texts.append(chunk)
    if "Text" in types[-1]:
        texts.append(chunk)

len(tables), len(texts)

(13, 349)

In [9]:
# # Get the images from the CompositeElement objects
# def get_images_base64(chunks):
#     images_b64 = []
#     for chunk in chunks:
#         if "CompositeElement" in str(type(chunk)):
#             chunk_els = chunk.metadata.orig_elements
#             for el in chunk_els:
#                 if "Image" in str(type(el)):
#                     images_b64.append(el.metadata.image_base64)
#     return images_b64


# images = get_images_base64(chunks)
# import base64
# from IPython.display import Image, display


# def display_base64_image(base64_code):
#     # Decode the base64 string to binary
#     image_data = base64.b64decode(base64_code)
#     # Display the image
#     display(Image(data=image_data))


# # display_base64_image(images[0])

In [10]:
import uuid

import faiss
import torch
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.schema.document import Document
from langchain.storage import InMemoryStore
from langchain.vectorstores import FAISS, Chroma
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_huggingface import HuggingFaceEmbeddings

# Release cached MPS memory only when the MPS backend is actually present, so the
# notebook still runs top-to-bottom on machines without Apple-silicon GPU support.
if torch.backends.mps.is_available():
    torch.mps.empty_cache()

In [11]:
# Sentence-embedding model handed to the FAISS wrapper below.
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"  # "sentence-transformers/all-mpnet-base-v2"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

# Metadata key under which every document's UUID is stored.
id_key = "doc_id"

# In-memory docstore that backs the FAISS vector store.
store = InMemoryDocstore()

# Size the flat L2 index from the length of a probe embedding so it always matches
# the dimensionality of the selected model.
index = faiss.IndexFlatL2(len(embeddings.embed_query("hello world")))

# FAISS-backed vector store, initially empty.
vectorstore = FAISS(
    index=index,
    docstore=store,
    embedding_function=embeddings,
    index_to_docstore_id={},
)

<!-- ### Load the summaries and link them to the original data -->

In [12]:
# One UUID per text element; the same UUID is stored in the Document metadata under
# `id_key` and later reused as the vector-store id.
text_doc_ids = [str(uuid.uuid4()) for _ in texts]
text_documents = [
    Document(page_content=element.text, metadata={id_key: doc_id})
    for element, doc_id in zip(texts, text_doc_ids)
]


# Index tables by their HTML rendering. `text_as_html` is optional metadata, so fall
# back to the table's plain text rather than handing Document a missing page_content.
tables_html = [table.metadata.text_as_html or table.text for table in tables]
table_doc_ids = [str(uuid.uuid4()) for _ in tables_html]
table_documents = [
    Document(page_content=html, metadata={id_key: doc_id})
    for html, doc_id in zip(tables_html, table_doc_ids)
]

['6eebab64-5070-45c4-886f-6a1993391890', '74672b5e-207d-4d44-aaa3-ee169f2c7865', '7ccb5b72-6e1b-4286-adc0-08733b2b5d4c', 'b378a957-c7f6-48ea-9da2-51c48608517c', 'a1898744-237f-48e9-8b21-a850a72e2c9b', '86c5cf5b-e4bc-4140-9827-e34a09a8ceb7', '395cb586-d8cb-4564-a598-5cf28bbd1673', '25d5b32e-aa0d-497c-af46-c9cf7bba034f', '3c106150-17d8-46fe-af0e-040e12e0f4ee', '6366cdab-c85c-45f6-ae00-9f028f2cb52e', 'ec8eceb2-03bc-42b7-a8ae-44d3d06ce1a8', 'ef10394d-ec24-4172-a689-66d16d3f3398', '57ec7ce6-76c2-4e85-aa60-0a02397f261b', 'c8e952d0-8cab-48f7-ab4d-85ca26db22ce', 'e2a965dc-bf2f-4e86-b9a7-b65e6aaf152c', '312bd458-3219-4155-98f2-3efa51abff03', '1e33cd70-e3e4-4312-92aa-d87f320a1ee8', '4de596ab-659e-48d1-8e4e-aacf1df39726', 'e8d661b1-1657-4620-a07b-9776fce92138', '10d2f9b9-41c9-4f3d-a2ae-d1812fb08255', 'a4c052cb-6335-4358-a444-c1583b476c07', '0b9eed1b-754e-432f-8d47-b7b645bdbb54', 'e5b46fa5-9ab4-4a4a-a242-ba77267a8899', '6cd835bc-cb4b-429c-977e-70c9452ef92b', '05380d45-527a-44ce-b186-9f5d9379244b',

In [None]:
# Retriever over the FAISS store with default search settings.
# Alternative: vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 1})
retriever = vectorstore.as_retriever()

# Index text chunks and table HTML, each under its pre-generated UUID.
# The retriever wraps `vectorstore`, so adding to the store directly is equivalent.
vectorstore.add_documents(documents=text_documents, ids=text_doc_ids)
vectorstore.add_documents(documents=table_documents, ids=table_doc_ids)
# vectorstore.add_documents(summary_tables)

In [None]:
from langchain_huggingface import HuggingFacePipeline
from transformers import pipeline


def initialize_llm(
    model_id="google/gemma-2-2b-it",
    pipeline_kwargs=None,
    # max_batch_size=4
):
    """Initializes Hugging Face LLM pipeline.

    Args:
        model_id: Model identifier passed to `transformers.pipeline` as `model`.
        pipeline_kwargs: Extra keyword arguments forwarded to `transformers.pipeline`.
            When omitted (or None), the sampling defaults defined below are used.

    Returns:
        A `HuggingFacePipeline` wrapping the created text-generation pipeline.
    """
    # Build the defaults per call instead of using a dict as the default argument,
    # which would be a single object shared by every invocation.
    if pipeline_kwargs is None:
        pipeline_kwargs = {
            "max_new_tokens": 100,
            "top_k": 50,
            "temperature": 0.5,
            "do_sample": True,
            # "device": "cpu",
        }
    hf_pipeline = pipeline(task="text-generation", model=model_id, **pipeline_kwargs)
    return HuggingFacePipeline(pipeline=hf_pipeline)


# Instantiate the LLM with initialize_llm's default model id and generation settings.
llm = initialize_llm()

In [56]:
from langchain_huggingface import ChatHuggingFace


def chat_bot_llm(llm):
    """Wrap `llm` in a ChatHuggingFace chat model and return the wrapper."""
    return ChatHuggingFace(llm=llm)


# Chat-style wrapper around the pipeline LLM; this is the model the question-answering cells invoke.
chat_llm = chat_bot_llm(llm=llm)

In [24]:
from langchain_core.runnables import RunnablePassthrough, RunnableLambda

from langchain_core.messages import SystemMessage, HumanMessage

from base64 import b64decode


def parse_docs(docs):
    """Split base64-encoded images and texts.

    Args:
        docs: Iterable of retrieved items. Items may be base64 strings/bytes or any
            other object (e.g. document objects).

    Returns:
        dict with two lists: "images" holds the items that are non-empty, strictly
        valid base64; "texts" holds everything else, in input order.
    """
    images = []
    texts = []
    for doc in docs:
        try:
            # validate=True rejects characters outside the base64 alphabet instead of
            # silently discarding them; without it, ordinary prose whose remaining
            # letters happen to form a multiple of four decodes "successfully".
            decoded = b64decode(doc, validate=True)
        except (TypeError, ValueError):
            # TypeError: not a str/bytes-like object; ValueError (incl. binascii.Error):
            # not valid base64 or not ASCII.
            texts.append(doc)
        else:
            # An empty payload is not an image; keep it with the texts.
            (images if decoded else texts).append(doc)
    return {"images": images, "texts": texts}


def build_prompt(kwargs):
    """Build the chat prompt for one question from the retrieved context.

    Args:
        kwargs: dict with "context" (a dict holding a "texts" list whose items expose
            `page_content`) and "question" (the user's question).

    Returns:
        A ChatPromptTemplate containing a single HumanMessage with the filled-in prompt.
    """
    docs_by_type = kwargs["context"]
    user_question = kwargs["question"]

    # Separate chunks with a blank line; plain concatenation glued the last word of one
    # chunk to the first word of the next. An empty "texts" list yields "".
    context_text = "\n\n".join(
        text_element.page_content for text_element in docs_by_type["texts"]
    )

    # construct prompt with context (including images)
    prompt_template = f"""
    Answer the question based only on the following context, which can include text, tables.
    Context: {context_text}
    Question: {user_question}
    """

    prompt_content = [{"type": "text", "text": prompt_template}]

    # if len(docs_by_type["images"]) > 0:
    #     for image in docs_by_type["images"]:
    #         prompt_content.append(
    #             {
    #                 "type": "image_url",
    #                 "image_url": {"url": f"data:image/jpeg;base64,{image}"},
    #             }
    #         )

    return ChatPromptTemplate.from_messages(
        [
            HumanMessage(content=prompt_content),
        ]
    )


# chain = (
#     {
#         "context": retriever | RunnableLambda(parse_docs),
#         "question": RunnablePassthrough(),
#     }
#     | RunnableLambda(build_prompt)
#     | chat_llm
#     | StrOutputParser()
# )

# chain_with_sources = {
#     "context": retriever | RunnableLambda(parse_docs),
#     "question": RunnablePassthrough(),
# } | RunnablePassthrough().assign(
#     response=(RunnableLambda(build_prompt) | llm | StrOutputParser())
# )

In [57]:
# Sample question for the single-question cells of this notebook.
question = "What is the total Sum Insured amount in the policy?"

In [97]:
def get_answer(question):
    """Answer `question` from the indexed document.

    Retrieves documents for the question, builds the prompt from them and invokes the
    chat model.

    Args:
        question: The user's question as a string.

    Returns:
        The content of the chat model's reply message.
    """
    docs = retriever.invoke(question)
    context = parse_docs(docs)
    prompt = build_prompt({"context": context, "question": question})
    llm_res = chat_llm.invoke(prompt.messages)
    # pretty_print() only prints and returns None, which made callers that print the
    # result show "None"; return the reply content instead.
    return llm_res.content

In [59]:
# Build the prompt in this cell: `prompt` otherwise only exists as a local variable
# inside get_answer, so this cell raised NameError on a fresh kernel.
docs = retriever.invoke(question)
prompt = build_prompt({"context": parse_docs(docs), "question": question})
llm_res = chat_llm.invoke(prompt.messages)
# llm_res = chat_llm.stream(prompt.messages)
# llm_res

In [None]:
# Print the model reply held in `llm_res` in a human-readable form.
llm_res.pretty_print()

In [96]:
# Batch of policy questions that the loop cell feeds to get_answer one by one.
questions = [
    "List of Insured Members?",
    "What is the Room Rent amount or room type included in the policy?",
    "What is the Maternity Sum capping or sum insured?",
    "What is the policy start date?",
    "Sum insured for the policy?",
    "Does the policy have copay?",
    "what is the policy inception date",
    "what is the waiting period? and list down the categories for waiting periods",
    "what is the waiting period for specific disease waiting periods",
    "what is the waiting period for maternity package ",
]

In [None]:
# Ask every question in `questions` and print whatever get_answer returns for it.
for ques in questions:
    print(get_answer(ques))

In [None]:
# (Stray scratch expression removed: the name it referenced is never defined in this
# notebook, so the cell raised NameError under Restart & Run All.)

In [None]:
# `parse_result` expects a list of generations; a single chat message has to go through
# the parser's runnable entry point, which returns the message content as a string.
StrOutputParser().invoke(llm_res)

In [None]:
# Define the chain in this cell so it runs on a fresh kernel (it otherwise exists only
# as commented-out code): retrieve -> split by type -> build prompt -> chat model -> str.
chain = (
    {
        "context": retriever | RunnableLambda(parse_docs),
        "question": RunnablePassthrough(),
    }
    | RunnableLambda(build_prompt)
    | chat_llm
    | StrOutputParser()
)

response = chain.invoke("List of insured members")

# The chain ends in StrOutputParser, so `response` is a plain string; it has no
# "response" key to index into.
print(response)

In [None]:
# Define the source-returning chain in this cell so it runs on a fresh kernel (it
# otherwise exists only as commented-out code). The result is a dict holding the
# parsed "context", the "question" and the generated "response".
chain_with_sources = {
    "context": retriever | RunnableLambda(parse_docs),
    "question": RunnablePassthrough(),
} | RunnablePassthrough().assign(
    response=(RunnableLambda(build_prompt) | chat_llm | StrOutputParser())
)

response = chain_with_sources.invoke("What is multihead?")

print("Response:", response["response"])

print("\n\nContext:")
for text in response["context"]["texts"]:
    # Indexed items are Documents built with `page_content` and a metadata dict that
    # only carries the doc id, so read the body from page_content and look the page
    # number up defensively.
    print(text.page_content)
    print("Page number: ", text.metadata.get("page_number", "n/a"))
    print("\n" + "-" * 50 + "\n")
for image in response["context"]["images"]:
    # NOTE(review): display_base64_image is only present as commented-out code in this
    # notebook; re-enable that helper before indexing anything that yields images.
    display_base64_image(image)

<!-- ## References

- [LangChain Inspiration](https://github.com/langchain-ai/langchain/blob/master/cookbook/Semi_structured_and_multi_modal_RAG.ipynb?ref=blog.langchain.dev)
- [Multivector Storage](https://python.langchain.com/docs/how_to/multi_vector/) -->