In [1]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
from unstructured.partition.pdf import partition_pdf

output_path = "./"
file_path = "p4.pdf"
unstructured_model_name = "yolox"  # yolox | layout_v1.1.0 | detectron2_onnx

chunks = partition_pdf(
    filename=file_path,
    hi_res_model_name=unstructured_model_name,
    infer_table_structure=True,  # Ensure tables are parsed
    strategy="hi_res",  # Use fast strategy for better table compatibility | fast, hi_res, auto, ocr_only
    extract_image_block_types=["Table"],  # Add Table to extract all table images
    image_output_dir_path=output_path,  # Save images to directory
    # extract_image_block_to_payload=True,  # Include base64 payload for API usage
    # chunking_strategy="by_title",  # Try by_title; switch to by_section or by_page if needed
    # max_characters=2000,  # Adjust based on content structure
    # combine_text_under_n_chars=500,  # Merge smaller text blocks
    dpi = 300,
)

process_file_with_model: <function process_file_with_model at 0x179f9c7c0>


In [3]:
# chunks

In [4]:
import json

from unstructured.staging.base import elements_to_json

filename = f"{file_path.split(".")[0]}_json"
elements_to_json(chunks, filename=f"{filename}.json")


def process_json_file(input_filename):
    # Read the JSON file
    with open(input_filename, "r") as file:
        data = json.load(file)

    # Iterate over the JSON data and extract required table elements
    extracted_elements = []
    for entry in data:
        if entry["type"] == "Table":
            extracted_elements.append(entry["metadata"]["text_as_html"])
        if entry['type'] == 'Text':
            extracted_elements.append(entry["text"])
        if entry['type'] == "NarrativeText":
            extracted_elements.append(entry["text"])
            

    # Write the extracted elements to the output file
    with open("./output.txt", "w") as output_file:
        for element in extracted_elements:
            output_file.write(element + "\n\n")  # Adding two newlines for separation


process_json_file(f"{filename}.json")  # Takes a while for the .txt

In [5]:
# separate tables from texts
tables = []
texts = []

types = []
for chunk in chunks:
    types.append(str(type(chunk)))
    if "Table" in str(type(chunk)):
        tables.append(chunk)

    if "CompositeElement" in str(type((chunk))):
        texts.append(chunk)

    if "Text" in str(type((chunk))):
        texts.append(chunk)

len(tables), len(texts)

(9, 56)

In [6]:
tables[0].metadata.text_as_html

'<table><tbody><tr><td>Policy No.</td><td colspan="2">68506563</td></tr><tr><td>Plan Name</td><td colspan="2">Care Supreme</td></tr><tr><td>Cover Type</td><td>Floater</td><td></td></tr><tr><td>Policy Period - Start Date</td><td>00:00 hrs 22-Jul-2024</td><td></td></tr><tr><td>Policy Period - End Date</td><td>Midnight 21-Jul-2025</td><td></td></tr><tr><td>Premium Paid</td><td>Rs.57,302.00</td><td></td></tr><tr><td>Premium Payment Mode</td><td>Rs0.00+SGST/UGST Single Premium</td><td>Rs4,370.48)</td></tr><tr><td>Communication Address Zone</td><td>Zone 2</td><td></td></tr><tr><td></td><td>Date Of Birth</td><td>Client ID</td></tr><tr><td></td><td>04-Feb-1963.</td><td>43332016</td></tr></tbody></table>'

In [15]:
import uuid

import faiss
import torch
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.schema.document import Document
from langchain.storage import InMemoryStore
from langchain.vectorstores import FAISS, Chroma
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain.text_splitter import CharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.document_loaders import TextLoader

torch.mps.empty_cache()

In [16]:
# from langchain_google_genai import ChatGoogleGenerativeAI

# # Prompt
# prompt_text = """
# You are an assistant tasked with summarizing tables and text.
# Give a concise summary of the table or text.
 
# Respond only with the summary, no additionnal comment.
# Do not start your message by saying "Here is a summary" or anything like that.
# Just give the summary as it is.
 
# Table or text chunk: {element}
 
# """
# prompt = ChatPromptTemplate.from_template(prompt_text)
 
# # Summary chain
# model = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0.5)

# summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()
 

In [17]:
# messages = [
#     (
#         "system",
#         "You are a helpful assistant that translates English to French. Translate the user sentence.",
#     ),
#     ("human", "I love programming."),
# ]
# ai_msg = model.invoke(messages)


In [18]:
# ai_msg

In [19]:
# # Summarize text
# text_summaries = summarize_chain.batch(texts, {"max_concurrency": 3})

# # Summarize tables
# tables_html = [table.metadata.text_as_html for table in tables]
# table_summaries = summarize_chain.batch(tables_html, {"max_concurrency": 3})

In [20]:
# text_summaries

In [21]:
# # Get the images from the CompositeElement objects
# def get_images_base64(chunks):
#     images_b64 = []
#     for chunk in chunks:
#         if "CompositeElement" in str(type(chunk)):
#             chunk_els = chunk.metadata.orig_elements
#             for el in chunk_els:
#                 if "Image" in str(type(el)):
#                     images_b64.append(el.metadata.image_base64)
#     return images_b64


# images = get_images_base64(chunks)
# import base64
# from IPython.display import Image, display


# def display_base64_image(base64_code):
#     # Decode the base64 string to binary
#     image_data = base64.b64decode(base64_code)
#     # Display the image
#     display(Image(data=image_data))

# # display_base64_image(images[0])

In [25]:
 # Step 1: Storage layer setup
store = InMemoryDocstore()  # Storage layer for parent documents
id_key = "doc_id"  # ID key for associating documents

# Step 2: Load Hugging Face embeddings
embedding_model = "sentence-transformers/all-mpnet-base-v2" #"sentence-transformers/all-MiniLM-L6-v2"  # You can change this to your preferred model
embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

# Step 3: Initialize FAISS index
vector_dimension = len(embeddings.embed_query("hello world"))  # Determine embedding dimension
index = faiss.IndexFlatL2(vector_dimension)  # L2 similarity index

# Step 4: Set up FAISS vectorstore
vectorstore = FAISS(
    embedding_function=embeddings,  # The embedding function
    index=index,  # The FAISS index
    docstore=store,  # Document storage
    index_to_docstore_id={},  # Mapping of FAISS index to docstore IDs
)

# Step 5: Load documents
loader = TextLoader("output.txt")
documents = loader.load()


# Step 6: split it into chunks
text_splitter = CharacterTextSplitter(chunk_size=800, chunk_overlap=200)
docs = text_splitter.split_documents(documents)
doc_ids = [str(uuid.uuid4()) for _ in docs]  # Unique IDs for each document

print("docs:", len(docs))


# Step 7: Generate embeddings and add documents to FAISS
try:
    retriever = vectorstore.as_retriever()  # Create retriever from vectorstore
    # Add documents to the vectorstore
    retriever.vectorstore.add_documents(docs, ids=doc_ids)
    print("Documents successfully added to FAISS!")
except AssertionError as e:
    print(f"AssertionError: {e}")
except Exception as e:
    print(f"Error adding documents: {e}")


# Step 8: Query the vectorstore
query = "Insured Members"
query_embedding = embeddings.embed_query(query)
results = vectorstore.similarity_search_by_vector(query_embedding, k=2)


# Display results
for result in results:
    print(f"Retrieved Document: {result.page_content}, Metadata: {result.metadata}")
 

Created a chunk of size 1729, which is longer than the specified 800
Created a chunk of size 1123, which is longer than the specified 800


docs: 13
Documents successfully added to FAISS!
Retrieved Document: <table><thead><tr><th>S NO.</th><th>Particulars</th><th>Details</th></tr></thead><tbody><tr><td></td><td>Cumulative Bonus Super</td><td>Upto 100% increase in the Sum Insured, on a cumulative basis for each completed and continuous policy year upto a max of 500%</td></tr><tr><td>2</td><td>Wellness Benefit</td><td>Discount on renewal premium based on active days achieved. Online fitness Coaching/Counselling session from Wellness Coaches</td></tr><tr><td>3</td><td>Air Ambulance Cover</td><td>Up to 5 lacs per year.</td></tr><tr><td>4</td><td>Claim Shield</td><td>Coverage of specified 68 Non Payable Items as defined in T&amp;C</td></tr></tbody></table>, Metadata: {'source': 'output.txt'}
Retrieved Document: Date : 04 Jul 2024

Mr Kailash Mahajan 6/3, South Harsidhi Opp Garden Indore Indore 452007 Madhya Pradesh State Code : 23

Policy No: 68506563 Mobile No: XXXXXX0561

Dear Mr Kailash Mahajan,

Thank You for trusting us as

[Document(metadata={'source': 'output.txt'}, page_content='Date : 04 Jul 2024\n\nMr Kailash Mahajan 6/3, South Harsidhi Opp Garden Indore Indore 452007 Madhya Pradesh State Code : 23\n\nPolicy No: 68506563 Mobile No: XXXXXX0561\n\nDear Mr Kailash Mahajan,\n\nThank You for trusting us as your preferred Health Insurer.\n\nAt Care Health insurance, it is our endeavor to make quality healthcare easily accessible for our customers as well as ensure a truly hassle-free claim servicing experience\n\nTo help you understand our services better, please go through the \'Know your policy better\' kit that accompanies this letter and constitutes the following\n\nAlso appended herewith for your convenience is your Care Health Card. This card should be presented at the time of an emergency or a planned hospitalization, to avail cashless treatment at our network of over 16000+ cashless network pan-India.\n\nTo further simplify procedures, we\'re online as well. Visit our portal www.careinsurance.com a

In [82]:
# The storage layer for the parent documents
store = InMemoryDocstore()
id_key = "doc_id"

embedding_model = "sentence-transformers/all-mpnet-base-v2" #"sentence-transformers/all-MiniLM-L6-v2"  # "sentence-transformers/all-mpnet-base-v2"

# Load Hugging Face embeddings (e.g., from sentence-transformers)
embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

d = 2
index = faiss.IndexFlatL2(d)

# The vectorstore to use FAISS for indexing
vectorstore = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=store,
    index_to_docstore_id={},
)

In [83]:
# text_doc_ids = [str(uuid.uuid4()) for _ in texts]
# text_documents = [
#     Document(page_content=summary.text, metadata={id_key: text_doc_ids[i]})
#     for i, summary in enumerate(texts)
# ]


# tables_html = [table.metadata.text_as_html for table in tables]
# table_doc_ids = [str(uuid.uuid4()) for _ in tables_html]
# table_documents = [
#     Document(page_content=summary, metadata={id_key: table_doc_ids[i]})
#     for i, summary in enumerate(tables_html)
# ]

In [84]:
f



In [85]:
index.d

2

In [86]:
for i, embedding in enumerate(embeddings):
    if len(embedding) != index.d:
        print(f"Mismatch at index {i}: {len(embedding)} != {index.d}")



In [87]:
from langchain.text_splitter import CharacterTextSplitter

# split it into chunks
text_splitter = CharacterTextSplitter(chunk_size=800, chunk_overlap=200)
docs = text_splitter.split_documents(documents)

Created a chunk of size 1729, which is longer than the specified 800
Created a chunk of size 1123, which is longer than the specified 800


In [88]:
# retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 1})
retriever = vectorstore.as_retriever()
text_doc_ids = [str(uuid.uuid4()) for _ in docs]

# vector_store.add_documents(documents=documents, ids=text_doc_ids)


retriever.vectorstore.add_documents(documents=docs, ids=text_doc_ids)
# retriever.vectorstore.add_documents(documents=table_documents, ids=table_doc_ids)
# retriever.vectorstore.add_documents(summary_tables)

AssertionError: 

In [None]:
from langchain_huggingface import HuggingFacePipeline
from transformers import pipeline


def initialize_llm(
    model_id="google/gemma-2-2b-it",
    pipeline_kwargs={
        "max_new_tokens": 100,
        "top_k": 50,
        "temperature": 0.5,
        "do_sample": True,
        # "device": "cpu",
    },
    # max_batch_size=4
):
    """Initializes Hugging Face LLM pipeline."""
    hf_pipeline = pipeline(task="text-generation", model=model_id, **pipeline_kwargs)
    return HuggingFacePipeline(pipeline=hf_pipeline)


llm = initialize_llm()

In [None]:
from langchain_huggingface import ChatHuggingFace


def chat_bot_llm(llm):
    chat_model = ChatHuggingFace(llm=llm)
    return chat_model


chat_llm = chat_bot_llm(llm=llm)

In [None]:
from langchain_core.runnables import RunnablePassthrough, RunnableLambda

from langchain_core.messages import SystemMessage, HumanMessage

from base64 import b64decode


def parse_docs(docs):
    """Split base64-encoded images and texts"""
    b64 = []
    text = []
    for doc in docs:
        try:
            b64decode(doc)
            b64.append(doc)
        except Exception as e:
            text.append(doc)
    return {"images": b64, "texts": text}


def build_prompt(kwargs):

    docs_by_type = kwargs["context"]
    user_question = kwargs["question"]

    context_text = ""
    if len(docs_by_type["texts"]) > 0:
        for text_element in docs_by_type["texts"]:
            context_text += text_element.page_content

    # construct prompt with context (including images)
    prompt_template = f"""
    Answer the question based only on the following context, which can include text, tables.
    Context: {context_text}
    Question: {user_question}
    """

    prompt_content = [{"type": "text", "text": prompt_template}]

    # if len(docs_by_type["images"]) > 0:
    #     for image in docs_by_type["images"]:
    #         prompt_content.append(
    #             {
    #                 "type": "image_url",
    #                 "image_url": {"url": f"data:image/jpeg;base64,{image}"},
    #             }
    #         )

    return ChatPromptTemplate.from_messages(
        [
            HumanMessage(content=prompt_content),
        ]
    )


# chain = (
#     {
#         "context": retriever | RunnableLambda(parse_docs),
#         "question": RunnablePassthrough(),
#     }
#     | RunnableLambda(build_prompt)
#     | chat_llm
#     | StrOutputParser()
# )

# chain_with_sources = {
#     "context": retriever | RunnableLambda(parse_docs),
#     "question": RunnablePassthrough(),
# } | RunnablePassthrough().assign(
#     response=(RunnableLambda(build_prompt) | llm | StrOutputParser())
# )

In [None]:
question = "What is the total Sum Insured amount in the policy?"

In [None]:
def get_answer(question):
    docs = retriever.invoke(question)
    context = parse_docs(docs)
    prompt = build_prompt({"context": context, "question": question})
    llm_res = chat_llm.invoke(prompt.messages)
    return llm_res.pretty_print()

In [None]:
llm_res = chat_llm.invoke(prompt.messages)
# llm_res = chat_llm.stream(prompt.messages)
# llm_res

In [None]:
llm_res.pretty_print()

In [None]:
questions = [
    "List of Insured Members?",
    "What is the Room Rent amount or room type included in the policy?",
    "What is the Maternity Sum capping or sum insured?",
    "What is the policy start date?",
    "Sum insured for the policy?",
    "Does the policy have copay?",
    "what is the policy inception date",
    "what is the waiting period? and list down the categories for waiting periods",
    "what is the waiting period for specific disease waiting periods",
    "what is the waiting period for maternity package ",
]

In [None]:
for ques in questions:
    print(get_answer(ques))

In [None]:
a

In [None]:
StrOutputParser().parse_result(llm_res)

In [None]:
response = chain.invoke("List of insured members")

print(response)
print(response["response"])

In [None]:
response = chain_with_sources.invoke("What is multihead?")

print("Response:", response["response"])

print("\n\nContext:")
for text in response["context"]["texts"]:
    print(text.text)
    print("Page number: ", text.metadata.page_number)
    print("\n" + "-" * 50 + "\n")
for image in response["context"]["images"]:
    display_base64_image(image)

<!-- ## References

- [LangChain Inspiration](https://github.com/langchain-ai/langchain/blob/master/cookbook/Semi_structured_and_multi_modal_RAG.ipynb?ref=blog.langchain.dev)
- [Multivector Storage](https://python.langchain.com/docs/how_to/multi_vector/) -->