In [1]:
import os

# Set TOKENIZERS_PARALLELISM to "false" in this process's environment before any
# later cell loads a tokenizer-backed model.
# NOTE(review): presumably silences the Hugging Face tokenizers fork warning — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [1]:
import torch

# Release cached memory held by the MPS (Apple GPU) allocator. The call is
# guarded so the notebook also runs on machines without an MPS backend, where
# the unguarded call can fail.
if torch.backends.mps.is_available():
    torch.mps.empty_cache()

In [3]:
import uuid

import faiss

from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_core.output_parsers import StrOutputParser

In [2]:
# Path of the PDF that the parsing cell below reads (relative to the working directory).
file_path = "p1.pdf"

In [None]:
import cv2
import numpy as np


def post_process_table(image, detected_table_mask):
    """Refine a table mask with a dilate-then-erode pass and apply it to the image.

    Args:
        image: Image array that is multiplied element-wise by the refined mask.
            (assumes an H x W x C array — TODO confirm)
        detected_table_mask: Mask array handed to ``cv2.dilate``.
            (assumes a 2-D mask of 0/1 values — TODO confirm)

    Returns:
        The product ``image * refined_mask[:, :, np.newaxis]``.
    """
    # One 5x5 structuring element is shared by both morphological steps.
    structuring_element = np.ones((5, 5), dtype=np.uint8)

    # Dilation followed by erosion with the same kernel, one iteration each.
    refined_mask = cv2.erode(
        cv2.dilate(detected_table_mask, structuring_element, iterations=1),
        structuring_element,
        iterations=1,
    )

    # A trailing axis is added to the mask so it broadcasts against the image.
    return image * refined_mask[:, :, np.newaxis]

In [5]:
from unstructured.partition.pdf import partition_pdf


def parse_pdf(file_path, hi_res_model_name="yolox"):
    """Partition a PDF into elements using unstructured's ``hi_res`` strategy.

    Args:
        file_path: Path of the PDF to parse.
        hi_res_model_name: Layout model name forwarded to ``partition_pdf``.
            Defaults to ``"yolox"``; other values noted for this notebook are
            ``"layout_v1.1.0"`` and ``"detectron2_onnx"``.

    Returns:
        Whatever ``partition_pdf`` returns for the file.
    """
    return partition_pdf(
        strategy="hi_res",
        filename=file_path,
        hi_res_model_name=hi_res_model_name,
        infer_table_structure=True,
        extract_image_block_types=["Table"],
        image_output_dir_path="./",
        languages=["eng"],
        dpi=300,
    )


# Parse the configured PDF; `chunks` holds whatever parse_pdf returns for it.
chunks = parse_pdf(file_path)

python(24997) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(25007) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(25008) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(25009) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(25057) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(25058) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(25059) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(25096) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(25097) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(25099) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(25100) Malloc

In [6]:
import json
from unstructured.staging.base import elements_to_json


def process_parsed_pdf(elements, output_path=None):
    """Split parsed PDF elements into table records and plain-text strings.

    The elements are serialised with ``elements_to_json`` and read back as
    dictionaries, then bucketed by their ``"type"`` field.

    Args:
        elements: Parsed elements, as accepted by ``elements_to_json``.
        output_path: Optional file path. When given, every kept element (table
            HTML or text) is written to it, each followed by a blank line.
            Defaults to ``None``, in which case nothing is written.

    Returns:
        dict: ``{"table": [{"html": str, "ocr": str}, ...], "text": [str, ...]}``.
        ``"html"`` is an empty string for a table entry without ``text_as_html``.
    """
    # NOTE(review): only these two text types are kept; entries of any other
    # type are dropped — confirm this is intended.
    text_types = {"Text", "NarrativeText"}

    extracted_elements = []
    parsed_elements = {"table": [], "text": []}
    for entry in json.loads(elements_to_json(elements=elements, indent=4)):
        entry_type = entry["type"]
        if entry_type == "Table":
            # Tolerate a table entry that has no HTML rendering instead of
            # raising KeyError.
            table_html = entry.get("metadata", {}).get("text_as_html") or ""
            extracted_elements.append(table_html)
            parsed_elements["table"].append({"html": table_html, "ocr": entry["text"]})
        elif entry_type in text_types:
            extracted_elements.append(entry["text"])
            parsed_elements["text"].append(entry["text"])

    if output_path is not None:
        with open(output_path, "w", encoding="utf-8") as output_file:
            for element in extracted_elements:
                output_file.write(element + "\n\n")  # two newlines for separation

    return parsed_elements


# Bucket the parsed elements; `el` is a dict with a "table" list and a "text" list.
el = process_parsed_pdf(chunks)

In [None]:
from transformers import pipeline
from langchain_huggingface import HuggingFacePipeline


def initialize_llm(
    model_id="google/gemma-2-2b-it",
    pipeline_kwargs=None,
    max_batch_size=4,
):
    """Build a LangChain LLM backed by a local transformers text-generation pipeline.

    Args:
        model_id: Hugging Face model identifier to load.
        pipeline_kwargs: Extra keyword arguments forwarded to
            ``transformers.pipeline``. ``None`` selects the notebook defaults
            (100 new tokens, top-k 50, temperature 0.5, sampling enabled).
        max_batch_size: Batch size handed to ``HuggingFacePipeline``.

    Returns:
        A ``HuggingFacePipeline`` wrapping the created pipeline.
    """
    if pipeline_kwargs is None:
        # Built per call so the defaults are never shared between invocations.
        pipeline_kwargs = {
            "max_new_tokens": 100,
            "top_k": 50,
            "temperature": 0.5,
            "do_sample": True,
            # "device": "cpu",
        }

    hf_pipeline = pipeline(task="text-generation", model=model_id, **pipeline_kwargs)
    # Pass the model id and batch size explicitly so the wrapper reports the
    # model that was actually loaded and honours `max_batch_size`.
    return HuggingFacePipeline(
        pipeline=hf_pipeline,
        model_id=model_id,
        batch_size=max_batch_size,
    )


# Build the LLM with the default model and generation settings.
llm = initialize_llm()

In [7]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import HumanMessage, SystemMessage

from langchain_core.prompts import PromptTemplate


def create_prompt():
    """Build the prompt used to summarise one table from its HTML and OCR text.

    Returns:
        The object produced by ``PromptTemplate.from_template``; the template
        text contains two placeholders, ``table`` and ``text``.
    """
    summary_instructions = """
        You are an assistant tasked with summarizing tables and OCR text.
        Provide a concise summary of the table's important details, including key amounts, names, and other critical information. 
        Ensure the summary is clear and complete, combining insights from both the table structure and OCR text.

        Do not include any extraneous text or comments. Your response should only contain the summary.

        Table (HTML format): {table}
        OCR Text: {text}
    """

    table_summary_prompt = PromptTemplate.from_template(summary_instructions)
    return table_summary_prompt


# Table-summary prompt with the {table} and {text} placeholders.
prompt = create_prompt()

In [8]:
# Pipe the summary prompt into the LLM.
chain = prompt | llm

In [11]:
# Smoke test: stream the summary for one table and echo the chunks as they arrive.
# NOTE(review): index 3 is hard-coded, so this raises IndexError when fewer than
# four tables were extracted.
for ch in chain.stream(
    {"table": el["table"][3]["html"], "text": el["table"][3]["ocr"]}
):
    print(ch, end="", flush=True)

The 'batch_size' attribute of HybridCache is deprecated and will be removed in v4.49. Use the more precisely named 'self.max_batch_size' attribute instead.



        


In [None]:
from unstructured.cleaners.core import clean

# Summarise every extracted table, echoing the model output while it streams.
all_table_summary = []
for table_info in el["table"]:
    summary_parts = []
    for chunk in chain.stream({"table": table_info["html"], "text": table_info["ocr"]}):
        # Echo the raw chunk: cleaning each chunk on its own strips its
        # leading/trailing whitespace, which glues streamed words together.
        print(chunk, end="", flush=True)
        summary_parts.append(chunk)
    print()
    # Normalise whitespace once, on the complete summary.
    summary = "".join(summary_parts)
    all_table_summary.append(clean(summary, extra_whitespace=True))

In [None]:
# Display the list of cleaned table summaries.
all_table_summary

In [None]:
from unstructured.cleaners.core import clean

# Print each summary prefixed by its character count, followed by a divider line.
# `i` is the summary string itself, not an index.
for i in all_table_summary:
    print(len(i), i)
    print("-" * 50)

In [None]:
from langchain_core.prompts import PromptTemplate

# Scratch example of a generic "think step by step" question prompt.
# NOTE(review): this rebinds the module-level names `template` and `prompt`; any
# later cell that reads `prompt` sees this object rather than the table-summary prompt.
template = """Question: {question}

Answer: Let's think step by step."""
prompt = PromptTemplate.from_template(template)

# Shows the template object's printed representation.
print(prompt)

In [None]:
from langchain_core.prompts import ChatPromptTemplate


# Stand-alone check of the summarisation prompt on a hard-coded sample table.
html_table = "<table><tbody><tr><td>Intermediary Name:</td><td>PolicyBazaar Insurance Brokers Pvt Ltd</td><td>Intermediary Code: 0065359</td></tr><tr><td>Intermediary Contact Details:</td><td>| Mobile:</td><td>Land Line No: +1800-124-5723918</td></tr></tbody></table>"

# System instructions are defined in this cell so it does not depend on another
# cell having been run first.
prompt_text = """
# You are an assistant tasked with summarizing tables and text.
# Give a concise summary of the table or text.

# Respond only with the summary, no additionnal comment.
# Do not start your message by saying "Here is a summary" or anything like that.
# Just give the summary as it is.

# """
template = ChatPromptTemplate.from_messages(
    [
        SystemMessage(content=prompt_text),
        # "{table}" is a template placeholder filled from the invoke() input.
        ("human", "{table}"),
    ]
)


chain = template | llm

print(chain.invoke({"table": html_table}))

In [12]:
# from langchain_google_genai import ChatGoogleGenerativeAI

# # Prompt

# Build a chat prompt from the summarisation instructions; this rebinds `prompt`.
# NOTE(review): `prompt_text` must already be defined when this cell runs —
# verify that an earlier cell assigns it on a fresh top-to-bottom run.
prompt = ChatPromptTemplate.from_template(prompt_text)


# # Summary chain
# model = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0.5)

# summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()

In [22]:
def summarize_tables(table_html, model):
    """Ask ``model`` for a summary of one HTML table.

    Args:
        table_html: HTML markup of the table, sent as the human message.
        model: Object exposing ``invoke(messages)``.

    Returns:
        Whatever ``model.invoke`` returns for the system + human message pair.
    """
    # NOTE(review): every line of the instructions starts with "# " — possibly a
    # stray comment toggle; it is kept because it is part of the prompt text.
    system_instructions = """
# You are an assistant tasked with summarizing tables and text.
# Give a concise summary of the table or text.

# Respond only with the summary, no additionnal comment.
# Do not start your message by saying "Here is a summary" or anything like that.
# Just give the summary as it is.

# """
    conversation = (
        ("system", system_instructions),
        ("human", table_html),
    )

    # Debug echo of the exact payload sent to the model.
    print("messages:", conversation)
    return model.invoke(conversation)

In [14]:
# HTML rendering of every extracted table, taken from the records built by
# process_parsed_pdf; derived from `el` rather than from a separate `tables`
# list, which no cell in this notebook defines.
tables_html = [table_info["html"] for table_info in el["table"]]

In [None]:
# One summarize_tables call per table HTML string, in order.
table_summaries = [summarize_tables(table, llm) for table in tables_html]

In [None]:
# Run the first summary through StrOutputParser and display the result.
StrOutputParser().parse(table_summaries[0])

In [None]:
# Summarise only the first table and keep the result in `out`.
out = summarize_tables(tables_html[0], llm)

In [30]:
# # Summarize text
# text_summaries = summarize_chain.batch(texts, {"max_concurrency": 3})

# # Summarize tables

# table_summaries = summarize_chain.batch(tables_html, {"max_concurrency": 3})

In [None]:
# System instructions for the summariser.
# NOTE(review): each line inside the string starts with "# ", which is sent to the
# model as part of the prompt — confirm this is intended.
prompt_text = """
# You are an assistant tasked with summarizing tables and text.
# Give a concise summary of the table or text.

# Respond only with the summary, no additionnal comment.
# Do not start your message by saying "Here is a summary" or anything like that.
# Just give the summary as it is.

# """
# Role/content dicts: one system message plus a sample table as the "human" turn.
chat = [
    {"role": "system", "content": prompt_text},
    {
        "role": "human",
        "content": "<table><tbody><tr><td>Intermediary Name:</td><td>PolicyBazaar Insurance Brokers Pvt Ltd</td><td>Intermediary Code: 0065359</td></tr><tr><td>Intermediary Contact Details:</td><td>| Mobile:</td><td>Land Line No: +1800-124-5723918</td></tr></tbody></table>",
    },
]

# Bare last expression: the notebook displays whatever the LLM returns.
llm.invoke(chat)

In [None]:
# Display the HTML of the first table.
tables_html[0]

In [None]:
# # Get the images from the CompositeElement objects
# def get_images_base64(chunks):
#     images_b64 = []
#     for chunk in chunks:
#         if "CompositeElement" in str(type(chunk)):
#             chunk_els = chunk.metadata.orig_elements
#             for el in chunk_els:
#                 if "Image" in str(type(el)):
#                     images_b64.append(el.metadata.image_base64)
#     return images_b64


# images = get_images_base64(chunks)
# import base64
# from IPython.display import Image, display


# def display_base64_image(base64_code):
#     # Decode the base64 string to binary
#     image_data = base64.b64decode(base64_code)
#     # Display the image
#     display(Image(data=image_data))

# # display_base64_image(images[0])

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings

# Step 1: Storage layer setup
store = InMemoryDocstore()  # Docstore handed to the FAISS vector store below
# NOTE(review): `id_key` is assigned here but not read by any active code in this cell.
id_key = "doc_id"  # ID key for associating documents

# Step 2: Load Hugging Face embeddings
embedding_model = "sentence-transformers/all-mpnet-base-v2"  # alternative: "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

# Step 3: Initialize FAISS index
# The index dimension is taken from the length of one probe embedding, so it
# follows whichever embedding model is configured above.
vector_dimension = len(
    embeddings.embed_query("hello world")
)  # Determine embedding dimension
index = faiss.IndexFlatL2(vector_dimension)  # Flat L2 index

# Step 4: Set up FAISS vectorstore
vectorstore = FAISS(
    embedding_function=embeddings,  # The embedding function
    index=index,  # The FAISS index
    docstore=store,  # Document storage
    index_to_docstore_id={},  # Mapping of FAISS index to docstore IDs (starts empty)
)

# Step 5: Load documents
# NOTE(review): no active cell in this notebook writes "output.txt"; the file
# must already exist on disk for this to succeed.
loader = TextLoader("output.txt")
documents = loader.load()


# Step 6: split it into chunks
text_splitter = CharacterTextSplitter(chunk_size=800, chunk_overlap=200)
docs = text_splitter.split_documents(documents)
doc_ids = [str(uuid.uuid4()) for _ in docs]  # One fresh UUID per chunk

print("docs:", len(docs))


# Step 7: Generate embeddings and add documents to FAISS
# NOTE(review): failures are only printed, not re-raised, so the query in
# Step 8 still runs even when nothing was added to the index.
try:
    retriever = vectorstore.as_retriever()  # Create retriever from vectorstore
    # Add documents to the vectorstore
    retriever.vectorstore.add_documents(docs, ids=doc_ids)
    print("Documents successfully added to FAISS!")
except AssertionError as e:
    print(f"AssertionError: {e}")
except Exception as e:
    print(f"Error adding documents: {e}")


# Step 8: Query the vectorstore
# The query is embedded explicitly and the two nearest stored chunks are fetched.
query = "Insured Members"
query_embedding = embeddings.embed_query(query)
results = vectorstore.similarity_search_by_vector(query_embedding, k=2)


# Display results
for result in results:
    print(f"Retrieved Document: {result.page_content}, Metadata: {result.metadata}")

In [None]:
# # The storage layer for the parent documents
# store = InMemoryDocstore()
# id_key = "doc_id"

# embedding_model = "sentence-transformers/all-mpnet-base-v2" #"sentence-transformers/all-MiniLM-L6-v2"  # "sentence-transformers/all-mpnet-base-v2"

# # Load Hugging Face embeddings (e.g., from sentence-transformers)
# embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

# d = 2
# index = faiss.IndexFlatL2(d)

# # The vectorstore to use FAISS for indexing
# vectorstore = FAISS(
#     embedding_function=embeddings,
#     index=index,
#     docstore=store,
#     index_to_docstore_id={},
# )

In [None]:
# text_doc_ids = [str(uuid.uuid4()) for _ in texts]
# text_documents = [
#     Document(page_content=summary.text, metadata={id_key: text_doc_ids[i]})
#     for i, summary in enumerate(texts)
# ]


# tables_html = [table.metadata.text_as_html for table in tables]
# table_doc_ids = [str(uuid.uuid4()) for _ in tables_html]
# table_documents = [
#     Document(page_content=summary, metadata={id_key: table_doc_ids[i]})
#     for i, summary in enumerate(tables_html)
# ]

In [None]:
# from langchain.text_splitter import CharacterTextSplitter

# # split it into chunks
# text_splitter = CharacterTextSplitter(chunk_size=800, chunk_overlap=200)
# docs = text_splitter.split_documents(documents)

In [None]:
# # retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 1})
# retriever = vectorstore.as_retriever()
# text_doc_ids = [str(uuid.uuid4()) for _ in docs]

# # vector_store.add_documents(documents=documents, ids=text_doc_ids)


# retriever.vectorstore.add_documents(documents=docs, ids=text_doc_ids)
# # retriever.vectorstore.add_documents(documents=table_documents, ids=table_doc_ids)
# # retriever.vectorstore.add_documents(summary_tables)

In [None]:
from langchain_huggingface import ChatHuggingFace


def chat_bot_llm(llm):
    """Wrap ``llm`` in a ``ChatHuggingFace`` chat model and return the wrapper."""
    return ChatHuggingFace(llm=llm)


# Chat-model wrapper around the notebook's LLM; used by get_answer.
chat_llm = chat_bot_llm(llm=llm)

In [None]:
def parse_docs(docs):
    """Group retrieved documents into the structure ``build_prompt`` consumes.

    Args:
        docs: Iterable of retrieved documents; ``None`` is treated as empty.

    Returns:
        dict: ``{"images": [], "texts": [...]}``. ``"texts"`` holds the documents
        in their original order; ``"images"`` is always an empty list because
        this function collects no images.
    """
    return {"images": [], "texts": list(docs or [])}


def build_prompt(kwargs):
    """Assemble the question-answering chat prompt from retrieved context.

    Args:
        kwargs: Mapping with two keys:
            ``"context"`` – dict whose ``"texts"`` entry lists documents that
            expose ``page_content``;
            ``"question"`` – the user's question.

    Returns:
        The result of ``ChatPromptTemplate.from_messages`` for one
        ``HumanMessage`` whose content is a single text part holding the
        context and the question.
    """
    docs_by_type = kwargs["context"]
    user_question = kwargs["question"]

    # Separate chunks with a blank line so the end of one document does not run
    # straight into the start of the next.
    context_text = "\n\n".join(
        text_element.page_content for text_element in docs_by_type["texts"]
    )

    # Construct the prompt with the retrieved context.
    prompt_template = f"""
    Answer the question based only on the following context, which can include text, tables.
    Context: {context_text}
    Question: {user_question}
    """

    prompt_content = [{"type": "text", "text": prompt_template}]

    return ChatPromptTemplate.from_messages(
        [
            HumanMessage(content=prompt_content),
        ]
    )


# chain = (
#     {
#         "context": retriever | RunnableLambda(parse_docs),
#         "question": RunnablePassthrough(),
#     }
#     | RunnableLambda(build_prompt)
#     | chat_llm
#     | StrOutputParser()
# )

# chain_with_sources = {
#     "context": retriever | RunnableLambda(parse_docs),
#     "question": RunnablePassthrough(),
# } | RunnablePassthrough().assign(
#     response=(RunnableLambda(build_prompt) | llm | StrOutputParser())
# )

In [None]:
def get_answer(question):
    """Answer ``question`` from the indexed documents.

    Retrieves documents for the question, builds the prompt from them and asks
    the chat model.

    Args:
        question: The user's question.

    Returns:
        str: The model's answer text; a "no relevant documents" message when
        retrieval returns nothing; or an error message if any step raises.
    """
    try:
        docs = retriever.invoke(question)
        if not docs:
            return f"No relevant documents found for question: {question}"

        context = parse_docs(docs)
        prompt = build_prompt({"context": context, "question": question})
        llm_res = chat_llm.invoke(prompt.messages)
        # Return the message text itself: pretty_print() only prints and
        # returns None, which left callers with no answer.
        return llm_res.content
    except Exception as e:
        # Best-effort: report the failure and return a message so one failing
        # question does not abort a batch of questions.
        print(f"Error during query processing: {e}")
        return f"An error occurred while processing the question: {question}"

In [None]:
# Questions asked against the indexed policy document, in the order they are run.
questions = [
    "List of Insured Members?",
    "What is the Room Rent amount or room type included in the policy?",
    "What is the Maternity Sum capping or sum insured?",
    "Sum insured for the policy?",
    "Does the policy have copay?",
    "what is the policy inception date",
    "What is the Policy Period",
    "what is the waiting period? and list down the categories for waiting periods",
    "what is the waiting period for specific disease waiting periods",
    "what is the waiting period for maternity package ",
]

In [None]:
import pprint

# Answer every question in order, then pretty-print the collected return values.
output_list = [get_answer(query) for query in questions]
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(output_list)