In [1]:
import os

os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [4]:
import json
import uuid
from base64 import b64decode

import faiss
import torch
from langchain.document_loaders import TextLoader
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.schema.document import Document
from langchain.storage import InMemoryStore
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
from transformers import pipeline
from unstructured.partition.pdf import partition_pdf
from unstructured.staging.base import elements_to_json

torch.mps.empty_cache()

In [2]:
output_path = "./"
file_path = "p2.pdf"
unstructured_model_name = "yolox"  # yolox | layout_v1.1.0 | detectron2_onnx
filename = f"{file_path.split(".")[0]}_json"

In [3]:
from unstructured.partition.pdf import partition_pdf

chunks = partition_pdf(
    filename=file_path,
    hi_res_model_name=unstructured_model_name,
    infer_table_structure=True,  # Ensure tables are parsed
    strategy="hi_res",  # Use fast strategy for better table compatibility | fast, hi_res, auto, ocr_only
    # extract_image_block_types=["Table"],  # Add Table to extract all table images
    # image_output_dir_path=output_path,  # Save images to directory
    # extract_image_block_to_payload=True,  # Include base64 payload for API usage
    # chunking_strategy="by_title",  # Try by_title; switch to by_section or by_page if needed
    # max_characters=2000,  # Adjust based on content structure
    # combine_text_under_n_chars=500,  # Merge smaller text blocks
    dpi=300,
)

process_file_with_model: <function process_file_with_model at 0x176798720>


In [5]:
filename = f"{filename}.json"
json_elements = elements_to_json(elements=chunks, indent=4)

In [6]:
def process_json_file(json_elements):
    # Iterate over the JSON data and extract required table elements
    extracted_elements = []
    for entry in json.loads(json_elements):
        if entry["type"] == "Table":
            extracted_elements.append(entry["metadata"]["text_as_html"])
        if entry["type"] == "Text":
            extracted_elements.append(entry["text"])
        if entry["type"] == "NarrativeText":
            extracted_elements.append(entry["text"])

    # Write the extracted elements to the output file
    with open("./output.txt", "w") as output_file:
        for element in extracted_elements:
            output_file.write(element + "\n\n")  # Adding two newlines for separation


process_json_file(json_elements)  # Takes a while for the .txt

In [7]:
# separate tables from texts
tables = []
texts = []

types = []
for chunk in chunks:
    types.append(str(type(chunk)))
    if "Table" in str(type(chunk)):
        tables.append(chunk)

    if "CompositeElement" in str(type((chunk))):
        texts.append(chunk)

    if "NarrativeText" in str(type((chunk))):
        texts.append(chunk)

    if "Text" in str(type((chunk))):
        texts.append(chunk)

# len(tables), len(texts)

In [8]:
# tables[0].metadata.text_as_html

In [16]:
def initialize_llm(
    model_id="google/gemma-2-2b-it",
    pipeline_kwargs={
        "max_new_tokens": 100,
        "top_k": 50,
        "temperature": 0.5,
        "do_sample": True,
        # "device": "cpu",
    },
    # max_batch_size=4
):
    """Initializes Hugging Face LLM pipeline."""
    hf_pipeline = pipeline(task="text-generation", model=model_id, **pipeline_kwargs)
    return HuggingFacePipeline(pipeline=hf_pipeline)


llm = initialize_llm()

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   3%|3         | 157M/4.99G [00:00<?, ?B/s]

KeyboardInterrupt: 

In [9]:
from langchain_google_genai import ChatGoogleGenerativeAI

# # Prompt
prompt_text = """
# You are an assistant tasked with summarizing tables and text.
# Give a concise summary of the table or text.

# Respond only with the summary, no additionnal comment.
# Do not start your message by saying "Here is a summary" or anything like that.
# Just give the summary as it is.

# """
prompt = ChatPromptTemplate.from_template(prompt_text)


# # Summary chain
# model = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0.5)

# summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()

In [11]:
def summarize_tables(table_html, model):
    messages = (
    (
        "system",
        prompt_text
    ),
    ("human", table_html)
)

    ai_msg = model.invoke(messages)

    return ai_msg

In [12]:
tables_html = [table.metadata.text_as_html for table in tables]

In [15]:
table_summaries = [summarize_tables(table, llm) for table in tables_html]

Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised PermissionDenied: 403 Request had insufficient authentication scopes. [reason: "ACCESS_TOKEN_SCOPE_INSUFFICIENT"
domain: "googleapis.com"
metadata {
  key: "service"
  value: "generativelanguage.googleapis.com"
}
metadata {
  key: "method"
  value: "google.ai.generativelanguage.v1beta.GenerativeService.GenerateContent"
}
].


PermissionDenied: 403 Request had insufficient authentication scopes. [reason: "ACCESS_TOKEN_SCOPE_INSUFFICIENT"
domain: "googleapis.com"
metadata {
  key: "service"
  value: "generativelanguage.googleapis.com"
}
metadata {
  key: "method"
  value: "google.ai.generativelanguage.v1beta.GenerativeService.GenerateContent"
}
]

In [46]:
StrOutputParser().parse(ai_msg)

AIMessage(content='PolicyBazaar Insurance Brokers Pvt Ltd (Intermediary Code: 0065359) can be contacted via mobile or landline (+1800-124-5723918).\n', additional_kwargs={}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'safety_ratings': []}, id='run-69b1e003-59ba-46b2-8587-140103dc42fd-0', usage_metadata={'input_tokens': 157, 'output_tokens': 49, 'total_tokens': 206, 'input_token_details': {'cache_read': 0}})

In [30]:
# # Summarize text
# text_summaries = summarize_chain.batch(texts, {"max_concurrency": 3})

# # Summarize tables

# table_summaries = summarize_chain.batch(tables_html, {"max_concurrency": 3})

In [41]:
tables_html[0]

'<table><tbody><tr><td>Intermediary Name:</td><td>PolicyBazaar Insurance Brokers Pvt Ltd</td><td>Intermediary Code: 0065359</td></tr><tr><td>Intermediary Contact Details:</td><td>| Mobile:</td><td>Land Line No: +1800-124-5723918</td></tr></tbody></table>'

In [None]:
# # Get the images from the CompositeElement objects
# def get_images_base64(chunks):
#     images_b64 = []
#     for chunk in chunks:
#         if "CompositeElement" in str(type(chunk)):
#             chunk_els = chunk.metadata.orig_elements
#             for el in chunk_els:
#                 if "Image" in str(type(el)):
#                     images_b64.append(el.metadata.image_base64)
#     return images_b64


# images = get_images_base64(chunks)
# import base64
# from IPython.display import Image, display


# def display_base64_image(base64_code):
#     # Decode the base64 string to binary
#     image_data = base64.b64decode(base64_code)
#     # Display the image
#     display(Image(data=image_data))

# # display_base64_image(images[0])

In [None]:
# Step 1: Storage layer setup
store = InMemoryDocstore()  # Storage layer for parent documents
id_key = "doc_id"  # ID key for associating documents

# Step 2: Load Hugging Face embeddings
embedding_model = "sentence-transformers/all-mpnet-base-v2"  # "sentence-transformers/all-MiniLM-L6-v2"  # You can change this to your preferred model
embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

# Step 3: Initialize FAISS index
vector_dimension = len(
    embeddings.embed_query("hello world")
)  # Determine embedding dimension
index = faiss.IndexFlatL2(vector_dimension)  # L2 similarity index

# Step 4: Set up FAISS vectorstore
vectorstore = FAISS(
    embedding_function=embeddings,  # The embedding function
    index=index,  # The FAISS index
    docstore=store,  # Document storage
    index_to_docstore_id={},  # Mapping of FAISS index to docstore IDs
)

# Step 5: Load documents
loader = TextLoader("output.txt")
documents = loader.load()


# Step 6: split it into chunks
text_splitter = CharacterTextSplitter(chunk_size=800, chunk_overlap=200)
docs = text_splitter.split_documents(documents)
doc_ids = [str(uuid.uuid4()) for _ in docs]  # Unique IDs for each document

print("docs:", len(docs))


# Step 7: Generate embeddings and add documents to FAISS
try:
    retriever = vectorstore.as_retriever()  # Create retriever from vectorstore
    # Add documents to the vectorstore
    retriever.vectorstore.add_documents(docs, ids=doc_ids)
    print("Documents successfully added to FAISS!")
except AssertionError as e:
    print(f"AssertionError: {e}")
except Exception as e:
    print(f"Error adding documents: {e}")


# Step 8: Query the vectorstore
query = "Insured Members"
query_embedding = embeddings.embed_query(query)
results = vectorstore.similarity_search_by_vector(query_embedding, k=2)


# Display results
for result in results:
    print(f"Retrieved Document: {result.page_content}, Metadata: {result.metadata}")

In [None]:
# # The storage layer for the parent documents
# store = InMemoryDocstore()
# id_key = "doc_id"

# embedding_model = "sentence-transformers/all-mpnet-base-v2" #"sentence-transformers/all-MiniLM-L6-v2"  # "sentence-transformers/all-mpnet-base-v2"

# # Load Hugging Face embeddings (e.g., from sentence-transformers)
# embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

# d = 2
# index = faiss.IndexFlatL2(d)

# # The vectorstore to use FAISS for indexing
# vectorstore = FAISS(
#     embedding_function=embeddings,
#     index=index,
#     docstore=store,
#     index_to_docstore_id={},
# )

In [None]:
# text_doc_ids = [str(uuid.uuid4()) for _ in texts]
# text_documents = [
#     Document(page_content=summary.text, metadata={id_key: text_doc_ids[i]})
#     for i, summary in enumerate(texts)
# ]


# tables_html = [table.metadata.text_as_html for table in tables]
# table_doc_ids = [str(uuid.uuid4()) for _ in tables_html]
# table_documents = [
#     Document(page_content=summary, metadata={id_key: table_doc_ids[i]})
#     for i, summary in enumerate(tables_html)
# ]

In [None]:
# from langchain.text_splitter import CharacterTextSplitter

# # split it into chunks
# text_splitter = CharacterTextSplitter(chunk_size=800, chunk_overlap=200)
# docs = text_splitter.split_documents(documents)

In [None]:
# # retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 1})
# retriever = vectorstore.as_retriever()
# text_doc_ids = [str(uuid.uuid4()) for _ in docs]

# # vector_store.add_documents(documents=documents, ids=text_doc_ids)


# retriever.vectorstore.add_documents(documents=docs, ids=text_doc_ids)
# # retriever.vectorstore.add_documents(documents=table_documents, ids=table_doc_ids)
# # retriever.vectorstore.add_documents(summary_tables)

In [None]:
from langchain_huggingface import ChatHuggingFace


def chat_bot_llm(llm):
    chat_model = ChatHuggingFace(llm=llm)
    return chat_model


chat_llm = chat_bot_llm(llm=llm)

In [None]:
def parse_docs(docs):
    text = []
    for doc in docs:
        text.append(doc)
    return {"images": b64, "texts": text}


def build_prompt(kwargs):
    docs_by_type = kwargs["context"]
    user_question = kwargs["question"]

    context_text = ""
    if len(docs_by_type["texts"]) > 0:
        for text_element in docs_by_type["texts"]:
            context_text += text_element.page_content

    # construct prompt with context (including images)
    prompt_template = f"""
    Answer the question based only on the following context, which can include text, tables.
    Context: {context_text}
    Question: {user_question}
    """

    prompt_content = [{"type": "text", "text": prompt_template}]

    return ChatPromptTemplate.from_messages(
        [
            HumanMessage(content=prompt_content),
        ]
    )


# chain = (
#     {
#         "context": retriever | RunnableLambda(parse_docs),
#         "question": RunnablePassthrough(),
#     }
#     | RunnableLambda(build_prompt)
#     | chat_llm
#     | StrOutputParser()
# )

# chain_with_sources = {
#     "context": retriever | RunnableLambda(parse_docs),
#     "question": RunnablePassthrough(),
# } | RunnablePassthrough().assign(
#     response=(RunnableLambda(build_prompt) | llm | StrOutputParser())
# )

In [None]:
def get_answer(question):
    try:
        docs = retriever.invoke(question)
        if not docs:
            return f"No relevant documents found for question: {question}"

        context = parse_docs(docs)
        prompt = build_prompt({"context": context, "question": question})
        llm_res = chat_llm.invoke(prompt.messages)
        return llm_res.pretty_print()
    except Exception as e:
        print(f"Error during query processing: {e}")
        return f"An error occurred while processing the question: {question}"

In [None]:
questions = [
    "List of Insured Members?",
    "What is the Room Rent amount or room type included in the policy?",
    "What is the Maternity Sum capping or sum insured?",
    "Sum insured for the policy?",
    "Does the policy have copay?",
    "what is the policy inception date",
    "What is the Policy Period",
    "what is the waiting period? and list down the categories for waiting periods",
    "what is the waiting period for specific disease waiting periods",
    "what is the waiting period for maternity package ",
]

In [None]:
import pprint

output_list = [get_answer(query) for query in questions]
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(output_list)