In [248]:
from typing import List, Optional

from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from pydantic import BaseModel, Field
from langchain.chat_models import init_chat_model
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter

import os
import tempfile
from langchain.document_loaders import PyPDFLoader
from langchain_community.callbacks import get_openai_callback
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.prompts.chat import SystemMessagePromptTemplate, HumanMessagePromptTemplate

from schemas_diff import Other_Benefits

from retrieval_prompts import RETRIEVAL_SYSTEM_PROMPT_2,RETRIEVAL_HUMAN_PROMPT_TEMPLATE_3,RETRIEVAL_HUMAN_PROMPT_TEMPLATE_4

from  dotenv import load_dotenv

In [None]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=700,
    chunk_overlap=50,
    separators=[
        "\n\n",          # Split on paragraphs
        "\n---+\n",      # Split on horizontal lines (common in tables)
        r'\n\s*\|\s*\n', # Split on pipe-separated table rows
        r'\n\s{2,}',     # Split on lines with 2+ spaces (common in tabular data)
        "\n",            # Split on newlines
        r'(?<=\. )',     # Split after sentences
        r'\s{4,}',       # Split on 4+ whitespace characters (tabular columns)
        " ",             # Split on single spaces
        ""
    ]
)

NameError: name 'langchain_community' is not defined

In [249]:
load_dotenv(override=True)

uin = '136N080V02'
# current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
persistent_directory = os.path.join(parent_dir, "db", f"chroma_db_{uin}")

if not os.path.exists(persistent_directory):
    print(f"----Vector DB does not exist for UIN {uin}, creating a new one----")
    os.makedirs(persistent_directory)

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
db = Chroma(persist_directory=persistent_directory, embedding_function=embeddings)

In [250]:
db._persist_directory

'/home/cloudcraftz/WORK/LANGCHAIN_LEARNING/extraction_langchain/db/chroma_db_136N080V02'

In [251]:
print(f"Total documents in DB: {len(db.get()['documents'])}")

Total documents in DB: 460


In [252]:
search_params = {
    "search_type": "similarity_score_threshold",
    "search_kwargs": {"k": 1, 
                      "score_threshold": 0.4,
                      }  # Exclude empty content
}

In [261]:
orig_retriever = db.as_retriever(
        search_type=search_params["search_type"],
        search_kwargs=search_params["search_kwargs"]
    )   
# Test the retriever directly
# query = "What are the plan options available for this policy? Give brief description of each plan option."
# result = retriever.invoke(query)
# print("---------------RESPONSE-----------",result)


In [262]:
llm = init_chat_model(os.environ.get("MODEL_OPENAI"), model_provider="openai", temperature=0,verbose=True)

In [254]:
def format_docs(docs):
    formatted = []
    for doc in docs:
        # Ensure page_content is never None
        content = doc.page_content or "[EMPTY CONTENT]"
        formatted.append(f"Document Content: {content}\nMetadata: {doc.metadata}")
    return "\n\n".join(formatted)

In [263]:
class Plan_Options(BaseModel):
    value: Optional[str] = Field(
        ..., description="Available plan options with their plan description and statistical details"
    )
    page_number: Optional[str] = Field(
        ..., description="Page number where the field was extracted from"
    )
    chunks: Optional[List[str]] = Field(
        ...,
        description="Source of the chunks, keep it empty if null",
    )


In [264]:
system_template ="""You are an expert at extracting relevant insurance policy information from text, given a query and retrieved relevant documents to the query.
    ### Instructions:
        1. Ensure that your analysis is detailed, accurate, concise, user-friendly, and directly addresses the query.
            - If certain details are missing, explicitly state that the information is not available.
        2. Compile a Final Answer:\n- Craft a summary that is detailed, thorough, in-depth, and complex, while maintaining clarity and conciseness.
            - Incorporate main ideas and essential information, eliminating extraneous language and focusing on critical aspects.
            - Rely strictly on the provided text, without including external information.
        3. Ensure that response should not provide any financial advice. If the user asks for financial advice, clearly state that you are unable to give any advice.
        4. Ensure that the response is detailed, comprehensive, and clearly states if any critical information is missing.
        5. Format the response in clear, easy-to-understand, point-wise format e.g 1. , 2.,  3. etc.optimized for excel cells.Make the keywords to be clearly spaced with their brief details, For e.g . in the text Life Option:- This plan provides basic life coverage, ensuring that the policyholder's beneficiaries. Do not highlight numbers, for example instead of `95,20,000`, give output as \u20b995,20,000.
        6. Please provide direct answers without preemptive phrases.
        7. If the answer is not found in the documents, respond with 'Not available' ONLY. Do not generate hallucinated responses.
    """
plan_options_human_template= """

    Use the following retrieved documents to extract policy details for the query follwed by the instructions.
    ###Context:
        Retrieved documents : {documents}
        Query:{query}
    
    An example of the output format is given below:
    3 Plan Options
        1) Life Secure: SAD is paid if the Life Assured/Spouse dies during the PT while the policy is in-force;
        2) Life Secure with Income: Monthly Survival Income starts at age 60 and continues until death or end of the PT. If the Life Assured dies, the SAD- paid incomes, is paid and the policy ends;
        3) Life Secure with ROP: SAD is paid if the Life Assured dies and the Policy ends. If the Life Assured survives, the SAM is paid and the Policy ends ;
"""

In [265]:
plan_options_ext_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", 
            """You are an expert at extracting relevant insurance policy information from text, given a query and retrieved relevant documents to the query.
            ### Instructions:
                1. Ensure that your analysis is detailed, accurate, concise, user-friendly, and directly addresses the query.
                    - If certain details are missing, explicitly state that the information is not available.
                2. Compile a Final Answer:\n- Craft a summary that is detailed, thorough, in-depth, and complex, while maintaining clarity and conciseness.
                    - Incorporate main ideas and essential information, eliminating extraneous language and focusing on critical aspects.
                    - Rely strictly on the provided text, without including external information.
                3. Ensure that response should not provide any financial advice. If the user asks for financial advice, clearly state that you are unable to give any advice.
                4. Ensure that the response is detailed, comprehensive, and clearly states if any critical information is missing.
                5. Format the response in clear, easy-to-understand, point-wise format e.g 1. , 2.,  3. etc.optimized for excel cells.Make the keywords to be clearly spaced with their brief details, For e.g . in the text Life Option:- This plan provides basic life coverage, ensuring that the policyholder's beneficiaries. Do not highlight numbers, for example instead of `95,20,000`, give output as \u20b995,20,000.
                6. Please provide direct answers without preemptive phrases.
                7. If the answer is not found in the documents, respond with 'Not available' ONLY. Do not generate hallucinated responses.
            """),        
        ("human", 
            """ Use the following retrieved documents to extract policy details for the query follwed by the instructions.
                ###Context:
                    Retrieved documents : {documents}
                    Query:{query}
                
                An example of the output format is given below:
                3 Plan Options
                    1) Life Secure: SAD is paid if the Life Assured/Spouse dies during the PT while the policy is in-force;
                    2) Life Secure with Income: Monthly Survival Income starts at age 60 and continues until death or end of the PT. If the Life Assured dies, the SAD- paid incomes, is paid and the policy ends;
                    3) Life Secure with ROP: SAD is paid if the Life Assured dies and the Policy ends. If the Life Assured survives, the SAM is paid and the Policy ends ;
            """)
    ]
)

In [266]:
plan_options_ret_prompt = ChatPromptTemplate.from_template(
    """You are an assistant tasked with finding the various plan options provided in an  insurance policy text.
    Based on the following query, retrieve the relevant documents and extract the plan options.:
    
    Query: {query}
    
    """
)



In [208]:
def debug_retriever(query):
    docs = orig_retriever.invoke(query)
    print("Chroma raw results:", docs)
    valid_docs = [doc for doc in docs if doc and doc.page_content]
    return valid_docs

In [267]:
from langchain_core.retrievers import BaseRetriever
from langchain_core.documents import Document
from typing import Any

class SafeRetriever:
    def __init__(self, base_retriever):
        self.base_retriever = base_retriever

    def invoke(self, query: str, **kwargs):
        docs = self.base_retriever.invoke(query, **kwargs)
        return [
            doc for doc in docs
            if doc and isinstance(doc, Document) and doc.page_content
        ]

In [275]:
search_params = {
    "search_type": "similarity_score_threshold",
    "search_kwargs": {"k": 1, 
                      "score_threshold": 0.4,
                      }  # Exclude empty content
}

orig_retriever = db.as_retriever(
        search_type=search_params["search_type"],
        search_kwargs=search_params["search_kwargs"]
    )   

# safe_retriever = SafeRetriever(orig_retriever)

In [276]:
first_stage_output = {}
extractor = plan_options_ext_prompt | llm.with_structured_output(
    schema=Plan_Options,
    include_raw=False,
)


def debug_retriever(query):
    docs = orig_retriever.invoke(query)
    print("Chroma raw results:", docs)  # Inspect structure
    return [doc for doc in docs if doc.page_content is not None]  # Filter out None

def log_input(output):
    print("To LLM:", output)
    return output


In [277]:
plan_options_chain = (
    {
        "documents": orig_retriever | (lambda docs : docs[0].page_content)
    }
    | RunnableLambda(log_input)
)

In [258]:
def format_docs(docs):
    return "\n\n".join(f"Page {doc.metadata['page']}: {doc.page_content}" 
                      for doc in docs)

In [278]:
plan_options_query = "What are the plan options available for this policy?"

In [281]:
# safe_retriever = SafeRetriever(orig_retriever)
docs = orig_retriever.invoke(plan_options_query)

In [242]:
def _patched_results_to_docs_and_scores(results):
    return [
        (Document(page_content=doc, metadata=meta or {}), score)
        for doc, meta, score in zip(
            results["documents"][0],
            results["metadatas"][0],
            results["distances"][0],
        )
        if doc is not None
    ]

Chroma._results_to_docs_and_scores = _patched_results_to_docs_and_scores

In [282]:
docs

[Document(metadata={'creationdate': '2024-09-23T19:05:56+06:30', 'creator': 'Adobe Illustrator 27.0 (Windows)', 'moddate': '2024-09-24T18:55:02+05:30', 'page': 3, 'page_label': '4', 'producer': 'Adobe PDF library 16.07', 'source': '/tmp/tmpkqr5datr.pdf', 'title': '16-CHL_Product_iSelect Smart 360 term plan_Sales Literature_V_2 23 SEP 2024', 'total_pages': 24}, page_content='Note: Premium amounts shown are basis Physical Medical and exclusive of taxes\nCOVERAGE OPTION\nThe beneﬁts available under the Plan Op�on Life will be based on the Coverage Op�ons chosen by you at incep�on. Similarly, a \nWorking Spouse can choose any of these Coverage Op�ons at policy incep�on. These op�ons, once chosen, cannot be altered \nduring the Policy Term.\n1. Level Cover: Your Sum Assured remains same throughout the Policy Term. However, if you have opted for regular premium')]

In [279]:
# retrieve the docs , pass it in the prompt , along with the query 

result = plan_options_chain.invoke(plan_options_query)

  

ValidationError: 1 validation error for Document
page_content
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.10/v/string_type