# Interactive Insurance Query-Answering System

## Overview
This notebook demonstrates an interactive Question-Answering (QA) system for insurance policies using a Retrieval-Augmented Generation (RAG) pipeline. The system integrates semantic search, document embeddings, caching, cross-encoder reranking, and GPT-based response generation.

## Features
- Extracts and processes text, including tables, from insurance policy documents (or generic documents of your choice with minor tweaks).
- Generates semantic embeddings for text retrieval.
- Implements caching for efficient query handling.
- Reranks results using a cross-encoder for higher accuracy.
- Synthesizes detailed responses using a model of your choice.
- Interactive query inputs and visualizations.

## 1. Setup and Configuration

### User-Defined Variables
Configure these before running the notebook.

You can create and manage your OpenAI API keys here - https://platform.openai.com/api-keys

OpenAI model details are available here - https://platform.openai.com/docs/models#gpt-4o-mini

A sample set of insurance policy documents is available here - https://www.hdfclife.com/policy-documents

In [None]:
import os  # imported here because this cell runs before the main import cell

DOCUMENT_LOCATION = "/content/drive/MyDrive/rag_test/policy_documents"  # Path to insurance PDFs
# Prefer the OPENAI_API_KEY environment variable so the secret never has to be saved inside the
# notebook; the placeholder is only used when that variable is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "your_key_here")  # Or replace with your OpenAI API key
MODEL_NAME = "gpt-4o-mini"  # GPT model for response generation
CHROMA_PATH = "/content/drive/MyDrive/rag_test/chromadb"  # Path for ChromaDB storage, can be an empty folder on your GDrive

### Mounting GDrive

In [None]:
# Mount Google Drive when running on Colab. Outside Colab the google.colab package is missing,
# so the mount is skipped with a message instead of aborting "Run All".
try:
    from google.colab import drive
except ImportError:
    IN_COLAB = False
    print("google.colab is not available - skipping Google Drive mount (running locally).")
else:
    IN_COLAB = True
    drive.mount('/content/drive')

### Required Libraries

In [None]:
!pip install openai chromadb pdfplumber tiktoken sentence-transformers -q
# Replace ! with % if using Jupyter

In [None]:
import openai
import pdfplumber
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from chromadb import PersistentClient
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction, SentenceTransformerEmbeddingFunction
from sentence_transformers import CrossEncoder
from IPython.display import display
import ipywidgets as widgets
import numpy as np
import time
import ast
# Set the openai module-level API key from the value chosen in the configuration cell.
openai.api_key = OPENAI_API_KEY

## 2. Data Preparation
Processes and filters the extracted text from insurance policy PDFs for further analysis.
### PDF Extraction and Metadata Processing

In [None]:
def check_boxes(word, table_box):
    """
    Tell whether a word's bounding box lies strictly inside a table's bounding box.

    Parameters:
    - word (dict): Mapping with 'x0', 'top', 'x1' and 'bottom' entries.
    - table_box (sequence): Box whose items 0..3 are compared against the word's
      x0, top, x1 and bottom respectively.

    Returns:
    - bool: True only when all four edges of the word are strictly within the box
      (a word touching an edge does not count as inside).
    """
    word_x0, word_top, word_x1, word_bottom = (
        word['x0'], word['top'], word['x1'], word['bottom']
    )
    # Guard clauses: bail out on the first edge that is not strictly inside.
    if not word_x0 > table_box[0]:
        return False
    if not word_top > table_box[1]:
        return False
    if not word_x1 < table_box[2]:
        return False
    return word_bottom < table_box[3]

def _table_to_text(rows):
    """Flatten one extracted table (rows of cell strings or None) into pipe-separated lines."""
    rendered_rows = []
    for row in rows:
        # Collapse whitespace/newlines inside a cell; empty cells become "".
        cells = [" ".join(str(cell).split()) if cell is not None else "" for cell in row]
        if any(cells):
            rendered_rows.append(" | ".join(cells))
    return "\n".join(rendered_rows)


def extract_text_from_pdf(pdf_path):
    """
    Extract text and tables from PDF documents.

    For every page, words that fall inside a detected table are removed from the
    running text (so they are not duplicated) and each table is then appended to the
    page text as pipe-separated rows. Previously the table words were filtered out
    and never added back, so table content was silently lost.

    Parameters:
    - pdf_path (str): Path to the PDF file.

    Returns:
    - list: One [page label, page text] pair per page, where the label is
      "Page <n>" (1-based) and the text is the non-table words followed by the
      page's tables.
    """
    full_text = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            tables = page.find_tables()
            table_boxes = [table.bbox for table in tables]
            # Keep only the words that are not strictly inside any table box.
            words = [
                word['text'] for word in page.extract_words()
                if not any(check_boxes(word, box) for box in table_boxes)
            ]
            sections = [" ".join(words)]
            for table in tables:
                table_text = _table_to_text(table.extract())
                if table_text:
                    sections.append(table_text)
            page_text = "\n".join(section for section in sections if section)
            full_text.append([f"Page {page_number}", page_text])
            # Debugging Output
            print(f"Page {page_number}: Extracted {len(words)} words of text and {len(tables)} table(s).")
    return full_text

pdf_dir = Path(DOCUMENT_LOCATION)
data = []
# Sort the files so the row order (and the positional ids derived from it when the pages
# are stored in ChromaDB) is the same on every run; glob order is otherwise arbitrary.
for pdf_path in sorted(pdf_dir.glob("*.pdf")):
    print(f"Processing {pdf_path.name}...")
    extracted = extract_text_from_pdf(pdf_path)
    df = pd.DataFrame(extracted, columns=['Page No.', 'Page_Text'])
    df['Document_Name'] = pdf_path.name
    data.append(df)

# Fail with an actionable message instead of pandas' "No objects to concatenate".
if not data:
    raise FileNotFoundError(
        f"No PDF files found in '{pdf_dir}'. Check DOCUMENT_LOCATION and that the drive is mounted."
    )
insurance_pdfs_data = pd.concat(data, ignore_index=True)

### Metadata and Filtering

In [None]:
# Pages with fewer words than this are treated as noise (covers, blank pages) and dropped.
MIN_WORDS_PER_PAGE = 10

insurance_pdfs_data['Text_Length'] = insurance_pdfs_data['Page_Text'].apply(lambda x: len(x.split()))
# .copy() makes the filtered frame independent, so adding a column below does not
# trigger SettingWithCopyWarning.
insurance_pdfs_data = insurance_pdfs_data[insurance_pdfs_data['Text_Length'] >= MIN_WORDS_PER_PAGE].copy()
# Built with a comprehension rather than apply(axis=1), which also works when no page survives the filter.
insurance_pdfs_data['Metadata'] = [
    {"Policy_Name": document_name, "Page_No": page_no}
    for document_name, page_no in zip(
        insurance_pdfs_data['Document_Name'], insurance_pdfs_data['Page No.']
    )
]
print("Final processed DataFrame:")
print(insurance_pdfs_data.head())

## 3. Embedding and Semantic Search
Generates semantic embeddings and retrieves relevant documents using ChromaDB.
### Embedding Setup

In [None]:
# ChromaDB client whose data is stored under CHROMA_PATH.
client = PersistentClient(path=CHROMA_PATH)

# A single sentence-transformer embedding function is shared by both collections.
embedding_function = SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")

# Main collection: the policy pages.
collection = client.get_or_create_collection(
    name='Insurance',
    embedding_function=embedding_function,
)
# Cache collection: previously seen queries with their stored results.
cache_collection = client.get_or_create_collection(name='Insurance_Cache', embedding_function=embedding_function)

### Adding Documents to Collection

In [None]:
# upsert (rather than add) keeps this cell re-runnable against the persistent store:
# ids that already exist are overwritten with the current page text instead of being
# rejected or left stale. Ids are positional, so entries left over from an earlier,
# larger run are not removed here.
collection.upsert(
    documents=insurance_pdfs_data["Page_Text"].tolist(),
    ids=[str(i) for i in range(len(insurance_pdfs_data))],
    metadatas=insurance_pdfs_data['Metadata'].tolist()
)
print("Documents successfully added to ChromaDB collection.")

### Querying with Caching

In [None]:
def retrieve_documents(query, n_results=5, cache_threshold=0.2):
    """
    Retrieve top documents using semantic search with caching.

    The cache collection is searched for the nearest previously seen query. Its stored
    results are reused only when the distance is within `cache_threshold` AND the entry
    holds at least `n_results` results; the answer is then trimmed to `n_results`.
    Otherwise the main collection is queried and the cache entry is written (or
    refreshed) under the query text as id.

    Parameters:
    - query (str): User query.
    - n_results (int): Number of results to retrieve.
    - cache_threshold (float): Maximum distance to the nearest cached query for a cache hit.

    Returns:
    - DataFrame: Top documents with columns "Documents", "Metadata" and "Distances".
    """
    cache_results = cache_collection.query(query_texts=[query], n_results=1)
    cache_distances = cache_results['distances'][0]
    if cache_distances and cache_distances[0] <= cache_threshold:
        cache_data = cache_results['metadatas'][0][0]

        # Results are stored as stringified Python literals; literal_eval restores them.
        documents = ast.literal_eval(cache_data['documents'])
        distances = ast.literal_eval(cache_data['distances'])
        metadatas = ast.literal_eval(cache_data['metadatas'])

        # Entries written before 'n_results' was recorded fall back to their stored length.
        cached_n = cache_data.get('n_results', len(documents))
        if cached_n >= n_results:
            print("Cache hit for query:", query)
            print("Cache retrieved data:", cache_data)
            return pd.DataFrame({
                "Documents": documents[:n_results],
                "Metadata": metadatas[:n_results],
                "Distances": distances[:n_results]
            })
        print(f"Cached entry covers {cached_n} results but {n_results} were requested.")

    print("Cache miss. Retrieving from main collection.")
    results = collection.query(query_texts=[query], n_results=n_results)
    # upsert so that an existing entry for this exact query is refreshed instead of
    # colliding on the id.
    cache_collection.upsert(
        documents=[query],
        ids=[query],
        metadatas=[{
            "documents": str(results['documents'][0]),
            "distances": str(results['distances'][0]),
            "metadatas": str(results['metadatas'][0]),
            "n_results": n_results
        }]
    )
    print("Query added to cache.")
    print("Retrieved data from collection:", results)
    return pd.DataFrame({
        "Documents": results['documents'][0],
        "Metadata": results['metadatas'][0],
        "Distances": results['distances'][0]
    })

## 4. Cross-Encoder Reranking
Applies cross-encoder models to refine document ranking based on relevance to the query.

In [None]:
# Load the cross-encoder once at module level; rerank_results() reuses this instance for scoring.
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

def rerank_results(query, results_df):
    """
    Rerank results using a cross-encoder.

    The input DataFrame is left untouched; a new DataFrame with an extra
    'Reranked_Scores' column is returned, sorted from highest to lowest score.

    Parameters:
    - query (str): User query.
    - results_df (DataFrame): Retrieved documents; must contain a 'Documents' column.

    Returns:
    - DataFrame: Reranked results (empty, with the score column, if the input is empty).
    """
    if results_df.empty:
        # Nothing to score; still return the expected column so callers can rely on it.
        print("No documents to rerank.")
        return results_df.assign(Reranked_Scores=[])

    inputs = [[query, doc] for doc in results_df['Documents']]
    scores = cross_encoder.predict(inputs)
    # assign() builds a new frame, so the caller's DataFrame is not mutated.
    reranked = results_df.assign(Reranked_Scores=scores).sort_values(
        by='Reranked_Scores', ascending=False
    )
    print("Reranking complete. Top scores:")
    # Report the highest scores after sorting, not the first three in retrieval order.
    print(reranked['Reranked_Scores'].head(3).tolist())
    return reranked

## 5. Response Generation

In [None]:
def _format_sources(top_docs):
    """Render each retrieved page as a numbered block: citation header, then the full page text."""
    blocks = []
    pairs = zip(top_docs['Documents'], top_docs['Metadata'])
    for position, (document, metadata) in enumerate(pairs, start=1):
        metadata = metadata if isinstance(metadata, dict) else {}
        policy = metadata.get('Policy_Name', 'Unknown policy')
        page = metadata.get('Page_No', 'Unknown page')
        blocks.append(f"[Source {position}] Policy: {policy} | {page}\n{document}")
    return "\n\n".join(blocks) if blocks else "No relevant documents were retrieved."


def generate_response(query, top_docs):
    """
    Generate a response using your required model.

    The retrieved pages are written into the prompt in full. Interpolating the
    DataFrame itself would insert its printed representation, which shortens long
    cell text, so the model would only see the first few words of each page.

    Parameters:
    - query (str): User query.
    - top_docs (DataFrame): Retrieved documents with 'Documents' and 'Metadata' columns.

    Returns:
    - str: Generated response.
    """
    sources = _format_sources(top_docs)

    # All instructions live in the system/user messages. A trailing "assistant" message
    # would be read by the model as something it already said, not as guidance.
    system_prompt = (
        "You are a helpful assistant specializing in insurance who can effectively answer user queries about insurance policies and documents. "
        "You should provide clear, accurate, and concise answers directly addressing the user's query. "
        "Use the provided search results to address the user's question directly and cite the relevant policies and pages. "
        "Ensure the response is easy to read and formatted clearly for the user."
    )
    user_prompt = "\n".join([
        f"A user has asked the following question: '{query}'.",
        "",
        "You also have access to the search results below, extracted from insurance policy documents.",
        "- Each source starts with a header giving the policy name and page number to use as the citation.",
        "- The text under each header is the content extracted from that policy page.",
        "",
        "Use the following guidelines to formulate your response:",
        "1. Provide accurate, relevant numbers or details from the documents, if available.",
        "2. Summarize or restructure tables present in the text into a tabular format for clarity.",
        "3. Use only the information relevant to the query and ignore irrelevant details.",
        "4. Cite the policy name and page number from the source headers to support your answer.",
        "5. If you cannot provide a complete answer, suggest sections or topics to help the user locate relevant information in the cited documents.",
        "6. Do not disclose internal workings or mention limitations; respond as a customer-facing assistant.",
        "",
        "Format the final response as clear, user-friendly text followed by properly formatted citations. If the query is irrelevant, state so clearly.",
        "",
        "Search results:",
        sources,
    ])
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    # Generate the response using the OpenAI API
    response = openai.chat.completions.create(model=MODEL_NAME, messages=messages)
    answer = response.choices[0].message.content

    print("Generated response:")
    print(answer)

    return answer


## 6. Interactive Query, Reranking, and Visualization
This section integrates querying, reranking, and visualization for an end-to-end user workflow. This can be used to test out the model.


### Integrated Workflow with Visualization

In [None]:
# Widgets for the interactive demo: a text box for the question, a button to run it,
# and an output area that on_submit writes into.
query_input = widgets.Text(
    description="Query:",
    placeholder="Enter your query...",
)
submit_button = widgets.Button(
    description="Submit",
)
output_area = widgets.Output()

def _plot_document_scores(df, column, color, ylabel, title):
    """Draw one bar per retrieved page, labelled by policy and page so pages from different PDFs stay distinct."""
    labels = [
        f"{meta.get('Policy_Name', '')} - {meta.get('Page_No', '')}"
        for meta in df['Metadata']
    ]
    # Bars are placed by position; using the page label as the x value would merge
    # pages that share a number across different policies into a single bar.
    positions = list(range(len(labels)))
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(positions, df[column].tolist(), color=color)
    ax.set_xticks(positions)
    ax.set_xticklabels(labels, rotation=45, ha="right")
    ax.set_xlabel("Policy / Page No.")
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    fig.tight_layout()
    plt.show()


def visualize_relevance(df):
    """
    Plot the retrieval distance of each retrieved page.

    Parameters:
    - df (DataFrame): Retrieved documents with 'Metadata' and 'Distances' columns.
    """
    _plot_document_scores(
        df,
        column='Distances',
        color='skyblue',
        # The plotted values are distances, so smaller bars mean closer matches.
        ylabel="Distance (lower is more relevant)",
        title="Relevance of Retrieved Documents",
    )


def visualize_reranking(df):
    """
    Plot the cross-encoder score of each reranked page.

    Parameters:
    - df (DataFrame): Reranked documents with 'Metadata' and 'Reranked_Scores' columns.
    """
    _plot_document_scores(
        df,
        column='Reranked_Scores',
        color='orange',
        ylabel="Reranked Score",
        title="Reranked Relevance of Documents",
    )

def integrated_workflow(query):
    """
    Run retrieval, reranking and answer generation for one query, printing and
    plotting the intermediate results along the way.

    Parameters:
    - query (str): User query.

    Returns:
    - tuple: (reranked DataFrame, generated response string).
    """
    # Stage 1: semantic retrieval
    retrieved = retrieve_documents(query)
    print("Top retrieved documents (before reranking):")
    print(retrieved[['Documents', 'Metadata']])
    visualize_relevance(retrieved)

    # Stage 2: cross-encoder reranking
    ranked = rerank_results(query, retrieved)
    print("Top reranked documents:")
    print(ranked[['Documents', 'Metadata', 'Reranked_Scores']])
    visualize_reranking(ranked)

    # Stage 3: answer from the three best-ranked documents
    answer = generate_response(query, ranked.head(3))
    print("Generated Response:")
    print(answer)

    return ranked, answer

def on_submit(b):
    """
    Button callback: run the full workflow for the text currently in the query box.

    Output is rendered inside `output_area`, which is cleared first. Blank input is
    rejected with a message instead of being sent through retrieval and the model.

    Parameters:
    - b: The clicked button (unused).
    """
    with output_area:
        output_area.clear_output()
        query = query_input.value.strip()
        if not query:
            print("Please enter a query before submitting.")
            return
        print(f"Processing query: {query}")
        integrated_workflow(query)

# Register on_submit as the button's click handler, then show the three widgets.
submit_button.on_click(on_submit)
display(query_input, submit_button, output_area)


## 7. Testing with Multiple Queries
Checking the system's functionality with predefined test queries.

In [None]:
# Sample questions used to exercise the whole pipeline end to end
queries = [
    "Do all the insurance policies cover diabetic patients?",
    "What is the average premium rate for individuals above 60 years of age?",
    "Is there a waiting period for pre-existing conditions for the policies?",
]

# Number the queries from 1 for the printed headers.
for idx, test_query in zip(range(1, len(queries) + 1), queries):
    print(f"\nRunning test query {idx}: {test_query}")
    _, test_response = integrated_workflow(test_query)
    print(f"Response for Query {idx}:\n{test_response}\n")