In [None]:
# STEP 1: Install dependencies
# Quiet (-q) install of everything the import cell below needs:
#   langchain / langchain-community -> text splitter, FAISS wrapper, embeddings wrapper
#   faiss-cpu                       -> vector index backend
#   sentence-transformers           -> the all-MiniLM-L6-v2 embedding model
#   PyPDF2                          -> PDF text extraction
#   google-generativeai             -> Gemini client
# NOTE(review): no versions are pinned, so a fresh runtime may install releases
# where the `langchain.*` import paths used below have moved — confirm and pin.
!pip install -q langchain langchain-community faiss-cpu sentence-transformers PyPDF2 google-generativeai

In [None]:
# STEP 2: Imports
import os
import json
import re
import time
import google.api_core.exceptions
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
from google import generativeai as genai
from google.colab import userdata
from google.colab import files

In [None]:
# STEP 3: Load and extract PDF content
def extract_text_from_pdf(file_path):
    """Read a PDF and return the concatenated text of its pages.

    Args:
        file_path: Passed unchanged to ``PdfReader``.

    Returns:
        str: The text of every page for which ``extract_text()`` returned a
        non-empty value, each followed by a newline. Pages with no text are
        skipped, so the result is ``""`` when no page yields text.
    """
    print("📄 Reading and extracting text from PDF...")
    reader = PdfReader(file_path)
    page_texts = []
    for page_number, page in enumerate(reader.pages, start=1):
        # Extract once and reuse the result instead of parsing the same page
        # a second time just to append it.
        page_text = page.extract_text()
        if page_text:
            print(f" Extracted text from page {page_number}")
            page_texts.append(page_text + "\n")
    return "".join(page_texts)

In [None]:
# STEP 4: Advanced Chunking using recursive splitting
def chunk_text(text, chunk_size=800, chunk_overlap=100):
    """Split raw document text into overlapping chunks for embedding.

    Args:
        text: The full document text to split.
        chunk_size: ``chunk_size`` handed to ``RecursiveCharacterTextSplitter``.
            Defaults to 800, the value this notebook has always used.
        chunk_overlap: ``chunk_overlap`` handed to the splitter. Defaults to 100.

    Returns:
        The list produced by ``splitter.create_documents([text])``.
    """
    print(" Splitting text into chunks for embedding...")
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = splitter.create_documents([text])
    print(f"🧩 Created {len(chunks)} chunks.")
    return chunks

In [None]:
# STEP 5: Generate embeddings and create FAISS index
# One shared embedder for the whole notebook; create_vectorstore() reads this
# module-level name when it builds the index.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
def create_vectorstore(chunks):
    """Embed the given chunks and index them in a FAISS vector store.

    Args:
        chunks: Documents to index (the pipeline passes the output of
            ``chunk_text``).

    Returns:
        The store returned by ``FAISS.from_documents`` using the module-level
        ``embedding_model``.

    Raises:
        ValueError: If ``chunks`` is empty, which happens when no text could
            be extracted from the PDF (for example a scanned document).
    """
    # An index cannot be built from zero vectors; fail here with an actionable
    # message instead of an opaque error from inside the FAISS wrapper.
    if not chunks:
        raise ValueError(
            "No text chunks to index: the PDF appears to contain no extractable text."
        )
    print("📦 Generating embeddings and building FAISS index...")
    return FAISS.from_documents(chunks, embedding_model)

In [None]:
# STEP 6: Configure Gemini API
# The key is read from Colab's secret store, so it never appears in the notebook.
api_key = userdata.get("GOOGLE_API_KEY")
genai.configure(api_key=api_key)

# Shared model handle used by ask_gemini().
model = genai.GenerativeModel(model_name="gemini-2.5-pro")

In [None]:
# STEP 7: Prompt Template for Structured Response
def build_prompt(context, query):
    """Assemble the instruction prompt sent to Gemini.

    Args:
        context: Policy text to ground the answer in.
        query: The user's question.

    Returns:
        str: A prompt that begins and ends with a newline and contains the
        role instruction, the policy clauses, the user query, and the JSON
        shape (Decision / Amount / Justification) the model must return.
    """
    # Example of the JSON object the model is asked to emit.
    response_schema = (
        "{\n"
        '  "Decision": "Approved / Rejected / Unknown",\n'
        '  "Amount": "₹ value or \'N/A\'",\n'
        '  "Justification": "Explain clearly with the exact clause or rule quoted verbatim."\n'
        "}"
    )
    # One entry per output line; the empty strings are the blank lines (and the
    # leading/trailing newline) of the final prompt.
    prompt_lines = [
        "",
        "You are an expert insurance assistant. Use the policy text below to answer the user query.",
        "",
        "Policy Clauses:",
        f"{context}",
        "",
        "User Query:",
        f"{query}",
        "",
        "Return only valid JSON object like:",
        response_schema,
        "Only output the JSON object. Do not wrap it in markdown, triple quotes, or anything else.",
        "",
    ]
    return "\n".join(prompt_lines)

In [None]:
# STEP 8: Retrieve relevant chunks
def retrieve_context(query, vectordb, k=3):
    """Fetch the chunks most relevant to the query and join them into one string.

    Args:
        query: The user's question.
        vectordb: Vector store exposing ``as_retriever`` (the pipeline passes
            the FAISS store from ``create_vectorstore``).
        k: Number of chunks to request from the retriever. Defaults to 3.

    Returns:
        str: The ``page_content`` of each retrieved document, separated by
        ``"\\n---\\n"``. Empty string when nothing is retrieved.
    """
    print("🔍 Retrieving relevant chunks from FAISS store...")
    retriever = vectordb.as_retriever(search_kwargs={"k": k})
    # `invoke` is the supported retriever entry point; `get_relevant_documents`
    # is deprecated in current LangChain releases.
    docs = retriever.invoke(query)
    print(f"📚 Retrieved {len(docs)} matching document chunks.")
    return "\n---\n".join(doc.page_content for doc in docs)

In [None]:
# STEP 9: Ask Gemini and extract only JSON (with timeout handling)
def ask_gemini(context, query):
    """Send the grounded prompt to Gemini and return its JSON answer as text.

    The request is attempted up to two times; only a ``DeadlineExceeded``
    timeout triggers a retry. Any other failure ends the call immediately.

    Args:
        context: Policy text passed to ``build_prompt``.
        query: The user's question passed to ``build_prompt``.

    Returns:
        str: One of
        - the parsed JSON object re-serialised with 2-space indentation
          (non-ASCII characters such as ``₹`` are kept readable);
        - a message containing the raw model output when it held no valid JSON;
        - a message describing the error when the request failed before any
          output was received;
        - ``" Request failed after retries."`` when every attempt timed out.
    """
    print(" Sending prompt to Gemini...")
    prompt = build_prompt(context, query)
    max_attempts = 2
    for attempt in range(max_attempts):
        # Stays None until the model has actually produced text, so the error
        # handler below never touches an unbound or unreadable response.
        raw_text = None
        try:
            response = model.generate_content(prompt)
            raw_text = response.text
            # Greedy match from the first "{" to the last "}" tolerates any
            # prose or markdown fences the model adds around the object.
            match = re.search(r"{.*}", raw_text, re.DOTALL)
            if match is None:
                raise ValueError("no JSON object found in the model output")
            parsed = json.loads(match.group())
            return json.dumps(parsed, indent=2, ensure_ascii=False)
        except google.api_core.exceptions.DeadlineExceeded:
            print(" Timeout occurred, retrying...")
            # Back off only when another attempt will follow.
            if attempt < max_attempts - 1:
                time.sleep(2)
        except Exception as e:
            print(f" Gemini error: {e}")
            if raw_text is None:
                # The call itself failed (or the response carried no text).
                return f" Gemini request failed before any output was received: {e}"
            return " Gemini did not return valid JSON.\n\nRaw Output:\n" + raw_text
    return " Request failed after retries."


In [None]:
# STEP 10: End-to-end pipeline
def process_file_and_query(file_path, query):
    """Run the full pipeline for one PDF and one question.

    Args:
        file_path: Location of the policy PDF.
        query: The user's question.

    Returns:
        Whatever ``ask_gemini`` returns for the retrieved context and query.
    """
    print("\n Starting document analysis pipeline...")

    # Indexing: PDF text -> chunks -> vector store (evaluated innermost first).
    index = create_vectorstore(chunk_text(extract_text_from_pdf(file_path)))

    # Retrieval: pull the clauses most relevant to this question.
    relevant_clauses = retrieve_context(query, index)

    print("\n Running Gemini inference...")
    answer = ask_gemini(relevant_clauses, query)
    return answer

In [None]:
# STEP 11: Manual Input Interface in Colab
print(" Please upload a policy PDF file to begin...")
uploaded = files.upload()

# If nothing was uploaded (e.g. the dialog was dismissed), stop with a clear
# message rather than the bare StopIteration that next() would raise.
if not uploaded:
    raise ValueError("No file was uploaded. Re-run this cell and select a policy PDF.")

# Only the first uploaded file is analysed; iterating `uploaded` yields file names.
filepath = next(iter(uploaded))
print(f" Uploaded file: {filepath}")

query = input("\n🔍 Enter your query (e.g. 46M, knee surgery, 3-month-old policy): ")
response = process_file_and_query(filepath, query)

print("\n Gemini Response:")
print(response)

📤 Please upload a policy PDF file to begin...
