In [1]:
!pip install python-doctr[torch] "opencv-python-headless<5"
!pip install pdfplumber
!pip install transformers
!pip install langdetect
!pip install sentencepiece
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
!pip install streamlit
!pip install pillow





Looking in indexes: https://download.pytorch.org/whl/cpu


In [2]:
# Repeats the CPU-only PyTorch install command that the first cell already issues.
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

Looking in indexes: https://download.pytorch.org/whl/cpu


In [3]:
# NOTE(review): no visible cell imports tf_keras directly; presumably it is a
# transitive requirement of transformers in this environment — confirm or drop.
!pip install tf-keras



In [4]:
# ============================================================
# 🧩 INSURANCE CLAIM SUMMARIZER — FIXED FOR LONG REPORTS
# ============================================================

# ---------------------------
# 📦 Imports
# ---------------------------
import re
import pdfplumber
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
from langdetect import detect
from transformers import MarianTokenizer, MarianMTModel, pipeline

# ---------------------------
# ⚙️ Load OCR + Summarization Models
# ---------------------------
print("‚è≥ Loading models...")

# DocTR OCR predictor, built with pretrained weights.
ocr_model = ocr_predictor(pretrained=True)

# Hugging Face checkpoint used for summarization. A larger checkpoint such as
# "facebook/bart-large-cnn" can be substituted for better quality at lower speed.
SUMMARIZATION_MODEL_NAME = "sshleifer/distilbart-cnn-12-6"

summarizer = pipeline(task="summarization", model=SUMMARIZATION_MODEL_NAME)

print("‚úÖ Models loaded successfully!\n")


# ============================================================
# 🧠 Utility Functions
# ============================================================

def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text of a PDF, preferring its embedded text layer over OCR.

    Every page is read with pdfplumber first. If at least one page yields text,
    the non-empty page texts are joined with newlines, stripped and returned.
    Otherwise the whole document is passed through the module-level DocTR
    ``ocr_model`` and its rendered text is returned as-is.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text (embedded text layer, or OCR output as a fallback).
    """
    # Pass 1: embedded (digital) text.
    with pdfplumber.open(pdf_path) as pdf:
        page_texts = [page.extract_text() for page in pdf.pages]

    embedded = "\n".join(t for t in page_texts if t).strip()
    if embedded:
        return embedded

    # Pass 2: nothing embedded, so run OCR over the rendered pages.
    print("üîç Using OCR for scanned PDF...")
    return ocr_model(DocumentFile.from_pdf(pdf_path)).render()


def extract_text_from_image(image_path: str) -> str:
    """Run the module-level DocTR ``ocr_model`` on an image and return its rendered text.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The text produced by ``render()`` on the OCR result.
    """
    return ocr_model(DocumentFile.from_images(image_path)).render()


def auto_detect_language(text: str) -> str:
    """Detect the language of ``text`` and return its base language code.

    Detection is delegated to ``langdetect.detect``. Regional variants such as
    ``"zh-cn"`` are reduced to their base code (``"zh"``) because the result is
    later interpolated into ``Helsinki-NLP/opus-mt-{src}-{tgt}`` model names,
    which are keyed by base language codes.

    Args:
        text: Text whose language should be detected.

    Returns:
        A lower-case language code. Falls back to ``"en"`` when detection
        raises (for example on empty or non-linguistic input) or returns
        nothing.
    """
    try:
        code = detect(text)
    except Exception:
        # Deliberate best-effort: an undetectable input is treated as English.
        return "en"

    if not code:
        return "en"
    # "zh-cn" -> "zh"; codes without a region suffix are returned unchanged.
    return code.split("-")[0].lower()


# ---------------------------
# 🌐 Translation (kept same stack, but safe)
# ---------------------------

# Loaded MarianMT (tokenizer, model) pairs keyed by (src_lang, tgt_lang), so a
# language pair is read from disk / downloaded only once per kernel session.
_MARIAN_CACHE = {}


def translate_text(text: str, src_lang: str, tgt_lang: str, max_chunk_chars: int = 1500) -> str:
    """Translate ``text`` with a MarianMT (Helsinki-NLP opus-mt) model, chunk by chunk.

    Whitespace is collapsed, the text is cut into pieces of at most
    ``max_chunk_chars`` characters (ending on a period where one lies more than
    100 characters into the window), each piece is translated independently and
    the results are joined with single spaces.

    Args:
        text: Text to translate.
        src_lang: Source language code used in the model name.
        tgt_lang: Target language code used in the model name.
        max_chunk_chars: Maximum number of characters per translated piece.

    Returns:
        The translated text; ``""`` for empty/whitespace-only input; the
        original ``text`` unchanged when the model for the language pair
        cannot be loaded.

    Raises:
        ValueError: If ``max_chunk_chars`` is smaller than 1 (the chunking loop
            could never advance).
    """
    if max_chunk_chars < 1:
        raise ValueError("max_chunk_chars must be a positive integer")

    # Check for empty input before touching any model: nothing to translate.
    normalized = re.sub(r"\s+", " ", text.strip()) if text else ""
    if not normalized:
        return normalized

    cache_key = (src_lang, tgt_lang)
    try:
        if cache_key not in _MARIAN_CACHE:
            model_name = f"Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}"
            _MARIAN_CACHE[cache_key] = (
                MarianTokenizer.from_pretrained(model_name),
                MarianMTModel.from_pretrained(model_name),
            )
        tokenizer, model = _MARIAN_CACHE[cache_key]
    except Exception as e:
        # Best-effort: an unsupported language pair degrades to "no translation".
        print(f"‚ö†Ô∏è Translation model for {src_lang}->{tgt_lang} not found ({e}). Using original text.")
        return text

    # Cut the text into windows so each piece stays within the tokenizer limit
    # used below (pieces longer than 512 tokens are still truncated).
    chunks = []
    start, n = 0, len(normalized)
    while start < n:
        end = min(start + max_chunk_chars, n)
        split = normalized.rfind(".", start, end)
        if split == -1 or split <= start + 100:
            # No period, or one so early that the piece would be tiny: hard cut.
            split = end
        else:
            split += 1  # keep the period with the sentence it ends
        piece = normalized[start:split].strip()
        if piece:
            chunks.append(piece)
        start = split

    translated_chunks = []
    for piece in chunks:
        inputs = tokenizer(
            piece,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        )
        out_ids = model.generate(**inputs, max_length=512)
        translated = tokenizer.decode(out_ids[0], skip_special_tokens=True)
        translated_chunks.append(translated.strip())

    return " ".join(translated_chunks).strip()


# ---------------------------
# 🧠 Long-text summarization
# ---------------------------

def _summarize_chunk(chunk: str, max_length: int = 160, min_length: int = 50) -> str:
    """Call HF summarizer safely on one chunk."""
    if not chunk.strip():
        return ""
    # Hugging Face pipeline expects text <= model max tokens. We approximate via chars.
    result = summarizer(
        chunk,
        max_length=max_length,
        min_length=min_length,
        do_sample=False,
    )
    return result[0]["summary_text"].strip()


def summarize_long_text(
    text: str,
    max_chunk_chars: int = 2500,
    chunk_summary_max_len: int = 160,
    chunk_summary_min_len: int = 50,
) -> str:
    """
    Summarize very long claim reports by:
      1. Cleaning + normalizing text
      2. Splitting into overlapping chunks
      3. Summarizing each chunk
      4. Optionally summarizing the concatenated summaries again
    """
    if not text or not text.strip():
        return "‚ö†Ô∏è No text detected in the document."

    # Basic normalization
    text = re.sub(r"\s+", " ", text.strip())

    if len(text) <= max_chunk_chars:
        # Short enough: one-shot summarization
        return _summarize_chunk(
            text,
            max_length=chunk_summary_max_len,
            min_length=chunk_summary_min_len,
        )

    # 1Ô∏è‚É£ Split into chunks at sentence boundaries where possible
    chunks = []
    start = 0
    n = len(text)
    while start < n:
        end = min(start + max_chunk_chars, n)
        # Try to end at a period close to the limit
        split = text.rfind(".", start, end)
        if split == -1 or split <= start + 400:  # no good split; just hard cut
            split = end
        else:
            split += 1  # include period
        chunk = text[start:split].strip()
        if chunk:
            chunks.append(chunk)
        start = split

    # 2Ô∏è‚É£ Summarize each chunk
    partial_summaries = []
    for i, c in enumerate(chunks, start=1):
        print(f"üß© Summarizing chunk {i}/{len(chunks)} (len={len(c)} chars)")
        # If chunk is tiny, keep it as is
        if len(c.split()) < 40:
            partial_summaries.append(c)
        else:
            s = _summarize_chunk(
                c,
                max_length=chunk_summary_max_len,
                min_length=chunk_summary_min_len,
            )
            partial_summaries.append(s)

    # 3Ô∏è‚É£ Combine summaries; if still long, summarize again
    combined = " ".join(partial_summaries)
    combined = re.sub(r"\s+", " ", combined).strip()

    if len(combined) <= max_chunk_chars:
        final = _summarize_chunk(
            combined,
            max_length=chunk_summary_max_len,
            min_length=chunk_summary_min_len,
        )
        return final

    # If still very long, just return concatenated summaries
    return combined


def summarize_claim(text: str) -> str:
    """Produce a summary of an insurance claim text via ``summarize_long_text``.

    Args:
        text: Claim text (expected to be English at this point of the pipeline).

    Returns:
        The stripped summary, or a fixed warning message when ``text`` contains
        only whitespace.
    """
    has_content = bool(text.strip())
    if has_content:
        return summarize_long_text(text).strip()
    return "‚ö†Ô∏è No text detected in the document."


# ---------------------------
# 🔗 End-to-end pipelines
# ---------------------------

def process_claim_file(file_path: str, back_translate: bool = True) -> str:
    """Explain the claim contained in a PDF, image or text file.

    The file's text is extracted according to its extension and then handed to
    ``process_claim_text``, which performs language detection, translation to
    English, summarization and optional back-translation. Delegating keeps the
    two entry points from drifting apart.

    Args:
        file_path: Path to a ``.pdf``, ``.jpg``/``.jpeg``/``.png`` or ``.txt`` file.
        back_translate: Translate the English summary back to the detected
            source language when that language is not English.

    Returns:
        The formatted explanation, or a warning message when the file type is
        unsupported or no text could be extracted.
    """
    file_lower = file_path.lower()
    if file_lower.endswith(".pdf"):
        raw_text = extract_text_from_pdf(file_path)
    elif file_lower.endswith((".jpg", ".jpeg", ".png")):
        raw_text = extract_text_from_image(file_path)
    elif file_lower.endswith(".txt"):
        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
            raw_text = f.read()
    else:
        return "‚ö†Ô∏è Unsupported file type."

    if not raw_text or not raw_text.strip():
        return "‚ö†Ô∏è No readable text found in the document."

    print("\n‚úÖ Extracted Raw Text (first 400 chars):\n", raw_text[:400], "\n")

    # From here on the steps are exactly those of the text entry point.
    return process_claim_text(raw_text, back_translate=back_translate)


def process_claim_text(text: str, back_translate: bool = True) -> str:
    """Explain a claim given directly as text.

    The language is detected; non-English text is translated to English,
    summarized, and (when ``back_translate`` is true) the summary is translated
    back to the detected language.

    Args:
        text: Claim text in any language.
        back_translate: Whether to return the summary in the source language.

    Returns:
        The formatted explanation, or a warning message for whitespace-only input.
    """
    if not text.strip():
        return "‚ö†Ô∏è Empty text."

    src_lang = auto_detect_language(text)
    print(f"üåê Detected Language: {src_lang}")

    needs_translation = src_lang != "en"
    english_text = translate_text(text, src_lang, "en") if needs_translation else text

    summary = summarize_claim(english_text)
    if needs_translation and back_translate:
        summary = translate_text(summary, "en", src_lang)

    return f"üßæ **Customer-Friendly Explanation:**\n\n{summary}"


# ============================================================
# 🧪 Example Tests
# ============================================================

# Inline sample used to exercise the text entry point.
claim_report = """
Claim ID 48290 was rejected because the health insurance policy had expired
before the hospital admission date. The claim was submitted after the coverage
period ended. The insured had last renewed the policy on 01-Apr-2022 with a
grace period of 30 days; however, the hospitalization occurred on 15-May-2022
after the expiry of the grace period. As per policy conditions, claims after
expiry of coverage are not admissible.
"""

print("\nüîπ Direct Text Test:")
print(process_claim_text(claim_report))

# One run of the file entry point per supported input type.
for heading, sample_path in (
    ("\nüîπ PDF Test:", "test_files/sample_claim.pdf"),
    ("\nüîπ Image Test:", "test_files/claim_image.png"),
    ("\nüîπ Text File Test:", "test_files/claim_note.txt"),
):
    print(heading)
    print(process_claim_file(sample_path))


‚è≥ Loading models...




Device set to use cpu


‚úÖ Models loaded successfully!


üîπ Direct Text Test:


Your max_length is set to 160, but your input_length is only 91. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=45)


üåê Detected Language: en


Your max_length is set to 160, but your input_length is only 99. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=49)


üßæ **Customer-Friendly Explanation:**

Claim ID 48290 was rejected because the health insurance policy had expired before the hospital admission date . The claim was submitted after the coverage period ended . The insured had last renewed the policy on 01-Apr-2022 with a grace period of 30 days .

üîπ PDF Test:

‚úÖ Extracted Raw Text (first 400 chars):
 Claim ID: 39561
Patient Name: Priya Sharma
Date of Admission: 21 Sept 2024
Date of Discharge: 25 Sept 2024
Hospital: Medico Hospital, Pune
Diagnosis: Viral Fever
Treatment: Hospitalization and IV Fluids
Claim Amount: ‚Çπ32,000
Approved Amount: ‚Çπ28,500
Claim Status: APPROVED WITH REDUCTION
Remarks: Non-medical expenses such as food and toiletries are not covered under the
policy. 

üåê Detected Language: en
üßæ **Customer-Friendly Explanation:**

Priya Sharma was admitted to Medico Hospital, Pune with Viral Fever at 21 Sept 2024 . She was diagnosed with viral fever and was admitted with IV Fluids at the hospital . Food and toilet

Your max_length is set to 160, but your input_length is only 52. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=26)



‚úÖ Extracted Raw Text (first 400 chars):
 Claim ID: 50923
Patient Name: Amit Patel
Date: 5 July 2024
Hospital: CityCare Hospital, Nagpur
Diagnosis: Dental Extraction
Claim Status: DENIED
Reason: Dental procedures are not covered under the current insurance plan. 

üåê Detected Language: en
üßæ **Customer-Friendly Explanation:**

Dental procedures are not covered under the current insurance plan . Claim ID: 50923 Patient Name: Amit Patel Date: 5 July 2024 Hospital: CityCare Hospital, Nagpur Diagnosis: Dental Extraction Claim Status: DENIED .

üîπ Text File Test:

‚úÖ Extracted Raw Text (first 400 chars):
 INSURANCE COMPANY: SecureLife Health Insurance Pvt. Ltd.
PRODUCT: SecureLife Gold ‚Äì Family Floater
Policy Number: SLH/FG/2023/0192837
Claim ID: CLM/2024/000784
UHID: UHID-99821

Patient Name: Mr. Rohan Sharma
Age/Gender: 42 / Male
Relationship to Proposer: Self

Hospital Name: CityCare Multispeciality Hospital
Hospital City: Pune, Maharashtra
Hospital Type: Network Hospital

Adm

In [5]:
# Packages for the Streamlit app cell.
# NOTE(review): pytesseract is not imported by any visible cell — confirm it is still needed.
!pip install streamlit transformers torch sentencepiece pytesseract pdfplumber Pillow



In [6]:
# NOTE(review): neither sumy nor python-docx is imported by any visible cell —
# confirm they are still needed.
!pip install sumy python-docx



In [5]:
%%writefile claim_explainer_chatbot_final.py
%%writefile claim_explainer_chatbot_app.py
# ============================================================
# 🧾 Insurance Claim Explanation Chatbot — Smart Q&A Version
# (With Summary Download Feature Added)
# ============================================================

import re
import os
import tempfile
from pathlib import Path

import streamlit as st
import pdfplumber

# ---------- Optional OCR (DocTR) ----------
try:
    from doctr.io import DocumentFile
    from doctr.models import ocr_predictor
    DOCTR_AVAILABLE = True
except Exception:
    DocumentFile = None
    ocr_predictor = None
    DOCTR_AVAILABLE = False

# ---------- Gen AI imports ----------
from langdetect import detect
from transformers import pipeline, MarianTokenizer, MarianMTModel

# ------------------------------------------------------------
# Streamlit page setup & custom styles
# ------------------------------------------------------------
# Set the browser-tab title and icon and select the wide page layout.
st.set_page_config(page_title="Claim Explanation Chatbot", page_icon="üßæ", layout="wide")

# Inject the app's custom CSS: page background, title/subtitle typography, the
# pill-shaped upload container, chat message width, input text colour and footer.
# unsafe_allow_html=True is passed so the raw <style> element is emitted as HTML.
st.markdown("""
<style>
.main { background: radial-gradient(circle at top left, #eef2ff 0, #f9fafb 45%, #fdf2ff 100%); }
.block-container { max-width: 1000px; padding-top: 2.5rem; }
.app-title{text-align:center;font-size:32px;color:#111827;font-weight:800;margin-bottom:4px;line-height:1.3;padding-top:4px;}
.app-sub{text-align:center;color:#4b5563;margin-bottom:12px;}
.chat-upload-wrapper{display:flex;justify-content:center;margin-bottom:6px;}
.chat-upload{background:rgba(255,255,255,0.95);border-radius:999px;padding:4px 10px;border:1px solid #e5e7eb;
box-shadow:0 10px 25px rgba(15,23,42,0.08);display:flex;align-items:center;gap:6px;}
.chat-upload-label{font-size:13px;color:#4b5563;}
.chat-upload .stFileUploader > label {font-size:0;padding:0;}
.chat-upload .stFileUploader > div {padding:0;}
.chat-helper{font-size:13px;color:#6b7280;margin:4px 0 8px 0;}
.footer{text-align:center;color:#9ca3af;font-size:12px;margin-top:18px;}
.stChatMessage {max-width: 780px;margin-left:auto;margin-right:auto;}
textarea, input, .stChatInput textarea { color: #000000 !important; font-weight: 500; }
textarea::placeholder { color: #9ca3af; }
</style>
""", unsafe_allow_html=True)

# Page title and subtitle, rendered as HTML so the CSS classes above apply.
st.markdown('<div class="app-title">üßæ Claim Explanation Chatbot</div>', unsafe_allow_html=True)
st.markdown('<div class="app-sub">Upload a claim report and ask anything about its status, amount, or reason.</div>', unsafe_allow_html=True)

# ------------------------------------------------------------
# Models
# ------------------------------------------------------------
@st.cache_resource(show_spinner=False)
def load_ocr_model():
    """Build the DocTR OCR predictor, or return ``None`` when OCR cannot be used.

    Returns:
        The predictor created by ``ocr_predictor(pretrained=True)``; ``None``
        when DocTR could not be imported or building the predictor raised.
    """
    predictor = None
    if DOCTR_AVAILABLE:
        try:
            predictor = ocr_predictor(pretrained=True)
        except Exception:
            # Best-effort: any failure while building the predictor disables OCR.
            predictor = None
    return predictor


# Shared OCR predictor for the app (None when OCR is unavailable).
ocr_model = load_ocr_model()

# Hugging Face checkpoint used for the free-text ("AI") summary.
SUMMARIZATION_MODEL_NAME = "sshleifer/distilbart-cnn-12-6"


@st.cache_resource(show_spinner=False)
def load_summarizer():
    """Create the Hugging Face summarization pipeline for ``SUMMARIZATION_MODEL_NAME``."""
    return pipeline(task="summarization", model=SUMMARIZATION_MODEL_NAME)


# Shared summarization pipeline for the app.
summarizer = load_summarizer()

# ------------------------------------------------------------
# Utility Functions
# ------------------------------------------------------------
def clean_text(t: str) -> str:
    """Normalize line endings and blank-line runs in extracted text.

    ``\\r\\n`` and lone ``\\r`` become ``\\n``, runs of three or more newlines
    are reduced to two, and leading/trailing whitespace is removed.

    Args:
        t: Raw text; may be empty or ``None``.

    Returns:
        The cleaned text, or ``""`` for falsy input.
    """
    if not t:
        return ""
    unified = re.sub(r"\r\n?", "\n", t)
    collapsed = re.sub(r"\n{3,}", "\n\n", unified)
    return collapsed.strip()

def extract_text(file) -> str:
    """Extract cleaned text from an uploaded PDF, image or text file.

    The upload is written to a temporary file (the readers used here take a
    path), processed according to its extension, and the temporary file is
    always removed afterwards.

    * ``.pdf``: embedded text via pdfplumber; if that yields nothing and an OCR
      model is available, the pages are OCR'd with DocTR (scanned PDFs).
    * ``.jpg``/``.jpeg``/``.png``: DocTR OCR, when an OCR model is available.
    * ``.txt``: read as UTF-8, ignoring undecodable bytes.

    Args:
        file: Uploaded file object exposing ``name`` and ``read()``.

    Returns:
        The extracted text passed through ``clean_text``; ``""`` when nothing
        could be extracted or the type is not handled.
    """
    suffix = Path(file.name).suffix.lower()
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
    text = ""
    try:
        # Writing happens inside the try so a failed read/write cannot leak the file.
        tmp.write(file.read())
        tmp.close()

        if suffix == ".pdf":
            with pdfplumber.open(tmp.name) as pdf:
                text = "\n".join((p.extract_text() or "") for p in pdf.pages)
            if not text.strip() and ocr_model:
                # No text layer: treat as a scanned document.
                text = ocr_model(DocumentFile.from_pdf(tmp.name)).render()
        elif suffix in (".jpg", ".jpeg", ".png") and ocr_model:
            text = ocr_model(DocumentFile.from_images(tmp.name)).render()
        elif suffix == ".txt":
            with open(tmp.name, encoding="utf-8", errors="ignore") as fh:
                text = fh.read()
    finally:
        tmp.close()  # no-op if already closed; the handle must be closed before unlinking
        os.unlink(tmp.name)
    return clean_text(text)

def extract_info(txt: str) -> dict:
    """Pull the key claim fields out of report text with regular expressions.

    All patterns are case-insensitive and the first match in the text wins.

    Args:
        txt: Report text; ``None`` is treated as empty.

    Returns:
        A dict with the keys ``claim_id``, ``policy_number``, ``patient``,
        ``hospital``, ``claim_amount``, ``approved_amount`` and ``status``.
        A value is the captured string, or ``None`` when the field was not found.
    """
    source = txt or ""

    def grab(pattern):
        m = re.search(pattern, source, re.I)
        if not m:
            return None
        return m.group(1).strip() or None

    # Optional currency marker (up to 4 non-digit characters such as a symbol,
    # "Rs." or "INR") followed by a comma-grouped number with optional decimals.
    amount = r"[:\s]*[^\d\s]{0,4}\s*(\d+(?:,\d+)*(?:\.\d+)?)"

    info = {}
    info["claim_id"] = grab(r"Claim ID[:\s]*([A-Z0-9/-]+)")
    # Allow a "Number"/"No."/"#" label and require a digit in the value, so
    # that neither the label itself nor prose like "policy had expired" is captured.
    info["policy_number"] = grab(
        r"Policy(?:\s*(?:Number|No\b\.?|#))?[:\s]*((?=[A-Z0-9/-]*\d)[A-Z0-9/-]+)"
    )
    # Dots are allowed inside the name (titles such as "Mr."), but not at its end.
    info["patient"] = grab(r"Patient Name[:\s]*([A-Za-z](?:[A-Za-z .]*[A-Za-z])?)")
    # Require a "Hospital:" / "Hospital Name:" label; other uses of the word
    # ("Hospital City:", "the hospital admission date") are skipped.
    info["hospital"] = grab(r"Hospital(?:\s*Name)?\s*:\s*([A-Za-z ]+)")
    info["claim_amount"] = grab(r"Claim Amount" + amount)
    info["approved_amount"] = grab(r"Approved Amount" + amount)
    info["status"] = grab(r"Status[:\s]*([A-Za-z ]+)")

    return info

def quick_summary(info: dict) -> str:
    """Render the populated fields of ``info`` as ``Label: value`` lines.

    Keys are turned into labels by replacing underscores with spaces and
    title-casing; entries with falsy values are skipped.

    Args:
        info: Mapping of field name to extracted value (or ``None``).

    Returns:
        The lines joined with newlines (``""`` when no field is populated).
    """
    rows = []
    for field, value in info.items():
        if not value:
            continue
        label = field.replace("_", " ").title()
        rows.append(f"{label}: {value}")
    return "\n".join(rows)

def summarize_long_text(text: str, max_chunk_chars: int = 2000) -> str:
    """Summarize report text of any length with the module-level ``summarizer``.

    Texts of fewer than 50 words are returned unchanged. Texts that fit in
    ``max_chunk_chars`` characters are summarized in one call. Longer texts are
    cut into consecutive pieces (ending on a period when one lies in the second
    half of the window), each piece is summarized, and the partial summaries
    are joined. The joined text is summarized once more when it fits in a
    single window. Nothing past the first window is silently discarded.

    Args:
        text: Report text.
        max_chunk_chars: Maximum characters handed to the summarizer per call.

    Returns:
        The summary text (``""`` for empty input).

    Raises:
        ValueError: If ``max_chunk_chars`` is smaller than 1.
    """
    if max_chunk_chars < 1:
        raise ValueError("max_chunk_chars must be a positive integer")
    if not text:
        return ""

    min_words = 50  # below this there is nothing worth compressing
    if len(text.split()) < min_words:
        return text

    def _summarize(piece: str) -> str:
        result = summarizer(
            piece,
            max_length=160,
            min_length=60,
            do_sample=False,
            truncation=True,  # characters only approximate the model's token limit
        )
        return result[0]['summary_text']

    if len(text) <= max_chunk_chars:
        return _summarize(text)

    pieces = []
    start, n = 0, len(text)
    while start < n:
        end = min(start + max_chunk_chars, n)
        cut = text.rfind(".", start, end)
        # Use the period only if it keeps the piece reasonably large.
        cut = end if cut < start + max_chunk_chars // 2 else cut + 1
        piece = text[start:cut].strip()
        if piece:
            pieces.append(piece)
        start = cut

    partials = [p if len(p.split()) < min_words else _summarize(p) for p in pieces]
    combined = " ".join(partials)

    if len(combined) <= max_chunk_chars and len(combined.split()) >= min_words:
        return _summarize(combined)
    return combined

def answer_question(q, info, structured, nlp):
    """Answer a user question from the extracted claim data using keyword rules.

    Rules, checked in order against the lower-cased question:
      * contains "status" or "approved": decision and approved amount
      * contains "amount": claimed and approved amounts
      * contains "explain" or "summary": the free-text summary
      * anything else: the structured summary

    Fields that were not extracted are shown as "Not available" instead of
    the literal text "None", and an empty summary never produces an empty reply.

    Args:
        q: The user's question.
        info: Dict of extracted fields (see ``extract_info``); may be ``None``.
        structured: Structured ``Label: value`` summary text.
        nlp: Free-text summary.

    Returns:
        The reply text.
    """
    q = (q or "").lower()
    info = info or {}
    missing = "Not available"

    def amount(key):
        # Prefix the currency marker only when there is a value to show.
        value = info.get(key)
        return f"‚Çπ{value}" if value else missing

    if "status" in q or "approved" in q:
        return f"Decision: {info.get('status') or missing}\nApproved Amount: {amount('approved_amount')}"
    if "amount" in q:
        return f"Claimed: {amount('claim_amount')}\nApproved: {amount('approved_amount')}"
    if "explain" in q or "summary" in q:
        return nlp or missing
    return structured or nlp or missing

# ‚úÖ NEW: BUILD DOWNLOADABLE SUMMARY
def build_downloadable_summary(info, structured, nlp):
    """Assemble the plain-text report offered for download.

    The report consists of a title, one ``Label: value`` line per populated
    field of ``info``, the structured summary, the AI summary and a footer.
    Empty summaries are replaced by "Not available".

    Args:
        info: Dict of extracted fields.
        structured: Structured summary text.
        nlp: Free-text summary.

    Returns:
        The report as a single newline-joined string.
    """
    field_lines = [
        f"{key.replace('_', ' ').title()}: {value}"
        for key, value in info.items()
        if value
    ]
    sections = (
        ["====== CLAIM SUMMARY REPORT ======\n"]
        + field_lines
        + [
            "\n--- STRUCTURED SUMMARY ---\n",
            structured or "Not available",
            "\n--- AI SUMMARY ---\n",
            nlp or "Not available",
            "\nGenerated by Claim Explanation Chatbot",
        ]
    )
    return "\n".join(sections)

# ------------------------------------------------------------
# Session State
# ------------------------------------------------------------
# Seed the chat history with the assistant's greeting on first load.
if "messages" not in st.session_state:
    greeting = {
        "role": "assistant",
        "content": "Hi üëã First, attach a claim report using the ‚ûï Upload button.\nThen ask your questions.",
    }
    st.session_state.messages = [greeting]

# Create the per-document slots once; "raw_text" acts as the marker for all of them.
if "raw_text" not in st.session_state:
    for state_key in ("raw_text", "file_name", "info", "structured_summary", "nlp_summary"):
        st.session_state[state_key] = None
    st.session_state.show_download = False

# ------------------------------------------------------------
# Upload UI
# ------------------------------------------------------------
st.markdown('<div class="chat-upload-wrapper"><div class="chat-upload">', unsafe_allow_html=True)
st.markdown('<span class="chat-upload-label">‚ûï Upload claim report</span>', unsafe_allow_html=True)
# The widget gets a real label (hidden via label_visibility) instead of an empty string.
uploaded_file = st.file_uploader(
    "Upload claim report",
    type=["pdf","jpg","jpeg","png","txt"],
    label_visibility="collapsed",
)
st.markdown('</div></div>', unsafe_allow_html=True)

if uploaded_file is not None:
    # Identify an upload by name AND size so that a different file that happens
    # to reuse the previous file's name is still processed.
    upload_signature = (uploaded_file.name, uploaded_file.size)
    if st.session_state.get("upload_signature") != upload_signature:
        with st.spinner("Processing report..."):
            raw_text = extract_text(uploaded_file)
            info = extract_info(raw_text)
            structured_summary = quick_summary(info)
            nlp_summary = summarize_long_text(raw_text)

            # An empty extraction is stored as None so the rest of the app
            # treats it as "no report loaded".
            st.session_state.raw_text = raw_text or None
            st.session_state.info = info
            st.session_state.structured_summary = structured_summary
            st.session_state.nlp_summary = nlp_summary
            st.session_state.file_name = uploaded_file.name
            st.session_state.upload_signature = upload_signature
            st.session_state.show_download = False

    if not st.session_state.raw_text:
        st.warning("No readable text could be extracted from this file.")

# ------------------------------------------------------------
# Chat UI
# ------------------------------------------------------------
# Replay the conversation so far.
for msg in st.session_state.messages:
    with st.chat_message(msg["role"]):
        st.markdown(msg["content"])

user_prompt = st.chat_input("Type your question...")

if user_prompt:
    st.session_state.messages.append({"role":"user","content":user_prompt})
    # The history loop above ran before this message existed, so render it
    # explicitly; otherwise it would only appear on the next rerun.
    with st.chat_message("user"):
        st.markdown(user_prompt)

    user_lower = user_prompt.lower()

    # Without a loaded report there is nothing to answer from or to download,
    # so this check comes first.
    if not st.session_state.raw_text:
        reply = "Please upload a claim report first."

    elif "download" in user_lower and "summary" in user_lower:
        st.session_state.show_download = True
        reply = "‚úÖ Your summary is ready to download below."

    else:
        reply = answer_question(
            user_prompt,
            st.session_state.info,
            st.session_state.structured_summary,
            st.session_state.nlp_summary,
        )

    with st.chat_message("assistant"):
        st.markdown(reply)
    st.session_state.messages.append({"role":"assistant","content":reply})

# ------------------------------------------------------------
# ‚úÖ DOWNLOAD BUTTON (ONLY AFTER USER ASKS)
# ------------------------------------------------------------
# Offer the report only once the user has asked for it and a report is loaded.
download_ready = bool(st.session_state.show_download and st.session_state.raw_text)
if download_ready:
    st.download_button(
        label="‚¨áÔ∏è Download Claim Summary",
        data=build_downloadable_summary(
            st.session_state.info,
            st.session_state.structured_summary,
            st.session_state.nlp_summary,
        ),
        file_name="claim_summary.txt",
        mime="text/plain",
    )

# Page footer.
st.markdown('<div class="footer">Claim Explanation Chatbot ¬∑ OCR + NLP powered</div>', unsafe_allow_html=True)


Overwriting claim_explainer_chatbot_final.py


In [None]:
# Launch the Streamlit app file produced by the %%writefile cell.
# NOTE(review): `streamlit run` presumably keeps this cell busy until the
# server is stopped — confirm, or start it from a terminal instead.
!streamlit run claim_explainer_chatbot_final.py
