In [1]:
!pip install python-doctr[torch] "opencv-python-headless<5"
!pip install pdfplumber
!pip install transformers
!pip install langdetect
!pip install sentencepiece
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
!pip install streamlit
!pip install pillow





Looking in indexes: https://download.pytorch.org/whl/cpu


In [2]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

Looking in indexes: https://download.pytorch.org/whl/cpu


In [None]:
!pip install tf-keras

In [3]:
# ============================================================
# 🧩 INSURANCE CLAIM SUMMARIZER — FIXED FOR LONG REPORTS
# ============================================================

# ---------------------------
# 📦 Imports
# ---------------------------
import re
import pdfplumber
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
from langdetect import detect
from transformers import MarianTokenizer, MarianMTModel, pipeline

# ---------------------------
# ⚙️ Load OCR + Summarization Models
# ---------------------------
# Progress message shown before the two models below are built.
print("‚è≥ Loading models...")

# DocTR OCR predictor with pretrained weights. The same instance is reused by
# extract_text_from_pdf (scanned-PDF fallback) and extract_text_from_image.
ocr_model = ocr_predictor(pretrained=True)

# Summarization checkpoint name. Swap it for a larger model if you want better
# quality at the cost of speed, e.g. "facebook/bart-large-cnn".
SUMMARIZATION_MODEL_NAME = "sshleifer/distilbart-cnn-12-6"

# Hugging Face summarization pipeline; _summarize_chunk calls this global.
summarizer = pipeline(
    "summarization",
    model=SUMMARIZATION_MODEL_NAME,
)

print("‚úÖ Models loaded successfully!\n")


# ============================================================
# 🧠 Utility Functions
# ============================================================

def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text of a PDF, preferring its embedded text layer over OCR.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The stripped text collected by pdfplumber when at least one page
        yields text; otherwise the rendered output of the DocTR OCR model
        run on the same file.
    """
    # Pass 1: read the digital text layer page by page; pages that yield
    # nothing are skipped, every other page contributes its text plus "\n".
    with pdfplumber.open(pdf_path) as pdf:
        page_texts = [page.extract_text() for page in pdf.pages]
    digital_text = "".join(f"{page_text}\n" for page_text in page_texts if page_text)

    stripped = digital_text.strip()
    if stripped:
        return stripped

    # Pass 2: nothing extractable, so treat the file as a scan and OCR it.
    print("üîç Using OCR for scanned PDF...")
    ocr_result = ocr_model(DocumentFile.from_pdf(pdf_path))
    # render() gives the plain-text representation of the OCR result.
    return ocr_result.render()


def extract_text_from_image(image_path: str) -> str:
    """Run the DocTR OCR model on an image file and return the rendered text.

    Args:
        image_path: Path of the image to read.

    Returns:
        The text produced by ``render()`` on the OCR result.
    """
    pages = DocumentFile.from_images(image_path)
    return ocr_model(pages).render()


def auto_detect_language(text: str) -> str:
    """Return the language code that ``detect`` reports for ``text``.

    Any exception raised during detection is swallowed and English ("en")
    is assumed instead, so callers always receive a code.
    """
    try:
        language_code = detect(text)
    except Exception:
        language_code = "en"
    return language_code


# ---------------------------
# 🌐 Translation (kept same stack, but safe)
# ---------------------------

def translate_text(text: str, src_lang: str, tgt_lang: str, max_chunk_chars: int = 1500) -> str:
    """
    Translate ``text`` with a MarianMT (Helsinki-NLP ``opus-mt``) model, chunk by chunk.

    Args:
        text: Text to translate.
        src_lang: Source language code used in the model name.
        tgt_lang: Target language code used in the model name.
        max_chunk_chars: Maximum number of characters per translated chunk.

    Returns:
        The translated chunks joined by single spaces. An empty string is
        returned for empty/whitespace-only input, and the original ``text`` is
        returned unchanged when the model for the language pair cannot be loaded.

    Raises:
        ValueError: If ``max_chunk_chars`` is smaller than 1 (the chunking loop
            could never advance).
    """
    # Normalize whitespace first so that empty input returns immediately and
    # never triggers a model download.
    normalized = re.sub(r"\s+", " ", text.strip())
    if not normalized:
        return normalized

    if max_chunk_chars < 1:
        raise ValueError("max_chunk_chars must be a positive integer")

    try:
        model_name = f"Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}"
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        model = MarianMTModel.from_pretrained(model_name)
    except Exception as e:
        print(f"‚ö†Ô∏è Translation model for {src_lang}->{tgt_lang} not found ({e}). Using original text.")
        return text

    # Split into chunks of at most max_chunk_chars characters so each one is
    # less likely to hit the 512-token truncation below.
    chunks = []
    start = 0
    n = len(normalized)
    while start < n:
        end = min(start + max_chunk_chars, n)
        # Prefer to end the chunk at the last period inside the window.
        split = normalized.rfind(".", start, end)
        if split == -1 or split <= start + 100:  # no good period, or too early
            split = end
            if end < n:
                # Hard cut: back up to the last space so no word is split in
                # half across two chunks (text has single spaces only here).
                word_break = normalized.rfind(" ", start + 1, end + 1)
                if word_break != -1:
                    split = word_break
        else:
            split += 1  # include the period
        chunks.append(normalized[start:split].strip())
        start = split

    translated_chunks = []
    for c in chunks:
        if not c.strip():
            continue
        inputs = tokenizer(
            c,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        )
        out_ids = model.generate(**inputs, max_length=512)
        translated = tokenizer.decode(out_ids[0], skip_special_tokens=True)
        translated_chunks.append(translated.strip())

    return " ".join(translated_chunks).strip()


# ---------------------------
# 🧠 Long-text summarization
# ---------------------------

def _summarize_chunk(chunk: str, max_length: int = 160, min_length: int = 50) -> str:
    """Call HF summarizer safely on one chunk."""
    if not chunk.strip():
        return ""
    # Hugging Face pipeline expects text <= model max tokens. We approximate via chars.
    result = summarizer(
        chunk,
        max_length=max_length,
        min_length=min_length,
        do_sample=False,
    )
    return result[0]["summary_text"].strip()


def summarize_long_text(
    text: str,
    max_chunk_chars: int = 2500,
    chunk_summary_max_len: int = 160,
    chunk_summary_min_len: int = 50,
) -> str:
    """
    Summarize a long claim report by:
      1. Normalizing whitespace
      2. Splitting into consecutive (non-overlapping) chunks of at most
         ``max_chunk_chars`` characters, ending at a period where possible
      3. Summarizing each chunk (chunks under 40 words are kept verbatim)
      4. Summarizing the concatenated summaries once more when they fit into
         a single chunk; otherwise returning them concatenated

    Args:
        text: Report text.
        max_chunk_chars: Maximum characters per chunk.
        chunk_summary_max_len: ``max_length`` passed to ``_summarize_chunk``.
        chunk_summary_min_len: ``min_length`` passed to ``_summarize_chunk``.

    Returns:
        The summary, or a warning message when ``text`` is empty.

    Raises:
        ValueError: If ``max_chunk_chars`` is smaller than 1 (the chunking loop
            could never advance).
    """
    if not text or not text.strip():
        return "‚ö†Ô∏è No text detected in the document."

    if max_chunk_chars < 1:
        raise ValueError("max_chunk_chars must be a positive integer")

    # Basic normalization
    text = re.sub(r"\s+", " ", text.strip())

    if len(text) <= max_chunk_chars:
        # Short enough: one-shot summarization
        return _summarize_chunk(
            text,
            max_length=chunk_summary_max_len,
            min_length=chunk_summary_min_len,
        )

    # Step 1: split into chunks at sentence boundaries where possible
    chunks = []
    start = 0
    n = len(text)
    while start < n:
        end = min(start + max_chunk_chars, n)
        # Try to end at a period close to the limit
        split = text.rfind(".", start, end)
        if split == -1 or split <= start + 400:  # no good period, or too early
            split = end
            if end < n:
                # Hard cut: back up to the last space so no word is split in
                # half across two chunks (text has single spaces only here).
                word_break = text.rfind(" ", start + 1, end + 1)
                if word_break != -1:
                    split = word_break
        else:
            split += 1  # include period
        chunk = text[start:split].strip()
        if chunk:
            chunks.append(chunk)
        start = split

    # Step 2: summarize each chunk
    partial_summaries = []
    for i, c in enumerate(chunks, start=1):
        print(f"üß© Summarizing chunk {i}/{len(chunks)} (len={len(c)} chars)")
        # If chunk is tiny, keep it as is
        if len(c.split()) < 40:
            partial_summaries.append(c)
        else:
            s = _summarize_chunk(
                c,
                max_length=chunk_summary_max_len,
                min_length=chunk_summary_min_len,
            )
            partial_summaries.append(s)

    # Step 3: combine summaries; if they fit in one chunk, summarize again
    combined = " ".join(partial_summaries)
    combined = re.sub(r"\s+", " ", combined).strip()

    if len(combined) <= max_chunk_chars:
        final = _summarize_chunk(
            combined,
            max_length=chunk_summary_max_len,
            min_length=chunk_summary_min_len,
        )
        return final

    # Still longer than one chunk: return the concatenated summaries as-is
    return combined


def summarize_claim(text: str) -> str:
    """Return a stripped, long-text-aware summary of an insurance claim.

    Whitespace-only input yields a warning message instead of a summary;
    everything else is delegated to ``summarize_long_text``.
    """
    if text.strip() == "":
        return "‚ö†Ô∏è No text detected in the document."
    return summarize_long_text(text).strip()


# ---------------------------
# 🔗 End-to-end pipelines
# ---------------------------

def process_claim_file(file_path: str, back_translate: bool = True) -> str:
    """
    File entry point: extract the text of a claim document, then run the same
    detect -> translate -> summarize -> back-translate pipeline as
    ``process_claim_text``.

    Args:
        file_path: Path of a ``.pdf``, ``.jpg``/``.jpeg``/``.png`` or ``.txt`` file
            (the extension check is case-insensitive).
        back_translate: Translate the English summary back to the detected
            source language when that language is not English.

    Returns:
        The formatted explanation, or a warning message when the file type is
        unsupported or no readable text was extracted.
    """
    # Step 1: extract text based on file type
    file_lower = file_path.lower()
    if file_lower.endswith(".pdf"):
        raw_text = extract_text_from_pdf(file_path)
    elif file_lower.endswith((".jpg", ".jpeg", ".png")):
        raw_text = extract_text_from_image(file_path)
    elif file_lower.endswith(".txt"):
        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
            raw_text = f.read()
    else:
        return "‚ö†Ô∏è Unsupported file type."

    if not raw_text.strip():
        return "‚ö†Ô∏è No readable text found in the document."

    print("\n‚úÖ Extracted Raw Text (first 400 chars):\n", raw_text[:400], "\n")

    # Steps 2-5 were a line-for-line copy of process_claim_text; delegate so
    # the two entry points cannot drift apart. raw_text is known to be
    # non-empty here, so the callee's own empty-text guard never fires.
    return process_claim_text(raw_text, back_translate=back_translate)


def process_claim_text(text: str, back_translate: bool = True) -> str:
    """Explain a claim given directly as text.

    The language is detected, non-English text is translated to English,
    the English text is summarized, and, when ``back_translate`` is set and
    the source was not English, the summary is translated back.

    Returns:
        The formatted explanation, or a warning for whitespace-only input.
    """
    if not text.strip():
        return "‚ö†Ô∏è Empty text."

    src_lang = auto_detect_language(text)
    print(f"üåê Detected Language: {src_lang}")

    needs_translation = src_lang != "en"
    english_text = translate_text(text, src_lang, "en") if needs_translation else text

    english_summary = summarize_claim(english_text)

    if needs_translation and back_translate:
        customer_summary = translate_text(english_summary, "en", src_lang)
    else:
        customer_summary = english_summary

    return f"üßæ **Customer-Friendly Explanation:**\n\n{customer_summary}"


# ============================================================
# 🧪 Example Tests
# ============================================================

# Inline sample claim used to exercise the text-only entry point.
claim_report = """
Claim ID 48290 was rejected because the health insurance policy had expired
before the hospital admission date. The claim was submitted after the coverage
period ended. The insured had last renewed the policy on 01-Apr-2022 with a
grace period of 30 days; however, the hospitalization occurred on 15-May-2022
after the expiry of the grace period. As per policy conditions, claims after
expiry of coverage are not admissible.
"""

# 1) Text passed directly to process_claim_text.
print("\nüîπ Direct Text Test:")
print(process_claim_text(claim_report))

# 2-4) File-based runs, one per supported file type. The paths are relative,
# so the sample files presumably need to exist under ./test_files next to the
# notebook (TODO confirm).
print("\nüîπ PDF Test:")
print(process_claim_file("test_files/sample_claim.pdf"))

print("\nüîπ Image Test:")
print(process_claim_file("test_files/claim_image.png"))

print("\nüîπ Text File Test:")
print(process_claim_file("test_files/claim_note.txt"))


‚è≥ Loading models...




Device set to use cpu


‚úÖ Models loaded successfully!


üîπ Direct Text Test:


Your max_length is set to 160, but your input_length is only 91. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=45)


üåê Detected Language: en


Your max_length is set to 160, but your input_length is only 99. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=49)


üßæ **Customer-Friendly Explanation:**

Claim ID 48290 was rejected because the health insurance policy had expired before the hospital admission date . The claim was submitted after the coverage period ended . The insured had last renewed the policy on 01-Apr-2022 with a grace period of 30 days .

üîπ PDF Test:

‚úÖ Extracted Raw Text (first 400 chars):
 Claim ID: 39561
Patient Name: Priya Sharma
Date of Admission: 21 Sept 2024
Date of Discharge: 25 Sept 2024
Hospital: Medico Hospital, Pune
Diagnosis: Viral Fever
Treatment: Hospitalization and IV Fluids
Claim Amount: ‚Çπ32,000
Approved Amount: ‚Çπ28,500
Claim Status: APPROVED WITH REDUCTION
Remarks: Non-medical expenses such as food and toiletries are not covered under the
policy. 

üåê Detected Language: en
üßæ **Customer-Friendly Explanation:**

Priya Sharma was admitted to Medico Hospital, Pune with Viral Fever at 21 Sept 2024 . She was diagnosed with viral fever and was admitted with IV Fluids at the hospital . Food and toilet

Your max_length is set to 160, but your input_length is only 52. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=26)



‚úÖ Extracted Raw Text (first 400 chars):
 Claim ID: 50923
Patient Name: Amit Patel
Date: 5 July 2024
Hospital: CityCare Hospital, Nagpur
Diagnosis: Dental Extraction
Claim Status: DENIED
Reason: Dental procedures are not covered under the current insurance plan. 

üåê Detected Language: en


Your max_length is set to 160, but your input_length is only 99. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=49)


üßæ **Customer-Friendly Explanation:**

Dental procedures are not covered under the current insurance plan . Claim ID: 50923 Patient Name: Amit Patel Date: 5 July 2024 Hospital: CityCare Hospital, Nagpur Diagnosis: Dental Extraction Claim Status: DENIED .

üîπ Text File Test:

‚úÖ Extracted Raw Text (first 400 chars):
 Claim ID: 48290  
Patient Name: Ramesh Kumar  
Date of Admission: 12 Aug 2024  
Date of Discharge: 16 Aug 2024  
Hospital Name: LifeCare Multispeciality Hospital  

Diagnosis: Acute Appendicitis  
Treatment: Appendectomy  

Claim Amount Submitted: ‚Çπ58,000  
Amount Approved: ‚Çπ0  
Claim Status: REJECTED  

Reason for Rejection: The health insurance policy expired on 10 Aug 2024, two days before admi 

üåê Detected Language: en
üßæ **Customer-Friendly Explanation:**

Claim ID: 48290 Patient Name: Ramesh Kumar Date of Admission: 12 Aug 2024 Date of Discharge: 16 Aug 2024 Hospital Name: LifeCare Multispeciality Hospital Diagnosis: Acute Appendicitis Treatment: Appende

In [4]:
!pip install streamlit transformers torch sentencepiece pytesseract pdfplumber Pillow



In [5]:
!pip install sumy python-docx



In [8]:
%%writefile claim_explainer_app_fixed.py
# claim_explainer_app_fixed.py
# ============================================================
# ⚡ Insurance Claim Explainer — Streamlit + Gen AI Summary
# ============================================================
# - Extracts key fields (Claim ID, Status, Amounts, Reason, etc.)
# - Generates a rule-based explanation
# - PLUS an AI-generated, long-text-aware summary of the full report
#
# Run from terminal or Jupyter:
#   streamlit run claim_explainer_app_fixed.py
# ============================================================

import re
import os
import tempfile
from pathlib import Path

import streamlit as st
import pdfplumber

# ---------- Optional OCR (DocTR) ----------
try:
    from doctr.io import DocumentFile
    from doctr.models import ocr_predictor
    DOCTR_AVAILABLE = True
except Exception:
    DocumentFile = None
    ocr_predictor = None
    DOCTR_AVAILABLE = False

# ---------- Gen AI imports ----------
from langdetect import detect
from transformers import pipeline, MarianTokenizer, MarianMTModel

# ------------------------------------------------------------
# Streamlit page setup & styles
# ------------------------------------------------------------
# Page metadata: title, icon and a centered layout.
st.set_page_config(page_title="Insurance Claim Explainer", page_icon="üßæ", layout="centered")

# Inject the app's CSS classes (title, subtitle, card, result box, footer,
# section headings) as raw HTML.
st.markdown(
    """
<style>
.app-title{text-align:center;font-size:36px;color:#123b5a;font-weight:800;margin-bottom:4px}
.app-sub{text-align:center;color:#52606d;margin-bottom:20px}
.card{background:#fff;border:1px solid #e6edf5;border-radius:16px;padding:20px;box-shadow:0 5px 16px rgba(0,0,0,0.05)}
.result{background:#f7fafc;border:1px solid #e6edf5;border-radius:14px;padding:18px;margin-top:16px;white-space:pre-wrap;font-size:15px;color:#0f1a2d}
.footer{text-align:center;color:#738093;font-size:12px;margin-top:20px}
.section-title{font-weight:700;font-size:18px;margin-top:8px;margin-bottom:6px;color:#123b5a}
.section-sub{font-weight:500;font-size:14px;margin-bottom:8px;color:#52606d}
</style>
""",
    unsafe_allow_html=True,
)

# Page heading and subtitle, styled with the classes defined above.
st.markdown('<div class="app-title">üßæ Claim Explanation</div>', unsafe_allow_html=True)
st.markdown('<div class="app-sub">Instant, clear summaries showing claim approval or rejection status.</div>', unsafe_allow_html=True)

# ------------------------------------------------------------
# OCR model loader (cached)
# ------------------------------------------------------------
@st.cache_resource(show_spinner=False)
def load_ocr_model():
    """Build the pretrained DocTR OCR predictor, or return None when unavailable.

    None is returned both when the DocTR import failed (``DOCTR_AVAILABLE`` is
    False) and when constructing the predictor raises; the error itself is
    deliberately not propagated.
    """
    if DOCTR_AVAILABLE:
        try:
            return ocr_predictor(pretrained=True)
        except Exception:
            # Best effort: fall through and report "no OCR" to the caller.
            pass
    return None

# Shared OCR model instance for the app (None when OCR is unavailable).
ocr_model = load_ocr_model()

# ------------------------------------------------------------
# Gen AI: summarizer loader (cached)
# ------------------------------------------------------------
# Checkpoint used for summarization (small, CPU-friendly).
SUMMARIZATION_MODEL_NAME = "sshleifer/distilbart-cnn-12-6"

@st.cache_resource(show_spinner=False)
def load_summarizer():
    """Create the Hugging Face summarization pipeline for the configured model."""
    summarization_pipeline = pipeline("summarization", model=SUMMARIZATION_MODEL_NAME)
    return summarization_pipeline

# Shared summarizer instance used by _summarize_chunk.
summarizer = load_summarizer()

# ------------------------------------------------------------
# Text cleaning and extraction helpers (robust)
# ------------------------------------------------------------
def clean_text(t: str) -> str:
    """
    Tidy raw document text before field extraction.

    Steps, in order: unify line endings to "\\n", replace separator lines
    (three or more of ``- = _ *``) with a newline, drop trailing spaces/tabs
    on every line, collapse runs of three or more newlines to two, and cut
    the text at a footer/sign-off header.

    Returns "" for empty or None input, otherwise the stripped result.
    """
    if not t:
        return ""

    cleaned = t.replace("\r\n", "\n").replace("\r", "\n")

    # (pattern, replacement, flags) applied one after another.
    for pattern, replacement, flags in (
        (r'(?m)^[\-\=_\*]{3,}\s*$', '\n', 0),   # separator lines
        (r'[ \t]+$', '', re.M),                  # trailing whitespace per line
        (r'\n{3,}', '\n\n', 0),                  # many blank lines
    ):
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)

    # Footer headers; everything from the matching header line on is dropped.
    # NOTE(review): the FIRST header in this list that occurs anywhere wins,
    # not the header that appears earliest in the document; confirm that this
    # priority order is intended.
    cutoff_headers = [
        r'FINAL REMARKS', r'SIGN-?OFF', r'END OF REPORT', r'REVIEWED & SIGNED BY',
        r'APPROVAL & FINANCIAL ASSESSMENT', r'SIGN-OFF ‚Äî CLAIMS DEPARTMENT', r'FINAL REMARKS:'
    ]
    footer = next(
        (
            hit
            for hit in (re.search(fr'(?im)\n{header}.*', cleaned) for header in cutoff_headers)
            if hit
        ),
        None,
    )
    if footer is not None:
        cleaned = cleaned[:footer.start()].strip()

    return cleaned.strip()

def _extract_section(txt: str, header_patterns, stop_patterns=None):
    """
    Find a header (one of header_patterns) and extract the block that follows,
    stopping at the nearest pattern in stop_patterns (if given). Handles:
     - Header on its own line followed by a block
     - Header and value on same line: 'Reason: some text'
    Returns None if not found.
    """
    if not txt:
        return None

    if isinstance(header_patterns, (list, tuple)):
        header_rx = r'(?:' + r'|'.join(header_patterns) + r')'
    else:
        header_rx = r'(?:' + header_patterns + r')'

    # Case 1: header on its own line, then block
    m = re.search(fr'(?im){header_rx}\s*[:\-]?\s*\n', txt)
    if m:
        start = m.end()
        end = len(txt)
        if stop_patterns:
            stops = stop_patterns if isinstance(stop_patterns, (list, tuple)) else [stop_patterns]
            nearest = None
            for s in stops:
                ms = re.search(fr'(?im)\n{s}\s*[:\-]?\s*\n', txt[start:])
                if ms:
                    end_idx = start + ms.start()
                    if nearest is None or end_idx < nearest:
                        nearest = end_idx
            if nearest is not None:
                end = nearest
        section = txt[start:end].strip()
        return section or None

    # Case 2: header and value on same line
    m2 = re.search(fr'(?im){header_rx}\s*[:\-]\s*(.+)', txt)
    if m2:
        val = m2.group(1)
        return val.strip() if isinstance(val, str) and val.strip() else None

    return None

def resolve_status(txt: str, info_status: str = None) -> str:
    """
    Return the status mentioned LAST in the document (later mentions are
    assumed to override earlier ones).

    Three kinds of mention are collected:
      * explicit lines: ``Claim Status: X`` / ``Status - X``
      * ``Automatically updated status: X`` style lines
      * standalone decision keywords (Approved, Rejected, Denied, ...)

    Every mention is recorded with its character position and the one
    furthest into the text wins, regardless of which pattern found it.
    Decision keywords that are only part of an amount label
    ("Approved Amount", "Amount Approved") are not treated as a status.

    Args:
        txt: Document text.
        info_status: Fallback returned when ``txt`` is empty or has no mention.

    Returns:
        The last status mention, or ``info_status``.
    """
    if not txt:
        return info_status

    mentions = []  # (position of the value in txt, value)

    # explicit status lines
    for m in re.finditer(r'(?im)(?:Claim\s*Status|Status)\s*[:\-]\s*([A-Za-z \-]+)', txt):
        val = m.group(1).strip()
        if val:
            mentions.append((m.start(1), val))

    # "Automatically updated status: REJECTED" style
    for m in re.finditer(r'(?im)(?:Automatically updated status|Automatically updated)\s*[:\-]?\s*([A-Za-z \-]+)', txt):
        val = m.group(1).strip()
        if val:
            mentions.append((m.start(1), val))

    # standalone keywords; the lookbehind/lookahead skip the field labels
    # "Amount Approved" and "Approved Amount", which say nothing about the
    # decision (the approved amount of a rejected claim is simply 0).
    keyword_rx = (
        r'(?i)(?<!Amount )\b'
        r'(Approved with Reduction|Partially Approved|Approved|Denied|Rejected|Declined|Full Approval)'
        r'\b(?![ \t]*Amount\b)'
    )
    for m in re.finditer(keyword_rx, txt):
        mentions.append((m.start(1), m.group(1).strip()))

    if not mentions:
        return info_status

    # Stable sort: for mentions starting at the same position the keyword
    # form (collected last) stays last, e.g. "Rejected" rather than
    # "Rejected due to lapse".
    mentions.sort(key=lambda item: item[0])
    return mentions[-1][1]

def extract_info(txt: str) -> dict:
    """
    Extract claim fields from cleaned report text with regular expressions.

    Keys always present (value is None when the label was not found):
    claim_id, policy_number, patient, uhid, hospital, hospital_city,
    admission_date, discharge_date, status.
    Keys present only when found: age, gender, claim_amount, approved_amount,
    reason.

    Both label orders are accepted for dates and the approved amount
    ("Admission Date" / "Date of Admission", "Approved Amount" /
    "Amount Approved").

    Args:
        txt: Document text (None is treated as "").

    Returns:
        Dict of extracted values; all values are strings or None.
    """
    info = {}
    t = txt or ""

    def single_line(pat):
        # Value after "<label> :" or "<label> -" up to the end of that line.
        m = re.search(fr'(?im)(?:{pat})\s*[:\-]\s*(.+)', t)
        if not m:
            return None
        v = m.group(1)
        return v.strip() if isinstance(v, str) and v.strip() else None

    # IDs and simple fields
    # NOTE(review): alternatives that already end in a colon (e.g. 'Age[:]',
    # 'Hospital[:]', 'Policy\s*:') can only match when a second separator
    # follows, because single_line appends its own separator.
    info['claim_id'] = single_line(r'Claim\s*ID|Claim\s*No|Claim\s*Number|CLAIM\s*:|CLAIM\s*:')
    info['policy_number'] = single_line(r'Policy\s*Number|Policy\s*No|POLICY\s*NO|Policy\s*:')
    info['patient'] = single_line(r'Patient\s*Name|PATIENT\s*NAME|Name\b')
    # Age/Gender combined or separate
    ag = single_line(r'Age\s*/\s*Gender|Age\s*\(Years\)\s*/\s*Gender|Age/Gender|AGE/GENDER|Age[:]')
    if ag:
        m = re.search(r'(\d{1,3})\s*/\s*([A-Za-z]+)', ag)
        if m:
            info['age'] = m.group(1)
            info['gender'] = m.group(2)
        else:
            # Looser split on "/", "," or "|": digits are the age, a purely
            # alphabetic part is the gender.
            parts = [p.strip() for p in re.split(r'[/,|]', ag) if p.strip()]
            for p in parts:
                if re.match(r'^\d{1,3}$', p):
                    info['age'] = p
                elif p.isalpha():
                    info['gender'] = p

    info['uhid'] = single_line(r'UHID|UHID[: ]')
    info['hospital'] = single_line(r'Hospital\s*Name|HOSPITAL|Hospital[:]')
    info['hospital_city'] = single_line(r'Hospital\s*City|HOSPITAL\s*LOCATION|City\s*:')
    # Accept "Date of Admission"/"Date of Discharge" as well as the reversed
    # label order.
    info['admission_date'] = single_line(r'Admission\s*Date|Date\s*of\s*Admission|Admit[:]')
    info['discharge_date'] = single_line(r'Discharge\s*Date|Date\s*of\s*Discharge|Discharge[:]')

    # Claim amounts - try explicit patterns, then looser paragraph patterns
    m = re.search(r'(?im)Claim\s*Amount\s*(?:Submitted)?\s*[:\-]?\s*[‚ÇπINR\s]*([0-9,]+(?:\.\d+)?)', t)
    if m:
        info['claim_amount'] = m.group(1).strip()
    else:
        m2 = re.search(r'(?im)(?:total (?:hospital )?expenses|total billed|total claimed|expenses came to|expenses were|Bill paid by patient|expenses paid)\s*(?:[:\-]?)\s*[‚ÇπINR\s]*([0-9,]+(?:\.\d+)?)', t)
        if m2:
            info['claim_amount'] = m2.group(1).strip()

    # "Approved Amount" or "Amount Approved"
    m3 = re.search(r'(?im)(?:Approved\s*Amount|Amount\s*Approved)\s*[:\-]?\s*[‚ÇπINR\s]*([0-9,]+(?:\.\d+)?)', t)
    if m3:
        info['approved_amount'] = m3.group(1).strip()

    # Status: the inline value is only the fallback; resolve_status decides.
    inline_status = single_line(r'Claim\s*Status|Status')
    resolved = resolve_status(t, inline_status)
    info['status'] = resolved

    # Reason: try section-aware extraction first, then fallback patterns
    reason_section = _extract_section(
        t,
        header_patterns=[r'REASON\s*FOR\s*REJECTION', r'REASON\s*FOR\s*REJECTIONS?', r'Reason', r'Reason\s*:'],
        stop_patterns=[r'FINAL\s+REMARKS', r'SIGN-?OFF', r'APPROVAL\s*&\s*FINANCIAL', r'HOSPITAL\s*BILL\s*SUMMARY', r'FINAL\s+REMARKS:']
    )
    if reason_section:
        # Flatten to one line and cap at 800 characters.
        reason = re.sub(r'\s+', ' ', reason_section).strip()
        info['reason'] = (reason[:800] + '...') if len(reason) > 800 else reason
    else:
        m = re.search(r'(?im)(policy (?:was )?inactive.*?grace period.*?\.?)', t)
        if m:
            info['reason'] = m.group(1).strip()
        else:
            m2 = re.search(r'(?im)Reason\s*[:\-]?\s*(.+)', t)
            if m2 and m2.group(1):
                info['reason'] = m2.group(1).strip()

    return info

def quick_summary(txt: str, info: dict) -> str:
    """
    Render the extracted fields as a short, line-per-fact summary.

    The decision is derived from ``info['status']`` by keyword matching; when
    that gives nothing, the document itself is searched via ``resolve_status``.
    Lines appear in this order (each only if its data exists): Decision,
    Claim ID, Policy, Patient, Hospital, Claimed, Approved, Reason (capped at
    500 characters), Outcome.

    Args:
        txt: Full document text, used only for the status fallback.
        info: Dict as produced by ``extract_info``.

    Returns:
        The summary lines joined by newlines.
    """
    status_lc = (info.get('status') or '').lower()

    def status_has(*needles):
        # True when any of the given fragments occurs in the status text.
        return any(needle in status_lc for needle in needles)

    if status_has('approved') and status_has('reduction', 'partial'):
        decision = "Approved with Reduction"
    elif status_has('approved', 'full approval') or (status_has('full') and status_has('approval')):
        decision = "Approved"
    elif status_has('rejected', 'denied', 'declined', 'decline'):
        decision = "Rejected"
    elif status_has('reject', 'deni'):
        decision = "Rejected"
    elif status_has('approv'):
        decision = "Approved"
    else:
        decision = "Unknown"

    # Still undecided: fall back to the last status mention in the document.
    if decision == "Unknown":
        final_status = resolve_status(txt, None)
        if final_status:
            final_lc = final_status.lower()
            if 'reject' in final_lc or 'deni' in final_lc:
                decision = "Rejected"
            elif 'approv' in final_lc:
                decision = "Approved"

    summary_lines = [f"Decision: {decision}"]

    for label, key in (("Claim ID", 'claim_id'), ("Policy", 'policy_number')):
        if info.get(key):
            summary_lines.append(f"{label}: {info[key]}")

    # Patient: name, age and gender on one line, separated by " | ".
    patient_bits = []
    patient_name = info.get('patient')
    if patient_name:
        patient_bits.append(patient_name)
    for label, key in (("Age", 'age'), ("Gender", 'gender')):
        if info.get(key):
            patient_bits.append(f"{label}: {info[key]}")
    if patient_bits:
        summary_lines.append("Patient: " + " | ".join(patient_bits))

    # Hospital, with the city appended when known.
    hospital = info.get('hospital')
    if hospital:
        city = info.get('hospital_city')
        location = f"{hospital} ‚Äî {city}" if city else hospital
        summary_lines.append(f"Hospital: {location}")

    # Amounts
    if info.get('claim_amount'):
        summary_lines.append(f"Claimed: ‚Çπ{info['claim_amount']}")
    if info.get('approved_amount'):
        summary_lines.append(f"Approved: ‚Çπ{info['approved_amount']}")

    # Reason, trimmed to 500 characters.
    raw_reason = info.get('reason')
    if raw_reason:
        reason = raw_reason.strip()
        if len(reason) > 500:
            reason = reason[:500].rstrip() + "..."
        summary_lines.append(f"Reason: {reason}")

    # Outcome guidance
    if decision.startswith("Approved"):
        outcome = "Outcome: Your claim has been accepted. Amount may differ due to deductions or sub-limits."
    elif decision == "Rejected":
        outcome = "Outcome: Claim not approved. Please review reason and contact insurer if clarification needed."
    else:
        outcome = "Outcome: Decision unclear. Please verify with insurer or upload the original PDF for manual review."
    summary_lines.append(outcome)

    return "\n".join(summary_lines)

# ------------------------------------------------------------
# Gen AI: translation + long-text summarization
# ------------------------------------------------------------
def auto_detect_language(text: str) -> str:
    """Return the language code ``detect`` reports for ``text``; "en" on any error."""
    try:
        language_code = detect(text)
    except Exception:
        language_code = "en"
    return language_code

# Simple in-memory cache for translation models so we don't reload same pair repeatedly
# Maps "<src>-<tgt>" to the (tokenizer, model) pair already loaded for it.
_translation_models = {}

def get_translation_model(src_lang: str, tgt_lang: str):
    """Return the MarianMT ``(tokenizer, model)`` pair for a language pair.

    The pair is loaded from ``Helsinki-NLP/opus-mt-<src>-<tgt>`` on first use
    and kept in ``_translation_models`` for later calls. Loading errors
    propagate to the caller and nothing is cached in that case.
    """
    pair = f"{src_lang}-{tgt_lang}"
    try:
        return _translation_models[pair]
    except KeyError:
        pass

    model_name = f"Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}"
    loaded = (
        MarianTokenizer.from_pretrained(model_name),
        MarianMTModel.from_pretrained(model_name),
    )
    _translation_models[pair] = loaded
    return loaded

def translate_text(text: str, src_lang: str, tgt_lang: str, max_chunk_chars: int = 1500) -> str:
    """
    Translate ``text`` with MarianMT, chunk by chunk, using the cached model
    from ``get_translation_model``.

    Args:
        text: Text to translate.
        src_lang: Source language code.
        tgt_lang: Target language code.
        max_chunk_chars: Maximum number of characters per translated chunk.

    Returns:
        The translated chunks joined by single spaces. An empty string is
        returned for empty/whitespace-only input, and the original ``text`` is
        returned unchanged when the model for the language pair cannot be loaded.

    Raises:
        ValueError: If ``max_chunk_chars`` is smaller than 1 (the chunking loop
            could never advance).
    """
    # Normalize whitespace first so that empty input returns immediately and
    # never triggers a model load.
    normalized = re.sub(r"\s+", " ", text.strip())
    if not normalized:
        return normalized

    if max_chunk_chars < 1:
        raise ValueError("max_chunk_chars must be a positive integer")

    try:
        tokenizer, model = get_translation_model(src_lang, tgt_lang)
    except Exception as e:
        print(f"Translation model for {src_lang}->{tgt_lang} not found ({e}). Using original text.")
        return text

    chunks = []
    start = 0
    n = len(normalized)
    while start < n:
        end = min(start + max_chunk_chars, n)
        # Prefer to end the chunk at the last period inside the window.
        split = normalized.rfind(".", start, end)
        if split == -1 or split <= start + 100:  # no good period, or too early
            split = end
            if end < n:
                # Hard cut: back up to the last space so no word is split in
                # half across two chunks (text has single spaces only here).
                word_break = normalized.rfind(" ", start + 1, end + 1)
                if word_break != -1:
                    split = word_break
        else:
            split += 1  # include the period
        chunks.append(normalized[start:split].strip())
        start = split

    translated_chunks = []
    for c in chunks:
        if not c.strip():
            continue
        inputs = tokenizer(
            c,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        )
        out_ids = model.generate(**inputs, max_length=512)
        translated = tokenizer.decode(out_ids[0], skip_special_tokens=True)
        translated_chunks.append(translated.strip())

    return " ".join(translated_chunks).strip()

def _summarize_chunk(chunk: str, max_length: int = 180, min_length: int = 50) -> str:
    if not chunk.strip():
        return ""
    result = summarizer(
        chunk,
        max_length=max_length,
        min_length=min_length,
        do_sample=False,
    )
    return result[0]["summary_text"].strip()

def summarize_long_text(
    text: str,
    max_chunk_chars: int = 2500,
    chunk_summary_max_len: int = 180,
    chunk_summary_min_len: int = 50,
) -> str:
    """
    Summarize a long report in consecutive (non-overlapping) chunks.

    Text that fits into one chunk is summarized directly. Otherwise it is
    split into chunks of at most ``max_chunk_chars`` characters (ending at a
    period where possible), each chunk is summarized (chunks under 40 words
    are kept verbatim), and the joined summaries are summarized once more
    when they fit into a single chunk; if not, they are returned joined.

    Args:
        text: Report text.
        max_chunk_chars: Maximum characters per chunk.
        chunk_summary_max_len: ``max_length`` passed to ``_summarize_chunk``.
        chunk_summary_min_len: ``min_length`` passed to ``_summarize_chunk``.

    Returns:
        The summary, or a warning message when ``text`` is empty.

    Raises:
        ValueError: If ``max_chunk_chars`` is smaller than 1 (the chunking loop
            could never advance).
    """
    if not text or not text.strip():
        return "‚ö†Ô∏è No text detected in the document."

    if max_chunk_chars < 1:
        raise ValueError("max_chunk_chars must be a positive integer")

    text = re.sub(r"\s+", " ", text.strip())

    if len(text) <= max_chunk_chars:
        return _summarize_chunk(
            text,
            max_length=chunk_summary_max_len,
            min_length=chunk_summary_min_len,
        )

    chunks = []
    start = 0
    n = len(text)
    while start < n:
        end = min(start + max_chunk_chars, n)
        # Prefer to end the chunk at the last period inside the window.
        split = text.rfind(".", start, end)
        if split == -1 or split <= start + 400:  # no good period, or too early
            split = end
            if end < n:
                # Hard cut: back up to the last space so no word is split in
                # half across two chunks (text has single spaces only here).
                word_break = text.rfind(" ", start + 1, end + 1)
                if word_break != -1:
                    split = word_break
        else:
            split += 1  # include the period
        chunk = text[start:split].strip()
        if chunk:
            chunks.append(chunk)
        start = split

    partial_summaries = []
    for i, c in enumerate(chunks, start=1):
        # No emoji here to avoid UnicodeEncodeError on Windows console
        print(f"Summarizing chunk {i}/{len(chunks)} (len={len(c)} chars)")
        if len(c.split()) < 40:
            # Too short to be worth summarizing; keep verbatim.
            partial_summaries.append(c)
        else:
            s = _summarize_chunk(
                c,
                max_length=chunk_summary_max_len,
                min_length=chunk_summary_min_len,
            )
            partial_summaries.append(s)

    combined = " ".join(partial_summaries)
    combined = re.sub(r"\s+", " ", combined).strip()

    # Second pass only when the joined summaries fit into one chunk.
    if len(combined) <= max_chunk_chars:
        final = _summarize_chunk(
            combined,
            max_length=chunk_summary_max_len,
            min_length=chunk_summary_min_len,
        )
        return final

    return combined

def generate_claim_summary(raw_text: str, back_translate: bool = True) -> str:
    """
    Produce the NLP summary for a claim document.

    Pipeline:
    - Detect the language of ``raw_text``
    - Translate it to English unless it already is English
    - Run the long-text summarizer on the English text
    - When ``back_translate`` is set and the source was not English,
      translate the summary back into the source language

    Returns the stripped summary, or a warning message for empty/blank input.
    """
    if not (raw_text and raw_text.strip()):
        return "‚ö†Ô∏è No text detected to summarize."

    src_lang = auto_detect_language(raw_text)
    # Plain ASCII console output only (Windows cp1252 cannot encode emoji)
    print(f"Detected language for summary: {src_lang}")

    is_english = src_lang == "en"

    english_text = raw_text if is_english else translate_text(raw_text, src_lang, "en")
    summary = summarize_long_text(english_text)

    if back_translate and not is_english:
        summary = translate_text(summary, "en", src_lang)

    return summary.strip()

# ------------------------------------------------------------
# File text extraction (pdf / image / txt)
# ------------------------------------------------------------
def extract_text(file) -> str:
    """
    Extract cleaned text from an uploaded file (PDF, image, or plain text).

    The upload is copied to a temporary file whose suffix matches the upload's
    name, processed according to that suffix, and the temporary file is always
    removed afterwards:

    - ``.pdf``: text of every page via pdfplumber, one page per line block.
    - ``.jpg`` / ``.jpeg`` / ``.png``: OCR through the module-level ``ocr_model``.
    - anything else (including ``.txt``): read as UTF-8, undecodable bytes ignored.

    Args:
        file: Upload object exposing ``name`` and ``read()`` returning bytes
            (presumably a Streamlit ``UploadedFile`` - confirm against caller).

    Returns:
        The extracted text after passing through ``clean_text``.

    Raises:
        RuntimeError: If an image is uploaded but ``ocr_model`` is ``None``.
    """
    suffix = Path(file.name).suffix.lower()
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
    text = ""
    try:
        # Copy the upload inside the try block so that a failing read()/write()
        # still closes the handle and removes the delete=False temp file.
        with tmp:
            tmp.write(file.read())

        if suffix == ".pdf":
            with pdfplumber.open(tmp.name) as pdf:
                text = "".join((p.extract_text() or "") + "\n" for p in pdf.pages)
        elif suffix in (".jpg", ".jpeg", ".png"):
            if ocr_model is None:
                raise RuntimeError("OCR model not available in this environment.")
            doc = DocumentFile.from_images(tmp.name)
            res = ocr_model(doc)
            # Best-effort per-page extraction; any failure (or an empty result)
            # falls back to rendering the whole OCR result.
            # NOTE(review): whether page objects expose get_text() is not
            # visible here - confirm, otherwise res.render() is always used.
            try:
                for pg in res.pages:
                    try:
                        text += pg.get_text() + "\n"
                    except Exception:
                        pass
                if not text.strip():
                    text = res.render()
            except Exception:
                text = res.render()
        else:
            # ".txt" and every unrecognized suffix are treated as plain text.
            with open(tmp.name, encoding="utf-8", errors="ignore") as fh:
                text = fh.read()
    finally:
        try:
            os.unlink(tmp.name)
        except OSError:
            # Cleanup is best-effort; a leftover temp file must not mask the result.
            pass
    return clean_text(text)

# ------------------------------------------------------------
# Streamlit UI
# ------------------------------------------------------------
# Upload card: file picker plus the button that triggers summarization.
# NOTE(review): Streamlit renders each st.markdown call as its own element, so
# the opening/closing <div> tags below likely do not wrap the widgets placed
# between them - confirm against the rendered page.
with st.container():
    st.markdown('<div class="card">', unsafe_allow_html=True)
    file = st.file_uploader("Upload claim report (.pdf, .jpg, .jpeg, .png, .txt)", type=["pdf", "jpg", "jpeg", "png", "txt"])
    # The button stays disabled until a file has been uploaded.
    go = st.button("‚ö° Summarize Claim", type="primary", disabled=(file is None))
    st.markdown('</div>', unsafe_allow_html=True)

# Runs only on the rerun triggered by the button click, and only with a file present.
if go and file:
    with st.spinner("Processing and summarizing‚Ä¶"):
        try:
            raw_text = extract_text(file)
        except Exception as e:
            # Report the failure but keep going with empty text, so the
            # summaries below are still produced from an empty document.
            st.error(f"Failed to extract text: {e}")
            raw_text = ""

        # Rule-based summary (extract_info / quick_summary are defined elsewhere
        # in this file) followed by the model-based summary.
        info = extract_info(raw_text)
        structured_summary = quick_summary(raw_text, info)
        nlp_summary = generate_claim_summary(raw_text, back_translate=True)

    # ---------- Display results ----------
    st.markdown('<div class="result">', unsafe_allow_html=True)

    st.markdown('<div class="section-title">üìå Structured Explanation (Rule-Based)</div>', unsafe_allow_html=True)
    st.markdown('<div class="section-sub">Extracted using regex patterns and decision rules.</div>', unsafe_allow_html=True)
    # st.text shows the summary as fixed-width plain text (no markdown interpretation).
    st.text(structured_summary)

    st.markdown('<div class="section-title">ü§ñ AI-Generated Summary (NLP Model)</div>', unsafe_allow_html=True)
    st.markdown('<div class="section-sub">Generated using transformers with long-text handling.</div>', unsafe_allow_html=True)
    st.text(nlp_summary)

    st.markdown('</div>', unsafe_allow_html=True)

    # Both summaries are bundled into a single UTF-8 text file for download.
    combined_for_download = (
        "Structured Explanation (Rule-Based):\n"
        + structured_summary
        + "\n\n"
        + "AI-Generated Summary (NLP Model):\n"
        + nlp_summary
    )
    st.download_button(
        "‚¨áÔ∏è Download Explanation",
        data=combined_for_download.encode('utf-8'),
        file_name="claim_summary.txt",
        mime="text/plain",
    )

# Footer is rendered on every run, whether or not a summary was produced.
st.markdown('<div class="footer">Simple. Fast. Understandable ‚Äî instant claim explanations.</div>', unsafe_allow_html=True)

# ============================================================
# End of file
# ============================================================


Overwriting claim_explainer_app_fixed.py


In [None]:
 !streamlit run claim_explainer_app_fixed.py