In [1]:
!pip install python-doctr[torch] "opencv-python-headless<5"
!pip install pdfplumber
!pip install transformers
!pip install langdetect
!pip install sentencepiece
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
!pip install streamlit
!pip install pillow





Looking in indexes: https://download.pytorch.org/whl/cpu


In [2]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

Looking in indexes: https://download.pytorch.org/whl/cpu


In [3]:
# ============================================================
# üß© INSURANCE CLAIM SUMMARIZER ‚Äî ENGLISH VERSION (DocTR OCR)
# ============================================================

# ---------------------------
# üì¶ Imports
# ---------------------------
import pdfplumber
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
from langdetect import detect
from transformers import MarianTokenizer, MarianMTModel, pipeline


# ---------------------------
# ‚öôÔ∏è Load OCR + Summarization Models
# ---------------------------
print("‚è≥ Loading models...")

# DocTR OCR (CPU mode)
ocr_model = ocr_predictor(pretrained=True)

# Lightweight summarizer
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

print("‚úÖ Models loaded successfully!\n")


# ============================================================
# üß† Utility Functions
# ============================================================

def extract_text_from_pdf(pdf_path):
    """Extract text from PDF using DocTR or pdfplumber"""
    # Try native text first
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text
    if text.strip():
        return text.strip()

    # Fallback: OCR with DocTR
    print("üîç Using OCR for scanned PDF...")
    doc = DocumentFile.from_pdf(pdf_path)
    result = ocr_model(doc)
    return result.render()


def extract_text_from_image(image_path):
    """Extract text from image using DocTR"""
    doc = DocumentFile.from_images(image_path)
    result = ocr_model(doc)
    return result.render()


def auto_detect_language(text):
    """Auto-detect language code from text"""
    try:
        return detect(text)
    except:
        return "en"


def translate_text(text, src_lang, tgt_lang):
    """Generic translation using MarianMT (Helsinki-NLP models)"""
    try:
        model_name = f"Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}"
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        model = MarianMTModel.from_pretrained(model_name)
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        translated = model.generate(**inputs, max_length=512)
        return tokenizer.decode(translated[0], skip_special_tokens=True)
    except Exception as e:
        print(f"‚ö†Ô∏è Translation model for {src_lang}->{tgt_lang} not found. Using original text.")
        return text


def summarize_claim(text):
    """Summarize insurance claim in customer-friendly English"""
    if not text.strip():
        return "‚ö†Ô∏è No text detected in the document."
    result = summarizer(text, max_length=130, min_length=50, do_sample=False)
    return result[0]['summary_text'].strip()


def process_claim_file(file_path, back_translate=True):
    """Main pipeline: Extract ‚Üí Detect Lang ‚Üí Translate ‚Üí Summarize ‚Üí Back-translate"""
    # 1Ô∏è‚É£ Extract text based on file type
    if file_path.endswith(".pdf"):
        raw_text = extract_text_from_pdf(file_path)
    elif file_path.lower().endswith((".jpg", ".jpeg", ".png")):
        raw_text = extract_text_from_image(file_path)
    elif file_path.lower().endswith(".txt"):
        with open(file_path, "r", encoding="utf-8") as f:
            raw_text = f.read()
    else:
        return "‚ö†Ô∏è Unsupported file type."

    if not raw_text.strip():
        return "‚ö†Ô∏è No readable text found in the document."

    print("\n‚úÖ Extracted Raw Text:\n", raw_text[:400], "\n")

    # 2Ô∏è‚É£ Detect language automatically
    src_lang = auto_detect_language(raw_text)
    print(f"üåê Detected Language: {src_lang}")

    # 3Ô∏è‚É£ Translate to English if not already English
    text_in_english = raw_text if src_lang == "en" else translate_text(raw_text, src_lang, "en")

    # 4Ô∏è‚É£ Summarize claim
    summary_en = summarize_claim(text_in_english)

    # 5Ô∏è‚É£ Optionally translate summary back to source language
    if back_translate and src_lang != "en":
        summary_final = translate_text(summary_en, "en", src_lang)
    else:
        summary_final = summary_en

    return f"üßæ **Customer-Friendly Explanation:**\n\n{summary_final}"


def process_claim_text(text, back_translate=True):
    """Directly process a claim text input"""
    src_lang = auto_detect_language(text)
    print(f"üåê Detected Language: {src_lang}")

    text_in_english = text if src_lang == "en" else translate_text(text, src_lang, "en")
    summary_en = summarize_claim(text_in_english)

    if back_translate and src_lang != "en":
        summary_final = translate_text(summary_en, "en", src_lang)
    else:
        summary_final = summary_en

    return f"üßæ **Customer-Friendly Explanation:**\n\n{summary_final}"



# ============================================================
# üß™ Example Tests (For English Files)
# ============================================================

# üß© Example 1: Direct text input
claim_report = """
Claim ID 48290 was rejected because the health insurance policy had expired
before the hospital admission date. The claim was submitted after the coverage period ended.
"""

print("\nüîπ Direct Text Test:")
print(process_claim_text(claim_report))

# üß© Example 2: PDF file
print("\nüîπ PDF Test:")
print(process_claim_file("test_files/sample_claim.pdf"))

# üß© Example 3: Image file
print("\nüîπ Image Test:")
print(process_claim_file("test_files/claim_image.png"))

# üß© Example 4: Text file
print("\nüîπ Text File Test:")
print(process_claim_file("test_files/claim_note.txt"))


‚è≥ Loading models...


Device set to use cpu


‚úÖ Models loaded successfully!


üîπ Direct Text Test:


Your max_length is set to 130, but your input_length is only 35. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=17)


üåê Detected Language: en


Your max_length is set to 130, but your input_length is only 115. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=57)


üßæ **Customer-Friendly Explanation:**

Claim ID 48290 was rejected because the health insurance policy had expired before the hospital admission date . Claim was submitted after the coverage period ended . The claim was submitted shortly after the insurance policy expired . The hospital was denied the claim because the policy expired before admission .

üîπ PDF Test:

‚úÖ Extracted Raw Text:
 Claim ID: 39561
Patient Name: Priya Sharma
Date of Admission: 21 Sept 2024
Date of Discharge: 25 Sept 2024
Hospital: Medico Hospital, Pune
Diagnosis: Viral Fever
Treatment: Hospitalization and IV Fluids
Claim Amount: ‚Çπ32,000
Approved Amount: ‚Çπ28,500
Claim Status: APPROVED WITH REDUCTION
Remarks: Non-medical expenses such as food and toiletries are not covered under the
policy. 

üåê Detected Language: en
üßæ **Customer-Friendly Explanation:**

Priya Sharma was diagnosed with Viral Fever at Medico Hospital, Pune with IV Fluids . She was admitted to the hospital 21 Sept 2024 and discharged 

Your max_length is set to 130, but your input_length is only 61. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=30)



‚úÖ Extracted Raw Text:
 Claim ID: 50923
Patient Name: Amit Patel
Date: 5 July 2024
Hospital: CityCare Hospital, Nagpur
Diagnosis: Dental Extraction
Claim Status: DENIED
Reason: Dental procedures are not covered under the current insurance plan. 

üåê Detected Language: en
üßæ **Customer-Friendly Explanation:**

Amit Patel's dental procedures are not covered under the current insurance plan . Claim ID: 50923 ¬† ¬† ¬† ¬† ¬† ¬† ¬† ¬† ¬† ¬† ¬† ¬† ¬† ¬† ¬† ¬† ¬† ¬† ¬† ¬† ¬† ¬† ¬† ¬†¬†¬†¬†¬†¬†¬†¬†¬†¬†¬†-¬†¬†¬†¬†¬†¬†¬†¬†¬†-¬†¬†¬†¬†¬†¬†¬†¬†¬†¬†¬†¬†‚Äì¬†¬†¬†‚ÄòDental Extraction‚Äô is not covered by the current plan .

üîπ Text File Test:

‚úÖ Extracted Raw Text:
 Claim ID: 48290  
Patient Name: Ramesh Kumar  
Date of Admission: 12 Aug 2024  
Date of Discharge: 16 Aug 2024  
Hospital Name: LifeCare Multispeciality Hospital  

Diagnosis: Acute Appendicitis  
Treatment: Appendectomy  

Claim Amount Submitted: ‚Çπ58,000  
Amount Approved: ‚Çπ0  
Claim Status: REJECTED  

Reason for Rejection: 

In [None]:
%%writefile claim_explainer_app.py
# ============================================================
# üíé Insurance Claim Explanation App - Using DocTR (No EasyOCR/Tesseract)
# ============================================================

import streamlit as st
import pdfplumber
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
from transformers import pipeline
import tempfile
import os
from pathlib import Path

# ============================================================
# üé® Streamlit Page Config
# ============================================================

st.set_page_config(
    page_title="Insurance Claim Explainer",
    page_icon="üßæ",
    layout="centered",
)

# Custom CSS
st.markdown("""
    <style>
    body {
        background: linear-gradient(180deg, #eef2f3, #ffffff);
    }
    .main-title {
        text-align: center;
        font-size: 36px !important;
        color: #1f4e79;
        font-weight: 700;
    }
    .subtitle {
        text-align: center;
        color: #4b5563;
        font-size: 18px;
        margin-bottom: 20px;
    }
    .result-box {
    background-color: #f9fafb;
    color: #1f2937;  /* üü¢ Dark gray text color for readability */
    padding: 20px;
    border-radius: 10px;
    border: 1px solid #e5e7eb;
    box-shadow: 0 2px 8px rgba(0,0,0,0.05);
    white-space: pre-wrap;  /* Keep text formatting */
    font-size: 16px;
    line-height: 1.5;

    }
    .stButton>button {
        background-color: #1f4e79;
        color: white;
        border-radius: 10px;
        font-size: 16px;
        padding: 8px 20px;
    }
    </style>
""", unsafe_allow_html=True)

# ============================================================
# üß† Load Models
# ============================================================

@st.cache_resource
def load_models():
    ocr_model = ocr_predictor(pretrained=True)
    summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
    return ocr_model, summarizer

ocr_model, summarizer = load_models()

# ============================================================
# üß© Helper Functions
# ============================================================

def extract_text_from_pdf(pdf_path):
    """Extract text using pdfplumber or OCR fallback"""
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text
    if text.strip():
        return text.strip()
    
    # Fallback to OCR if PDF text not selectable
    doc = DocumentFile.from_pdf(pdf_path)
    result = ocr_model(doc)
    return result.render()

def extract_text_from_image(image_path):
    """Extract text from image using DocTR OCR"""
    doc = DocumentFile.from_images(image_path)
    result = ocr_model(doc)
    return result.render()

def summarize_claim(text):
    """Summarize insurance claim text"""
    if not text.strip():
        return "‚ö†Ô∏è No text detected in the document."
    text = text[:3000]  # avoid model limit overflow
    result = summarizer(text, max_length=130, min_length=40, do_sample=False)
    return result[0]['summary_text'].strip()

def process_claim_file(file_path):
    """Main pipeline: Extract ‚Üí Summarize"""
    file_ext = Path(file_path).suffix.lower()
    if file_ext == ".pdf":
        raw_text = extract_text_from_pdf(file_path)
    elif file_ext in [".jpg", ".jpeg", ".png"]:
        raw_text = extract_text_from_image(file_path)
    elif file_ext == ".txt":
        with open(file_path, "r", encoding="utf-8") as f:
            raw_text = f.read()
    else:
        return "‚ö†Ô∏è Unsupported file type."

    if not raw_text.strip():
        return "‚ö†Ô∏è Could not extract any readable text."
    return summarize_claim(raw_text)

# ============================================================
# üñ•Ô∏è Streamlit UI
# ============================================================

st.markdown('<h1 class="main-title">üßæ Insurance Claim Explanation</h1>', unsafe_allow_html=True)
st.markdown('<p class="subtitle">Summarize claim PDFs or images into simple English.</p>', unsafe_allow_html=True)

option = st.sidebar.radio("Select Input Mode:", ["üìù Enter Text", "üìÇ Upload File"])

if option == "üìù Enter Text":
    claim_text = st.text_area("‚úçÔ∏è Enter or paste claim details below:", height=180)
    if st.button("üîç Explain Claim"):
        if claim_text.strip():
            with st.spinner("Summarizing..."):
                summary = summarize_claim(claim_text)
            st.success("‚úÖ Claim summarized successfully!")
            st.markdown(f'<div class="result-box"><h4>üßæ Customer-Friendly Explanation</h4><p>{summary}</p></div>', unsafe_allow_html=True)
        else:
            st.warning("‚ö†Ô∏è Please enter some text.")
else:
    uploaded_file = st.file_uploader("üìé Upload a file (.pdf, .jpg, .jpeg, .png, .txt):", type=["pdf", "jpg", "jpeg", "png", "txt"])
    if uploaded_file:
        suffix = Path(uploaded_file.name).suffix
        temp_path = tempfile.NamedTemporaryFile(delete=False, suffix=suffix).name
        with open(temp_path, "wb") as f:
            f.write(uploaded_file.read())

        st.info(f"‚úÖ File uploaded: **{uploaded_file.name}**")

        if st.button("üîç Explain Claim"):
            with st.spinner("Extracting and summarizing..."):
                summary = process_claim_file(temp_path)
            st.success("‚úÖ Claim summarized successfully!")
            st.markdown(f'<div class="result-box"><h4>üßæ Customer-Friendly Explanation</h4><p>{summary}</p></div>', unsafe_allow_html=True)

        os.remove(temp_path)


In [None]:
!streamlit run claim_explainer_app.py