In [None]:
# Intelligent Document Analysis System
# File: document_analyzer.py

"""
Advanced Document Analysis System with AI
=========================================

Features:
- Multi-format document processing (PDF, DOCX, images, text)
- OCR for image-based documents
- Named Entity Recognition (NER)
- Document classification
- Sentiment analysis
- Automatic summarization
- Keyword extraction
- Web interface with FastAPI

Requirements:
pip install fastapi uvicorn python-multipart
pip install pillow pytesseract opencv-python
pip install transformers torch sentence-transformers
pip install PyMuPDF python-docx mammoth
pip install spacy && python -m spacy download en_core_web_sm
"""

import os
import io
import re
import json
import logging
from typing import List, Dict, Any, Optional
from datetime import datetime
import time
import hashlib

# Core libraries
import numpy as np
from PIL import Image
import cv2

# OCR (optional): pytesseract wraps the Tesseract engine.
try:
    import pytesseract
except ImportError:
    OCR_AVAILABLE = False
    print("⚠️ OCR not available. Install with: pip install pytesseract")
else:
    OCR_AVAILABLE = True

# Document processing (optional): PyMuPDF for PDF input.
try:
    import fitz  # PyMuPDF
except ImportError:
    PDF_AVAILABLE = False
    print("⚠️ PDF processing not available. Install with: pip install PyMuPDF")
else:
    PDF_AVAILABLE = True

# Document processing (optional): both DOCX readers must import for DOCX support.
try:
    from docx import Document as DocxDocument
    import mammoth
except ImportError:
    DOCX_AVAILABLE = False
    print("⚠️ DOCX processing not available. Install with: pip install python-docx mammoth")
else:
    DOCX_AVAILABLE = True

# NLP libraries (optional): `nlp` stays None unless the spaCy model loads.
nlp = None
try:
    import spacy
    NLP_AVAILABLE = True
    try:
        nlp = spacy.load("en_core_web_sm")
    except OSError:
        # spaCy itself is installed, only the language model is missing.
        print("⚠️ spaCy model not found. Install with: python -m spacy download en_core_web_sm")
except ImportError:
    NLP_AVAILABLE = False
    print("⚠️ spaCy not available. Install with: pip install spacy")

# Transformer pipelines (optional).
try:
    from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
except ImportError:
    TRANSFORMERS_AVAILABLE = False
    print("⚠️ Transformers not available. Install with: pip install transformers torch")
else:
    TRANSFORMERS_AVAILABLE = True

# Web framework (optional): the HTTP API is only defined when these import.
try:
    from fastapi import FastAPI, File, UploadFile, HTTPException
    from fastapi.responses import HTMLResponse, JSONResponse
    from fastapi.middleware.cors import CORSMiddleware
    import uvicorn
except ImportError:
    WEB_AVAILABLE = False
    print("⚠️ Web framework not available. Install with: pip install fastapi uvicorn python-multipart")
else:
    WEB_AVAILABLE = True

# Configure logging: INFO-level root config plus a module-level logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class DocumentAnalyzer:
    """
    Advanced Document Analysis System

    Capabilities:
    - Text extraction from multiple formats
    - OCR for images
    - Named Entity Recognition
    - Document classification
    - Sentiment analysis
    - Summarization
    - Keyword extraction

    The ``extract_text_from_*`` methods never raise: they return the extracted
    text, or a human-readable message when a dependency is missing or the
    file cannot be processed. ``analyze_document`` recognises those messages
    and reports them as errors instead of analysing them as document content.
    """

    # Exact messages the extractors return when an optional dependency is missing.
    _UNAVAILABLE_MESSAGES = frozenset({
        "PDF processing not available",
        "DOCX processing not available",
        "OCR not available",
    })

    # Exact prefixes the extractors use when processing a file raised. Matching
    # the full prefix (rather than just "Error") keeps genuine documents that
    # happen to start with the word "Error" from being rejected.
    _ERROR_PREFIXES = (
        "Error processing PDF: ",
        "Error processing DOCX: ",
        "Error processing text file: ",
        "Error processing image: ",
    )

    # Candidate keywords: runs of at least four ASCII letters.
    _WORD_PATTERN = re.compile(r'\b[a-zA-Z]{4,}\b')

    # Common words ignored by the keyword extractor (built once, not per call).
    _STOP_WORDS = frozenset({
        'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with',
        'by', 'from', 'up', 'about', 'into', 'through', 'during', 'before',
        'after', 'above', 'below', 'between', 'among', 'this', 'that', 'these',
        'those', 'his', 'her', 'their', 'our', 'your', 'its', 'him',
        'she', 'they', 'we', 'you', 'are', 'was', 'were', 'been', 'be',
        'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
        'should', 'may', 'might', 'can', 'said', 'say', 'get', 'go', 'come',
    })

    def __init__(self):
        """Initialize the document analyzer with AI models"""
        logger.info("🚀 Initializing Document Analyzer...")

        # Initialize components
        self.nlp_model = nlp if NLP_AVAILABLE else None
        self.initialize_ai_models()

        # File extension -> extractor. Every extractor takes the raw bytes
        # and returns a string.
        self.supported_formats = {
            'pdf': self.extract_text_from_pdf,
            'docx': self.extract_text_from_docx,
            'doc': self.extract_text_from_docx,
            'txt': self.extract_text_from_txt,
            'png': self.extract_text_from_image,
            'jpg': self.extract_text_from_image,
            'jpeg': self.extract_text_from_image,
            'tiff': self.extract_text_from_image,
            'bmp': self.extract_text_from_image
        }

        logger.info("✅ Document Analyzer initialized successfully!")

    def initialize_ai_models(self):
        """Initialize AI models for analysis.

        Sets ``sentiment_analyzer``, ``summarizer`` and ``classifier``. Each
        pipeline is loaded independently, so one model failing to load leaves
        only that attribute as ``None`` instead of disabling the others.
        """
        self.sentiment_analyzer = None
        self.summarizer = None
        self.classifier = None

        if not TRANSFORMERS_AVAILABLE:
            return

        # (attribute name, pipeline task, model id)
        model_specs = (
            ("sentiment_analyzer", "sentiment-analysis",
             "cardiffnlp/twitter-roberta-base-sentiment-latest"),
            ("summarizer", "summarization", "facebook/bart-large-cnn"),
            ("classifier", "zero-shot-classification", "facebook/bart-large-mnli"),
        )

        all_loaded = True
        for attribute, task, model_name in model_specs:
            try:
                setattr(self, attribute, pipeline(task, model=model_name))
            except Exception as e:
                all_loaded = False
                logger.warning(f"⚠️ Some AI models couldn't be loaded: {e}")

        if all_loaded:
            logger.info("✅ AI models loaded successfully")

    def get_file_hash(self, content: bytes) -> str:
        """Return the hex MD5 digest of ``content``.

        The digest is only reported as a content fingerprint in the analysis
        result; nothing in this class uses it for security decisions.
        """
        return hashlib.md5(content).hexdigest()

    def detect_file_type(self, filename: str) -> str:
        """Return the lower-cased text after the last dot of ``filename``.

        Returns ``'unknown'`` when ``filename`` is ``None``, empty, or has no
        dot (uploads do not always carry a filename).
        """
        if not filename or '.' not in filename:
            return 'unknown'
        return filename.rsplit('.', 1)[-1].lower()

    def extract_text_from_pdf(self, content: bytes) -> str:
        """Extract text from PDF using PyMuPDF.

        Returns the concatenated page text, or a message string when PyMuPDF
        is missing or the document cannot be processed.
        """
        if not PDF_AVAILABLE:
            return "PDF processing not available"

        try:
            doc = fitz.open(stream=content, filetype="pdf")
            try:
                page_texts = [doc[page_num].get_text() for page_num in range(doc.page_count)]
            finally:
                # Release the document even when a page fails to render.
                doc.close()
            return "".join(page_texts).strip()
        except Exception as e:
            logger.error(f"Error extracting PDF text: {e}")
            return f"Error processing PDF: {e}"

    def extract_text_from_docx(self, content: bytes) -> str:
        """Extract text from DOCX file.

        Tries mammoth first and falls back to python-docx. Returns a message
        string when the libraries are missing or both readers fail.
        """
        if not DOCX_AVAILABLE:
            return "DOCX processing not available"

        try:
            # Try mammoth first (better formatting)
            result = mammoth.extract_raw_text(io.BytesIO(content))
            return result.value.strip()
        except Exception as e:
            try:
                # Fallback to python-docx
                doc = DocxDocument(io.BytesIO(content))
                return '\n'.join(paragraph.text for paragraph in doc.paragraphs).strip()
            except Exception as e2:
                logger.error(f"Error extracting DOCX text: {e}, {e2}")
                return f"Error processing DOCX: {e}"

    def extract_text_from_txt(self, content: bytes) -> str:
        """Decode a plain text file as UTF-8.

        Undecodable bytes are dropped, and a leading byte-order mark is
        removed ('utf-8-sig') so it does not leak into the analysed text.
        """
        try:
            return content.decode('utf-8-sig', errors='ignore').strip()
        except Exception as e:
            logger.error(f"Error extracting text: {e}")
            return f"Error processing text file: {e}"

    def extract_text_from_image(self, content: bytes) -> str:
        """Extract text from image using OCR.

        The image is converted to 8-bit grayscale, binarised with Otsu's
        threshold and passed to Tesseract. Returns a message string when OCR
        is unavailable or processing fails.
        """
        if not OCR_AVAILABLE:
            return "OCR not available"

        try:
            # Let Pillow do the grayscale conversion: it handles palette,
            # 1-bit, alpha and CMYK images, which a raw np.array() of the
            # image would hand to OpenCV in an unusable layout.
            with Image.open(io.BytesIO(content)) as image:
                gray = np.array(image.convert("L"))

            # Image preprocessing for better OCR
            # NOTE(review): a 1x1 Gaussian kernel does not smooth anything;
            # kept as-is so OCR results do not shift. Consider (3, 3) if
            # noisy scans are a problem.
            blurred = cv2.GaussianBlur(gray, (1, 1), 0)

            # Apply threshold to get binary image
            _, thresh = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

            # Convert back to PIL Image
            processed_image = Image.fromarray(thresh)

            # OCR configuration
            custom_config = r'--oem 3 --psm 6'
            text = pytesseract.image_to_string(processed_image, config=custom_config)

            return text.strip()

        except Exception as e:
            logger.error(f"Error extracting text from image: {e}")
            return f"Error processing image: {e}"

    def extract_text(self, content: bytes, file_type: str) -> str:
        """Dispatch to the extractor registered for ``file_type``.

        Returns ``"Unsupported file type: <file_type>"`` when no extractor is
        registered.
        """
        extractor = self.supported_formats.get(file_type)
        if extractor is None:
            return f"Unsupported file type: {file_type}"
        return extractor(content)

    def extract_entities(self, text: str) -> Dict[str, Any]:
        """Extract named entities using spaCy.

        Returns a mapping of entity label to the unique entity texts in
        first-seen order. When no model is loaded or ``text`` is blank the
        result is ``{"message": ...}``; on failure it is ``{"error": ...}``.
        """
        if not self.nlp_model or not text.strip():
            return {"message": "NLP model not available or empty text"}

        try:
            # Limit text length for processing
            doc = self.nlp_model(text[:100000])

            entities: Dict[str, List[str]] = {}
            for ent in doc.ents:
                seen = entities.setdefault(ent.label_, [])
                if ent.text not in seen:
                    seen.append(ent.text)

            return entities

        except Exception as e:
            logger.error(f"Error extracting entities: {e}")
            return {"error": str(e)}

    def analyze_sentiment(self, text: str) -> Dict[str, Any]:
        """Analyze sentiment of the first 512 characters of ``text``.

        Returns ``label``, ``confidence`` (rounded to 4 places) and a
        ``text_sample`` of at most 100 characters; ``{"message": ...}`` when
        the analyzer is missing or the text is blank; ``{"error": ...}`` on
        failure.
        """
        if not self.sentiment_analyzer or not text.strip():
            return {"message": "Sentiment analyzer not available or empty text"}

        try:
            # Limit text length for processing
            text_sample = text[:512]
            result = self.sentiment_analyzer(text_sample)[0]

            if len(text_sample) > 100:
                preview = text_sample[:100] + "..."
            else:
                preview = text_sample

            return {
                "label": result["label"],
                "confidence": round(result["score"], 4),
                "text_sample": preview
            }

        except Exception as e:
            logger.error(f"Error analyzing sentiment: {e}")
            return {"error": str(e)}

    def classify_document(self, text: str) -> Dict[str, Any]:
        """Classify the first 1024 characters of ``text`` into a fixed category list.

        Returns the top ``category``, its ``confidence`` and ``all_scores``;
        ``{"message": ...}`` when the classifier is missing or the text is
        blank; ``{"error": ...}`` on failure.
        """
        if not self.classifier or not text.strip():
            return {"message": "Classifier not available or empty text"}

        try:
            # Define document categories
            categories = [
                "legal document", "medical report", "financial document",
                "technical manual", "business letter", "academic paper",
                "news article", "personal letter", "contract", "invoice"
            ]

            # Limit text length for processing
            text_sample = text[:1024]
            result = self.classifier(text_sample, categories)

            return {
                "category": result["labels"][0],
                "confidence": round(result["scores"][0], 4),
                "all_scores": {
                    label: round(score, 4)
                    for label, score in zip(result["labels"], result["scores"])
                }
            }

        except Exception as e:
            logger.error(f"Error classifying document: {e}")
            return {"error": str(e)}

    def summarize_text(self, text: str, max_length: int = 150) -> str:
        """Generate a summary of the first 1024 characters of ``text``.

        Returns the summary, or a message string when the summarizer is
        missing, the text is blank or has fewer than 50 words, or
        summarization fails.
        """
        if not self.summarizer or not text.strip():
            return "Summarizer not available or empty text"

        try:
            # Ensure text is long enough to summarize
            if len(text.split()) < 50:
                return "Text too short to summarize effectively"

            # Limit text length for processing
            text_sample = text[:1024]

            result = self.summarizer(
                text_sample,
                max_length=max_length,
                min_length=30,
                do_sample=False
            )[0]

            return result["summary_text"]

        except Exception as e:
            logger.error(f"Error summarizing text: {e}")
            return f"Error generating summary: {e}"

    def extract_keywords(self, text: str, num_keywords: int = 10) -> List[str]:
        """Extract keywords using simple frequency analysis.

        Counts lower-cased words of at least four ASCII letters that are not
        stop words and returns up to ``num_keywords`` of them, most frequent
        first. Ties keep first-occurrence order. A non-positive
        ``num_keywords`` yields an empty list.
        """
        if not text.strip():
            return []

        try:
            word_freq: Dict[str, int] = {}
            for word in self._WORD_PATTERN.findall(text.lower()):
                if word not in self._STOP_WORDS:
                    word_freq[word] = word_freq.get(word, 0) + 1

            # sorted() is stable, so equal counts keep insertion order.
            ranked = sorted(word_freq, key=word_freq.get, reverse=True)
            return ranked[:max(num_keywords, 0)]

        except Exception as e:
            logger.error(f"Error extracting keywords: {e}")
            return []

    def _extraction_failure(self, extracted_text: str, file_type: str) -> Optional[str]:
        """Return why extraction failed, or ``None`` when usable text was produced."""
        if file_type not in self.supported_formats:
            return f"Unsupported file type: {file_type}"
        if extracted_text in self._UNAVAILABLE_MESSAGES:
            return extracted_text
        if extracted_text.startswith(self._ERROR_PREFIXES):
            return extracted_text
        if not extracted_text.strip():
            return "No text could be extracted"
        return None

    def analyze_document(self, content: bytes, filename: str) -> Dict[str, Any]:
        """Complete document analysis pipeline.

        Args:
            content: Raw bytes of the document.
            filename: Original file name; its extension selects the extractor.

        Returns:
            On success, a dict with file metadata, a text preview (first 500
            characters), entities, sentiment, classification, summary,
            keywords, processing time and timestamp. When no text can be
            extracted, a dict with the file metadata, an ``error`` message and
            a ``details`` string giving the reason.
        """
        start_time = time.time()

        # Basic file info
        file_type = self.detect_file_type(filename)
        file_hash = self.get_file_hash(content)
        file_size = len(content)

        # Extract text
        logger.info(f"Extracting text from {file_type} file...")
        extracted_text = self.extract_text(content, file_type)

        # Extractors report problems as message strings; do not run the NLP
        # models on those messages as if they were document content.
        failure = self._extraction_failure(extracted_text, file_type)
        if failure is not None:
            return {
                "filename": filename,
                "file_type": file_type,
                "file_size": file_size,
                "file_hash": file_hash,
                "error": "Failed to extract text from document",
                "details": failure,
                "processing_time": round(time.time() - start_time, 2)
            }

        # Perform analysis
        logger.info("Performing NLP analysis...")
        entities = self.extract_entities(extracted_text)
        sentiment = self.analyze_sentiment(extracted_text)
        classification = self.classify_document(extracted_text)
        summary = self.summarize_text(extracted_text)
        keywords = self.extract_keywords(extracted_text)

        processing_time = time.time() - start_time

        # Only a preview of the text is returned to keep the payload small.
        if len(extracted_text) > 500:
            text_preview = extracted_text[:500] + "..."
        else:
            text_preview = extracted_text

        # Compile results
        result = {
            "filename": filename,
            "file_type": file_type,
            "file_size": file_size,
            "file_hash": file_hash,
            "text_length": len(extracted_text),
            "extracted_text": text_preview,
            "entities": entities,
            "sentiment": sentiment,
            "classification": classification,
            "summary": summary,
            "keywords": keywords,
            "processing_time": round(processing_time, 2),
            "timestamp": datetime.now().isoformat()
        }

        logger.info(f"Analysis completed in {processing_time:.2f} seconds")
        return result

# Web Application (if FastAPI is available)
if WEB_AVAILABLE:
    app = FastAPI(title="Document Analyzer API", version="1.0.0")

    # The API is open to every origin and uses no cookies or other
    # credentials. A wildcard origin must not be combined with
    # allow_credentials=True: the CORS rules forbid that pairing, and
    # honouring it would let any site issue credentialed requests.
    app.add_middleware(
        CORSMiddleware,
        allow_origins=["*"],
        allow_credentials=False,
        allow_methods=["*"],
        allow_headers=["*"],
    )

    # Initialize analyzer (built once at import and shared by all requests)
    analyzer = DocumentAnalyzer()

    @app.get("/", response_class=HTMLResponse)
    async def home():
        """Serve the single-page upload UI.

        The page posts the selected file to ``/analyze`` and renders the JSON
        result. Everything taken from the response (file name, extracted
        text, entities, keywords, summary) is HTML-escaped before it is
        inserted into the page, because it originates from the uploaded
        document.
        """
        # NOTE: this is a regular (non-raw) Python string, so the embedded
        # JavaScript must not contain backslash escapes such as a newline
        # regex; Python would turn them into literal characters and break the
        # script. Line breaks in the preview are handled with CSS instead.
        html_content = """
        <!DOCTYPE html>
        <html>
        <head>
            <title>📄 Document Analyzer</title>
            <style>
                body {
                    font-family: Arial, sans-serif;
                    max-width: 1200px;
                    margin: 0 auto;
                    padding: 20px;
                    background: #f5f5f5;
                }
                .container {
                    background: white;
                    padding: 30px;
                    border-radius: 10px;
                    box-shadow: 0 2px 10px rgba(0,0,0,0.1);
                }
                h1 { color: #333; text-align: center; }
                .upload-area {
                    border: 2px dashed #ddd;
                    border-radius: 10px;
                    padding: 40px;
                    text-align: center;
                    margin: 20px 0;
                    background: #fafafa;
                }
                .upload-area:hover { border-color: #007bff; }
                .file-input {
                    display: none;
                }
                .upload-btn {
                    background: #007bff;
                    color: white;
                    padding: 12px 24px;
                    border: none;
                    border-radius: 5px;
                    cursor: pointer;
                    font-size: 16px;
                }
                .upload-btn:hover { background: #0056b3; }
                .analyze-btn {
                    background: #28a745;
                    color: white;
                    padding: 12px 24px;
                    border: none;
                    border-radius: 5px;
                    cursor: pointer;
                    font-size: 16px;
                    margin-top: 10px;
                }
                .analyze-btn:hover { background: #1e7e34; }
                .results {
                    margin-top: 30px;
                    padding: 20px;
                    background: #f8f9fa;
                    border-radius: 5px;
                }
                .loading {
                    display: none;
                    text-align: center;
                    color: #007bff;
                    font-size: 18px;
                }
                .error { color: #dc3545; }
                .success { color: #28a745; }
                .info-card {
                    background: white;
                    padding: 15px;
                    margin: 10px 0;
                    border-radius: 5px;
                    border-left: 4px solid #007bff;
                }
                .entity-tag {
                    display: inline-block;
                    background: #e9ecef;
                    padding: 4px 8px;
                    margin: 2px;
                    border-radius: 3px;
                    font-size: 12px;
                }
                .keyword-tag {
                    display: inline-block;
                    background: #d4edda;
                    padding: 4px 8px;
                    margin: 2px;
                    border-radius: 3px;
                    font-size: 12px;
                }
            </style>
        </head>
        <body>
            <div class="container">
                <h1>📄 AI Document Analyzer</h1>
                <p style="text-align: center; color: #666;">
                    Upload documents for intelligent analysis including OCR, NER, sentiment analysis, and summarization
                </p>

                <div class="upload-area" onclick="document.getElementById('fileInput').click()">
                    <div id="uploadText">
                        <h3>📁 Click to upload document</h3>
                        <p>Supports: PDF, DOCX, TXT, Images (PNG, JPG, etc.)</p>
                    </div>
                    <input type="file" id="fileInput" class="file-input" accept=".pdf,.docx,.doc,.txt,.png,.jpg,.jpeg,.tiff,.bmp">
                </div>

                <div style="text-align: center;">
                    <button class="analyze-btn" onclick="analyzeDocument()" id="analyzeBtn" disabled>
                        🔍 Analyze Document
                    </button>
                </div>

                <div class="loading" id="loading">
                    <h3>🔄 Analyzing document...</h3>
                    <p>This may take a few moments depending on document size and complexity.</p>
                </div>

                <div class="results" id="results" style="display: none;"></div>
            </div>

            <script>
                let selectedFile = null;

                // Escape text before it is concatenated into innerHTML.
                function escapeHtml(value) {
                    return String(value)
                        .replace(/&/g, '&amp;')
                        .replace(/</g, '&lt;')
                        .replace(/>/g, '&gt;')
                        .replace(/"/g, '&quot;')
                        .replace(/'/g, '&#39;');
                }

                document.getElementById('fileInput').addEventListener('change', function(e) {
                    selectedFile = e.target.files[0];
                    if (selectedFile) {
                        document.getElementById('uploadText').innerHTML =
                            '<h3>✅ File selected: ' + escapeHtml(selectedFile.name) + '</h3>' +
                            '<p>Size: ' + (selectedFile.size / 1024 / 1024).toFixed(2) + ' MB</p>';
                        document.getElementById('analyzeBtn').disabled = false;
                    }
                });

                async function analyzeDocument() {
                    if (!selectedFile) {
                        alert('Please select a file first!');
                        return;
                    }

                    const formData = new FormData();
                    formData.append('file', selectedFile);

                    document.getElementById('loading').style.display = 'block';
                    document.getElementById('results').style.display = 'none';
                    document.getElementById('analyzeBtn').disabled = true;

                    try {
                        const response = await fetch('/analyze', {
                            method: 'POST',
                            body: formData
                        });

                        const data = await response.json();

                        // Error responses carry a "detail" field, not an analysis result.
                        if (!response.ok) {
                            const detail = (data && data.detail)
                                ? data.detail
                                : 'Request failed with status ' + response.status;
                            throw new Error(typeof detail === 'string' ? detail : JSON.stringify(detail));
                        }

                        displayResults(data);

                    } catch (error) {
                        document.getElementById('results').innerHTML =
                            '<div class="error"><h3>❌ Error</h3><p>' + escapeHtml(error.message) + '</p></div>';
                        document.getElementById('results').style.display = 'block';
                    }

                    document.getElementById('loading').style.display = 'none';
                    document.getElementById('analyzeBtn').disabled = false;
                }

                function displayResults(data) {
                    let html = '<h3>📊 Analysis Results</h3>';

                    if (data.error) {
                        html += '<div class="error"><h4>❌ Error</h4><p>' + escapeHtml(data.error) + '</p></div>';
                    } else {
                        // File info
                        html += '<div class="info-card">';
                        html += '<h4>📋 File Information</h4>';
                        html += '<p><strong>Filename:</strong> ' + escapeHtml(data.filename) + '</p>';
                        html += '<p><strong>Type:</strong> ' + escapeHtml(String(data.file_type).toUpperCase()) + '</p>';
                        html += '<p><strong>Size:</strong> ' + (data.file_size / 1024).toFixed(2) + ' KB</p>';
                        html += '<p><strong>Text Length:</strong> ' + escapeHtml(data.text_length) + ' characters</p>';
                        html += '<p><strong>Processing Time:</strong> ' + escapeHtml(data.processing_time) + 's</p>';
                        html += '</div>';

                        // Classification
                        if (data.classification && data.classification.category) {
                            html += '<div class="info-card">';
                            html += '<h4>📂 Document Classification</h4>';
                            html += '<p><strong>Category:</strong> ' + escapeHtml(data.classification.category) + '</p>';
                            html += '<p><strong>Confidence:</strong> ' + (data.classification.confidence * 100).toFixed(1) + '%</p>';
                            html += '</div>';
                        }

                        // Sentiment
                        if (data.sentiment && data.sentiment.label) {
                            html += '<div class="info-card">';
                            html += '<h4>😊 Sentiment Analysis</h4>';
                            html += '<p><strong>Sentiment:</strong> ' + escapeHtml(data.sentiment.label) + '</p>';
                            html += '<p><strong>Confidence:</strong> ' + (data.sentiment.confidence * 100).toFixed(1) + '%</p>';
                            html += '</div>';
                        }

                        // Summary
                        if (data.summary && !data.summary.startsWith('Error') && !data.summary.includes('not available')) {
                            html += '<div class="info-card">';
                            html += '<h4>📝 Summary</h4>';
                            html += '<p>' + escapeHtml(data.summary) + '</p>';
                            html += '</div>';
                        }

                        // Keywords
                        if (Array.isArray(data.keywords) && data.keywords.length > 0) {
                            html += '<div class="info-card">';
                            html += '<h4>🔑 Keywords</h4>';
                            data.keywords.forEach(keyword => {
                                html += '<span class="keyword-tag">' + escapeHtml(keyword) + '</span>';
                            });
                            html += '</div>';
                        }

                        // Entities
                        if (data.entities && Object.keys(data.entities).length > 0 && !data.entities.message) {
                            html += '<div class="info-card">';
                            html += '<h4>🏷️ Named Entities</h4>';
                            for (const [label, entities] of Object.entries(data.entities)) {
                                // Values are lists of entity texts; an "error" entry is a plain string.
                                if (Array.isArray(entities) && entities.length > 0) {
                                    html += '<p><strong>' + escapeHtml(label) + ':</strong></p>';
                                    entities.forEach(entity => {
                                        html += '<span class="entity-tag">' + escapeHtml(entity) + '</span>';
                                    });
                                }
                            }
                            html += '</div>';
                        }

                        // Extracted text preview (pre-wrap keeps the document's line breaks)
                        if (data.extracted_text) {
                            html += '<div class="info-card">';
                            html += '<h4>📄 Text Preview</h4>';
                            html += '<p style="max-height: 200px; overflow-y: auto; background: #f8f9fa; padding: 10px; border-radius: 3px; white-space: pre-wrap;">' +
                                    escapeHtml(data.extracted_text) + '</p>';
                            html += '</div>';
                        }
                    }

                    document.getElementById('results').innerHTML = html;
                    document.getElementById('results').style.display = 'block';
                }
            </script>
        </body>
        </html>
        """
        return html_content

    @app.post("/analyze")
    async def analyze_document(file: UploadFile = File(...)):
        """Analyze an uploaded document and return the result as JSON.

        Reads the whole upload into memory and passes it to the shared
        analyzer. Any unexpected failure is logged with its traceback and
        reported as HTTP 500 with the exception text as ``detail``.
        """
        try:
            # Read file content
            content = await file.read()

            # Uploads are not guaranteed to carry a filename; fall back to a
            # placeholder so file-type detection receives a string.
            filename = file.filename or "unknown"

            # Analyze document
            # NOTE(review): this call is synchronous and runs inside the
            # event loop, so requests are handled one at a time while the
            # models run. Confirm the models tolerate concurrent use before
            # moving it to a worker thread.
            result = analyzer.analyze_document(content, filename)

            return JSONResponse(content=result)

        except Exception as e:
            logger.exception(f"Error analyzing document: {e}")
            raise HTTPException(status_code=500, detail=str(e)) from e

    @app.get("/health")
    async def health_check():
        """Report service status and which optional features imported successfully."""
        # Each flag mirrors the module-level availability constant set while
        # importing the optional dependencies.
        feature_flags = dict(
            pdf_processing=PDF_AVAILABLE,
            docx_processing=DOCX_AVAILABLE,
            ocr=OCR_AVAILABLE,
            nlp=NLP_AVAILABLE,
            transformers=TRANSFORMERS_AVAILABLE,
        )
        return dict(
            status="healthy",
            service="document_analyzer",
            features=feature_flags,
        )

# Command Line Interface
def _print_analysis(result):
    """Print a human-readable report for a successful analysis result."""
    print(f"📄 File: {result['filename']}")
    print(f"📁 Type: {result['file_type'].upper()}")
    print(f"📏 Size: {result['file_size']:,} bytes")
    print(f"📝 Text Length: {result['text_length']:,} characters")
    print(f"⏱️ Processing Time: {result['processing_time']}s")

    classification = result.get('classification') or {}
    if classification.get('category'):
        print(f"\n📂 Classification: {classification['category']}")
        print(f"   Confidence: {classification['confidence']:.2%}")

    sentiment = result.get('sentiment') or {}
    if sentiment.get('label'):
        print(f"\n😊 Sentiment: {sentiment['label']}")
        print(f"   Confidence: {sentiment['confidence']:.2%}")

    # Skip placeholder strings, matching what the web UI does.
    summary = result.get('summary')
    if summary and not summary.startswith('Error') and 'not available' not in summary:
        print("\n📝 Summary:")
        print(f"   {summary}")

    if result.get('keywords'):
        print(f"\n🔑 Keywords: {', '.join(result['keywords'][:10])}")

    entities = result.get('entities') or {}
    if entities and not entities.get('message'):
        print("\n🏷️ Named Entities:")
        for label, values in entities.items():
            if isinstance(values, list):
                if values:
                    print(f"   {label}: {', '.join(values[:5])}")
            elif values:
                # e.g. an "error" entry holds one string; joining it would
                # print it one character at a time.
                print(f"   {label}: {values}")

    print("\n📄 Text Preview:")
    preview = result['extracted_text'][:300]
    print(f"   {preview}{'...' if len(result['extracted_text']) > 300 else ''}")


def main():
    """Main function for command line usage.

    With no arguments, prints usage. With ``--web``, starts the web server
    when the web framework is installed. Otherwise treats the first argument
    as a file path, analyzes that file and prints a report. Errors are
    printed rather than raised.
    """
    import sys

    if len(sys.argv) < 2:
        print("📄 Document Analyzer")
        print("===================")
        print()
        print("Usage:")
        print("  python document_analyzer.py <file_path>     # Analyze single file")
        print("  python document_analyzer.py --web           # Start web server")
        print()
        print("Examples:")
        print("  python document_analyzer.py document.pdf")
        print("  python document_analyzer.py --web")
        print()
        return

    if sys.argv[1] == "--web":
        if WEB_AVAILABLE:
            print("🚀 Starting Document Analyzer Web Server...")
            print("📱 Access at: http://localhost:8000")
            uvicorn.run(app, host="127.0.0.1", port=8000, log_level="info")
        else:
            print("❌ Web server not available. Install with: pip install fastapi uvicorn")
        return

    # Analyze file from command line
    file_path = sys.argv[1]

    if not os.path.exists(file_path):
        print(f"❌ File not found: {file_path}")
        return

    try:
        # Read file
        with open(file_path, 'rb') as f:
            content = f.read()

        # When the web framework is installed, an analyzer already exists at
        # module level; reuse it so the models are not loaded twice.
        analyzer = globals().get("analyzer")
        if analyzer is None:
            analyzer = DocumentAnalyzer()

        # Analyze document
        print(f"🔍 Analyzing: {file_path}")
        result = analyzer.analyze_document(content, os.path.basename(file_path))

        # Display results
        print("\n📊 Analysis Results")
        print("=" * 50)

        if "error" in result:
            print(f"❌ Error: {result['error']}")
            return

        _print_analysis(result)

    except Exception as e:
        print(f"❌ Error: {e}")

if __name__ == "__main__":
    main()

⚠️ OCR not available. Install with: pip install pytesseract
⚠️ PDF processing not available. Install with: pip install PyMuPDF
⚠️ DOCX processing not available. Install with: pip install python-docx mammoth
