In [18]:
# DEPENDENCIES
import os
import cv2
import fitz
import sys
import easyocr
import numpy as np
from PIL import Image
from PIL import ImageEnhance
from PIL import ImageFilter
from PIL import ImageOps
from io import BytesIO
from typing import List, Tuple
from pathlib import Path
from typing import Optional
from paddleocr import PaddleOCR

#sys.path.append("../..")
#from utils.error_handler import OCRException
#from config.settings import get_settings
#from config.logging_config import get_logger
#from utils.error_handler import handle_errors


# Setup Settings and Logging
#settings = get_settings()
#logger   = get_logger(__name__)


class OCREngine:
    """
    Universal OCR engine that works across all PDF types:
    - PPT-converted PDFs
    - Image-based PDFs  
    - Scanned documents
    - Mixed content PDFs
    
    Uses adaptive preprocessing and multiple extraction strategies
    """
    def __init__(self, use_paddle: bool = True, lang: str = 'en', gpu: bool = False):
        """
        Initialize OCR engine
        
        Arguments:
        ----------
            use_paddle  { bool } : Use PaddleOCR as primary (better accuracy)
            lang        { str }  : Language code ('en', 'es', 'fr', 'de', etc.)
            gpu         { bool } : Use GPU acceleration if available
        """
        #self.logger       = logger
        self.use_paddle   = use_paddle
        self.lang         = lang
        self.gpu          = gpu
        self.paddle_ocr   = None
        self.easy_ocr     = None
        self._initialized = False
        
        self._initialize_engines()
    

    def _initialize_engines(self):
        """
        Initialize OCR engines with proper error handling
        """
        # Try PaddleOCR first (better accuracy)
        if self.use_paddle:
            try:
                self.paddle_ocr = PaddleOCR(
                    use_angle_cls=True,
                    lang=self.lang,
                    use_gpu=self.gpu,
                    show_log=False,
                    det_db_thresh=0.3,
                    det_db_box_thresh=0.5,
                    rec_batch_num=6,
                )
                #self.logger.info("PaddleOCR initialized successfully")

            except ImportError as e:
                #self.logger.warning(f"PaddleOCR not available: {repr(e)}. Falling back to EasyOCR.")
                self.use_paddle = False
            
            except Exception as e:
                #self.logger.error(f"Failed to initialize PaddleOCR: {repr(e)}")
                self.use_paddle = False
        
        # Initialize EasyOCR as fallback
        if not self.use_paddle:
            try: 
                self.easy_ocr = easyocr.Reader([self.lang], gpu=self.gpu)
                #self.logger.info("EasyOCR initialized successfully")

            except ImportError as e:
                #self.logger.error(f"EasyOCR not available: {repr(e)}")
                raise e #OCRException("No OCR engines available. Install PaddleOCR or EasyOCR.")

            except Exception as e:
                self.logger.error(f"Failed to initialize EasyOCR: {repr(e)}")
                raise e #OCRException(f"OCR engine initialization failed: {repr(e)}")
        
        self._initialized = True
    

    #@handle_errors(error_type=OCRException, log_error=True, reraise=True)
    def extract_text_from_pdf(self, pdf_path: Path, pages: Optional[List[int]] = None) -> str:
        """
        Extract text from PDF using OCR with adaptive preprocessing
        Works across all PDF types automatically
        
        Arguments:
        ----------
            pdf_path { Path } : Path to PDF file
            pages    { list } : Specific pages to OCR (None = all pages)
            
        Returns:
        --------
               { str }        : Extracted text from all specified pages
        """
        pdf_path = Path(pdf_path)
        #self.logger.info(f"Starting universal OCR extraction from PDF: {pdf_path}")
        
        if not pdf_path.exists():
            raise #OCRException(f"PDF file not found: {pdf_path}")
        
        # Convert PDF pages to high-quality images
        images = self._pdf_to_images(pdf_path=pdf_path, pages=pages, dpi=300)
        #self.logger.info(f"Converted {len(images)} pages to images for OCR")
        
        # OCR each image with adaptive approach
        all_text = list()

        for i, image in enumerate(images):
            page_num = pages[i] if pages else i + 1
            #self.logger.info(f"Processing page {page_num}...")
            
            try:
                # Try multiple strategies and pick best result
                page_text = self._extract_text_adaptive(image, page_num)

                if page_text and page_text.strip():
                    all_text.append(f"[PAGE {page_num}]\n{page_text}")
                    #self.logger.info(f"âœ“ Extracted {len(page_text)} characters from page {page_num}")
                else:
                    #self.logger.warning(f"No text extracted from page {page_num}")
                    all_text.append(f"[PAGE {page_num}]\n")
            
            except Exception as e:
                #self.logger.error(f"OCR failed for page {page_num}: {repr(e)}")
                all_text.append(f"[PAGE {page_num}]\n[OCR FAILED: {str(e)}]")
        
        combined_text = "\n\n".join(all_text)
        #self.logger.info(f"OCR completed: {len(combined_text)} total characters extracted")
        
        return combined_text
    

    #@handle_errors(error_type=OCRException, log_error=True, reraise=True)
    def extract_text_from_image(self, image_path: Path) -> str:
        """
        Extract text from image file
        
        Arguments:
        ----------
            image_path { Path } : Path to image file
            
        Returns:
        --------
                { str }         : Extracted text
        """
        image_path = Path(image_path)
        #self.logger.info(f"Extracting text from image: {image_path}")
        
        if not image_path.exists():
            raise #OCRException(f"Image file not found: {image_path}")
        
        # Load image
        image = Image.open(image_path)
        if image.mode != 'RGB':
            image = image.convert('RGB')
        
        # Use adaptive extraction
        text = self._extract_text_adaptive(image, page_num=1)
        #self.logger.info(f"Image OCR completed: {len(text)} characters extracted")
        
        return text
    

    def _pdf_to_images(self, pdf_path: Path, pages: Optional[List[int]] = None, dpi: int = 300) -> List[Image.Image]:
        """
        Convert PDF pages to high-quality images
        
        Arguments:
        ----------
            pdf_path { Path } : Path to PDF file
            pages    { list } : Specific pages to convert (None = all pages)
            dpi      { int }  : DPI for image conversion (300 is good balance)
            
        Returns:
        --------
               { list }       : List of PIL Images
        """
        try:
            doc = fitz.open(str(pdf_path))
            images = list()
            
            if pages is None:
                pages_to_process = range(len(doc))
            else:
                pages_to_process = [p-1 for p in pages if 0 < p <= len(doc)]
            
            for page_num in pages_to_process:
                page = doc[page_num]
                
                # High-quality conversion
                zoom = dpi / 72.0
                mat = fitz.Matrix(zoom, zoom)
                pix = page.get_pixmap(matrix=mat, alpha=False)
                
                # Convert to PIL Image
                img_data = pix.tobytes("png")
                image = Image.open(BytesIO(img_data))
                
                if image.mode != 'RGB':
                    image = image.convert('RGB')
                
                images.append(image)
            
            doc.close()
            return images
            
        except Exception as e:
            raise #OCRException(f"Failed to convert PDF to images: {repr(e)}")
    

    def _extract_text_adaptive(self, image: Image.Image, page_num: int) -> str:
        """
        ADAPTIVE TEXT EXTRACTION - tries multiple strategies and picks best result
        
        Strategy 1: Direct OCR (for clean PPT slides)
        Strategy 2: Light preprocessing (for good quality scans)
        Strategy 3: Heavy preprocessing (for poor quality scans)
        
        Arguments:
        ----------
            image    { Image.Image } : PIL Image
            page_num { int }         : Page number for logging
            
        Returns:
        --------
                 { str }             : Best extracted text
        """
        results = []
        
        # Strategy 1: Direct OCR (best for PPT slides, clean images)
        try:
            self.logger.debug(f"Page {page_num}: Trying direct OCR...")
            img_array = np.array(image)
            text1 = self._ocr_image(img_array)
            if text1 and len(text1.strip()) > 10:
                results.append(('direct', text1, len(text1)))
                #self.logger.debug(f"Page {page_num}: Direct OCR extracted {len(text1)} chars")
        except Exception as e:
            #self.logger.debug(f"Page {page_num}: Direct OCR failed: {repr(e)}")
        
        # Strategy 2: Light preprocessing (for slight quality issues)
        try:
            #self.logger.debug(f"Page {page_num}: Trying light preprocessing...")
            processed = self._preprocess_light(image)
            text2 = self._ocr_image(processed)
            if text2 and len(text2.strip()) > 10:
                results.append(('light', text2, len(text2)))
                #self.logger.debug(f"Page {page_num}: Light preprocessing extracted {len(text2)} chars")
        except Exception as e:
            print(e) #self.logger.debug(f"Page {page_num}: Light preprocessing failed: {repr(e)}")
        
        # Strategy 3: Heavy preprocessing (for poor scans)
        try:
            #self.logger.debug(f"Page {page_num}: Trying heavy preprocessing...")
            processed = self._preprocess_heavy(image)
            text3 = self._ocr_image(processed)
            if text3 and len(text3.strip()) > 10:
                results.append(('heavy', text3, len(text3)))
                #self.logger.debug(f"Page {page_num}: Heavy preprocessing extracted {len(text3)} chars")
        except Exception as e:
            raise
            #self.logger.debug(f"Page {page_num}: Heavy preprocessing failed: {repr(e)}")
        
        # Pick the best result (most text extracted)
        if results:
            best_strategy, best_text, best_length = max(results, key=lambda x: x[2])
            #self.logger.info(f"Page {page_num}: Best result from '{best_strategy}' strategy ({best_length} chars)")
            return best_text
        
        return ""
    

    def _preprocess_light(self, image: Image.Image) -> np.ndarray:
        """
        Light preprocessing for good quality images
        - Slight contrast enhancement
        - Sharpening
        - Minimal noise reduction
        """
        try:
            # Enhance contrast slightly
            enhancer = ImageEnhance.Contrast(image)
            image = enhancer.enhance(1.2)
            
            # Sharpen slightly
            image = image.filter(ImageFilter.SHARPEN)
            
            # Convert to numpy
            img_array = np.array(image)
            
            # Convert to grayscale
            if len(img_array.shape) == 3:
                gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
            else:
                gray = img_array
            
            return gray
            
        except Exception as e:
            #self.logger.warning(f"Light preprocessing failed: {repr(e)}")
            return np.array(image)
    

    def _preprocess_heavy(self, image: Image.Image) -> np.ndarray:
        """
        Heavy preprocessing for poor quality scans
        - Grayscale conversion
        - Noise reduction
        - Adaptive thresholding
        - Deskewing
        - Border removal
        """
        try:
            # Convert to numpy array
            img_array = np.array(image)
            
            # Convert to grayscale
            if len(img_array.shape) == 3:
                gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
            else:
                gray = img_array
            
            # Denoise
            denoised = cv2.fastNlMeansDenoising(gray, h=10)
            
            # Adaptive thresholding for better text separation
            binary = cv2.adaptiveThreshold(
                denoised,
                255,
                cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                cv2.THRESH_BINARY,
                11,
                2
            )
            
            # Deskew if needed
            binary = self._deskew_image(binary)
            
            # Morphological operations to clean up
            kernel = np.ones((1, 1), np.uint8)
            binary = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)
            
            return binary
            
        except Exception as e:
            #self.logger.warning(f"Heavy preprocessing failed: {repr(e)}")
            return np.array(image)
    

    def _deskew_image(self, image: np.ndarray) -> np.ndarray:
        """
        Deskew image to correct rotation
        """
        try:
            coords = np.column_stack(np.where(image > 0))
            angle = cv2.minAreaRect(coords)[-1]
            
            if angle < -45:
                angle = -(90 + angle)
            else:
                angle = -angle
            
            # Only rotate if angle is significant
            if abs(angle) > 0.5:
                (h, w) = image.shape[:2]
                center = (w // 2, h // 2)
                M = cv2.getRotationMatrix2D(center, angle, 1.0)
                rotated = cv2.warpAffine(
                    image, M, (w, h),
                    flags=cv2.INTER_CUBIC,
                    borderMode=cv2.BORDER_REPLICATE
                )
                return rotated
            
            return image
            
        except Exception:
            return image
    

    def _ocr_image(self, image_array: np.ndarray) -> str:
        """
        Perform OCR on preprocessed image
        
        Arguments:
        ----------
            image_array { np.ndarray } : Image as numpy array
            
        Returns:
        --------
                    { str }            : Extracted text
        """
        # Try PaddleOCR first
        if self.use_paddle and self.paddle_ocr:
            try:
                result = self._ocr_with_paddle(image_array)
                if result and result.strip():
                    return result
            except Exception as e:
                print (e) #self.logger.debug(f"PaddleOCR failed: {repr(e)}")
        
        # Fallback to EasyOCR
        if self.easy_ocr:
            try:
                result = self._ocr_with_easyocr(image_array)
                if result and result.strip():
                    return result
            except Exception as e:
                #self.logger.debug(f"EasyOCR failed: {repr(e)}")
        
        return ""
    

    def _ocr_with_paddle(self, image_array: np.ndarray) -> str:
        """
        OCR using PaddleOCR with proper text ordering
        """
        try:
            result = self.paddle_ocr.ocr(image_array, cls=True)
            
            if not result or not result[0]:
                return ""
            
            # Extract and sort text by position (top to bottom, left to right)
            text_items = []
            
            page_result = result[0] if isinstance(result, list) else result
            
            for line in page_result:
                if line and len(line) >= 2:
                    # Get bounding box and text
                    bbox = line[0]
                    text_info = line[1]
                    
                    if isinstance(text_info, (list, tuple)) and len(text_info) >= 2:
                        text = text_info[0]
                        confidence = text_info[1]
                    elif isinstance(text_info, str):
                        text = text_info
                        confidence = 1.0
                    else:
                        continue
                    
                    # Get y-coordinate for sorting (top of bounding box)
                    y_coord = bbox[0][1] if bbox and len(bbox) > 0 else 0
                    
                    # Filter by confidence
                    if confidence > 0.5 and text and text.strip():
                        text_items.append((y_coord, text.strip()))
            
            # Sort by y-coordinate and join
            text_items.sort(key=lambda x: x[0])
            return "\n".join([text for _, text in text_items])
            
        except Exception as e:
            #self.logger.debug(f"PaddleOCR processing error: {repr(e)}")
            return ""
    

    def _ocr_with_easyocr(self, image_array: np.ndarray) -> str:
        """
        OCR using EasyOCR with proper text ordering
        """
        try:
            result = self.easy_ocr.readtext(image_array, paragraph=False)
            
            if not result:
                return ""
            
            # Sort by y-coordinate (top to bottom)
            text_items = []
            
            for detection in result:
                bbox = detection[0]
                text = detection[1]
                confidence = detection[2]
                
                # Get y-coordinate
                y_coord = bbox[0][1] if bbox and len(bbox) > 0 else 0
                
                # Filter by confidence
                if confidence > 0.5 and text and text.strip():
                    text_items.append((y_coord, text.strip()))
            
            # Sort and join
            text_items.sort(key=lambda x: x[0])
            return "\n".join([text for _, text in text_items])
            
        except Exception as e:
            #self.logger.debug(f"EasyOCR processing error: {repr(e)}")
            return ""
    

    def get_supported_languages(self) -> List[str]:
        """Get list of supported languages"""
        return ['en', 'es', 'fr', 'de', 'it', 'pt', 'ru', 'zh', 'ja', 'ko', 'ar']
    

    def get_engine_info(self) -> dict:
        """Get information about OCR engine configuration"""
        return {
            "primary_engine": "PaddleOCR" if self.use_paddle else "EasyOCR",
            "language": self.lang,
            "gpu_enabled": self.gpu,
            "initialized": self._initialized,
            "supported_languages": self.get_supported_languages(),
            "adaptive_preprocessing": True,
        }


# Global OCR instance
_global_ocr_engine = None


def get_ocr_engine() -> OCREngine:
    """Get global OCR engine instance (singleton)"""
    global _global_ocr_engine
    
    if _global_ocr_engine is None:
        _global_ocr_engine = OCREngine()
    
    return _global_ocr_engine


def extract_text_with_ocr(file_path: Path, **kwargs) -> str:
    """Convenience function for OCR text extraction"""
    ocr_engine = get_ocr_engine()
    
    if file_path.suffix.lower() == '.pdf':
        return ocr_engine.extract_text_from_pdf(file_path, **kwargs)
    else:
        return ocr_engine.extract_text_from_image(file_path, **kwargs)


IndentationError: expected an indented block after 'except' statement on line 262 (1419192907.py, line 266)

In [None]:
# Initialize OCR engine
ocr = get_ocr_engine()

# Extract from your calculus PDF
text = ocr.extract_text_from_pdf(Path("calculus_slides.pdf"))
print(text)

# Extract specific pages
text = ocr.extract_text_from_pdf(Path("calculus_slides.pdf"), pages=[1, 2, 3])
