In [26]:
import os
import PyPDF2
from PyPDF2 import PdfReader, PdfWriter
import math

def split_pdf_by_size(input_path, max_size_mb=2, output_dir=None, max_pages_per_chunk=35):
    """
    Split a PDF into smaller chunks sized to roughly ``max_size_mb`` each.

    The chunk size is an estimate, not a guarantee: pages are distributed
    evenly over ceil(file size / max_size_mb) chunks, so a chunk that happens
    to hold unusually heavy pages can still exceed ``max_size_mb``. Such chunks
    are reported with a warning rather than being re-split.

    Args:
        input_path: Path to the input PDF file
        max_size_mb: Target maximum size per chunk in MB (default: 2MB for optimal speed)
        output_dir: Directory in which the "<name>_chunks" folder is created
            (default: same directory as the input file)
        max_pages_per_chunk: Hard upper bound on the number of pages per chunk
            (default: 35)

    Returns:
        List of paths to the created PDF chunks, in page order

    Raises:
        ValueError: If max_size_mb is not greater than zero or
            max_pages_per_chunk is smaller than one
    """
    # Reject nonsensical limits up front; max_size_mb == 0 would otherwise
    # surface later as a ZeroDivisionError.
    if max_size_mb <= 0:
        raise ValueError(f"max_size_mb must be greater than 0, got {max_size_mb!r}")
    if max_pages_per_chunk < 1:
        raise ValueError(f"max_pages_per_chunk must be at least 1, got {max_pages_per_chunk!r}")

    bytes_per_mb = 1024 * 1024

    # Get input file info
    input_size_mb = os.path.getsize(input_path) / bytes_per_mb
    name_without_ext = os.path.splitext(os.path.basename(input_path))[0]

    if output_dir is None:
        output_dir = os.path.dirname(input_path)

    # Create output directory if it doesn't exist (exist_ok avoids the
    # check-then-create race and makes re-runs idempotent)
    chunks_dir = os.path.join(output_dir, f"{name_without_ext}_chunks")
    os.makedirs(chunks_dir, exist_ok=True)

    print(f"Original PDF size: {input_size_mb:.2f} MB")
    print(f"Target chunk size: {max_size_mb} MB")

    # If file is already small enough, just copy it
    if input_size_mb <= max_size_mb:
        print("PDF is already small enough, no splitting needed")
        chunk_path = os.path.join(chunks_dir, f"{name_without_ext}_chunk_001.pdf")
        import shutil
        shutil.copy2(input_path, chunk_path)
        return [chunk_path]

    # Read the PDF
    reader = PdfReader(input_path)
    total_pages = len(reader.pages)

    # Spread the pages evenly over the number of chunks the size limit calls
    # for, never going below one page or above the page cap.
    size_based_chunks = math.ceil(input_size_mb / max_size_mb)
    pages_per_chunk = max(1, total_pages // size_based_chunks)
    pages_per_chunk = min(pages_per_chunk, max_pages_per_chunk)

    # Floor division and the page cap can both change the chunk count, so
    # report the number that will actually be written.
    estimated_chunks = math.ceil(total_pages / pages_per_chunk)

    print(f"Total pages: {total_pages}")
    print(f"Estimated chunks: {estimated_chunks}")
    print(f"Pages per chunk: {pages_per_chunk}")

    chunk_paths = []
    chunk_starts = range(0, total_pages, pages_per_chunk)

    for chunk_num, start_page in enumerate(chunk_starts, start=1):
        end_page = min(start_page + pages_per_chunk, total_pages)

        # Create a new PDF writer for this chunk and add its pages
        writer = PdfWriter()
        for page_num in range(start_page, end_page):
            writer.add_page(reader.pages[page_num])

        # Save the chunk
        chunk_filename = f"{name_without_ext}_chunk_{chunk_num:03d}.pdf"
        chunk_path = os.path.join(chunks_dir, chunk_filename)

        with open(chunk_path, 'wb') as output_file:
            writer.write(output_file)

        chunk_size_mb = os.path.getsize(chunk_path) / bytes_per_mb
        print(f"Created chunk {chunk_num}: {chunk_filename} ({chunk_size_mb:.2f} MB, pages {start_page+1}-{end_page})")

        # The split is only an estimate; make an overshoot visible.
        if chunk_size_mb > max_size_mb:
            print(f"Warning: chunk {chunk_num} is {chunk_size_mb:.2f} MB, above the {max_size_mb} MB target")

        chunk_paths.append(chunk_path)

    print(f"\nPDF split into {len(chunk_paths)} chunks")
    print(f"Chunks saved in: {chunks_dir}")

    return chunk_paths

def split_pdf_by_pages(input_path, pages_per_chunk=35, output_dir=None):
    """
    Split a PDF into smaller chunks based on number of pages

    Chunks are written to "<output_dir>/<name>_chunks/<name>_chunk_NNN.pdf",
    where <name> is the input filename without its extension. The last chunk
    holds the remaining pages and may be shorter than the others.

    Args:
        input_path: Path to the input PDF file
        pages_per_chunk: Number of pages per chunk (default: 35, max recommended)
        output_dir: Directory in which the "<name>_chunks" folder is created
            (default: same directory as the input file)

    Returns:
        List of paths to the created PDF chunks, in page order

    Raises:
        ValueError: If pages_per_chunk is smaller than one
    """
    # Validate before touching the filesystem: a step of 0 makes range() fail
    # with an unrelated message and a negative step silently yields no chunks.
    if pages_per_chunk < 1:
        raise ValueError(f"pages_per_chunk must be at least 1, got {pages_per_chunk!r}")

    # Get input file info
    name_without_ext = os.path.splitext(os.path.basename(input_path))[0]

    if output_dir is None:
        output_dir = os.path.dirname(input_path)

    # Create output directory if it doesn't exist (exist_ok avoids the
    # check-then-create race and makes re-runs idempotent)
    chunks_dir = os.path.join(output_dir, f"{name_without_ext}_chunks")
    os.makedirs(chunks_dir, exist_ok=True)

    # Read the PDF
    reader = PdfReader(input_path)
    total_pages = len(reader.pages)

    print(f"Total pages: {total_pages}")
    print(f"Pages per chunk: {pages_per_chunk}")

    chunk_paths = []
    chunk_starts = range(0, total_pages, pages_per_chunk)

    for chunk_num, start_page in enumerate(chunk_starts, start=1):
        end_page = min(start_page + pages_per_chunk, total_pages)

        # Create a new PDF writer for this chunk and add its pages
        writer = PdfWriter()
        for page_num in range(start_page, end_page):
            writer.add_page(reader.pages[page_num])

        # Save the chunk
        chunk_filename = f"{name_without_ext}_chunk_{chunk_num:03d}.pdf"
        chunk_path = os.path.join(chunks_dir, chunk_filename)

        with open(chunk_path, 'wb') as output_file:
            writer.write(output_file)

        chunk_size_mb = os.path.getsize(chunk_path) / (1024 * 1024)
        print(f"Created chunk {chunk_num}: {chunk_filename} ({chunk_size_mb:.2f} MB, pages {start_page+1}-{end_page})")

        chunk_paths.append(chunk_path)

    print(f"\nPDF split into {len(chunk_paths)} chunks")
    print(f"Chunks saved in: {chunks_dir}")

    return chunk_paths

# Example usage (both functions return the list of chunk paths they wrote
# into a "<name>_chunks" folder):
#
# Size-based splitting (recommended for OCR website limits)
# chunk_paths = split_pdf_by_size("/path/to/large_file.pdf", max_size_mb=5)

# Page-based splitting
# chunk_paths = split_pdf_by_pages("/path/to/large_file.pdf", pages_per_chunk=10)

# Visual confirmation that this cell ran and the functions above are defined.
print("PDF splitting functions ready!")

PDF splitting functions ready!


In [27]:
import time
import shutil
from selenium import webdriver
from selenium.webdriver.common.by import By
import pyperclip
from module.playsound import play_sound


def _click_first_match(driver, selectors, require_displayed=True):
    """Click the first element matched by any of the CSS ``selectors``.

    Selectors are tried in order. A selector that matches nothing, matches a
    hidden element (when ``require_displayed`` is true) or whose click fails is
    skipped and the next one is tried.

    Returns:
        True if an element was clicked, False otherwise.
    """
    for selector in selectors:
        try:
            element = driver.find_element(By.CSS_SELECTOR, selector)
            if require_displayed and not element.is_displayed():
                continue
            element.click()
            return True
        except Exception:
            continue
    return False


def _complex_captcha_present(driver):
    """Return True when an image-selection reCAPTCHA challenge appears to be showing.

    Looks for a visible challenge element first and falls back to scanning the
    page source for typical challenge phrases. Any error while checking is
    printed and treated as "no challenge".
    """
    try:
        for selector in ('iframe[src*="bframe"]', '.rc-imageselect'):
            try:
                if driver.find_element(By.CSS_SELECTOR, selector).is_displayed():
                    return True
            except Exception:
                continue

        page_text = driver.page_source.lower()
        challenge_phrases = ['select all images', 'crosswalks', 'traffic lights']
        return any(phrase in page_text for phrase in challenge_phrases)
    except Exception as e:
        print(f"Error checking for complex CAPTCHA: {e}")
        return False


def _read_ocr_textbox(driver):
    """Return the text currently held by the ``ocrTextBox`` element, or None if blank.

    The element lookup error is allowed to propagate so that callers can
    decide whether a missing textbox means "retry" or "give up".
    """
    ocr_textbox = driver.find_element(By.ID, 'ocrTextBox')
    text = ocr_textbox.get_attribute('value') or ocr_textbox.text
    if text and text.strip():
        return text
    return None


def process_pdf_chunk_through_ocr(driver, chunk_path, output_base_dir):
    """
    Process a single PDF chunk through the OCR workflow - COMPLETE PROCESSING

    Drives the i2ocr web page: uploads the chunk, dismisses cookie banners,
    handles the reCAPTCHA (prompting for manual help on image challenges),
    starts the OCR run and then extracts the text page by page. Each page is
    written to "page_NNN.txt" (or a "page_NNN_empty.txt" marker when no text
    came back) inside "<output_base_dir>/extracted_text_<chunk name>", together
    with a "chunk_summary.txt".

    The fixed sleeps give the site time to react; they are deliberate.

    Args:
        driver: Selenium WebDriver instance
        chunk_path: Path to the PDF chunk to process
        output_base_dir: Base directory for output files

    Returns:
        Dictionary with processing results. On success:
        {"success": True, "chunk_name", "total_pages", "extracted_pages",
        "output_dir", "failed_pages"}; on failure:
        {"success": False, "error": <message>}.
    """
    chunk_filename = os.path.basename(chunk_path)
    chunk_name = os.path.splitext(chunk_filename)[0]

    print(f"\n{'='*60}")
    print(f"Processing: {chunk_filename}")
    print(f"{'='*60}")

    try:
        # STEP 1: Navigate to fresh OCR page
        print("🌐 Step 1: Loading fresh OCR page...")
        driver.get('https://www.i2ocr.com/free-online-nepali-ocr')
        time.sleep(5)  # Wait for page to load completely

        # STEP 2: Upload the chunk file
        print("📄 Step 2: Uploading PDF chunk...")
        button_for_image_upload = driver.find_element(By.ID, 'i2ocr_uploadedfile')
        button_for_image_upload.send_keys(chunk_path)
        print("✓ File uploaded successfully")
        time.sleep(3)  # Wait for file to upload

        # STEP 3: Handle initial cookies (best effort; no banner is fine)
        print("🍪 Step 3: Handling cookies...")
        time.sleep(2)
        initial_cookie_selectors = [
            'button[id*="accept"]',
            'button[class*="accept"]',
            'button[id*="cookie"]',
            'button[class*="cookie"]'
        ]
        if _click_first_match(driver, initial_cookie_selectors):
            print("✓ Initial cookies accepted")

        # STEP 4: Handle reCAPTCHA with manual verification for complex CAPTCHAs
        print("🔐 Step 4: Handling reCAPTCHA...")
        try:
            # Look for reCAPTCHA iframe
            recaptcha_iframe = None
            iframe_selectors = [
                'iframe[src*="recaptcha"]',
                'iframe[title*="reCAPTCHA"]',
                'iframe[name*="recaptcha"]'
            ]
            for selector in iframe_selectors:
                try:
                    recaptcha_iframe = driver.find_element(By.CSS_SELECTOR, selector)
                    break
                except Exception:
                    continue

            if recaptcha_iframe is None:
                print("✓ No reCAPTCHA found")
            else:
                driver.switch_to.frame(recaptcha_iframe)
                try:
                    # Click the checkbox
                    checkbox_selectors = [
                        '.recaptcha-checkbox-border',
                        '.recaptcha-checkbox',
                        '#recaptcha-anchor'
                    ]
                    if _click_first_match(driver, checkbox_selectors, require_displayed=False):
                        print("✓ reCAPTCHA checkbox clicked")
                finally:
                    # Always leave the iframe, otherwise every later lookup
                    # would run inside the reCAPTCHA frame.
                    driver.switch_to.default_content()
                time.sleep(5)  # Wait for CAPTCHA processing

                if _complex_captcha_present(driver):
                    print("🤖 Complex image CAPTCHA detected!")
                    print("👤 Please solve the CAPTCHA manually in the browser window.")
                    print("🔍 Look for image selection challenges (crosswalks, traffic lights, etc.)")
                    # The sound is only an attention signal; a missing file or
                    # player must not derail the CAPTCHA flow.
                    try:
                        play_sound('/var/home/ramrshrcg/Music/Happy Sound Effects [kgS40C2VBBw].mp3')
                    except Exception as e:
                        print(f"⚠ Could not play notification sound: {e}")

                    # Wait for user to solve CAPTCHA manually
                    input("✋ Press Enter after you have completed the CAPTCHA manually...")
                    print("✓ Manual CAPTCHA verification completed")
                else:
                    print("✓ Simple reCAPTCHA completed")

        except Exception as e:
            print(f"Error handling reCAPTCHA: {e}")
            print("⚠️ If there's a CAPTCHA visible, please solve it manually.")
            user_input = input("Type 'done' when CAPTCHA is solved, or 'skip' to skip this chunk: ").lower()
            if user_input != 'done':
                return {"success": False, "error": "CAPTCHA solving skipped by user"}

        # STEP 5: Handle post-reCAPTCHA cookies (best effort)
        print("🍪 Step 5: Handling post-reCAPTCHA cookies...")
        time.sleep(5)
        post_captcha_cookie_selectors = [
            '.cc-btn.cc-dismiss',
            'a[class*="cc-dismiss"]',
            '.cc-compliance a'
        ]
        if _click_first_match(driver, post_captcha_cookie_selectors):
            print("✓ Post-reCAPTCHA cookies handled")

        # STEP 6: Click Extract Text button and wait for processing
        print("⚡ Step 6: Starting OCR processing...")
        try:
            time.sleep(3)
            extract_button = driver.find_element(By.ID, 'submit_i2ocr')
            extract_button.click()
            print("✓ Extract Text button clicked")

            # Wait for OCR processing to complete
            print("⏳ Waiting for OCR processing to complete...")
            time.sleep(20)  # Give more time for processing

        except Exception as e:
            print(f"❌ Error clicking Extract Text button: {e}")
            return {"success": False, "error": "Extract button click failed"}

        # STEP 7: Wait for page viewer to load and extract ALL pages
        print("📖 Step 7: Processing all pages in chunk...")

        # Create output folder for this chunk
        chunk_output_dir = os.path.join(output_base_dir, f"extracted_text_{chunk_name}")
        os.makedirs(chunk_output_dir, exist_ok=True)

        # Wait (up to max_wait seconds) for the page thumbnails to appear
        max_wait = 60
        waited = 0
        while waited < max_wait:
            try:
                page_elements = driver.find_elements(By.CSS_SELECTOR, 'img[id^="page_"]')
                if page_elements:
                    break
                time.sleep(5)
                waited += 5
                print(f"⏳ Still waiting for page viewer... ({waited}s)")
            except Exception:
                time.sleep(5)
                waited += 5

        # Find all pages in the chunk
        try:
            page_elements = driver.find_elements(By.CSS_SELECTOR, 'img[id^="page_"]')
            total_pages = len(page_elements)
            print(f"📄 Found {total_pages} pages in chunk")

            if total_pages == 0:
                print("❌ No pages found in chunk")
                return {"success": False, "error": "No pages found"}

            extracted_pages = 0
            error_page = []  # pages that could not be processed at all

            # Process each page completely
            for page_num in range(1, total_pages + 1):
                try:
                    print(f"\n--- Processing page {page_num}/{total_pages} ---")

                    # Clear text left over from the previous page. Without
                    # this the wait loop below sees the old text immediately
                    # and the previous page's content is saved under this
                    # page's number.
                    try:
                        stale_textbox = driver.find_element(By.ID, 'ocrTextBox')
                        driver.execute_script(
                            "arguments[0].value = ''; arguments[0].innerHTML = '';",
                            stale_textbox
                        )
                    except Exception as e:
                        print(f"⚠ Could not clear textbox: {e}")

                    # Click on page to select it
                    page_element = driver.find_element(By.ID, f'page_{page_num}')
                    page_element.click()
                    print(f"✓ Selected page {page_num}")
                    time.sleep(10)

                    # Click Extract Page Text button, retry if not found
                    max_retries = 3
                    extract_clicked = False
                    for attempt in range(1, max_retries + 1):
                        try:
                            extract_page_button = driver.find_element(
                                By.XPATH,
                                "//button[contains(text(), 'Extract Page Text')] | //input[contains(@value, 'Extract Page Text')]"
                            )
                            extract_page_button.click()
                            print(f"✓ Extract Page Text clicked for page {page_num} (attempt {attempt})")
                            time.sleep(15)  # Wait for page processing
                            extract_clicked = True
                            break
                        except Exception:
                            print(f"⚠ Extract Page Text button not found for page {page_num} (attempt {attempt})")
                            if attempt < max_retries:
                                time.sleep(5)

                    if not extract_clicked:
                        # All retries failed: record the page and move on
                        error_page.append(page_num)
                        continue

                    # Wait for this page's OCR to complete
                    print(f"⏳ Waiting for page {page_num} OCR to complete...")
                    max_page_wait = 60
                    page_waited = 0
                    while page_waited < max_page_wait:
                        try:
                            if _read_ocr_textbox(driver) is not None:
                                print(f"✓ Page {page_num} OCR completed")
                                break
                            time.sleep(5)
                            page_waited += 5
                            print(f"⏳ Still processing page {page_num}... ({page_waited}s)")
                        except Exception:
                            time.sleep(5)
                            page_waited += 5

                    # Extract text from this page
                    extracted_text = None

                    # Method 1: Try ocrTextBox first
                    try:
                        extracted_text = _read_ocr_textbox(driver)
                        if extracted_text:
                            print(f"✓ Text extracted from ocrTextBox")
                    except Exception:
                        extracted_text = None

                    # Method 2: Try copy button if ocrTextBox fails
                    if not extracted_text:
                        try:
                            copy_button = driver.find_element(By.CSS_SELECTOR, "button.copy_btn")
                            copy_button.click()
                            time.sleep(2)
                            pyperclip.copy('')  # Clear clipboard first
                            copy_button.click()  # Click again to copy text
                            time.sleep(2)
                            extracted_text = pyperclip.paste()
                            if extracted_text and extracted_text.strip():
                                print(f"✓ Text extracted from clipboard")
                            else:
                                extracted_text = None
                        except Exception:
                            extracted_text = None

                    # Save extracted text for this page
                    if extracted_text and extracted_text.strip():
                        page_filename = f"page_{page_num:03d}.txt"
                        page_filepath = os.path.join(chunk_output_dir, page_filename)

                        with open(page_filepath, 'w', encoding='utf-8') as f:
                            f.write(f"Chunk: {chunk_name}\n")
                            f.write(f"Page {page_num} Text:\n")
                            f.write("="*50 + "\n")
                            f.write(extracted_text)
                            f.write("\n" + "="*50 + "\n")

                        print(f"✅ Page {page_num} saved: {page_filename}")
                        print(f"Preview: {extracted_text[:50]}...")
                        extracted_pages += 1
                    else:
                        print(f"⚠ No text extracted for page {page_num}")

                        # Create empty file marker
                        page_filename = f"page_{page_num:03d}_empty.txt"
                        page_filepath = os.path.join(chunk_output_dir, page_filename)

                        with open(page_filepath, 'w', encoding='utf-8') as f:
                            f.write(f"Chunk: {chunk_name}\n")
                            f.write(f"Page {page_num} - No text extracted\n")

                except Exception as e:
                    print(f"❌ Error processing page {page_num}: {e}")
                    # Make the failure visible in the summary as well
                    error_page.append(page_num)
                    continue

            # Create chunk summary
            summary_path = os.path.join(chunk_output_dir, "chunk_summary.txt")
            with open(summary_path, 'w', encoding='utf-8') as f:
                f.write(f"Chunk Processing Summary\n")
                f.write(f"=" * 50 + "\n")
                f.write(f"Chunk File: {chunk_filename}\n")
                f.write(f"Total Pages: {total_pages}\n")
                f.write(f"Pages with Text: {extracted_pages}\n")
                f.write(f"Processing Time: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
                f.write(f'failed pages: {error_page}\n')

            print(f"✅ Chunk {chunk_name} COMPLETELY processed: {extracted_pages}/{total_pages} pages")

            return {
                "success": True,
                "chunk_name": chunk_name,
                "total_pages": total_pages,
                "extracted_pages": extracted_pages,
                "output_dir": chunk_output_dir,
                "failed_pages": error_page
            }

        except Exception as e:
            print(f"❌ Error in page processing: {e}")
            return {"success": False, "error": str(e)}

    except Exception as e:
        print(f"❌ Error processing chunk: {e}")
        return {"success": False, "error": str(e)}

def batch_process_pdf_chunks(input_pdf_path, max_size_mb=2, pages_per_chunk=35):
    """
    Complete workflow: Split PDF and process each chunk through OCR

    All output goes into "<pdf name>_batch_ocr" next to the input PDF: the
    chunk PDFs, one "extracted_text_<chunk>" folder per chunk and a
    "batch_processing_summary.txt".

    Args:
        input_pdf_path: Path to the large PDF file
        max_size_mb: Maximum size per chunk in MB (default: 2MB for optimal
            speed). Only used when pages_per_chunk is falsy (None or 0).
        pages_per_chunk: Pages per chunk for page-based splitting (default: 35
            pages max). Takes precedence over max_size_mb; pass None to split
            by size instead.

    Returns:
        Dictionary with batch processing results: "success" (True whenever the
        batch ran to completion, even if individual chunks failed),
        "total_chunks", "successful_chunks", "total_pages" (pages with
        extracted text), "output_dir" and "chunk_results" (one result dict per
        chunk).
    """
    print(f"🚀 Starting batch processing for: {input_pdf_path}")

    # Create main output directory
    pdf_name = os.path.splitext(os.path.basename(input_pdf_path))[0]
    output_base_dir = os.path.join(os.path.dirname(input_pdf_path), f"{pdf_name}_batch_ocr")
    os.makedirs(output_base_dir, exist_ok=True)

    # Step 1: Split PDF
    print("\n📄 Step 1: Splitting PDF...")
    if pages_per_chunk:
        chunk_paths = split_pdf_by_pages(input_pdf_path, pages_per_chunk, output_base_dir)
    else:
        # No page count requested: honour max_size_mb instead of silently
        # ignoring it.
        chunk_paths = split_pdf_by_size(input_pdf_path, max_size_mb, output_base_dir)

    print(f"✅ PDF split into {len(chunk_paths)} chunks")

    results = []
    total_pages_processed = 0
    successful_chunks = 0

    if chunk_paths:
        # Step 2: Initialize browser
        print("\n🌐 Step 2: Initializing browser...")
        driver = webdriver.Firefox()

        # Step 3: Process each chunk
        print("\n🔄 Step 3: Processing chunks through OCR...")
        try:
            for i, chunk_path in enumerate(chunk_paths, 1):
                print(f"\n{'='*80}")
                print(f"🔄 PROCESSING CHUNK {i}/{len(chunk_paths)}")
                print(f"{'='*80}")

                # Process this chunk completely
                result = process_pdf_chunk_through_ocr(driver, chunk_path, output_base_dir)
                results.append(result)

                if result["success"]:
                    successful_chunks += 1
                    total_pages_processed += result.get("extracted_pages", 0)
                    print(f"\n✅ CHUNK {i} COMPLETED SUCCESSFULLY!")
                    print(f"   • Pages extracted: {result.get('extracted_pages', 0)}/{result.get('total_pages', 0)}")
                    print(f"   • Output saved to: {result.get('output_dir', 'N/A')}")
                else:
                    print(f"\n❌ CHUNK {i} FAILED!")
                    print(f"   • Error: {result.get('error', 'Unknown error')}")

                # Wait before next chunk to ensure complete separation
                if i < len(chunk_paths):  # Don't wait after the last chunk
                    print(f"\n⏳ Waiting 10 seconds before next chunk...")
                    time.sleep(10)

        finally:
            # Step 4: Close browser (also on errors / interrupts)
            print("\n🔧 Step 4: Cleaning up...")
            driver.quit()
    else:
        # Nothing to OCR (e.g. a PDF without pages): don't launch a browser.
        print("\n⚠ No chunks were created; skipping OCR processing")

    # Step 5: Create final summary
    print("\n📊 Step 5: Creating final summary...")
    summary_path = os.path.join(output_base_dir, "batch_processing_summary.txt")

    with open(summary_path, 'w', encoding='utf-8') as f:
        f.write(f"Batch OCR Processing Summary\n")
        f.write(f"=" * 60 + "\n")
        f.write(f"Original PDF: {input_pdf_path}\n")
        f.write(f"Processing Time: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"Total Chunks: {len(chunk_paths)}\n")
        f.write(f"Successful Chunks: {successful_chunks}\n")
        f.write(f"Total Pages Processed: {total_pages_processed}\n")
        f.write(f"Output Directory: {output_base_dir}\n\n")

        f.write("Chunk Details:\n")
        f.write("-" * 40 + "\n")
        for i, result in enumerate(results, 1):
            if result["success"]:
                f.write(f"Chunk {i}: ✅ {result['chunk_name']} - {result['extracted_pages']}/{result['total_pages']} pages\n")
            else:
                f.write(f"Chunk {i}: ❌ Failed - {result.get('error', 'Unknown error')}\n")

    print(f"\n🎉 Batch processing complete!")
    print(f"✅ Successfully processed: {successful_chunks}/{len(chunk_paths)} chunks")
    print(f"📄 Total pages extracted: {total_pages_processed}")
    print(f"📁 Results saved in: {output_base_dir}")

    return {
        "success": True,
        "total_chunks": len(chunk_paths),
        "successful_chunks": successful_chunks,
        "total_pages": total_pages_processed,
        "output_dir": output_base_dir,
        "chunk_results": results
    }

# Visual confirmation that this cell ran and the OCR functions above are defined.
print("Batch processing functions ready! 🚀")

Batch processing functions ready! 🚀


In [28]:
# Input for the batch run: absolute path of the PDF to OCR. It is read by the
# batch-processing cell that calls batch_process_pdf_chunks(); change it to
# point at your own file.
# NOTE(review): the space before ".pdf" looks like part of the real filename
# (the recorded run read the file with this exact path) - do not "fix" it.
large_pdf_path ='/var/home/ramrshrcg/Downloads/nepali datas folder/Nepali Materials/परराष्ट्र नीति २०७७ .pdf'

In [29]:
# USAGE EXAMPLE - Batch Process Large PDF
# The input file is taken from `large_pdf_path`, which must be set before this
# cell runs.


# NOTE: although the message below says "size-based", pages_per_chunk is not
# passed here, so its default (35) applies and batch_process_pdf_chunks splits
# by page count. The recorded output of this cell shows "Pages per chunk: 35".
print("🚀 Starting batch processing with size-based splitting...")
try:
    results = batch_process_pdf_chunks(
        input_pdf_path=large_pdf_path,
        max_size_mb=2 # 2 MB target; has no effect on this call because page-based splitting is selected
    )
    
    # "success" reports that the batch ran to completion; per-chunk outcomes
    # are in results['chunk_results'].
    if results["success"]:
        print(f"\n🎉 BATCH PROCESSING COMPLETED! 🎉")
        print(f"📊 Results Summary:")
        print(f"   • Total chunks processed: {results['successful_chunks']}/{results['total_chunks']}")
        print(f"   • Total pages extracted: {results['total_pages']}")
        print(f"   • Output directory: {results['output_dir']}")
        
        # Show individual chunk results
        print(f"\n📋 Individual Chunk Results:")
        for i, chunk_result in enumerate(results['chunk_results'], 1):
            if chunk_result["success"]:
                print(f"   ✅ Chunk {i}: {chunk_result['extracted_pages']}/{chunk_result['total_pages']} pages")
            else:
                print(f"   ❌ Chunk {i}: Failed - {chunk_result.get('error', 'Unknown error')}")
    else:
        print("❌ Batch processing failed")
        
except Exception as e:
    # Keep the notebook running; the error text is the only trace of the failure.
    print(f"❌ Error in batch processing: {e}")

# Alternative call that states the page-based split explicitly:
# print("🚀 Starting batch processing with page-based splitting...")
# results = batch_process_pdf_chunks(
#     input_pdf_path=large_pdf_path,
#     pages_per_chunk=35  # 35 pages per chunk for optimal speed
# )

🚀 Starting batch processing with size-based splitting...
🚀 Starting batch processing for: /var/home/ramrshrcg/Downloads/nepali datas folder/Nepali Materials/परराष्ट्र नीति २०७७ .pdf

📄 Step 1: Splitting PDF...
Total pages: 31
Pages per chunk: 35
Created chunk 1: परराष्ट्र नीति २०७७ _chunk_001.pdf (0.39 MB, pages 1-31)

PDF split into 1 chunks
Chunks saved in: /var/home/ramrshrcg/Downloads/nepali datas folder/Nepali Materials/परराष्ट्र नीति २०७७ _batch_ocr/परराष्ट्र नीति २०७७ _chunks
✅ PDF split into 1 chunks

🌐 Step 2: Initializing browser...

🔄 Step 3: Processing chunks through OCR...

🔄 PROCESSING CHUNK 1/1

Processing: परराष्ट्र नीति २०७७ _chunk_001.pdf
🌐 Step 1: Loading fresh OCR page...
📄 Step 2: Uploading PDF chunk...
✓ File uploaded successfully
🍪 Step 3: Handling cookies...
🔐 Step 4: Handling reCAPTCHA...
✓ reCAPTCHA checkbox clicked
✓ Simple reCAPTCHA completed
🍪 Step 5: Handling post-reCAPTCHA cookies...
✓ Post-reCAPTCHA cookies handled
⚡ Step 6: Starting OCR processing...
✓ 

In [30]:
# IMPROVED PAGE PROCESSING - FIXES PAGE NUMBER MISMATCH

def _content_digest(text):
    """Return the MD5 hex digest of ``text`` after stripping surrounding whitespace."""
    # Function-scope import so this cell does not depend on the import cell providing hashlib.
    import hashlib
    return hashlib.md5(text.strip().encode()).hexdigest()


def _read_ocr_textbox(driver):
    """Return the text currently held by the ``ocrTextBox`` element ('' when it is empty)."""
    ocr_textbox = driver.find_element(By.ID, 'ocrTextBox')
    return ocr_textbox.get_attribute('value') or ocr_textbox.text or ""


def process_pages_with_verification(driver, chunk_output_dir, chunk_name, total_pages):
    """
    Extract OCR text page by page, guarding against stale or duplicated results.

    For each page number from 1 to ``total_pages`` the function clears the
    ``ocrTextBox`` element, clicks the ``page_<n>`` thumbnail, clicks the
    "Extract Page Text" control, polls the textbox (up to 90 seconds) until it
    holds text that differs from the previously saved page, and writes the
    result to a ``page_NNN*.txt`` file in ``chunk_output_dir``. After the loop
    a ``processing_summary.txt`` file is written to the same directory.

    Args:
        driver: Selenium WebDriver instance
        chunk_output_dir: Output directory for this chunk (created if missing)
        chunk_name: Name of the chunk being processed
        total_pages: Total number of pages to process

    Returns:
        Dictionary with processing results:
            extracted_pages: number of pages saved with text (duplicates included)
            error_pages: page numbers that could not be processed
            duplicate_pages: page numbers whose text matched the previous page
            total_pages: the ``total_pages`` argument, echoed back
    """
    extracted_pages = 0
    error_pages = []
    duplicate_content_detected = []

    # Digest of the most recently saved page; used to recognise stale textbox content.
    previous_content_hash = None

    # Page files and the summary are written here, so make sure the folder exists.
    if chunk_output_dir:
        os.makedirs(chunk_output_dir, exist_ok=True)

    print(f"🔍 Starting improved page processing for {total_pages} pages...")

    for page_num in range(1, total_pages + 1):
        try:
            print(f"\n{'='*50}")
            print(f"📄 Processing Page {page_num}/{total_pages}")
            print(f"{'='*50}")

            # STEP 1: Clear any existing content in textbox so old text cannot be
            # mistaken for this page's result.
            try:
                ocr_textbox = driver.find_element(By.ID, 'ocrTextBox')
                # Clear the textbox using JavaScript
                driver.execute_script("arguments[0].value = ''; arguments[0].innerHTML = '';", ocr_textbox)
                # Also try clearing with standard clear method
                ocr_textbox.clear()
                print(f"🧹 Cleared textbox for page {page_num}")
                time.sleep(2)
            except Exception as e:
                # Best effort: the duplicate check below still catches stale text.
                print(f"⚠ Could not clear textbox: {e}")

            # STEP 2: Click on the specific page thumbnail
            page_clicked = False
            for attempt in range(3):  # Try up to 3 times
                try:
                    page_element = driver.find_element(By.ID, f'page_{page_num}')

                    # Scroll to the page element if needed
                    driver.execute_script("arguments[0].scrollIntoView(true);", page_element)
                    time.sleep(1)

                    # Click the page
                    page_element.click()
                    print(f"✓ Clicked on page {page_num} thumbnail (attempt {attempt + 1})")
                    page_clicked = True

                    # Wait for selection to register
                    time.sleep(3)
                    break

                except Exception as e:
                    print(f"⚠ Failed to click page {page_num} (attempt {attempt + 1}): {e}")
                    time.sleep(2)

            if not page_clicked:
                print(f"❌ Could not select page {page_num}")
                error_pages.append(page_num)
                continue

            # STEP 3: Verify page selection (look for visual indicators).
            # This check is advisory only: processing continues either way.
            try:
                driver.find_element(By.CSS_SELECTOR,
                    f'img[id="page_{page_num}"][class*="selected"], ' +
                    f'img[id="page_{page_num}"][style*="border"], ' +
                    f'img[id="page_{page_num}"][style*="outline"]')
                print(f"✓ Page {page_num} selection visually confirmed")
            except Exception:
                # Narrowed from a bare except so Ctrl-C / SystemExit still propagate.
                print(f"⚠ Page {page_num} selection not visually confirmed, but proceeding...")
            page_selected = True  # Assume the click worked even without visual confirmation

            # STEP 4: Click "Extract Page Text" button
            extract_clicked = False
            for attempt in range(3):
                try:
                    # Wait for any processing from page selection to complete
                    time.sleep(2)

                    extract_button = driver.find_element(By.XPATH,
                        "//button[contains(text(), 'Extract Page Text')] | " +
                        "//input[contains(@value, 'Extract Page Text')] | " +
                        "//button[contains(@class, 'extract')] | " +
                        "//input[contains(@class, 'extract')]")

                    extract_button.click()
                    print(f"✓ Extract Page Text clicked for page {page_num} (attempt {attempt + 1})")
                    extract_clicked = True
                    break

                except Exception as e:
                    print(f"⚠ Extract button not found for page {page_num} (attempt {attempt + 1}): {e}")
                    time.sleep(3)

            if not extract_clicked:
                print(f"❌ Could not click Extract Text for page {page_num}")
                error_pages.append(page_num)
                continue

            # STEP 5: Wait for OCR processing with smart timeout
            print(f"⏳ Waiting for page {page_num} OCR processing...")

            # Initial wait for processing to start
            time.sleep(8)

            # Poll in 5-second steps until new, non-trivial content shows up.
            max_wait = 90
            waited = 0
            content_ready = False

            while waited < max_wait and not content_ready:
                try:
                    current_content = _read_ocr_textbox(driver)

                    # Check if we have meaningful content
                    if current_content and len(current_content.strip()) > 10:
                        # Additional check: make sure it's not the same as previous page
                        candidate_hash = _content_digest(current_content)

                        if previous_content_hash and candidate_hash == previous_content_hash:
                            print(f"⚠ Same content as previous page detected, waiting more...")
                            time.sleep(5)
                            waited += 5
                            continue
                        else:
                            content_ready = True
                            print(f"✓ Page {page_num} content ready ({len(current_content)} chars)")
                            break
                    else:
                        time.sleep(5)
                        waited += 5
                        if waited % 15 == 0:  # Print status every 15 seconds
                            print(f"⏳ Still waiting for page {page_num} content... ({waited}s)")

                except Exception:
                    # Textbox lookup failed; keep polling until the timeout expires.
                    time.sleep(5)
                    waited += 5

            # STEP 6: Extract the text content
            extracted_text = None
            extraction_method = None

            # Method 1: Direct textbox extraction
            try:
                extracted_text = _read_ocr_textbox(driver)

                if extracted_text and len(extracted_text.strip()) > 5:
                    extraction_method = "textbox"
                    print(f"✓ Text extracted via textbox ({len(extracted_text)} chars)")
                else:
                    extracted_text = None

            except Exception as e:
                print(f"⚠ Textbox extraction failed: {e}")

            # Method 2: Copy button extraction
            if not extracted_text:
                try:
                    copy_button = driver.find_element(By.XPATH,
                        "//button[contains(text(), 'Copy')] | " +
                        "//input[contains(@value, 'Copy')]")
                    copy_button.click()
                    time.sleep(3)

                    # Imported lazily: only needed on this fallback path.
                    import pyperclip
                    extracted_text = pyperclip.paste()

                    if extracted_text and len(extracted_text.strip()) > 5:
                        extraction_method = "clipboard"
                        print(f"✓ Text extracted via clipboard ({len(extracted_text)} chars)")
                    else:
                        extracted_text = None

                except Exception as e:
                    print(f"⚠ Clipboard extraction failed: {e}")

            # STEP 7 + 8: Validate content, flag duplicates, and save the page file
            if extracted_text and extracted_text.strip():
                current_hash = _content_digest(extracted_text)
                is_duplicate = bool(previous_content_hash) and current_hash == previous_content_hash

                if is_duplicate:
                    print(f"🚨 DUPLICATE CONTENT DETECTED for page {page_num}!")
                    duplicate_content_detected.append(page_num)

                    # Still save it but mark it clearly
                    page_filename = f"page_{page_num:03d}_DUPLICATE.txt"
                else:
                    page_filename = f"page_{page_num:03d}.txt"

                previous_content_hash = current_hash

                page_filepath = os.path.join(chunk_output_dir, page_filename)

                # Header lines end with a real newline ("\n"), not the two
                # characters backslash + n, so the file is readable line by line.
                with open(page_filepath, 'w', encoding='utf-8') as f:
                    f.write(f"Chunk: {chunk_name}\n")
                    f.write(f"Page: {page_num}\n")
                    f.write(f"Extraction Method: {extraction_method}\n")
                    f.write(f"Content Length: {len(extracted_text)} characters\n")
                    f.write(f"Content Hash: {current_hash[:8]}\n")
                    f.write(f"Timestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")

                    if is_duplicate:
                        f.write("*** WARNING: DUPLICATE CONTENT DETECTED ***\n")

                    f.write("=" * 60 + "\n")
                    f.write(extracted_text)
                    f.write("\n" + "=" * 60 + "\n")

                print(f"✅ Page {page_num} saved: {page_filename}")

                # Show preview without newlines
                preview = extracted_text[:100].replace('\n', ' ').replace('\r', ' ')
                print(f"📄 Preview: {preview}...")

                extracted_pages += 1
            else:
                # Save empty file with debugging info
                page_filename = f"page_{page_num:03d}_empty.txt"
                page_filepath = os.path.join(chunk_output_dir, page_filename)

                with open(page_filepath, 'w', encoding='utf-8') as f:
                    f.write(f"Chunk: {chunk_name}\n")
                    f.write(f"Page: {page_num}\n")
                    f.write("Status: No content extracted\n")
                    f.write(f"Page Selected: {page_selected}\n")
                    f.write(f"Extract Clicked: {extract_clicked}\n")
                    f.write(f"Content Ready: {content_ready}\n")
                    f.write(f"Timestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")

                print(f"⚠ Page {page_num} - No content extracted")

            # Brief pause before next page
            time.sleep(3)

        except Exception as e:
            # Anything unexpected marks the page as failed; the loop moves on.
            print(f"❌ Critical error processing page {page_num}: {e}")
            error_pages.append(page_num)
            continue

    # Create detailed summary
    summary_path = os.path.join(chunk_output_dir, "processing_summary.txt")
    with open(summary_path, 'w', encoding='utf-8') as f:
        f.write("Improved Page Processing Summary\n")
        f.write("=" * 50 + "\n")
        f.write(f"Chunk: {chunk_name}\n")
        f.write(f"Total Pages: {total_pages}\n")
        f.write(f"Successfully Extracted: {extracted_pages}\n")
        f.write(f"Failed Pages: {len(error_pages)}\n")
        f.write(f"Duplicate Content Detected: {len(duplicate_content_detected)}\n")
        f.write(f"Processing Time: {time.strftime('%Y-%m-%d %H:%M:%S')}\n\n")

        if error_pages:
            f.write(f"Failed Pages: {error_pages}\n\n")
        if duplicate_content_detected:
            f.write(f"Pages with Duplicate Content: {duplicate_content_detected}\n\n")

    print(f"\n{'='*60}")
    print(f"📊 Processing Complete for Chunk: {chunk_name}")
    print(f"✅ Successfully extracted: {extracted_pages}/{total_pages} pages")
    if error_pages:
        print(f"❌ Failed pages: {error_pages}")
    if duplicate_content_detected:
        print(f"🚨 Duplicate content detected on pages: {duplicate_content_detected}")
    print(f"{'='*60}")

    return {
        "extracted_pages": extracted_pages,
        "error_pages": error_pages,
        "duplicate_pages": duplicate_content_detected,
        "total_pages": total_pages
    }

# Confirm the cell ran and list what the helper defined above addresses.
for banner_line in (
    "✅ Improved page processing function ready!",
    "🔧 This function fixes:",
    "   • Page number mismatch issues",
    "   • Duplicate content detection",
    "   • Better content verification",
    "   • Detailed error tracking",
):
    print(banner_line)

✅ Improved page processing function ready!
🔧 This function fixes:
   • Page number mismatch issues
   • Duplicate content detection
   • Better content verification
   • Detailed error tracking


In [31]:
# # USAGE EXAMPLE (disabled - this whole cell is commented out) - Using the Improved Page Processing

# def process_single_chunk_improved(chunk_path, output_dir):
#     """
#     Process a single PDF chunk using the improved page processing method
    
#     Args:
#         chunk_path: Path to the PDF chunk file
#         output_dir: Output directory for extracted text
    
#     Returns:
#         Processing results dictionary
#     """
#     print(f"🚀 Starting improved processing for: {os.path.basename(chunk_path)}")
    
#     # Initialize browser
#     driver = webdriver.Firefox()
    
#     try:
#         chunk_filename = os.path.basename(chunk_path)
#         chunk_name = os.path.splitext(chunk_filename)[0]
        
#         # Create output directory for this chunk
#         chunk_output_dir = os.path.join(output_dir, f"extracted_text_{chunk_name}")
#         if not os.path.exists(chunk_output_dir):
#             os.makedirs(chunk_output_dir)
        
#         # Navigate and upload file
#         print("🌐 Loading OCR website...")
#         driver.get('https://www.i2ocr.com/free-online-nepali-ocr')
#         time.sleep(3)
        
#         # Upload file
#         print("📄 Uploading file...")
#         upload_button = driver.find_element(By.ID, 'i2ocr_uploadedfile')
#         upload_button.send_keys(chunk_path)
#         time.sleep(3)
        
#         # Manual intervention point
#         print("\\n⚠️  MANUAL STEPS REQUIRED:")
#         print("👤 Please complete the following in the browser:")
#         print("   1. Handle any cookie dialogs")
#         print("   2. Solve any CAPTCHA challenges")
#         print("   3. Click 'Extract Text' button")
#         print("   4. Wait for initial processing to complete")
        
#         input("\\n✋ Press Enter when ready to start page-by-page extraction...")
        
#         # Wait for page viewer to load
#         print("⏳ Waiting for page viewer...")
#         time.sleep(10)
        
#         # Find total pages
#         page_elements = driver.find_elements(By.CSS_SELECTOR, 'img[id^="page_"]')
#         total_pages = len(page_elements)
#         print(f"📄 Found {total_pages} pages to process")
        
#         if total_pages == 0:
#             print("❌ No pages found!")
#             return {"success": False, "error": "No pages found"}
        
#         # Use the improved page processing
#         results = process_pages_with_verification(driver, chunk_output_dir, chunk_name, total_pages)
        
#         # Show final results
#         print(f"\\n🎉 Processing Complete!")
#         print(f"📊 Results:")
#         print(f"   • Total pages: {results['total_pages']}")
#         print(f"   • Successfully extracted: {results['extracted_pages']}")
#         print(f"   • Failed pages: {len(results['error_pages'])}")
#         print(f"   • Duplicate content pages: {len(results['duplicate_pages'])}")
#         print(f"   • Output directory: {chunk_output_dir}")
        
#         if results['error_pages']:
#             print(f"❌ Failed pages: {results['error_pages']}")
#         if results['duplicate_pages']:
#             print(f"🚨 Pages with duplicate content: {results['duplicate_pages']}")
#             print("   📝 These pages are marked as '_DUPLICATE' in filenames")
        
#         return {
#             "success": True,
#             "chunk_name": chunk_name,
#             "results": results,
#             "output_dir": chunk_output_dir
#         }
    
#     except Exception as e:
#         print(f"❌ Error in processing: {e}")
#         return {"success": False, "error": str(e)}
    
#     finally:
#         # Close browser
#         print("\\n🔧 Closing browser...")
#         driver.quit()

# # Example usage:
# # chunk_file = "/path/to/your/chunk.pdf"
# # output_directory = "/path/to/output"
# # result = process_single_chunk_improved(chunk_file, output_directory)

# print("\\n✅ Improved processing function ready to use!")
# print("🎯 Key improvements:")
# print("   • Clears textbox before each page")
# print("   • Verifies page selection")
# print("   • Detects duplicate content")
# print("   • Better error handling")
# print("   • Content hash verification")
# print("   • Detailed processing logs")