# PDF OCR Text Extraction Script

This notebook extracts text from Nepali PDF documents using i2pdf.com's OCR service.

## Dependencies
If you want to use clipboard functionality on Linux, you need to install either xclip or xsel:
```bash
sudo apt-get install xclip
# or
sudo apt-get install xsel
```

However, this script has been modified to work without clipboard access by directly extracting text from the webpage elements.

In [36]:
import time
import shutil
from selenium import webdriver
from selenium.webdriver.common.by import By
import pyperclip
from module.playsound import play_sound

In [37]:
driver = webdriver.Firefox()
print("🌐 Step 1: Loading fresh OCR page...")
driver.get('https://www.i2pdf.com/pdf-ocr/nepali')
time.sleep(5)  # Wait for page to load completely


🌐 Step 1: Loading fresh OCR page...


In [29]:
#5
pdf='/var/home/ramrshrcg/Downloads/nepali datas folder/Nepali Materials/सामाजिक अध्ययन तथा जीवनोपयोगी शिक्षा: कक्षा ११ .pdf'

In [28]:
pdf='/var/home/ramrshrcg/Downloads/nepali datas folder/Nepali Materials/नेपाली: कक्षा ७ .pdf'
#4'

In [27]:
pdf='/var/home/ramrshrcg/Downloads/nepali datas folder/Nepali Materials/नेपाली: कक्षा ९ .pdf'
#3

In [38]:
#2
pdf='/var/home/ramrshrcg/Downloads/nepali datas folder/Nepali Materials/सामाजिक अध्ययन तथा मानव मूल्य शिक्षा: कक्षा ७ .pdf'

In [39]:
# STEP 2: Upload the PDF file
print("📄 Step 2: Uploading PDF...")
button_for_pdf_upload = driver.find_element(By.ID, 'upload_file')
button_for_pdf_upload.send_keys(pdf)
print("✓ File uploaded successfully")

# Wait for the upload to complete
print("⏳ Waiting for upload to complete...")
time.sleep(150)  # Increased wait time for upload

# Take a screenshot to see what's on the page
# screenshot_path = "/var/home/ramrshrcg/Desktop/Python/scrapping/page_after_upload.png"
# driver.save_screenshot(screenshot_path)
# print(f"✓ Screenshot saved to {screenshot_path}")

# Detect how many pages are in the document
try:
    # Look for page cards in the document
    page_cards = driver.find_elements(By.CSS_SELECTOR, "div.card[data-id]")
    if page_cards:
        print(f"✓ Detected {len(page_cards)} pages in the document")
    else:
        # Try alternative selectors
        page_cards = driver.find_elements(By.CSS_SELECTOR, "div[id][data-filename*='page-']")
        if page_cards:
            print(f"✓ Detected {len(page_cards)} pages using alternative selector")
        else:
            page_cards = driver.find_elements(By.CSS_SELECTOR, "div.card")
            if page_cards:
                print(f"✓ Detected {len(page_cards)} possible page cards")
            else:
                print("⚠️ Could not detect page cards. This might be a single-page document or the page structure is different.")
except Exception as e:
    print(f"Error detecting pages: {e}")
    
# Print the page source for debugging
print("Current page URL:", driver.current_url)

📄 Step 2: Uploading PDF...
✓ File uploaded successfully
⏳ Waiting for upload to complete...
✓ Detected 195 pages in the document
Current page URL: https://www.i2pdf.com/pdf-ocr/nepali
✓ Detected 195 pages in the document
Current page URL: https://www.i2pdf.com/pdf-ocr/nepali


In [None]:
# STEP 3: Extract text from all pages
def find_and_click_button():
    # List of possible button selectors to try
    button_selectors = [
        # CSS Selectors
        {"type": "css", "selector": ".pdf_ocr.btn.btn-primary.mr-2.glow"},
        {"type": "css", "selector": ".pdf_ocr.btn.btn-primary"},
        {"type": "css", "selector": "button.pdf_ocr"},
        {"type": "css", "selector": ".btn-primary"},
        {"type": "css", "selector": "button.btn-primary"},
        {"type": "css", "selector": ".btn-success"},
        {"type": "css", "selector": "button.btn-success"},
        # XPath Selectors
        {"type": "xpath", "selector": "//button[contains(@class, 'pdf_ocr')]"},
        {"type": "xpath", "selector": "//button[contains(@class, 'btn-primary')]"},
        {"type": "xpath", "selector": "//button[contains(@class, 'btn-success')]"},
        {"type": "xpath", "selector": "//button[contains(., 'Start')]"},
        {"type": "xpath", "selector": "//button[contains(., 'Extract')]"},
        {"type": "xpath", "selector": "//button[contains(., 'OCR')]"},
        {"type": "xpath", "selector": "//button[contains(., 'Process')]"},
        {"type": "xpath", "selector": "//button[contains(., 'Convert')]"},
        {"type": "xpath", "selector": "//a[contains(@class, 'btn-primary')]"},
        {"type": "xpath", "selector": "//a[contains(@class, 'btn-success')]"},
        {"type": "xpath", "selector": "//a[contains(@class, 'btn')]"},
        # Generic button selectors
        {"type": "xpath", "selector": "//button"},
        {"type": "css", "selector": "button"}
    ]
    
    # Try each selector
    for selector in button_selectors:
        try:
            if selector["type"] == "css":
                elements = driver.find_elements(By.CSS_SELECTOR, selector["selector"])
            else:
                elements = driver.find_elements(By.XPATH, selector["selector"])
                
            if not elements:
                continue
                
            print(f"Found {len(elements)} elements with selector: {selector['selector']}")
            
            # Try to find a button related to OCR or extraction
            for element in elements:
                try:
                    element_text = element.text.strip().lower()
                    element_class = element.get_attribute('class')
                    element_id = element.get_attribute('id')
                    
                    print(f"Element text: '{element_text}', class: '{element_class}', id: '{element_id}'")
                    
                    # Check if the button looks like an extract/OCR button
                    is_extract_button = (
                        'ocr' in element_text or 
                        'extract' in element_text or
                        'start' in element_text or
                        'process' in element_text or
                        'convert' in element_text or
                        'ocr' in element_class.lower() or
                        'extract' in element_class.lower() or
                        (element_id and ('ocr' in element_id.lower() or 'extract' in element_id.lower()))
                    )
                    
                    if is_extract_button:
                        print(f"Found likely extract button: '{element_text}'")
                        element.click()
                        print("✓ Button clicked successfully")
                        return True
                except Exception as e:
                    print(f"Error examining element: {e}")
            
            # If we didn't find a specific OCR button, try clicking the first button
            if elements:
                print("No specific OCR button found, trying first button")
                elements[0].click()
                print("✓ First button clicked")
                return True
                
        except Exception as e:
            print(f"Selector {selector['selector']} failed: {e}")
    
    return False

# Function to find and extract text from the current page
def find_and_extract_text():
    # Try to get the text using different selectors
    text_selectors = [
        {"type": "id", "selector": "ocrTextBox"},
        {"type": "css", "selector": "textarea.form-control"},
        {"type": "css", "selector": "textarea"},
        {"type": "xpath", "selector": "//textarea"},
        {"type": "css", "selector": "div.form-control"},
        {"type": "css", "selector": "pre"},
        {"type": "xpath", "selector": "//pre"},
        {"type": "css", "selector": ".text-output"},
        {"type": "css", "selector": ".ocr-output"},
        {"type": "css", "selector": ".output-text"},
        {"type": "xpath", "selector": "//*[contains(@id, 'text')]"},
        {"type": "xpath", "selector": "//*[contains(@id, 'ocr')]"},
        {"type": "xpath", "selector": "//*[contains(@class, 'text')]"},
        {"type": "xpath", "selector": "//*[contains(@class, 'ocr')]"},
        {"type": "xpath", "selector": "//*[contains(@class, 'output')]"}
    ]
    
    for selector in text_selectors:
        try:
            if selector["type"] == "id":
                element = driver.find_element(By.ID, selector["selector"])
            elif selector["type"] == "css":
                element = driver.find_element(By.CSS_SELECTOR, selector["selector"])
            else:
                element = driver.find_element(By.XPATH, selector["selector"])
                
            print(f"Found text element with selector: {selector['selector']}")
            
            # Try different ways to get text from the element
            try:
                text = element.get_attribute('value')
                if text and len(text) > 0:
                    print(f"Got text using get_attribute('value') - Length: {len(text)}")
                    return text
            except:
                pass
                
            try:
                text = element.text
                if text and len(text) > 0:
                    print(f"Got text using .text property - Length: {len(text)}")
                    return text
            except:
                pass
                
            try:
                text = element.get_attribute('innerHTML')
                if text and len(text) > 0:
                    print(f"Got text using get_attribute('innerHTML') - Length: {len(text)}")
                    return text
            except:
                pass
                
        except Exception as e:
            print(f"Selector {selector['selector']} failed: {e}")
    
    # If all direct approaches failed, try JavaScript
    try:
        print("Trying JavaScript approaches...")
        
        # Try to get text from any element that might contain OCR results
        text = driver.execute_script("""
            // Function to check if a string looks like OCR output
            function looksLikeOcrOutput(str) {
                if (!str) return false;
                return str.length > 50;  // At least 50 characters
            }
            
            // Try different sources
            var sources = [
                document.getElementById('ocrTextBox'),
                document.querySelector('textarea'),
                document.querySelector('pre'),
                document.querySelector('div.form-control')
            ];
            
            for (var i = 0; i < sources.length; i++) {
                var el = sources[i];
                if (el) {
                    if (el.value && looksLikeOcrOutput(el.value)) 
                        return el.value;
                    if (el.textContent && looksLikeOcrOutput(el.textContent)) 
                        return el.textContent;
                }
            }
            
            return '';
        """)
        
        if text and len(text) > 0:
            print(f"Got text using JavaScript - Length: {len(text)}")
            return text
    except Exception as e:
        print(f"JavaScript approach failed: {e}")
    
    return None

# Function to select a specific page
def select_page(page_num):
    try:
        # Try to find the page element by card ID or data-id
        selectors = [
            f"div.card[data-id='{page_num}']",
            f"div.card[id='{page_num}']",
            f"div[data-id='{page_num}']",
            f"div[id='{page_num}'][data-filename*='page-']"
        ]
        
        for selector in selectors:
            try:
                page_element = driver.find_element(By.CSS_SELECTOR, selector)
                page_element.click()
                print(f"✓ Selected page {page_num}")
                return True
            except:
                continue
                
        # If direct selection failed, try to find by page label
        try:
            page_element = driver.find_element(By.XPATH, f"//div[contains(@class, 'card')]//h5[text()='{page_num}']/..")
            page_element.click()
            print(f"✓ Selected page {page_num} by label")
            return True
        except:
            pass
            
        # Try JavaScript as a last resort
        selected = driver.execute_script(f"""
            var cards = document.querySelectorAll('div.card, div[data-id], div[data-filename*="page-"]');
            for (var i = 0; i < cards.length; i++) {{
                var card = cards[i];
                if (card.getAttribute('data-id') == '{page_num}' || 
                    card.getAttribute('id') == '{page_num}' ||
                    card.querySelector('h5:contains("{page_num}")')) {{
                    card.click();
                    return true;
                }}
            }}
            return false;
        """)
        
        if selected:
            print(f"✓ Selected page {page_num} using JavaScript")
            return True
            
        print(f"⚠️ Could not find page {page_num}")
        return False
    except Exception as e:
        print(f"Error selecting page {page_num}: {e}")
        return False

# First, analyze all interactive elements for debugging
print("Analyzing page for interactive elements...")
try:
    # Get all buttons, links, and inputs
    elements = driver.find_elements(By.CSS_SELECTOR, "button, a.btn, input[type='button'], input[type='submit']")
    print(f"Found {len(elements)} interactive elements:")
    
    for i, element in enumerate(elements):
        try:
            element_text = element.text.strip() if element.text else "[No text]"
            element_tag = element.tag_name
            element_class = element.get_attribute('class')
            element_id = element.get_attribute('id') or "[No ID]"
            element_type = element.get_attribute('type') or "[No type]"
            
            print(f"Element {i+1}: Tag={element_tag}, Text='{element_text}', ID={element_id}, Class={element_class}, Type={element_type}")
        except:
            print(f"Element {i+1}: [Error retrieving details]")
except Exception as e:
    print(f"Error analyzing elements: {e}")

# Take a screenshot before processing
# screenshot_path = "/var/home/ramrshrcg/Desktop/Python/scrapping/page_before_processing.png"
# driver.save_screenshot(screenshot_path)
# print(f"✓ Screenshot saved to {screenshot_path}")

# Save page source for analysis
source_path = "/var/home/ramrshrcg/Desktop/Python/scrapping/page_source_before_processing.html"
with open(source_path, 'w', encoding='utf-8') as f:
    f.write(driver.page_source)
print(f"✓ Page source saved to {source_path}")

# Find all page cards again to confirm count
try:
    # Try various selectors for page cards
    selectors = [
        "div.card[data-id]",
        "div[data-filename*='page-']",
        "div.card",
        "div[id][data-id]"
    ]
    
    for selector in selectors:
        page_cards = driver.find_elements(By.CSS_SELECTOR, selector)
        if page_cards and len(page_cards) > 0:
            print(f"Found {len(page_cards)} pages using selector: {selector}")
            break
    
    # If no pages found, assume single page
    if not page_cards or len(page_cards) == 0:
        page_cards = [None]  # Dummy entry for single page
        print("Assuming single-page document")
except Exception as e:
    print(f"Error detecting pages: {e}")
    page_cards = [None]  # Default to single page
    print("Defaulting to single-page processing due to error")

# Process each page
all_text = ""
for page_idx, _ in enumerate(page_cards, 1):
    page_num = page_idx  # Pages are usually 1-indexed
    
    print(f"\n==== Processing Page {page_num} ====")
    
    # Skip page selection for single page document
    if len(page_cards) > 1:
        # Select the page
        if not select_page(page_num):
            print(f"⚠️ Skipping page {page_num} due to selection failure")
            continue
        
        # Wait for page selection to take effect
        time.sleep(2)
    
    # Look for and click the extract button
    print(f"Looking for extract button for page {page_num}...")
    button_clicked = find_and_click_button()
    
    if not button_clicked:
        print(f"⚠️ Could not find extract button for page {page_num}")
        
        # Try the manual approach for this page if automatic detection fails
        try:
            print("Trying JavaScript to find and click buttons...")
            buttons_found = driver.execute_script("""
                var buttons = document.querySelectorAll('button, a.btn');
                var count = 0;
                for (var i = 0; i < buttons.length; i++) {
                    var btn = buttons[i];
                    var text = btn.innerText.toLowerCase();
                    if (text.includes('ocr') || text.includes('extract') || 
                        text.includes('start') || text.includes('process')) {
                        btn.click();
                        count++;
                        break;
                    }
                }
                return count;
            """)
            
            if buttons_found > 0:
                print(f"✓ Clicked a button using JavaScript")
                button_clicked = True
            else:
                print("⚠️ No buttons found via JavaScript")
        except Exception as e:
            print(f"JavaScript button approach failed: {e}")
    
    if button_clicked:
        # Wait for OCR processing to complete
        print(f"⏳ Waiting for OCR processing to complete for page {page_num}...")
        time.sleep(60)  # Adjust based on processing time per page
        
        # Take a screenshot after processing
        # screenshot_path = f"/var/home/ramrshrcg/Desktop/Python/scrapping/page_{page_num}_after_processing.png"
        # driver.save_screenshot(screenshot_path)
        # print(f"✓ Screenshot saved to {screenshot_path}")
        
        # Extract text from the current page
        print(f"Extracting text from page {page_num}...")
        page_text = find_and_extract_text()
        
        if page_text:
            print(f"✓ Successfully extracted text from page {page_num} (length: {len(page_text)} characters)")
            
            # Add page header and text to the combined text
            all_text += f"\n\n========== PAGE {page_num} ==========\n\n"
            all_text += page_text
        else:
            print(f"⚠️ Could not extract text from page {page_num}")
    
    # If there are more pages, wait before proceeding to next page
    if page_idx < len(page_cards):
        time.sleep(5)  # Brief pause between pages

# If we extracted any text, save it to a file
if all_text.strip():
    print(f"\n✓ Successfully extracted text from {len(page_cards)} pages (total length: {len(all_text)} characters)")
    
    # Create output filename based on input PDF
    import os
    pdf_filename = os.path.basename(pdf)
    output_filename = os.path.splitext(pdf_filename)[0] + "_extracted.txt"
    output_path = os.path.join('/var/home/ramrshrcg/Desktop/Python/scrapping/Book', output_filename)
    
    # Save the extracted text to a file
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(all_text)
    
    print(f"✓ Extracted text saved to: {output_path}")
    
    # Play a sound to notify completion
    try:
        play_sound()
    except:
        print("Sound notification failed, but text was saved successfully")
else:
    print("⚠️ Could not extract any text from any page")
    print("Trying to save page source as a last resort...")
    
    # Save the page source as a last resort
    source_path = "/var/home/ramrshrcg/Desktop/Python/scrapping/page_source_final.html"
    with open(source_path, 'w', encoding='utf-8') as f:
        f.write(driver.page_source)
    print(f"✓ Final page source saved to {source_path}")
    print("Please examine the screenshots and page source to identify the text element")

Analyzing page for interactive elements...
Found 13 interactive elements:
Element 1: Tag=button, Text='[No text]', ID=[No ID], Class=navbar-toggler btn btn-icon btn-sm rounded-circle, Type=button
Element 2: Tag=button, Text='[No text]', ID=[No ID], Class=btn btn-icon btn-sm btn-ghost-light, Type=button
Element 3: Tag=button, Text='[No text]', ID=botRgn, Class=btn btn-primary, Type=button
Element 4: Tag=button, Text='Start', ID=[No ID], Class=pdf_ocr btn  btn-primary  mr-2  glow, Type=button
Element 5: Tag=a, Text='Options', ID=[No ID], Class=js-hs-unfold-invoker options_btn btn btn-soft-secondary, Type=[No type]
Element 6: Tag=a, Text='[No text]', ID=[No ID], Class=js-hs-unfold-invoker btn btn-icon btn-xs btn-soft-secondary, Type=[No type]
Element 7: Tag=button, Text='Nepali', ID=[No ID], Class=btn dropdown-toggle btn-soft-primary border, Type=button
Element 8: Tag=button, Text='[No text]', ID=[No ID], Class=btn dropdown-toggle btn-soft-white border, Type=button
Element 9: Tag=button, 

In [None]:
# OPTIONAL: Manual intervention if automatic page processing fails
# Uncomment and run this cell only if the automatic page extraction fails
# This gives you time to manually handle a specific page

"""
# Which page needs manual intervention?
page_num = 1  # Change this to the problematic page number

print(f"⚠️ Manual intervention required for page {page_num}")
print("You have 60 seconds to:")
print("1. Click on the correct page (if needed)")
print("2. Click the 'Start' button")
print("3. Wait for OCR to complete")
print("4. The script will continue automatically after the timeout")

time.sleep(60)  # Wait for manual intervention
print("✓ Continuing with extraction...")

# Take a screenshot after manual intervention
screenshot_path = f"/var/home/ramrshrcg/Desktop/Python/scrapping/page_{page_num}_after_manual.png"
driver.save_screenshot(screenshot_path)
print(f"✓ Screenshot saved to {screenshot_path}")

# Try to extract text from the current page
from selenium.webdriver.common.by import By
import time

# Extract text
extracted_text = None
try:
    # First try by ID
    text_element = driver.find_element(By.ID, 'ocrTextBox')
    extracted_text = text_element.get_attribute('value')
except:
    try:
        # Then try by tag
        text_element = driver.find_element(By.CSS_SELECTOR, 'textarea')
        extracted_text = text_element.get_attribute('value')
    except:
        print("Could not find text element after manual intervention")

if extracted_text:
    print(f"✓ Successfully extracted text (length: {len(extracted_text)} characters)")
    
    # Create output filename for this page
    import os
    pdf_filename = os.path.basename(pdf)
    output_filename = os.path.splitext(pdf_filename)[0] + f"_page{page_num}_manual_extracted.txt"
    output_path = os.path.join('/var/home/ramrshrcg/Desktop/Python/scrapping/Book', output_filename)
    
    # Save the extracted text to a file
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(extracted_text)
    
    print(f"✓ Extracted text saved to: {output_path}")
"""

'\n# Which page needs manual intervention?\npage_num = 1  # Change this to the problematic page number\n\nprint(f"⚠️ Manual intervention required for page {page_num}")\nprint("You have 60 seconds to:")\nprint("1. Click on the correct page (if needed)")\nprint("2. Click the \'Start\' button")\nprint("3. Wait for OCR to complete")\nprint("4. The script will continue automatically after the timeout")\n\ntime.sleep(60)  # Wait for manual intervention\nprint("✓ Continuing with extraction...")\n\n# Take a screenshot after manual intervention\nscreenshot_path = f"/var/home/ramrshrcg/Desktop/Python/scrapping/page_{page_num}_after_manual.png"\ndriver.save_screenshot(screenshot_path)\nprint(f"✓ Screenshot saved to {screenshot_path}")\n\n# Try to extract text from the current page\nfrom selenium.webdriver.common.by import By\nimport time\n\n# Extract text\nextracted_text = None\ntry:\n    # First try by ID\n    text_element = driver.find_element(By.ID, \'ocrTextBox\')\n    extracted_text = text_e

In [None]:
# STEP 4: Optional Additional Text Extraction
# This cell can be used if you need to extract text again from a specific page
# For example, if a previous extraction failed or you need to re-extract


# If you need to extract text from the current state of the page:
extracted_text = None

# Try different methods to get the text
for selector in ['#ocrTextBox', 'textarea', 'pre', '.text-output']:
    try:
        element = driver.find_element(By.CSS_SELECTOR, selector)
        text = element.get_attribute('value') or element.text
        if text and len(text) > 0:
            extracted_text = text
            print(f"Found text using selector: {selector} (length: {len(text)} chars)")
            break
    except:
        pass

# JavaScript fallback
if not extracted_text:
    try:
        extracted_text = driver.execute_script("""
            var textarea = document.querySelector('textarea');
            if (textarea && textarea.value) return textarea.value;
            var pre = document.querySelector('pre');
            if (pre && pre.textContent) return pre.textContent;
            return '';
        """)
        
        if extracted_text:
            print(f"Found text using JavaScript (length: {len(extracted_text)} chars)")
    except:
        pass

# Save the text if found
if extracted_text:
    # Create unique filename
    import os
    import time
    timestamp = int(time.time())
    pdf_filename = os.path.basename(pdf)
    output_filename = os.path.splitext(pdf_filename)[0] + f"_additional_{timestamp}.txt"
    output_path = os.path.join('/var/home/ramrshrcg/Desktop/Python/scrapping/Book', output_filename)
    
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(extracted_text)
    
    print(f"✓ Additional extracted text saved to: {output_path}")
else:
    print("⚠️ No additional text found")


⚠️ No additional text found


In [None]:
# STEP 5: Clean up
driver.close()
print("✓ Browser closed")
print("✅ OCR extraction completed successfully!")

InvalidSessionIdException: Message: Tried to run command without establishing a connection
