<a href="https://colab.research.google.com/github/shubh1608/CabDriver-OptimizationProblem/blob/main/OCRusingTesseract.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the Tesseract OCR engine that pytesseract wraps.
# apt-get with -y keeps the cell non-interactive so "Run All" does not stall on a confirmation prompt.
!sudo apt-get install -y tesseract-ocr

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


In [2]:
# %pip installs into the environment of the running kernel; the version is pinned for reproducible re-runs.
%pip install pytesseract==0.3.13

Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13


In [3]:
import pytesseract
import shutil
import os
import random
try:
    from PIL import Image
except ImportError:
    import Image

In [None]:
import pytesseract
from PIL import Image
import pandas as pd
import sys
import os

def _as_builtin(value):
    """Convert a NumPy scalar to the equivalent built-in Python value; pass anything else through."""
    return value.item() if hasattr(value, 'item') else value


def assess_image_quality_by_ocr(image_path, confidence_threshold=60):
    """
    Assess image quality based on OCR confidence scores.

    The image is passed to ``pytesseract.image_to_data`` and the ``conf`` column of
    the resulting DataFrame is summarised. Progress and a short analysis are printed
    to stdout as a side effect.

    Args:
        image_path (str): Path to the image file
        confidence_threshold (int): Threshold below which image is considered bad quality

    Returns:
        dict | None: Assessment results, or None if the image could not be opened or
        processed (the error is printed, not raised). The dict always has the keys
        ``average_confidence``, ``quality_assessment``, ``total_text_regions``,
        ``readable_regions``, ``high_confidence_regions``, ``image_readable``,
        ``threshold_used``, ``detected_text_sample`` and ``confidence_distribution``
        (a dict with ``min``, ``max``, ``median``, ``std``). When no readable text is
        found, the counts are 0, the text sample is None and every distribution
        entry is None. Numeric and boolean values are built-in Python types.
    """

    try:
        # Open the image in a context manager so the file handle is released
        # as soon as OCR has finished, even if OCR raises.
        with Image.open(image_path) as image:
            print(f"Processing image: {image_path}")
            print(f"Image size: {image.size}")

            # Perform OCR with detailed output including confidence scores
            # output_type=Output.DATAFRAME returns a pandas DataFrame with confidence scores
            ocr_data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DATAFRAME)

        # Keep only rows with a strictly positive confidence.
        # A confidence of -1 marks rows where Tesseract detected no text.
        # NOTE(review): `> 0` also drops rows scored exactly 0; switch to `>= 0`
        # if zero-confidence words should pull the average down.
        valid_text_data = ocr_data[ocr_data['conf'] > 0]

        if valid_text_data.empty:
            # Same key set as the normal result so callers can index either one.
            return {
                'average_confidence': 0,
                'quality_assessment': 'BAD - No readable text detected',
                'total_text_regions': 0,
                'readable_regions': 0,
                'high_confidence_regions': 0,
                'image_readable': False,
                'threshold_used': confidence_threshold,
                'detected_text_sample': None,
                'confidence_distribution': {
                    'min': None,
                    'max': None,
                    'median': None,
                    'std': None
                }
            }

        # Calculate statistics about the OCR results
        confidences = valid_text_data['conf']
        average_confidence = confidences.mean()
        total_regions = len(ocr_data)  # every row returned by Tesseract, including filtered ones
        readable_regions = len(valid_text_data)
        high_confidence_regions = len(valid_text_data[confidences >= confidence_threshold])

        # Determine if image quality is good or bad based on average confidence.
        # bool() turns the NumPy comparison result into a plain Python bool.
        is_good_quality = bool(average_confidence >= confidence_threshold)
        quality_label = "GOOD - Readable" if is_good_quality else "BAD - Poor quality"

        # Extract actual text content for reference
        detected_text = ' '.join(valid_text_data['text'].dropna().astype(str))

        # Print detailed analysis
        print(f"\n--- OCR Quality Analysis ---")
        print(f"Total text regions detected: {total_regions}")
        print(f"Regions with readable text: {readable_regions}")
        print(f"Regions meeting confidence threshold: {high_confidence_regions}")
        print(f"Average confidence score: {average_confidence:.2f}")
        print(f"Confidence threshold: {confidence_threshold}")
        print(f"Quality assessment: {quality_label}")
        print(f"Sample detected text: {detected_text[:100]}..." if detected_text else "No text detected")

        return {
            'average_confidence': _as_builtin(round(average_confidence, 2)),
            'quality_assessment': quality_label,
            'total_text_regions': total_regions,
            'readable_regions': readable_regions,
            'high_confidence_regions': high_confidence_regions,
            'image_readable': is_good_quality,
            'threshold_used': confidence_threshold,
            'detected_text_sample': detected_text[:200] if detected_text else None,
            'confidence_distribution': {
                'min': _as_builtin(confidences.min()),
                'max': _as_builtin(confidences.max()),
                'median': _as_builtin(confidences.median()),
                # Sample standard deviation; NaN when only one region is readable.
                'std': _as_builtin(confidences.std())
            }
        }

    except FileNotFoundError:
        print(f"Error: Image file '{image_path}' not found.")
        return None
    except Exception as e:
        # Deliberate best-effort: report the failure and let callers skip this image.
        print(f"Error processing image: {str(e)}")
        return None

def batch_assess_images(image_paths, confidence_threshold=60):
    """
    Run the OCR-based quality assessment over several images.

    A separator line is printed before each image. Images whose assessment
    comes back empty/None are left out of the returned list.

    Args:
        image_paths (list): List of image file paths
        confidence_threshold (int): Confidence threshold for quality assessment

    Returns:
        list: One dict per successfully assessed image, holding 'image_path'
        followed by every key of that image's assessment result
    """
    separator = '=' * 50
    collected = []

    for path in image_paths:
        print(f"\n{separator}")
        assessment = assess_image_quality_by_ocr(path, confidence_threshold)
        if not assessment:
            # Failed assessments are skipped rather than recorded.
            continue
        entry = {'image_path': path}
        entry.update(assessment)
        collected.append(entry)

    return collected

# Example usage and testing
if __name__ == "__main__":
    # Example 1: Single image assessment
    print("OCR-Based Image Quality Assessment Tool")
    print("=" * 40)

    # You can modify this path to point to your image
    sample_image_path = "sample_document.jpg"  # Replace with your image path

    # You can adjust this threshold based on your needs
    # Typical ranges: 50-70 for moderate quality, 70+ for good quality
    confidence_threshold = 60

    if len(sys.argv) > 1:
        # Allow image path to be provided as command line argument
        sample_image_path = sys.argv[1]

    if len(sys.argv) > 2:
        # Allow confidence threshold to be provided as command line argument
        confidence_threshold = int(sys.argv[2])

    # Check if image file exists before processing
    if os.path.exists(sample_image_path):
        result = assess_image_quality_by_ocr(sample_image_path, confidence_threshold)

        if result:
            print(f"\n{'='*50}")
            print("FINAL ASSESSMENT:")
            print(f"Image: {sample_image_path}")
            print(f"Average Confidence: {result['average_confidence']}")
            print(f"Quality: {result['quality_assessment']}")
            print(f"Readable: {'Yes' if result['image_readable'] else 'No'}")
    else:
        print(f"Sample image '{sample_image_path}' not found.")
        print("\nTo use this script:")
        print("1. Install required packages: pip install pytesseract pillow pandas")
        print("2. Install Tesseract OCR engine on your system")
        print("3. Run: python script.py your_image.jpg [confidence_threshold]")

        # Example of how to use with multiple images
        print("\n" + "="*50)
        print("BATCH PROCESSING EXAMPLE:")

        # Uncomment and modify these lines to process multiple images
        # image_list = ["image1.jpg", "image2.png", "image3.pdf"]
        # batch_results = batch_assess_images(image_list, confidence_threshold=65)
        #
        # for result in batch_results:
        #     print(f"{result['image_path']}: {result['quality_assessment']} (Avg: {result['average_confidence']})")