In [1]:
!pip install transformers torchvision torchaudio easyocr opencv-python pillow scikit-learn

Collecting torchaudio
  Downloading torchaudio-2.6.0-cp312-cp312-win_amd64.whl.metadata (6.7 kB)
Downloading torchaudio-2.6.0-cp312-cp312-win_amd64.whl (2.4 MB)
   ---------------------------------------- 0.0/2.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.4 MB ? eta -:--:--
   ---- ----------------------------------- 0.3/2.4 MB ? eta -:--:--
   ---- ----------------------------------- 0.3/2.4 MB ? eta -:--:--
   ---- ----------------------------------- 0.3/2.4 MB ? eta -:--:--
   ---- ---

In [2]:
import cv2
import numpy as np
from PIL import Image

def preprocess_image(path):
    """Load an image from disk and build a binarized copy plus its grayscale version.

    Pipeline: read with OpenCV -> grayscale -> invert -> 3x3 Gaussian blur ->
    Otsu binarization -> resize to 1024x512 -> convert to a 3-channel PIL image.

    Args:
        path: Location of the image file, handed to ``cv2.imread``.

    Returns:
        A ``(pil_image, gray)`` tuple: ``pil_image`` is the binarized, resized
        image as an RGB ``PIL.Image``; ``gray`` is the grayscale array taken
        before inversion, blurring, thresholding and resizing.

    Raises:
        FileNotFoundError: If ``cv2.imread`` returns ``None`` for ``path``.
    """
    bgr = cv2.imread(path)
    if bgr is None:
        raise FileNotFoundError(f"Image not found at: {path}")

    gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)

    # Invert, then smooth lightly before thresholding.
    smoothed = cv2.GaussianBlur(cv2.bitwise_not(gray), (3, 3), 0)

    # The threshold value 0 is a placeholder: the OTSU flag selects the cut-off.
    binary = cv2.threshold(
        smoothed, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU
    )[1]

    # Fixed output size; the aspect ratio of the input is not preserved.
    binary_resized = cv2.resize(binary, (1024, 512))
    rgb = cv2.cvtColor(binary_resized, cv2.COLOR_GRAY2RGB)
    return Image.fromarray(rgb), gray


In [None]:
from sklearn.linear_model import LogisticRegression

# A very basic single-feature heuristic on pixel-intensity spread
# (replace with a trained model, e.g. a CNN, if more accuracy is needed).
def is_handwritten_text(gray_img, std_threshold=50):
    """Guess whether a grayscale image contains handwriting.

    The image is labelled handwritten when the standard deviation of its pixel
    intensities is strictly greater than ``std_threshold``. This is a rough
    heuristic, not a trained classifier.

    Args:
        gray_img: Array-like of pixel intensities (``run_ocr`` passes the
            grayscale array returned by ``preprocess_image``).
        std_threshold: Cut-off applied to the intensity standard deviation.
            Defaults to 50, the value that used to be hard-coded.

    Returns:
        bool: True if the intensity standard deviation exceeds the threshold.
    """
    # float() turns the NumPy scalar into a Python float so the comparison
    # yields a plain bool rather than numpy.bool_.
    intensity_std = float(np.std(gray_img))
    return intensity_std > std_threshold


In [None]:
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
import torch

# One-time setup shared by run_trocr: choose the device, then load the
# processor and the model, moving the model onto that device.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
processor_trocr = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
model_trocr = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten").to(device)

def run_trocr(image_pil):
    """Transcribe an image with the module-level TrOCR processor and model.

    Args:
        image_pil: Image handed to ``processor_trocr`` as ``images``
            (``run_ocr`` passes the PIL image produced by ``preprocess_image``).

    Returns:
        The first decoded string of the generated batch, with special tokens
        stripped.
    """
    encoded = processor_trocr(images=image_pil, return_tensors="pt")
    model_input = encoded.pixel_values.to(device)

    # Inference only, so gradient tracking is switched off during generation.
    with torch.no_grad():
        token_ids = model_trocr.generate(model_input)

    decoded = processor_trocr.batch_decode(token_ids, skip_special_tokens=True)
    return decoded[0]


preprocessor_config.json:   0%|          | 0.00/224 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.17k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

In [6]:
import easyocr
# Build the EasyOCR reader once so run_easyocr can reuse it (English only, GPU disabled).
reader = easyocr.Reader(lang_list=['en'], gpu=False)

def run_easyocr(image_path):
    """Run the module-level EasyOCR ``reader`` on an image and join the text.

    Args:
        image_path: Image location passed straight to ``reader.readtext``.

    Returns:
        The element at index 1 of every detection, joined with single spaces
        in the order the reader returned them.
    """
    detections = reader.readtext(image_path)
    fragments = []
    for detection in detections:
        # Index 1 of each detection holds the recognized string.
        fragments.append(detection[1])
    return ' '.join(fragments)


Using CPU. Note: This module is much faster with a GPU.


In [7]:
def run_ocr(image_path):
    """Recognize the text in an image, picking the OCR engine by a handwriting check.

    The image is preprocessed, ``is_handwritten_text`` is evaluated on the
    grayscale array, and then either TrOCR (on the preprocessed PIL image) or
    EasyOCR (on the original file path) is run. Progress and the result are
    printed along the way.

    Args:
        image_path: Path of the image to analyze.

    Returns:
        The recognized text.
    """
    print(f"🔍 Analyzing image: {image_path}")

    image_pil, gray = preprocess_image(image_path)

    if not is_handwritten_text(gray):
        # Printed text: EasyOCR reads the untouched file, not the preprocessed image.
        print("🖨️ Detected: Printed text (using EasyOCR)")
        recognized = run_easyocr(image_path)
    else:
        print("🖊️ Detected: Handwritten text (using TrOCR)")
        recognized = run_trocr(image_pil)

    print("✅ Recognized Text:\n", recognized)
    return recognized


In [10]:
# Replace with your image path.
# run_ocr prints the recognized text and also returns it; because this call is
# the last expression in the cell, the returned string is echoed a second time
# below the printed output.
run_ocr("ocr7.jpg")


🔍 Analyzing image: ocr7.jpg
🖨️ Detected: Printed text (using EasyOCR)
✅ Recognized Text:
 penguins are cute except one T


'penguins are cute except one T'