In [1]:
!pip install transformers torchvision torchaudio easyocr opencv-python pillow scikit-learn



In [3]:
import cv2
import numpy as np
from PIL import Image

def preprocess_image(path):
    """Read an image from disk and derive a binarized copy of it.

    The picture is converted to grayscale, inverted, lightly blurred with a
    3x3 Gaussian kernel, binarized with Otsu's threshold, and resized with
    ``cv2.resize`` using the size argument ``(1024, 512)``.

    Args:
        path: Location of the image file handed to ``cv2.imread``.

    Returns:
        A 2-tuple ``(pil_image, gray)`` where ``pil_image`` is the binarized,
        resized picture converted to a 3-channel PIL image and ``gray`` is the
        grayscale array computed before any inversion or thresholding.

    Raises:
        FileNotFoundError: If ``cv2.imread`` returns ``None`` for ``path``.
    """
    bgr = cv2.imread(path)
    if bgr is None:
        raise FileNotFoundError(f"Image not found at: {path}")

    gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)

    # Invert, then smooth, before letting Otsu pick the binarization cut-off.
    smoothed = cv2.GaussianBlur(cv2.bitwise_not(gray), (3, 3), 0)
    otsu_flags = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    binary = cv2.threshold(smoothed, 0, 255, otsu_flags)[1]

    canvas = cv2.resize(binary, (1024, 512))
    rgb = cv2.cvtColor(canvas, cv2.COLOR_GRAY2RGB)
    return Image.fromarray(rgb), gray


In [4]:
from sklearn.linear_model import LogisticRegression

# A very basic classifier based on pixel intensity variance (you can replace with a CNN if needed)
def is_handwritten_text(gray_img, std_threshold=50):
    """Heuristically decide whether an image looks like handwriting.

    The decision is based solely on the standard deviation of the pixel
    intensities: the image is flagged as handwritten when that value is
    strictly greater than ``std_threshold``. This is a rough heuristic
    (the assumption being that irregular strokes raise the intensity spread),
    not a trained classifier.

    Args:
        gray_img: Array-like of pixel intensities accepted by ``np.std``.
        std_threshold: Cut-off for the intensity standard deviation.
            Defaults to 50, the value previously hard-coded here.

    Returns:
        bool: ``True`` when the standard deviation exceeds ``std_threshold``,
        otherwise ``False`` (a plain Python ``bool``, not ``numpy.bool_``).
    """
    # float() turns the NumPy scalar into a Python float so the comparison
    # below yields a built-in bool.
    intensity_std = float(np.std(gray_img))
    return intensity_std > std_threshold


  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [5]:
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
import torch

# Load the TrOCR handwritten checkpoint a single time; run_trocr reuses these globals.
processor_trocr = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
# Prefer CUDA when PyTorch reports it as available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model_trocr = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten").to(device)

def run_trocr(image_pil):
    """Transcribe an image with the module-level TrOCR processor and model.

    Args:
        image_pil: Image passed to ``processor_trocr`` as ``images=``.

    Returns:
        The first decoded string produced by ``processor_trocr.batch_decode``
        with special tokens skipped.
    """
    encoded = processor_trocr(images=image_pil, return_tensors="pt")
    pixel_values = encoded.pixel_values.to(device)

    # Inference only: no gradients are needed for generation.
    with torch.no_grad():
        generated_ids = model_trocr.generate(pixel_values)

    decoded = processor_trocr.batch_decode(generated_ids, skip_special_tokens=True)
    return decoded[0]


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "pooler_act": "tanh",
  "pooler_output_size": 768,
  "qkv_bias": false,
  "torch_dtype": "float32",
  "transformers_version": "4.51.0"
}

Config of the decoder: <class 'transfor

In [6]:
import easyocr
# Use the GPU whenever PyTorch reports CUDA as available, mirroring how the
# TrOCR model chooses its device; otherwise EasyOCR falls back to the CPU.
reader = easyocr.Reader(['en'], gpu=torch.cuda.is_available())

def run_easyocr(image_path):
    """Run the module-level EasyOCR ``reader`` on an image and join its text.

    Args:
        image_path: Value passed straight to ``reader.readtext``.

    Returns:
        A single string made of element ``[1]`` of every detection returned
        by ``reader.readtext``, separated by single spaces.
    """
    detections = reader.readtext(image_path)
    fragments = []
    for detection in detections:
        fragments.append(detection[1])
    return ' '.join(fragments)


Using CPU. Note: This module is much faster with a GPU.


In [7]:
def run_ocr(image_path):
    """Recognize the text in an image, choosing the OCR engine heuristically.

    The image is preprocessed with ``preprocess_image``; its grayscale version
    is passed to ``is_handwritten_text``. Handwriting is sent to ``run_trocr``
    (using the preprocessed PIL image), anything else to ``run_easyocr``
    (using the original file path). Progress and the result are printed.

    Args:
        image_path: Path of the image to analyze.

    Returns:
        The recognized text returned by the selected OCR function.
    """
    print(f"🔍 Analyzing image: {image_path}")

    image_pil, gray = preprocess_image(image_path)
    handwritten = is_handwritten_text(gray)

    if not handwritten:
        print("🖨️ Detected: Printed text (using EasyOCR)")
        text = run_easyocr(image_path)
    else:
        print("🖊️ Detected: Handwritten text (using TrOCR)")
        text = run_trocr(image_pil)

    print("✅ Recognized Text:\n", text)
    return text


In [10]:
# Replace "ocr7.jpg" with the path to your own image.
# run_ocr prints its progress and the recognized text, and returns that text;
# as the last expression of the cell, the returned string is also displayed.
run_ocr("ocr7.jpg")


🔍 Analyzing image: ocr7.jpg
🖨️ Detected: Printed text (using EasyOCR)
✅ Recognized Text:
 penguins are cute except one T


'penguins are cute except one T'