In [8]:
# Core
import os
import torch
from PIL import Image
import numpy as np
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image
import torchvision.transforms as transforms

# Models
from transformers import BlipProcessor, BlipForConditionalGeneration
import easyocr
import cv2

# TTS
import pyttsx3

# YOLOv8
from ultralytics import YOLO


In [18]:
# Load BLIP (scene description / image captioning).
# Note: this is the original BLIP "image-captioning-base" checkpoint, not BLIP-2.
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Load YOLOv8 (Object Detection)
yolo_model = YOLO("yolov8n.pt")  # Nano model (lightweight)

# Load the OCR models once at module level so the functions defined in later
# cells can reuse them instead of re-loading weights on every call.
# TrOCR: processor + encoder/decoder model from the "trocr-base-printed" checkpoint.
trocr_processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")
trocr_model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-printed")
# EasyOCR: English-only reader, explicitly kept on the CPU (gpu=False).
reader = easyocr.Reader(['en'], gpu=False)

# Initialise the pyttsx3 text-to-speech engine used by speak().
engine = pyttsx3.init()


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "pooler_act": "tanh",
  "pooler_output_size": 768,
  "qkv_bias": false,
  "torch_dtype": "float32",
  "transformers_version": "4.51.0"
}

Config of the decoder: <class 'transfor

In [3]:
def describe_scene(image_path):
    """Generate a caption for an image with the BLIP captioning model.

    Args:
        image_path: Path (or file object) accepted by ``PIL.Image.open``.

    Returns:
        str: The decoded caption with special tokens removed.
    """
    # Open inside a context manager so the underlying file handle is released
    # immediately; ``convert`` returns a separate, fully loaded image, so it
    # stays usable after the source file has been closed.
    with Image.open(image_path) as source_image:
        raw_image = source_image.convert('RGB')

    inputs = blip_processor(raw_image, return_tensors="pt")
    out = blip_model.generate(**inputs)
    # generate() returns a batch of token-id sequences; only one image was
    # passed in, so the caption is the first (and only) entry.
    description = blip_processor.decode(out[0], skip_special_tokens=True)
    return description


In [4]:
def detect_objects(image_path):
    """Detect objects in an image with YOLOv8 and return their unique labels.

    Args:
        image_path: Image source passed straight to ``yolo_model``.

    Returns:
        list[str]: Each detected class name once, in the order in which it
        first appears among the detected boxes.
    """
    results = yolo_model(image_path)
    # A single image was submitted, so only the first result is relevant.
    detection = results[0]
    class_names = detection.names
    labels = [class_names[int(cls)] for cls in detection.boxes.cls]
    # dict.fromkeys() de-duplicates while keeping first-seen order. The
    # previous list(set(...)) produced an arbitrary order that could differ
    # between kernel sessions, making the printed/spoken output unreproducible.
    return list(dict.fromkeys(labels))


In [20]:


def read_text(image_path):
    """Read text from an image using EasyOCR only.

    The previous body built a brand-new ``easyocr.Reader`` on every call,
    assigned it to a local variable and returned ``None``, so no text was ever
    read. This version reuses the module-level ``reader`` created in the
    model-loading cell and returns the recognised text.

    Args:
        image_path: Path to the image file handed to ``reader.readtext``.

    Returns:
        str: Recognised fragments joined by single spaces, or
        ``"No readable text found."`` when nothing usable was detected.
    """
    detections = reader.readtext(image_path)
    # Each detection is indexed the same way as in read_text_combined():
    # position 1 holds the recognised string. Single-character hits are
    # dropped there as well, so the same filter is applied here.
    fragments = [item[1] for item in detections if len(item[1]) > 1]
    text = " ".join(fragments).strip()
    return text if text else "No readable text found."

# Preprocessing for TrOCR
def preprocess_image(image_path):
    """Load an image from disk and return it as an RGB ``PIL.Image``.

    Args:
        image_path: Path (or file object) accepted by ``PIL.Image.open``.

    Returns:
        PIL.Image.Image: The image converted to RGB mode.
    """
    # The context manager closes the file handle as soon as the pixels have
    # been copied; the converted image is independent of the source file.
    with Image.open(image_path) as source_image:
        return source_image.convert("RGB")

# Combined OCR function
def read_text_combined(image_path):
    """Run TrOCR and EasyOCR on the same image and merge their output.

    Args:
        image_path: Path to the image file.

    Returns:
        str: ``"<TrOCR text>. <EasyOCR text>"`` when both engines return
        text, the single non-empty result when only one does, or
        ``"No readable text found."`` when neither produces any text.
    """
    print("\n🔍 Performing OCR with TrOCR and EasyOCR...")

    # TrOCR part: the whole image is fed to the model as one input.
    # NOTE(review): the "printed" TrOCR checkpoint is presumably meant for
    # cropped single-line text; confirm whether full-scene input is intended.
    image = preprocess_image(image_path)
    pixel_values = trocr_processor(images=image, return_tensors="pt").pixel_values
    generated_ids = trocr_model.generate(pixel_values)
    trocr_text = trocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # EasyOCR part: keep only recognised strings longer than one character.
    result_easyocr = reader.readtext(image_path)
    easy_text = [item[1] for item in result_easyocr if len(item[1]) > 1]
    easyocr_combined = " ".join(easy_text)

    # Merge only the non-empty parts. The previous unconditional
    # `a + ". " + b` always contained ". ", so the fallback message below was
    # unreachable and one-sided results came out as ". text" or "text. ".
    parts = [part for part in (trocr_text.strip(), easyocr_combined.strip()) if part]
    if not parts:
        return "No readable text found."
    return ". ".join(parts)




In [21]:
def speak(text):
    """Echo ``text`` to the notebook output and voice it via pyttsx3.

    Args:
        text: The sentence to print and hand to the module-level ``engine``.
    """
    # Show what is about to be spoken so the cell output mirrors the audio.
    print("Speaking:", text)

    tts = engine
    tts.say(text)
    # Process the utterance queued by say() before returning to the caller.
    tts.runAndWait()


In [22]:
def run_pipeline(image_path):
    """Analyse one image end to end: caption, object labels, OCR, then speech.

    The three analysis results are printed first and afterwards spoken one by
    one through ``speak``.

    Args:
        image_path: Path to the image; forwarded unchanged to each analyser.
    """
    print("Analyzing image:", image_path)

    # Tuple elements are evaluated left to right, so the analysers still run
    # in the order: scene description, object detection, OCR.
    scene, objects, text = (
        describe_scene(image_path),
        detect_objects(image_path),
        read_text_combined(image_path),
    )

    # Display results
    report = (
        ("Scene:", scene),
        ("Objects Detected:", objects),
        ("Text Found:", text),
    )
    for label, value in report:
        print(label, value)

    # Speak out
    speak("Scene description: " + scene)
    speak("Detected objects: " + ", ".join(objects))
    speak("Text says: " + text)


In [25]:
# Run the full pipeline (caption, object detection, OCR, speech) on a sample
# image; the relative path is resolved against the kernel's working directory.
run_pipeline("finalocr.jpeg")


Analyzing image: finalocr.jpeg

image 1/1 C:\Users\Hp\Documents\finalocr.jpeg: 384x640 1 person, 1 cup, 1 tv, 2 laptops, 1 mouse, 1 keyboard, 74.7ms
Speed: 2.7ms preprocess, 74.7ms inference, 14.8ms postprocess per image at shape (1, 3, 384, 640)

🔍 Performing OCR with TrOCR and EasyOCR...
Scene: a woman sitting at a desk with a book and a computer
Objects Detected: ['tv', 'laptop', 'cup', 'person', 'keyboard', 'mouse']
Text Found: ITEM. I At Gidep Yiqui AF 467
Speaking: Scene description: a woman sitting at a desk with a book and a computer
Speaking: Detected objects: tv, laptop, cup, person, keyboard, mouse
Speaking: Text says: ITEM. I At Gidep Yiqui AF 467
