In [14]:
# Install dependencies once per environment (uncomment to run):
# %pip install torch torchvision transformers timm opencv-python numpy sounddevice
# %pip install pytesseract pyttsx3 pyaudio



In [1]:
import os

import torch
import cv2
import numpy as np
from transformers import DetrImageProcessor, DetrForObjectDetection, AutoProcessor, AutoModelForSeq2SeqLM
import torchvision.transforms as T
import pytesseract
from PIL import Image
import pyttsx3  # For text-to-speech
from transformers import VisionEncoderDecoderModel

# ============================
# Load Models
# ============================
# Everything below is created once at cell execution and shared by the
# processing loop: the DETR detector, the MiDaS depth network, the OCR
# backends, the text-to-speech engine and the camera handle.

# 1. Load DETR Model for Object Detection (Hugging Face)
detr_processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
detr_model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")

# 2. Load MiDaS for Depth Estimation (Torch Hub)
midas = torch.hub.load("intel-isl/MiDaS", "MiDaS_small")
midas.eval()  # Inference only: disable dropout / batch-norm updates

# 3. Load OCR Model (TrOCR for text recognition)
# The checkpoint is loaded through VisionEncoderDecoderModel rather than
# AutoModelForSeq2SeqLM. Both objects are kept available for other cells.
ocr_processor = AutoProcessor.from_pretrained("microsoft/trocr-base-handwritten", use_fast=True)
ocr_model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

# 4. Configure Tesseract OCR Path (Ensure Tesseract is installed)
# The location can be overridden with the TESSERACT_CMD environment variable.
# The path is applied only when the file really exists; otherwise
# pytesseract's own default command is left in place, so the notebook is not
# tied to one particular Windows installation directory.
tesseract_cmd = os.environ.get(
    "TESSERACT_CMD", r"C:\Program Files\Tesseract-OCR\tesseract.exe"
)
if os.path.isfile(tesseract_cmd):
    pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

# 5. Initialize Text-to-Speech Engine
tts = pyttsx3.init()

# ============================
# Camera & Processing Setup
# ============================

# Open camera feed (device index 0)
cap = cv2.VideoCapture(0)

# Define average human step in feet (adjustable)
avg_step_feet = 2.5  # Estimated step size for a human

# Preprocessing transformations for MiDaS (Depth Estimation)
# NOTE(review): the MiDaS hub entry also publishes its own transforms
# ("transforms" -> small_transform) for MiDaS_small — consider whether this
# hand-written pipeline should be replaced by it.
midas_transform = T.Compose([
    T.Resize((384, 384)),  # Resize image for MiDaS
    T.ToTensor(),  # Convert image to tensor
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),  # Normalize for MiDaS
])

# ============================
# Real-Time Processing Loop
# ============================
# For every camera frame: detect objects with DETR, estimate a depth map with
# MiDaS, annotate each detection with a distance / step estimate, run
# Tesseract on the object crop and speak any recognised text. The loop ends
# on 'q' / ESC, on a failed frame read, or on KeyboardInterrupt; the camera
# and windows are always released in the ``finally`` clause.
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break  # Stop if camera feed fails

        frame_h, frame_w = frame.shape[:2]

        # Unannotated copy of the frame: OCR crops are taken from here so that
        # Tesseract never reads the boxes and labels drawn onto ``frame``.
        clean_frame = frame.copy()

        # Convert OpenCV image to PIL format for DETR processing
        pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        inputs = detr_processor(images=pil_image, return_tensors="pt")

        # Object Detection using DETR
        with torch.no_grad():
            outputs = detr_model(**inputs)

        # Post-process DETR output (PIL size is (w, h); DETR wants (h, w))
        target_sizes = torch.tensor([pil_image.size[::-1]])
        results = detr_processor.post_process_object_detection(
            outputs, target_sizes=target_sizes, threshold=0.7
        )[0]

        # Depth Estimation using MiDaS
        img_tensor = midas_transform(pil_image).unsqueeze(0)  # Add batch dimension
        with torch.no_grad():
            depth_map = midas(img_tensor)

        # Resize depth map to match input image dimensions
        depth_map = depth_map.squeeze().cpu().numpy()
        depth_map = cv2.resize(depth_map, (frame_w, frame_h))

        # ============================
        # Process Detected Objects
        # ============================

        for label, box in zip(results["labels"], results["boxes"]):
            x1, y1, x2, y2 = map(int, box.tolist())  # Extract bounding box coordinates

            # Predicted boxes may extend past the image border. A negative or
            # oversized coordinate would produce an empty (or wrapped) slice,
            # giving a NaN depth and making cv2.cvtColor fail on an empty crop.
            x1, x2 = max(0, x1), min(frame_w, x2)
            y1, y2 = max(0, y1), min(frame_h, y2)
            if x2 <= x1 or y2 <= y1:
                continue  # Nothing of this box is inside the frame

            class_name = detr_model.config.id2label[label.item()]  # Get object label

            # Estimate Distance (Average depth value inside bounding box)
            # NOTE(review): MiDaS is documented as predicting relative inverse
            # depth, so this value is presumably not metric — the 3.28 factor
            # is kept as-is but the result needs calibration before it can be
            # trusted as feet. TODO confirm.
            avg_depth = np.mean(depth_map[y1:y2, x1:x2])
            distance_ft = avg_depth * 3.28
            steps_to_object = distance_ft / avg_step_feet  # Estimate number of steps

            # Draw Bounding Box & Display Distance on Frame
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            text = f"{class_name} | {distance_ft:.1f} ft | Steps: {steps_to_object:.1f}"
            # Keep the label baseline inside the frame for boxes touching the top edge
            cv2.putText(frame, text, (x1, max(y1 - 10, 10)), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

            # ============================
            # Text Recognition (OCR) for Objects
            # ============================

            obj_crop = clean_frame[y1:y2, x1:x2]  # Crop detected object (no overlays)
            gray_crop = cv2.cvtColor(obj_crop, cv2.COLOR_BGR2GRAY)  # Convert to grayscale
            detected_text = pytesseract.image_to_string(gray_crop, config="--psm 6")  # OCR Processing

            if detected_text.strip():  # If text is detected
                print(f"Detected Text: {detected_text}")
                tts.say(detected_text)  # Read detected text aloud
                tts.runAndWait()  # Blocks the loop until speech has finished
                # putText draws a single line, so collapse newlines/whitespace
                # for the on-screen overlay only.
                overlay_text = " ".join(detected_text.split())
                cv2.putText(frame, f"Text: {overlay_text}", (x1, y2 + 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 255), 2)

        # Show the processed frame
        cv2.imshow("Object Detection, Distance Estimation, & OCR", frame)

        key = cv2.waitKey(1) & 0xFF
        if key == ord("q") or key == 27:  # Press 'q' or ESC to quit
            print("Exiting program...")
            break

except KeyboardInterrupt:
    print("Program interrupted by user. Exiting...")

finally:
    # Release resources
    cap.release()
    cv2.destroyAllWindows()
    print("Camera feed closed. All resources released.")


  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at facebook/detr-resnet-50 were not used when initializing DetrForObjectDetection: ['model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Using cache found in C:\Users\Reek

Loading weights:  None


Using cache found in C:\Users\Reek/.cache\torch\hub\rwightman_gen-efficientnet-pytorch_master
Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": false,
  "torch_dtype": "float32",
  "transformers_version": "4.49.0"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cr

Detected Text: iF

Detected Text: q

Detected Text: La |
L*
e%
aver
4

Detected Text: |

Detected Text: iF

Detected Text: Kid

Detected Text: 4 ;
E
‘oy

Detected Text: YF

Detected Text: ‘\

Detected Text: l,

Detected Text: i

Detected Text: =|

Detected Text: .
4

Detected Text: =,

Detected Text: —

Detected Text: \,

Detected Text: ‘a

Detected Text: Y

Exiting program...
Camera feed closed. All resources released.
