In [None]:
# !pip install ultralytics opencv-python torch torchvision torchaudio
# !pip install timm 


In [4]:
# Import required libraries
from ultralytics import YOLO
import cv2
import torch
import numpy as np
from torchvision import transforms as T
from PIL import Image

# Step 1: Load YOLOv8 model for object detection
# YOLOv8n (nano) is used for faster inference; you can replace with 'yolov8m.pt' or 'yolov8l.pt' for better accuracy.
model = YOLO("yolov8n.pt")

# Step 2: Initialize the camera (0 for default webcam)
cap = cv2.VideoCapture(0)

# Step 3: Load MiDaS model for depth estimation
midas = torch.hub.load("intel-isl/MiDaS", "MiDaS_small")  # MiDaS_small is a lightweight model
midas.eval()  # Set the model to evaluation mode

# Step 4: Define MiDaS preprocessing pipeline
# - Resize input to 384x384 (default for MiDaS_small)
# - Convert image to tensor format
# - Normalize the image (standard ImageNet mean & std)
transform = T.Compose([
    T.Resize((384, 384)),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Step 5: Start capturing frames from the camera
while cap.isOpened():
    ret, frame = cap.read()  # Read a frame from the webcam
    if not ret:
        break  # Exit if the frame is empty

    # Step 6: Run YOLOv8 object detection on the frame
    results = model(frame)

    # Step 7: Process detected objects
    for r in results:
        for box in r.boxes:
            x1, y1, x2, y2 = map(int, box.xyxy[0])  # Extract bounding box coordinates
            label = r.names[int(box.cls[0])]  # Get detected object class name
            conf = box.conf[0].item()  # Confidence score of detection

            # Crop detected object for depth estimation
            obj = frame[y1:y2, x1:x2]

            # Step 8: Convert OpenCV image (NumPy array) to PIL image
            pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

            # Step 9: Apply MiDaS transformation (resize, normalize, convert to tensor)
            img = transform(pil_image).unsqueeze(0)  # Add batch dimension

            # Step 10: Perform depth estimation with MiDaS
            with torch.no_grad():  # Disable gradient computation for faster inference
                depth_map = midas(img)

            # Step 11: Resize depth map to match original image size
            depth_map = depth_map.squeeze().cpu().numpy()  # Convert tensor to NumPy array
            depth_map = cv2.resize(depth_map, (frame.shape[1], frame.shape[0]))  # Resize to frame size

            # Step 12: Compute average depth of the detected object
            avg_depth = np.mean(depth_map[y1:y2, x1:x2])

            # Step 13: Draw bounding box and depth information on the frame
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)  # Draw bounding box
            text = f"{label} ({conf:.2f}), Depth: {avg_depth:.2f}m"  # Display label, confidence, and depth
            cv2.putText(frame, text, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

    # Step 14: Show the frame with detected objects and depth estimation
    cv2.imshow("Object Detection & Distance Estimation", frame)

    # Step 15: Exit when 'q' key is pressed
    if cv2.waitKey(1) & 0xFF == ord("q"):
        break

# Step 16: Release resources
cap.release()
cv2.destroyAllWindows()


Using cache found in C:\Users\Reek/.cache\torch\hub\intel-isl_MiDaS_master


Loading weights:  None


Using cache found in C:\Users\Reek/.cache\torch\hub\rwightman_gen-efficientnet-pytorch_master



0: 480x640 2 persons, 1 clock, 44.4ms
Speed: 1.9ms preprocess, 44.4ms inference, 0.5ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 clock, 28.8ms
Speed: 1.1ms preprocess, 28.8ms inference, 0.7ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 clock, 33.7ms
Speed: 0.8ms preprocess, 33.7ms inference, 0.5ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 1 clock, 27.5ms
Speed: 0.8ms preprocess, 27.5ms inference, 0.5ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 clock, 33.1ms
Speed: 0.7ms preprocess, 33.1ms inference, 0.5ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 clock, 34.3ms
Speed: 0.8ms preprocess, 34.3ms inference, 0.5ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 1 clock, 36.1ms
Speed: 0.7ms preprocess, 36.1ms inference, 0.5ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 clock, 31.2ms
Speed: 1.0ms p