# In [None]:
# Import required libraries
import cv2
import torch
import pyttsx3
import keyboard
from ultralytics import YOLO


# Initialize the YOLO model
def init_model(model_path):
    """
    Construct a YOLO model from ``model_path``.

    :param model_path: Passed unchanged to the ``YOLO`` constructor.
    :return: The constructed ``YOLO`` instance.
    """
    detector = YOLO(model_path)
    return detector


# Initialize the Text-to-Speech engine
def init_text_to_speech():
    """
    Create the text-to-speech engine.

    :return: The engine object returned by ``pyttsx3.init()``.
    """
    engine = pyttsx3.init()
    return engine


# Draw bounding boxes and descriptions
def draw_boxes(frame, detections, model, confidence_threshold, speech_engine=None):
    """
    Draw a box around each confident detection, then print and speak its description.

    Detections whose confidence is below ``confidence_threshold`` are skipped.
    All descriptions for the frame are queued first and spoken with a single
    ``runAndWait()`` call, so the engine blocks once per frame instead of once
    per detection.

    :param frame: The frame to draw boxes on.
    :param detections: Detected objects data; ``detections.boxes.data`` is read.
        If ``detections.boxes`` is ``None`` nothing is drawn or spoken.
    :param model: The YOLO model; ``model.names`` maps class ids to names.
    :param confidence_threshold: Confidence threshold for detections.
    :param speech_engine: Optional engine exposing ``say`` and ``runAndWait``.
        When omitted, the module-level ``text_to_speech`` is used if it exists;
        if neither is available the descriptions are only printed.
    """
    if speech_engine is None:
        # Fall back to the script-level engine, but do not raise NameError when
        # this function is called from a context that never defined it.
        speech_engine = globals().get("text_to_speech")

    boxes = getattr(detections, "boxes", None)
    if boxes is None:
        return

    descriptions = []
    for row in boxes.data.tolist():
        # Confidence and class id are taken from the last two columns so a row
        # that carries an extra column before them still parses.
        confidence = row[-2]
        if confidence < confidence_threshold:
            continue

        xmin, ymin, xmax, ymax = map(int, row[:4])
        class_name = model.names[int(row[-1])]

        # Draw the rectangle around each object
        cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), (0, 255, 0), 2)

        # Description to be printed now and spoken after the loop
        description = f"{class_name} with confidence {confidence:.1f}"
        print(description)
        descriptions.append(description)

    if speech_engine is None or not descriptions:
        return

    # Queue every utterance, then process the queue in one blocking call.
    for description in descriptions:
        speech_engine.say(description)
    speech_engine.runAndWait()


if __name__ == "__main__":
    # Configuration
    model_path = "yolov8m.pt"
    confidence_threshold = 0.25

    # Initialize model and text-to-speech.
    # text_to_speech is kept as a module-level name because draw_boxes looks
    # it up as a global when no engine is passed to it.
    model = init_model(model_path)
    text_to_speech = init_text_to_speech()

    # Capture video from the default camera (camera index 0)
    cap = cv2.VideoCapture(0)

    # Validate if the capture is opened properly; exit with a non-zero status
    # and the message on stderr instead of relying on the interactive exit().
    if not cap.isOpened():
        raise SystemExit("Error: Could not open video capture.")

    try:
        # Main loop for capturing frames and processing. There are two exit
        # paths: the keyboard-module poll in the loop condition and the
        # OpenCV key check after each displayed frame.
        while not keyboard.is_pressed('q'):
            # Capture frame-by-frame
            ret, frame = cap.read()
            if not ret:
                break

            # Get the detections from the model
            results = model(frame)
            detections = results[0]

            # Draw bounding boxes and descriptions on the frame
            draw_boxes(frame, detections, model, confidence_threshold)

            # Display the frame with bounding boxes
            cv2.imshow('Object Detection', frame)

            # Wait for 1ms in-between frames and break if 'q' is pressed
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Release video capture and close windows even if the loop raised
        # (for example on Ctrl+C or a model error).
        cap.release()
        cv2.destroyAllWindows()

