In [1]:
import sys
import cv2
from PyQt5.QtWidgets import QApplication, QWidget, QVBoxLayout, QLabel, QPushButton, QHBoxLayout
from PyQt5.QtGui import QImage, QPixmap
from PyQt5.QtCore import QTimer
from ultralytics import YOLO
from gtts import gTTS
import pygame
import os
from datetime import datetime

class ObjectDetectionGUI(QWidget):
    def __init__(self):
        super().__init__()
        self.model = YOLO('yolov8n.pt')
        pygame.mixer.init()
        self.initUI()
        self.detected_objects = []
        self.conf_threshold = 0.5
        self.cap = cv2.VideoCapture(0)
        self.timer = QTimer()
        self.timer.timeout.connect(self.update_frame)
        self.timer.start(30)

    def initUI(self):
        self.setWindowTitle('Object Detection GUI')
        self.setGeometry(100, 100, 960, 700)
        self.video_label = QLabel(self)
        self.video_label.setFixedSize(960, 540)

        self.announce_btn = QPushButton('Announce')
        self.announce_btn.clicked.connect(self.announce_objects)
        self.conf_up_btn = QPushButton('Increase Confidence')
        self.conf_up_btn.clicked.connect(self.conf_up)
        self.conf_down_btn = QPushButton('Decrease Confidence')
        self.conf_down_btn.clicked.connect(self.conf_down)
        self.quit_btn = QPushButton('Quit')
        self.quit_btn.clicked.connect(self.close_app)

        hbox = QHBoxLayout()
        hbox.addWidget(self.announce_btn)
        hbox.addWidget(self.conf_up_btn)
        hbox.addWidget(self.conf_down_btn)
        hbox.addWidget(self.quit_btn)
        
        self.obj_label = QLabel("Detected objects will appear here.")
        self.obj_label.setWordWrap(True)

        vbox = QVBoxLayout()
        vbox.addWidget(self.video_label)
        vbox.addLayout(hbox)
        vbox.addWidget(self.obj_label)
        self.setLayout(vbox)

    def update_frame(self):
        ret, frame = self.cap.read()
        if ret:
            frame = cv2.flip(frame, 1)
            objects_in_frame, boxed_frame = self.detect_objects(frame)
            self.detected_objects = objects_in_frame

            # Convert OpenCV frame to Qt format and display
            rgb_image = cv2.cvtColor(boxed_frame, cv2.COLOR_BGR2RGB)
            h, w, ch = rgb_image.shape
            bytes_per_line = ch * w
            qt_image = QImage(rgb_image.data, w, h, bytes_per_line, QImage.Format_RGB888)
            self.video_label.setPixmap(QPixmap.fromImage(qt_image))
            
            # Show objects in label
            objects_str = ", ".join([f"{obj[0]}({obj[1]:.2f})" for obj in objects_in_frame])
            self.obj_label.setText(f"Detected: {objects_str}\nConfidence threshold: {self.conf_threshold:.2f}")

    def detect_objects(self, frame):
        results = self.model(frame, conf=self.conf_threshold)
        objects = []
        for r in results:
            boxes = r.boxes
            if boxes is not None:
                for box in boxes:
                    x1, y1, x2, y2 = map(int, box.xyxy[0])
                    label = self.model.names[int(box.cls[0])]
                    conf = float(box.conf[0])
                    color = (0,255,0) if conf > 0.7 else (0,255,255) if conf > 0.5 else (0,165,255)
                    cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
                    cv2.putText(frame, f"{label} {conf:.2f}", (x1, y1-5), 
                               cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2)
                    objects.append((label, conf))
        return objects, frame

    def text_to_speech(self, text):
        filename = f"speech_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp3"
        tts = gTTS(text=text, lang='en')
        tts.save(filename)
        pygame.mixer.music.load(filename)
        pygame.mixer.music.play()
        while pygame.mixer.music.get_busy():
            pygame.time.Clock().tick(20)
        pygame.mixer.music.unload()
        os.remove(filename)

    def announce_objects(self):
        if not self.detected_objects:
            self.text_to_speech("No objects detected.")
            return
        summary = "The follwing objects can be detected" + ", ".join(f"a {obj[0]}" for obj in self.detected_objects if obj[1] > self.conf_threshold)
        self.text_to_speech(summary)

    def conf_up(self):
        self.conf_threshold = min(0.95, self.conf_threshold + 0.05)

    def conf_down(self):
        self.conf_threshold = max(0.05, self.conf_threshold - 0.05)

    def close_app(self):
        self.cap.release()
        pygame.mixer.quit()
        self.close()

if __name__ == '__main__':
    app = QApplication(sys.argv)
    od_app = ObjectDetectionGUI()
    od_app.show()
    sys.exit(app.exec_())

  from pkg_resources import resource_stream, resource_exists


pygame 2.6.1 (SDL 2.28.4, Python 3.13.0)
Hello from the pygame community. https://www.pygame.org/contribute.html

0: 480x640 1 person, 134.0ms
Speed: 2.8ms preprocess, 134.0ms inference, 1.4ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 85.8ms
Speed: 2.4ms preprocess, 85.8ms inference, 0.8ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 69.4ms
Speed: 1.2ms preprocess, 69.4ms inference, 0.8ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 64.2ms
Speed: 1.0ms preprocess, 64.2ms inference, 0.8ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 61.8ms
Speed: 1.2ms preprocess, 61.8ms inference, 0.9ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 72.7ms
Speed: 2.0ms preprocess, 72.7ms inference, 0.8ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 75.0ms
Speed: 1.3ms preprocess, 75.0ms inference, 1.1ms postprocess per image at shape (1, 3, 480, 640)

0:

SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
