In [2]:
import cv2
import time
import numpy as np

# Load the YOLOv3 network from its Darknet weights and config files.
net = cv2.dnn.readNet("yolov3.weights", "yolov3.cfg")
layer_names = net.getLayerNames()
output_layer_indexes = net.getUnconnectedOutLayers()

# The indices are 1-based (hence the `- 1`). They may arrive either as a flat
# array or as an array of one-element rows, so flatten before mapping each
# index to its layer name; this covers both shapes with a single expression.
output_layers = [layer_names[i - 1] for i in np.ravel(output_layer_indexes)]

# Class labels, one per line: the position of a line is the class id that the
# detection loop uses to index this list. The context manager guarantees the
# file handle is closed instead of being left for the garbage collector.
with open("coco.names") as names_file:
    classes = names_file.read().strip().split("\n")

# Define the object classes we are interested in.
# These names are compared against the labels loaded from coco.names, so a name
# that does not appear in that file can never match a detection.
# NOTE(review): the standard 80-class COCO label list has no 'pen' entry and
# lists 'wine glass' separately from 'cup' — confirm against the coco.names in use.
interested_classes = ['pen', 'bottle', 'cup', 'book']  # Assuming 'cup' is the class for glasses

# Open the default camera (device index 0) as the live video source.
cap = cv2.VideoCapture(0)

# Abort right away when the capture device could not be opened.
if not cap.isOpened():
    raise IOError("Cannot open webcam")

# Running totals reported once capture ends, plus the wall-clock start mark.
frame_count = object_frame_count = 0
start_time = time.time()

# Main capture loop: read webcam frames, run the network on each one, outline
# the first detection whose class is in `interested_classes`, and display the
# annotated frame until a read fails or the user presses 'q'. The capture is
# released and the run statistics are printed in `finally`, so they are emitted
# even when the loop exits through an exception.
CONFIDENCE_THRESHOLD = 0.5  # minimum class score for a detection to be accepted

try:
    while True:
        # Capture frame-by-frame
        ret, frame = cap.read()
        if not ret:
            # A failed read yields no image, so it must not be counted as a frame.
            break
        frame_count += 1

        # Only the spatial size is needed; the channel count was never used.
        height, width = frame.shape[:2]

        # Detecting objects (the scale factor 0.00392 is approximately 1/255)
        blob = cv2.dnn.blobFromImage(frame, 0.00392, (416, 416), (0, 0, 0), swapRB=True, crop=False)
        net.setInput(blob)
        outs = net.forward(output_layers)

        # Scan the output layers until the first interesting detection. The flag
        # lets us leave BOTH loops: a bare `break` only exits the inner loop, which
        # previously let every remaining output layer bump the counter again and
        # made "frames with interested objects" count up to once per layer.
        found_object = False
        for out in outs:
            for detection in out:
                # Entries from index 5 onward are the per-class scores.
                scores = detection[5:]
                class_id = np.argmax(scores)
                confidence = scores[class_id]
                if confidence > CONFIDENCE_THRESHOLD and classes[class_id] in interested_classes:
                    # Object detected: count this frame exactly once
                    found_object = True
                    object_frame_count += 1

                    # The first four entries are the box centre and size as
                    # fractions of the frame, so scale them to pixels.
                    center_x = int(detection[0] * width)
                    center_y = int(detection[1] * height)
                    w = int(detection[2] * width)
                    h = int(detection[3] * height)

                    # Rectangle coordinates (top-left corner)
                    x = int(center_x - w / 2)
                    y = int(center_y - h / 2)

                    # Draw rectangle around the object
                    cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)
                    text = f"{classes[class_id]}: {confidence:.2f}"
                    cv2.putText(frame, text, (x, y - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
                    break  # Stop as we found an interested object in this frame
            if found_object:
                break  # Skip the remaining output layers for this frame

        # Show the live feed with the object detected
        cv2.imshow("Frame", frame)

        # Break the loop when the user presses 'q'
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # When everything is done, release the capture
    cap.release()
    cv2.destroyAllWindows()

    # Wall-clock time elapsed since `start_time` was recorded
    end_time = time.time()
    total_time_seconds = end_time - start_time
    total_minutes = total_time_seconds / 60

    # Print statistics
    print(f'Total number of frames: {frame_count}')
    print(f'Total minutes of video processed: {total_minutes:.2f}')
    print(f'Frames with interested objects: {object_frame_count}')


Total number of frames: 24
Total minutes of video processed: 0.31
Frames with interested objects: 10
