In [1]:
import ultralytics
from ultralytics import YOLO
ultralytics.checks()

Ultralytics 8.3.40  Python-3.10.6 torch-2.4.1+cu124 CUDA:0 (NVIDIA GeForce RTX 4060, 8187MiB)
Setup complete  (12 CPUs, 15.9 GB RAM, 889.9/1863.0 GB disk)


In [15]:
def center_crop(frame, target_size=(480, 480)):
    h, w, _ = frame.shape
    crop_h, crop_w = target_size

    # Calculate cropping coordinates
    start_x = max(0, (w - crop_w) // 2)
    start_y = max(0, (h - crop_h) // 2)
    end_x = start_x + crop_w
    end_y = start_y + crop_h

    # Perform cropping
    cropped_frame = frame[start_y:end_y, start_x:end_x]
    return cropped_frame

# Object Detection

In [2]:
model = YOLO("runs/detect/train5/weights/best.pt")

## Webcam connected on pc

In [None]:
import cv2

# Initialize the webcam (use 0 for default webcam, or 1, 2, etc., for external webcams)
cap = cv2.VideoCapture(0)

# model = YOLO("runs/detect/train5/weights/best.pt")

# Check if the webcam is opened correctly
if not cap.isOpened():
    print("Error: Could not open webcam.")
    exit()

# Loop to capture frames
while True:
    # Capture frame-by-frame
    ret, frame = cap.read()
    
    # If frame is read correctly, ret will be True
    if not ret:
        print("Error: Cannot read frame.")
        break

    # results = model.predict(frame, imgsz=480)
    cropped_frame = center_crop(frame, target_size=(480, 480))
    results = model.predict(cropped_frame, imgsz=480)
    for result in results:
        for box in result.boxes:
            # Get bounding box coordinates
            x1, y1, x2, y2 = map(int, box.xyxy[0])  # Convert to integers
            conf = box.conf[0]  # Confidence score
            cls = int(box.cls[0])  # Class index
            label = model.names[cls]  # Class label from model

            # Draw the bounding box
            cv2.rectangle(cropped_frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            # Add label and confidence score
            cv2.putText(cropped_frame, f"{label} {conf:.2f}", (x1, y1 - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
    # Display the resulting frame
    cv2.imshow('Webcam', cropped_frame)

    # Break the loop on 'q' key press
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release the webcam and close windows
cap.release()
cv2.destroyAllWindows()


## Phone camera

Steps:
1. Install **IP Webcam** (Thyoni Tech)
2. Scroll down and click on **Start Server** (make sure that PC and phone are connected on network)
3. Change the url

In [None]:
# Import essential libraries 
import requests 
import cv2 
import numpy as np 

# Replace the below URL with your own. Make sure to add "/shot.jpg" at last. 
url = "http://192.168.188.30:8080/shot.jpg"

# While loop to continuously fetching data from the Url 
while True: 
	img_resp = requests.get(url) 
	img_arr = np.array(bytearray(img_resp.content), dtype=np.uint8) 
	frame = cv2.imdecode(img_arr, -1) 
	# cropped_frame = center_crop(frame, target_size=(480, 480))
	cropped_frame = frame
	results = model.predict(cropped_frame, imgsz=480)
	for result in results:
		for box in result.boxes:
			# Get bounding box coordinates
			x1, y1, x2, y2 = map(int, box.xyxy[0])  # Convert to integers
			conf = box.conf[0]  # Confidence score
			cls = int(box.cls[0])  # Class index
			label = model.names[cls]  # Class label from model

			# Draw the bounding box
			cv2.rectangle(cropped_frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
			# Add label and confidence score
			cv2.putText(cropped_frame, f"{label} {conf:.2f}", (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

	cv2.imshow("Android_cam", cropped_frame) 

	# Press Esc key to exit 
	if cv2.waitKey(1) & 0xFF == ord('q'):
		break

cv2.destroyAllWindows() 


# Segmentation

In [42]:
model = YOLO("runs/segment/train5/weights/best.pt")

In [None]:
# Import essential libraries 
import requests 
import cv2 
import numpy as np 

# Replace the below URL with your own. Make sure to add "/shot.jpg" at last. 
url = "http://192.168.188.30:8080/shot.jpg"

alpha = 0.5

# While loop to continuously fetching data from the URL
while True:
    # Fetch the frame from the Android camera
    img_resp = requests.get(url)
    img_arr = np.array(bytearray(img_resp.content), dtype=np.uint8)
    frame = cv2.imdecode(img_arr, -1)

    # Run YOLO prediction on the frame
    results = model(frame)

    # Get the masks and bounding boxes from the results
    if results[0].masks:  # Check if masks exist
        masks = results[0].masks.data.cpu().numpy()  # Convert masks to NumPy array
        boxes = results[0].boxes.xyxy.cpu().numpy()  # Bounding boxes (x_min, y_min, x_max, y_max)
        confidences = results[0].boxes.conf.cpu().numpy()  # Confidence scores
        classes = results[0].boxes.cls.cpu().numpy()  # Class IDs

        # Overlay transparent red masks on the frame
        for mask in masks:
            mask_resized = cv2.resize(mask, (frame.shape[1], frame.shape[0]))  # Resize mask if needed
            mask_binary = (mask_resized > 0.5).astype(np.uint8)  # Threshold the mask

            # Create a red mask
            red_mask = np.zeros_like(frame, dtype=np.uint8)
            red_mask[:, :, 2] = mask_binary * 255  # Set red channel to maximum

            # Apply transparency by blending the red mask with the original frame
            frame = np.where(
                mask_binary[:, :, None] == 1,
                cv2.addWeighted(frame, 1 - alpha, red_mask, alpha, 0),
                frame
            )

        # Draw bounding boxes and labels
        for i, box in enumerate(boxes):
            x_min, y_min, x_max, y_max = map(int, box)
            confidence = confidences[i]
            class_id = int(classes[i])

            # Draw the bounding box
            cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)

            # Add label and confidence score
            label = f"Class {class_id} ({confidence:.2f})"
            cv2.putText(frame, label, (x_min, y_min - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

    # Display the frame with the transparent red mask and bounding boxes
    cv2.imshow("Android_cam", frame)

    # Press 'q' to exit
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cv2.destroyAllWindows()


0: 288x480 (no detections), 213.0ms
Speed: 3.0ms preprocess, 213.0ms inference, 0.0ms postprocess per image at shape (1, 3, 288, 480)

0: 288x480 (no detections), 16.0ms
Speed: 1.0ms preprocess, 16.0ms inference, 1.0ms postprocess per image at shape (1, 3, 288, 480)

0: 288x480 (no detections), 15.0ms
Speed: 1.0ms preprocess, 15.0ms inference, 1.0ms postprocess per image at shape (1, 3, 288, 480)

0: 288x480 (no detections), 17.0ms
Speed: 2.0ms preprocess, 17.0ms inference, 0.0ms postprocess per image at shape (1, 3, 288, 480)

0: 288x480 (no detections), 16.0ms
Speed: 2.0ms preprocess, 16.0ms inference, 1.0ms postprocess per image at shape (1, 3, 288, 480)

0: 288x480 (no detections), 16.0ms
Speed: 2.0ms preprocess, 16.0ms inference, 0.0ms postprocess per image at shape (1, 3, 288, 480)

0: 288x480 (no detections), 23.0ms
Speed: 2.0ms preprocess, 23.0ms inference, 1.0ms postprocess per image at shape (1, 3, 288, 480)

0: 288x480 (no detections), 17.0ms
Speed: 1.0ms preprocess, 17.0ms