# Problem Statement: Automated Object Detection and Cropping in Video Files

### Objective: Develop a solution that processes video files in .mp4 format to automatically detect objects in each frame and save cropped images of these objects into a designated folder. The system should handle videos that are at least 5 to 10 minutes long and ensure that all detected objects are saved correctly and organized in a folder structure that allows easy comparison with the original input video.

In [None]:
!pip install ultralytics



In [None]:
from ultralytics import YOLO

In [None]:
# Destination path on Google Drive where the YOLOv8 weights file is stored.
# NOTE: the path lives under /content/drive, so Google Drive must already be
# mounted (see the drive.mount cell) before anything is written to it.
weights_path = '/content/drive/MyDrive/Intern assignment/Object detection/YOLO/yolov8s.pt'

In [None]:
# Load the pretrained YOLOv8 small model by name; this cell only loads it,
# the weights are written to Drive in the following cell.
model = YOLO('yolov8s.pt')  # yolov8s.pt is downloaded when this runs (see cell output)

Downloading https://github.com/ultralytics/assets/releases/download/v8.2.0/yolov8s.pt to 'yolov8s.pt'...


100%|██████████| 21.5M/21.5M [00:00<00:00, 56.2MB/s]


In [None]:
# Write the loaded model's weights to weights_path on Drive, then print the
# destination as confirmation.
model.save(weights_path)
print(f"YOLOv8 weights saved to {weights_path}")

YOLOv8 weights saved to /content/drive/MyDrive/Intern assignment/Object detection/YOLO/yolov8s.pt


### Import libraries

In [None]:
import cv2
import os
import pandas
from ultralytics import YOLO
from google.colab import drive

### Mount drive

In [None]:
# Mount Google Drive at /content/drive so the weights, input video and output
# folder paths used in this notebook are reachable.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Define the Drive paths used by the detection cells below
# weights_path:  YOLOv8 weights file loaded into the model
# video_path:    input .mp4 video to sample frames from
# output_folder: directory that receives the cropped objects and annotated frames
weights_path = '/content/drive/MyDrive/Intern assignment/Object detection/YOLO/yolov8s.pt'
video_path = '/content/drive/MyDrive/Intern assignment/Object detection/supermarkets.mp4'
output_folder = '/content/drive/MyDrive/Intern assignment/Object detection/cropped_objects'

In [None]:
# Load the YOLOv8 model from the weights file at weights_path; this
# module-level `model` is the detector used by save_cropped_objects.
model = YOLO(weights_path)

In [None]:
# Function to save cropped objects from video
def save_cropped_objects(video_path, output_folder, num_frames=10):
    """Sample frames from a video, detect objects, and save crops and annotated frames.

    Up to ``num_frames`` frames are taken at evenly spaced positions across the
    video and passed through the module-level ``model``. For every detection
    the corresponding region is cropped from the *unannotated* frame and
    written to ``output_folder`` as
    ``frame_<n>_object_<class_id>_<class_name>_<k>.jpg``, where ``<k>`` numbers
    the saved objects within the frame so that several objects of the same
    class do not overwrite one another. A copy of the frame with every bounding
    box and label drawn on it is written as ``frame_<n>_with_boxes.jpg``.

    Args:
        video_path: Path of the video file to read.
        output_folder: Directory that receives the images; created if missing.
        num_frames: Number of frames to sample. Capped at the number of frames
            the video reports; a value of zero or less samples nothing.

    Raises:
        OSError: If the video cannot be opened or reports no frames.
    """
    os.makedirs(output_folder, exist_ok=True)

    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            raise OSError(f'Could not open video: {video_path}')

        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if frame_count <= 0:
            raise OSError(f'Video reports no frames: {video_path}')

        # Never ask for more samples than there are frames; otherwise the same
        # frame index would be produced (and processed) several times.
        num_frames = min(num_frames, frame_count)
        frame_indices = [int(i * frame_count / num_frames) for i in range(num_frames)]

        color = (0, 255, 0)  # Green (BGR)

        for idx, frame_idx in enumerate(frame_indices):
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
            ret, frame = cap.read()
            if not ret:
                break

            # Draw on a copy so the crops taken from `frame` stay free of
            # rectangles and label text.
            annotated = frame.copy()
            height, width = frame.shape[:2]

            # Perform object detection
            results = model(frame)

            object_idx = 0
            for result in results:
                for detection in result.boxes:
                    # Convert tensors to native Python data types
                    xmin, ymin, xmax, ymax = map(int, detection.xyxy.tolist()[0])
                    conf = float(detection.conf.item())
                    class_id = int(detection.cls.item())
                    class_name = model.names[class_id]

                    # Draw bounding box and label on the annotated copy
                    cv2.rectangle(annotated, (xmin, ymin), (xmax, ymax), color, 2)
                    label = f'{class_name} {conf:.2f}'
                    cv2.putText(annotated, label, (xmin, ymin - 10),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

                    # Clamp the box to the frame: a negative coordinate would
                    # wrap around in NumPy slicing and select the wrong region.
                    left, top = max(xmin, 0), max(ymin, 0)
                    right, bottom = min(xmax, width), min(ymax, height)
                    if right <= left or bottom <= top:
                        # Empty region; an empty array cannot be written as an image.
                        continue

                    # Save cropped image under a name unique within the frame
                    object_idx += 1
                    cropped_img = frame[top:bottom, left:right]
                    output_path = os.path.join(
                        output_folder,
                        f'frame_{idx+1}_object_{class_id}_{class_name}_{object_idx}.jpg',
                    )
                    cv2.imwrite(output_path, cropped_img)

            # Save the frame with bounding boxes and labels
            frame_with_boxes_path = os.path.join(output_folder, f'frame_{idx+1}_with_boxes.jpg')
            cv2.imwrite(frame_with_boxes_path, annotated)
    finally:
        # Release the capture even if detection or writing raised.
        cap.release()

In [None]:
# Sample 10 frames from the input video, run detection on each, and write the
# cropped objects plus the frames with bounding boxes to output_folder.
save_cropped_objects(video_path, output_folder, num_frames=10)


0: 384x640 1 person, 1 cup, 458.5ms
Speed: 4.3ms preprocess, 458.5ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 bottle, 2 cups, 1 hot dog, 1 dining table, 398.3ms
Speed: 4.0ms preprocess, 398.3ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 4 oranges, 627.9ms
Speed: 10.2ms preprocess, 627.9ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 bowls, 2 apples, 3 oranges, 1 carrot, 411.0ms
Speed: 4.1ms preprocess, 411.0ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 7 apples, 403.5ms
Speed: 4.9ms preprocess, 403.5ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 2 cups, 450.2ms
Speed: 5.7ms preprocess, 450.2ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 1 bottle, 6 cups, 2 chairs, 432.9ms
Speed: 6.6ms preprocess, 432.9ms inference, 1.5ms postproce