In [2]:
import cv2
import pandas as pd
from ultralytics import YOLO
import json

# Initialize the YOLO model
model = YOLO('yolov8s.pt')

# Load the class names
class_list = model.names

# Open the video file
cap = cv2.VideoCapture('test_videos/6.mp4')

# Initialize video writer for saving the output video
output_video_path = 'output_video.mp4'
fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # Codec for the output video
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS)  # Get the video's FPS
out = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))

# Initialize variables for tracking
tracking_timestamps = {}

# GCP-like JSON structure
gcp_json = {
    "annotationResults": [
        {
            "segment": {
                "startTimeOffset": "0s",
                "endTimeOffset": f"{int(cap.get(cv2.CAP_PROP_FRAME_COUNT) / fps)}s"
            },
            "labelAnnotations": [],
            "objectAnnotations": []
        }
    ]
}

frame_number = 0

while True:
    ret, frame = cap.read()
    if not ret:
        break

    frame_number += 1
    timestamp = frame_number / fps  # Calculate the timestamp for each frame

    results = model.predict(frame)

    if results[0].boxes.data is not None:
        a = results[0].boxes.data.detach().cpu().numpy()
        px = pd.DataFrame(a)

        for index, row in px.iterrows():
            x1, y1, x2, y2, conf, d = row
            if conf > 0.5:  # Skip low-confidence detections
                c = class_list[int(d)]
                id = int(index)  # Use the index as a unique ID for this iteration

                # Update timestamps for the detected class
                if id not in tracking_timestamps:
                    tracking_timestamps[id] = {
                        "class": c,
                        "start_time": timestamp,
                        "last_time": timestamp
                    }
                else:
                    tracking_timestamps[id]["last_time"] = timestamp

                # Append to the GCP-like JSON structure
                gcp_json["annotationResults"][0]["objectAnnotations"].append({
                    "entity": {
                        "description": c,
                    },
                    "segment": {
                        "startTimeOffset": f"{tracking_timestamps[id]['start_time']:.2f}s",
                        "endTimeOffset": f"{tracking_timestamps[id]['last_time']:.2f}s"
                    },
                    "confidence": float(conf),
                    "boundingBox": {
                        "normalizedVertices": [
                            {"x": float(x1) / frame.shape[1], "y": float(y1) / frame.shape[0]},
                            {"x": float(x2) / frame.shape[1], "y": float(y2) / frame.shape[0]}
                        ]
                    },
                    "trackId": id
                })

                # Draw bounding box (optional, can be removed if not needed)
                cv2.rectangle(frame, (int(x1), int(y1)), (int(x2), int(y2)), (0, 255, 0), 2)
                cv2.putText(frame, c, (int(x1), int(y1) - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

    # Write the frame to the output video
    out.write(frame)

    # Show the frame (optional, can be removed if not needed)
    # cv2.imshow("frames", frame)
    # if cv2.waitKey(1) & 0xFF == 27:
    #     break

# Release video capture and writer and close windows
cap.release()
out.release()  # Save the output video
cv2.destroyAllWindows()

# Save the GCP-like JSON output to a file (clears the file first)
with open('gcp_style_output.json', 'w') as f:
    f.truncate(0)  # Clear the file's contents before writing new data
    json.dump(gcp_json, f, indent=4)

print("GCP-style JSON has been saved to 'gcp_style_output.json'")
print(f"Output video saved as '{output_video_path}'")



0: 384x640 13 cars, 3 trucks, 1 traffic light, 10.0ms
Speed: 2.0ms preprocess, 10.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 14 cars, 3 trucks, 1 traffic light, 10.5ms
Speed: 2.0ms preprocess, 10.5ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 14 cars, 3 trucks, 1 traffic light, 9.8ms
Speed: 2.3ms preprocess, 9.8ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 13 cars, 3 trucks, 1 traffic light, 8.8ms
Speed: 3.4ms preprocess, 8.8ms inference, 2.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 13 cars, 1 bus, 2 trucks, 1 traffic light, 9.9ms
Speed: 2.0ms preprocess, 9.9ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 13 cars, 4 trucks, 1 traffic light, 9.1ms
Speed: 1.4ms preprocess, 9.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 12 cars, 1 bus, 3 trucks, 1 traffic light, 9.0ms
Speed: 2.0ms preprocess, 9.0