In [1]:
from simulation.webots.controllers.ardupilot_vehicle_controller.drone_data import DroneData
from mavlink.mavlink.data import Attitude, GlobalPosition, Quaternion, Gimbal


def get_objects(data):
    """Turn one raw log record into typed telemetry objects.

    Args:
        data: Mapping holding the entries ``'gimbal'``, ``'global_position'``,
            ``'attitude'`` and ``'webots_data'``. The first three are
            themselves mappings indexed by the field names listed below.

    Returns:
        dict: The keys ``"gimbal"``, ``"global_position"``, ``"attitude"``
        and ``"webots"`` mapped to a ``Gimbal``, a ``GlobalPosition``, an
        ``Attitude`` and the result of ``DroneData.from_json`` respectively.

    Raises:
        KeyError: When ``data`` is a plain dict and an expected key is absent.
    """
    # Gimbal: its orientation is wrapped in a Quaternion before being attached.
    gimbal_raw = data['gimbal']
    orientation = Quaternion(gimbal_raw['quaternion'])
    gimbal_obj = Gimbal(
        timestamp=gimbal_raw['timestamp'],
        flags=gimbal_raw['flags'],
        quaternion=orientation,
    )

    # Global position: every constructor argument is copied from the
    # same-named entry of the record.
    position_fields = (
        'timestamp', 'latitude', 'longitude', 'altitude',
        'relative_altitude', 'vx', 'vy', 'vz', 'heading',
    )
    position_raw = data['global_position']
    position_obj = GlobalPosition(
        **{field: position_raw[field] for field in position_fields}
    )

    # Attitude: same one-to-one copy of fields.
    attitude_fields = (
        'timestamp', 'roll', 'pitch', 'yaw',
        'roll_speed', 'pitch_speed', 'yaw_speed',
    )
    attitude_raw = data['attitude']
    attitude_obj = Attitude(
        **{field: attitude_raw[field] for field in attitude_fields}
    )

    return {
        "gimbal": gimbal_obj,
        "global_position": position_obj,
        "attitude": attitude_obj,
        "webots": DroneData.from_json(data['webots_data']),
    }

In [2]:
import json

# Path of the recorded log to load. The parsed value is later iterated with
# .items() and its keys are passed to int(), so it is presumably a JSON object
# keyed by frame index - verify against the recorder.
file_name = "data.json"

# Decode explicitly as UTF-8 (the standard JSON interchange encoding) rather
# than relying on the platform's locale-dependent default text encoding.
with open(file_name, "r", encoding="utf-8") as file:
    data = json.load(file)

In [6]:
from ultralytics import YOLO


# Detector built from the "yolov8n-visdrone.pt" checkpoint, with the
# constructor's verbose flag switched off.
model = YOLO(
    "yolov8n-visdrone.pt",
    verbose=False,
)

In [4]:
import random
from deep_sort.deep_sort.tracker import Tracker
from deep_sort.deep_sort.deep.extractor import Extractor
from deep_sort.deep_sort.deep.weights import RESNET18_WEIGHTS
from deep_sort.deep_sort.deep.configuration import ResNetConfiguration

# ResNet-18 configuration handed to the feature extractor.
# NOTE(review): use_cuda=True is hard-coded; presumably this needs a CUDA
# device to be available - confirm how Extractor behaves without one.
resnet = ResNetConfiguration(base="resnet18", weights_path=RESNET18_WEIGHTS, use_cuda=True)

# Extractor built on that configuration, with batch_size=4.
extractor = Extractor(batch_size=4, model=resnet)

# Tracker that receives the extractor as its feature_extractor.
tracker = Tracker(feature_extractor=extractor)

# Palette of 10 random 3-channel colours, each channel an int in [0, 255].
# The values come from a dedicated, seeded generator so the palette is the
# same after every kernel restart and the module-level `random` state is
# left untouched.
_color_rng = random.Random(0)
colors = [
    tuple(_color_rng.randint(0, 255) for _ in range(3))
    for _ in range(10)
]

In [7]:
import cv2

# Run the detector on every recorded frame and write the annotated frames to
# an XVID-encoded AVI file.
# NOTE(review): the writer is fixed at 30 fps and a 640x480 frame size;
# presumably frames of any other size are not written correctly - confirm
# against the camera resolution.
fourcc = cv2.VideoWriter_fourcc(*'XVID')
out = cv2.VideoWriter("yolo_result.avi", fourcc, 30, (640, 480))

# Fail fast: a writer that could not be opened would otherwise drop every
# frame silently after the whole (slow) inference loop has run.
if not out.isOpened():
    raise RuntimeError(
        "Could not open 'yolo_result.avi' for writing (is the XVID codec available?)"
    )

try:
    for key, value in data.items():
        # Camera image stored with this record.
        frame = get_objects(value)["webots"].camera.frame

        # Single-image inference on the CPU; predict() returns a list, of
        # which only the first result is used.
        result = model.predict(
            source=frame,
            imgsz=frame.shape[:2],
            classes=None,
            conf=0.1,
            iou=0.5,
            max_det=10,
            augment=False,
            agnostic_nms=True,
            device="cpu",
            half=False
        )[0]

        # One record per detected box: the four xyxy values followed by
        # confidence and class id. The list is rebuilt for every frame and
        # is not consumed inside this cell.
        detections = []
        boxes = result.boxes
        for i in range(boxes.shape[0]):
            class_id = boxes.cls[i].int().item()
            confidence = boxes.conf[i].float().item()
            xyxy = boxes.xyxy[i].int().tolist()

            record = xyxy + [confidence, class_id]

            detections.append(record)

        # Frame with the detections drawn on it, appended to the video.
        plot = result.plot()
        out.write(plot)

        print(f"Frame {int(key) + 1} out of {len(data)}")
finally:
    # Always release the writer - also on an exception or a manual
    # interrupt of this long-running loop - so the file is closed.
    out.release()


0: 480x640 2 pedestrians, 1 car, 223.7ms
Speed: 5.0ms preprocess, 223.7ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)
Frame 1 out of 5351

0: 480x640 2 pedestrians, 1 car, 193.1ms
Speed: 3.0ms preprocess, 193.1ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)
Frame 2 out of 5351

0: 480x640 2 pedestrians, 1 car, 183.7ms
Speed: 2.0ms preprocess, 183.7ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)
Frame 3 out of 5351

0: 480x640 2 pedestrians, 1 car, 209.9ms
Speed: 3.0ms preprocess, 209.9ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)
Frame 4 out of 5351

0: 480x640 2 pedestrians, 1 car, 204.3ms
Speed: 3.0ms preprocess, 204.3ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)
Frame 5 out of 5351

0: 480x640 2 pedestrians, 1 car, 233.8ms
Speed: 5.5ms preprocess, 233.8ms inference, 3.0ms postprocess per image at shape (1, 3, 480, 640)
Frame 6 out of 5351

0: 480x640 2 pedestrians, 1 car, 234.3m