# Read in text file

In [2]:
import pandas as pd

sample_df = pd.read_csv('etg_samples.txt', sep=" ")

# Minimum Requirement: Detect and Track Cars

In [None]:
import cv2
from ultralytics import YOLO

# Load YOLOv8 model
model = YOLO("yolo11n.pt")
model.to('cuda') # comment if no GPU
# Open input video: either 1 min version or full version
cap = cv2.VideoCapture("video_etg_1min.mp4")

# Video properties
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS)

# Output video
fourcc = cv2.VideoWriter_fourcc(*"XVID")
out = cv2.VideoWriter("video_etg_1min_tracked_minimum.mp4", fourcc, fps, (width, height))

CAR_CLASS_ID = 2

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # Run YOLO tracking
    results = model.track(
        frame,
        conf=0.7, # adjustable
        classes=[CAR_CLASS_ID],
        tracker="bytetrack.yaml",
        persist=True
    )

    for r in results:
        if r.boxes.id is None:
            continue

        boxes = r.boxes.xyxy.cpu().numpy()
        track_ids = r.boxes.id.cpu().numpy().astype(int)

        for box, track_id in zip(boxes, track_ids):
            x1, y1, x2, y2 = map(int, box)

            # Draw bounding box
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)

            # Draw object ID (instead of confidence)
            label = f"car ID:{track_id}"
            cv2.putText(
                frame,
                label,
                (x1, y1 - 10),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.6,
                (0, 255, 0),
                2
            )

    out.write(frame)

cap.release()
out.release()
cv2.destroyAllWindows()


# Perform gaze annotation

In [6]:
import pandas as pd

# only keep the first row
df_unique = (
    sample_df
    .sort_values("frame_etg")
    .drop_duplicates(subset="frame_etg", keep="first")
    .dropna(subset=["frame_etg", "X", "Y"])
)

# dictionary containing the frame count and X,Y coordinate only.
gaze_dict = {
    int(row.frame_etg): (int(row.X), int(row.Y))
    for row in df_unique.itertuples()
}


In [7]:
import cv2

cap = cv2.VideoCapture("video_etg_1min_tracked_minimum.mp4")

width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS)

fourcc = cv2.VideoWriter_fourcc(*"XVID")
out = cv2.VideoWriter("video_etg_1min_tracked_with_gaze_minimum.mp4", fourcc, fps, (width, height))

frame_idx = 0  # OpenCV frame counter

def draw_plus(img, center, size=50, thickness=3, color=(0, 0, 255)):
    cx, cy = center
    half = size // 2
    cv2.line(img, (cx - half, cy), (cx + half, cy), color, thickness)
    cv2.line(img, (cx, cy - half), (cx, cy + half), color, thickness)

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # If gaze data exists for this frame
    if frame_idx in gaze_dict:
        x, y = gaze_dict[frame_idx]

        # Draw red circle (BGR: 0,0,255)
        draw_plus(frame, (x, y), size=50, thickness=5, color=(0, 0, 255))

    out.write(frame)
    frame_idx += 1

cap.release()
out.release()
cv2.destroyAllWindows()


OpenCV: FFMPEG: tag 0x44495658/'XVID' is not supported with codec id 12 and format 'mp4 / MP4 (MPEG-4 Part 14)'
OpenCV: FFMPEG: fallback to use tag 0x7634706d/'mp4v'


# fixation calculation over all objects

In [8]:
import pandas as pd

# Keep only fixation rows
df_fix = sample_df[sample_df["event_type"] == "Fixation"]

# Keep first row per frame
df_fix = (
    df_fix
    .sort_values("frame_etg")
    .drop_duplicates(subset="frame_etg", keep="first")
    .dropna(subset=["frame_etg", "X", "Y"])
)

# Frame → gaze point
fixation_dict = {
    int(row.frame_etg): (float(row.X), float(row.Y))
    for row in df_fix.itertuples()
}


In [9]:
import cv2
from ultralytics import YOLO
from collections import defaultdict

model = YOLO("yolo11n.pt")

cap = cv2.VideoCapture("video_etg_1min.mp4")

fps = cap.get(cv2.CAP_PROP_FPS)
FRAME_TIME = 1.0 / fps  

# Accumulators
fixation_time = defaultdict(float)      # track_id → seconds
track_class = {}                        # track_id → class_name

frame_idx = 0


In [10]:
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # Run tracking on all classes
    results = model.track(
        frame,
        persist=True,
        conf=0.7,
        tracker="bytetrack.yaml"
    )

    # No fixation in this frame → skip
    if frame_idx not in fixation_dict:
        frame_idx += 1
        continue

    gaze_x, gaze_y = fixation_dict[frame_idx]

    for r in results:
        if r.boxes.id is None:
            continue

        boxes = r.boxes.xyxy.cpu().numpy()
        track_ids = r.boxes.id.cpu().numpy().astype(int)
        class_ids = r.boxes.cls.cpu().numpy().astype(int)

        for box, track_id, cls_id in zip(boxes, track_ids, class_ids):
            x1, y1, x2, y2 = box

            # Check if fixation lies inside bounding box
            if x1 <= gaze_x <= x2 and y1 <= gaze_y <= y2:
                fixation_time[track_id] += FRAME_TIME
                track_class[track_id] = model.names[cls_id]

    frame_idx += 1

cap.release()



0: 480x640 2 cars, 7.8ms
Speed: 1.7ms preprocess, 7.8ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 cars, 7.3ms
Speed: 2.8ms preprocess, 7.3ms inference, 0.9ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 cars, 6.8ms
Speed: 2.0ms preprocess, 6.8ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 cars, 7.4ms
Speed: 3.2ms preprocess, 7.4ms inference, 0.9ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 cars, 8.5ms
Speed: 2.0ms preprocess, 8.5ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 cars, 7.2ms
Speed: 1.7ms preprocess, 7.2ms inference, 0.9ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 cars, 8.0ms
Speed: 1.6ms preprocess, 8.0ms inference, 1.4ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 cars, 7.1ms
Speed: 1.7ms preprocess, 7.1ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 cars, 10.2

In [11]:
output_df = pd.DataFrame([
    {
        "object_id": track_id,
        "class_name": track_class.get(track_id, "unknown"),
        "total_fixation_time_sec": round(time_sec, 4)
    }
    for track_id, time_sec in fixation_time.items()
])

output_df.to_csv("video_etg_1min_fixation.txt", index=False)

# advanced requirement: instance segmentation with tracking (gaze + RT)

In [13]:
import cv2
import numpy as np
from ultralytics import YOLO

cap = cv2.VideoCapture("video_etg.avi")
assert cap.isOpened(), "Error reading video file"


w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS)
dt = 1.0 / fps

video_writer = cv2.VideoWriter(
    "video_etg_with_real_time_tracking_and_segmentation_advanced.avi",
    cv2.VideoWriter_fourcc(*"mp4v"),
    fps,
    (w, h)
)

model = YOLO("yolo11n-seg.pt")
model.to('cuda')  # comment if no GPU
class_names = model.names


fixation_time = {}  
frame_idx = 0

ALPHA = 0.35
FONT = cv2.FONT_HERSHEY_SIMPLEX
FONT_SCALE = 0.6
FONT_THICKNESS = 2
DASH_LEN = 10
GAP_LEN = 6


COLOR_PALETTE = [
    (0, 255, 0),     # green
    (255, 0, 0),     # blue
    (0, 0, 255),     # red
    (255, 255, 0),   # cyan
    (255, 0, 255),   # magenta
]

NUM_COLORS = len(COLOR_PALETTE)


def draw_dashed_rect(img, pt1, pt2, color, thickness):
    x1, y1 = pt1
    x2, y2 = pt2

    for x in range(x1, x2, DASH_LEN + GAP_LEN):
        cv2.line(img, (x, y1), (min(x + DASH_LEN, x2), y1), color, thickness)
        cv2.line(img, (x, y2), (min(x + DASH_LEN, x2), y2), color, thickness)

    for y in range(y1, y2, DASH_LEN + GAP_LEN):
        cv2.line(img, (x1, y), (x1, min(y + DASH_LEN, y2)), color, thickness)
        cv2.line(img, (x2, y), (x2, min(y + DASH_LEN, y2)), color, thickness)

def draw_plus(img, center, size=50, thickness=3, color=(0, 0, 255)):
    cx, cy = center
    half = size // 2
    cv2.line(img, (cx - half, cy), (cx + half, cy), color, thickness)
    cv2.line(img, (cx, cy - half), (cx, cy + half), color, thickness)


while cap.isOpened():
    success, frame = cap.read()
    if not success:
        break

    results = model.track(
        frame,
        persist=True,
        verbose=False,
        conf=0.7
    )

    annotated = frame.copy()
    overlay = frame.copy()
    r = results[0]

    gaze = fixation_dict.get(frame_idx, None)
    if gaze is not None:
        gx, gy = int(gaze[0]), int(gaze[1])
        draw_plus(
            annotated,
            (gx, gy),
            size=100,
            thickness=3,
            color=(0, 0, 255)
        )

    if r.boxes is not None and r.boxes.id is not None and r.masks is not None:
        boxes = r.boxes.xyxy.cpu().numpy()
        ids = r.boxes.id.cpu().numpy().astype(int)
        clss = r.boxes.cls.cpu().numpy().astype(int)
        polys = r.masks.xy

        for box, obj_id, cls_id, poly in zip(boxes, ids, clss, polys):
            x1, y1, x2, y2 = map(int, box)

            if obj_id not in fixation_time:
                fixation_time[obj_id] = 0.0

            color = COLOR_PALETTE[obj_id % NUM_COLORS]

            if gaze is not None:
                if x1 <= gx <= x2 and y1 <= gy <= y2:
                    fixation_time[obj_id] += dt

            poly_i = poly.astype(np.int32)
            cv2.fillPoly(overlay, [poly_i], color)

            cv2.polylines(
                annotated,
                [poly_i],
                isClosed=True,
                color=color,
                thickness=2
            )


            draw_dashed_rect(
                annotated,
                (x1, y1),
                (x2, y2),
                color,
                thickness=2
            )

            class_name = class_names[cls_id].upper()
            fixation_ms = int(fixation_time[obj_id] * 1000)

            line1 = f"{class_name} ID: {obj_id}"
            line2 = f"{fixation_ms} ms"

            (w1, h1), _ = cv2.getTextSize(line1, FONT, FONT_SCALE, FONT_THICKNESS)
            (w2, h2), _ = cv2.getTextSize(line2, FONT, FONT_SCALE, FONT_THICKNESS)

            box_w = max(w1, w2) + 20
            box_h = h1 + h2 + 20

            cx = (x1 + x2) // 2
            box_x1 = int(cx - box_w / 2)
            box_y1 = max(y1 - box_h - 8, 0)
            box_x2 = box_x1 + box_w
            box_y2 = box_y1 + box_h

            cv2.rectangle(
                annotated,
                (box_x1, box_y1),
                (box_x2, box_y2),
                color,
                thickness=-1
            )

            text_x1 = box_x1 + (box_w - w1) // 2
            text_y1 = box_y1 + h1 + 8

            text_x2 = box_x1 + (box_w - w2) // 2
            text_y2 = text_y1 + h2 + 4

            cv2.putText(
                annotated,
                line1,
                (text_x1, text_y1),
                FONT,
                FONT_SCALE,
                (255, 255, 255),
                FONT_THICKNESS
            )

            cv2.putText(
                annotated,
                line2,
                (text_x2, text_y2),
                FONT,
                FONT_SCALE,
                (255, 255, 255),
                FONT_THICKNESS
            )

    annotated = cv2.addWeighted(
        overlay,
        ALPHA,
        annotated,
        1 - ALPHA,
        0
    )

    video_writer.write(annotated)
    frame_idx += 1

cap.release()
video_writer.release()
cv2.destroyAllWindows()
