02. DETECTIONS TRACKING

In [1]:
import os
os.environ["ONNXRUNTIME_EXECUTION_PROVIDERS"] = "[CUDAExecutionProvider]"

import sys
sys.path.append(os.getenv("PROJECT_PATH"))

from inference import get_model
import supervision as sv

ROBOFLOW_API_KEY = os.getenv("ROBOFLOW_API_KEY")

PLAYER_DETECTION_MODEL_ID = 'football-players-detection-3zvbc/2'
PLAYER_DETECTION_MODEL = get_model(PLAYER_DETECTION_MODEL_ID, ROBOFLOW_API_KEY)

FIELD_DETECTION_MODEL_ID = "football-field-detection-f07vi/14"
FIELD_DETECTION_MODEL = get_model(model_id=FIELD_DETECTION_MODEL_ID, api_key=ROBOFLOW_API_KEY)

SOURCE_VIDEO_PATH = "../../data/videos/new5s.mp4"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#Supervision annotators
from utils.pitchconfig import SoccerPitchConfiguration

CONFIG = SoccerPitchConfiguration()

ellipse_annotator = sv.EllipseAnnotator(
    color=sv.ColorPalette.from_hex(['#00BFFF', '#FF1493', '#FFD700']), #blue, pink, yellow
    thickness=2
)
label_annotator = sv.LabelAnnotator(
    color=sv.ColorPalette.from_hex(['#00BFFF', '#FF1493', '#FFD700']), #blue, pink, yellow
    text_color=sv.Color.from_hex('#000000'),
    text_position=sv.Position.BOTTOM_CENTER
)
triangle_annotator = sv.TriangleAnnotator(
    color=sv.Color.from_hex('#FFD700'),
    base=25,
    height=21,
    outline_thickness=1
)

#Supervision - virtualization
edge_annotator = sv.EdgeAnnotator(
    color=sv.Color.from_hex('#00BFFF'),
    thickness=2, edges=CONFIG.edges)
vertex_annotator = sv.VertexAnnotator(
    color=sv.Color.from_hex('#FF1493'),
    radius=8)
vertex_annotator_2 = sv.VertexAnnotator(
    color=sv.Color.from_hex('#00BFFF'),
    radius=8)

In [3]:
#Team Assignment based on colours
#use of the SigLIP, UMAP, and KMeans combo
import supervision as sv
from tqdm import tqdm
from utils.teamclassifier import TeamClassifier

PLAYER_ID = 2
STRIDE = 60
frame_generator = sv.get_video_frames_generator(
    source_path=SOURCE_VIDEO_PATH, stride=STRIDE
)

crops = []
for frame in tqdm(frame_generator, desc="collecting crops"):
    result = PLAYER_DETECTION_MODEL.infer(frame, confidence=0.3)[0]
    detections = sv.Detections.from_inference(result)
    players_detections = detections[detections.class_id == PLAYER_ID]
    
    # Skip the frame if no players are detected
    if len(players_detections.xyxy) == 0:
        continue
    
    players_crops = [sv.crop_image(frame, xyxy) for xyxy in players_detections.xyxy]
    crops += players_crops

# Ensure there are enough crops for clustering
if len(crops) < 2:
    print("Not enough player crops detected. Skipping team classification.")
else:
    team_classifier = TeamClassifier(device="cpu")
    team_classifier.fit(crops)
    
print(f"Number of players detected: {len(crops)}")

collecting crops: 3it [00:03,  1.32s/it]
Embedding extraction: 2it [00:21, 10.55s/it]


Number of players detected: 61


In [4]:
#GK Assignment - calculate avg centroid of players and assign GK to the team with the closest centroid
import numpy as np
import supervision as sv

def resolve_goalkeepers_team_id(
    players: sv.Detections,
    goalkeepers: sv.Detections
) -> np.ndarray:
    goalkeepers_xy = goalkeepers.get_anchors_coordinates(sv.Position.BOTTOM_CENTER)
    players_xy = players.get_anchors_coordinates(sv.Position.BOTTOM_CENTER)
    team_0_centroid = players_xy[players.class_id == 0].mean(axis=0)
    team_1_centroid = players_xy[players.class_id == 1].mean(axis=0)
    goalkeepers_team_id = []
    for goalkeeper_xy in goalkeepers_xy:
        dist_0 = np.linalg.norm(goalkeeper_xy - team_0_centroid)
        dist_1 = np.linalg.norm(goalkeeper_xy - team_1_centroid)
        goalkeepers_team_id.append(0 if dist_0 < dist_1 else 1)

    return np.array(goalkeepers_team_id)

In [5]:
#Frame collection for training
from tqdm import tqdm

frames = []

for frame_index, frame in enumerate(tqdm(sv.get_video_frames_generator(SOURCE_VIDEO_PATH), desc="Processing...")):
    if frame_index % 5 == 0: #default: 5
        frames.append(frame)

print(f"Total frames collected: {len(frames)}")

Processing...: 152it [00:01, 89.22it/s]


Total frames collected: 31


In [7]:
#Full tracking
import supervision as sv
from utils.viewtransformer import ViewTransformer
from utils.drawpitch import draw_pitch, draw_points_on_pitch

BALL_ID = 0
PLAYER_ID = 2
GOALKEEPER_ID = 1

pitch_frames = []

ball_coords = []
gk_t1_coords = []
gk_t2_coords = []

player_coords_per_id = dict()
player_team_per_id = {}

# Tracker
tracker = sv.ByteTrack()

for frame_index, frame in enumerate(tqdm(frames, desc="Processing...")):
    tqdm.write(f"... frame {frame_index}: ")
    
    # ball and players detections
    result = PLAYER_DETECTION_MODEL.infer(frame, confidence=0.2)[0]
    detections = sv.Detections.from_inference(result)

    ball_detections = detections[detections.class_id == BALL_ID]
    ball_detections.xyxy = sv.pad_boxes(xyxy=ball_detections.xyxy, px=10)
    
    all_detections = detections[detections.class_id != BALL_ID]
    all_detections = all_detections.with_nms(threshold=0.5, class_agnostic=True)
    all_detections = tracker.update_with_detections(detections=all_detections)

    goalkeepers_detections = all_detections[all_detections.class_id == GOALKEEPER_ID]
    players_detections = all_detections[all_detections.class_id == PLAYER_ID]

    # team assignment
    players_crops = [sv.crop_image(frame, xyxy) for xyxy in players_detections.xyxy]
    players_detections.class_id = team_classifier.predict(players_crops)

    if len(goalkeepers_detections.xyxy) > 0:
        goalkeepers_detections.class_id = resolve_goalkeepers_team_id(
            players_detections, goalkeepers_detections)
    else:
        #if no gk detected no need to resolve team id
        goalkeepers_detections.class_id = np.array([])
    
    all_detections = sv.Detections.merge([players_detections, goalkeepers_detections])

    # frame visualization
    labels = []
    for idx, tracker_id in enumerate(all_detections.tracker_id):
        class_id = all_detections.class_id[idx]
        if tracker_id in player_team_per_id:
            team = player_team_per_id[tracker_id]
        else:
            team = class_id
        labels.append(f"id_{tracker_id}_t_{team}")

    all_detections.class_id = all_detections.class_id.astype(int)

    annotated_frame = frame.copy()
    annotated_frame = ellipse_annotator.annotate(
        scene=annotated_frame,
        detections=all_detections)
    annotated_frame = label_annotator.annotate(
        scene=annotated_frame,
        detections=all_detections,
        labels=labels)
    annotated_frame = triangle_annotator.annotate(
        scene=annotated_frame,
        detections=ball_detections)

    pitch_frames.append(annotated_frame.copy())
    
    players_detections = sv.Detections.merge([
        players_detections, goalkeepers_detections
    ])
    
    # Detect pitch key points
    result = FIELD_DETECTION_MODEL.infer(frame, confidence=0.3)[0]
    key_points = sv.KeyPoints.from_inference(result)
    
    # Ensure key_points is not None and contains valid data
    if key_points is None or key_points.confidence is None:
        print(f"Skipping frame {frame_index} due to missing key points.")
        continue  # Skip this frame and move to the next one

    # project ball, players and gk on pitch
    filter = key_points.confidence[0] > 0.5
    frame_reference_points = key_points.xy[0][filter]
    pitch_reference_points = np.array(CONFIG.vertices)[filter]

    # Ensure there are at least 4 points for homography calculation
    if len(frame_reference_points) < 4 or len(pitch_reference_points) < 4:
        print(f"Skipping frame due to insufficient keypoints: {len(frame_reference_points)} found.")
        continue  # Skip this frame and move to the next one

    # Proceed with homography calculation
    transformer = ViewTransformer(
        source = frame_reference_points,
        target = pitch_reference_points
    )

    frame_ball_xy = ball_detections.get_anchors_coordinates(sv.Position.BOTTOM_CENTER)
    pitch_ball_xy = transformer.transform_points(points=frame_ball_xy)

    players_xy = players_detections.get_anchors_coordinates(sv.Position.BOTTOM_CENTER)
    pitch_players_xy = transformer.transform_points(points=players_xy)
    
    # --- Save player coordinates per ID in pitch space ---
    for idx, tracker_id in enumerate(players_detections.tracker_id):
        if tracker_id is not None:
            if tracker_id not in player_coords_per_id:
                player_coords_per_id[tracker_id] = []
                # Save the team assignment for this tracker_id
                player_team_per_id[tracker_id] = int(players_detections.class_id[idx])
            x, y = pitch_players_xy[idx]
            player_coords_per_id[tracker_id].append((frame_index, x, y))

    # --- Save ball coordinates in pitch space ---
    if len(pitch_ball_xy) > 0:
        x, y = pitch_ball_xy[0]
        ball_coords.append((frame_index, x, y))
    else:
        ball_coords.append((frame_index, None, None))

    # --- Save goalkeeper coordinates for each team in pitch space ---
    # Find goalkeepers in players_detections (merged with goalkeepers)
    gk_indices = np.where(players_detections.class_id == GOALKEEPER_ID)[0]
    for idx in gk_indices:
        x, y = pitch_players_xy[idx]
        # Use the original team assignment for GKs
        team_id = resolve_goalkeepers_team_id(players_detections, players_detections[idx:idx+1])[0]
        if team_id == 0:
            gk_t1_coords.append((frame_index, x, y))
        elif team_id == 1:
            gk_t2_coords.append((frame_index, x, y)) 
    
    if frame_index > 30: #bytetrack memory is 30 frames
        tracker.reset()

                                                     

... frame 0: 


Processing...:   0%|          | 0/31 [00:00<?, ?it/s]
Embedding extraction: 0it [00:00, ?it/s]
Embedding extraction: 1it [00:08,  8.74s/it]
                                                             

... frame 1: 


Processing...:   3%|▎         | 1/31 [00:18<09:28, 18.95s/it]
Embedding extraction: 0it [00:00, ?it/s]
Embedding extraction: 1it [00:07,  7.38s/it]
                                                             

... frame 2: 


Processing...:   6%|▋         | 2/31 [00:30<07:03, 14.61s/it]
Embedding extraction: 0it [00:00, ?it/s]
Embedding extraction: 1it [00:06,  6.95s/it]
                                                             

... frame 3: 


Processing...:  10%|▉         | 3/31 [00:41<05:57, 12.78s/it]
Embedding extraction: 0it [00:00, ?it/s]
Embedding extraction: 1it [00:06,  6.26s/it]
                                                             

... frame 4: 


Processing...:  13%|█▎        | 4/31 [00:50<05:11, 11.54s/it]
Embedding extraction: 0it [00:00, ?it/s]
Embedding extraction: 1it [00:06,  6.20s/it]
                                                             

... frame 5: 


Processing...:  16%|█▌        | 5/31 [01:00<04:39, 10.73s/it]
Embedding extraction: 0it [00:00, ?it/s]
Embedding extraction: 1it [00:06,  6.20s/it]
                                                             

... frame 6: 


Processing...:  19%|█▉        | 6/31 [01:09<04:16, 10.24s/it]
Embedding extraction: 0it [00:00, ?it/s]
Embedding extraction: 1it [00:06,  6.03s/it]
                                                             

... frame 7: 


Processing...:  23%|██▎       | 7/31 [01:18<03:59,  9.96s/it]
Embedding extraction: 0it [00:00, ?it/s]
Embedding extraction: 1it [00:06,  6.35s/it]
                                                             

... frame 8: 


Processing...:  26%|██▌       | 8/31 [01:28<03:46,  9.83s/it]
Embedding extraction: 0it [00:00, ?it/s]
Embedding extraction: 1it [00:06,  6.21s/it]
                                                             

... frame 9: 


Processing...:  29%|██▉       | 9/31 [01:37<03:33,  9.73s/it]
Embedding extraction: 0it [00:00, ?it/s]
Embedding extraction: 1it [00:06,  6.76s/it]
                                                              

... frame 10: 


Processing...:  32%|███▏      | 10/31 [01:47<03:26,  9.84s/it]
Embedding extraction: 0it [00:00, ?it/s]
Embedding extraction: 1it [00:05,  5.55s/it]
                                                              

... frame 11: 


Processing...:  35%|███▌      | 11/31 [01:57<03:12,  9.63s/it]
Embedding extraction: 0it [00:00, ?it/s]
Embedding extraction: 1it [00:06,  6.47s/it]
                                                              

... frame 12: 


Processing...:  39%|███▊      | 12/31 [02:07<03:08,  9.95s/it]
Embedding extraction: 0it [00:00, ?it/s]
Embedding extraction: 1it [00:05,  5.97s/it]
                                                              

... frame 13: 


Processing...:  42%|████▏     | 13/31 [02:18<03:00, 10.05s/it]
Embedding extraction: 0it [00:00, ?it/s]
Embedding extraction: 1it [00:07,  7.89s/it]
                                                              

... frame 14: 


Processing...:  45%|████▌     | 14/31 [02:30<03:03, 10.78s/it]
Embedding extraction: 0it [00:00, ?it/s]
Embedding extraction: 1it [00:07,  7.70s/it]
                                                              

... frame 15: 


Processing...:  48%|████▊     | 15/31 [02:42<03:00, 11.29s/it]
Embedding extraction: 0it [00:00, ?it/s]
Embedding extraction: 1it [00:06,  6.06s/it]
                                                              

... frame 16: 


Processing...:  52%|█████▏    | 16/31 [02:52<02:42, 10.80s/it]
Embedding extraction: 0it [00:00, ?it/s]
Embedding extraction: 1it [00:06,  6.25s/it]
                                                              

... frame 17: 


Processing...:  55%|█████▍    | 17/31 [03:02<02:26, 10.43s/it]
Embedding extraction: 0it [00:00, ?it/s]
Embedding extraction: 1it [00:06,  7.00s/it]
                                                              

... frame 18: 


Processing...:  58%|█████▊    | 18/31 [03:13<02:17, 10.60s/it]
Embedding extraction: 0it [00:00, ?it/s]
Embedding extraction: 1it [00:07,  7.12s/it]
                                                              

... frame 19: 


Processing...:  61%|██████▏   | 19/31 [03:24<02:08, 10.72s/it]
Embedding extraction: 0it [00:00, ?it/s]
Embedding extraction: 1it [00:06,  6.09s/it]
                                                              

... frame 20: 


Processing...:  65%|██████▍   | 20/31 [03:34<01:56, 10.58s/it]
Embedding extraction: 0it [00:00, ?it/s]
Embedding extraction: 1it [00:06,  6.72s/it]
                                                              

... frame 21: 


Processing...:  68%|██████▊   | 21/31 [03:44<01:45, 10.54s/it]
Embedding extraction: 0it [00:00, ?it/s]
Embedding extraction: 1it [00:05,  5.72s/it]
                                                              

... frame 22: 


Processing...:  71%|███████   | 22/31 [03:53<01:31, 10.11s/it]
Embedding extraction: 0it [00:00, ?it/s]
Embedding extraction: 1it [00:06,  6.33s/it]
                                                              

... frame 23: 


Processing...:  74%|███████▍  | 23/31 [04:04<01:21, 10.23s/it]
Embedding extraction: 0it [00:00, ?it/s]
Embedding extraction: 1it [00:08,  8.30s/it]
                                                              

... frame 24: 


Processing...:  77%|███████▋  | 24/31 [04:18<01:19, 11.37s/it]
Embedding extraction: 0it [00:00, ?it/s]
Embedding extraction: 1it [00:07,  7.93s/it]
                                                              

... frame 25: 


Processing...:  81%|████████  | 25/31 [04:32<01:12, 12.16s/it]
Embedding extraction: 0it [00:00, ?it/s]
Embedding extraction: 1it [00:09,  9.44s/it]
                                                              

... frame 26: 


Processing...:  84%|████████▍ | 26/31 [04:47<01:05, 13.13s/it]
Embedding extraction: 0it [00:00, ?it/s]
Embedding extraction: 1it [00:14, 14.36s/it]
                                                              

... frame 27: 


Processing...:  87%|████████▋ | 27/31 [05:09<01:02, 15.54s/it]
Embedding extraction: 0it [00:00, ?it/s]
Embedding extraction: 1it [00:11, 11.77s/it]
                                                              

... frame 28: 


Processing...:  90%|█████████ | 28/31 [05:27<00:49, 16.46s/it]
Embedding extraction: 0it [00:00, ?it/s]
Embedding extraction: 1it [00:07,  7.61s/it]
                                                              

... frame 29: 


Processing...:  94%|█████████▎| 29/31 [05:40<00:30, 15.48s/it]
Embedding extraction: 0it [00:00, ?it/s]
Embedding extraction: 1it [00:06,  6.82s/it]
                                                              

... frame 30: 


Processing...:  97%|█████████▋| 30/31 [05:51<00:13, 13.88s/it]
Embedding extraction: 0it [00:00, ?it/s]
Embedding extraction: 1it [00:05,  5.52s/it]
Processing...: 100%|██████████| 31/31 [05:59<00:00, 11.60s/it]


In [None]:
#Save frames with detections to folder
import cv2
import numpy as np
import os

video_name = os.path.splitext(os.path.basename(SOURCE_VIDEO_PATH))[0]

frames_len = len(frames)

output_dir = f"../../data/frames/{video_name}-{frames_len}f"
os.makedirs(output_dir, exist_ok=True)

for idx, frame in enumerate(pitch_frames):
    frame_path = os.path.join(output_dir, f"frame_{idx:04d}.png")
    
    cv2.imwrite(frame_path,np.array(frame))

In [None]:
#Save coordinates per frame to CSV
import csv
import numpy as np
import os

video_name = os.path.splitext(os.path.basename(SOURCE_VIDEO_PATH))[0]

output_csv_path = f"../../data/coords/{video_name}-{frames_len}f.csv" 
all_ids = sorted(player_coords_per_id.keys())

# Build lookups
frame_lookup = {}
for tracker_id, coords in player_coords_per_id.items():
    for frame_index, x, y in coords:
        if frame_index not in frame_lookup:
            frame_lookup[frame_index] = {}
        frame_lookup[frame_index][tracker_id] = [x, y]

ball_lookup = {f: [x, y] for f, x, y in ball_coords}

header = (
    ["frame_index", "ball"] +
    [f"id_{id}_team_{player_team_per_id[id]}" for id in all_ids]
)

all_frames = sorted(set(
    list(frame_lookup.keys()) +
    list(ball_lookup.keys())
))

with open(output_csv_path, mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(header)
    for frame_index in all_frames:
        row = [frame_index]
        # Ball
        ball = ball_lookup.get(frame_index, None)
        if ball and None not in ball:
            row.append(f"{ball[0]:.2f},{ball[1]:.2f}")
        else:
            row.append("")
        # Players by id
        for id in all_ids:
            coord = frame_lookup.get(frame_index, {}).get(id, None)
            if coord and None not in coord:
                row.append(f"{coord[0]:.2f},{coord[1]:.2f}")
            else:
                row.append("")
        writer.writerow(row)

print(f"Coordinates per frame saved to {output_csv_path}")

Coordinates per frame saved to ../../data/coords/new5s-31f.csv
