# 1. Setup and Imports


In [None]:
# Install the notebook's third-party dependencies (-q keeps pip's output quiet).
!pip install -q gdown inference-gpu supervision umap-learn torch transformers

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.1/50.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m752.5/752.5 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m148.1/148.1 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m906.0/906.0 kB[0m [31m36.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.9/45.9 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.7/135.7 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m63.5 MB/s[0m eta [36m0:

In [None]:
import supervision as sv
from tqdm import tqdm
from google.colab import drive
import os
from inference import get_model
import torch
from transformers import AutoProcessor, SiglipVisionModel
from PIL import Image
import umap.umap_ as umap
from sklearn.cluster import KMeans
import numpy as np
import json
import cv2



In [None]:
# Mount Google Drive at /content/drive so later cells can read files stored
# there (the config file below is read from under this mount point).
drive.mount('/content/drive')

In [None]:
# Read the JSON config file from Google Drive. Later cells read the keys
# 'roboflow_api_key' and 'player_tracking_model_id' from it.
# The encoding is given explicitly so parsing does not depend on the
# platform's default text encoding.
with open('/content/drive/MyDrive/path_to_your_config.json', 'r', encoding='utf-8') as config_file:
    config = json.load(config_file)

In [None]:
# Export the Roboflow credentials and the ONNX Runtime execution-provider
# setting as environment variables (presumably read by the `inference`
# package -- confirm against its documentation).
os.environ.update({
    "ROBOFLOW_API_KEY": config['roboflow_api_key'],
    "ONNXRUNTIME_EXECUTION_PROVIDERS": "[CUDAExecutionProvider]",
})


In [None]:
# File name of the input video; the detection cell joins it onto a
# Google Drive directory to build the full path.
input_video_path = '08fd33_4_short.mp4'

In [None]:
# Check GPU availability by printing the nvidia-smi report.
!nvidia-smi

# 2. Load Model and Define Helper Functions

In [None]:
# Load the football tracking model whose id is stored in the config,
# passing the Roboflow API key that was exported to os.environ earlier.
player_tracking_model_id = config['player_tracking_model_id']
model = get_model(model_id=player_tracking_model_id, api_key=os.environ["ROBOFLOW_API_KEY"])

In [None]:
# Load the SigLIP vision model and its processor (used by the team classifier
# helpers to embed player crops) and create the UMAP reducer that projects
# those embeddings down to 3 components before clustering.
siglip_model = SiglipVisionModel.from_pretrained("google/siglip-base-patch16-224")
processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")
REDUCER = umap.UMAP(n_components=3)

In [None]:
## Define helper functions

# Function to extract cropped images of each player from input video
def extract_crops(video_path, stride=25, player_class_id=2, confidence=0.3, nms_threshold=0.5):
  """Collect cropped player images from a sampled subset of video frames.

  Every ``stride``-th frame is run through the global detection ``model``;
  class-agnostic NMS is applied and each remaining detection whose class id
  equals ``player_class_id`` is cropped out of the frame.

  Args:
    video_path: Path of the video to sample.
    stride: Frame step passed to the frame generator (default 25).
    player_class_id: Class id of the detections to crop (default 2).
    confidence: Confidence threshold passed to ``model.infer`` (default 0.3).
    nms_threshold: Threshold passed to ``with_nms`` (default 0.5).

  Returns:
    A list with one crop (as returned by ``sv.crop_image``) per player
    detection, across all sampled frames.
  """
  frame_generator = sv.get_video_frames_generator(source_path=video_path, stride=stride)

  crops = []
  for frame in tqdm(frame_generator, desc='crops'):
    results = model.infer(frame, confidence=confidence)[0]
    detections = sv.Detections.from_inference(results)
    detections = detections.with_nms(threshold=nms_threshold, class_agnostic=True)
    player_detections = detections[detections.class_id == player_class_id]
    crops.extend(sv.crop_image(frame, xyxy) for xyxy in player_detections.xyxy)

  return crops

# Function to fit player team classifier model
def team_classifier_fit(crops):
  """Fit the two-cluster team classifier on a collection of player crops.

  Each crop is embedded with the global SigLIP model (its ``pooler_output``),
  the embeddings are projected with the global ``REDUCER`` (which is fitted
  here), and a 2-cluster KMeans model is fitted on the projections.

  Args:
    crops: Iterable of image arrays accepted by ``Image.fromarray``.

  Returns:
    The fitted KMeans model.

  Raises:
    ValueError: If ``crops`` is empty (raised by ``np.concatenate``).
  """
  embeddings = []
  with torch.no_grad():
    for crop in crops:
      pil_crop = Image.fromarray(crop)
      batch = processor(images=pil_crop, return_tensors="pt", padding="max_length")
      pooled = siglip_model(**batch).pooler_output
      embeddings.append(pooled.cpu().numpy())

  # One row per crop.
  stacked = np.concatenate(embeddings)

  # Fitting the reducer here is what later allows REDUCER.transform at predict time.
  reduced = REDUCER.fit_transform(stacked)

  kmeans = KMeans(n_clusters=2)
  kmeans.fit(reduced)
  return kmeans

# Function to predict player team using trained classifier model
def team_classifier_predict(clustering_model, crops):
  """Predict a team cluster for each player crop.

  Each crop is embedded with the global SigLIP model (its ``pooler_output``),
  projected with the already-fitted global ``REDUCER`` and assigned a cluster
  by ``clustering_model``.

  Args:
    clustering_model: Fitted clustering model exposing ``predict``.
    crops: Sequence of image arrays accepted by ``Image.fromarray``.

  Returns:
    A 1-D array with one cluster label per crop; an empty integer array when
    ``crops`` is empty.
  """
  # A frame may contain no player detections. np.concatenate raises
  # ValueError on an empty list, so return "no predictions" up front.
  if len(crops) == 0:
    return np.empty(0, dtype=int)

  feature_vectors = []

  for crop in crops:
    crop_image = Image.fromarray(crop)

    inputs = processor(images=crop_image, return_tensors="pt", padding="max_length")

    # Inference only: no gradients are needed.
    with torch.no_grad():
      outputs = siglip_model(**inputs)

    feature_vectors.append(outputs.pooler_output.cpu().numpy())

  feature_vectors = np.concatenate(feature_vectors)

  projections = REDUCER.transform(feature_vectors)
  return clustering_model.predict(projections)

# Function to resolve goalkeeper team ID
def resolve_goalkeepers_team_id(players: sv.Detections, goalkeepers: sv.Detections) -> np.ndarray:
    """Assign each goalkeeper to the team whose players' centroid is nearest.

    Centroids are the mean bottom-center anchor of the players whose
    ``class_id`` is 0 and 1 respectively.

    Args:
        players: Player detections whose ``class_id`` holds the team id (0 or 1).
        goalkeepers: Goalkeeper detections to classify.

    Returns:
        Integer array with one team id (0 or 1) per goalkeeper; empty when
        there are no goalkeepers. A goalkeeper is assigned to team 0 only
        when it is strictly closer to team 0's centroid. A team with no
        players is treated as infinitely far away, so goalkeepers go to the
        other team; if neither team has players the result is 1.
    """
    goalkeepers_xy = goalkeepers.get_anchors_coordinates(sv.Position.BOTTOM_CENTER)
    players_xy = players.get_anchors_coordinates(sv.Position.BOTTOM_CENTER)

    def _distances_to_team(team_id):
        # Distance from every goalkeeper to the centroid of one team.
        team_xy = players_xy[players.class_id == team_id]
        if len(team_xy) == 0:
            # Avoid the NaN centroid that mean() of an empty slice would give.
            return np.full(len(goalkeepers_xy), np.inf)
        return np.linalg.norm(goalkeepers_xy - team_xy.mean(axis=0), axis=1)

    dist_0 = _distances_to_team(0)
    dist_1 = _distances_to_team(1)

    return np.where(dist_0 < dist_1, 0, 1)

# Simple possession determination based on closeness of ball and players
def determine_possession(players: sv.Detections, ball: sv.Detections):
    """Return the team id of the player closest to the ball.

    Distances are measured between box centers. When several ball detections
    are present, the one with the highest confidence is used (the first one
    if no confidences are available).

    Args:
        players: Player detections whose ``class_id`` holds the team id.
        ball: Ball detections for the same frame.

    Returns:
        The ``class_id`` of the closest player, or ``None`` when there is no
        ball or no player to compare against.
    """
    ball_xy = ball.get_anchors_coordinates(sv.Position.CENTER)
    players_xy = players.get_anchors_coordinates(sv.Position.CENTER)

    # Without a ball or without players, possession cannot be decided
    # (and argmin would raise on an empty array).
    if len(ball_xy) == 0 or len(players_xy) == 0:
        return None

    # Reduce to a single ball position so the subtraction below always
    # broadcasts one point against every player.
    ball_index = 0
    if len(ball_xy) > 1 and ball.confidence is not None:
        ball_index = int(np.argmax(ball.confidence))

    distances = np.linalg.norm(players_xy - ball_xy[ball_index], axis=1)
    closest_player_index = np.argmin(distances)

    return players.class_id[closest_player_index]

def update_possession(possession, stats):
    if possession == 0:
      stats['team_1'] += 1
    elif possession == 1:
      stats['team_2'] += 1
    stats['total'] += 1

    # Calculate possession percentage
    total = stats['total']
    stats['team_1_possession'] = stats['team_1'] / total * 100 if total > 0 else 0
    stats['team_2_possession'] = stats['team_2'] / total * 100 if total > 0 else 0

    return stats['team_1_possession'], stats['team_2_possession']

def draw_possession_bar(frame, team_1_possession, team_2_possession):
    """Draw a two-team possession bar near the bottom edge of ``frame``.

    The bar is drawn in place: a grey background, team 1's share growing from
    the left edge, team 2's share growing from the right edge, and a
    percentage label at each end.

    Args:
        frame: Image array to draw on; modified in place. Assumed to be in
            OpenCV's BGR channel order -- confirm against the frame source.
        team_1_possession: Team 1 possession in percent.
        team_2_possession: Team 2 possession in percent.

    Returns:
        The same ``frame`` object, after drawing.
    """
    height, width = frame.shape[:2]
    bar_height = 30
    bar_y = height - bar_height - 10  # 10 pixels from the bottom

    # OpenCV colour tuples are (B, G, R). These are the team hex colours
    # '#fc2d87' and '#57befa' written in that order.
    team_1_color = (135, 45, 252)
    team_2_color = (250, 190, 87)

    # Draw background
    cv2.rectangle(frame, (10, bar_y), (width - 10, bar_y + bar_height), (50, 50, 50), -1)

    # Draw team 1 possession (left side)
    team1_width = int((width - 20) * (team_1_possession / 100))
    cv2.rectangle(frame, (10, bar_y), (10 + team1_width, bar_y + bar_height), team_1_color, -1)

    # Draw team 2 possession (right side)
    team2_width = int((width - 20) * (team_2_possession / 100))
    cv2.rectangle(frame, (width - 10 - team2_width, bar_y), (width - 10, bar_y + bar_height), team_2_color, -1)

    # Add percentage labels
    cv2.putText(frame, f"{team_1_possession:.1f}%", (15, bar_y + 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 1)
    cv2.putText(frame, f"{team_2_possession:.1f}%", (width - 65, bar_y + 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 1)

    return frame

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/813M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/368 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/711 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/409 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.40M [00:00<?, ?B/s]

# 3. Perform Detections

In [None]:
# Google Drive is mounted at /content/drive and the user's files (such as the
# config file read earlier) live under MyDrive/, so the video path must
# include that folder. The same path is used for both passes over the video.
video_path = '/content/drive/MyDrive/' + input_video_path
target_video_path = '/content/result.mp4'

# Class ids used to split the model's detections by object type.
ball_id = 0
player_id = 2
goalkeeper_id = 1
referee_id = 3

# Initialize possession stats
possession_stats = {
    'team_1': 0,
    'team_2': 0,
    'total': 0,
    'team_1_possession': 0,
    'team_2_possession': 0
}

# Palette index 0 and 1 are the two teams, index 2 is used for referees.
ellipse_annotator = sv.EllipseAnnotator(
    color=sv.ColorPalette.from_hex(['#fc2d87', '#57befa', '#fce42d']),
    thickness=3
)

triangle_annotator = sv.TriangleAnnotator(color=sv.Color.from_hex('#fff41f'),
                                          base=20, height=17)


label_annotator = sv.LabelAnnotator(
    color=sv.ColorPalette.from_hex(['#fc2d87', '#57befa', '#fce42d']),
    text_color=sv.Color.from_hex('#ff4d52'),
    text_position=sv.Position.TOP_CENTER
)


# First pass: sample player crops from the video and fit the team classifier.
crops = extract_crops(video_path)
clustering_model = team_classifier_fit(crops)

tracker = sv.ByteTrack()
tracker.reset()

video_info = sv.VideoInfo.from_video_path(video_path)
video_sink = sv.VideoSink(target_video_path, video_info=video_info)
frame_generator = sv.get_video_frames_generator(video_path)

# Second pass: detect, track, classify and annotate every frame.
with video_sink:
  for frame in tqdm(frame_generator, total=video_info.total_frames):

    results = model.infer(frame, confidence=0.3)[0]
    detections = sv.Detections.from_inference(results)

    ball_detections = detections[detections.class_id == ball_id]
    ball_detections.xyxy = sv.pad_boxes(xyxy=ball_detections.xyxy, px=10)

    all_detections = detections[detections.class_id != ball_id]
    all_detections = all_detections.with_nms(threshold=0.5, class_agnostic=True)
    all_detections = tracker.update_with_detections(detections=all_detections)

    goalkeepers_detections = all_detections[all_detections.class_id == goalkeeper_id]
    players_detections = all_detections[all_detections.class_id == player_id]
    referees_detections = all_detections[all_detections.class_id == referee_id]

    # Replace the players' detector class id with their team id (0 or 1).
    # Skipped when the frame has no players: there is nothing to classify.
    players_crops = [sv.crop_image(frame, xyxy) for xyxy in players_detections.xyxy]
    if players_crops:
      players_detections.class_id = team_classifier_predict(clustering_model, players_crops)

    if len(goalkeepers_detections) > 0:
      goalkeepers_detections.class_id = resolve_goalkeepers_team_id(
          players_detections, goalkeepers_detections)

    # Shift the referee class id from 3 to 2 so it indexes the third palette colour.
    referees_detections.class_id -= 1

    all_detections = sv.Detections.merge([
        players_detections, goalkeepers_detections, referees_detections])

    # A merge of only-empty detections may carry no tracker ids / class ids.
    tracker_ids = all_detections.tracker_id
    labels = [
        f"#{tracker_id}"
        for tracker_id
        in (tracker_ids if tracker_ids is not None else [])
    ]

    if all_detections.class_id is not None:
      all_detections.class_id = all_detections.class_id.astype(int)


    annotated_frame = frame.copy()
    annotated_frame = ellipse_annotator.annotate(
        scene=annotated_frame,
        detections=all_detections)
    annotated_frame = label_annotator.annotate(
        scene=annotated_frame,
        detections=all_detections,
        labels=labels)
    annotated_frame = triangle_annotator.annotate(
        scene=annotated_frame,
        detections=ball_detections)

    if len(ball_detections) > 0 and len(players_detections) > 0:
      # Determine possession
      possession = determine_possession(players_detections, ball_detections)

      # Update possession stats
      team_1_possession, team_2_possession = update_possession(possession, possession_stats)
    else:
      # No ball or no players in this frame: keep showing the last known split.
      team_1_possession = possession_stats['team_1_possession']
      team_2_possession = possession_stats['team_2_possession']

    # Annotate possession stats
    annotated_frame = draw_possession_bar(annotated_frame, team_1_possession, team_2_possession)

    video_sink.write_frame(annotated_frame)


crops: 24it [02:13,  5.58s/it]


KeyboardInterrupt: 