In [None]:
# Install required packages in Google Colab.
# `%pip` installs into the environment of the running kernel; each package is listed once.
%pip install ultralytics
%pip install filterpy
%pip install scipy
%pip install scikit-learn
%pip install torch torchvision


# 1. Approach 1 (Pretrained ResNet-based model)

### 1. Import Dependencies and Libraries


In [None]:
import cv2
import numpy as np
import torch
import torch.nn as nn
import torchvision.models as models
from scipy.optimize import linear_sum_assignment
from torchvision import transforms
from collections import namedtuple
from sklearn.metrics.pairwise import cosine_similarity
from pathlib import Path
import matplotlib.pyplot as plt
from ultralytics import YOLO

### 2. Video Preprocessing and Player Detection using YOLO
Assuming we have the YOLO model available for object detection, we'll load it and process the video frames.

Input: broadcast.mp4, tacticam.mp4

Output: Bounding boxes and confidence scores for players


In [None]:
# YOLOv5s from the ultralytics hub is used as the player detector
class YOLOModel:
    """Thin wrapper around the YOLOv5s detector loaded through ``torch.hub``."""

    def __init__(self, model_path):
        """Load the ``yolov5s`` model from the ``ultralytics/yolov5`` hub repository.

        Args:
            model_path: Kept for interface compatibility. It is currently
                ignored; the hub's ``yolov5s`` model is always loaded.
        """
        self.model = torch.hub.load('ultralytics/yolov5', 'yolov5s')  # Using YOLOv5s as an example

    def detect(self, frame):
        """Run the detector on one OpenCV frame.

        Args:
            frame: Image array as produced by ``cv2.VideoCapture.read``
                (three-channel frames are assumed to be in BGR order).

        Returns:
            The detection table for this frame in corner format, i.e. with
            the ``xmin``, ``ymin``, ``xmax`` and ``ymax`` columns that the
            cropping code in this notebook reads.
        """
        # OpenCV decodes frames as BGR, whereas the hub model treats array
        # inputs as RGB, so the channel axis is reversed before inference.
        if getattr(frame, "ndim", 0) == 3 and frame.shape[2] == 3:
            frame = np.ascontiguousarray(frame[..., ::-1])
        results = self.model(frame)
        # `pandas().xyxy` is a list with one table per input image. The
        # original `.xywh` returned that list itself, in centre format, which
        # does not provide the corner columns used downstream.
        return results.pandas().xyxy[0]

# Open both camera feeds (broadcast first, then tacticam)
video_broadcast, video_tacticam = map(cv2.VideoCapture, ("broadcast.mp4", "tacticam.mp4"))

# Detector instance shared by the frame-processing helpers below
yolo_model = YOLOModel('yolov5')

def process_frame(video_capture, frame_id):
    """Read the next frame from ``video_capture`` and run the detector on it.

    Args:
        video_capture: Object exposing ``read()`` that returns a
            ``(success, frame)`` pair.
        frame_id: Accepted for the caller's bookkeeping; not used here.

    Returns:
        Whatever ``yolo_model.detect`` returns for the frame, or ``None``
        when no frame could be read.
    """
    grabbed, image = video_capture.read()
    return yolo_model.detect(image) if grabbed else None


### 3. Feature Extraction using ResNet-101
We use a pre-trained ResNet-101 model to extract embeddings for the detected players. This helps in feature extraction from the cropped player images.

In [None]:
class FeatureExtractor:
    """Turns player crops into ResNet-101 embedding vectors."""

    def __init__(self):
        """Build the headless ResNet-101 and the preprocessing pipeline."""
        # NOTE(review): newer torchvision releases prefer the `weights=`
        # argument over `pretrained=True` — confirm the installed version
        # before switching.
        self.model = models.resnet101(pretrained=True)
        self.model = nn.Sequential(*list(self.model.children())[:-1])  # Remove the final classification layer
        self.model.eval()
        self.transform = transforms.Compose([
            transforms.ToPILImage(),
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])

    def extract_embedding(self, image):
        """Embed a single player crop.

        Args:
            image: Crop taken from an OpenCV frame. Three-channel NumPy
                arrays are assumed to be in BGR order.

        Returns:
            1-D ``torch.Tensor`` holding the flattened backbone output.
        """
        # The normalisation statistics above are given in RGB order, while
        # OpenCV crops are BGR; reverse the channel axis so the two agree.
        if isinstance(image, np.ndarray) and image.ndim == 3 and image.shape[2] == 3:
            image = np.ascontiguousarray(image[..., ::-1])
        batch = self.transform(image).unsqueeze(0)  # add the batch dimension
        with torch.no_grad():
            embedding = self.model(batch)
        return embedding.flatten()

# Module-level extractor instance used by extract_player_features below
feature_extractor = FeatureExtractor()

def extract_player_features(detections, frame):
    """Embed every detected box of ``frame``.

    Args:
        detections: Table with ``xmin``, ``ymin``, ``xmax`` and ``ymax``
            columns, one row per detection.
        frame: Image array the detections refer to (height x width x channels).

    Returns:
        List with one embedding per detection row, in row order.
    """
    frame_height, frame_width = frame.shape[:2]
    features = []
    for _, row in detections.iterrows():
        # Clamp the box to the frame and keep it at least one pixel wide and
        # tall: negative coordinates would wrap around when slicing, and
        # int() truncation of a thin box would otherwise give an empty crop.
        x1 = min(max(int(row['xmin']), 0), frame_width - 1)
        y1 = min(max(int(row['ymin']), 0), frame_height - 1)
        x2 = min(max(int(row['xmax']), x1 + 1), frame_width)
        y2 = min(max(int(row['ymax']), y1 + 1), frame_height)
        player_crop = frame[y1:y2, x1:x2]
        features.append(feature_extractor.extract_embedding(player_crop))
    return features


### 4. Cosine Similarity Calculation for Matching
Cosine similarity is used to compare player embeddings between two frames (one from broadcast.mp4 and one from tacticam.mp4).

In [None]:
def _to_feature_matrix(features):
    """Stack per-player embeddings into a 2-D NumPy array, one row per player."""
    rows = []
    for feature in features:
        # Torch tensors expose `detach`; bring them to host memory first.
        if hasattr(feature, "detach"):
            feature = feature.detach().cpu().numpy()
        rows.append(np.ravel(np.asarray(feature)))
    if not rows:
        return np.zeros((0, 0))
    return np.stack(rows)


def calculate_similarity(features_broadcast, features_tacticam):
    """Pairwise cosine similarity between two sets of player embeddings.

    Args:
        features_broadcast: Sequence of embeddings (torch tensors or arrays)
            for the broadcast view.
        features_tacticam: Sequence of embeddings for the tacticam view.

    Returns:
        Array of shape ``(len(features_broadcast), len(features_tacticam))``.
        When either side is empty, an empty array of that shape is returned
        without calling ``cosine_similarity``.
    """
    broadcast_matrix = _to_feature_matrix(features_broadcast)
    tacticam_matrix = _to_feature_matrix(features_tacticam)
    if len(broadcast_matrix) == 0 or len(tacticam_matrix) == 0:
        return np.zeros((len(broadcast_matrix), len(tacticam_matrix)))
    cosine_sim = cosine_similarity(broadcast_matrix, tacticam_matrix)
    return cosine_sim


### 5. Hungarian Algorithm for Optimal Matching
The Hungarian algorithm is used to find the optimal one-to-one assignment of players between the two frames based on the similarity matrix.

In [None]:
def hungarian_algorithm(similarity_matrix):
    """Find the one-to-one assignment with the highest total similarity.

    Args:
        similarity_matrix: 2-D array-like of pairwise similarities
            (rows: broadcast players, columns: tacticam players).

    Returns:
        Tuple ``(row_ind, col_ind)`` of index arrays describing the matched
        pairs. Both arrays are empty when the matrix has no entries.
    """
    # Convert first so that plain nested lists can be negated as well.
    scores = np.asarray(similarity_matrix, dtype=float)
    if scores.size == 0:
        # Nothing to assign when either side has no players.
        return np.array([], dtype=int), np.array([], dtype=int)
    row_ind, col_ind = linear_sum_assignment(-scores)  # Maximize similarity
    return row_ind, col_ind


### 6. Final Matching Output and Evaluation
After calculating similarities and using the Hungarian algorithm, we output the optimal player matches.



In [None]:
def match_players(video_broadcast, video_tacticam):
    """Match players between the two videos, frame pair by frame pair.

    One frame is read from each capture per iteration. Detection and feature
    extraction both run on that same frame; the original code read a second
    frame for cropping, so boxes were applied to the wrong image and every
    other frame was skipped. Matches are printed; nothing is returned.

    Args:
        video_broadcast: Capture for the broadcast view (exposes ``read()``).
        video_tacticam: Capture for the tacticam view (exposes ``read()``).
    """
    while True:
        broadcast_ok, broadcast_frame = video_broadcast.read()
        tacticam_ok, tacticam_frame = video_tacticam.read()

        # Stop as soon as either video runs out of frames.
        if not (broadcast_ok and tacticam_ok):
            break

        broadcast_detections = yolo_model.detect(broadcast_frame)
        tacticam_detections = yolo_model.detect(tacticam_frame)

        broadcast_features = extract_player_features(broadcast_detections, broadcast_frame)
        tacticam_features = extract_player_features(tacticam_detections, tacticam_frame)

        # Without players on both sides there is nothing to match in this pair.
        if len(broadcast_features) == 0 or len(tacticam_features) == 0:
            continue

        similarity_matrix = calculate_similarity(broadcast_features, tacticam_features)
        row_ind, col_ind = hungarian_algorithm(similarity_matrix)

        # Display matched players (for debugging)
        for i, j in zip(row_ind, col_ind):
            print(f"Player {i} in Broadcast.mp4 is matched with Player {j} in Tacticam.mp4")

# Running the matching; both captures are released afterwards, even if matching fails
try:
    match_players(video_broadcast, video_tacticam)
finally:
    video_broadcast.release()
    video_tacticam.release()


# 2. Approach 2 (Pretrained Siamese Network model)

### 1. Import Necessary Libraries
We will import the necessary libraries for video processing, deep learning, and optimization.



In [None]:
import cv2
import torch
import numpy as np
from scipy.optimize import linear_sum_assignment
from sklearn.metrics.pairwise import cosine_similarity
from torchvision import transforms
import torch.nn as nn
import torch.optim as optim


### 2. Load Pretrained Siamese Network
For simplicity, we assume access to a pretrained Siamese network designed to compare player appearances directly. A basic Siamese network accepts two images and outputs one embedding per image. Note that the cell below only defines the architecture; it does not load any pretrained weights.

In [None]:
class SiameseNetwork(nn.Module):
    """Twin-branch CNN that maps images to fixed-size embeddings.

    Both images of a pair pass through the same layers, so the two branches
    share weights by construction.
    """

    def __init__(self, input_size=224, embedding_dim=128):
        """Build the convolutional trunk and the embedding head.

        Args:
            input_size: Expected input resolution, either a single int for
                square images or a ``(height, width)`` pair. Defaults to the
                224x224 resolution the original layer sizes were written for.
            embedding_dim: Length of the produced embedding (default 128).

        Raises:
            ValueError: If the input is too small to survive the three
                pooling stages.
        """
        super().__init__()
        if isinstance(input_size, int):
            height = width = input_size
        else:
            height, width = input_size

        # Each of the three 2x2 max-pools halves both spatial dimensions
        # (rounding down); the 3x3 convolutions with padding=1 keep them.
        pooled_height, pooled_width = height // 8, width // 8
        if pooled_height < 1 or pooled_width < 1:
            raise ValueError("input_size must be at least 8 pixels in each dimension")

        self.conv_layers = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )

        self.fc_layers = nn.Sequential(
            nn.Linear(256 * pooled_height * pooled_width, 512),  # 256 * 28 * 28 for 224x224 inputs
            nn.ReLU(inplace=True),
            nn.Linear(512, embedding_dim),  # Final embedding size
        )

    def forward_once(self, x):
        """Embed one batch of images of shape (batch, 3, height, width)."""
        x = self.conv_layers(x)
        x = x.flatten(start_dim=1)  # Flatten everything except the batch axis
        x = self.fc_layers(x)
        return x

    def forward(self, x1, x2):
        """Embed both images of a pair with the shared layers.

        Returns:
            Tuple ``(output1, output2)`` of embeddings for ``x1`` and ``x2``.
        """
        output1 = self.forward_once(x1)
        output2 = self.forward_once(x2)
        return output1, output2


### 3. Video Preprocessing & Player Detection Using YOLO
We will use YOLO to detect players in both broadcast.mp4 and tacticam.mp4. We then crop out the bounding boxes for each player and send them to the Siamese network.




In [None]:
class YOLOModel:
    """Player detector backed by the YOLOv5s model from ``torch.hub``."""

    def __init__(self):
        """Load the ``yolov5s`` model from the ``ultralytics/yolov5`` hub repository."""
        # Loading YOLO model (YOLOv5 as an example)
        self.model = torch.hub.load('ultralytics/yolov5', 'yolov5s')  # You can load a larger model if needed

    def detect(self, frame):
        """Detect objects in one OpenCV frame.

        Args:
            frame: Image array as produced by ``cv2.VideoCapture.read``
                (three-channel frames are assumed to be in BGR order).

        Returns:
            The detection table for this frame in corner format, i.e. with
            the ``xmin``, ``ymin``, ``xmax`` and ``ymax`` columns that the
            matching loop reads.
        """
        # The hub model treats array inputs as RGB, while OpenCV frames are
        # BGR, so the channel axis is reversed before inference.
        if getattr(frame, "ndim", 0) == 3 and frame.shape[2] == 3:
            frame = np.ascontiguousarray(frame[..., ::-1])
        results = self.model(frame)
        # `pandas().xyxy` is a list with one table per input image. The
        # original `.xywh` returned that list itself, in centre format,
        # which lacks the corner columns needed for cropping.
        return results.pandas().xyxy[0]

# Open fresh captures for both views (broadcast first, then tacticam), then build the detector
video_broadcast, video_tacticam = (
    cv2.VideoCapture(video_path) for video_path in ("broadcast.mp4", "tacticam.mp4")
)
yolo_model = YOLOModel()

def process_frame(video_capture):
    """Read the next frame from ``video_capture`` and run the detector on it.

    Returns:
        The result of ``yolo_model.detect`` for the frame, or ``None`` when
        the capture did not deliver a frame.
    """
    has_frame, image = video_capture.read()
    if has_frame:
        return yolo_model.detect(image)
    return None


### 4. Extract Features Using Siamese Network
The player bounding boxes will be cropped, and these cropped images will be passed to the Siamese network for feature extraction.



In [None]:
class FeatureExtractor:
    """Produces one embedding per player crop using the supplied network."""

    def __init__(self, model):
        """Store the network and build the preprocessing pipeline.

        Args:
            model: Embedding network. If it offers ``forward_once`` (as the
                Siamese network in this notebook does), that single-image
                path is used; otherwise the model is called directly.
        """
        self.model = model
        # The extractor only runs inference, so switch torch modules to eval mode.
        if isinstance(model, nn.Module):
            model.eval()
        self.transform = transforms.Compose([
            transforms.ToPILImage(),
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])

    def extract_embedding(self, image):
        """Embed a single player crop.

        Args:
            image: Crop taken from an OpenCV frame. Three-channel NumPy
                arrays are assumed to be in BGR order.

        Returns:
            The embedding of the crop (first and only item of the batch).
        """
        # The normalisation statistics are given in RGB order, while OpenCV
        # crops are BGR; reverse the channel axis so the two agree.
        if isinstance(image, np.ndarray) and image.ndim == 3 and image.shape[2] == 3:
            image = np.ascontiguousarray(image[..., ::-1])
        batch = self.transform(image).unsqueeze(0)  # Add batch dimension
        # A Siamese `forward` takes an image pair; calling it with a single
        # batch raises TypeError, so use the single-image path when present.
        embed = getattr(self.model, "forward_once", self.model)
        with torch.no_grad():
            embedding = embed(batch)
        return embedding[0]  # Return the first (and only) embedding

# Initialize the model and feature extractor
# NOTE(review): no checkpoint is loaded here, so the network keeps the weights
# it was constructed with — load trained weights before relying on the matches.
siamese_model = SiameseNetwork()
feature_extractor = FeatureExtractor(siamese_model)


### 5. Calculate Cosine Similarity for Matching
Once we extract the embeddings, we calculate the cosine similarity between the embeddings of players from the two videos.

In [None]:
def calculate_similarity(features_broadcast, features_tacticam):
    """Cosine similarity between each pair of players (broadcast vs. tacticam).

    Args:
        features_broadcast: Array-like with one embedding per broadcast player.
        features_tacticam: Array-like with one embedding per tacticam player.

    Returns:
        Array of shape ``(len(features_broadcast), len(features_tacticam))``.
        If either side has no players, an empty array of that shape is
        returned without calling ``cosine_similarity``.
    """
    n_broadcast, n_tacticam = len(features_broadcast), len(features_tacticam)
    if n_broadcast == 0 or n_tacticam == 0:
        # An empty feature list becomes a 1-D array upstream, which the
        # pairwise routine cannot treat as a set of row vectors.
        return np.zeros((n_broadcast, n_tacticam))
    cosine_sim = cosine_similarity(features_broadcast, features_tacticam)
    return cosine_sim
def hungarian_algorithm(similarity_matrix):
    """Pick the one-to-one player assignment that maximizes total similarity.

    Args:
        similarity_matrix: 2-D array-like of pairwise similarities.

    Returns:
        Tuple ``(row_ind, col_ind)`` of matched row and column indices; both
        are empty arrays when the matrix contains no entries.
    """
    # Accept nested lists too: unary minus is only defined after conversion.
    cost_matrix = -np.asarray(similarity_matrix, dtype=float)  # Maximize similarity by minimizing the negative
    if cost_matrix.size == 0:
        no_matches = np.array([], dtype=int)
        return no_matches, no_matches.copy()
    row_ind, col_ind = linear_sum_assignment(cost_matrix)
    return row_ind, col_ind


### 6. Final Matching Output and Evaluation


In [None]:
def _embed_players(detections, frame):
    """Crop every detection out of ``frame`` and return the stacked embeddings."""
    frame_height, frame_width = frame.shape[:2]
    embeddings = []
    for _, row in detections.iterrows():
        # Clamp the box to the frame and keep it at least one pixel in each
        # direction so the crop can never be empty.
        x1 = min(max(int(row['xmin']), 0), frame_width - 1)
        y1 = min(max(int(row['ymin']), 0), frame_height - 1)
        x2 = min(max(int(row['xmax']), x1 + 1), frame_width)
        y2 = min(max(int(row['ymax']), y1 + 1), frame_height)
        feature = feature_extractor.extract_embedding(frame[y1:y2, x1:x2])
        embeddings.append(feature.numpy())
    # Convert list to numpy array for cosine similarity
    return np.array(embeddings)


def match_players(video_broadcast, video_tacticam):
    """Match players between the two videos, frame pair by frame pair.

    Each iteration reads exactly one frame per capture and uses it for both
    detection and cropping. The original code called ``read()`` again for
    every detection, so each crop was taken from a different, later frame.
    Matches are printed; nothing is returned.

    Args:
        video_broadcast: Capture for the broadcast view (exposes ``read()``).
        video_tacticam: Capture for the tacticam view (exposes ``read()``).
    """
    while True:
        # Process frames from both videos
        broadcast_ok, broadcast_frame = video_broadcast.read()
        tacticam_ok, tacticam_frame = video_tacticam.read()

        # Stop as soon as either video runs out of frames.
        if not (broadcast_ok and tacticam_ok):
            break

        # Extract features for players in both videos
        broadcast_features = _embed_players(yolo_model.detect(broadcast_frame), broadcast_frame)
        tacticam_features = _embed_players(yolo_model.detect(tacticam_frame), tacticam_frame)

        # Without players on both sides there is nothing to match in this pair.
        if len(broadcast_features) == 0 or len(tacticam_features) == 0:
            continue

        # Calculate similarity and apply Hungarian algorithm
        similarity_matrix = calculate_similarity(broadcast_features, tacticam_features)
        row_ind, col_ind = hungarian_algorithm(similarity_matrix)

        # Output the matched players
        for i, j in zip(row_ind, col_ind):
            print(f"Player {i} in Broadcast.mp4 is matched with Player {j} in Tacticam.mp4")

# Running the player matching function; both captures are released afterwards, even on failure
try:
    match_players(video_broadcast, video_tacticam)
finally:
    video_broadcast.release()
    video_tacticam.release()
