# CSC5651 - Applied Project - Detecting soccer players in broadcast footage - Notebook

This notebook contains the experiments done with Tensorflow 2 and Faster R-CNN for my CSC5651 term project to identify and bound soccer players within live broadcast game footage.

## Loading the model

Let's start by verifying we're utilizing the GPU hardware.

In [6]:
import tensorflow as tf
import json
from IPython.display import Image
import cv2
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import warnings
import pathlib
import time
from object_detection.utils import label_map_util
from object_detection.utils import visualization_utils as viz_utils

tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

Next, we'll load the pre-trained Faster R-CNN model with a ResNet-152 backbone.

In [7]:
# Download and extract model
def download_model(model_name, model_date):
    """Fetch a TF2 object-detection model archive and return its local path.

    Args:
      model_name: name of the model archive, without the ``.tar.gz`` suffix
      model_date: release-date folder the archive lives under on the server

    Returns:
      the path returned by ``tf.keras.utils.get_file``, as a string
    """
    base_url = 'http://download.tensorflow.org/models/object_detection/tf2/'
    archive_url = f'{base_url}{model_date}/{model_name}.tar.gz'
    extracted_path = tf.keras.utils.get_file(fname=model_name,
                                             origin=archive_url,
                                             untar=True)
    return str(extracted_path)

# Release-date folder and archive name that download_model() joins into the download URL.
MODEL_DATE = '20200711'
MODEL_NAME = 'faster_rcnn_resnet152_v1_1024x1024_coco17_tpu-8'
# Local directory of the fetched model, as returned by download_model().
PATH_TO_MODEL_DIR = download_model(MODEL_NAME, MODEL_DATE)

In [8]:
PATH_TO_MODEL_DIR

'/home/jack/.keras/datasets/faster_rcnn_resnet152_v1_1024x1024_coco17_tpu-8'

In [9]:
# The SavedModel lives in the "saved_model" sub-directory of the model folder.
PATH_TO_SAVED_MODEL = f"{PATH_TO_MODEL_DIR}/saved_model"

print('Loading model...', end='')

# Time the load so the notebook records how long model start-up takes.
start_time = time.time()
detect_fn = tf.saved_model.load(PATH_TO_SAVED_MODEL)  # callable used for inference below
end_time = time.time()

elapsed_time = end_time - start_time
done_message = 'Done! Took {} seconds'
print(done_message.format(elapsed_time))

Loading model...Done! Took 16.93804144859314 seconds


Load up the labels and create a category index for classification.

In [10]:
def download_labels(filename):
    """Fetch a label-map file from the TensorFlow models repository.

    Args:
      filename: name of the label-map file under ``object_detection/data/``

    Returns:
      the local path of the fetched file, as a string
    """
    base_url = 'https://raw.githubusercontent.com/tensorflow/models/master/research/object_detection/data/'
    local_file = tf.keras.utils.get_file(fname=filename,
                                         origin=f'{base_url}{filename}',
                                         untar=False)
    # Round-trip through pathlib, then hand back a plain string.
    return str(pathlib.Path(local_file))

# Label-map file to fetch; download_labels() returns where it was stored locally.
LABEL_FILENAME = 'mscoco_label_map.pbtxt'
PATH_TO_LABELS = download_labels(LABEL_FILENAME)

In [11]:
PATH_TO_LABELS

'/home/jack/.keras/datasets/mscoco_label_map.pbtxt'

In [12]:
# Category index built from the label map; passed to viz_utils when drawing detections below.
category_index = label_map_util.create_category_index_from_labelmap(PATH_TO_LABELS,
                                                                    use_display_name=True)

## Kicking the tires

Take the model out for a spin by classifying 10 seconds worth of frames. The video is 25 frames/second, so we'll run object detection on 250 frames, draw bounding boxes on them, restructure them back into a video, and output it to a file.

In [27]:
warnings.filterwarnings('ignore')   # Suppress Matplotlib warnings

def load_image_into_numpy_array(path):
    """Read an image file and return its pixels as a numpy array.

    The array is what gets fed to the tensorflow graph. By convention the
    layout is (height, width, channels), with channels=3 for an RGB image.

    Args:
      path: the file path to the image

    Returns:
      numpy array of the image's pixels; presumably uint8 with shape
      (img_height, img_width, 3) for an RGB image
    """
    picture = Image.open(path)
    return np.array(picture)

VIDEO_PATH = '/home/jack/code/csc5651-applied-project/video/Film Role-0 ID-1 T-2 m00s00-000-m00s00-185.avi'
OUTPUT_DIR = '/mnt/d/school/csc5651/soc_output/'
NUM_SECONDS = 10

cap = cv2.VideoCapture(VIDEO_PATH)

fourcc = cv2.VideoWriter_fourcc(*'XVID')
output_file = f'{OUTPUT_DIR}output_video.avi'
fps = 25
frame_size = (1024, 1024)
out = cv2.VideoWriter(output_file, fourcc, fps, frame_size)

i = 0
# Frame budget derives from the frame rate instead of a second hard-coded 25.
max_i = fps * NUM_SECONDS

try:
    while i < max_i:
        ret, frame = cap.read()
        if not ret:
            # Video ended (or could not be read) before the frame budget was
            # used up; stop instead of passing None to cv2.resize.
            break

        resized_frame = cv2.resize(frame, frame_size)

        # OpenCV decodes frames as BGR, while the detection model expects RGB
        # input, so convert the copy that is fed to the model.
        rgb_frame = cv2.cvtColor(resized_frame, cv2.COLOR_BGR2RGB)
        input_tensor = tf.convert_to_tensor(rgb_frame)

        # The model expects a batch of images, so add an axis with `tf.newaxis`.
        input_tensor = input_tensor[tf.newaxis, ...]

        detections = detect_fn(input_tensor)

        # All outputs are batched tensors.
        # Convert to numpy arrays, and take index [0] to remove the batch dimension.
        # We're only interested in the first num_detections.
        num_detections = int(detections.pop('num_detections'))
        detections = {key: value[0, :num_detections].numpy()
                      for key, value in detections.items()}
        detections['num_detections'] = num_detections

        # detection_classes should be ints.
        detections['detection_classes'] = detections['detection_classes'].astype(np.int64)

        # Draw on a BGR copy so the written video keeps OpenCV's channel order.
        image = resized_frame.copy()

        viz_utils.visualize_boxes_and_labels_on_image_array(
            image,
            detections['detection_boxes'],
            detections['detection_classes'],
            detections['detection_scores'],
            category_index,
            use_normalized_coordinates=True,
            max_boxes_to_draw=200,
            min_score_thresh=.30,
            agnostic_mode=False)

        out.write(image)
        i += 1
finally:
    # Always release the capture and writer, even if inference fails part-way.
    cap.release()
    out.release()
    cv2.destroyAllWindows()

## Testing the pre-trained model

Now we'll load up the ISSIA dataset with the help of some utility functions borrowed from [FootAndBall](https://github.com/jac99/FootAndBall), a project by Jacek Komorowski et al. which uses the same dataset for a similar use case.

In [29]:
import issia_utils as iu

# Start with video 1
annotations = iu.read_issia_ground_truth(1, "data/issia/")

Number of frames = 3021


The object `annotations.persons` now contains a dictionary, whose keys are frame numbers and values are lists of tuples containing `(player_id, height, width, x, y)` for each player in the frame.

In [30]:
annotations.persons.get(356)

[('9998', 103, 42, 1385, 320),
 ('9997', 102, 37, 1, 385),
 ('9999', 94, 42, 307, 235)]

For this project, we're only using the player data, so we'll discard the other information for now. If we have time, we'll come back to the ball and test tracking it too. The model we're using has an input size of 1024 x 1024, but the video is a different size. We need to resize the video and the annotations to match.

In [32]:
VIDEO_PATH = '/home/jack/code/csc5651-applied-project/data/issia/filmrole1.avi'
INPUT_WIDTH = 1024
INPUT_HEIGHT = 1024

# Only the frame dimensions are needed here, so the capture is released as
# soon as they have been read.
cap = cv2.VideoCapture(VIDEO_PATH)
try:
    if not cap.isOpened():
        # An unopened capture reports 0 for both dimensions, which would
        # otherwise surface as an opaque ZeroDivisionError below.
        raise IOError(f'Could not open video: {VIDEO_PATH}')
    width = cap.get(cv2.CAP_PROP_FRAME_WIDTH)
    height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT)
finally:
    cap.release()

# Factors mapping original pixel coordinates onto the model's input size.
width_scale_factor = INPUT_WIDTH / width
height_scale_factor = INPUT_HEIGHT / height

# frame number -> list of (player_id, height, width, x, y), rescaled and
# truncated to ints. The per-player names are distinct from `width`/`height`
# so the video dimensions above are not overwritten.
resized_player_annotations = {
    frame: [
        (player_id,
         int(box_height * height_scale_factor),
         int(box_width * width_scale_factor),
         int(box_x * width_scale_factor),
         int(box_y * height_scale_factor))
        for player_id, box_height, box_width, box_x, box_y in players
    ]
    for frame, players in annotations.persons.items()
}

In [33]:
resized_player_annotations.get(356)

[('9998', 96, 22, 738, 301),
 ('9997', 96, 19, 0, 362),
 ('9999', 88, 22, 163, 221)]

This resize operation looks pretty sound. Let's verify it by resizing some frames from the video and drawing the bounding boxes on them.

In [37]:
OUTPUT_DIR = '/mnt/d/school/csc5651/soc_output/'

cap = cv2.VideoCapture(VIDEO_PATH)
fourcc = cv2.VideoWriter_fourcc(*"XVID")
output_file = f"{OUTPUT_DIR}ground_truth_test.avi"
fps = 25
frame_size = (1024, 1024)
out = cv2.VideoWriter(output_file, fourcc, fps, frame_size)

# NOTE(review): frames are counted from 0 here; confirm this matches the
# frame numbering used as keys in the annotations.
i = 0
ret, frame = cap.read()
while ret:
    image = cv2.resize(frame, (1024, 1024)).copy()

    # Only frames that have an annotation entry are drawn on and written out.
    if i in resized_player_annotations:
        for _, box_h, box_w, left, top in resized_player_annotations[i]:
            top_left = (left, top)
            bottom_right = (left + box_w, top + box_h)
            cv2.rectangle(image, top_left, bottom_right, (0, 0, 255), 2)
        out.write(image)

    i += 1
    ret, frame = cap.read()

cap.release()
out.release()
cv2.destroyAllWindows()

Viewing back the output footage, the bounding boxes look approximately accurate, which means the algorithm for rescaling the bounding boxes isn't grossly incorrect. We'll continue with these, and resize and trim each video down to only the frames that have ground-truth data associated with them. The first few seconds of frames in each video are devoid of it, as the camera was calibrating during those. While we're at it, let's also dump out the resized annotations to JSON files for easier reading later.

In [38]:
# Resize every camera's video to the model input size and export that
# camera's rescaled player annotations to JSON.
for j in range(1, 7):
    video_path = f'/home/jack/code/csc5651-applied-project/data/issia/filmrole{j}.avi'
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        cap.release()
        raise IOError(f'Could not open video: {video_path}')

    fourcc = cv2.VideoWriter_fourcc(*'XVID')
    output_file = f'{OUTPUT_DIR}resized_filmrole{j}.avi'
    fps = 25
    frame_size = (1024, 1024)
    out = cv2.VideoWriter(output_file, fourcc, fps, frame_size)

    try:
        # Scale factors are computed from this video's own dimensions rather
        # than reused from whichever video an earlier cell happened to open.
        width_scale_factor = frame_size[0] / cap.get(cv2.CAP_PROP_FRAME_WIDTH)
        height_scale_factor = frame_size[1] / cap.get(cv2.CAP_PROP_FRAME_HEIGHT)

        # Read the ground truth for camera j (not always camera 1) and start
        # from a fresh dict so no frames leak in from a previous camera.
        annotations = iu.read_issia_ground_truth(j, "data/issia/")
        resized_player_annotations = {
            frame: [
                (player_id,
                 int(box_height * height_scale_factor),
                 int(box_width * width_scale_factor),
                 int(box_x * width_scale_factor),
                 int(box_y * height_scale_factor))
                for player_id, box_height, box_width, box_x, box_y in players
            ]
            for frame, players in annotations.persons.items()
        }

        # Every frame is written, so positions in the resized video line up
        # one-to-one with positions in the source video.
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            out.write(cv2.resize(frame, frame_size))

        # json.dump turns the integer frame keys into strings and the tuples
        # into lists.
        with open(f'{OUTPUT_DIR}filmrole{j}.json', 'w') as outfile:
            json.dump(resized_player_annotations, outfile)
    finally:
        # Release the capture and writer even if reading or writing fails.
        cap.release()
        out.release()
        cv2.destroyAllWindows()

Number of frames = 3021
Number of frames = 3021
Number of frames = 3021
Number of frames = 3021
Number of frames = 3021
Number of frames = 3021


Now that that's done, we can run object detection on one of the resized video clips, and measure the accuracy of the bounding boxes compared to the ground truth. Since we're only looking for persons, we'll filter all the other labels from the detection space of the model.

In [40]:
def filter_and_convert_detections(detections, label_id, threshold,
                                  image_height=1024, image_width=1024):
    """Keep one class's confident detections and convert their boxes to pixels.

    Only the first element of the batch (index 0) is read from each entry.

    Args:
      detections: mapping with 'detection_scores', 'detection_boxes' and
        'detection_classes'; each value must support ``[0]`` indexing followed
        by ``.numpy()``. Boxes are read as [ymin, xmin, ymax, xmax] and are
        assumed to be normalized to [0, 1] -- TODO confirm against the model.
      label_id: class id to keep
      threshold: detections must score strictly above this value
      image_height: pixel height used to scale ymin/ymax (default 1024)
      image_width: pixel width used to scale xmin/xmax (default 1024)

    Returns:
      dict with 'detection_scores' (the kept scores) and 'detection_boxes',
      an array of shape (num_kept, 4) whose rows are [height, width, x, y].
      The shape is (0, 4) when nothing passes the filter.
    """
    scores = detections['detection_scores'][0].numpy()
    boxes = detections['detection_boxes'][0].numpy()
    classes = detections['detection_classes'][0].numpy().astype(int)

    mask = (classes == label_id) & (scores > threshold)
    scores = scores[mask]

    # Scale y coordinates by the height and x coordinates by the width.
    scale = np.array([image_height, image_width, image_height, image_width],
                     dtype=boxes.dtype)
    pixel_boxes = boxes[mask].reshape(-1, 4) * scale

    # Vectorized corner -> (height, width, x, y) conversion; stacking columns
    # keeps the result two-dimensional even when no box survives the mask.
    ymin, xmin, ymax, xmax = pixel_boxes.T
    converted_boxes = np.stack([ymax - ymin, xmax - xmin, xmin, ymin], axis=1)

    return {'detection_scores': scores, 'detection_boxes': converted_boxes}

INPUT_FILENAME = "/mnt/d/school/csc5651/soc_output/resized_filmrole1.avi"
ANNOTATION_FILENAME = "/mnt/d/school/csc5651/soc_output/filmrole1.json"
output_file = '/mnt/d/school/csc5651/soc_output/dets_and_gt_filmrole1.avi'
fps = 25
frame_size = (1024, 1024)
# Define the codec here rather than relying on a value left over from an
# earlier cell.
fourcc = cv2.VideoWriter_fourcc(*'XVID')
out = cv2.VideoWriter(output_file, fourcc, fps, frame_size)

with open(ANNOTATION_FILENAME) as f:
    ground_truth_annotations = json.load(f)

# Drop frames whose annotation list is empty; keys are frame numbers as strings.
ground_truth_annotations = {k: v for k, v in ground_truth_annotations.items() if v}

cap = cv2.VideoCapture(INPUT_FILENAME)

try:
    i = 0
    while cap.isOpened():
        ret, frame = cap.read()
        # Incremented before the lookup, so the first frame read is frame 1.
        i += 1
        if not ret:
            break

        frame_key = str(i)
        if frame_key not in ground_truth_annotations:
            continue

        # OpenCV decodes frames as BGR, while the detection model expects RGB
        # input, so convert the copy that is fed to the model.
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        input_tensor = tf.convert_to_tensor(rgb_frame)
        input_tensor = input_tensor[tf.newaxis, ...]
        # Label id 1 with a 0.5 score threshold.
        detections = filter_and_convert_detections(detect_fn(input_tensor), 1, 0.5)

        image_with_boxes = frame.copy()

        # Model detections, drawn with colour (0, 0, 255).
        for box in detections['detection_boxes']:
            height = int(box[0])
            width = int(box[1])
            x = int(box[2])
            y = int(box[3])
            cv2.rectangle(image_with_boxes, (x, y), (x + width, y + height), (0, 0, 255), 2)

        # Ground-truth boxes, drawn with colour (0, 255, 0); index 0 of each
        # entry is the player id and is not needed for drawing.
        for box in ground_truth_annotations[frame_key]:
            height = box[1]
            width = box[2]
            x = box[3]
            y = box[4]
            cv2.rectangle(image_with_boxes, (x, y), (x + width, y + height), (0, 255, 0), 2)

        out.write(image_with_boxes)
finally:
    # Always release the capture and writer, even if inference fails part-way.
    cap.release()
    out.release()
    cv2.destroyAllWindows()