# CSC5651 - Applied Project - Detecting soccer players in broadcast footage - Notebook

This notebook contains the experiments done with Tensorflow 2 and Faster R-CNN for my CSC5651 term project to identify and bound soccer players within live broadcast game footage.

## Loading the model

Let's start by verfying we're utilizing the GPU hardware.

In [1]:
import tensorflow as tf
import json
from IPython.display import Image
import cv2
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import warnings
import pathlib
import time
from object_detection.utils import label_map_util
from object_detection.utils import visualization_utils as viz_utils

tf.config.list_physical_devices('GPU')

2023-11-28 21:06:39.135863: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-28 21:06:39.135964: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-28 21:06:39.141179: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-28 21:06:39.172206: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-11-28 21:06:42.065316: I external/local_xla/xla/

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

Next, we'll load the pre-trained Faster R-CNN model with Inception.

In [2]:
# Download and extract model
def download_model(model_name, model_date):
    base_url = 'http://download.tensorflow.org/models/object_detection/tf2/'
    model_file = model_name + '.tar.gz'
    model_dir = tf.keras.utils.get_file(fname=model_name,
                                        origin=base_url + model_date + '/' + model_file,
                                        untar=True)
    return str(model_dir)

MODEL_DATE = '20200711'
MODEL_NAME = 'faster_rcnn_resnet152_v1_1024x1024_coco17_tpu-8'
PATH_TO_MODEL_DIR = download_model(MODEL_NAME, MODEL_DATE)

In [3]:
PATH_TO_MODEL_DIR

'/home/jack/.keras/datasets/faster_rcnn_resnet152_v1_1024x1024_coco17_tpu-8'

In [4]:
PATH_TO_SAVED_MODEL = PATH_TO_MODEL_DIR + "/saved_model"

print('Loading model...', end='')
start_time = time.time()

# Load saved model and build the detection function
detect_fn = tf.saved_model.load(PATH_TO_SAVED_MODEL)

end_time = time.time()
elapsed_time = end_time - start_time
print('Done! Took {} seconds'.format(elapsed_time))

Loading model...

2023-11-28 21:06:50.792204: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:2d:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-11-28 21:06:50.792286: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:2d:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-11-28 21:06:50.792314: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:2d:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-11-28 21:06:52.125517: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:2d:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-11-28 21:06:52.125616: I external/local_xla/xla/stream_executor

Done! Took 20.933287143707275 seconds


Load up the labels and create a category index for classification.

In [5]:
def download_labels(filename):
    base_url = 'https://raw.githubusercontent.com/tensorflow/models/master/research/object_detection/data/'
    label_dir = tf.keras.utils.get_file(fname=filename,
                                        origin=base_url + filename,
                                        untar=False)
    label_dir = pathlib.Path(label_dir)
    return str(label_dir)

LABEL_FILENAME = 'mscoco_label_map.pbtxt'
PATH_TO_LABELS = download_labels(LABEL_FILENAME)

In [6]:
PATH_TO_LABELS

'/home/jack/.keras/datasets/mscoco_label_map.pbtxt'

In [7]:
category_index = label_map_util.create_category_index_from_labelmap(PATH_TO_LABELS,
                                                                    use_display_name=True)

## Kicking the tires

Take the model out for a spin by classifying 10 seconds worth of frames. The video is 25 frames/second, so we'll run object detection on 250 frames, draw bounding boxes on them, restructure them back into a video, and output it to a file.

In [101]:
def load_image_into_numpy_array(path):
    return np.array(Image.open(path))

In [8]:
warnings.filterwarnings('ignore')   # Suppress Matplotlib warnings

VIDEO_PATH = 'data/issia/filmrole1.avi'
OUTPUT_DIR = '/mnt/d/school/csc5651/soc_output/'
NUM_SECONDS = 10

cap = cv2.VideoCapture(VIDEO_PATH)

fourcc = cv2.VideoWriter_fourcc(*'XVID')
output_file = f'{OUTPUT_DIR}output_video.avi'
fps = 25
frame_size = (1024, 1024)
out = cv2.VideoWriter(output_file, fourcc, fps, frame_size)

i = 0
max_i = 25 * NUM_SECONDS

while i < max_i:
  ret, frame = cap.read()
  resized_frame = cv2.resize(frame, (1024, 1024))

  input_tensor = tf.convert_to_tensor(resized_frame)

  # The model expects a batch of images, so add an axis with `tf.newaxis`.
  input_tensor = input_tensor[tf.newaxis, ...]

  # input_tensor = np.expand_dims(image_np, 0)
  detections = detect_fn(input_tensor)

  # All outputs are batches tensors.
  # Convert to numpy arrays, and take index [0] to remove the batch dimension.
  # We're only interested in the first num_detections.
  num_detections = int(detections.pop('num_detections'))
  detections = {key: value[0, :num_detections].numpy()
                  for key, value in detections.items()}
  detections['num_detections'] = num_detections

  # detection_classes should be ints.
  detections['detection_classes'] = detections['detection_classes'].astype(np.int64)

  image = resized_frame.copy()

  viz_utils.visualize_boxes_and_labels_on_image_array(
    image,
    detections['detection_boxes'],
    detections['detection_classes'],
    detections['detection_scores'],
    category_index,
    use_normalized_coordinates=True,
    max_boxes_to_draw=200,
    min_score_thresh=.30,
    agnostic_mode=False)

  out.write(image)
  i += 1

cap.release()
out.release()
cv2.destroyAllWindows()

2023-11-21 21:34:12.260885: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:442] Loaded cuDNN version 8700
2023-11-21 21:34:12.958828: I tensorflow/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2023-11-21 21:34:13.311136: I tensorflow/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory


## Testing the pre-trained model

Now we'll load up the ISSIA dataset with the help of some utility functions borrowed from [FootAndBall](https://github.com/jac99/FootAndBall), a project by Jacek Komorowski et. al. which uses the same dataset for a similar use case.

In [8]:
import issia_utils as iu

# Start with video 1
annotations = iu.read_issia_ground_truth(1, "data/issia/")

Number of frames = 3021


The object `annotations.persons` now contains a dictionary, whose keys are frame numbers and values are lists of tuples containing `(player_id, height, width, x, y)` for each player in the frame.

In [9]:
annotations.persons.get(356)

[('9998', 103, 42, 1385, 320),
 ('9997', 102, 37, 1, 385),
 ('9999', 94, 42, 307, 235)]

For this project, we're only using the player data, so we'll discard the other information for now. If we have time, we'll come back to the ball and test tracking it too. The model we're using has an input size of 1024 x 1024, but the video is a different size. We need to resize the video and the annotations to match.

In [10]:
VIDEO_PATH = 'data/issia/filmrole1.avi'
INPUT_WIDTH = 1024
INPUT_HEIGHT = 1024

cap = cv2.VideoCapture(VIDEO_PATH)

width = cap.get(cv2.CAP_PROP_FRAME_WIDTH)
height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT)

width_scale_factor = INPUT_WIDTH / width
height_scale_factor = INPUT_HEIGHT / height

resized_player_annotations = dict()

for k, v in annotations.persons.items():
  frame = k
  annotation = v
  resized_player_annotations[frame] = list()

  for player in annotation:
    player_id = player[0]
    height = int(player[1] * height_scale_factor)
    width  = int(player[2] * width_scale_factor)
    x = int(player[3] * width_scale_factor)
    y = int(player[4] * height_scale_factor)
    resized_player_annotations[frame].append((player_id, height, width, x, y))

In [11]:
resized_player_annotations.get(356)

[('9998', 96, 22, 738, 301),
 ('9997', 96, 19, 0, 362),
 ('9999', 88, 22, 163, 221)]

This resize operation looks pretty sound. Let's verify by resizing some frames from the video and painting on the boundary boxes to verify.

In [13]:
OUTPUT_DIR = '/mnt/d/school/csc5651/soc_output/'

cap = cv2.VideoCapture(VIDEO_PATH)
fourcc = cv2.VideoWriter_fourcc(*"XVID")
output_file = f"{OUTPUT_DIR}ground_truth_test.avi"
fps = 25
frame_size = (1024, 1024)
out = cv2.VideoWriter(output_file, fourcc, fps, frame_size)

i = 0
while True:
    ret, frame = cap.read()
    if not ret:
        break

    resized_frame = cv2.resize(frame, (1024, 1024))

    image = resized_frame.copy()

    if i in resized_player_annotations:
        for player in resized_player_annotations.get(i):
            height = player[1]
            width = player[2]
            x = player[3]
            y = player[4]

            cv2.rectangle(image, (x, y), (x + width, y + height), (0, 0, 255), 2)
        out.write(image)

    i += 1

cap.release()
out.release()
cv2.destroyAllWindows()

Viewing back the output footage, the bounding boxes look approximately accurate, which means the algorithm for rescaling the bounding boxes isn't grossly incorrect. We'll continue with these, and resize and trim each video down to only the frames that have ground-truth data associated with them. The first few seconds of frames in each video are devoid of it, as the camera was calibrating during those. While we're at it, let's also dump out the resized annotations to JSON files for easier reading later.

In [19]:
for j in range(1, 7):
    cap = cv2.VideoCapture(f'data/issia/filmrole{j}.avi')
    fourcc = cv2.VideoWriter_fourcc(*'XVID')
    output_file = f'{OUTPUT_DIR}resized_filmrole{j}.avi'
    fps = 25
    frame_size = (1024, 1024)
    out = cv2.VideoWriter(output_file, fourcc, fps, frame_size)
    annotations = iu.read_issia_ground_truth(j, "data/issia/")
    resized_player_annotations = dict()
    for k, v in annotations.persons.items():
        frame = k
        annotation = v
        resized_player_annotations[frame] = list()

        for player in annotation:
            player_id = player[0]
            height = int(player[1] * height_scale_factor)
            width = int(player[2] * width_scale_factor)
            x = int(player[3] * width_scale_factor)
            y = int(player[4] * height_scale_factor)
            resized_player_annotations[frame].append((player_id, height, width, x, y))

    i = 0
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        resized_frame = cv2.resize(frame, (1024, 1024))
        out.write(resized_frame)

        i += 1

    with open(f'{OUTPUT_DIR}filmrole{j}.json', 'w') as outfile:
        json.dump(resized_player_annotations, outfile)

    cap.release()
    out.release()
    cv2.destroyAllWindows()

NameError: name 'OUTPUT_DIR' is not defined

Now that that's done, we can run object detection on one of the resized video clips, and measure the accuracy of the bounding boxes compared to the ground truth. Since we're only looking for persons, we'll filter all the other labels from the detection space of the model.

In [12]:
def filter_detections(detections, label_id, threshold):
    scores = detections['detection_scores'][0].numpy()
    boxes = detections['detection_boxes'][0].numpy()
    classes = detections['detection_classes'][0].numpy().astype(int)

    mask = (classes == label_id) & (scores > threshold)
    scores = scores[mask]
    boxes = boxes[mask] * 1024
    filtered_boxes = list()
    for box in boxes:
        filtered_boxes.append(box)

    return {'detection_scores': scores, 'detection_boxes': np.array(filtered_boxes)}

INPUT_FILENAME = "/mnt/d/school/csc5651/soc_output/resized_filmrole1.avi"
ANNOTATION_FILENAME = "/mnt/d/school/csc5651/soc_output/filmrole1.json"

with open(ANNOTATION_FILENAME) as f:
    ground_truth_annotations = json.load(f)

ground_truth_annotations = {k: v for k, v in ground_truth_annotations.items() if v}

cap = cv2.VideoCapture(INPUT_FILENAME)

i = 0
all_detections = dict()
while cap.isOpened():
    ret, frame = cap.read()
    i += 1
    if not ret:
        break

    if str(i) not in ground_truth_annotations:
        continue

    input_tensor = tf.convert_to_tensor(frame)
    input_tensor = input_tensor[tf.newaxis, ...]
    detections = filter_detections(detect_fn(input_tensor), 1, 0.5)
    all_detections[str(i)] = detections

cap.release()
cv2.destroyAllWindows()

2023-11-28 21:10:49.155524: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8906


Tensorflow 2 returns detections in the format of `(ymin, xmin, ymax, xmax)`, so we need to convert our ground truth annotations to that format.

In [13]:
def convert_ground_truth_box_to_tf_format(box):
    _, height, width, x, y = box
    ymin = int(y)
    xmin = int(x)
    ymax = int(y + height)
    xmax = int(x + width)
    return np.array([ymin, xmin, ymax, xmax])

converted_ground_truth_annotations = dict()
for k, v in ground_truth_annotations.items():
    converted_boxes = list()
    for box in v:
        converted_boxes.append(convert_ground_truth_box_to_tf_format(box))
    converted_boxes = np.array(converted_boxes)
    converted_ground_truth_annotations[k] = {'detection_boxes': converted_boxes}

In [39]:
converted_ground_truth_annotations.get('1000')

{'detection_boxes': array([[102,   1, 176,  20],
        [270, 692, 365, 718]])}

Write out the frames that had detections and were in the ground truth dictionary.

In [24]:
INPUT_FILENAME = "/mnt/d/school/csc5651/soc_output/resized_filmrole1.avi"
output_file = f'/mnt/d/school/csc5651/soc_output/dets_and_gt_filmrole1_2.avi'
fourcc = cv2.VideoWriter_fourcc(*"XVID")
fps = 25
frame_size = (1024, 1024)

cap = cv2.VideoCapture(INPUT_FILENAME)
out = cv2.VideoWriter(output_file, fourcc, fps, frame_size)

i = 0
while True:
    ret, frame = cap.read()
    if not ret:
        break

    if str(i) not in all_detections:
        i += 1
        continue

    image_with_boxes = frame.copy()
    for box in all_detections.get(str(i))['detection_boxes']:
        ymin = int(box[0])
        xmin = int(box[1])
        ymax = int(box[2])
        xmax = int(box[3])
        cv2.rectangle(image_with_boxes, (xmin, ymin), (xmax, ymax), (0, 0, 255), 2)

    for box in converted_ground_truth_annotations.get(str(i))['detection_boxes']:
        ymin = int(box[0])
        xmin = int(box[1])
        ymax = int(box[2])
        xmax = int(box[3])
        cv2.rectangle(image_with_boxes, (xmin, ymin), (xmax, ymax), (0, 255, 0), 2)

    out.write(image_with_boxes)

    i += 1

out.release()

Let's do a quick smoke test and compare the detections with the ground truths for one of the frames.

In [14]:
all_detections['356']

{'detection_scores': array([0.7640616 , 0.66895443, 0.5417378 ], dtype=float32),
 'detection_boxes': array([[341.238   ,   0.      , 471.2253  ,  30.268375],
        [303.21304 , 739.10297 , 403.62918 , 761.4393  ],
        [309.772   ,   0.      , 556.49884 ,  29.684807]], dtype=float32)}

In [15]:
converted_ground_truth_annotations['356']

{'detection_boxes': array([[301, 738, 397, 760],
        [362,   0, 458,  19],
        [221, 163, 309, 185]])}

Now that the ground truths and the detections are in the same format, we'll calculate the accuracy and precision frame-by-frame. This algorithm uses _intersection over union_ to determine hits and misses. That is, the area of overlap between the detected box and the ground truth box, the intersection, as a fraction of the total combined area, the union, of both boxes. 0 means no overlap, and 1 means perfect overlap. For this test, we'll call any detection with an IoU value of >= 0.5 a true positive.

In [103]:
def calculate_iou(gt_box, pred_box):
    y1_t, x1_t, y2_t, x2_t = gt_box
    y1_p, x1_p, y2_p, x2_p = pred_box

    if (x1_p > x2_p) or (y1_p > y2_p):
        raise AssertionError(
            "Prediction box is malformed? pred box: {}".format(pred_box))
    if (x1_t > x2_t) or (y1_t > y2_t):
        raise AssertionError(
            "Ground Truth box is malformed? true box: {}".format(gt_box))

    if (x2_t < x1_p or x2_p < x1_t or y2_t < y1_p or y2_p < y1_t):
        return 0.0

    far_x = np.min([x2_t, x2_p])
    near_x = np.max([x1_t, x1_p])
    far_y = np.min([y2_t, y2_p])
    near_y = np.max([y1_t, y1_p])

    inter_area = (far_x - near_x + 1) * (far_y - near_y + 1)
    true_box_area = (x2_t - x1_t + 1) * (y2_t - y1_t + 1)
    pred_box_area = (x2_p - x1_p + 1) * (y2_p - y1_p + 1)
    iou = inter_area / (true_box_area + pred_box_area - inter_area)
    return iou

def calculate_precision_recall(gt_boxes, det_boxes, iou_threshold):
    if det_boxes is None:
        return 0, 0

    # Calculate precision and recall for a set of ground truth and detected bounding boxes
    tp = 0
    fp = 0
    fn = 0

    for det_box in det_boxes:
        max_iou = 0
        max_gt_box = None

        for gt_box in gt_boxes:
            iou = calculate_iou(gt_box, det_box)

            if iou > max_iou:
                max_iou = iou
                max_gt_box = gt_box

        if max_iou >= iou_threshold:
            tp += 1
            gt_boxes = [x for x in gt_boxes if not np.array_equal(x, max_gt_box)]
        else:
            fp += 1

    fn = len(gt_boxes)

    if tp != 0:
        precision = tp / float(tp + fp)
        recall = tp / float(tp + fn)
    else:
        precision = 0
        recall = 0

    return precision, recall

In [19]:
all_precisions = list()
all_recalls = list()
for frame_id in converted_ground_truth_annotations.keys():
    gt_boxes = converted_ground_truth_annotations.get(frame_id).get('detection_boxes')
    det_boxes = all_detections.get(frame_id).get('detection_boxes') if frame_id in all_detections else None
    precision, recall = calculate_precision_recall(gt_boxes, det_boxes, 0.5)
    all_precisions.append(precision)
    all_recalls.append(recall)

print(f'Average precision: {np.mean(all_precisions)}')
print(f'Average recall: {np.mean(all_recalls)}')

Average precision: 0.738977517655109
Average recall: 0.6425215336495264


This first test is about 74% accurate and showed about 64% recall. Let's now combine this with the other 5 videos and ground truth annotations.

In [42]:
INPUT_FILENAME = "/mnt/d/school/csc5651/soc_output/resized_filmrole{}.avi"
ANNOTATION_FILENAME = "/mnt/d/school/csc5651/soc_output/filmrole{}.json"

master_ground_truth_annotations = dict()
master_all_detections = dict()
for i in range(1, 7):
    with open(ANNOTATION_FILENAME.format(i)) as f:
        ground_truth_annotations = json.load(f)

    master_ground_truth_annotations.update({f"{i}_{k}": v for k, v in ground_truth_annotations.items() if v})

    ## Uncomment thru cv2.destroyAllWindows() to run detections on all frames
    # cap = cv2.VideoCapture(INPUT_FILENAME.format(i))

    # j = 0
    # while cap.isOpened():
    #     ret, frame = cap.read()
    #     j += 1
    #     if not ret:
    #         break

    #     if str(j) not in ground_truth_annotations:
    #         continue

    #     input_tensor = tf.convert_to_tensor(frame)
    #     input_tensor = input_tensor[tf.newaxis, ...]
    #     detections = filter_detections(detect_fn(input_tensor), 1, 0.5)
    #     master_all_detections[f"{i}_{j}"] = detections

    # cap.release()
    # cv2.destroyAllWindows()

converted_master_ground_truth_annotations = dict()
for k, v in master_ground_truth_annotations.items():
    converted_boxes = list()
    for box in v:
        converted_boxes.append(convert_ground_truth_box_to_tf_format(box))
    converted_boxes = np.array(converted_boxes)
    converted_master_ground_truth_annotations[k] = {'detection_boxes': converted_boxes}

We'll just quickly compare the number of ground truth annotations we have and the number of detections to make sure most, if not all, of the footage was tested.

In [43]:
len(converted_master_ground_truth_annotations)

15992

In [40]:
len(master_all_detections)

0

Finally, the precision and recall calculations.

In [25]:
all_precisions = list()
all_recalls = list()
for frame_id in converted_master_ground_truth_annotations.keys():
    gt_boxes = converted_master_ground_truth_annotations.get(frame_id).get('detection_boxes')
    det_boxes = master_all_detections.get(frame_id).get('detection_boxes') if frame_id in master_all_detections else None
    precision, recall = calculate_precision_recall(gt_boxes, det_boxes, 0.5)
    all_precisions.append(precision)
    all_recalls.append(recall)

print(f'Average precision over frames in all videos: {np.mean(all_precisions)}')
print(f'Average recall over frames in all videos: {np.mean(all_recalls)}')

Average precision over frames in all videos: 0.6575846106380119
Average recall over frames in all videos: 0.5736950081377786


The second test, this time including _all_ the test footage in the dataset, was 65.8% accurate with a recall of 57.3%. 

## Training the model

In the next stage of the project, we'll split the data into a training set and a testing set, and train a fresh version of the model. We'll then recalculate the precision and recall against the validation set, of course, hoping for better results. But first, we need to dump each frame out to disk and convert the annotations to TFRecord format, which is the format the training API uses.

In [45]:
OUTPUT_DIR = '/mnt/d/school/csc5651/soc_output/training_workspace/images/'
INPUT_FILENAME = "/mnt/d/school/csc5651/soc_output/resized_filmrole{}.avi"

for i in range(1,7):
    cap = cv2.VideoCapture(INPUT_FILENAME.format(i))

    j = 0
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        if f'{i}_{j}' in converted_master_ground_truth_annotations:
            cv2.imwrite(f'{OUTPUT_DIR}{i}_{j}.jpg', frame)

        j += 1

cap.release()
cv2.destroyAllWindows()

We'll use a utility function from the _TensorFlow 2 Object Detection API tutorial_ to partition our ~16,000 image dataset into a training set and testing set.

In [46]:
from partition_dataset import partition_dataset

partition_dataset('/mnt/d/school/csc5651/soc_output/training_workspace/images/', '/mnt/d/school/csc5651/soc_output/training_workspace/', 0.2, False)

Now, let's convert the ground truth annotations for the training and testing datasets into TFRecord files. We'll need one for each training and testing. The TFRecord format is composed of a list of `tf.train.Example`s, each including info about the file (height, width, filename, source ID (we'll use the filename for this field, too), encoded file data, and the file format), and info about the bounding boxes (coordinates, label text, and label ID). Fortunately, this isn't too difficult of a task for our use case, since we can make some assumptions automatically:
* We only have one class (person)
* All of our data is already pre-resized to 1024 X 1024
* Our ground truths dict is already keyed with the same format as the images in the dataset
* Our ground truths dict already contains coordinates in the format we need (x/y-min and x/y-max)

In [47]:
from object_detection.utils import dataset_util
import io
import os

def create_tf_example(filepath):
    with tf.io.gfile.GFile(filepath, 'rb') as fid:
        encoded_jpg = fid.read()

    filename = filepath.split('/')[-1]
    image_format = b'jpg'
    xmins = []
    xmaxs = []
    ymins = []
    ymaxs = []
    classes_text = []
    classes = []

    for box in converted_master_ground_truth_annotations.get(filename.split('.')[0])['detection_boxes']:
        ymins.append(box[0] / 1024.0)
        xmins.append(box[1] / 1024.0)
        ymaxs.append(box[2] / 1024.0)
        xmaxs.append(box[3] / 1024.0)
        classes_text.append(b"person")
        classes.append(1)

    tf_example = tf.train.Example(features=tf.train.Features(feature={
        'image/height': dataset_util.int64_feature(1024),
        'image/width': dataset_util.int64_feature(1024),
        'image/filename': dataset_util.bytes_feature(bytes(filename, 'utf-8')),
        'image/source_id': dataset_util.bytes_feature(bytes(filename, 'utf-8')),
        'image/encoded': dataset_util.bytes_feature(encoded_jpg),
        'image/format': dataset_util.bytes_feature(image_format),
        'image/object/bbox/xmin': dataset_util.float_list_feature(xmins),
        'image/object/bbox/xmax': dataset_util.float_list_feature(xmaxs),
        'image/object/bbox/ymin': dataset_util.float_list_feature(ymins),
        'image/object/bbox/ymax': dataset_util.float_list_feature(ymaxs),
        'image/object/class/text': dataset_util.bytes_list_feature(classes_text),
        'image/object/class/label': dataset_util.int64_list_feature(classes),
    }))
    return tf_example

TESTING_DIR = '/mnt/d/school/csc5651/soc_output/training_workspace/test/'

with tf.io.TFRecordWriter(TESTING_DIR + 'issia_testing.record') as writer:
    for filename in os.listdir(TESTING_DIR):
        if filename.endswith(".jpg"):
            filepath = os.path.join(TESTING_DIR, filename)
            tf_example = create_tf_example(filepath)
            writer.write(tf_example.SerializeToString())

Great! That's our testing TFRecord, and now we just need one for training.

In [48]:
TRAINING_DIR = '/mnt/d/school/csc5651/soc_output/training_workspace/train/'

with tf.io.TFRecordWriter(TRAINING_DIR + 'issia_training.record') as writer:
    for filename in os.listdir(TRAINING_DIR):
        if filename.endswith(".jpg"):
            filepath = os.path.join(TRAINING_DIR, filename)
            tf_example = create_tf_example(filepath)
            writer.write(tf_example.SerializeToString())

I wanted to verify the TFRecord files, so I wrote this function to simply dump out the first 10 results to spot-check against the master dict of ground truths.

In [49]:
def verify_tfrecord(tfrecord_file):
    raw_dataset = tf.data.TFRecordDataset(tfrecord_file)

    for raw_record in raw_dataset.take(10):
        example = tf.train.Example()
        example.ParseFromString(raw_record.numpy())
        print(example)

verify_tfrecord('/mnt/d/school/csc5651/soc_output/training_workspace/train/issia_training.record')

features {
  feature {
    key: "image/encoded"
    value {
      bytes_list {
        value: "\377\330\377\340\000\020JFIF\000\001\001\000\000\001\000\001\000\000\377\333\000C\000\002\001\001\001\001\001\002\001\001\001\002\002\002\002\002\004\003\002\002\002\002\005\004\004\003\004\006\005\006\006\006\005\006\006\006\007\t\010\006\007\t\007\006\006\010\013\010\t\n\n\n\n\n\006\010\013\014\013\n\014\t\n\n\n\377\333\000C\001\002\002\002\002\002\002\005\003\003\005\n\007\006\007\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\377\300\000\021\010\004\000\004\000\003\001\"\000\002\021\001\003\021\001\377\304\000\037\000\000\001\005\001\001\001\001\001\001\000\000\000\000\000\000\000\000\001\002\003\004\005\006\007\010\t\n\013\377\304\000\265\020\000\002\001\003\003\002\004\003\005\005\004\004\000\000\001}\001\002\003\000\004\021\005\022!1A\006\023Qa\007\"q\0242\201\221\241\010#B\261\301\025R\321\360$3br\202\t\n\026\027\030\031\032%&\'()*4

Now that we have TFRecords and a downloaded model, we can prepare a training job. This is done by copying and modifying the `pipeline.config` file that ships with the pre-trained model from the model zoo. For the sake of brevity in this notebook, I'll commit the `pipeline.config` file to this repository for viewing alongside this file. With the TFRecord files and the convenience scripts included in the object detection API package, the actual training process can be kicked off with a simple script run. I did this outside the notebook so I could continue experiments without interrupting the training process, but it was started using the following command inside the activated virtual env:

```
python model_main_tf2.py --model_dir=models/faster_rcnn_inception_resnet_v2_1024x1024_coco17_tpu-8_2/ --pipeline_config_path=models/faster_rcnn_inception_resnet_v2_1024x1024_coco17_tpu-8_2/pipeline.config
```

I've also gone ahead and exported the model using the instructions in the Object Detection API tutorial for use in the next steps.

## Evaluating the custom-trained model

We can now load up the custom-trained model and validate it using the same metrics we used earlier in the notebook. Namely, using IoU to calculate precision and recall.

In [89]:
print('Loading model...', end='')
start_time = time.time()

# Load saved model and build the detection function
detect_fn = tf.saved_model.load('exported_models/custom_trained_faster_rcnn_5/saved_model')

end_time = time.time()
elapsed_time = end_time - start_time
print('Done! Took {} seconds'.format(elapsed_time))

Loading model...Done! Took 62.14924883842468 seconds


And run detections for the testing dataset.

In [105]:
INPUT_DIR = '/mnt/d/school/csc5651/soc_output/training_workspace/test/'

print('Starting inference on test data...', end='')
start_time = time.time()

precisions = list()
recalls = list()
for filename in os.listdir(INPUT_DIR):
    if filename.endswith(".jpg"):
        filepath = os.path.join(INPUT_DIR, filename)
        input_tensor = tf.convert_to_tensor(load_image_into_numpy_array(filepath))
        input_tensor = input_tensor[tf.newaxis, ...]
        det_boxes = filter_detections(detect_fn(input_tensor), 1, 0.5)['detection_boxes']

        gt_boxes = converted_master_ground_truth_annotations.get(filename.split('.')[0])['detection_boxes']

        precision, recall = calculate_precision_recall(gt_boxes, det_boxes, 0.5)
        precisions.append(precision)
        recalls.append(recall)

end_time = time.time()
elapsed_time = end_time - start_time
print('Done! Took {} seconds'.format(elapsed_time))

print(f'Average precision over frames in all videos: {np.mean(precisions)}')
print(f'Average recall over frames in all videos: {np.mean(recalls)}')

Starting inference on test data...Done! Took 1300.7184975147247 seconds
Average precision over frames in all videos: 0.9270361572841177
Average recall over frames in all videos: 0.9560144484808843


In [99]:
converted_master_ground_truth_annotations

{'1_356': {'detection_boxes': array([[301, 738, 397, 760],
         [362,   0, 458,  19],
         [221, 163, 309, 185]])},
 '1_357': {'detection_boxes': array([[366,   0, 456,  17]])},
 '1_358': {'detection_boxes': array([[365,   0, 455,  14]])},
 '1_395': {'detection_boxes': array([[205,  97, 294, 124],
         [280, 712, 376, 732]])},
 '1_396': {'detection_boxes': array([[205,  95, 289, 120],
         [280, 707, 372, 731]])},
 '1_397': {'detection_boxes': array([[204,  94, 289, 115],
         [279, 704, 370, 731]])},
 '1_398': {'detection_boxes': array([[204,  92, 288, 111],
         [280, 704, 371, 731]])},
 '1_399': {'detection_boxes': array([[202,  90, 288, 111],
         [280, 704, 372, 730]])},
 '1_400': {'detection_boxes': array([[205,  87, 288, 110],
         [279, 704, 370, 729]])},
 '1_401': {'detection_boxes': array([[206,  84, 289, 110],
         [279, 704, 371, 729]])},
 '1_402': {'detection_boxes': array([[205,  81, 289, 109],
         [279, 704, 371, 729]])},
 '1_403'

In [91]:
INPUT_FILENAME = "/mnt/d/school/csc5651/soc_output/resized_filmrole1.avi"
ANNOTATION_FILENAME = "/mnt/d/school/csc5651/soc_output/filmrole1.json"

with open(ANNOTATION_FILENAME) as f:
    ground_truth_annotations = json.load(f)

ground_truth_annotations = {k: v for k, v in ground_truth_annotations.items() if v}

cap = cv2.VideoCapture(INPUT_FILENAME)

i = 0
all_detections = dict()
while cap.isOpened():
    ret, frame = cap.read()
    i += 1
    if not ret:
        break

    if str(i) not in ground_truth_annotations:
        continue

    input_tensor = tf.convert_to_tensor(frame)
    input_tensor = input_tensor[tf.newaxis, ...]
    detections = filter_detections(detect_fn(input_tensor), 1, 0.5)
    all_detections[str(i)] = detections
    if len(all_detections) == 100:
        break

cap.release()
cv2.destroyAllWindows()

In [92]:
all_detections

{'356': {'detection_scores': array([0.99888283, 0.9966702 , 0.9931651 ], dtype=float32),
  'detection_boxes': array([[299.91983  , 738.885    , 397.4682   , 760.77527  ],
         [218.98459  , 168.48064  , 308.62347  , 188.38553  ],
         [352.85687  ,   1.2709936, 456.92804  ,  25.177938 ]],
        dtype=float32)},
 '357': {'detection_scores': array([0.99884355, 0.9968863 , 0.983429  ], dtype=float32),
  'detection_boxes': array([[2.9943713e+02, 7.3860339e+02, 3.9777808e+02, 7.6044751e+02],
         [2.1760637e+02, 1.6783559e+02, 3.0797589e+02, 1.8705359e+02],
         [3.5304602e+02, 5.5472291e-01, 4.5793094e+02, 2.4124804e+01]],
        dtype=float32)},
 '358': {'detection_scores': array([0.9988746, 0.9977082, 0.9876364], dtype=float32),
  'detection_boxes': array([[299.46533 , 738.4814  , 398.0597  , 760.0414  ],
         [216.33461 , 166.05736 , 307.01425 , 186.53873 ],
         [349.04874 ,   0.      , 456.48032 ,  22.086952]], dtype=float32)},
 '395': {'detection_scores': a

In [115]:
vid = 4
j = 2699
filepath = f'/mnt/d/school/csc5651/soc_output/training_workspace/images/{vid}_{j}.jpg'
image_with_boxes = cv2.imread(filepath)
# for box in all_detections[str(j)]['detection_boxes']:
#     ymin = int(box[0])
#     xmin = int(box[1])
#     ymax = int(box[2])
#     xmax = int(box[3])
#     cv2.rectangle(image_with_boxes, (xmin, ymin), (xmax, ymax), (0, 0, 255), 2)

input_tensor = tf.convert_to_tensor(load_image_into_numpy_array(filepath))
input_tensor = input_tensor[tf.newaxis, ...]
det_boxes = filter_detections(detect_fn(input_tensor), 1, 0.5)['detection_boxes']

for box in det_boxes:
    ymin = int(box[0])
    xmin = int(box[1])
    ymax = int(box[2])
    xmax = int(box[3])
    cv2.rectangle(image_with_boxes, (xmin, ymin), (xmax, ymax), (0, 0, 255), 2)

for box in converted_master_ground_truth_annotations.get(f'{vid}_{j}')['detection_boxes']:
    ymin = box[0]
    xmin = box[1]
    ymax = box[2]
    xmax = box[3]
    cv2.rectangle(image_with_boxes, (xmin, ymin), (xmax, ymax), (0, 255, 0), 2)
cv2.imwrite(f'/mnt/d/school/csc5651/soc_output/training_workspace/results/{vid}_{j}_detections.jpg', image_with_boxes)

True

In [106]:
converted_master_ground_truth_annotations

{'1_356': {'detection_boxes': array([[301, 738, 397, 760],
         [362,   0, 458,  19],
         [221, 163, 309, 185]])},
 '1_357': {'detection_boxes': array([[366,   0, 456,  17]])},
 '1_358': {'detection_boxes': array([[365,   0, 455,  14]])},
 '1_395': {'detection_boxes': array([[205,  97, 294, 124],
         [280, 712, 376, 732]])},
 '1_396': {'detection_boxes': array([[205,  95, 289, 120],
         [280, 707, 372, 731]])},
 '1_397': {'detection_boxes': array([[204,  94, 289, 115],
         [279, 704, 370, 731]])},
 '1_398': {'detection_boxes': array([[204,  92, 288, 111],
         [280, 704, 371, 731]])},
 '1_399': {'detection_boxes': array([[202,  90, 288, 111],
         [280, 704, 372, 730]])},
 '1_400': {'detection_boxes': array([[205,  87, 288, 110],
         [279, 704, 370, 729]])},
 '1_401': {'detection_boxes': array([[206,  84, 289, 110],
         [279, 704, 371, 729]])},
 '1_402': {'detection_boxes': array([[205,  81, 289, 109],
         [279, 704, 371, 729]])},
 '1_403'

In [72]:
with tf.io.gfile.GFile("/mnt/d/school/csc5651/soc_output/training_workspace/images/1_356.jpg", 'rb') as fid:
    encoded_jpg = fid.read()
encoded_jpg_io = io.BytesIO(encoded_jpg)
image = Image.open(encoded_jpg_io)

width, height = image.size
print(width, height)

1024 1024


In [74]:
def extract_image_and_boxes(tfrecord_file, output_dir):
    raw_dataset = tf.data.TFRecordDataset(tfrecord_file)

    i = 0
    for raw_record in raw_dataset.take(100):
        example = tf.train.Example()
        example.ParseFromString(raw_record.numpy())
        encoded_jpg = example.features.feature['image/encoded'].bytes_list.value[0]
        image = Image.open(io.BytesIO(encoded_jpg))
        image_np = np.array(image)
        width, height = image.size

        xmins = example.features.feature['image/object/bbox/xmin'].float_list.value
        xmaxs = example.features.feature['image/object/bbox/xmax'].float_list.value
        ymins = example.features.feature['image/object/bbox/ymin'].float_list.value
        ymaxs = example.features.feature['image/object/bbox/ymax'].float_list.value

        for xmin, xmax, ymin, ymax in zip(xmins, xmaxs, ymins, ymaxs):
            start_point = (int(xmin * width), int(ymin * height))
            end_point = (int(xmax * width), int(ymax * height))
            color = (0, 255, 0)  # Green color in BGR
            thickness = 2
            image_np = cv2.rectangle(image_np, start_point, end_point, color, thickness)

        cv2.imwrite(output_dir + f"{i}.jpg", image_np)
        i += 1

# Call the function with the paths to your TFRecord file and output image file
# replace '/path/to/your/tfrecord/file' and '/path/to/your/output/image/file' with the actual paths
extract_image_and_boxes('/mnt/d/school/csc5651/soc_output/training_workspace/train/issia_training.record', '/mnt/d/school/csc5651/soc_output/training_workspace/results/')