# Pedestrian tracking with YOLOv3 object detection model and SORT tracking algorithm


## Overview
This notebook implements a two-step (detect-then-track) pedestrian tracking service. Tasks include:
- 1. Import input video and extract static frames
- 2. For each frame, use the YOLOv3 model to detect people, and send the detected bounding boxes to the SORT (Simple Online and Realtime Tracking) algorithm, which tracks each box and associates it with a person ID.
- 3. Combine processed frames to create output video with boxes and IDs.
- 4. (to be implemented) Export the trajectory of the bottom-center point of each bounding box, as the movement trajectory of each person.

Notes:
- In this notebook, a pretrained YOLOv3 model is deployed on an AWS SageMaker endpoint for testing purposes. In production, the trained YOLOv3 model will be included in a Docker image and will make predictions within an AWS ECS or Fargate service.

## 1) Setup environment


In [None]:
# install tools and packages
# - the leading "!" runs the line as a shell command from the notebook
# - "-y" answers conda's confirmation prompt automatically
!conda install ffmpeg -y  # for generating output video from output frames

In [None]:
import os, datetime, json, math, shutil, tarfile, random
import os.path as osp
import subprocess as sb
from IPython.display import clear_output

import gluoncv
from gluoncv import model_zoo, data, utils
from gluoncv.utils.viz import plot_image
import mxnet
from mxnet import gluon, image, nd

import boto3

import sagemaker
from sagemaker import get_execution_role
from sagemaker.mxnet.model import MXNetModel

import cv2
from PIL import Image

# Location of the pre-trained YOLOv3 artifact on S3: bucket name and key prefix
bucket, s3key_model = 'pedestrian-tracker', 'detection-artifact'

# AWS handles shared by the rest of the notebook
sess = sagemaker.Session()
s3 = boto3.client('s3')

## 2) Import YOLOv3 model from gluoncv model zoo and save to S3
- gluoncv is built on the mxnet framework and provides state-of-the-art (SOTA) deep learning algorithms for computer vision.
- The YOLOv3 model is imported, configured, and saved to AWS S3 for later use.

In [None]:
# Detector to pull from the gluoncv model zoo, with pre-trained weights
model_name = 'yolo3_darknet53_coco'
net = model_zoo.get_model(model_name, pretrained=True)

# Restrict the detector to the single "person" class, reusing the
# pre-trained weights of that class
classes = ['person']
net.reset_class(classes, reuse_weights=classes)

# switch to declarative execution to optimize computation
net.hybridize()

In [None]:
# save the full model (both weights and graph)
# - the two files added to the archive below are the ones this export writes
net.export(model_name, epoch=0)

# compress
# - the context manager finalizes and closes the archive even if adding a
#   file fails, so a half-written handle is never left open
packname = 'YOLOv3-darknet53-coco-model.tar.gz'
with tarfile.open(packname, 'w:gz') as tar:
    tar.add('{}-symbol.json'.format(model_name))
    tar.add('{}-0000.params'.format(model_name))

# send to S3 under <bucket>/<s3key_model>/<packname>
s3.upload_file(packname, bucket, s3key_model + '/' + packname)

## 3) Deploy YOLOv3 model on a SageMaker endpoint
Steps:
-  1. Prepare requirements.txt and detection_server.py
-  2. Instantiate model and deploy onto an endpoint

In [None]:
# Prepare script in the 'repo' directory
! mkdir repo

In [None]:
%%writefile repo/requirements.txt
# create a requirements.txt to add an extra dependency to the SageMaker MXNet container
gluoncv==0.6.0

In [None]:
%%writefile repo/detection_server.py
# create a detection_server.py 

import argparse
import ast
import logging
import os

from gluoncv import model_zoo, data, utils
import mxnet as mx
from mxnet import nd, gluon

def get_ctx():
    """Return the preferred usable MXNet context.

    Candidates are tried in order: ``mx.gpu()``, then ``mx.eia()``. Each one
    is probed by allocating a one-element array on it; the first candidate
    whose probe succeeds is returned. If both fail, ``mx.cpu()`` is returned.

    :return: an MXNet context object
    """
    # Looked up by name so that an MXNet build lacking one of the factories
    # (AttributeError) is skipped exactly like a failed allocation.
    for device in ('gpu', 'eia'):
        try:
            ctx = getattr(mx, device)()
            _ = mx.nd.array([0], ctx=ctx)
        except Exception:
            # Narrower than a bare ``except``: KeyboardInterrupt and
            # SystemExit are no longer swallowed while probing.
            continue
        return ctx
    return mx.cpu()


def model_fn(model_dir):
    """
    Load the gluon model. Called once when hosting service starts.

    :param model_dir: The directory where model files are stored.
    :return: a model (in this case a Gluon network)
    :raises FileNotFoundError: if ``model_dir`` contains no file ending in
        'json' or no file ending in 'params'

    Assumes the symbol file ends in 'json' and the parameters artifact ends
    in 'params'; when several files match, the first in sorted order is used.
    """

    ctx = get_ctx()
    logging.info('Using ctx {}'.format(ctx))

    # Look in model_dir (not the process working directory) so loading does
    # not depend on where the hosting service happens to be started from.
    dir_content = sorted(os.listdir(model_dir))
    logging.info('Dir content {}'.format(dir_content))

    def first_with_suffix(suffix):
        """Return the full path of the first file in model_dir ending in suffix."""
        for fname in dir_content:
            if fname.endswith(suffix):
                return os.path.join(model_dir, fname)
        raise FileNotFoundError(
            'No file ending in {!r} found in {!r}'.format(suffix, model_dir))

    # rebuild the network from the exported graph and weights
    net = gluon.nn.SymbolBlock.imports(
        symbol_file=first_with_suffix('json'),
        input_names=['data'],
        param_file=first_with_suffix('params'),
        ctx=ctx)

    return net


def input_fn(request_body, request_content_type):
    """Decode an encoded image and apply the YOLO test-time transform.

    :param request_body: encoded image bytes, passed to ``mx.image.imdecode``
    :param request_content_type: content type of the request; not inspected
        by this function
    :return: the transformed array produced by ``transform_test``
    """

    im_array = mx.image.imdecode(request_body)

    # Run YOLO pre-processing on CPU
    x, _ = data.transforms.presets.yolo.transform_test(im_array)

    # Log the shape of what is actually returned (the transformed array),
    # not the shape of the raw decoded image.
    logging.info('input_fn returns NDArray of shape ' + str(x.shape))

    return x


def predict_fn(input_object, model):
    """Run the model on one prepared input and pack its three outputs.

    :param input_object: array to feed to the model
    :param model: callable network returning (box ids, scores, boxes)
    :return: the three outputs concatenated along axis 2
    """

    device = get_ctx()
    logging.info('Using ctx {}'.format(device))

    # copy the input to the selected context, then run the forward pass
    batch = input_object.as_in_context(device)
    class_ids, confidences, boxes = model(batch)

    # return a single tensor
    packed = nd.concat(class_ids, confidences, boxes, dim=2)
    return packed

In [None]:
# Instantiate model
# - model_data must point at the archive uploaded earlier, i.e.
#   s3://<bucket>/<s3key_model>/<packname>; the previous `s3key` name was
#   never defined and raised NameError
model = MXNetModel(
    model_data='s3://{}/{}/{}'.format(bucket, s3key_model, packname),
    role=get_execution_role(),
    py_version='py3',
    entry_point='detection_server.py',
    source_dir='repo',
    framework_version='1.6.0')

In [None]:
# Build the SageMaker endpoint name, then deploy
# - name = model name + '-detection' ('_' replaced by '-', '.' removed),
#   followed by '-' and a timestamp
endpoint_prefix = (model_name + '-detection').replace('_', '-').replace('.', '')
deploy_stamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
endpoint_key = endpoint_prefix + '-' + deploy_stamp

# this may take 5 to 10min
# - instance_type selects the EC2 instance type hosting the endpoint
model.deploy(
    endpoint_name=endpoint_key,
    instance_type='ml.m5.xlarge',
    initial_instance_count=1)

## 4) Implement SORT object tracking algorithm
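Steps of SORT (applied to every frame)
- 1. Predict the new location of each existing track's bounding box with a Kalman filter.
- 2. Match the detected boxes to the predicted boxes by intersection-over-union (IOU), using the Hungarian algorithm.
- 3. Update matched tracks with their detections, start new tracks for unmatched detections, and drop tracks that stay unmatched for too many frames.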

In [None]:
# download sort.py of the SORT algorithm from source
# NOTE(review): the URL points at the 'master' branch, so the downloaded file
#   can differ between runs — consider pinning a specific commit.
!wget 'https://raw.githubusercontent.com/abewley/sort/master/sort.py'

In [5]:
# download input video data from s3 to local disk
#  - assume that raw input video file is stored in s3://{bucket}/raw-data
s3key_input_video = 'raw-data'
video_fname = 'shopping-mall2'
video_ftype = 'mp4'
full_fname = '{}.{}'.format(video_fname, video_ftype)

# make sure the local target directory exists before downloading into it
os.makedirs('video', exist_ok=True)
s3.download_file(bucket, s3key_input_video + '/' + full_fname, 'video/' + full_fname)

In [None]:
# Instantiate a SageMaker predictor using the endpoint in service
# - endpoint_key is the endpoint name the model was deployed under
# NOTE(review): whether `content_type` is accepted by Predictor depends on the
#   installed SageMaker SDK version — confirm against the SDK documentation.
predictor = sagemaker.predictor.Predictor(
    endpoint_name=endpoint_key,
    content_type='image/jpeg')

In [None]:
# function for calling the prediction endpoint
# - send a single video frame to endpoint and get object detection results
def detect(pic, predictor, input_type):
    """Send one picture to a predictor and split the returned detections.

    :param pic: path of an image file when ``input_type`` is 'url', or the
        already-encoded image bytes when ``input_type`` is 'byte'
    :param predictor: object whose ``predict(payload)`` returns a JSON
        document that decodes to a 3-dimensional nested list
    :param input_type: 'url' or 'byte'
    :return: tuple ``(box_ids, scores, bboxes)`` taken from the last axis of
        the response: index 0, index 1, and indices 2 onward
    :raises ValueError: if ``input_type`` is neither 'url' nor 'byte'
    """
    if input_type == 'url':
        with open(pic, 'rb') as image_file:
            payload = image_file.read()
    elif input_type == 'byte':
        payload = pic
    else:
        # Previously an unknown value fell through and failed later with an
        # unrelated "local variable referenced before assignment" error.
        raise ValueError(
            "input_type must be 'url' or 'byte', got {!r}".format(input_type))

    tensor = nd.array(json.loads(predictor.predict(payload)))
    box_ids, scores, bboxes = tensor[:, :, 0], tensor[:, :, 1], tensor[:, :, 2:]
    return box_ids, scores, bboxes

In [None]:
videopath = 'video/'+ full_fname
%pylab inline 

cmap = plt.get_cmap('tab20b')
colors = [cmap(i)[:3] for i in np.linspace(0, 1, 20)]

# get video parameters
vid = cv2.VideoCapture(videopath) 
vid_width = vid.get(cv2.CAP_PROP_FRAME_WIDTH)
vid_height = vid.get(cv2.CAP_PROP_FRAME_HEIGHT)
vid_FPS = vid.get(cv2.cv2.CAP_PROP_FPS)
vid_n_frame = int(vid.get(cv2.cv2.CAP_PROP_FRAME_COUNT))
vid_pos_frame = vid.get(cv2.cv2.CAP_PROP_POS_FRAMES)

# initialize Sort object and set hyperparameters
from sort import *
mot_tracker = Sort(max_age = 20,  # Maximum number of frames to keep alive a track without associated detections
                   min_hits = 3,  # Minimum number of associated detections before track is initialised.
                   iou_threshold = 0.4)  # Minimum intersection-over-union (IOU) for calling a match

In [None]:
# Iterate over individual video frames, update Sort object for object tracking
import io

# set prediction score threshold below which a bounding box is removed
viz_tracking_prob_threshold = 0.75

# Plotting parameters (identical for every frame, so set once)
labels = None
class_names = classes
linewidth = 1.5
fontsize = 10

# Working locations, created up front so that saving the temporary frame and
# the annotated frames does not fail on a missing directory
temp_frame_path = 'temp/temp.png'
output_folder = 'outputs/SORT-{}'.format(video_fname)
os.makedirs('temp', exist_ok=True)
os.makedirs(output_folder, exist_ok=True)

try:
    for ii in range(vid_n_frame):
        ret, frame = vid.read()
        if not ret:
            # the reported frame count may exceed the frames that can be
            # decoded; stop cleanly instead of passing None to cvtColor
            print('Frame number {} could not be read; stopping.'.format(ii))
            break

        # get current frame as an RGB image and encode it once as PNG; the
        # same bytes are sent to the endpoint and written to disk
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        pilimg = Image.fromarray(frame)
        temp = io.BytesIO()
        pilimg.save(temp, format='PNG')
        img_byte_arr = temp.getvalue()
        with open(temp_frame_path, 'wb') as temp_file:
            temp_file.write(img_byte_arr)  # save the current video frame

        # run object detection on current frame image, by calling the Sagemaker endpoint
        detections = detect(img_byte_arr, predictor, input_type='byte')

        # get image frame for later adding tracking results
        _, orig_img = data.transforms.presets.yolo.load_test(temp_frame_path)

        # Extract bounding boxes and prediction scores
        #  - remove boxes that have lower prediction scores than the specified threshold
        #  - filtering is done on numpy copies so that a frame with no box
        #    above the threshold yields empty (0, 4) / (0, 1) arrays
        frame_scores = detections[1][0].asnumpy()
        frame_bboxes = detections[2][0].asnumpy()
        idx_positive = np.argwhere(frame_scores > viz_tracking_prob_threshold).flatten()
        bboxes = frame_bboxes[idx_positive]
        scores = frame_scores[idx_positive].reshape(len(idx_positive), 1)
        det = np.concatenate([bboxes, scores], axis=1)

        # Send the location of bounding boxes and their prediction scores to Sort tracker
        tracked_objects = mot_tracker.update(det)

        # Plot and save the tracking outputs
        # - add bounding boxes and object ID to the current frame image
        ax = plot_image(orig_img, ax=None, reverse_rgb=None)
        if len(idx_positive) > 0:
            # use random colors if None is provided
            if colors is None:
                colors = dict()
            for i, bbox_and_id in enumerate(tracked_objects):
                # labels is None above, so cls_id is -1 for every box and all
                # boxes end up sharing the colour stored under that key
                # NOTE(review): consider colouring by track ID instead
                cls_id = int(labels.flat[i]) if labels is not None else -1
                if cls_id not in colors:
                    if class_names is not None:
                        colors[cls_id] = plt.get_cmap('hsv')(cls_id / len(class_names))
                    else:
                        colors[cls_id] = (random.random(), random.random(), random.random())
                xmin, ymin, xmax, ymax = [int(x) for x in bbox_and_id[0:4]]
                rect = plt.Rectangle((xmin, ymin), xmax - xmin,
                                     ymax - ymin, fill=False,
                                     edgecolor=colors[cls_id],
                                     linewidth=linewidth)
                ax.add_patch(rect)

                # label the box with its track ID (fifth value of each row);
                # the former guard referenced an undefined name `score`
                objid = int(bbox_and_id[4])
                ax.text(xmin, ymin - 2,
                        '{}'.format(objid),
                        fontsize=fontsize, color=colors[cls_id])

        # Save the annotated frame image
        fname = 'SORT-{}-frame-{}.jpg'.format(video_fname, str(ii).zfill(5))
        plt.axis('off')
        plt.tight_layout()
        plt.savefig(os.path.join(output_folder, fname), dpi=200, bbox_inches='tight', pad_inches=0.0)
        plt.close()
        print('Frame number {} is processed.'.format(str(ii)))
finally:
    # release the capture even when a frame fails part-way through
    vid.release()
    cv2.destroyAllWindows()

In [None]:
# Combine annotated frames into an output video
#  - use ffmpeg
output_fname = 'SORT-{}-results.mp4'.format(video_fname)
output_video_path = osp.join('outputs/videos', output_fname)

# make sure the target directory exists before ffmpeg writes into it
os.makedirs(osp.dirname(output_video_path), exist_ok=True)

# numbered input frames: <output_folder>/SORT-<video>-frame-00000.jpg, ...
frame_pattern = output_folder + '/SORT-' + video_fname + '-frame-%05d.jpg'

# The command is passed as an argument list (no shell), so paths containing
# spaces or shell metacharacters are handled safely.
ffmpeg_cmd = ['ffmpeg', '-f', 'image2']
if vid_FPS and vid_FPS > 0:
    # play the frames back at the source video's frame rate
    ffmpeg_cmd += ['-framerate', str(vid_FPS)]
ffmpeg_cmd += ['-i', frame_pattern,
               '-b:v', '5000k',  # video bitrate ('-b' alone is ambiguous)
               '-c:v', 'mpeg4',
               output_video_path]

# check=True raises CalledProcessError when ffmpeg fails instead of silently
# ignoring the exit code; stdin is detached so ffmpeg cannot block on a prompt
# (e.g. when the output file already exists).
sb.run(ffmpeg_cmd, check=True, stdin=sb.DEVNULL)

In [None]:
# Push the rendered video to S3 under <bucket>/outputs/<output_fname>
s3key_outputs = 'outputs'
output_s3_key = '{}/{}'.format(s3key_outputs, output_fname)
s3.upload_file(output_video_path, bucket, output_s3_key)