In [None]:
import cv2
import sys
from imutils.video import VideoStream
import argparse
import imutils
import time
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as matimg
import dlib
import seaborn as sns
% matplotlib inline

In [None]:
# object detection

# Class labels the MobileNet SSD model was trained to detect. The position of a label in
# this list is the class id reported by the network ("person" is id 15).
CLASSES = ["background", "aeroplane", "bicycle", "bird", "boat",
    "bottle", "bus", "car", "cat", "chair", "cow", "diningtable",
    "dog", "horse", "motorbike", "person", "pottedplant", "sheep",
    "sofa", "train", "tvmonitor"]

# Load the serialized Caffe model from disk. Forward slashes are used as the path
# separator: a backslash followed by "M" is an invalid string escape and the resulting
# path only resolves on Windows, whereas "/" works on every platform.
net = cv2.dnn.readNetFromCaffe("mobilenet_ssd/MobileNetSSD_deploy.prototxt",
                               "mobilenet_ssd/MobileNetSSD_deploy.caffemodel")

This function does the initial tracking. It alternates between object detection (using a pretrained neural network model from https://github.com/Zehaos/MobileNet) and object tracking. Running the deep neural network for object detection is computationally intensive and thus increases processing time, so it is only used occasionally to identify the players and ensure that both are being tracked.

In [None]:
def _trackerFactories():
    # Return (multiTrackerFactory, csrtTrackerFactory) for the installed OpenCV build.
    # Newer OpenCV releases moved MultiTracker (and the trackers it accepts) into the
    # cv2.legacy namespace, so fall back to that when the top-level names are missing.
    if hasattr(cv2, "MultiTracker_create"):
        return cv2.MultiTracker_create, cv2.TrackerCSRT_create
    return cv2.legacy.MultiTracker_create, cv2.legacy.TrackerCSRT_create


def _detectPlayers(frame, W, H, maxPlayers=2):
    # Run the MobileNet SSD detector on frame and return at most maxPlayers person boxes
    # as (x, y, w, h) integer tuples. The maxPlayers most confident "person" detections
    # are kept, listed in the order the network reported them.
    blob = cv2.dnn.blobFromImage(frame, 0.007843, (W, H), 127.5)
    net.setInput(blob)
    # each row: [_, classId, confidence, startX, startY, endX, endY] (corners are relative)
    detections = net.forward()[0, 0]

    personRows = detections[detections[:, 1] == CLASSES.index("person")]
    mostConfident = np.argsort(-personRows[:, 2], kind="stable")[:maxPlayers]

    boxes = []
    for row in personRows[np.sort(mostConfident)]:
        corners = (row[3:7] * np.array([W, H, W, H])).astype("int")
        (startX, startY, endX, endY) = [int(v) for v in corners]
        boxes.append((startX, startY, endX - startX, endY - startY))
    return boxes


def playerTracker(vidFH, skipFrames):
    # INPUTS:
    #  vidFH = file handle for video to analyze
    #  skipFrames = number of frames to run tracker between object detection
    #
    # OUTPUTS:
    #  playerPositions: [frames x players x [X, Y]]. [X, Y] is the position of the bottom
    #   center of the detected player box, approximately at their feet.
    #  playerBoxes: [frames x players x [X, Y, W, H]]. [X, Y] is the top-left corner of the
    #   detected player box. [W, H] is the width and height of the box.
    #  Both arrays hold exactly one row per processed frame. A player that was not found
    #  in a frame is stored as zeros.
    #
    # Each processed frame is displayed in a window; press `q` to stop early.

    createMultiTracker, createCSRT = _trackerFactories()

    nDetected = 0
    totalFrames = 0
    trackers = None
    W = None
    H = None

    # per-frame results are collected in lists and stacked once at the end
    positionsPerFrame = []
    boxesPerFrame = []

    vs = cv2.VideoCapture(vidFH)
    try:
        # loop over frames from the video stream
        while True:
            frame = vs.read()[1]
            if frame is None:
                break

            frame = imutils.resize(frame, width=500)
            if W is None or H is None:
                (H, W) = frame.shape[:2]

            # run detection every skipFrames-th frame or if less than 2 players are known
            if totalFrames % skipFrames == 0 or nDetected < 2:
                status = "Detecting"
                rects = _detectPlayers(frame, W, H)

                # start fresh trackers on the clean frame, before anything is drawn on it
                trackers = createMultiTracker()
                for box in rects:
                    trackers.add(createCSRT(), frame, box)
                nDetected = len(rects)
            else:
                status = "Tracking"
                # update trackers using new video frame
                (success, boxes) = trackers.update(frame)
                rects = [tuple(float(v) for v in box) for box in boxes]
                # a failed update forces a new detection pass on the next frame
                nDetected = len(rects) if success else 0

            # store boxes by player slot (not by raw detection index) and annotate the frame
            frameBoxes = np.zeros([2, 4])
            for iBox, (x, y, w, h) in enumerate(rects[:2]):
                frameBoxes[iBox, :] = [x, y, w, h]
                (xi, yi, wi, hi) = (int(x), int(y), int(w), int(h))
                cv2.rectangle(frame, (xi, yi), (xi + wi, yi + hi), (0, 255, 0), 2)
                cv2.putText(frame, "Player %d" % iBox, (xi + int(wi / 2), yi + int(hi / 2)),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.3, (0, 0, 255), 2)

            # "feet" position: horizontal center of the box, bottom edge
            framePositions = np.column_stack((frameBoxes[:, 0] + frameBoxes[:, 2] / 2,
                                              frameBoxes[:, 1] + frameBoxes[:, 3]))
            positionsPerFrame.append(framePositions)
            boxesPerFrame.append(frameBoxes)

            # draw the status lines on the frame
            info = [
                ("Status", status),
                ("Frame", totalFrames)
            ]
            for (i, (k, v)) in enumerate(info):
                text = "{}: {}".format(k, v)
                cv2.putText(frame, text, (10, H - ((i * 20) + 20)),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 2)

            # show the output frame
            cv2.imshow("Frame", frame)
            key = cv2.waitKey(1) & 0xFF

            # if the `q` key was pressed, break from the loop
            if key == ord("q"):
                break

            totalFrames += 1
    finally:
        # always free the capture and close the preview window, even on error
        vs.release()
        cv2.destroyAllWindows()

    playerPositions = np.array(positionsPerFrame, dtype=float).reshape(-1, 2, 2)
    playerBoxes = np.array(boxesPerFrame, dtype=float).reshape(-1, 2, 4)

    return playerPositions, playerBoxes

The initial tracker doesn't always identify two players. For example, when one player crosses in front of the other, the detection algorithm may only detect a single player or the tracker may drop the occluded player. I'll fix this by performing a linear interpolation where the tracker fails.

In [None]:
def _findDropoutSpans(xTrace):
    # Return a list of (lastGood, firstGood) frame-index pairs, one for every run of
    # missing frames that has a tracked frame on both sides. A frame counts as tracked
    # when its X value is greater than 1 (missing players are stored as zeros). Gaps at
    # the very start or very end of the trace have nothing to interpolate from and are
    # not reported.
    spans = []
    lastGood = None
    for idx, tracked in enumerate(np.asarray(xTrace) > 1):
        if not tracked:
            continue
        if lastGood is not None and idx - lastGood > 1:
            spans.append((lastGood, idx))
        lastGood = idx
    return spans


def fixDropouts(playerPositions):
    # INPUTS:
    #  playerPositions: output from playerTracker
    #
    # OUTPUTS:
    #  playerPositionsFixed: same as playerPositions with missing data interpolated.
    #   X and Y are interpolated linearly between the last tracked frame before a gap
    #   and the first tracked frame after it. The input array is not modified.

    playerPositionsFixed = np.array(playerPositions, dtype=float)
    for i in range(playerPositionsFixed.shape[1]):
        # each dropout is paired with the first return that follows it
        for (lastGood, firstGood) in _findDropoutSpans(playerPositionsFixed[:, i, 0]):
            nPoints = firstGood - lastGood + 1
            for col in (0, 1):
                playerPositionsFixed[lastGood:firstGood + 1, i, col] = np.linspace(
                    playerPositionsFixed[lastGood, i, col],
                    playerPositionsFixed[firstGood, i, col],
                    nPoints)

    return playerPositionsFixed

def fixDropouts_Box(playerBoxes):
    # INPUTS:
    #  playerBoxes: output from playerTracker
    #
    # OUTPUTS:
    #  playerBoxesFixed: same as playerBoxes with missing data interpolated. The box
    #   X and Y are interpolated linearly across each gap; the missing frames take the
    #   width and height of the last tracked box before the gap. The tracked frames on
    #   either side of a gap keep their own sizes. The input array is not modified.

    playerBoxesFixed = np.array(playerBoxes, dtype=float)
    # fix dropout--replace with linear map
    for i in range(playerBoxesFixed.shape[1]):
        for (lastGood, firstGood) in _findDropoutSpans(playerBoxesFixed[:, i, 0]):
            nPoints = firstGood - lastGood + 1
            for col in (0, 1):
                playerBoxesFixed[lastGood:firstGood + 1, i, col] = np.linspace(
                    playerBoxesFixed[lastGood, i, col],
                    playerBoxesFixed[firstGood, i, col],
                    nPoints)
            # only the missing frames receive a size; it comes from the last tracked box
            playerBoxesFixed[lastGood + 1:firstGood, i, 2:4] = playerBoxesFixed[lastGood, i, 2:4]

    return playerBoxesFixed

In addition to dropping players, the tracker often switches the labels of the two players. The next function sets the positions and boxes so that playerPositions[:,i,:] corresponds consistently to player i. It does this by setting each player's identity based on the histograms for the R, G, and B channels in the initial detection boxes. This works best when the players are wearing different color shirts.

In [None]:
def _channelHistograms(frame, box):
    # Return the per-channel 256-bin histograms (3 x 256 array) of the pixels inside
    # box = [X, Y, W, H], normalized by the number of pixels so that crops of different
    # sizes are comparable. The box is clamped to the frame. Returns None when the
    # clamped box is empty (e.g. the player was not found in this frame).
    frameH, frameW = frame.shape[:2]
    x0 = int(np.clip(box[0], 0, frameW))
    y0 = int(np.clip(box[1], 0, frameH))
    x1 = int(np.clip(box[0] + box[2], 0, frameW))
    y1 = int(np.clip(box[1] + box[3], 0, frameH))
    crop = frame[y0:y1, x0:x1]
    if crop.size == 0:
        return None
    counts = [np.histogram(crop[:, :, c].ravel(), 256, [0, 256])[0] for c in range(3)]
    return np.asarray(counts, dtype=float) / (crop.shape[0] * crop.shape[1])


def _histogramDistance(histA, histB):
    # Mean over the three color channels of the Euclidean distance between histograms.
    return np.mean(np.linalg.norm(histA - histB, axis=1))


def identifyPlayer(playerPositions, playerBoxes, vidFH):
    # INPUTS:
    #  playerPositions: output from playerTracker or fixDropouts
    #  playerBoxes: output from playerTracker or fixDropouts_Box
    #  vidFH: file handle for video that was analyzed
    #
    # OUTPUTS:
    #  playerPositionsFixed: identity-corrected positions
    #
    # The first frame in which both player boxes are non-empty defines the reference
    # appearance of player 0 and player 1. In every later frame the two slots are swapped
    # when the swapped assignment matches the reference color histograms better than the
    # stored one. Frames in which neither box is usable are left unchanged.
    #
    # Raises IOError if the video cannot be opened.

    playerPositionsFixed = playerPositions.copy()
    pB = playerBoxes
    nFrames = min(len(playerPositions), len(pB))

    vs = cv2.VideoCapture(vidFH)
    try:
        if not vs.isOpened():
            raise IOError("Could not open video: %s" % vidFH)

        references = None
        for frameIdx in range(nFrames):
            frame = vs.read()[1]
            if frame is None:
                break
            frame = imutils.resize(frame, width=500)

            current = [_channelHistograms(frame, pB[frameIdx, slot]) for slot in (0, 1)]

            if references is None:
                # this frame only serves as the reference; its labels define the identities
                if current[0] is not None and current[1] is not None:
                    references = current
                continue

            if current[0] is None and current[1] is None:
                continue

            # cost of keeping the stored labels vs. swapping them, over the usable boxes
            keepCost = 0.0
            swapCost = 0.0
            for slot in (0, 1):
                if current[slot] is None:
                    continue
                keepCost += _histogramDistance(current[slot], references[slot])
                swapCost += _histogramDistance(current[slot], references[1 - slot])

            if swapCost < keepCost:
                playerPositionsFixed[frameIdx, 0, :] = playerPositions[frameIdx, 1, :]
                playerPositionsFixed[frameIdx, 1, :] = playerPositions[frameIdx, 0, :]
    finally:
        vs.release()
        cv2.destroyAllWindows()

    return playerPositionsFixed



The next function plots a heatmap of the player positions on an image from the video. This can be used to show the locations on the court that the players tend to visit.

The function can plot the data for a single player at a time using playerPositions[:,i,:] or both players together using simply playerPositions.

In [None]:
def heatMapPositions(vidFH, playerPositions, frameNumber=200):
    # INPUTS:
    #  vidFH: file handle for video that was analyzed
    #  playerPositions: output from playerTracker or fixDropouts. Can be [time x [X , Y]] or
    #   [time x players x [X, Y]]
    #  frameNumber: frame for image that is used as the background image. Defaults to 200.
    #   If the video is shorter, its last readable frame is used instead.
    #
    # OUTPUTS:
    #  None (the figure is shown with matplotlib)
    #
    # Raises IOError if no frame can be read from the video.

    playerPositions = np.asarray(playerPositions)

    # get background image to overlay heatmap on
    frame = None
    vs = cv2.VideoCapture(vidFH)
    try:
        for _ in range(max(int(frameNumber), 1)):
            nextFrame = vs.read()[1]
            if nextFrame is None:
                break
            frame = nextFrame
    finally:
        vs.release()

    if frame is None:
        raise IOError("Could not read a background frame from video: %s" % vidFH)

    # only the frame that is actually displayed needs resizing; OpenCV delivers BGR
    # while matplotlib expects RGB
    frame = imutils.resize(frame, width=500)
    background = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    (imgH, imgW) = background.shape[:2]

    # pool the samples of all players when a 3-D array is given
    if playerPositions.ndim == 2:
        tmpX = playerPositions[:, 0]
        tmpY = playerPositions[:, 1]
    else:
        tmpX = playerPositions[:, :, 0].ravel()
        tmpY = playerPositions[:, :, 1].ravel()

    # compute 2d histogram from location data: one bin per 10 pixels, width for X and
    # height for Y
    playerLocs, xedges, yedges = np.histogram2d(tmpX, tmpY, bins=[int(imgW / 10), int(imgH / 10)])

    # plot over background image; histogram2d puts X on the first axis, imshow expects rows = Y
    playerLocs = playerLocs.T
    fig = plt.figure(figsize=(7, 3))
    ax = fig.add_subplot(111)
    ax.imshow(playerLocs, interpolation='gaussian', origin='lower',
              extent=[xedges[0], xedges[-1], yedges[0], yedges[-1]],
              alpha=1)
    ax.imshow(background, alpha=0.5)
    plt.show()

This final function makes a video with the tracker location plotted for each player.

In [None]:
def makeVideo(vidFH, playerPositions, vidOut):
    # INPUTS:
    #  vidFH: file handle for video that was analyzed
    #  playerPositions: output from playerTracker/fixDropouts/identifyPlayer,
    #   [frames x players x [X, Y]]
    #  vidOut: file name for output video. ".avi" is automatically added so should not be
    #   included in vidOut
    #
    # OUTPUTS:
    #  None (writes <vidOut>.avi and shows each frame; press `q` to stop early)
    #
    # The output uses the frame rate reported by the source video, or 30 fps when the
    # source does not report a usable value. Frames beyond the end of playerPositions are
    # written without markers.
    #
    # Raises IOError if the video cannot be opened.

    playerPositions = np.asarray(playerPositions)
    nFrames = playerPositions.shape[0]
    nPlayers = playerPositions.shape[1]

    vs = cv2.VideoCapture(vidFH)
    out = None
    try:
        if not vs.isOpened():
            raise IOError("Could not open video: %s" % vidFH)

        fps = vs.get(cv2.CAP_PROP_FPS)
        if not np.isfinite(fps) or fps <= 0:
            fps = 30.0

        totalFrames = 0

        # draw player positions over frames
        while True:
            frame = vs.read()[1]
            if frame is None:
                break

            frame = imutils.resize(frame, width=500)

            # the writer is created from the first resized frame so its size always matches
            if out is None:
                (frame_height, frame_width) = frame.shape[:2]
                fourcc = cv2.VideoWriter_fourcc('X','V','I','D')
                fhOut = '%s.avi' % vidOut
                out = cv2.VideoWriter(fhOut, fourcc, fps, (frame_width, frame_height))

            if totalFrames < nFrames:
                for iPlayer in range(nPlayers):
                    X = int(playerPositions[totalFrames, iPlayer, 0])
                    Y = int(playerPositions[totalFrames, iPlayer, 1])
                    # small square marker at the player position
                    cv2.rectangle(frame, (X-1, Y-1), (X+1, Y+1), (0, 255, 0), 2)

            out.write(frame)

            # show the output frame
            cv2.imshow("Frame", frame)
            key = cv2.waitKey(1) & 0xFF

            # if the `q` key was pressed, break from the loop
            if key == ord("q"):
                break

            totalFrames += 1
    finally:
        # release the reader and writer and close the preview window, even on error
        vs.release()
        if out is not None:
            out.release()
        cv2.destroyAllWindows()

In [None]:
# Input clip, output name (makeVideo appends ".avi") and the number of frames tracked
# between two detection passes.
vidFH, vidOut, skipFrames = "ProSquash_Cropped.mp4", "ProSquash_Tracked", 10

# 1. Track both players through the clip.
playerPositions, playerBoxes = playerTracker(vidFH=vidFH, skipFrames=skipFrames)

# 2. Interpolate over frames where a player was lost.
playerPositionsFixed = fixDropouts(playerPositions)
playerBoxesFixed = fixDropouts_Box(playerBoxes)

# 3. Make the player labels consistent over time.
playerPositionsFixed = identifyPlayer(
    playerPositionsFixed,
    playerBoxesFixed,
    vidFH,
)

# 4. One heat map per player, then the annotated video.
heatMapPositions(vidFH, playerPositionsFixed[:, 0, :], frameNumber=200)
heatMapPositions(vidFH, playerPositionsFixed[:, 1, :], frameNumber=200)
makeVideo(vidFH, playerPositionsFixed, vidOut)

print("DONE")