### Assignment 4 - Question 5: Mean shift Tracking

#### 5.1 Performance Evaluations

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import cv2

In [None]:
def calculate_iou(boxA, boxB):
    """Return the intersection-over-union (IoU) of two axis-aligned boxes.

    Both boxes are indexed as (x_min, y_min, x_max, y_max). Every width and
    height is measured as ``max - min + 1``, i.e. both corner coordinates
    are counted as belonging to the box.
    """
    def _area(x_min, y_min, x_max, y_max):
        # area with the same "+ 1" convention used for the overlap below
        return (x_max - x_min + 1) * (y_max - y_min + 1)

    # corners of the overlap rectangle
    left = max(boxA[0], boxB[0])
    top = max(boxA[1], boxB[1])
    right = min(boxA[2], boxB[2])
    bottom = min(boxA[3], boxB[3])

    # clamp each side at 0 so disjoint boxes yield an empty overlap
    overlap_w = max(0, right - left + 1)
    overlap_h = max(0, bottom - top + 1)
    interArea = overlap_w * overlap_h

    # union = sum of both areas minus the part counted twice
    union = (
        _area(boxA[0], boxA[1], boxA[2], boxA[3])
        + _area(boxB[0], boxB[1], boxB[2], boxB[3])
        - interArea
    )
    return interArea / float(union)


In [None]:
# Mean-shift face tracking driven by a hue histogram. Every tracked window is
# scored against the Viola-Jones (Haar cascade) detection of the same frame
# with calculate_iou. Results are left in the module-level lists `ious`,
# `frames` and `lower_iou_frames` for the analysis cells that follow.
video_path = "./KylianMbappe.mp4"
cap = cv2.VideoCapture(video_path) # video

ious = []              # one IoU per tracked frame (0 when no face is detected)
frames = []            # annotated frames (red = mean shift, green = detector)
lower_iou_frames = []  # annotated frames with 0 < IoU < 0.5
count = 0              # not used by this cell; kept so existing readers still find it

try:
    if not cap.isOpened():
        raise IOError(f'cannot open video: {video_path}')

    # capture one frame
    ret, frame = cap.read()
    if not ret:
        raise IOError(f'cannot read the first frame of: {video_path}')

    # detect a face on the first frame
    face_detector = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
    face_boxes = face_detector.detectMultiScale(frame)

    if len(face_boxes) == 0:
        # raise rather than assert(False): assertions are stripped under -O
        raise RuntimeError('no face detected')

    # initialize the tracking window around the (first) detected face;
    # plain ints keep the window independent of the detector's numpy dtype
    x, y, w, h = (int(v) for v in face_boxes[0])
    track_window = (x, y, w, h)

    ## region of interest for tracking
    roi = frame[y:y+h, x:x+w]

    # convert the roi to HSV so we can construct a histogram of Hue
    hsv_roi = cv2.cvtColor(roi, cv2.COLOR_BGR2HSV)

    # keep only pixels with saturation >= 60 and value >= 32; the hue of
    # dark or washed-out pixels is unreliable (see the description for
    # Figure 3 in the original Cam Shift paper:
    # http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.14.7673)
    mask = cv2.inRange(hsv_roi, np.array((0., 60., 32.)), np.array((180., 255., 255.)))

    # form histogram of hue in the roi
    roi_hist = cv2.calcHist([hsv_roi], [0], mask, [180], [0, 180])

    # normalize the histogram array values so they are in the min=0 to max=255 range
    cv2.normalize(roi_hist, roi_hist, 0, 255, cv2.NORM_MINMAX)

    # termination criteria for mean shift: 10 iteration or shift less than 1 pixel
    term_crit = (cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_COUNT, 10, 1)

    while True:
        # grab a frame
        ret, frame = cap.read()
        if not ret:
            break

        # convert to HSV
        hsv = cv2.cvtColor(frame, cv2.COLOR_BGR2HSV)

        # histogram back projection using roi_hist
        dst = cv2.calcBackProject([hsv], [0], roi_hist, [0, 180], 1)

        # use meanshift to shift the tracking window
        n_iters, track_window = cv2.meanShift(dst, track_window, term_crit)
        x, y, w, h = track_window

        # Run the detector BEFORE anything is drawn: cv2.rectangle paints
        # into `frame`, and a red box over the face would otherwise be part
        # of the image the detector sees.
        face_boxes = face_detector.detectMultiScale(frame)

        # display tracked window
        frame = cv2.rectangle(frame, (x, y), (x+w, y+h), (0, 0, 255), 5)

        # score the tracked window against the best-overlapping detection
        iou = 0
        if len(face_boxes) > 0:
            tracked_box = [x, y, x+w, y+h]
            max_iou = -1
            x2, y2, w2, h2 = 0, 0, 0, 0
            for face_box in face_boxes:
                tx2, ty2, tw2, th2 = (int(v) for v in face_box)
                candidate = calculate_iou(tracked_box, [tx2, ty2, tx2+tw2, ty2+th2])
                if candidate > max_iou:
                    max_iou = candidate
                    x2, y2, w2, h2 = tx2, ty2, tw2, th2
            iou = max_iou
            frame = cv2.rectangle(frame, (x2, y2), (x2+w2, y2+h2), (0, 255, 0), 5)

        frames.append(frame)
        ious.append(iou)
        if 0 < iou < 0.5:
            lower_iou_frames.append(frame)

        cv2.imshow('mean shift tracking demo', frame)

        if cv2.waitKey(33) & 0xFF == 27: # wait a bit and exit is ESC is pressed
            break
finally:
    # always free the window and the capture, even when something above raised
    cv2.destroyAllWindows()
    cap.release()



In [None]:
# IoU over time: the first recorded IoU is left out and the x axis starts at 2
plotted_ious = ious[1:]
frame_count = list(range(2, 2 + len(plotted_ious)))
plt.plot(frame_count, plotted_ious)
plt.ylabel("IoU")
plt.xlabel("Frame #")
plt.show()

In [None]:
# show the frames with the highest and the lowest IoU side by side
# (on ties the earliest frame wins, as with list.index)
max_iou_index = max(range(len(ious)), key=ious.__getitem__)
min_iou_index = min(range(len(ious)), key=ious.__getitem__)

highest_frame = frames[max_iou_index]
lowest_frame = frames[min_iou_index]

fig, axes = plt.subplots(1, 2, figsize=(15, 13))
best_ax, worst_ax = axes

# OpenCV frames are BGR; matplotlib expects RGB
best_ax.imshow(cv2.cvtColor(highest_frame, cv2.COLOR_BGR2RGB))
best_ax.set_title(f'Highest IoU Sample {ious[max_iou_index]}')

worst_ax.imshow(cv2.cvtColor(lowest_frame, cv2.COLOR_BGR2RGB))
worst_ax.set_title(f'Lowest IoU Sample {ious[min_iou_index]}')

In [None]:
# share of tracked frames whose IoU exceeds 50%
ious_arr = np.array(ious)
frames_above_half = int(np.count_nonzero(ious_arr > 0.5))
iou_50 = frames_above_half / len(ious_arr)

print(f'{iou_50*100}% of frames have IoU larger than 50%')

Inspecting the IoU plot above, we notice that only one IoU value is lower than 10%, and it is in fact exactly 0%. To analyse the lower IoUs, we therefore pick the frames whose IoU is smaller than 50%.

In [None]:
# plot up to four frames with 0 < IoU < 50% on a 2x2 grid
print(f'There are {len(lower_iou_frames)} frames that has 0% < accurcy < 50%')
fig, axes = plt.subplots(2, 2, figsize=(13, 10))
panels = axes.ravel()
shown = min(len(panels), len(lower_iou_frames))
fig.suptitle(f'{shown} frames with IoU < 50%', y=0.87, fontsize=12)

# zip stops at the shorter sequence, so fewer than four low-IoU frames no
# longer raises IndexError
for ax, low_frame in zip(panels, lower_iou_frames):
    # OpenCV frames are BGR; matplotlib expects RGB
    ax.imshow(cv2.cvtColor(low_frame, cv2.COLOR_BGR2RGB))

# blank the panels that have no frame to show
for ax in panels[shown:]:
    ax.set_axis_off()

##### Comment:
- Red box: mean shift tracked box. Green box: Viola-Jones detected box
- Comparing the four pairs of boxes, we can see that the green box from the Viola-Jones detector localizes the face more accurately than the red mean shift box.
- Viola-Jones is a trained machine learning approach to detecting faces, while mean shift tracking relies on appearance and motion cues to track objects (we used a hue histogram here). Therefore, mean shift tracking is more sensitive to variations in the environment. For example, Mbappe touches his nose in the video, which changes the appearance inside the tracking window.

#### 5.2 Implement A Simple Variation

In [None]:
# Mean-shift face tracking driven by a histogram of gradient ORIENTATION
# instead of hue. The same feature is computed for the model (face ROI of the
# first frame) and for every later frame, so the back projection compares like
# with like. Results are left in the module-level lists `ious`, `frames`,
# `lower_ious` and `lower_iou_frames` for the analysis cells that follow.


def _gradient_orientation(bgr_image):
    """Return (angle, mask) for a BGR image.

    angle: float32 gradient direction in degrees (0-360), from Sobel
           derivatives of the Gaussian-blurred grayscale image.
    mask:  uint8 image, 255 where the gradient magnitude is at least 10% of
           the image's maximum magnitude and 0 elsewhere; weak gradients have
           a noisy direction and are excluded.
    """
    gray = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2GRAY)
    gray = cv2.GaussianBlur(gray, (5, 5), 1) # gaussian blur
    Ix = cv2.Sobel(gray, cv2.CV_32F, 1, 0, ksize=5)
    Iy = cv2.Sobel(gray, cv2.CV_32F, 0, 1, ksize=5)

    # use the returned arrays: a uint8 buffer cannot represent 0-360 degrees
    magnitude, angle = cv2.cartToPolar(Ix, Iy, angleInDegrees=True)

    strong = magnitude >= 0.1 * magnitude.max()
    mask = np.where(strong, 255, 0).astype(np.uint8)
    return angle, mask


video_path = "./KylianMbappe.mp4"
cap = cv2.VideoCapture(video_path) # video

ious = []              # one IoU per tracked frame (0 when no face is detected)
frames = []            # annotated frames (red = mean shift, green = detector)
lower_ious = []        # IoU values with 0 < IoU < 0.5 ...
lower_iou_frames = []  # ... and the matching annotated frames
count = 0              # not used by this cell; kept so existing readers still find it

try:
    if not cap.isOpened():
        raise IOError(f'cannot open video: {video_path}')

    # capture one frame
    ret, frame = cap.read()
    if not ret:
        raise IOError(f'cannot read the first frame of: {video_path}')

    # detect a face on the first frame
    face_detector = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
    face_boxes = face_detector.detectMultiScale(frame)

    if len(face_boxes) == 0:
        # raise rather than assert(False): assertions are stripped under -O
        raise RuntimeError('no face detected')

    # initialize the tracking window around the (first) detected face;
    # plain ints keep the window independent of the detector's numpy dtype
    x, y, w, h = (int(v) for v in face_boxes[0])
    track_window = (x, y, w, h)

    ## region of interest for tracking
    roi = frame[y:y+h, x:x+w]

    # form histogram of gradient orientation in the roi (24 bins of 15 degrees)
    angle, mask = _gradient_orientation(roi)
    roi_hist = cv2.calcHist([angle], [0], mask, [24], [0, 360])

    # normalize the histogram array values so they are in the min=0 to max=255 range
    cv2.normalize(roi_hist, roi_hist, 0, 255, cv2.NORM_MINMAX)

    # termination criteria for mean shift: 10 iteration or shift less than 1 pixel
    term_crit = (cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_COUNT, 10, 1)

    while True:
        # grab a frame
        ret, frame = cap.read()
        if not ret:
            break

        # gradient orientation of the whole frame: the same feature the
        # model histogram was built from (the magnitude threshold is
        # relative to this frame's own maximum)
        frame_angle, frame_mask = _gradient_orientation(frame)

        # histogram back projection using roi_hist
        dst = cv2.calcBackProject([frame_angle], [0], roi_hist, [0, 360], 1)

        # weak-gradient pixels carry no reliable orientation: give them no weight
        dst[frame_mask == 0] = 0
        # roi_hist is normalized to 0-255, so the back projection fits in 8 bits
        dst = np.clip(dst, 0, 255).astype(np.uint8)

        # use meanshift to shift the tracking window
        n_iters, track_window = cv2.meanShift(dst, track_window, term_crit)
        x, y, w, h = track_window

        # Run the detector BEFORE anything is drawn: cv2.rectangle paints
        # into `frame`, and a red box over the face would otherwise be part
        # of the image the detector sees.
        face_boxes = face_detector.detectMultiScale(frame)

        # display tracked window
        frame = cv2.rectangle(frame, (x, y), (x+w, y+h), (0, 0, 255), 5)

        # score the tracked window against the best-overlapping detection
        iou = 0
        if len(face_boxes) > 0:
            tracked_box = [x, y, x+w, y+h]
            max_iou = -1
            x2, y2, w2, h2 = 0, 0, 0, 0
            for face_box in face_boxes:
                tx2, ty2, tw2, th2 = (int(v) for v in face_box)
                candidate = calculate_iou(tracked_box, [tx2, ty2, tx2+tw2, ty2+th2])
                if candidate > max_iou:
                    max_iou = candidate
                    x2, y2, w2, h2 = tx2, ty2, tw2, th2
            iou = max_iou
            frame = cv2.rectangle(frame, (x2, y2), (x2+w2, y2+h2), (0, 255, 0), 5)

        frames.append(frame)
        ious.append(iou)
        if 0 < iou < 0.5:
            lower_iou_frames.append(frame)
            lower_ious.append(iou)

        cv2.imshow('mean shift tracking demo', frame)

        if cv2.waitKey(33) & 0xFF == 27: # wait a bit and exit is ESC is pressed
            break
finally:
    # always free the window and the capture, even when something above raised
    cv2.destroyAllWindows()
    cap.release()



In [None]:
# IoU over time: the first recorded IoU is left out and the x axis starts at 2
plotted_ious = ious[1:]
frame_count = list(range(2, 2 + len(plotted_ious)))
plt.plot(frame_count, plotted_ious)
plt.ylabel("IoU")
plt.xlabel("Frame #")
plt.show()

In [None]:
# share of tracked frames whose IoU exceeds 70%
ious_arr = np.array(ious)
frames_above_70 = int(np.count_nonzero(ious_arr > 0.7))
iou_higher_70 = frames_above_70 / len(ious_arr)

print(f'{iou_higher_70*100}% of frames have IoU larger than 70%')

In [None]:
# share of tracked frames below 50% IoU, and its complement
ious_arr = np.array(ious)
frames_below_half = int(np.count_nonzero(ious_arr < 0.5))
iou_less_50 = frames_below_half / len(ious_arr)
iou_larger_50 = 1-iou_less_50

for line in (
    f'{iou_less_50*100}% of frames have IoU smaller than 50%',
    f'{iou_larger_50*100}% of frames have IoU larger than 50%',
):
    print(line)

In [None]:
# plot the high/low sample frames: the best-IoU frame on the left and the
# first frame with 0 < IoU < 0.5 on the right
max_iou_index = ious.index(max(ious))
highest_frame = frames[max_iou_index]

fig, axes = plt.subplots(1, 2, figsize=(15, 13))

# OpenCV frames are BGR; matplotlib expects RGB
axes[0].imshow(cv2.cvtColor(highest_frame, cv2.COLOR_BGR2RGB))
axes[0].set_title(f'High IoU Sample {ious[max_iou_index]}')

if lower_iou_frames:
    axes[1].imshow(cv2.cvtColor(lower_iou_frames[0], cv2.COLOR_BGR2RGB))
    axes[1].set_title(f'Low IoU Sample {lower_ious[0]}')
else:
    # no frame fell into 0 < IoU < 0.5: say so instead of raising IndexError
    axes[1].set_axis_off()
    axes[1].set_title('No frame with 0 < IoU < 0.5')