# Real-time, Single-thread

In [1]:
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms.functional as tf
from torchvision import models
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np
import cv2
import json
import time

import face_recognizer
from model import PNet,RNet,ONet,FaceNet

In *face_recognizer.py*, there are three functions:
- `mtcnn`<br>
**input**: pnet, rnet, onet, image<br>
**output**: o_bounding_boxes, pred_ldmk, aligned_faces<br>
Given an image and the mtcnn, it returns the predicted bounding boxes, the landmarks and the cropped faces after alignment.
<br><br>
- `fn`<br>
**input**: facenet, image, embeddings, threshold<br>
**output**: most_likely, min_dist<br>
Given the facenet, its threshold, an image and the embeddings of candidates, it returns the name of the predicted name and the distance between the predicted embedding and the ground truth embedding.
<br><br>
- `decode_face`<br>
**input**: facenet, image, threshold<br>
**output**: decode<br>
Given the facenet, its threshold, and an image, it returns the embedding of that image. With this function, the embedding dictionary can be created.

### Load nets and assign the optimal threshold for facenet

In [2]:
Pnet = PNet().cuda()
Pnet.load_state_dict(torch.load('Pnet.pth'))
Rnet = RNet().cuda()
Rnet.load_state_dict(torch.load('Rnet.pth'))
Onet = ONet().cuda()
Onet.load_state_dict(torch.load('Onet.pth'))
Facenet = FaceNet().cuda()
Facenet.load_state_dict(torch.load('Fnet.pth'))

THRESHOLD = 3.91
PATH = 'The Shawshank Redemption.mp4'

### Dictionary of candidates

In [3]:
frame_count = cv2.CAP_PROP_FRAME_COUNT
frame_fps = cv2.CAP_PROP_FPS
print(frame_count)
print(frame_fps)

andy_img = Image.open('Andy.png')
red_img = Image.open('Red.png')
brooks_img = Image.open('Brooks.png')
heywood_img = Image.open('Heywood.png')
man1_img = Image.open('Man1.png')
man2_img = Image.open('Man2.png')
andy_code = face_recognizer.decode_face(Facenet, andy_img, threshold=THRESHOLD)
red_code = face_recognizer.decode_face(Facenet, red_img, threshold=THRESHOLD)
brooks_code = face_recognizer.decode_face(Facenet, brooks_img, threshold=THRESHOLD)
heywood_code = face_recognizer.decode_face(Facenet, heywood_img, threshold=THRESHOLD)
man1_code = face_recognizer.decode_face(Facenet, man1_img, threshold=THRESHOLD)
man2_code = face_recognizer.decode_face(Facenet, man2_img, threshold=THRESHOLD)

embeddings = {'Andy': andy_code, 'Red': red_code, 'Brooks': brooks_code, 
              'Heywood': heywood_code, 'Man1': man1_code, 'Man2': man2_code}

7
5


### Run
Please note that only processing is carried out here. Code for visualization is in *visualization.ipynb* and has to be run on a local computer.<br>
All the information needed for visualization is stored in a dictionary, which contains three keys: `'face_bbox'`, `'face_ldmk'` and `'face_names'`
The value of each key is a list whose length is equal to the number of frames of the video. For example,
```
statistics['face_bbox'][5] gives all the predicted bounding boxes in the fifth frame of the video
statistics['face_names'][5] gives all the corresponding predicted names in the fifth frame of the video
```
However, when a local computer is equipped with GPU, it is not necessary to pre-record the video and store the dictionary. Please assign `0` to `PATH` to use the camera and umcomment the commented code below and visualize the results right away. 

In [4]:
face_bbox, face_ldmk, face_embd = [], [], []
process_this_frame = True

since = time.time()
video_capture = cv2.VideoCapture(PATH)
# w = int(video_capture.get(3))
# h = int(video_capture.get(4))
# fourcc = cv2.VideoWriter_fourcc(*'DIVX')
# outvideo = cv2.VideoWriter('output-shawshank.mp4', fourcc, 50.0, (w, h))

while (video_capture.isOpened()):
    ret, frame = video_capture.read()
    if not ret:
        break
    small_frame = cv2.resize(frame, (0, 0), fx=0.25, fy=0.25)
    rgb_small_frame = small_frame[:, :, ::-1]

    if process_this_frame:
        o_bounding_boxes, pred_ldmk, aligned_faces = face_recognizer.mtcnn(Pnet, Rnet, Onet, rgb_small_frame)
        if aligned_faces is None:
            face_names = None
        else:
            pred_names = []
            for image in aligned_faces:
                decode, dist = face_recognizer.fn(Facenet, image, embeddings, threshold=THRESHOLD)
                pred_names.append([decode,dist])

        face_bbox.append(o_bounding_boxes)
        face_ldmk.append(pred_ldmk)
        face_names.append(pred_names)
#         process_this_frame = not process_this_frame


# #     visualize
#     if o_bounding_boxes is not None:
#         for (left, top, right, bottom, prob), (name, dist) in zip(o_bounding_boxes, face_names):

#             top, right, bottom, left = int(4*top), int(4*right), int(4*bottom), int(4*left)
#             cv2.rectangle(frame, (left, top), (right, bottom), (0, 0, 255), 2)
#             cv2.rectangle(frame, (left, bottom - 35), (right, bottom), (0, 0, 255), cv2.FILLED)
#             font = cv2.FONT_HERSHEY_DUPLEX
#             cv2.putText(frame, name, (left + 6, bottom - 6), font, 1.0, (255, 255, 255), 1)
    
#     outvideo.write(frame)
#     cv2.imshow('Video', frame)
#     cv2.waitKey(50)

print('exit')
video_capture.release()
spent = time.time() - since
print('Complete in {:.0f}m {:.0f}s'.format(spent // 60, spent % 60))
# outvideo.release()
# cv2.destroyAllWindows()

exit
Complete in 1m 26s


In [5]:
print(len(face_bbox))
print(len(face_ldmk))
print(len(face_names))

478
478
478


### Store the dictionary

In [6]:
# change array to list
for a in [face_bbox, face_ldmk]:
    for i in range(len(a)):
        if isinstance(a[i], np.ndarray):
            a[i] = a[i].tolist()
            
statistics = {'face_bbox': face_bbox, 'face_ldmk': face_ldmk, 'face_names': face_names}
json.dump(statistics, open('realtime-singlethread-shawshank.json', 'w', encoding='utf8'))