Efficient Net Solution

- Inference on up to 60 evenly spaced frames per video (see `VIDEO_GRAB_FRAMES`)

In [None]:
import os

# Make sure the torch.hub cache directory exists before linking into it.
os.makedirs("/root/.cache/torch/hub", exist_ok=True)

# Drop a link left over from a previous run so that os.symlink below does not
# fail with FileExistsError. Only "nothing to remove" is an expected outcome;
# any other failure should surface instead of being silently swallowed.
try:
    os.remove("/root/.cache/torch/hub/rwightman_gen-efficientnet-pytorch_master")
except FileNotFoundError:
    pass

# Expose the offline copy of the repository under the name torch.hub looks up,
# so torch.hub.load() can resolve it from the local cache.
os.symlink("/kaggle/input/rwightman/rwightman_gen-efficientnet-pytorch_master",
           "/root/.cache/torch/hub/rwightman_gen-efficientnet-pytorch_master")

import sys
sys.path.insert(0, "/kaggle/input/retinaface")
sys.path.insert(0, "/kaggle/input/efnetensembles8")

import numpy as np
import pandas as pd
import cv2
import torch
import torch.backends.cudnn as cudnn
import torchvision
from torch import nn
from torchvision import transforms
from torchvision.ops import nms
from data import cfg_mnet, cfg_re50
from layers.functions.prior_box import PriorBox
from utils.nms.py_cpu_nms import py_cpu_nms
from models.retinaface import RetinaFace
from utils.box_utils import decode, decode_landm
from utils.timer import Timer
from glob import glob
from math import ceil
from os.path import basename
from tqdm.notebook import tqdm
from time import time
from PIL import Image

# NOTE(review): GPU_ID is not referenced in the visible code; TORCH_DEVICE is what selects the device.
GPU_ID = 0

# Number of evenly spaced frame indices sampled from each video.
VIDEO_GRAB_FRAMES = 60
# Frames whose longer side exceeds this many pixels are downscaled before face detection.
VIDEO_MAX_SIZE = 960

# Threshold passed to torchvision's nms() when de-duplicating face boxes.
FACE_NMS_THRESHOLD = 0.4
# Detector score a box must exceed to be kept as a face.
FACE_CONFIDENCE_THRESHOLD = 0.99
# Enlargement factor applied to the square face crop.
FACE_SCALE = 1.1

# Fallback predictions for videos in which no face was found. Both are currently the neutral 0.5.
NO_FACES_BUT_TOTAL_FRAMES_IN_VIDEO_PROBA = 0.5 # Used when all VIDEO_GRAB_FRAMES frames were read
NO_FACES_BUT_NOT_TOTAL_FRAMES_IN_VIDEO_PROBA = 0.5 # Used when fewer frames were read; no idea


DEFAULT_PROBA = 0.5 # If all else fails

# Side length (pixels) each face crop is resized to before classification.
IMAGE_SIZE = 300
# Device the classifiers (and per-video tensors) are placed on.
TORCH_DEVICE='cuda:0'

# When > 0, only the first SAMPLE_SIZE videos are processed; 0 processes all of them.
SAMPLE_SIZE=0

# Glob pattern selecting the videos to score.
VIDEO_FILE_GLOB = "/kaggle/input/deepfake-detection-challenge/test_videos/*.mp4"

# Weights of the RetinaFace face detector.
FACE_MODEL_FILE = '/kaggle/input/retinaface/weights/Resnet50_Final.pth'
# Checkpoints of the face classifiers; one classifier is built per entry.
INFERENCE_MODEL_FILES = (
    "/kaggle/input/efnetensembles8/effnet_b3_pretrained_on_30_40_validation_plus_augs_dropout_0.2__size_300-2-0.1661-ckpt.pth",
)


def process_video(video_file, grab_frames = VIDEO_GRAB_FRAMES, resize = VIDEO_MAX_SIZE, face_scale = FACE_SCALE, debug = False):
    """Sample frames from a video, detect the faces in them and score every face.

    Uses the module-level globals ``face_detector``, ``cfg_re50``,
    ``image_transform``, ``face_classifiers`` and ``device``.

    Args:
        video_file: Path of the video to read.
        grab_frames: Number of evenly spaced frame indices to sample.
        resize: Maximum side length of the frames handed to the face detector.
        face_scale: Enlargement factor applied to each square face crop.
        debug: When true, print progress information.

    Returns:
        A tuple ``(filename, num_frames, max_faceid_file, face_records, probas)``:

        * ``filename`` - base name of ``video_file``.
        * ``num_frames`` - number of frames that were actually read.
        * ``max_faceid_file`` - largest number of faces kept in a single frame.
        * ``face_records`` - array with one row per face:
          ``[frameid, faceid, score, *landmarks]``.
        * ``probas`` - array of shape ``(len(face_classifiers), n_faces)`` with
          the sigmoid output of each classifier for each face.

        Both arrays are empty when no frame could be read or no face was found.
    """
    global face_detector
    global cfg_re50
    global image_transform
    global face_classifiers
    global device
    # NOTE(review): `faces` is rebound at module scope by this function. That
    # looks unintentional, but it is kept in case other cells inspect it.
    global faces

    filename = basename(video_file)

    if debug: print(f"Processing {video_file}")

    frames = []

    cap = None
    try:
        cap = cv2.VideoCapture(video_file)
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if frame_count < 1:
            # Container did not report a frame count; fall back to a guess.
            frame_count = 300

        # Builtin int instead of np.int: the alias was removed in NumPy 1.24
        # and the resulting AttributeError used to be swallowed below, which
        # silently produced zero frames for every video.
        get_frameids = set(
            np.linspace(0, frame_count - 1, grab_frames, endpoint=True, dtype=int).tolist()
        )

        parsed_frame = 0

        while True:
            # grab() only advances the stream; decoding happens in retrieve()
            # and is therefore paid for the selected frames only.
            success = cap.grab()
            if not success:
                break

            if parsed_frame in get_frameids:

                success, frame = cap.retrieve()
                if not success or frame is None:
                    break

                # Normalise every frame to three channels.
                if len(frame.shape) == 2:
                    frame = cv2.cvtColor(frame, cv2.COLOR_GRAY2BGR)
                elif len(frame.shape) == 3:
                    channels = frame.shape[-1]
                    if channels == 4:
                        frame = cv2.cvtColor(frame, cv2.COLOR_RGBA2BGR)
                    elif channels > 4:
                        frame = cv2.cvtColor(frame[:,:,0:3], cv2.COLOR_RGB2BGR)

                # [frame, frame index, upsample ratio (filled in below)]
                frames.append([frame, parsed_frame, None])

            parsed_frame += 1
    except Exception as exc:
        # Best effort: keep whatever frames were read, but report the failure
        # instead of hiding it.
        print(f"Failed to read frames from {video_file}: {exc}")
    finally:
        if cap is not None:
            cap.release()

    num_frames = len(frames)

    # Nothing to process
    #
    if num_frames < 1:
        return filename, 0, 0, np.array([]), np.array([])

    # Resize frames before detection
    resized_frames = []

    for i, (frame, frameid, _) in enumerate(frames):
        resized_frame, downsample = resize_img(frame, resize)

        if debug:
            print(f"Resized frame shape is {resized_frame.shape}")

        # Factor that maps detector coordinates back to the original frame.
        # Frames that were not shrunk keep a factor of 1.
        upsample = 1/downsample
        if upsample < 1:
            upsample = 1

        resized_frames.append(resized_frame)
        frames[i][2] = upsample # Update upsample ratio

    # Detect and classify faces
    confidence_threshold = FACE_CONFIDENCE_THRESHOLD
    nms_threshold = FACE_NMS_THRESHOLD

    imgs = np.float32(resized_frames)
    _, im_height, im_width, _ = imgs.shape

    # Per-channel offsets subtracted before the detector is run.
    imgs -= (104, 117, 123)
    imgs = torch.from_numpy(imgs)
    imgs = imgs.permute(0, 3, 1, 2)
    imgs = imgs.to(device)

    # Multipliers that turn the decoded box / landmark values into pixels.
    scale = torch.Tensor([im_width, im_height, im_width, im_height])
    scale = scale.to(device)
    scale1 = torch.Tensor([im_width, im_height, im_width, im_height,
                           im_width, im_height, im_width, im_height,
                           im_width, im_height])

    scale1 = scale1.to(device)

    # Inference only: make sure no autograd graph is recorded.
    with torch.no_grad():
        b_loc, b_conf, b_landms = face_detector(imgs)  # forward pass

    priorbox = PriorBox(cfg_re50, image_size=(im_height, im_width))
    priors = priorbox.forward()
    priors = priors.to(device)
    prior_data = priors.data

    face_records = []
    faces = []
    max_faceid_file = 0

    if debug: print(f"Image shape is {imgs.shape}")

    for i, (frame, frameid, upsample) in enumerate(frames):

        boxes = decode(b_loc[i].data.squeeze(0), prior_data, cfg_re50['variance'])
        boxes = boxes * scale

        scores = b_conf[i].squeeze(0)

        landms = decode_landm(b_landms[i].data.squeeze(0), prior_data, cfg_re50['variance'])
        landms = landms * scale1

        # ignore low scores
        inds = scores[:,1] > confidence_threshold
        boxes = boxes[inds]
        landms = landms[inds]
        scores = scores[inds][:,1]

        keep = nms(boxes, scores, nms_threshold)
        boxes = boxes[keep].cpu().numpy()
        landms = landms[keep].cpu().numpy()
        scores = scores[keep].cpu().numpy()

        # Upsample
        boxes = boxes * upsample
        landms = landms * upsample

        faceid = 0

        for box, land, score in zip(boxes, landms, scores):

            score = round(score, 2)

            w1,h1,w2,h2 = map(int, box)

            face = crop_and_align_face(w1, w2, h1, h2, frame, scale=face_scale)
            if face.size == 0:
                # Degenerate / out-of-frame box: cv2.cvtColor would raise on
                # an empty image, so skip the detection.
                continue
            face = image_transform(cv2.cvtColor(face, cv2.COLOR_BGR2RGB))

            face_record = [frameid, faceid, score, *land]
            face_records.append(face_record)
            faces.append(face)

            faceid += 1

        if faceid > max_faceid_file:
            max_faceid_file = faceid

    del imgs, scale, scale1

    if len(faces) < 1:
        print("Found no faces")

        return filename, num_frames, max_faceid_file, np.array([]), np.array([])

    probas = []
    with torch.set_grad_enabled(False):
        faces = torch.stack(faces).to(device)
        for face_classifier in face_classifiers:
            # squeeze(-1) drops only the trailing output dimension, so a batch
            # with a single face still yields a 1-element vector, not a 0-d value.
            probas.append(torch.sigmoid(face_classifier(faces).squeeze(-1)).cpu().numpy())

    return filename, num_frames, max_faceid_file, np.array(face_records), np.array(probas)

def check_keys(model, pretrained_state_dict):
    """Report how a checkpoint's keys line up with the model's keys.

    Prints the number of model keys missing from the checkpoint, the number
    of checkpoint keys the model does not have, and the number of shared keys.

    Returns:
        True when at least one key is shared.

    Raises:
        AssertionError: If the checkpoint shares no key with the model.
    """
    checkpoint_names = set(pretrained_state_dict)
    model_names = set(model.state_dict())
    shared_names = checkpoint_names & model_names

    print(f'Missing keys:{len(model_names - checkpoint_names)}')
    print(f'Unused checkpoint keys:{len(checkpoint_names - model_names)}')
    print(f'Used keys:{len(shared_names)}')

    assert len(shared_names) > 0, 'load NONE from pretrained checkpoint'
    return True

def remove_prefix(state_dict, prefix):
    """Return a copy of ``state_dict`` with a leading ``prefix`` stripped from its keys.

    Old style checkpoints store every parameter name with a common prefix such
    as 'module.'. Keys that do not start with ``prefix`` are kept unchanged,
    and only the first leading occurrence is removed.
    """
    print(f"remove prefix '{prefix}'")

    def strip(name):
        # For a name that starts with the prefix, partition() splits at
        # position 0, so the third element is everything after the prefix.
        if name.startswith(prefix):
            return name.partition(prefix)[2]
        return name

    stripped = {}
    for name, tensor in state_dict.items():
        stripped[strip(name)] = tensor
    return stripped

def load_model(model, pretrained_path, load_to_cpu):
    """Load checkpoint weights from ``pretrained_path`` into ``model``.

    Args:
        model: Module that receives the weights.
        pretrained_path: Path of the checkpoint file.
        load_to_cpu: When true the storages are left where torch.load puts
            them for CPU loading; otherwise they are moved to the current
            CUDA device.

    Returns:
        The same ``model`` object. Weights are loaded with ``strict=False``,
        so key mismatches are reported by ``check_keys`` rather than raised.
    """
    print(f'Loading pretrained model from {pretrained_path}')

    if load_to_cpu:
        def relocate(storage, loc):
            return storage
    else:
        gpu_index = torch.cuda.current_device()

        def relocate(storage, loc):
            return storage.cuda(gpu_index)

    checkpoint = torch.load(pretrained_path, map_location=relocate)

    # Some checkpoints wrap the weights in a 'state_dict' entry.
    weights = checkpoint['state_dict'] if "state_dict" in checkpoint else checkpoint
    weights = remove_prefix(weights, 'module.')

    check_keys(model, weights)
    model.load_state_dict(weights, strict=False)
    return model

def resize_img(img, max_size):
    """Shrink ``img`` so that its longer side is at most ``max_size`` pixels.

    Images that already fit are returned unchanged (never enlarged).

    Returns:
        A tuple ``(image, downsample)`` where ``downsample`` is
        ``max_size / longer_side``; it is >= 1 when no resizing happened.
    """
    longest_side = max(img.shape[:2])
    downsample = float(max_size) / float(longest_side)

    if not downsample < 1:
        return img, downsample

    shrunk = cv2.resize(
        img,
        None,
        None,
        fx=downsample,
        fy=downsample,
        interpolation=cv2.INTER_LINEAR,
    )
    return shrunk, downsample

def crop_and_align_face(w1, w2, h1, h2, frame, scale=1.1):
    """Cut a square region around a face box out of ``frame``.

    The square is centred on the box ``(w1, h1)-(w2, h2)``, its side is the
    longer box side multiplied by ``scale``, and it is clamped so that it
    stays inside the frame (which may make it smaller).

    Returns:
        A view of ``frame`` covering the square region.
    """
    side = int(max(w2 - w1, h2 - h1) * scale)
    frame_h, frame_w = frame.shape[:2]

    # Top-left corner of the square, kept inside the frame.
    half = side // 2
    left = max(int((w1 + w2) // 2 - half), 0)
    top = max(int((h1 + h2) // 2 - half), 0)

    # Shrink the square if it would run past the right or bottom edge.
    side = min(frame_w - left, frame_h - top, side)

    return frame[top:top + side, left:left + side]

## Main

proba_by_filename = {}

# Preprocessing applied to every RGB face crop before it is classified.
image_transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize(size=(IMAGE_SIZE, IMAGE_SIZE), interpolation=Image.BICUBIC),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
])

# --- Face detector -------------------------------------------------------
# The weights come from FACE_MODEL_FILE.
cfg_re50['pretrain'] = False

face_detector = load_model(RetinaFace(cfg=cfg_re50, phase='test'), FACE_MODEL_FILE, False)
print('Finished loading model!')
cudnn.benchmark = True
face_detector = face_detector.to(torch.device("cuda"))
# Inference only: freeze every parameter and switch to eval mode.
for param in face_detector.parameters():
    param.requires_grad = False
face_detector.eval()

# Device used for the classifiers and for the per-video tensors.
device = torch.device(TORCH_DEVICE)

# --- Face classifiers ----------------------------------------------------
# One classifier per checkpoint: an EfficientNet-B3 backbone loaded from the
# torch.hub cache, followed by a single-output linear head.
face_classifiers = []

for model_file in INFERENCE_MODEL_FILES:
    backbone = torch.hub.load('rwightman/gen-efficientnet-pytorch', 'tf_efficientnet_b3_ns', pretrained=False)
    classifier = nn.Sequential(backbone, nn.Dropout(0.0), nn.Linear(1000, 1))

    checkpoint = torch.load(model_file, map_location={'cuda:0':TORCH_DEVICE})
    classifier.load_state_dict(checkpoint['state_dict'])

    classifier = classifier.to(device)
    classifier.eval()
    face_classifiers.append(classifier)
    print(f"Loaded model from {model_file}")

video_files = glob(VIDEO_FILE_GLOB)

if SAMPLE_SIZE > 0:
    video_files = video_files[0:SAMPLE_SIZE]
    print(f"Sampled {len(video_files)} video files")


print(f"Found {len(video_files)} video files")

###########################
# Process Videos
#
# Runs process_video() on every file and reduces the per-face probabilities
# to one prediction per video, stored in video_res[filename]['prediction'].

video_res = {}

pbar = tqdm(enumerate(video_files), total=len(video_files), dynamic_ncols=True)

for i, video_file in pbar:

    # A single problematic video must not abort the whole run: report it and
    # move on. Videos missing from video_res get DEFAULT_PROBA when the
    # submission is written.
    try:
        filename, num_frames, max_faceid_file, face_records, probabilities = process_video(video_file)
    except Exception as exc:
        print(f"Failed to process {video_file}: {exc}")
        continue

    no_faces = len(probabilities) < 1
    multiple_faces = max_faceid_file > 1

    video_res[filename] = {'fr': face_records,
                           'probabilities': probabilities,
                           'no_faces': no_faces,
                           'num_frames': num_frames,
                           'max_faces': max_faceid_file,
                           'multiple_faces': multiple_faces}

    video_proba = DEFAULT_PROBA

    if no_faces:
        video_proba = NO_FACES_BUT_NOT_TOTAL_FRAMES_IN_VIDEO_PROBA
        if num_frames == VIDEO_GRAB_FRAMES:
            video_proba = NO_FACES_BUT_TOTAL_FRAMES_IN_VIDEO_PROBA
    elif multiple_faces:
        # Reduce each frame to a single score per classifier, then average:
        #   all faces  > 0.5 -> the highest probability of the frame
        #   all faces <= 0.5 -> the lowest probability of the frame
        #   mixed            -> DEFAULT_PROBA
        frame_ids = face_records[:, 0]
        max_probs = []

        for model_probs in probabilities:
            # Group this classifier's face probabilities by frame id.
            probs_by_frame = {}
            for frameid, prob in zip(frame_ids, model_probs):
                probs_by_frame.setdefault(frameid, []).append(prob)

            frame_scores = []
            for frame_probs in probs_by_frame.values():
                p = np.array(frame_probs)
                res = DEFAULT_PROBA
                if (p > 0.5).all():
                    res = np.max(p)
                elif (p <= 0.5).all():
                    res = np.min(p)
                frame_scores.append(res)

            max_probs.append(frame_scores)

        video_proba = np.mean(max_probs)
    else:
        # At most one face per frame: plain mean over all faces and classifiers.
        video_proba = probabilities.mean()

    video_res[filename]['prediction'] = video_proba

###########################
# Submission code
#
# One row per input video; videos without a stored prediction fall back to
# DEFAULT_PROBA.

submission_videos = [basename(video_file) for video_file in video_files]
submission_predictions = [
    video_res[name]['prediction'] if name in video_res else DEFAULT_PROBA
    for name in submission_videos
]

submission_df = pd.DataFrame({"filename": submission_videos, "label": submission_predictions})
submission_df.to_csv("submission.csv", index=False)

print("Submit Results")