## Inference with Single Model

We now have what looks to be a reasonably capable model and we would like to test it against the real test set.

In [1]:
import os
import cv2
import sys
import numpy as np 
import pandas as pd 
import torch

from pathlib import Path
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

from fastai.data_block import get_files
from fastai.vision import Learner, load_learner, ImageList
from fastai.metrics import accuracy

from EasyBlazeFace import EasyBlazeFace
from EasyRetinaFace import EasyRetinaFace
from video_utils import plot_detections, read_frames, bb_intersection_over_union

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


In [5]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Bare expression so the notebook displays the selected device.
device


device(type='cuda')

In [128]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import math
from functools import partial

# Public names of the 3D ResNet definitions in this cell: the ResNet class
# and one factory function per supported depth.
__all__ = [
    'ResNet', 'resnet10', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
    'resnet152', 'resnet200'
]


def conv3x3x3(in_planes, out_planes, stride=1):
    """Return a bias-free 3x3x3 ``nn.Conv3d`` with padding 1.

    Args:
        in_planes: number of input channels.
        out_planes: number of output channels.
        stride: stride forwarded to the convolution (default 1).
    """
    conv_options = dict(kernel_size=3, stride=stride, padding=1, bias=False)
    return nn.Conv3d(in_planes, out_planes, **conv_options)


def downsample_basic_block(x, planes, stride):
    """Parameter-free shortcut: subsample ``x``, then zero-pad its channels.

    Used by ``ResNet._make_layer`` when ``shortcut_type == 'A'``.

    Args:
        x: 5-D tensor; dimension 1 is the channel dimension that gets padded.
        planes: number of channels the result must have. Must be at least
            the channel count of ``x``.
        stride: stride of the size-1 average pool, i.e. the subsampling
            factor applied to the last three dimensions.

    Returns:
        A tensor with ``planes`` channels whose leading channels are the
        subsampled input and whose remaining channels are zeros.
    """
    # kernel_size=1 means nothing is averaged: this only picks every
    # `stride`-th element.
    out = F.avg_pool3d(x, kernel_size=1, stride=stride)

    # new_zeros inherits dtype and device from `out`, so the padding matches
    # the activations on CPU, on any CUDA device and under half precision
    # without a special case for torch.cuda.FloatTensor.
    zero_pads = out.new_zeros(
        out.size(0), planes - out.size(1), out.size(2), out.size(3),
        out.size(4))

    # Concatenate the live tensor rather than `out.data`: going through
    # `.data` detaches the shortcut from autograd, so no gradient would reach
    # earlier layers through this branch.
    return torch.cat([out, zero_pads], dim=1)


class BasicBlock(nn.Module):
    """Residual block built from two 3x3x3 convolutions with batch norm.

    ``expansion`` is 1: the block outputs ``planes`` channels.

    Args:
        inplanes: channels of the incoming tensor.
        planes: channels produced by both convolutions.
        stride: stride of the first convolution.
        downsample: optional callable applied to the input to produce the
            shortcut; when ``None`` the input itself is the shortcut.
    """

    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super().__init__()
        self.conv1 = conv3x3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm3d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3x3(planes, planes)
        self.bn2 = nn.BatchNorm3d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Return ``relu(bn2(conv2(relu(bn1(conv1(x))))) + shortcut)``."""
        shortcut = x if self.downsample is None else self.downsample(x)

        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        out += shortcut
        return self.relu(out)


class Bottleneck(nn.Module):
    """Residual bottleneck block: 1x1x1 -> 3x3x3 -> 1x1x1 convolutions.

    ``expansion`` is 4: the block outputs ``planes * 4`` channels.

    Args:
        inplanes: channels of the incoming tensor.
        planes: channels of the two inner convolutions.
        stride: stride of the middle 3x3x3 convolution.
        downsample: optional callable applied to the input to produce the
            shortcut; when ``None`` the input itself is the shortcut.
    """

    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super().__init__()
        widened = planes * 4

        self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm3d(planes)
        self.conv2 = nn.Conv3d(
            planes, planes, kernel_size=3, stride=stride, padding=1,
            bias=False)
        self.bn2 = nn.BatchNorm3d(planes)
        self.conv3 = nn.Conv3d(planes, widened, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm3d(widened)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Run the three conv/bn stages, add the shortcut, apply ReLU."""
        shortcut = x if self.downsample is None else self.downsample(x)

        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        out += shortcut
        return self.relu(out)


class ResNet(nn.Module):
    """3D-convolutional ResNet with a linear classifier on top.

    Args:
        block: residual block class (e.g. ``BasicBlock`` or ``Bottleneck``).
            It must expose an ``expansion`` attribute and accept
            ``(inplanes, planes, stride, downsample)``.
        layers: four ints, the number of blocks in ``layer1`` .. ``layer4``.
        sample_size: input height/width the network is built for; only used
            to size the final average pool.
        sample_duration: number of frames the network is built for; only
            used to size the final average pool.
        shortcut_type: ``'A'`` uses the parameter-free
            ``downsample_basic_block`` shortcut; any other value uses a
            1x1x1 convolution followed by batch norm.
        num_classes: number of outputs of the final linear layer.

    The average-pool kernel is fixed at construction time, so inputs are
    expected to match ``sample_size`` / ``sample_duration``; other sizes can
    leave more than one position after pooling, which no longer fits ``fc``.
    """

    def __init__(self,
                 block,
                 layers,
                 sample_size,
                 sample_duration,
                 shortcut_type='B',
                 num_classes=400):
        super().__init__()
        # Running channel count, updated by _make_layer as stages are built.
        self.inplanes = 64

        self.sample_duration = sample_duration
        self.conv1 = nn.Conv3d(
            3,
            64,
            kernel_size=7,
            stride=(1, 2, 2),
            padding=(3, 3, 3),
            bias=False)
        self.bn1 = nn.BatchNorm3d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0], shortcut_type)
        self.layer2 = self._make_layer(
            block, 128, layers[1], shortcut_type, stride=2)
        self.layer3 = self._make_layer(
            block, 256, layers[2], shortcut_type, stride=2)
        self.layer4 = self._make_layer(
            block, 512, layers[3], shortcut_type, stride=2)

        # The time axis is strided by the max pool and layers 2-4 (16x in
        # total); height/width are additionally strided by conv1 (32x).
        last_duration = int(math.ceil(sample_duration / 16))
        last_size = int(math.ceil(sample_size / 32))
        self.avgpool = nn.AvgPool3d(
            (last_duration, last_size, last_size), stride=1)
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv3d):
                # kaiming_normal_ initialises the parameter in place.
                nn.init.kaiming_normal_(m.weight, mode='fan_out')
            elif isinstance(m, nn.BatchNorm3d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, shortcut_type, stride=1):
        """Build one stage of ``blocks`` residual blocks.

        Only the first block receives ``stride`` and, when the shape
        changes, a shortcut that adapts the input to the block's output.
        """
        out_planes = planes * block.expansion

        downsample = None
        if stride != 1 or self.inplanes != out_planes:
            if shortcut_type == 'A':
                # A functools.partial, not a Module: it adds no parameters.
                downsample = partial(
                    downsample_basic_block,
                    planes=out_planes,
                    stride=stride)
            else:
                downsample = nn.Sequential(
                    nn.Conv3d(
                        self.inplanes,
                        out_planes,
                        kernel_size=1,
                        stride=stride,
                        bias=False), nn.BatchNorm3d(out_planes))

        layers = [block(self.inplanes, planes, stride, downsample)]
        self.inplanes = out_planes
        layers.extend(block(self.inplanes, planes) for _ in range(1, blocks))

        return nn.Sequential(*layers)

    def forward(self, x):
        """Classify a batch of clips.

        Args:
            x: either a 5-D tensor ``(BATCH, CHANNELS, NUM_FRAMES, HEIGHT,
                WIDTH)`` or a 4-D tensor ``(BATCH, NUM_FRAMES * 3, HEIGHT,
                WIDTH)`` in which the RGB channels of consecutive frames are
                stacked frame after frame.

        Returns:
            Tensor of shape ``(BATCH, num_classes)`` with the raw outputs of
            ``fc`` (no softmax is applied here).
        """
        if x.dim() == 4:
            batch_size, stacked_size, height, width = x.shape
            rgb_channels = 3
            # Derive the frame count from the input instead of assuming 16,
            # so stacked clips of any length can be unpacked.
            num_frames = stacked_size // rgb_channels

            # fastai requires that inputs be of shape:
            # (BATCH, CHANNELS * NUM_FRAMES, HEIGHT, WIDTH)
            # PyTorch's 3D convolution operations require that inputs be of shape:
            # (BATCH, CHANNELS, NUM_FRAMES, HEIGHT, WIDTH)
            x = x.view(batch_size, num_frames, rgb_channels, height,
                       width).permute(0, 2, 1, 3, 4)

            # view/permute never change the device. Move the clip to wherever
            # the weights live: this still lifts CPU input onto a GPU model,
            # but no longer crashes on machines without CUDA.
            x = x.to(self.conv1.weight.device)

        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)

        # Flatten everything but the batch dimension for the classifier.
        x = x.view(x.size(0), -1)
        x = self.fc(x)

        return x


def get_fine_tuning_parameters(model, ft_begin_index):
    """Build optimizer parameter groups that freeze the early layers.

    Args:
        model: module whose parameters are grouped; parameter names are
            matched against ``layer<i>`` and ``fc``.
        ft_begin_index: index of the first ``layer<i>`` to fine-tune. ``0``
            means "train everything".

    Returns:
        ``model.parameters()`` when ``ft_begin_index == 0``. Otherwise a list
        with one dict per parameter: ``{'params': p}`` for parameters of
        ``layer<ft_begin_index>`` .. ``layer4`` and ``fc``, and
        ``{'params': p, 'lr': 0.0}`` for every other parameter.
    """
    if ft_begin_index == 0:
        return model.parameters()

    ft_module_names = ['layer{}'.format(i) for i in range(ft_begin_index, 5)]
    ft_module_names.append('fc')

    parameters = []
    for name, param in model.named_parameters():
        # Substring match, as before: a parameter is fine-tuned when any of
        # the selected module names occurs in its dotted name.
        if any(ft_module in name for ft_module in ft_module_names):
            parameters.append({'params': param})
        else:
            # Frozen via a zero learning rate. The key must be exactly
            # 'params', otherwise the optimizer cannot find the tensor.
            parameters.append({'params': param, 'lr': 0.0})

    return parameters


def resnet10(**kwargs):
    """Construct a ResNet-10: one ``BasicBlock`` in each of the four stages.

    All keyword arguments are forwarded unchanged to ``ResNet``.
    """
    return ResNet(BasicBlock, [1, 1, 1, 1], **kwargs)


def resnet18(**kwargs):
    """Construct a ResNet-18: ``BasicBlock`` stages of 2, 2, 2 and 2 blocks.

    All keyword arguments are forwarded unchanged to ``ResNet``.
    """
    return ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)


def resnet34(**kwargs):
    """Construct a ResNet-34: ``BasicBlock`` stages of 3, 4, 6 and 3 blocks.

    All keyword arguments are forwarded unchanged to ``ResNet``.
    """
    return ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)


def resnet50(**kwargs):
    """Construct a ResNet-50: ``Bottleneck`` stages of 3, 4, 6 and 3 blocks.

    All keyword arguments are forwarded unchanged to ``ResNet``.
    """
    return ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)


def resnet101(**kwargs):
    """Construct a ResNet-101: ``Bottleneck`` stages of 3, 4, 23 and 3 blocks.

    All keyword arguments are forwarded unchanged to ``ResNet``.
    """
    return ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)


def resnet152(**kwargs):
    """Construct a ResNet-152: ``Bottleneck`` stages of 3, 8, 36 and 3 blocks.

    All keyword arguments are forwarded unchanged to ``ResNet``.
    """
    return ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)


def resnet200(**kwargs):
    """Construct a ResNet-200: ``Bottleneck`` stages of 3, 24, 36 and 3 blocks.

    All keyword arguments are forwarded unchanged to ``ResNet``.
    """
    return ResNet(Bottleneck, [3, 24, 36, 3], **kwargs)

In [129]:
# Load our model and weights
sz = 256
sample_duration = 16
model = resnet18(sample_size=sz, sample_duration=sample_duration, num_classes=400, shortcut_type='A')
# Adjust the last layer to our problems classification task
# (two outputs; the input width is read from the layer being replaced).
model.fc = torch.nn.Linear(in_features=model.fc.in_features, out_features=2, bias=True)

# map_location remaps the stored tensors onto the device chosen above, so a
# checkpoint saved from a GPU run also loads on a CPU-only machine.
state_dict = torch.load('../data/16_frames/models/best.pth', map_location=device)
# The checkpoint is a dict; the network weights sit under its 'model' key.
model.load_state_dict(state_dict['model'])

# Inference mode: batch norm uses its running statistics.
model = model.eval()
model = model.to(device)

In [130]:
# Kaggle paths, kept for reference when running inside the competition kernel:
# SOURCE_TEST = Path('../input/deepfake-detection-challenge/test_videos/')
# submission = pd.read_csv('../input/deepfake-detection-challenge/sample_submission.csv')
# Local run: directory of videos to score and the sample submission file.
SOURCE_TEST = Path('../data/cropped_faces/valid_videos/')
submission = pd.read_csv('../data/sample_submission.csv')

In [131]:
def group_bounding_boxes_by_overlap(detections_for_frames, iou_threshold=0.7):
    """
    Split the detections of a video into one track per person.

    Videos can show several people, so a frame can hold several detections.
    Each detection is compared (by IOU) against the most recent detection of
    every existing group; it joins the best-overlapping group, or starts a new
    group when even the best overlap is below `iou_threshold`.

    Returns a list of groups, each a list of detections in the order they
    were encountered.
    """

    groups = []

    for frame in detections_for_frames:
        for candidate in frame:

            # Best match so far; -1 means "nothing compared yet".
            top_iou, top_group = -1, -1

            for group_index, members in enumerate(groups):
                # Only the latest detection of a group is compared.
                overlap = bb_intersection_over_union(candidate, members[-1])
                if overlap > top_iou:
                    top_iou, top_group = overlap, group_index

            if top_iou < iou_threshold:
                # Nothing overlaps enough: this is a new person.
                groups.append([candidate])
            else:
                groups[top_group].append(candidate)

    return groups

In [177]:
def get_largest_bounding_box(detections_for_frames):
    """
    Return the smallest box that encloses every detection of ONE person.

    Each detection is a 5-item sequence (x_min, y_min, x_max, y_max, extra);
    the fifth item is ignored. The result is [x_min, y_min, x_max, y_max].
    At least one detection is required.
    """

    # Start from the first detection and grow the box with every other one.
    left, top, right, bottom, _ = detections_for_frames[0]

    for x_min, y_min, x_max, y_max, _ in detections_for_frames[1:]:
        left, top = min(x_min, left), min(y_min, top)
        right, bottom = max(x_max, right), max(y_max, bottom)

    return [left, top, right, bottom]

In [178]:
def get_faces_from_multiple_frames(frames, detections_for_frames, target_size=256):
    """
    Crop every detected person out of a run of frames.

    Args:
        frames: array indexed as (frame, row, column, channel); it is sliced
            with `frames[:, y0:y1, x0:x1]`, so it must support that indexing.
        detections_for_frames: per-frame collections of detections, as
            accepted by `group_bounding_boxes_by_overlap`.
        target_size: side length of the square output images.

    Returns:
        A list with one uint8 array of shape
        (num_frames, target_size, target_size, 3) per person. Each crop is
        scaled so its longer side is `target_size` and placed in the top-left
        corner; the rest stays 0. Returns [] when there are no detections.
        People whose bounding box has no area inside the frame are skipped.
    """

    # Ignore empty detections. len() is used instead of truthiness so this
    # also works if the entries are arrays.
    detections_for_frames = [x for x in detections_for_frames if len(x) != 0]

    if len(detections_for_frames) == 0:
        return []

    frame_height, frame_width, _ = frames[0].shape

    # Group detections by each person in the video
    grouped_detections = group_bounding_boxes_by_overlap(detections_for_frames)

    all_faces = []

    for detected_group in grouped_detections:
        # One box per person that contains that person's box in every frame.
        x_min, y_min, x_max, y_max = get_largest_bounding_box(detected_group)

        # Make sure dets are within the frame when cropping
        left = int(max(x_min, 0))
        top = int(max(y_min, 0))
        right = int(min(x_max, frame_width))
        bottom = int(min(y_max, frame_height))

        # A box with no area inside the frame cannot be resized (the scale
        # below would divide by zero), and a negative `right`/`bottom` would
        # be read as an index from the end of the axis. Skip such boxes.
        if right <= left or bottom <= top:
            continue

        face_frames = frames[:, top:bottom, left:right]

        # Every crop of this person has the same shape, so the scale that
        # maps the longer side onto target_size is computed once.
        crop_height, crop_width = face_frames.shape[1:3]
        resize = float(target_size) / float(max(crop_height, crop_width))

        # Pre-allocate space for output faces
        output_faces = np.zeros((len(face_frames), target_size, target_size, 3), dtype=np.uint8)

        for i, face in enumerate(face_frames):
            face = cv2.resize(face, None, None, fx=resize, fy=resize, interpolation=cv2.INTER_CUBIC)

            # Place within output_faces (by default 0 where there is no image data)
            height, width, _ = face.shape
            output_faces[i, :height, :width, :] = face

        all_faces.append(output_faces)

    return all_faces

In [179]:
# Per-channel ImageNet mean/std, placed on the inference device.
# NOTE(review): nothing visible reads these two tensors; the only line that
# uses them (in get_predictions_for_video) is commented out.
imagenet_mean = torch.Tensor([0.485, 0.456, 0.406]).to(device)
imagenet_std = torch.Tensor([0.229, 0.224, 0.225]).to(device)

# Kaggle variant with explicit weight/anchor paths, kept for reference:
#easyBlazeFace = EasyBlazeFace(weights='../input/blazeface/blazeface.pth', anchors='../input/blazeface/anchors.npy')
# BlazeFace is tried first; RetinaFace is the fallback when it finds no faces.
easyBlazeFace = EasyBlazeFace()
easyRetinaFace = EasyRetinaFace()

def get_predictions_for_video(path):
    """
    Score one video with the global `model`.

    Faces are detected with BlazeFace; if that yields nothing, RetinaFace is
    tried. Every detected person is run through the model as one clip and the
    softmax probability of class index 0 is averaged over all people.

    Args:
        path: location of the video, passed to `read_frames`.

    Returns:
        The mean class-0 probability, or 0.5 when neither detector finds a
        face. NOTE(review): class 0 is presumably the FAKE class -- confirm
        against the label order used in training.
    """

    frames = read_frames(path)
    detections = easyBlazeFace.get_detections_with_multiple_crops(frames)

    grouped_faces = get_faces_from_multiple_frames(frames, detections)

    # If we cannot find any faces, try using EasyRetinaFace
    if len(grouped_faces) == 0:
        print("Trying RetinaFace for: ", path)
        detections = easyRetinaFace.detect_on_multiple_frames(frames)

        grouped_faces = get_faces_from_multiple_frames(frames, detections)

        if len(grouped_faces) == 0:
            print("NO FACES FOR ", path)
            return 0.5

    all_results = []

    # Pure inference: without no_grad every forward pass would record an
    # autograd graph and hold on to its activations.
    with torch.no_grad():
        for faces in grouped_faces:

            batched_faces = np.stack(faces, axis=0)

            # Create tensor and divide by 255
            clip = torch.from_numpy(batched_faces).float() / 255.

            # N F H W C -> N C F H W
            clip = clip.unsqueeze(0).permute(0, 4, 1, 2, 3).to(device)

            # Normalize with ImageNet stats
            # TODO: are we doing this in .forward()?
            # clip.sub_(imagenet_mean[None, :, None, None]).div_(imagenet_std[None, :, None, None])

            probabilities = torch.softmax(model(clip), dim=-1).cpu().numpy()

            all_results.append(probabilities[:, 0])

    # TODO: Is there are a better way than just averaging across every person in the video?
    return np.mean(all_results)

Loading pretrained model from Pytorch_Retinaface/weights/Resnet50_Final.pth
remove prefix 'module.'
Missing keys:0
Unused checkpoint keys:0
Used keys:456


In [180]:
# Video file name -> averaged class-0 probability (0.5 when no face was found).
preds = {}

for path in tqdm(SOURCE_TEST.ls()):
    results = get_predictions_for_video(path)
    preds[path.name] = results
        

        

HBox(children=(FloatProgress(value=0.0, max=1148.0), HTML(value='')))

Trying RetinaFace for:  ../data/cropped_faces/valid_videos/xhegjwkfaa.mp4
Trying RetinaFace for:  ../data/cropped_faces/valid_videos/ehkjdctavq.mp4
Trying RetinaFace for:  ../data/cropped_faces/valid_videos/rlqbowounu.mp4
Trying RetinaFace for:  ../data/cropped_faces/valid_videos/lkqotnclpd.mp4
Trying RetinaFace for:  ../data/cropped_faces/valid_videos/xkicasophk.mp4
Trying RetinaFace for:  ../data/cropped_faces/valid_videos/nycmyuzpml.mp4
Trying RetinaFace for:  ../data/cropped_faces/valid_videos/iorbtaarte.mp4
Trying RetinaFace for:  ../data/cropped_faces/valid_videos/gxbkcxyfjm.mp4
Trying RetinaFace for:  ../data/cropped_faces/valid_videos/czifmcopho.mp4
Trying RetinaFace for:  ../data/cropped_faces/valid_videos/olxuxttfce.mp4
Trying RetinaFace for:  ../data/cropped_faces/valid_videos/wvbzxaspaa.mp4
Trying RetinaFace for:  ../data/cropped_faces/valid_videos/kwfdyqofzw.mp4
Trying RetinaFace for:  ../data/cropped_faces/valid_videos/nraqgmsnmm.mp4
Trying RetinaFace for:  ../data/croppe

In [None]:
# Save them for analysis
# NOTE: `preds` is a dict, so np.save stores it as a pickled 0-d object array;
# read it back with np.load(..., allow_pickle=True).item().
np.save('retinaface_raw_preds.npy', preds)

In [183]:
from sklearn.metrics import log_loss
from video_utils import load_all_metadata

In [184]:
# Keep a second name for the predictions dict: the scoring loop further down
# reuses `preds` as its loop variable.
raw_preds = preds

In [185]:
# Metadata table for the videos; the scoring cell reads its 'fname',
# 'directory' and 'label' columns.
all_metadata = load_all_metadata()

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




In [186]:
# Bare expression so the notebook renders the metadata table.
all_metadata

Unnamed: 0,fname,label,split,original,directory
0,owxbbpjpch.mp4,FAKE,train,wynotylpnm.mp4,../data/dfdc_train_part_0
1,vpmyeepbep.mp4,REAL,train,,../data/dfdc_train_part_0
2,fzvpbrzssi.mp4,REAL,train,,../data/dfdc_train_part_0
3,htorvhbcae.mp4,FAKE,train,wclvkepakb.mp4,../data/dfdc_train_part_0
4,fckxaqjbxk.mp4,FAKE,train,vpmyeepbep.mp4,../data/dfdc_train_part_0
...,...,...,...,...,...
3129,pdooqxqfrm.mp4,FAKE,train,ikebomnsiq.mp4,../data/dfdc_train_part_49
3130,djjdcnhlma.mp4,FAKE,train,kudvvlgiff.mp4,../data/dfdc_train_part_49
3131,fgmbxfqoze.mp4,REAL,train,,../data/dfdc_train_part_49
3132,cywebjaezn.mp4,REAL,train,,../data/dfdc_train_part_49


In [187]:
# One pair of accumulators (clipped predictions, 0/1 labels) per source folder.
folder_0_avg_preds, folder_0_y_true = [], []
folder_1_avg_preds, folder_1_y_true = [], []
folder_2_avg_preds, folder_2_y_true = [], []

# Source directory -> (labels, predictions) lists that its videos feed into.
folder_buckets = {
    '../data/dfdc_train_part_0': (folder_0_y_true, folder_0_avg_preds),
    '../data/dfdc_train_part_1': (folder_1_y_true, folder_1_avg_preds),
    '../data/dfdc_train_part_2': (folder_2_y_true, folder_2_avg_preds),
}

for path, preds in raw_preds.items():
    # Note that we clip values
    avg = np.mean(preds).clip(0.01, 0.99)

    # First metadata row whose file name matches this video.
    row = all_metadata.loc[all_metadata['fname'] == path].iloc[0]

    # Only the three folders above are expected here.
    if row['directory'] not in folder_buckets:
        raise Exception("Invalid entry")
    y_true, avg_preds = folder_buckets[row['directory']]

    avg_preds.append(avg)
    # Label convention: FAKE -> 1, anything else -> 0.
    y = 1 if row['label'] == 'FAKE' else 0
    y_true.append(y)

print("folder 0", log_loss(folder_0_y_true, folder_0_avg_preds))
print("folder 1", log_loss(folder_1_y_true, folder_1_avg_preds))
print("folder 2", log_loss(folder_2_y_true, folder_2_avg_preds))

# Log loss over all three folders together.
all_true = folder_0_y_true + folder_1_y_true + folder_2_y_true
all_preds = folder_0_avg_preds + folder_1_avg_preds + folder_2_avg_preds
print("all", log_loss(all_true, all_preds))

folder 0 0.39684384101631764
folder 1 0.3635189005380322
folder 2 0.38617420159061233
all 0.3823777159509993
