# Face Sequence Ensemble - Inference

This notebook combines a series of pretrained video classifiers by averaging the output logits of each model.

The input to each model is the set of faces from every frame in a video. The dataset class used is able to identify distinct face sequences (i.e., from different people in a video), and these are classified separately. The output from each submodel is then the logit from the face sequence with the highest probability of being fake.

## Contents

1. <a href="#1">Install dependencies</a>
1. <a href="#2">Imports</a>
1. <a href="#3">Define ensemble module</a>
1. <a href="#4">Load pretrained face models</a>
1. <a href="#5">Create dataset object</a>
1. <a href="#6">Load pretrained video models</a>
1. <a href="#7">Sense checking</a>
1. <a href="#8">Inference on test set</a>
1. <a href="#9">Save predictions</a>

## <a id='1'>Install dependencies</a>

In [None]:
%%capture
# `%%capture` (which must stay on the first line of the cell) hides the output
# of the commands below.
# Each package is installed from a local wheel / source directory under
# /kaggle/input rather than by package name.
!pip install /kaggle/input/facenet-pytorch-vggface2/facenet_pytorch-2.2.9-py3-none-any.whl
!pip install /kaggle/input/xt-training/pynvml-8.0.4-py3-none-any.whl
!pip install /kaggle/input/xt-training/xt_training-1.4.0-py3-none-any.whl
!pip install /kaggle/input/imageio-ffmpeg/imageio_ffmpeg-0.3.0-py3-none-manylinux2010_x86_64.whl
!pip install /kaggle/input/imutils/imutils-0.5.3/
# Copy the `dfdc` package into the working directory so that the
# `from dfdc...` imports in the next section can be resolved.
!cp -R /kaggle/input/xtract-ai-dfdc/dfdc ./

## <a id='2'>Imports</a>

In [None]:
import os
import glob

import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, Subset
from torch import optim
from tqdm.notebook import tqdm
from matplotlib import pyplot as plt
import albumentations as A
from tqdm.notebook import tqdm
from xt_training import metrics, Runner
from xt_training.runner import Logger

from dfdc.datasets.video_dataset import VideoDataset
from dfdc.models.video_models import FaceSequenceClassifier, FaceClassifier

In [None]:
# Use the first CUDA device when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print('Running on device: {}'.format(device))

## <a id='3'>Define ensemble module</a>

In [None]:
class Ensemble(torch.nn.Module):
    """Run several sub-models on one input and stack their outputs.

    Sub-models are passed as keyword arguments and registered, in the order
    given, under their keyword names.

    Args:
        unpack: Optional callable ``unpack(x, index)`` that derives the input
            of one sub-model from the ensemble input ``x``. When omitted,
            every sub-model receives ``x`` unchanged.
        permute: If true, the stacked output is permuted with ``(1, 0, 2)``,
            i.e. the leading sub-model axis is swapped with the following
            axis. This requires the stacked output to be 3-dimensional.
        mapping: Optional sequence; element ``i`` is the index handed to
            ``unpack`` for the ``i``-th sub-model. When omitted, the
            sub-model's own position is used.
        **kwargs: ``name=module`` pairs; the sub-models to ensemble.
    """

    def __init__(self, unpack=None, permute=False, mapping=None, **kwargs):
        super().__init__()

        def passthrough(x, i):
            # Default unpacker: ignore the index and forward the input as-is.
            return x

        self.unpack = passthrough if unpack is None else unpack
        self.permute = permute
        self.mapping = mapping
        for member_name, member in kwargs.items():
            self.add_module(member_name, member)

    def forward(self, x):
        """Apply every registered sub-model and stack the results.

        Returns the outputs stacked along a new leading axis (one entry per
        sub-model, in registration order), permuted with ``(1, 0, 2)`` when
        ``self.permute`` is set.
        """
        member_outputs = []
        for position, member in enumerate(self._modules.values()):
            # Which slice of the input this sub-model consumes.
            source = position if self.mapping is None else self.mapping[position]
            member_outputs.append(member(self.unpack(x, source)))

        stacked = torch.stack(member_outputs)
        return stacked.permute((1, 0, 2)) if self.permute else stacked

## <a id='4'>Load pretrained face models</a>

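The primary component of each video classifier is a resnet-based face model applied to each face from each frame. Below, each model's final `classifier.fc` layer is replaced with an empty `Sequential` (an identity), so the models return feature vectors rather than logits.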

In [None]:
def _load_face_backbone(checkpoint_path, base_model):
    """Build a ``FaceClassifier``, load its weights and strip the final layer.

    Args:
        checkpoint_path: Path to the saved ``state_dict`` for the model.
        base_model: Backbone name forwarded to ``FaceClassifier``.

    Returns:
        The model with ``classifier.fc`` replaced by an empty ``Sequential``
        (which acts as an identity), so it outputs whatever feeds that layer.
    """
    model = FaceClassifier(pretrained=False, base_model=base_model)
    # map_location='cpu' lets checkpoints that were saved from a GPU load on a
    # CPU-only machine as well; the ensemble is moved to `device` afterwards.
    state_dict = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(state_dict)
    # The weights are loaded first (the checkpoint is loaded into the full
    # model), and only then is the final layer swapped out.
    model.classifier.fc = torch.nn.Sequential()
    return model


face_model1 = _load_face_backbone(
    '/kaggle/input/face-sequence-classifier/face_model_best_alltrain_moreaug_20200317.pt',
    base_model='resnext',
)
face_model2 = _load_face_backbone(
    '/kaggle/input/face-sequence-classifier/face_model_best_lessaug_resnet101_20200321.pt',
    base_model='resnet',
)
face_model3 = _load_face_backbone(
    '/kaggle/input/face-sequence-classifier/face_model_best_20200314.pt',
    base_model='resnext',
)
face_model4 = _load_face_backbone(
    '/kaggle/input/face-sequence-classifier/face_model_best_20200223.pt',
    base_model='resnext',
)

# Every face model receives the same input (unpack=None); their outputs are
# stacked along a new leading axis in the order m1..m4.
face_model = Ensemble(
    unpack=None,
    m1=face_model1,
    m2=face_model2,
    m3=face_model3,
    m4=face_model4,
)
face_model.to(device)
face_model.eval()

def face_model_transform(x):
    """Run the face-model ensemble on ``x`` with gradient tracking disabled.

    ``x`` is moved to ``device`` before the forward pass, and the ensemble's
    output is returned unchanged.
    """
    with torch.no_grad():
        return face_model(x.to(device))

## <a id='5'>Create dataset object</a>

This dataset object is responsible for:

1. Loading images
1. Applying transformations
1. Detecting all faces
1. Constructing sequences of individual faces

In [None]:
# Root directory of the competition data; `path_include` below narrows it to
# one sub-folder.
video_root = '/kaggle/input/deepfake-detection-challenge/'

# DataLoader settings (reused by the sense-check loader later on).
batch_size = 1
num_workers = 0

# Image preprocessing: normalise with mean 0.5 / std 0.5 per channel, then
# resize to 160x160.
test_trans = A.ReplayCompose([
    A.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    A.Resize(160, 160, always_apply=True)
])

# Options for the test-set dataset. `out_transform` is the face-model ensemble,
# so the dataset output presumably already holds face-model outputs rather than
# raw pixels -- confirm against VideoDataset.
test_dataset_options = dict(
    transform=test_trans,
    out_transform=face_model_transform,
    is_test=True,
    sample_frames=-1,
    shuffle=False,
    stride=10,
    n_frames=-1,
    device=device,
    reader='imutils',
    path_include='test_videos/',
)
test_dataset = VideoDataset(video_root, **test_dataset_options)

test_loader = DataLoader(test_dataset, batch_size=batch_size, num_workers=num_workers)

## <a id='6'>Load pretrained video models</a>

Each of these models takes as input the feature vectors returned from the above face models for each face in each frame of a video.

In [None]:
def _load_video_model(checkpoint_path, mode):
    """Build a ``FaceSequenceClassifier`` and load its pretrained weights.

    Args:
        checkpoint_path: Path to the saved ``state_dict`` for the model.
        mode: Variant name forwarded to ``FaceSequenceClassifier``
            (``'linear'`` or ``'conv'`` in this notebook).

    Returns:
        The model with the checkpoint's weights loaded.
    """
    model = FaceSequenceClassifier(mode=mode)
    # map_location='cpu' lets checkpoints that were saved from a GPU load on a
    # CPU-only machine as well; the ensemble is moved to `device` afterwards.
    state_dict = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(state_dict)
    return model


video_model1 = _load_video_model(
    '/kaggle/input/face-sequence-classifier/video_model_best_alltrain_lessaug_20200319.pt',
    mode='linear',
)
video_model2 = _load_video_model(
    '/kaggle/input/face-sequence-classifier/video_model_best_lessaug_resnet101_20200321.pt',
    mode='linear',
)
video_model3 = _load_video_model(
    '/kaggle/input/face-sequence-classifier/video_model_best_ep8_20200314.pt',
    mode='linear',
)
video_model4 = _load_video_model(
    '/kaggle/input/face-sequence-classifier/video_model_best_20200223.pt',
    mode='linear',
)
video_model5 = _load_video_model(
    '/kaggle/input/face-sequence-classifier/video_model_best_alltrain_lessaug_conv_20200319.pt',
    mode='conv',
)

# Per-model multipliers applied to each model's logits before they are
# averaged; the order matches video_model1..video_model5, so the conv model
# counts half as much as each linear model.
scales = [1, 1, 1, 1, 0.5]

def unpack(x, i):
    """Build the input for one video model from the combined input ``x``.

    Index ``i`` is selected along the second axis of ``x[0]``; ``x[1]`` is
    passed through unchanged. The result is a two-element list.
    """
    stacked_inputs, passthrough = x[0], x[1]
    return [stacked_inputs[:, i], passthrough]

# Members in ensemble order; this order is what `mapping` and `scales` index.
video_members = dict(
    m1=video_model1,
    m2=video_model2,
    m3=video_model3,
    m4=video_model4,
    m5=video_model5,
)

# mapping[i] is the index passed to `unpack` for the i-th member: members 1-4
# read indices 0-3 and the fifth member reads index 0 again.
# permute=True moves the member axis of the stacked output to position 1.
video_model = Ensemble(
    unpack=unpack,
    permute=True,
    mapping=[0, 1, 2, 3, 0],
    **video_members
)
video_model.to(device).eval()

## <a id='7'>Sense checking</a>

Produce a prediction to ensure the value matches what I've seen locally.

In [None]:
# Same configuration as the test dataset, but reading from the sample-video
# folder instead of the test folder.
sample_dataset = VideoDataset(
    video_root,
    transform=test_trans,
    out_transform=face_model_transform,
    is_test=True,
    sample_frames=-1,
    shuffle=False,
    stride=10,
    n_frames=-1,
    device=device,
    reader='imutils',
    path_include='train_sample_videos/',
)

# Pin the first sample to one specific video so the numbers printed below can
# be compared with a local run. The second tuple element is presumably the
# label -- confirm against VideoDataset.
sample_dataset.samples[0] = ('/kaggle/input/deepfake-detection-challenge/train_sample_videos/aagfhgtpmv.mp4', 1)

sample_loader = DataLoader(
    sample_dataset,
    batch_size=batch_size,
    num_workers=num_workers
)

# Only the first batch is needed for the check.
x, y = next(iter(sample_loader))
print(x[0].abs().mean())
x = [x_i.to(device) for x_i in x]
# Tensor.to returns a new tensor instead of modifying in place, so the result
# has to be assigned back (the bare call had no effect).
y = y.to(device)

with torch.no_grad():
    y_pred = video_model(x)

# Apply the per-model weights along the model axis (dim 1) in one broadcasted
# multiplication.
scale_weights = torch.as_tensor(scales, dtype=y_pred.dtype, device=y_pred.device)
y_pred = y_pred * scale_weights.view(1, -1, 1)

y_pred

## <a id='8'>Inference on test set</a>

In [None]:
# Wrap the video ensemble in a Runner bound to the selected device.
runner = Runner(model=video_model, device=device)

# With return_preds=True the call yields two values; the first holds the
# predictions and the second is not used in this notebook.
test_outputs = runner(test_loader, 'test', return_preds=True)
y_pred, _ = test_outputs

## <a id='9'>Save predictions</a>

In [None]:
# Weight each model's logits along the model axis (dim 1). The result goes
# into a new tensor rather than back into `y_pred`, so re-running this cell
# does not apply the scales a second time.
scale_weights = torch.as_tensor(scales, dtype=y_pred.dtype, device=y_pred.device)
y_pred_scaled = y_pred * scale_weights.view(1, -1, 1)

# Average the weighted logits over models, convert to probabilities and keep
# class index 1 (presumably the "fake" class -- confirm against training labels).
probabilities = torch.nn.functional.softmax(y_pred_scaled.mean(dim=1), dim=1)
labels = probabilities[:, 1].numpy()
filenames = [os.path.basename(f) for f in test_dataset.video_files]

submission = pd.DataFrame({'filename': filenames, 'label': labels})
# Optional clipping of extreme probabilities, left disabled:
# submission.label = submission.label.clip(0.005, 0.995)
submission.to_csv('submission.csv', index=False)
plt.hist(submission.label, 30)
submission