In [None]:
import os
# sys is only needed to add the custom packages to the import path (see sys.path.append below)
import sys
# Standard-library and third-party packages
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from time import time

import scipy
import cv2
import numpy as np
import pandas as pd
import torch
from tqdm.notebook import tqdm
from PIL import Image
from scipy.special import expit

sys.path.append('/kaggle/input/faceutils/')
sys.path.append('/kaggle/input/efficientnet-pytorch-master/EfficientNet-PyTorch-master/')

from blazeface import BlazeFace
from blazeface import FaceExtractor
from blazeface import VideoReader
from efficientnet_pytorch import EfficientNet
from isplutils.utils import extract_bb

# Device string used for every model and tensor below: the first CUDA GPU when
# torch reports one as available, otherwise the CPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# Network definition

In [None]:
import torch.nn as nn

class FeatureExtractor(nn.Module):
    """
    Base class for networks that expose a feature-extraction step.

    Subclasses are expected to override :meth:`features`. The class also
    provides a default choice of trainable parameters and an input normalizer.
    """

    def features(self, x: torch.Tensor) -> torch.Tensor:
        """Return the features computed from ``x``; must be overridden."""
        raise NotImplementedError

    def get_trainable_parameters(self):
        """Return the parameters to optimize; by default all module parameters."""
        trainable = self.parameters()
        return trainable

    @staticmethod
    def get_normalizer():
        """
        Build the per-channel input normalization transform.

        NOTE(review): ``transforms`` is not imported anywhere in the visible
        source (presumably ``torchvision.transforms`` - confirm). Unless that
        name is defined elsewhere, calling this method raises NameError.
        """
        channel_mean = [0.485, 0.456, 0.406]
        channel_std = [0.229, 0.224, 0.225]
        return transforms.Normalize(mean=channel_mean, std=channel_std)
    
class EfficientNetAutoAtt(EfficientNet):
    """
    EfficientNet backbone with a single-channel attention gate.

    After the block at index ``att_block_idx`` a 1x1 convolution followed by a
    sigmoid produces a one-channel map. ``extract_features`` multiplies the
    activations by that map before running the remaining blocks, while
    ``get_attention`` returns the map itself.

    ``init_att`` must be called before either method: it creates
    ``att_block_idx`` and ``attconv``.
    """

    # Model name -> (index of the block followed by the attention gate,
    #                number of input channels of the 1x1 attention convolution).
    _ATT_CONFIG = {
        'efficientnet-b0': (4, 40),
        'efficientnet-b1': (6, 40),
        'efficientnet-b2': (7, 48),
        'efficientnet-b3': (7, 48),
        'efficientnet-b4': (9, 56),
        'efficientnet-b5': (12, 64),
        'efficientnet-b6': (14, 72),
        'efficientnet-b7': (17, 80),
    }

    def init_att(self, model: str):
        """
        Initialize attention

        :param model: efficientnet-bX, with X in {0, ..., 7}
        :return: None
        :raises ValueError: if ``model`` is not one of the supported names
        """
        if model not in self._ATT_CONFIG:
            raise ValueError('Model not valid: {}'.format(model))

        block_idx, in_channels = self._ATT_CONFIG[model]
        self.att_block_idx = block_idx
        self.attconv = nn.Conv2d(kernel_size=1, in_channels=in_channels, out_channels=1)

    def _block_drop_connect_rate(self, idx: int):
        """Return the drop-connect rate for block ``idx``, scaled by its depth."""
        rate = self._global_params.drop_connect_rate
        if rate:
            # Scale linearly with the position of the block in the network.
            rate *= float(idx) / len(self._blocks)
        return rate

    def get_attention(self, x: torch.Tensor) -> torch.Tensor:
        """
        Compute the attention map for ``x``.

        Runs the stem and the blocks up to ``att_block_idx`` only.

        :param x: input batch
        :return: sigmoid of the attention convolution output, or None if no
            block has index ``att_block_idx``
        """
        # Placeholder
        att = None

        # Stem
        x = self._swish(self._bn0(self._conv_stem(x)))

        # Blocks
        for idx, block in enumerate(self._blocks):
            x = block(x, drop_connect_rate=self._block_drop_connect_rate(idx))
            if idx == self.att_block_idx:
                att = torch.sigmoid(self.attconv(x))
                # Later blocks are not needed to produce the map.
                break

        return att

    def extract_features(self, x: torch.Tensor) -> torch.Tensor:
        """
        Run stem, blocks and head, gating the activations with the attention map.

        :param x: input batch
        :return: output of the convolutional head (before pooling)
        """
        # Stem
        x = self._swish(self._bn0(self._conv_stem(x)))

        # Blocks
        for idx, block in enumerate(self._blocks):
            x = block(x, drop_connect_rate=self._block_drop_connect_rate(idx))
            if idx == self.att_block_idx:
                # Weight the activations by the single-channel attention map.
                x = x * torch.sigmoid(self.attconv(x))

        # Head
        x = self._swish(self._bn1(self._conv_head(x)))

        return x

class EfficientNetGenAutoAtt(FeatureExtractor):
    """
    Single-output classifier on top of an attention-gated EfficientNet.

    The backbone's own ``_fc`` layer is deleted and replaced by ``classifier``,
    a linear layer with one output.
    """

    def __init__(self, model: str):
        """
        :param model: backbone name, passed to ``EfficientNetAutoAtt.from_name``
            and to ``init_att``
        """
        super().__init__()

        self.efficientnet = EfficientNetAutoAtt.from_name(model)
        self.efficientnet.init_att(model)

        head_width = self.efficientnet._conv_head.out_channels
        self.classifier = nn.Linear(head_width, 1)
        # The backbone's own final layer is replaced by `classifier`.
        del self.efficientnet._fc

    def features(self, x: torch.Tensor) -> torch.Tensor:
        """Return the pooled backbone features, flattened from dimension 1 on."""
        backbone = self.efficientnet
        pooled = backbone._avg_pooling(backbone.extract_features(x))
        return pooled.flatten(start_dim=1)

    def forward(self, x):
        """Return the classifier output computed on the dropped-out features."""
        feats = self.efficientnet._dropout(self.features(x))
        return self.classifier(feats)

    def get_attention(self, x: torch.Tensor) -> torch.Tensor:
        """Return the backbone's attention map for ``x``."""
        return self.efficientnet.get_attention(x)
        
class EfficientNetAutoAttB4(EfficientNetGenAutoAtt):
    """Attention-gated classifier built with the 'efficientnet-b4' backbone."""

    def __init__(self):
        super().__init__(model='efficientnet-b4')

class SiameseTuning(FeatureExtractor):
    """
    Trainable classification head on top of a gradient-free feature extractor.

    ``forward`` computes the features under ``torch.no_grad()`` and feeds them to
    ``classifier`` (BatchNorm1d followed by a single-output Linear layer). Only
    the classifier parameters are reported as trainable.
    """

    def __init__(self, feat_ext: FeatureExtractor, num_feat: int):
        """
        :param feat_ext: called with no arguments to build the feature extractor,
            i.e. a class or factory rather than an instance (despite the annotation)
        :param num_feat: size of the feature vector returned by ``features()``
        :raises NotImplementedError: if the built extractor has no ``features``
        """
        super().__init__()
        self.feat_ext = feat_ext()
        provides_features = hasattr(self.feat_ext, 'features')
        if not provides_features:
            raise NotImplementedError('The provided feature extractor needs to provide a features() method')

        head_layers = [
            nn.BatchNorm1d(num_features=num_feat),
            nn.Linear(in_features=num_feat, out_features=1),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def features(self, x):
        """Return the features computed by the wrapped extractor."""
        return self.feat_ext.features(x)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the classifier output; no gradient flows into the extractor."""
        with torch.no_grad():
            feats = self.features(x)
        return self.classifier(feats)

    def get_trainable_parameters(self):
        """Return the classifier parameters only."""
        head = self.classifier
        return head.parameters()
        
class EfficientNetAutoAttB4ST(SiameseTuning):
    """SiameseTuning head over EfficientNetAutoAttB4, with 1792 input features."""

    def __init__(self):
        super().__init__(feat_ext=EfficientNetAutoAttB4, num_feat=1792)

# Initializations

In [None]:
# Parameters

# Model: see dataset description

# Kaggle
input_folder = '/kaggle/input/deepfake-detection-challenge/test_videos/'

# Checkpoint paths follow one template per network family; between folds only
# the split name and the iteration number change.
_st_path_template = (
    '/kaggle/input/efficientnetautoattb4st-folds-weights/'
    'net-EfficientNetAutoAttB4ST_size-224_face-scale_split-{split}-oneface'
    '_subset-original_aug-1.00_trainloss-bce_optim-adam_foldslow/it{iteration}.pth'
)
_b4_path_template = (
    '/kaggle/input/efficientnetautoattb4-folds-weights/'
    'net-EfficientNetAutoAttB4_size-224_face-scale_split-{split}-oneface'
    '_subset-original_aug-1.00_trainloss-bce_optim-adam_ffpp-True_fold/it{iteration}.pth'
)
# (template, split name, iteration) for every checkpoint, in loading order.
_checkpoints = [
    (_st_path_template, 'fourtyfolders', '000350'),
    (_st_path_template, 'fourtyfolders1', '000300'),
    (_st_path_template, 'fourtyfolders2', '000300'),
    (_st_path_template, 'fourtyfolders3', '000400'),
    (_st_path_template, 'fourtyfolders4', '000300'),
    (_b4_path_template, 'fourtyfolders', '003000'),
    (_b4_path_template, 'fourtyfolders1', '026000'),
    (_b4_path_template, 'fourtyfolders2', '017500'),
    (_b4_path_template, 'fourtyfolders3', '009000'),
    (_b4_path_template, 'fourtyfolders4', '009000'),
]
model_paths = [
    template.format(split=split, iteration=iteration)
    for template, split, iteration in _checkpoints
]

# BlazeFace face detector files
blazeface_anchors_path = "/kaggle/input/faceutils/blazeface/anchors.npy"
blazeface_weights_path = "/kaggle/input/faceutils/blazeface/blazeface.pth"

# Number of frames requested from each video, and number of worker threads
# (0 means sequential processing).
frames_per_video = 72
num_workers = 8

submission_path = 'submission.csv'

In [None]:
# Instantiate nets
patch_size = 224

# Marker searched in the checkpoint path -> network class to instantiate.
# Checked in this order; the first marker found wins.
_net_class_by_marker = (
    ('net-EfficientNetAutoAttB4ST_', EfficientNetAutoAttB4ST),
    ('net-EfficientNetAutoAttB4_', EfficientNetAutoAttB4),
)


def _load_net(checkpoint_path):
    """Build the network named in ``checkpoint_path`` and load its weights."""
    for marker, net_class in _net_class_by_marker:
        if marker in checkpoint_path:
            break
    else:
        raise ValueError('Unknown model for: {}'.format(checkpoint_path))
    loaded = net_class().eval().to(device)
    # The checkpoint is read on the CPU; its weights are stored under the 'net' key.
    loaded.load_state_dict(torch.load(checkpoint_path, map_location='cpu')['net'])
    return loaded


nets = []
for model_path in model_paths:
    net = _load_net(model_path)
    nets.append(net)
del net  # Avoid human errors in using net, if possible

# Face detector
facedet = BlazeFace().eval().to(device)
facedet.load_weights(blazeface_weights_path)
facedet.load_anchors(blazeface_anchors_path)

# Video reading + face extraction pipeline
video_reader = VideoReader()


def video_read_fn(x):
    """Read ``frames_per_video`` frames from the video at path ``x``."""
    return video_reader.read_frames(x, num_frames=frames_per_video)


face_extractor = FaceExtractor(video_read_fn, facedet)

In [None]:
# Transformers

import albumentations as A
from albumentations.pytorch import ToTensorV2

patch_size_load = patch_size

# Geometry: resize so the longest side equals the patch size, then pad with a
# constant value of 0 up to a patch_size_load x patch_size_load image.
loading_transformations = [
    A.LongestMaxSize(max_size=patch_size_load, always_apply=True),
    A.PadIfNeeded(
        min_height=patch_size_load,
        min_width=patch_size_load,
        border_mode=cv2.BORDER_CONSTANT,
        value=0,
    ),
]

# Values: per-channel normalization, then conversion to a torch tensor.
final_transformations = [
    A.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
    ToTensorV2(),
]

val_transformer = A.Compose([*loading_transformations, *final_transformations])

In [None]:
# Index files: one row per .mp4 in the input folder, indexed by file name.
video_paths = sorted(Path(input_folder).glob('*.mp4'))
df = pd.DataFrame(
    {'path': [str(video_path) for video_path in video_paths]},
    index=[video_path.name for video_path in video_paths],
)
print('Found {} video files'.format(len(df)))

In [None]:
from threading import Semaphore

# Only one thread at a time may run the forward passes of the networks; the
# other stages of process_video run concurrently across worker threads.
net_lock = Semaphore(1)


def _crop_faces(frames, debug: bool = False, min_score: float = 0.9):
    """
    Crop the faces to classify from the frames returned by the face extractor.

    Frames without detections are skipped. For the others, every detection whose
    score is above ``min_score`` is cropped; when no detection reaches it, only
    the top-scoring one is cropped.

    :param frames: per-frame dicts with 'frame', 'scores' and 'detections' entries
    :param debug: print the indices of the detections kept above ``min_score``
    :param min_score: detection score above which a face is always kept
    :return: ``(faces, faces_per_frame)``: the crops as arrays, in frame order,
        and the number of crops taken from each retained frame
    """
    faces = []
    faces_per_frame = []
    for frame in frames:
        scores = frame['scores']
        if len(scores) == 0:
            # No detection in this frame: it contributes nothing to the video score.
            continue

        frame_im = Image.fromarray(frame['frame'])
        confident = np.array(scores) > min_score
        if np.any(confident):
            idx_keep = np.where(confident)
            if debug:
                print('idx_max {}'.format(idx_keep))
            detections = frame['detections'][idx_keep]
            n_faces = len(idx_keep[0])
        else:
            # Nothing above the threshold: fall back to the single best detection.
            detections = [frame['detections'][np.argmax(scores)]]
            n_faces = 1

        for detection in detections:
            face_im = extract_bb(frame=frame_im,
                                 bb=(detection[1], detection[0], detection[3], detection[2]),
                                 scale='scale',
                                 size=patch_size_load)
            faces.append(np.asarray(face_im))
        faces_per_frame.append(n_faces)

    return faces, faces_per_frame


def _aggregate_scores(S, faces_per_frame):
    """
    Reduce the per-net, per-face network outputs to a single video score.

    For every frame and every net, the maximum output over the faces of the frame
    is taken; if its sigmoid is not above 0.5, the minimum over the faces is used
    instead. The result is averaged over the nets, then over the frames, and
    finally passed through a sigmoid.

    :param S: array of shape (number of nets, number of faces), faces in frame order
    :param faces_per_frame: number of consecutive columns of ``S`` for each frame
    :return: the video score
    """
    score_video = []
    start = 0
    for n_faces in faces_per_frame:
        score_frame = S[:, start:start + n_faces]
        frame_max = score_frame.max(axis=1)
        frame_min = score_frame.min(axis=1)
        per_net = np.where(expit(frame_max) <= 0.5, frame_min, frame_max)
        score_video.append(per_net.mean())
        start += n_faces
    # final video score: average of all the frames
    return expit(np.mean(score_video))


def process_video(name, debug: bool = False):
    """
    Compute the score of one video.

    Faces are extracted from the video frames, transformed with
    ``val_transformer``, evaluated by every network in ``nets`` and the outputs
    are aggregated into one score. Any exception is caught and reported on
    stdout, and the neutral score 0.5 is returned for that video.

    :param name: file name of the video, relative to ``input_folder``
    :param debug: print the time spent in each stage and extra diagnostics
    :return: ``(name, score)``
    """
    path = os.path.join(input_folder, name)
    try:
        if debug:
            t0 = time()

        # Find the faces for N frames in the video.
        frames = face_extractor.process_video(path)

        if debug:
            t1 = time()
            print('Face detection: {:.3f}s'.format(t1 - t0))

        faces_list, faces_per_frame = _crop_faces(frames, debug=debug)

        if debug:
            t2 = time()
            print('Face extraction: {:.3f}s'.format(t2 - t1))
            print('Number of faces: {}'.format(len(faces_list)))

        if not faces_list:
            # Report the actual cause instead of the torch.stack error on an empty list.
            raise ValueError('no face found in any frame')

        faces_tensor = torch.stack([val_transformer(image=face)['image'] for face in faces_list])
        faces_tensor = faces_tensor.to(device)

        if debug:
            t3 = time()
            print('Face transformation: {:.3f}s'.format(t3 - t2))

        # The context manager releases the lock even if a forward pass raises.
        with net_lock, torch.no_grad():
            S = torch.zeros((len(nets), len(faces_list)))
            for idx, net in enumerate(nets):
                S[idx, :] = net(faces_tensor).squeeze()

        if debug:
            t4 = time()
            print('Prediction: {:.3f}s'.format(t4 - t3))

        # aggregation of the face scores
        score = _aggregate_scores(S.numpy(), faces_per_frame)

    except Exception as e:
        # Best effort: a failing video gets the neutral score rather than stopping the run.
        print('Prediction error on video %s: %s' % (name, str(e)))
        score = 0.5

    return name, score

In [None]:
# Debug run on a single video, executed only when exactly 400 videos were indexed
# (presumably the size of the public test set - TODO confirm).
if len(df) == 400:
    # Process one fixed video in debug mode; the random pick is kept commented out.
#     out = process_video(np.random.choice(df.index),debug=True)
    out = process_video('wixbuuzygv.mp4',debug=True)
    print(out)

In [None]:
def predict(video_paths: list):
    """
    Run ``process_video`` on every given video name.

    With ``num_workers == 0`` the videos are processed one after the other;
    otherwise they are distributed over a pool of ``num_workers`` threads.
    A progress bar is shown in both cases.

    :param video_paths: iterable of video names accepted by ``process_video``
    :return: list with the ``process_video`` result of each video, in input order
    """
    names = list(video_paths)

    if num_workers == 0:
        return [process_video(video_name) for video_name in tqdm(names)]

    with ThreadPoolExecutor(max_workers=num_workers) as pool:
        results = pool.map(process_video, names)
        # Consume the results inside the `with` so the progress bar follows the workers.
        return list(tqdm(results, total=len(names)))

In [None]:
# Speed test on a few videos, executed only when exactly 400 videos were indexed.
if len(df) == 400:
    speedtest = True
    # Two videos per worker. num_workers == 0 (sequential mode in predict) still
    # tests two videos, so the averages below never divide by zero.
    test_samples = max(num_workers, 1) * 2
    if speedtest:
        t0 = time()
        predictions = predict(df.index[:test_samples])

        t1 = time()
        elapsed = t1 - t0
        print('Elapsed time for {:d} samples: {:.2f}s'.format(test_samples, elapsed))
        print('Average speed: {:.2f}s/video'.format(elapsed / test_samples))
        print(
            'Forecasted prediction on small test set: {:.0f}min'.format(elapsed / test_samples * len(video_paths) / 60))
        # 4000 is the number of videos assumed here for the full test set.
        print('Forecasted prediction on full test set: {:.1f}h'.format(elapsed / test_samples * 4000 / 3600))

In [None]:
# Predict on all indexed videos; `predictions` is a list of (name, score) pairs
# as returned by process_video.
predictions = predict(df.index)

In [None]:
# Start every video at the neutral score 0.5, then write in the score computed
# for each video.
df['label'] = 0.5
for video_name, video_score in predictions:
    df.loc[video_name, 'label'] = video_score

In [None]:
# Submission table: the label column only, with the index (file names) written
# under the 'filename' header.
df_out = df.loc[:, ['label']]
df_out.index.name = 'filename'

# Save submission
df_out.to_csv(submission_path, index=True)

In [None]:
# Distribution of the predicted labels; the trailing ';' hides the returned axes repr.
df_out.hist(bins=50);

In [None]:
# Quick sanity summary of the submission: row count and label range.
labels = df_out['label']
print('Num records: {}'.format(len(df_out)))
print('Min: {}'.format(labels.min()))
print('Max: {}'.format(labels.max()))