# resnext and audio
+ This kernel is modified from Inference Kernel Demo.
+ I've added an audio-processing pipeline on top of the original video-processing pipeline.
+ If the combined audio-video model scores better than the video-only model, the audio model is contributing useful signal.

# Inference Kernel Demo

This is the kernel I’ve used for my recent submissions. It takes about 5-6 hours on the test set, using only CPU. 

I’ve provided this kernel because a lot of people have problems making submissions. This method works and has never errored out for me. (Although I haven't tried making a submission using the GPU yet -- so no guarantees there.)

It uses BlazeFace for face extraction (see also [my BlazeFace kernel](https://www.kaggle.com/humananalog/starter-blazeface-pytorch)) and ResNeXt50 as the classifier model.

We take the average prediction over 17 frames from each video. (Why 17? Using more frames makes the kernel slower, but doesn't appear to improve the score much. I used an odd number so we don't always land on even frames.) **Note:** in this modified kernel `frames_per_video` is set to 21 rather than 17.

**Please use this kernel only to learn from...** Included is the checkpoint for a ResNeXt50 model that hasn't really been trained very well yet. I'm sure you can improve on it by training your own model!

You could use the included trained weights to get yourself an easy top-50 score on the leaderboard (as of 24 Jan 2020) but it’s nicer to use it as a starting point for your own work. :-)

In [None]:
# Unpack the static ffmpeg archive from the input dataset into the current directory.
! tar xvf ../input/ffmpeg/ffmpeg-git-amd64-static.tar.xz

In [None]:
import os
import librosa
import subprocess
import shutil
from pathlib import Path
# Audio container that ffmpeg is asked to produce (aac and others also work).
output_format = 'wav'

# Extracted tracks are written to "<format>s/" relative to the current directory.
output_dir = Path(output_format + "s")
output_dir.mkdir(parents=True, exist_ok=True)

# Hard-coded string path of the wav directory; it matches output_dir only
# while output_format is 'wav'.
WAV_PATH = './wavs/'

In [None]:
def create_wav(file):
    """Extract the audio track of a video into ``output_dir`` with ffmpeg.

    The output is named after the video's base name without its extension,
    followed by ``output_format`` (e.g. ``abc.mp4`` -> ``wavs/abc.wav``), so
    it matches the ``filename[:-4] + ".wav"`` path the callers look up.

    The command is passed as an argument list rather than a shell string, so
    paths containing spaces or shell metacharacters are handled safely.

    Args:
        file: Path of the input video.

    Returns:
        The ffmpeg exit status (0 on success).

    Raises:
        OSError: If the ffmpeg executable cannot be started.
    """
    target = output_dir / f"{Path(file).stem}.{output_format}"
    command = [
        "../working/ffmpeg-git-20191209-amd64-static/ffmpeg",
        "-i", str(file),
        "-ab", "192000",  # audio bitrate
        "-ac", "2",       # two audio channels
        "-ar", "44100",   # audio sample rate in Hz
        "-vn",            # drop the video stream
        str(target),
    ]
    return subprocess.call(command)

In [None]:
# test_dir = "/kaggle/input/deepfake-detection-challenge/test_videos/"
# tmp_dir = "./kaggle/working/"
# test_videos = sorted([x for x in os.listdir(test_dir) if x[-4:] == ".mp4"])
# for test_video in test_videos:
#     video_path = os.path.join(test_dir, test_video)
#     create_wav(video_path)
#     name = test_video[:-4] + ".wav"
#     wave, sr = librosa.load(WAV_PATH+f'{name}', mono=True)
#     os.unlink(WAV_PATH+f'{name}')
#     print(sr)

In [None]:
import os, sys, time
import cv2
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F

%matplotlib inline
import matplotlib.pyplot as plt

## Get the test videos

In [None]:
# Directory that holds the test clips.
test_dir = "/kaggle/input/deepfake-detection-challenge/test_videos/"

# Sorted names of every entry in that directory ending in ".mp4".
test_videos = sorted(name for name in os.listdir(test_dir) if name.endswith(".mp4"))
len(test_videos)

## Create helpers

In [None]:
# Report the PyTorch, CUDA and cuDNN versions of the running environment.
for label, version in (
    ("PyTorch version:", torch.__version__),
    ("CUDA version:", torch.version.cuda),
    ("cuDNN version:", torch.backends.cudnn.version()),
):
    print(label, version)

In [None]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    gpu = torch.device("cuda:0")
else:
    gpu = torch.device("cpu")
gpu

In [None]:
import sys

# Put both dataset directories at the front of the module search path.
# The inference-demo directory ends up ahead of the BlazeFace directory.
sys.path[:0] = [
    "/kaggle/input/deepfakes-inference-demo",
    "/kaggle/input/blazeface-pytorch",
]

In [None]:
from blazeface import BlazeFace
# Create the BlazeFace face detector on the selected device, then load its
# weights and anchors from the blazeface-pytorch dataset.
facedet = BlazeFace().to(gpu)
facedet.load_weights("/kaggle/input/blazeface-pytorch/blazeface.pth")
facedet.load_anchors("/kaggle/input/blazeface-pytorch/anchors.npy")
# Presumably the torch.nn.Module API: train(False) puts the detector in
# evaluation mode. The result is bound to `_` so the notebook does not echo it.
_ = facedet.train(False)

In [None]:
from helpers.read_video_1 import VideoReader
from helpers.face_extract_1 import FaceExtractor

# Number of frames requested from each video; also used as the classifier
# batch size in the prediction code.
# NOTE(review): the notebook introduction mentions 17 frames, but the value
# used here is 21.
frames_per_video = 21

video_reader = VideoReader()


def video_read_fn(x):
    """Read ``frames_per_video`` frames from the video ``x``."""
    return video_reader.read_frames(x, num_frames=frames_per_video)


# The extractor is given the frame reader and the face detector.
face_extractor = FaceExtractor(video_read_fn, facedet)

In [None]:
# Side length, in pixels, of the square face images fed to the classifier.
input_size = 224

In [None]:
from torchvision.transforms import Normalize

# Per-channel mean and standard deviation applied to images that have already
# been scaled to [0, 1] (the prediction code divides by 255 first).
# NOTE(review): these look like the usual ImageNet statistics in RGB order —
# confirm against the training code.
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]
normalize_transform = Normalize(mean, std)

In [None]:
def isotropically_resize_image(img, size, resample=cv2.INTER_AREA):
    """Resize ``img`` so its longer side equals ``size``, keeping aspect ratio.

    Args:
        img: Image array whose first two dimensions are (height, width).
        size: Target length, in pixels, of the longer side.
        resample: OpenCV interpolation flag forwarded to ``cv2.resize``.

    Returns:
        The resized image as returned by ``cv2.resize``.
    """
    h, w = img.shape[:2]
    if w > h:
        # Integer division rounds very elongated images down to a 0-pixel
        # side, which cv2.resize rejects; keep at least one pixel.
        new_h = max(1, h * size // w)
        new_w = size
    else:
        new_w = max(1, w * size // h)
        new_h = size

    # cv2.resize expects the target size as (width, height).
    return cv2.resize(img, (new_w, new_h), interpolation=resample)


def make_square_image(img):
    """Pad ``img`` with zero-valued pixels until it is square.

    Padding is added only at the bottom and on the right, so the original
    content stays anchored to the top-left corner.
    """
    height, width = img.shape[:2]
    side = max(height, width)
    pad_bottom = side - height
    pad_right = side - width
    return cv2.copyMakeBorder(
        img, 0, pad_bottom, 0, pad_right, cv2.BORDER_CONSTANT, value=0
    )

def load_audio_mfcc(audio_path, max_len=400):
    """Load an audio file and return a fixed-width MFCC matrix.

    Args:
        audio_path: Path of the audio file, read with ``librosa.load`` as mono.
        max_len: Number of columns (time frames) in the result. Longer inputs
            are truncated; shorter ones are right-padded with zeros.

    Returns:
        A 2-D array with exactly ``max_len`` columns. The number of rows is
        whatever ``librosa.feature.mfcc`` produces with its default settings.
    """
    wave, sr = librosa.load(audio_path, mono=True)
    # Keyword arguments are mandatory in librosa >= 0.10 and are also
    # accepted by earlier releases.
    mfcc = librosa.feature.mfcc(y=wave, sr=sr)
    n_frames = mfcc.shape[1]
    if n_frames >= max_len:
        return mfcc[:, :max_len]
    return np.pad(mfcc, ((0, 0), (0, max_len - n_frames)), mode='constant', constant_values=0)

In [None]:
import torch.nn as nn
import torchvision.models as models

class MyResNeXt(models.resnet.ResNet):
    """torchvision ResNet with Bottleneck blocks, 32 groups of width 4, and a single-output head.

    The ``training`` argument is accepted but not used.
    """

    def __init__(self, training=True):
        resnext_config = dict(
            block=models.resnet.Bottleneck,
            layers=[3, 4, 6, 3],
            groups=32,
            width_per_group=4,
        )
        super().__init__(**resnext_config)
        # Replace the stock classification head with a single output unit.
        self.fc = nn.Linear(2048, 1)

# audio_model

In [None]:
class MASRCNN_activate(nn.Module):
    """Three-branch 1-D convolutional classifier.

    The input goes through three parallel stacks of ``Conv1d`` + ``LeakyReLU``
    layers that differ only in kernel size (3, 5 and 7). Each stack is
    max-pooled over the length dimension, the pooled features are
    concatenated, and a dense block followed by a linear layer produces
    ``num_classes`` outputs.

    Args:
        max_sent_len: Accepted for interface compatibility; not used.
        embedding_dim: Number of input channels.
        num_conv_blocks: Number of conv layers in each branch.
        init_neurons: Output channels of the first conv layer. Layer ``ly``
            (for ``ly >= 1``) outputs ``init_neurons * ly * 2`` channels.
        num_classes: Size of the output layer.

    The dense layer's input width is derived from the configuration rather
    than fixed at ``448 * 3``; for ``init_neurons=32, num_conv_blocks=8`` it is
    still ``448 * 3``. Module names and creation order are unchanged, so
    existing checkpoints load as before.
    """

    def __init__(self, max_sent_len, embedding_dim, num_conv_blocks, init_neurons, num_classes=2):
        super().__init__()
        num_dense_neurons = 50
        kernel_sizes = (3, 5, 7)
        branches = {k: [] for k in kernel_sizes}

        in_channels = embedding_dim
        for ly in range(num_conv_blocks):
            out_channels = init_neurons if ly == 0 else init_neurons * (ly * 2)
            # Build the three branches in lock-step so the layers are
            # created in the same order as before.
            for k in kernel_sizes:
                branches[k].append(
                    nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=k)
                )
                branches[k].append(nn.LeakyReLU(0.2))
            in_channels = out_channels

        self.conv_blocks_3 = nn.Sequential(*branches[3])
        self.conv_blocks_5 = nn.Sequential(*branches[5])
        self.conv_blocks_7 = nn.Sequential(*branches[7])
        # Not used in forward(); kept so the module list is unchanged.
        self.leakyrelu = nn.LeakyReLU(0.2)
        self.maxpool = nn.AdaptiveMaxPool1d(1)
        # After pooling, each branch contributes `in_channels` features.
        self.dense = nn.Sequential(
            nn.Linear(in_channels * len(kernel_sizes), num_dense_neurons),
            nn.BatchNorm1d(num_dense_neurons),
            nn.LeakyReLU(0.2),
        )
        self.fc = nn.Linear(num_dense_neurons, num_classes)

    def forward(self, x):
        """Return outputs of shape (batch, num_classes) for input (batch, embedding_dim, length)."""
        x_3 = self.maxpool(self.conv_blocks_3(x))
        x_5 = self.maxpool(self.conv_blocks_5(x))
        x_7 = self.maxpool(self.conv_blocks_7(x))
        # Concatenate along the pooled (last) dimension and flatten. This
        # ordering fixes the feature layout the dense layer sees.
        x = torch.cat([x_3, x_5, x_7], 2)
        x = x.view(x.size(0), -1)
        x = self.dense(x)
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.fc(x)
        return x

# load weights

In [None]:
def load_weights(model, checkpoint_path, multi_gpu=False):
    """Load the ``'model'`` state dict of a checkpoint file into ``model``.

    Args:
        model: Module that receives the weights.
        checkpoint_path: File readable by ``torch.load`` that contains a
            dict with a ``'model'`` entry.
        multi_gpu: When True, the weights are loaded into ``model.module``
            instead of ``model`` itself.

    Returns:
        The same ``model`` object that was passed in.

    Loading is non-strict, so mismatching keys do not raise. They are printed
    as warnings so a wrong checkpoint does not go unnoticed.
    """
    # Without CUDA, remap tensors that were saved on a GPU onto the CPU.
    map_location = None if torch.cuda.is_available() else torch.device('cpu')
    checkpoint = torch.load(checkpoint_path, map_location=map_location)

    target = model.module if multi_gpu else model
    result = target.load_state_dict(checkpoint['model'], strict=False)

    missing = list(getattr(result, "missing_keys", []))
    unexpected = list(getattr(result, "unexpected_keys", []))
    if missing:
        print("WARNING: checkpoint %s is missing keys: %s" % (checkpoint_path, missing))
    if unexpected:
        print("WARNING: checkpoint %s has unexpected keys: %s" % (checkpoint_path, unexpected))

    del checkpoint
    torch.cuda.empty_cache()
    return model

In [None]:
checkpoint_path = "/kaggle/input/audio-model/ASRCNN_27000.pth"

# Build the audio classifier, restore its weights, then move it to the device.
audio_model = load_weights(
    MASRCNN_activate(max_sent_len=400, embedding_dim=20, num_conv_blocks=8, init_neurons=32),
    checkpoint_path,
).to(gpu)
# Evaluation mode; as the cell's last expression this also echoes the model.
audio_model.eval()

In [None]:
def predict_on_audio(audio_path):
    """Score one audio file with the audio model.

    Args:
        audio_path: Path of the audio file to score.

    Returns:
        The softmax probability of class index 1 as a Python float, or 0.5
        if anything goes wrong (the error is printed, not raised).
    """
    try:
        mfcc = load_audio_mfcc(audio_path)
        # Add a leading batch dimension of size 1.
        batch = torch.tensor(mfcc, device=gpu).float().unsqueeze(0)
        # Inference only: skip autograd bookkeeping, as predict_on_video does.
        with torch.no_grad():
            probs = torch.softmax(audio_model(batch), dim=1)
        return probs[0, 1].item()
    except Exception as e:
        print("Prediction error on audio %s: %s" % (audio_path, str(e)))
    return 0.5

In [None]:
# Create the frame classifier on the device, then copy the saved weights into it.
model = MyResNeXt().to(gpu)
checkpoint = torch.load(
    "/kaggle/input/deepfakes-inference-demo/resnext.pth",
    map_location=gpu,
)
model.load_state_dict(checkpoint)

# The raw state dict is no longer needed once it has been copied in.
del checkpoint

# Evaluation mode; bound to `_` so the notebook does not echo the model.
_ = model.eval()

## Prediction loop

In [None]:
def predict_on_video(video_path, batch_size):
    """Score one video with the frame classifier.

    Faces are extracted from the video, reduced to one face per frame,
    resized and padded to ``input_size`` x ``input_size``, normalised, and
    passed through ``model`` in a single fixed-size batch.

    Args:
        video_path: Path of the video to score.
        batch_size: Fixed number of rows in the model input. At most this
            many faces are used; unused rows stay zero-filled.

    Returns:
        The mean sigmoid output over the faces used, as a Python float, or
        0.5 when no face was found or any error occurred (the error is
        printed, not raised).
    """
    try:
        # Find the faces for N frames in the video.
        faces = face_extractor.process_video(video_path)

        # Only look at one face per frame.
        face_extractor.keep_only_best_face(faces)

        crops = [face for frame_data in faces for face in frame_data["faces"]]
        if len(crops) > batch_size:
            # Warn once with the real face count, then drop the surplus
            # before doing any resizing work on it.
            print("WARNING: have %d faces but batch size is %d" % (len(crops), batch_size))
            crops = crops[:batch_size]

        n = len(crops)
        if n > 0:
            # NOTE: When running on the CPU, the batch size must be fixed
            # or else memory usage will blow up. (Bug in PyTorch?)
            x = np.zeros((batch_size, input_size, input_size, 3), dtype=np.uint8)

            for i, face in enumerate(crops):
                # Resize to the model's required input size. We keep the
                # aspect ratio intact and add zero padding if necessary.
                resized_face = isotropically_resize_image(face, input_size)
                x[i] = make_square_image(resized_face)

            # NOTE: horizontal-flip test-time augmentation was considered
            # here but is not enabled.

            x = torch.tensor(x, device=gpu).float()

            # Channels-last (N, H, W, C) -> channels-first (N, C, H, W).
            x = x.permute((0, 3, 1, 2))

            # Scale to [0, 1] and normalise each image in place.
            for i in range(len(x)):
                x[i] = normalize_transform(x[i] / 255.)

            # Make a prediction, then take the average over the real faces.
            with torch.no_grad():
                y_pred = model(x)
                # Flatten explicitly: squeeze() would collapse a batch of
                # one to a 0-dim tensor, which cannot be sliced below.
                y_pred = torch.sigmoid(y_pred.reshape(-1))
                return y_pred[:n].mean().item()

    except Exception as e:
        print("Prediction error on video %s: %s" % (video_path, str(e)))

    return 0.5

# Multi-threaded (ten times faster?)

In [None]:
from concurrent.futures import ThreadPoolExecutor

def predict_on_video_set(videos, num_workers):
    """Score a list of videos in parallel using a thread pool.

    NOTE: this variant scores the audio track only; the frame-based
    prediction is currently disabled here.

    Args:
        videos: File names inside ``test_dir``.
        num_workers: Maximum number of worker threads.

    Returns:
        A list with one score per video, in the same order as ``videos``.
    """
    def process_file(filename):
        video_path = os.path.join(test_dir, filename)
        audio_path = WAV_PATH + filename[:-4] + ".wav"
        try:
            create_wav(video_path)
        except Exception as e:
            print("create wav error", e)
        try:
            # predict_on_audio returns 0.5 by itself when the file is
            # missing or unreadable, so no existence check is needed here.
            return predict_on_audio(audio_path)
        finally:
            # Remove the temporary audio file if ffmpeg produced one. A
            # missing file must not abort the whole pool.
            if os.path.exists(audio_path):
                os.unlink(audio_path)

    with ThreadPoolExecutor(max_workers=num_workers) as ex:
        return list(ex.map(process_file, videos))

# Single-threaded

In [None]:
def predict_on_video_set_single(videos):
    """Score each video sequentially, combining frame and audio predictions.

    For every file name in ``videos`` the frame score is computed first. If
    the audio track cannot be extracted, the frame score is used alone.
    Otherwise the scores are combined as follows:

    * frame score above 0.5 -> frame score
    * else audio score at least 0.5 -> audio score
    * else the smaller of the two

    Returns:
        A list with one score per video, in input order.
    """
    results = []
    for name in videos:
        path = os.path.join(test_dir, name)
        frame_score = predict_on_video(path, batch_size=frames_per_video)

        # Try to extract the audio track. wav_file stays None when ffmpeg
        # raised or produced no output file.
        wav_file = None
        try:
            create_wav(path)
        except Exception as e:
            print("create wav error", e)
        else:
            candidate = WAV_PATH + name[:-4] + ".wav"
            if os.path.exists(candidate):
                wav_file = candidate

        if wav_file is None:
            results.append(frame_score)
            continue

        audio_score = predict_on_audio(wav_file)
        os.unlink(wav_file)

        if frame_score > 0.5:
            final = frame_score
        elif audio_score >= 0.5:
            final = audio_score
        else:
            final = min(frame_score, audio_score)
        results.append(final)
    return results

## Speed test

The leaderboard submission must finish within 9 hours. With 4000 test videos, that is `9*60*60/4000 = 8.1` seconds per video. So if the average time per video is greater than ~8 seconds, the kernel will be too slow!

In [None]:
# While this is True, only the timing cell runs (on a few videos) and the
# submission cell is skipped. Set it to False to write submission.csv.
speed_test = True  # you have to enable this manually

In [None]:
if speed_test:
    # Time the single-threaded pipeline on up to the first five test videos.
    start_time = time.time()
    speedtest_videos = test_videos[:5]
    # predictions = predict_on_video_set(speedtest_videos, num_workers=4)
    predictions = predict_on_video_set_single(speedtest_videos)
    print(predictions)
    elapsed = time.time() - start_time
    # NOTE: the average divides by the number of videos taken, so an empty
    # test set would raise ZeroDivisionError here.
    print("Elapsed %f sec. Average per video: %f sec." % (elapsed, elapsed / len(speedtest_videos)))

## Make the submission

In [None]:
if not speed_test:
    # Score every test video with the single-threaded pipeline and write one
    # (filename, label) row per video to submission.csv.
    # predictions = predict_on_video_set(test_videos, num_workers=4)
    predictions = predict_on_video_set_single(test_videos)
    submission_df = pd.DataFrame({"filename": test_videos, "label": predictions})
    submission_df.to_csv("submission.csv", index=False)

In [None]:
# Delete the unpacked ffmpeg build and the temporary audio directory.
# NOTE: rmtree raises if either directory does not exist, e.g. when this
# cell is run a second time.
shutil.rmtree("../working/ffmpeg-git-20191209-amd64-static")
shutil.rmtree("../working/wavs")

In [None]:
#submission_df.head()