# Deep Learning Model for Video Prediction

This notebook evaluates a deep learning model that classifies videos as deepfake or real. It covers setting up the environment, extracting faces from video frames, and aggregating frame-level predictions into a per-video decision.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory. The per-file print below is commented out,
# so the loop only traverses the tree and produces no output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
#         print(os.path.join(dirname, filename))
        pass

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Library Installation

The following libraries are necessary for face recognition and image processing. They provide the tools required to manipulate video and image data effectively.

In [12]:
!pip install face_recognition
import face_recognition

  pid, fd = os.forkpty()


Collecting face_recognition
  Downloading face_recognition-1.3.0-py2.py3-none-any.whl.metadata (21 kB)
Collecting face-recognition-models>=0.3.0 (from face_recognition)
  Downloading face_recognition_models-0.3.0.tar.gz (100.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.1/100.1 MB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting dlib>=19.7 (from face_recognition)
  Downloading dlib-19.24.4.tar.gz (3.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m65.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Downloading face_recognition-1.3.0-py2.py3-none-any.whl (15 kB)
Building wheels for collected packages: dlib, face-recognition-models
  Building wheel for dlib (pyproject.toml) ... [?25l|

ModuleNotFoundError: No module named 'face_recognition'

## Imports and Setup

Here we import necessary libraries and define any initial setup configurations. This includes setting up the environment and importing various modules needed throughout the notebook.

## Seeding for Reproducibility

To ensure that our experiments can be replicated, we set a seed which makes all random operations deterministic.

In [None]:
import os
import torch
import random
import time
import cv2

from torch.utils.data import Dataset, DataLoader
from torchvision.models import resnet50, ResNet50_Weights

from PIL import Image
import glob

import torch.nn.functional as F
import torchvision.models as models
import torch.nn as nn

import torchvision
import torchvision.transforms as transforms
import numpy as np
import matplotlib.pyplot as plt

## Model Definition

This section defines the architecture of our deep learning model. The model is designed to classify images based on multiple labels.

## Model Initialization

Here, we initialize the model with the predefined architecture and prepare it for training or inference.

In [None]:
SEED = 0

def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (CPU and
    all CUDA devices), and configures cuDNN to favour deterministic kernels.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Only affects Python processes started after this point; the hash seed of
    # the already-running interpreter was fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

    torch.manual_seed(seed)
    # Seed every CUDA device rather than only the current one. The call is
    # silently ignored when CUDA is not available.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # cuDNN autotuning can select different kernels from run to run, which
    # undermines the deterministic setting above.
    torch.backends.cudnn.benchmark = False

# Fix all RNG seeds before any model or data object is created.
seed_everything(SEED)

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
is_gpu_available = torch.cuda.is_available()
print(f'Is using GPU: {is_gpu_available}')
device = torch.device('cuda' if is_gpu_available else 'cpu')

## Image Processing

We define the transformations that will be applied to the images for normalization and augmentation. These are critical for preparing the data for processing by our model.

In [None]:
class MultiLabelImageClassifier(nn.Module):
    """ResNet-50 backbone with its final layer replaced by a ``num_classes`` head.

    ``forward`` applies an element-wise sigmoid to the head's output, so every
    class receives an independent score in (0, 1); the scores of one sample do
    not have to sum to 1.

    Args:
        num_classes: Number of output units of the replacement head.
        pretrained: When True (the default, matching the original behaviour)
            the backbone is created with ``ResNet50_Weights.IMAGENET1K_V2``.
            Pass False to create the backbone without pretrained weights,
            e.g. when a full checkpoint is loaded right after construction and
            fetching the ImageNet weights would be wasted work.
    """

    def __init__(self, num_classes, pretrained=True):
        super().__init__()
        # Resolved at call time so the class can be defined before the weights
        # enum is needed.
        weights = ResNet50_Weights.IMAGENET1K_V2 if pretrained else None
        # The attribute name is part of the state_dict keys; keep it stable so
        # existing checkpoints still load.
        self.base_model = models.resnet50(weights=weights)
        num_features = self.base_model.fc.in_features
        self.base_model.fc = nn.Linear(num_features, num_classes)

    def forward(self, x):
        """Return per-class sigmoid scores for the input batch ``x``."""
        return torch.sigmoid(self.base_model(x))

## Face Detection Function

This function detects faces in an image using the pre-trained models from the `face_recognition` library and returns the first detected face as a cropped, resized tensor. If no face is found, it returns `None`.

In [None]:
# Build the two-output classifier and move it to the selected device.
model = MultiLabelImageClassifier(num_classes=2).to(device)
# Count only parameters that take part in gradient updates.
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Total trainable parameters:", total_params)

## Video to Image Conversion

This function reads the leading frames of a video file, extracts the face found in each frame, and stacks the face crops into a single tensor. Each face crop is then treated as a separate input to the model.

In [None]:
# Side length, in pixels, that every face crop is resized to.
IMG_SIZE = 224
# Preprocessing applied to each face crop: resize to IMG_SIZE x IMG_SIZE, then
# convert the PIL image to a tensor.
# NOTE(review): no mean/std normalisation is applied here — confirm this
# matches the preprocessing used when the checkpoint was trained.
transform = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.ToTensor(),
])

## Parameter Settings

Here we set various parameters that will be used in the model and data processing. These include frame count thresholds and other relevant settings for the video processing tasks.

In [None]:
def get_face_img(image):
    """Crop the first detected face from ``image`` and apply ``transform`` to it.

    Args:
        image: Array passed to ``face_recognition.face_locations`` and sliced
            as ``image[rows, cols]``. The caller in this notebook passes an
            RGB frame.

    Returns:
        The result of ``transform`` on the face crop (converted to a PIL
        image), or ``None`` when no face location is reported.
    """
    detections = face_recognition.face_locations(image)

    # Only the first reported face is used; any others are ignored.
    first_box = next(iter(detections), None)
    if first_box is None:
        return None

    # Boxes are unpacked in (top, right, bottom, left) order.
    top, right, bottom, left = first_box
    crop = Image.fromarray(image[top:bottom, left:right])
    return transform(crop)

In [None]:
def video_to_image_list(video_path, frame_count):
    """Read the first ``frame_count`` frames of a video and return their face crops.

    Every decoded frame is converted from BGR to RGB and passed to
    ``get_face_img``. Frames for which it returns ``None`` are skipped, so the
    result may contain fewer than ``frame_count`` images.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        frame_count: Maximum number of frames to decode from the start of the
            video.

    Returns:
        A tensor on ``device`` with the face crops stacked along dimension 0.

    Raises:
        RuntimeError: If no face crop could be extracted, i.e. the video
            yielded no readable frame or no face was detected in any of the
            inspected frames.
    """
    cap = cv2.VideoCapture(video_path)
    image_list = []

    try:
        # Decode at most frame_count frames; stop early at end of stream.
        for _ in range(frame_count):
            ret, frame = cap.read()
            if not ret:
                break

            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            face = get_face_img(frame)
            # Identity check: comparing a tensor with != is an element-wise
            # operation and not a reliable "is missing" test.
            if face is not None:
                image_list.append(face)
    finally:
        # Release the capture even if face detection raises.
        cap.release()

    if not image_list:
        # torch.stack would fail on an empty list with an opaque message;
        # raise the same exception type but name the offending video.
        raise RuntimeError(
            f"No face could be extracted from the first {frame_count} "
            f"frame(s) of {video_path!r}"
        )
    return torch.stack(image_list, dim=0).to(device)

## Evaluation and Testing

We load a previously trained checkpoint and evaluate it on the deepfake and real test videos, reporting per-class and overall accuracy.

In [None]:
FRAME_COUNT = 15
THRESHOLD = (FRAME_COUNT//2)+1

def test_video(video_path_lst, is_df):
    
    video_cnt = 0
    class_target = 0
    if is_df:
        class_target = 1
    
    correct_cnt = 0
    for video_path in video_path_lst:
        if video_cnt % 10 == 0:
            print("Done", video_cnt, "videos")

        img_list = video_to_image_list(video_path, FRAME_COUNT)
        pred_output = model(img_list)
        
        _, pred_class_img = pred_output.topk(1, dim=1)
        
        pred_class_vid = 0
        if sum(pred_class_img) >= THRESHOLD:
            pred_class_vid = 1
        
        video_cnt += 1
        if pred_class_vid == class_target:
            correct_cnt += 1
        
    return [correct_cnt, video_cnt-correct_cnt]

def count_acc(df_stat, real_stat):
    """Print per-class accuracy and return the overall accuracy.

    Both arguments are two-element sequences of video counts as produced by
    ``test_video``. The function sums the index-0 counts and the index-1
    counts across both classes and reports whichever index has the larger
    total; on a tie index 1 is used. The returned value is therefore never
    below 0.5 for non-empty input.

    Args:
        df_stat: Counts for the deepfake videos.
        real_stat: Counts for the real videos.

    Returns:
        The larger of the two pooled counts divided by the total number of
        videos, as a fraction. NaN when no videos were evaluated at all. A
        class with no videos is printed as NaN instead of raising
        ``ZeroDivisionError``.
    """
    def _ratio(part, whole):
        # NaN marks "nothing evaluated" rather than dividing by zero.
        return part / whole if whole else float("nan")

    df_total = sum(df_stat)
    real_total = sum(real_stat)
    total_vid = df_total + real_total

    pred_a = df_stat[0] + real_stat[0]
    pred_b = df_stat[1] + real_stat[1]

    # Report the index whose pooled count is larger (index 1 on a tie).
    idx = 0 if pred_a > pred_b else 1

    print("Accuracy deepfake:", _ratio(df_stat[idx], df_total) * 100, "%")
    print("Accuracy real video:", _ratio(real_stat[idx], real_total) * 100, "%")
    return _ratio((pred_a, pred_b)[idx], total_vid)

In [None]:
# Checkpoint to evaluate.
model_path = "/kaggle/input/deepfake-detection/pytorch/cosine_5e-3/1/epoch39.pth"
# NOTE(review): the constructor requests ImageNet weights for the backbone,
# which load_state_dict below immediately overwrites.
model = MultiLabelImageClassifier(num_classes=2).to(device)
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source; consider weights_only=True if the installed PyTorch
# supports it.
model.load_state_dict(torch.load(model_path, map_location=device))
# Switch to evaluation mode before running predictions.
model.eval()

# Deepfake test videos (target class 1 inside test_video).
print("Predicting deepfake...")
video_list = glob.glob("/kaggle/input/celeb-df-deep-learning/Celeb-DF-v2_separated/Celeb-DF-v2_separated/test/deepfake/*")
df_stat = test_video(video_list, is_df=True)

# Real test videos (target class 0 inside test_video).
print("Predicting real...")
video_list = glob.glob("/kaggle/input/celeb-df-deep-learning/Celeb-DF-v2_separated/Celeb-DF-v2_separated/test/real/*")
real_stat = test_video(video_list, is_df=False)

# Per-class accuracies are printed by count_acc; the overall value is a fraction.
acc = count_acc(df_stat, real_stat)
print("Average accuracy:", acc*100,"%")