In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.autograd import Variable
from torch.utils.data import Dataset

import torchvision
import torchvision.transforms.functional as t_F
import torchvision.models as models
import torchvision.transforms as transforms

import cv2
import numpy as np
import pandas as pd
import json
import os
import shutil
import copy
import time
import random

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score

# Switch matplotlib's pyplot interface into interactive mode for this session.
plt.ion()

In [21]:
# ---------------------------------------------------------------------------
# Notebook configuration: data locations, image size and training settings.
# ---------------------------------------------------------------------------

# set data path
train_data_path = '/home/ec2-user/SageMaker/pytorch_learning/data/deepfake/train'
val_data_path = '/home/ec2-user/SageMaker/pytorch_learning/data/deepfake/val'
meta_data = 'metadata.json'   # file name of the label file (joined onto a folder by the loaders)
val_meta_data_path = '/home/ec2-user/.fastai/data/deepfake/dfdc_train_part_49'   # folder holding the validation label file
save_model_path = "./"

res_size = 224        # ResNet image size

# training parameters
k = 2             # number of target category
epochs = 30        # training epochs
batch_size = 32
learning_rate = 1e-3
log_interval = 10   # interval for displaying training info


def _count_videos(folder):
    """Return how many entries of `folder` are picked up as videos.

    Uses the same ``endswith('mp4')`` filter as get_X / get_V so the count
    matches the number of samples the datasets will contain. Counting every
    directory entry would also count non-video files such as the metadata
    JSON that lives next to the training videos.
    """
    return sum(1 for name in os.listdir(folder) if name.endswith('mp4'))


# from pytorch tutorial
# Number of samples per phase, used to average per-epoch loss and accuracy.
dataset_sizes = {'train': _count_videos(train_data_path), 'val': _count_videos(val_data_path)}
class_names = [0, 1]

In [11]:
class FrameDataset(Dataset):
    """Dataset that yields one image per video file, preferring a face crop.

    For each video, frames are read in order (at most ``max_num_frames``).
    The first frame in which exactly one face is detected is cropped to that
    face and returned. If a frame contains several faces, or no single-face
    frame is found, the last successfully decoded frame is returned whole.
    """

    # Haar cascade used when no explicit `cascade_path` is given.
    DEFAULT_CASCADE_PATH = '/home/ec2-user/SageMaker/kaggle/input/single-frame/haarcascade_frontalface_default.xml'

    def __init__(self, files, labels, num_frames, transform=None, test=False, cascade_path=None):
        """
        Args:
            files: indexable collection of video file paths.
            labels: indexable collection of labels, aligned with `files`.
            num_frames: stored as `num_frames` / `frame_no`; not read by the
                methods of this class.
            transform: optional callable applied to the PIL image built from
                the selected frame. When None the raw CHW tensor is returned.
            test: when True, `__getitem__` returns the label untouched instead
                of wrapping it in a LongTensor.
            cascade_path: optional path to a Haar cascade XML file; defaults
                to `DEFAULT_CASCADE_PATH`.
        """
        self.files = files
        self.labels  = labels
        self.num_frames = num_frames
        self.max_num_frames = 60   # upper bound on frames read per video
        self.transform = transform
        self.test = test
        self.frame_no = num_frames
        self.face_cascade = cv2.CascadeClassifier(cascade_path or self.DEFAULT_CASCADE_PATH)

    def face_detect(self, frame):
        """Run the cascade on a quarter-size grayscale copy of `frame`.

        Returns the detections from `detectMultiScale`; coordinates refer to
        the down-scaled image, so callers multiply them by 4.
        """
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        # Resize frame of video to 1/4 size for faster face detection processing
        small_frame = cv2.resize(gray, (0, 0), fx=0.25, fy=0.25)
        # Detect the faces
        faces = self.face_cascade.detectMultiScale(small_frame, 1.1, 4)
        return faces

    def __len__(self):
        """Number of video files in the dataset."""
        return len(self.files)

    def _frame_to_sample(self, bgr_image):
        """Convert an HxWxC BGR array into the sample handed to the model.

        The channel axis is reversed to RGB because both `to_pil_image` and
        the ImageNet normalisation statistics assume RGB ordering, while the
        frames decoded here are BGR (see `face_detect`, which converts with
        COLOR_BGR2GRAY). Returns a CHW tensor, or the result of
        `self.transform` when a transform is configured.
        """
        # Copy into a contiguous array: torch.from_numpy cannot wrap the
        # negative stride produced by the reversed channel slice.
        rgb = np.ascontiguousarray(bgr_image[:, :, ::-1])
        # HWC2CHW
        sample = torch.from_numpy(rgb).permute(2, 0, 1)
        if self.transform is not None:
            sample = self.transform(t_F.to_pil_image(sample))
        return sample

    def readVideo(self, videoFile):
        """Return the sample for `videoFile` (single-face crop or whole frame).

        Raises:
            RuntimeError: if not a single frame could be decoded.
        """
        # Open the video file
        cap = cv2.VideoCapture(videoFile)
        last_good_frame = None
        try:
            for _ in range(self.max_num_frames):
                ret, frame = cap.read()
                if not ret:
                    # End of stream or decode failure: stop scanning.
                    break
                last_good_frame = frame
                try:
                    faces = self.face_detect(frame)
                    if len(faces) > 1:
                        # More than one face: fall back to the whole frame.
                        break
                    if len(faces) == 1:
                        # Detection ran at 1/4 scale; map the box back.
                        x, y, w, h = faces[0] * 4
                        return self._frame_to_sample(frame[y: y + h, x: x + w])
                except Exception:
                    # Best effort: a failure on one frame must not abort the
                    # sample, so report it and try the next frame.
                    print("Face detection error")
        finally:
            # Release the capture on every exit path, including errors.
            cap.release()

        if last_good_frame is None:
            raise RuntimeError("Could not read any frame from {}".format(videoFile))
        return self._frame_to_sample(last_good_frame)

    def __getitem__(self, index):
        """Return `(X, y)` for the video at `index`.

        `X` is the output of `readVideo`. `y` is the raw label in test mode,
        otherwise a LongTensor of shape (1,) holding the label.
        """
        file = self.files[index]
        X = self.readVideo(file)
        if self.test:
            y = self.labels[index]
        else:
            y = torch.LongTensor([self.labels[index]])  # (labels) LongTensor are for int64 instead of FloatTensor

        return X, y

In [12]:
def get_X(data_folder, valid=False):
    """List the videos in `data_folder` and, optionally, their labels.

    Args:
        data_folder: directory scanned for entries whose name ends in 'mp4'.
        valid: when True, labels are read from the `meta_data` JSON file
            inside `data_folder`; an entry labelled 'FAKE' maps to 1 and
            anything else to 0.

    Returns:
        (X, y): X is the list of full video paths; y is the matching list of
        integer labels, or an empty list when `valid` is False.
    """
    entries = os.listdir(data_folder)

    labels_by_name = None
    if valid:
        with open(os.path.join(data_folder, meta_data)) as handle:
            labels_by_name = json.load(handle)

    clip_names = [name for name in entries if name.endswith('mp4')]
    X = [os.path.join(data_folder, name) for name in clip_names]
    if valid:
        y = [1 if labels_by_name[name]['label'] == 'FAKE' else 0 for name in clip_names]
    else:
        y = []
    return X, y

In [22]:
def get_V(data_folder, valid=False):
    """List the validation videos in `data_folder` and, optionally, labels.

    Unlike get_X, the label file is read from `val_meta_data_path` rather
    than from `data_folder` itself.

    Args:
        data_folder: directory scanned for entries whose name ends in 'mp4'.
        valid: when True, labels are looked up in the `meta_data` JSON file
            under `val_meta_data_path`; 'FAKE' maps to 1, anything else to 0.

    Returns:
        (X, y): X is the list of full video paths; y is the matching list of
        integer labels, or an empty list when `valid` is False.

    Raises:
        KeyError: if `valid` is True and a video has no entry in the label file.
    """
    X = []
    y = []
    videos = os.listdir(data_folder)

    # Bind label_data on every path: it used to be defined only when
    # valid=True, which made get_V(folder) fail with a NameError.
    label_data = {}
    if valid:
        with open(os.path.join(val_meta_data_path, meta_data)) as json_file:
            label_data = json.load(json_file)

    # The label file may describe more videos than the folder contains; that
    # is harmless because labels are only ever looked up by the names found
    # in the folder, so no pruning of label_data is needed.
    for v in videos:
        if v.endswith('mp4'):
            X.append(os.path.join(data_folder, v))
            if valid:
                y.append(1 if label_data[v]['label'] == 'FAKE' else 0)
    return X, y

In [40]:
# Collect video paths and 0/1 labels for both splits (labels requested via valid=True).
train_x, train_y = get_X(data_folder=train_data_path, valid=True)
val_x, val_y = get_V(data_folder=val_data_path, valid=True)

In [41]:
# Detect devices
use_cuda = torch.cuda.is_available()                   # check if GPU exists
device = torch.device("cuda:0" if use_cuda else "cpu")   # use CPU or GPU

# Data loading parameters.
# batch_size and shuffle apply on every device; previously a CPU-only host got
# an empty dict and the loaders silently fell back to batch_size=1 without
# shuffling. Pinned host memory is only requested when a GPU is present.
params = {'batch_size': batch_size, 'shuffle': True, 'pin_memory': use_cuda}

In [42]:
# Pre-processing applied to every selected frame: resize to the network input
# size (res_size comes from the configuration cell), convert to a float
# tensor, then normalise with the ImageNet channel statistics.
transform = transforms.Compose([transforms.Resize([res_size, res_size]),
                                transforms.ToTensor(),
                                transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])

# Passed through to FrameDataset, which stores it.
num_frames = 60

train_set = FrameDataset(train_x, train_y, num_frames, transform=transform)
val_set = FrameDataset(val_x, val_y, num_frames, transform=transform)

# Validation reuses the loader settings but is never shuffled, so evaluation
# visits the samples in a fixed order.
dataloaders = {
    "train": data.DataLoader(train_set, **params),
    "val": data.DataLoader(val_set, **{**params, 'shuffle': False}),
}

In [59]:
def train_model(model, criterion, optimizer, scheduler, num_epochs=25, loaders=None):
    """Train `model`, validate after every epoch, and keep the best weights.

    Args:
        model: network to optimise; batches are moved to the device of its
            first parameter (CPU if it has none).
        criterion: loss called as ``criterion(outputs, targets)`` with
            `targets` being a 1-D tensor of class indices.
        optimizer: optimiser stepped once per training batch.
        scheduler: learning-rate scheduler stepped once per epoch, after the
            training phase.
        num_epochs: number of train + validation passes.
        loaders: optional dict with 'train' and 'val' iterables of
            ``(inputs, labels)`` batches. Defaults to the module-level
            `dataloaders`.

    Labels may be class indices shaped ``(B,)`` or ``(B, 1)``, or one-hot /
    score rows shaped ``(B, C)`` with C > 1 (reduced with argmax).

    Returns:
        `model`, loaded with the weights from the epoch that achieved the
        highest validation accuracy.
    """
    if loaders is None:
        loaders = dataloaders

    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            is_train = phase == 'train'
            if is_train:
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0
            samples_seen = 0

            # Iterate over data.
            for inputs, labels in loaders[phase]:
                inputs = inputs.to(run_device)
                labels = labels.to(run_device)

                # Reduce labels to a 1-D tensor of class indices. Taking the
                # argmax over a (B, 1) index column would always yield 0, so
                # that layout is flattened instead.
                if labels.dim() > 1 and labels.size(1) > 1:
                    targets = labels.argmax(dim=1)
                else:
                    targets = labels.reshape(-1).long()

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(is_train):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, targets)

                    # backward + optimize only if in training phase
                    if is_train:
                        loss.backward()
                        optimizer.step()

                # statistics
                batch_count = inputs.size(0)
                running_loss += loss.item() * batch_count
                # Compare (B,) with (B,): comparing against (B, 1) labels
                # would broadcast to (B, B) and inflate the count.
                running_corrects += int(torch.sum(preds == targets).item())
                samples_seen += batch_count
            if is_train:
                scheduler.step()

            # Average over the samples actually processed in this phase;
            # guard against an empty loader.
            denominator = max(samples_seen, 1)
            epoch_loss = running_loss / denominator
            epoch_acc = running_corrects / denominator

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

In [60]:
# Start from an ImageNet-pretrained ResNet-18 and replace its final fully
# connected layer with one output per class.
model_ft = models.resnet18(pretrained=True)
num_ftrs = model_ft.fc.in_features
# Head size follows class_names from the configuration cell instead of a
# hard-coded 2, so the two cannot drift apart.
model_ft.fc = nn.Linear(num_ftrs, len(class_names))

model_ft = model_ft.to(device)

criterion = nn.CrossEntropyLoss()

# Observe that all parameters are being optimized.
# The step size comes from the configured learning_rate (1e-3).
optimizer_ft = torch.optim.Adam(model_ft.parameters(), lr=learning_rate)

# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

In [None]:
# Fine-tune the network; the returned model carries the best validation weights.
# NOTE(review): the configuration cell defines epochs = 30, but 25 is passed here - confirm which is intended.
model_ft = train_model(
    model=model_ft,
    criterion=criterion,
    optimizer=optimizer_ft,
    scheduler=exp_lr_scheduler,
    num_epochs=25,
)