### Image DeepFake Detection Model Training, Validation, Testing on FF++ Dataset

> **NOTE:**  
> Activate the virtual environment before running:  
> `conda activate new_env`
>
> If the environment does not exist, create it using the following commands:
>
> ```
> conda create -n new_env python=3.12 -y
> conda activate new_env
> ```
>
> Upgrading/installing necessary libraries/packages
>
> ```
> pip install --upgrade pip
> pip install torch torchvision transformers pillow scikit-learn jupyter ipywidgets
> ```
>
> Verifying
>
> ```
> which python
> python -c "import torch; print(torch.__version__)"
> python -c "from torch.amp import GradScaler, autocast; print('Imports successful')"
> python -c "from sklearn.model_selection import train_test_split; print('scikit-learn installed')"
> pip show torch torchvision transformers pillow scikit-learn
> ```
>
> **Select the interpreter from the upper right-hand side (RHS) in Jupyter:**  
> `new_env (Python 3.12.9) miniconda3/envs/new_env/bin/python - Conda Env`


In [1]:
# Import libraries for dataset handling
import os
from sklearn.model_selection import train_test_split

# Define dataset paths in the ghulam container
base_dir = "/home/ghulam/FF++/cropped_face_mtcnn"
train_dir = os.path.join(base_dir, "train")
val_dir = os.path.join(base_dir, "val")
test_dir = os.path.join(base_dir, "test")

# Fixed seed so the train/validation split is identical on every run
SPLIT_SEED = 42


def list_class_images(split_dir, class_name):
    """Collect every file path under ``split_dir/class_name/<subdir>/``.

    Args:
        split_dir: Directory of one data split (e.g. ``train_dir``).
        class_name: Class folder inside the split ("real" or "fake").

    Returns:
        A list of paths, ordered by subdirectory name and then file name so
        that the result does not depend on the filesystem's listing order.

    Raises:
        FileNotFoundError: If ``split_dir/class_name`` does not exist.
    """
    class_dir = os.path.join(split_dir, class_name)
    image_paths = []
    for subdir in sorted(os.listdir(class_dir)):
        subdir_path = os.path.join(class_dir, subdir)
        # Ignore stray files sitting next to the subdirectories; calling
        # os.listdir on them would raise NotADirectoryError.
        if not os.path.isdir(subdir_path):
            continue
        image_paths.extend(
            os.path.join(subdir_path, file_name)
            for file_name in sorted(os.listdir(subdir_path))
        )
    return image_paths


# Collect training images from nested real/fake subdirectories
real_train_images = list_class_images(train_dir, "real")
fake_train_images = list_class_images(train_dir, "fake")

# Combine images and assign labels (0 for real, 1 for fake)
train_image_paths = real_train_images + fake_train_images
train_labels = [0] * len(real_train_images) + [1] * len(fake_train_images)

# Split training data into train and validation (stratified, reproducible).
# NOTE(review): the split is done per image. If each subdirectory holds frames
# of a single video, frames of the same video can end up in both train and
# validation, which would inflate validation accuracy -- consider a group-wise
# split. val_dir is defined above but not used here; confirm which validation
# source is intended.
train_paths, val_paths, train_labels, val_labels = train_test_split(
    train_image_paths,
    train_labels,
    test_size=0.2,
    stratify=train_labels,
    random_state=SPLIT_SEED,
)

# Collect test images
real_test_images = list_class_images(test_dir, "real")
fake_test_images = list_class_images(test_dir, "fake")

test_paths = real_test_images + fake_test_images
test_labels = [0] * len(real_test_images) + [1] * len(fake_test_images)

print(f"Train samples: {len(train_paths)}, Validation samples: {len(val_paths)}, Test samples: {len(test_paths)}")

Train samples: 17276, Validation samples: 4319, Test samples: 4200


In [None]:
# Import libraries for dataset creation, image processing, and PyTorch functionality
from torch.utils.data import Dataset
from PIL import Image
from torchvision import transforms
import torch
import os

# Preprocessing pipeline applied to every FF++ image
# Per-channel ImageNet statistics used for normalization
imagenet_mean = [0.485, 0.456, 0.406]
imagenet_std = [0.229, 0.224, 0.225]

preprocessing_steps = [
    # 1) bring every image to the 224x224 resolution the models are fed with
    transforms.Resize((224, 224)),
    # 2) PIL image -> PyTorch tensor
    transforms.ToTensor(),
    # 3) normalize each channel with the statistics above
    transforms.Normalize(mean=imagenet_mean, std=imagenet_std),
]
transform = transforms.Compose(preprocessing_steps)

# Define custom dataset class for FF++ images with error handling
class DeepfakeDataset(Dataset):
    """Image dataset that pre-validates its files and skips unreadable ones.

    At construction every path is checked: missing files and files that PIL
    cannot open/convert to RGB are reported on stdout, recorded in a log file
    and excluded. Only the remaining ("valid") samples are exposed through
    ``__len__`` / ``__getitem__``.

    Args:
        image_paths: Sequence of image file paths.
        labels: Sequence of integer labels, parallel to ``image_paths``.
        transform: Callable applied to the RGB PIL image in ``__getitem__``.
        log_file: Where invalid images are recorded. Defaults to
            ``DEFAULT_LOG_FILE``. Entries are appended, so the train,
            validation and test datasets can share one log without
            overwriting each other's entries.

    Raises:
        ValueError: If ``image_paths`` and ``labels`` differ in length.
    """

    # Default location of the invalid-image log
    DEFAULT_LOG_FILE = "/home/ghulam/FF++/invalid_images.txt"

    # Initialize with image paths, labels, and transformation pipeline
    def __init__(self, image_paths, labels, transform, log_file=None):
        # A mismatch would otherwise surface as silently dropped samples,
        # because __getitem__ catches the resulting IndexError.
        if len(image_paths) != len(labels):
            raise ValueError(
                f"image_paths ({len(image_paths)}) and labels ({len(labels)}) must have the same length"
            )
        self.image_paths = image_paths
        self.labels = labels
        self.transform = transform
        self.valid_indices = []
        # Specify log file to record invalid images
        if log_file is None:
            log_file = self.DEFAULT_LOG_FILE
        # Append instead of truncating: several datasets write to the same
        # file, and "w" would keep only the entries of the last one created.
        with open(log_file, "a") as log:
            # Check each image for validity
            for idx, path in enumerate(self.image_paths):
                # Skip if the file does not exist
                if not os.path.isfile(path):
                    print(f"Skipping missing file: {path}")
                    log.write(f"{path}: File does not exist\n")
                    continue
                try:
                    # Decode the image fully to verify integrity; the context
                    # manager closes the underlying file even on failure.
                    with Image.open(path) as img:
                        img.convert("RGB")
                except Exception as e:
                    # Log and skip invalid images
                    print(f"Skipping invalid image: {path} ({e})")
                    log.write(f"{path}: {e}\n")
                else:
                    # Store index of valid image
                    self.valid_indices.append(idx)
        # Report the number of valid images loaded
        print(f"Loaded {len(self.valid_indices)} valid images out of {len(self.image_paths)}")
        # Warn if no valid images are found
        if len(self.valid_indices) == 0:
            print(f"Error: No valid images found. Check {log_file} for details.")

    # Return the number of valid images in the dataset
    def __len__(self):
        """Number of images that passed validation."""
        return len(self.valid_indices)

    # Retrieve image and label for a given index
    def __getitem__(self, idx):
        """Return ``(image, label)`` for the idx-th valid sample.

        Returns ``None`` (after printing the error) when loading or
        transforming the image fails, so a collate function can drop it.
        """
        # Map index to valid image index
        actual_idx = self.valid_indices[idx]
        path = self.image_paths[actual_idx]
        try:
            # Load and convert image to RGB; convert() returns an independent
            # copy, so the file can be closed right away.
            with Image.open(path) as img:
                image = img.convert("RGB")
            # Apply transformations (resize, normalize, etc.)
            image = self.transform(image)
            # Convert label to PyTorch tensor with long dtype
            label = torch.tensor(self.labels[actual_idx], dtype=torch.long)
            return image, label
        except Exception as e:
            # Log errors during image loading and return None
            print(f"Error loading image {path}: {e}")
            return None

In [None]:
# Import PyTorch DataLoader for batch processing
from torch.utils.data import DataLoader

# Build one DeepfakeDataset per split from the paths, labels and transform
# prepared in the previous cells
train_dataset = DeepfakeDataset(train_paths, train_labels, transform)
val_dataset = DeepfakeDataset(val_paths, val_labels, transform)
test_dataset = DeepfakeDataset(test_paths, test_labels, transform)

# Stop early if any split ended up without a single valid image; the
# message points at the log file that lists the rejected images
for split_name, split_dataset in (
    ("Train", train_dataset),
    ("Validation", val_dataset),
    ("Test", test_dataset),
):
    if len(split_dataset) == 0:
        raise ValueError(f"{split_name} dataset is empty. Check /home/ghulam/FF++/invalid_images.txt for details.")

# Define a custom collate function to handle None entries in batches
def collate_fn(batch):
    """Collate a batch while dropping samples that failed to load.

    Args:
        batch: List of dataset items; an item is ``None`` when the dataset
            could not load the corresponding image.

    Returns:
        The default-collated ``(images, labels)`` batch built from the
        remaining items, or ``(None, None)`` when no item is left. A pair is
        returned (rather than a bare ``None``) so that loops written as
        ``for images, labels in loader`` can unpack it and then skip the
        batch with ``if images is None``.
    """
    # Filter out None items (failed image loads)
    valid_items = [item for item in batch if item is not None]
    # Nothing usable in this batch
    if not valid_items:
        return None, None
    # Use default collate to combine valid items into tensors
    return torch.utils.data.dataloader.default_collate(valid_items)

# Options shared by the three loaders:
# - 4 images per batch
# - 4 worker processes for parallel loading
# - pinned memory for faster host-to-GPU transfer
# - custom collate function that drops samples which failed to load
shared_loader_options = dict(
    batch_size=4,
    num_workers=4,
    pin_memory=True,
    collate_fn=collate_fn,
)

# Training data is reshuffled every epoch; validation and test keep their order
train_loader = DataLoader(train_dataset, shuffle=True, **shared_loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **shared_loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **shared_loader_options)

# Report how many batches each loader yields
print(f"Train loader batches: {len(train_loader)}, Val loader batches: {len(val_loader)}, Test loader batches: {len(test_loader)}")

Loaded 17276 valid images out of 17276
Loaded 4319 valid images out of 4319
Loaded 4200 valid images out of 4200
Train loader batches: 4319, Val loader batches: 1080, Test loader batches: 1050


In [None]:
# Import libraries for PyTorch, neural networks, functional operations, EfficientNet, and Vision Transformer
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models
from transformers import ViTConfig, ViTForImageClassification

# Define a custom model combining EfficientNet and Vision Transformer (ViT)
class CombinedModel(nn.Module):
    """Two-branch classifier that fuses EfficientNet and ViT features.

    The EfficientNet branch keeps every child module except the last two
    (avgpool and classifier) and pools its feature map to one vector per
    image. The ViT branch contributes the CLS-token vector of its last
    hidden state. Both vectors are concatenated and mapped to 2 logits
    (real/fake) by a single linear layer.

    Args:
        efficientnet: EfficientNet module whose children end with the
            pooling layer and the classifier.
        vit: ViT classification model exposing ``config.hidden_size`` and
            returning ``hidden_states`` when called with
            ``output_hidden_states=True``.
    """

    # Feature width used when it cannot be read from the backbone (EfficientNet-B3)
    DEFAULT_EFF_FEATURES_DIM = 1536

    # Initialize with pre-trained EfficientNet and ViT models
    def __init__(self, efficientnet, vit):
        super().__init__()
        # Extract EfficientNet layers, excluding the last two (avgpool and classifier)
        self.efficientnet = nn.Sequential(*list(efficientnet.children())[:-2])
        # Store ViT model
        self.vit = vit
        # Add adaptive average pooling to reduce EfficientNet features to 1x1
        self.efficientnet_avgpool = nn.AdaptiveAvgPool2d(1)
        # Feature dimensions: read from the backbones instead of hard-coding
        # them, so other EfficientNet variants work as well
        eff_features_dim = self._efficientnet_feature_dim(efficientnet)
        vit_features_dim = vit.config.hidden_size
        # Create a fully connected layer to combine features and output 2 classes (real/fake)
        self.fc = nn.Linear(eff_features_dim + vit_features_dim, 2)

    @staticmethod
    def _efficientnet_feature_dim(efficientnet, default=DEFAULT_EFF_FEATURES_DIM):
        """Return the channel count produced by the EfficientNet feature extractor.

        It equals the ``in_features`` of the first linear layer of the
        backbone's classifier head; ``default`` is returned when no such
        layer is found.
        """
        classifier = getattr(efficientnet, "classifier", None)
        if isinstance(classifier, nn.Linear):
            return classifier.in_features
        if isinstance(classifier, nn.Sequential):
            for layer in classifier:
                if isinstance(layer, nn.Linear):
                    return layer.in_features
        return default

    # Define forward pass for input images
    def forward(self, x):
        """Compute the 2-class logits for a batch of images.

        Args:
            x: Image batch; the indexing below treats it as
                (batch, channels, height, width).

        Returns:
            Tensor of shape (batch, 2) with the real/fake logits.
        """
        # Extract features from EfficientNet
        eff_features = self.efficientnet(x)
        # Apply average pooling to reduce spatial dimensions
        eff_features = self.efficientnet_avgpool(eff_features)
        # Flatten features to one vector per image
        eff_features = torch.flatten(eff_features, 1)
        # Resize input for ViT to ensure 224x224 resolution
        vit_input = F.interpolate(x, size=(224, 224), mode="bilinear", align_corners=False)
        # Pass through ViT, retrieving hidden states
        vit_outputs = self.vit(vit_input, output_hidden_states=True, return_dict=True)
        # Extract CLS token features from the last hidden state
        # NOTE(review): presumably hidden_states[-1] is taken before the ViT's
        # final LayerNorm -- confirm against the transformers documentation.
        vit_features = vit_outputs.hidden_states[-1][:, 0]
        # Concatenate EfficientNet and ViT features
        combined_features = torch.cat((eff_features, vit_features), dim=1)
        # Pass combined features through fully connected layer for classification
        return self.fc(combined_features)

# Select the training device: prefer the second GPU (cuda:1), fall back to
# cuda:0 on single-GPU hosts and to the CPU when CUDA is unavailable, so the
# notebook does not crash on machines with fewer than two GPUs
if torch.cuda.is_available():
    device = torch.device("cuda:1" if torch.cuda.device_count() > 1 else "cuda:0")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Load pre-trained EfficientNet-B3 model with ImageNet weights
efficientnet = models.efficientnet_b3(weights="EfficientNet_B3_Weights.IMAGENET1K_V1")
# Modify the classifier to output 2 classes (real/fake)
# (CombinedModel drops the classifier, so this head is not part of the combined forward pass)
efficientnet.classifier[1] = nn.Linear(efficientnet.classifier[1].in_features, 2)
# Move EfficientNet model to the selected device
efficientnet.to(device)

# Load configuration for Vision Transformer (ViT) with 2 output labels
config = ViTConfig.from_pretrained("google/vit-base-patch16-224")
config.num_labels = 2
# Initialize ViT model with pre-trained weights, ignoring mismatched classifier sizes
vit = ViTForImageClassification.from_pretrained(
    "google/vit-base-patch16-224",
    config=config,
    ignore_mismatched_sizes=True
)
# Replace ViT classifier with a new layer for 2 classes
# (CombinedModel reads the hidden states, not the logits of this head)
vit.classifier = nn.Linear(vit.config.hidden_size, 2)
# Move ViT model to the selected device
vit.to(device)

# Create an instance of the combined model with EfficientNet and ViT
combined_model = CombinedModel(efficientnet, vit)
# Move the combined model to the selected device
combined_model.to(device)

# Define the loss function (cross-entropy for classification)
criterion = nn.CrossEntropyLoss()
# Define the Adam optimizer with a learning rate of 1e-4 for model parameters
optimizer = torch.optim.Adam(combined_model.parameters(), lr=1e-4)

# Confirm that the model is initialized and ready for training on a single GPU
print("CombinedModel initialized for single-GPU training.")

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


CombinedModel initialized for single-GPU training.


In [None]:
# Import PyTorch AMP modules for mixed precision training
from torch.amp import GradScaler, autocast

# Define a function to train the combined model with early stopping
def train_combined_model(model, train_loader, val_loader, criterion, optimizer, epochs, device, patience=2):
    # Move the model to the specified device (GPU)
    model.to(device)
    # Initialize gradient scaler for mixed precision training on CUDA
    scaler = GradScaler('cuda')
    # Initialize best validation loss to infinity
    best_val_loss = float('inf')
    # Counter for epochs without improvement
    epochs_no_improve = 0
    # Variable to store the best model state
    best_model_state = None

    # Loop over the specified number of epochs
    for epoch in range(epochs):
        # Set model to training mode (enables dropout and batch normalization updates)
        model.train()
        # Initialize metrics for training
        train_loss, train_correct, train_total = 0.0, 0, 0
        # Iterate over batches in the training DataLoader
        for images, labels in train_loader:
            # Skip batches with None (invalid images)
            if images is None:
                continue
            # Move images and labels to the specified device
            images, labels = images.to(device), labels.to(device)
            # Clear gradients from previous iteration
            optimizer.zero_grad()
            # Enable mixed precision for forward pass
            with autocast('cuda'):
                # Get model predictions
                outputs = model(images)
                # Compute loss using the provided criterion
                loss = criterion(outputs, labels)
            # Scale loss and perform backpropagation
            scaler.scale(loss).backward()
            # Update model parameters using scaled gradients
            scaler.step(optimizer)
            # Update the scaler for the next iteration
            scaler.update()
            # Accumulate training loss (weighted by batch size)
            train_loss += loss.item() * images.size(0)
            # Count correct predictions
            train_correct += (outputs.argmax(1) == labels).sum().item()
            # Track total samples processed
            train_total += labels.size(0)
        # Compute average training loss
        train_loss /= train_total
        # Compute training accuracy
        train_accuracy = train_correct / train_total

        # Set model to evaluation mode (disables dropout and batch normalization updates)
        model.eval()
        # Initialize metrics for validation
        val_loss, val_correct, val_total = 0.0, 0, 0
        # Disable gradient computation for validation
        with torch.no_grad():
            # Iterate over batches in the validation DataLoader
            for images, labels in val_loader:
                # Skip batches with None (invalid images)
                if images is None:
                    continue
                # Move images and labels to the specified device
                images, labels = images.to(device), labels.to(device)
                # Enable mixed precision for forward pass
                with autocast('cuda'):
                    # Get model predictions
                    outputs = model(images)
                    # Compute loss
                    loss = criterion(outputs, labels)
                # Accumulate validation loss (weighted by batch size)
                val_loss += loss.item() * images.size(0)
                # Count correct predictions
                val_correct += (outputs.argmax(1) == labels).sum().item()
                # Track total samples processed
                val_total += labels.size(0)
        # Compute average validation loss
        val_loss /= val_total
        # Compute validation accuracy
        val_accuracy = val_correct / val_total

        # Print training and validation metrics for the current epoch
        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Train Acc: {train_accuracy:.2%}, "
              f"Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.2%}")

        # Check if validation loss improved
        if val_loss < best_val_loss:
            # Update best validation loss
            best_val_loss = val_loss
            # Save the current model state
            best_model_state = model.state_dict()
            # Reset counter for epochs without improvement
            epochs_no_improve = 0
        else:
            # Increment counter for epochs without improvement
            epochs_no_improve += 1
            # Check if early stopping criterion is met
            if epochs_no_improve >= patience:
                print(f"Early stopping at epoch {epoch+1}")
                # Restore the best model state
                model.load_state_dict(best_model_state)
                break
    # Return the trained model
    return model

# Confirm that the training function with early stopping is defined
print("Training function with early stopping defined.")

Training function with early stopping defined.


In [None]:
# Import PyTorch for GPU memory management
import torch

# Clear GPU memory to free up resources before training
torch.cuda.empty_cache()

# Train the CombinedModel for 3 epochs
# - Pass the model, data loaders, loss function, optimizer, number of epochs, and device
combined_model = train_combined_model(
    model=combined_model,  # Pre-initialized CombinedModel (EfficientNet + ViT)
    train_loader=train_loader,  # DataLoader for training dataset
    val_loader=val_loader,  # DataLoader for validation dataset
    criterion=criterion,  # Cross-entropy loss function
    optimizer=optimizer,  # Adam optimizer with learning rate
    epochs=3,  # Train for 3 epochs
    device=device  # Device selected in the model-setup cell
)

# Check GPU memory usage after training
# - Query the device that was actually used for training; without an argument
#   mem_get_info() reports the current default CUDA device instead
if device.type == "cuda":
    free, total = torch.cuda.mem_get_info(device)
    # Convert free memory to gigabytes and print with 2 decimal places
    print(f"Free GPU memory: {free / 1024**3:.2f} GB")
    # Convert total memory to gigabytes and print with 2 decimal places
    print(f"Total GPU memory: {total / 1024**3:.2f} GB")

# Confirm that training has completed
print("Training completed.")

  scaler = GradScaler()


  with autocast():
  with autocast():


Epoch 1/3, Train Loss: 0.1216, Train Acc: 95.58%, Val Loss: 0.0174, Val Acc: 99.40%
Epoch 2/3, Train Loss: 0.0436, Train Acc: 98.52%, Val Loss: 0.0159, Val Acc: 99.47%
Epoch 3/3, Train Loss: 0.0324, Train Acc: 99.09%, Val Loss: 0.0197, Val Acc: 99.44%
Free GPU memory: 9.92 GB
Total GPU memory: 14.58 GB
Training completed.


In [7]:
# Persist the weights obtained after the first training run
checkpoint_path = "/home/ghulam/FF++/combined_model_epoch3.pth"
torch.save(combined_model.state_dict(), checkpoint_path)
print(f"Model saved to {checkpoint_path}")

Model saved to /home/ghulam/FF++/combined_model_epoch3.pth


In [None]:
# Define a function to evaluate the model on the test dataset
def evaluate_model(model, test_loader, criterion, device):
    """Compute average loss and accuracy of ``model`` over ``test_loader``.

    Args:
        model: Module mapping an image batch to class logits; expected to
            already live on ``device``.
        test_loader: Iterable of ``(images, labels)`` batches. A batch that
            is ``None`` or whose ``images`` is ``None`` is skipped.
        criterion: Loss called as ``criterion(outputs, labels)``.
        device: Device (``torch.device`` or string) the batches are moved
            to. Mixed precision is only enabled when it is a CUDA device.

    Returns:
        ``(test_loss, test_accuracy)``: sample-weighted mean loss and the
        fraction of correct predictions. Both are NaN when no sample was
        evaluated.
    """
    device = torch.device(device)
    # Mixed precision only on CUDA; full precision elsewhere
    use_amp = device.type == "cuda"
    amp_device = "cuda" if use_amp else "cpu"
    # Set the model to evaluation mode (disables dropout and batch normalization updates)
    model.eval()
    # Initialize variables to track loss, correct predictions, and total samples
    test_loss, test_correct, test_total = 0.0, 0, 0
    # Disable gradient computation for efficiency during evaluation
    with torch.no_grad():
        # Iterate over batches in the test DataLoader
        for batch in test_loader:
            # Skip batches in which every image failed to load. Check before
            # unpacking: unpacking a bare None would raise TypeError.
            if batch is None:
                continue
            images, labels = batch
            if images is None:
                continue
            # Move images and labels to the specified device
            images, labels = images.to(device), labels.to(device)
            # Forward pass (mixed precision on CUDA)
            with autocast(amp_device, enabled=use_amp):
                outputs = model(images)
                loss = criterion(outputs, labels)
            # Accumulate test loss (weighted by batch size)
            test_loss += loss.item() * images.size(0)
            # Count correct predictions by comparing predicted and true labels
            test_correct += (outputs.argmax(1) == labels).sum().item()
            # Track total number of samples processed
            test_total += labels.size(0)
    if test_total:
        # Compute average test loss and accuracy
        test_loss /= test_total
        test_accuracy = test_correct / test_total
    else:
        # Nothing was evaluated; avoid a ZeroDivisionError
        test_loss = test_accuracy = float("nan")
    # Print formatted test loss and accuracy
    print(f"Test Loss: {test_loss:.4f}, Test Acc: {test_accuracy:.2%}")
    # Return computed metrics
    return test_loss, test_accuracy

# Run the evaluation on the FF++ test split with the trained model,
# the test DataLoader, the loss function and the training device
test_loss, test_accuracy = evaluate_model(
    model=combined_model,
    test_loader=test_loader,
    criterion=criterion,
    device=device,
)

Test Loss: 0.1483, Test Acc: 96.33%


In [12]:
# Train with 5 epochs
import torch
# Free cached GPU memory before the next run
torch.cuda.empty_cache()
# Continue training the already trained combined_model (same optimizer) for
# up to 5 more epochs -- these epochs come on top of the previous run
combined_model = train_combined_model(
    model=combined_model,
    train_loader=train_loader,
    val_loader=val_loader,
    criterion=criterion,
    optimizer=optimizer,
    epochs=5,
    device=device,
    patience=2
)
# Report memory of the device that was actually used for training; without an
# argument mem_get_info() reports the current default CUDA device instead
if device.type == "cuda":
    free, total = torch.cuda.mem_get_info(device)
    print(f"Free GPU memory: {free / 1024**3:.2f} GB")
    print(f"Total GPU memory: {total / 1024**3:.2f} GB")
print("Training completed.")

Epoch 1/5, Train Loss: 0.0201, Train Acc: 99.36%, Val Loss: 0.0147, Val Acc: 99.58%
Epoch 2/5, Train Loss: 0.0202, Train Acc: 99.41%, Val Loss: 0.0065, Val Acc: 99.84%
Epoch 3/5, Train Loss: 0.0131, Train Acc: 99.65%, Val Loss: 0.0176, Val Acc: 99.54%
Epoch 4/5, Train Loss: 0.0123, Train Acc: 99.62%, Val Loss: 0.0060, Val Acc: 99.84%
Epoch 5/5, Train Loss: 0.0104, Train Acc: 99.72%, Val Loss: 0.0093, Val Acc: 99.68%
Free GPU memory: 9.92 GB
Total GPU memory: 14.58 GB
Training completed.


In [13]:
# Persist the weights obtained after the second training run
checkpoint_path = "/home/ghulam/FF++/combined_model_epoch5.pth"
torch.save(combined_model.state_dict(), checkpoint_path)
print(f"Model saved to {checkpoint_path}")

Model saved to /home/ghulam/FF++/combined_model_epoch5.pth


In [14]:
# Test Set Evaluation 2
# Re-evaluate on the test split after the additional training run. This reuses
# evaluate_model from the earlier evaluation cell instead of redefining an
# identical copy that would silently shadow it.
test_loss, test_accuracy = evaluate_model(combined_model, test_loader, criterion, device)

Test Loss: 0.0862, Test Acc: 98.00%


In [None]:
# # Verify GPU availability and configuration
# import torch
# print(f"CUDA available: {torch.cuda.is_available()}")
# if torch.cuda.is_available():
#     print(f"GPU count: {torch.cuda.device_count()}")
#     print(f"GPU name: {torch.cuda.get_device_name(0)}")
#     print(f"CUDA version: {torch.version.cuda}")
#     print(f"CuDNN version: {torch.backends.cudnn.version()}")

CUDA available: True
GPU count: 4
GPU name: Tesla T4
CUDA version: 12.1
CuDNN version: 90100
