### Cross-Dataset Validation on Celeb-DF Dataset

> **NOTE:**  
> Activate the virtual environment before running:  
> `conda activate new_env`
>
> If the environment does not exist, create it using the following commands:
>
> ```
> conda create -n new_env python=3.12 -y
> conda activate new_env
> ```
>
> **Select the interpreter from the upper right-hand side (RHS) in Jupyter:**  
> `new_env (Python 3.12.9) miniconda3/envs/new_env/bin/python - Conda Env`

In [None]:
# Import required libraries for face detection, file handling, and image processing
from facenet_pytorch import MTCNN
import os
from PIL import Image

# Initialize MTCNN model for face detection
# - image_size=224: Resize detected faces to 224x224 pixels
# - margin=20: Add 20-pixel margin around detected faces
# - device='cpu': Run on CPU to avoid CUDA issues
# - keep_all=False: Return only the most confident face per image
mtcnn = MTCNN(image_size=224, margin=20, device='cpu', keep_all=False)

# Define input and output directories
# - input_dir: Source directory containing raw Celeb-DF images in 'real' and 'fake' subfolders
# - output_dir: Destination directory to save cropped face images
input_dir = "/home/ghulam/Celeb-DF/output_folder"
output_dir = "/home/ghulam/Celeb-DF/test"

# Iterate over 'real' and 'fake' labels
for label in ['real', 'fake']:
    # Loop through subdirectories (e.g., video folders) in the current label folder
    for subdir in os.listdir(os.path.join(input_dir, label)):
        # Construct full paths for input and output directories
        in_path = os.path.join(input_dir, label, subdir)
        out_path = os.path.join(output_dir, label, subdir)
        
        # Create output directory if it doesn't exist
        os.makedirs(out_path, exist_ok=True)
        
        # Process each image in the subdirectory
        for img_name in os.listdir(in_path):
            # Check if the file is an image (JPG or PNG)
            if img_name.endswith(('.jpg', '.png')):
                # Open the image using PIL
                img = Image.open(os.path.join(in_path, img_name))
                
                # Detect and crop the face using MTCNN; returns a single face tensor or None
                face = mtcnn(img)
                
                # If a face is detected, process and save it
                if face is not None:
                    # Convert face tensor: permute to (H,W,C), scale to 0-255, convert to numpy
                    face = face.permute(1, 2, 0).mul(255).byte().numpy()
                    # Save the cropped face as an image in the output directory
                    Image.fromarray(face).save(os.path.join(out_path, img_name))

In [None]:
# Import libraries for PyTorch, neural networks, EfficientNet, and Vision Transformer
import torch
import torch.nn as nn
from torchvision import models
from transformers import ViTConfig, ViTForImageClassification

# Define a custom model combining EfficientNet and Vision Transformer (ViT)
class CombinedModel(nn.Module):
    # Initialize with pre-trained EfficientNet and ViT models
    def __init__(self, efficientnet, vit):
        super(CombinedModel, self).__init__()
        # Extract EfficientNet layers, excluding the last two (avgpool and classifier)
        self.efficientnet = nn.Sequential(*list(efficientnet.children())[:-2])
        # Store ViT model
        self.vit = vit
        # Add adaptive average pooling to reduce EfficientNet features to 1x1
        self.efficientnet_avgpool = nn.AdaptiveAvgPool2d(1)
        # Define feature dimensions: EfficientNet (1536), ViT (from config)
        eff_features_dim = 1536
        vit_features_dim = vit.config.hidden_size
        # Create a fully connected layer to combine features and output 2 classes (real/fake)
        self.fc = nn.Linear(eff_features_dim + vit_features_dim, 2)

    # Define forward pass for input images
    def forward(self, x):
        # Extract features from EfficientNet
        eff_features = self.efficientnet(x)
        # Apply average pooling to reduce spatial dimensions
        eff_features = self.efficientnet_avgpool(eff_features)
        # Flatten features to 1D
        eff_features = eff_features.view(eff_features.size(0), -1)
        # Resize input for ViT (ensure 224x224)
        vit_input = torch.nn.functional.interpolate(x, size=(224, 224), mode="bilinear", align_corners=False)
        # Pass through ViT, retrieving hidden states
        vit_outputs = self.vit(vit_input, output_hidden_states=True, return_dict=True)
        # Extract CLS token features from the last hidden state
        vit_features = vit_outputs.hidden_states[-1][:, 0]
        # Concatenate EfficientNet and ViT features
        combined_features = torch.cat((eff_features, vit_features), dim=1)
        # Pass combined features through fully connected layer for classification
        output = self.fc(combined_features)
        return output

# Set device to CPU to avoid CUDA issues
device = torch.device("cpu")

# Load pre-trained EfficientNet-B3 model with ImageNet weights
efficientnet = models.efficientnet_b3(weights="EfficientNet_B3_Weights.IMAGENET1K_V1")
# Modify classifier to output 2 classes (real/fake)
efficientnet.classifier[1] = nn.Linear(efficientnet.classifier[1].in_features, 2)
# Move EfficientNet to CPU
efficientnet.to(device)

# Configure ViT model with 2 output labels
config = ViTConfig.from_pretrained("google/vit-base-patch16-224")
config.num_labels = 2
# Load pre-trained ViT model, ignoring mismatched classifier weights
vit = ViTForImageClassification.from_pretrained(
    "google/vit-base-patch16-224",
    config=config,
    ignore_mismatched_sizes=True
)
# Replace ViT classifier with a new layer for 2 classes
vit.classifier = nn.Linear(vit.config.hidden_size, 2)
# Move ViT to CPU
vit.to(device)

# Create combined model instance with EfficientNet and ViT
combined_model = CombinedModel(efficientnet, vit)
# Move combined model to CPU
combined_model.to(device)

# Load pre-trained weights from FF++ training, mapping to CPU
combined_model.load_state_dict(torch.load("/home/ghulam/FF++/combined_model_epoch5.pth", map_location=torch.device('cpu')))
# Set model to evaluation mode (disable dropout, batch norm updates)
combined_model.eval()

# Confirm successful model loading
print("Model loaded successfully on CPU.")

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded successfully on CPU.


In [None]:
# Import libraries for file handling, dataset creation, and image processing
import os
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image

# Define image transformations to match FF++ preprocessing
transform = transforms.Compose([
    # Resize images to 224x224 pixels
    transforms.Resize((224, 224)),
    # Convert images to PyTorch tensors
    transforms.ToTensor(),
    # Normalize pixel values using ImageNet mean and standard deviation
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Custom dataset class for Celeb-DF images
class DeepfakeDataset(Dataset):
    # Initialize with image paths, labels, and transformation pipeline
    def __init__(self, image_paths, labels, transform):
        self.image_paths = image_paths
        self.labels = labels
        self.transform = transform
        self.valid_indices = []
        # Create a log file to track invalid images
        log_file = "/home/ghulam/Celeb-DF/invalid_images.txt"
        with open(log_file, "w") as log:
            # Check each image for validity
            for idx, path in enumerate(self.image_paths):
                # Skip if file doesn't exist
                if not os.path.isfile(path):
                    print(f"Skipping missing file: {path}")
                    log.write(f"{path}: File does not exist\n")
                    continue
                try:
                    # Attempt to open and validate image as RGB
                    img = Image.open(path).convert("RGB")
                    img.close()
                    self.valid_indices.append(idx)
                except Exception as e:
                    # Log and skip invalid images
                    print(f"Skipping invalid image: {path} ({e})")
                    log.write(f"{path}: {e}\n")
        # Report number of valid images loaded
        print(f"Loaded {len(self.valid_indices)} valid images out of {len(self.image_paths)}")

    # Return number of valid images in dataset
    def __len__(self):
        return len(self.valid_indices)

    # Retrieve image and label for a given index
    def __getitem__(self, idx):
        actual_idx = self.valid_indices[idx]
        try:
            # Load and convert image to RGB
            image = Image.open(self.image_paths[actual_idx]).convert("RGB")
            # Apply transformations (resize, normalize, etc.)
            image = self.transform(image)
            # Convert label to PyTorch tensor
            label = torch.tensor(self.labels[actual_idx], dtype=torch.long)
            return image, label
        except Exception as e:
            # Log errors during loading and return None
            print(f"Error loading image {self.image_paths[actual_idx]}: {e}")
            return None

# Custom collate function to handle None entries in batch
def collate_fn(batch):
    # Filter out None items (failed image loads)
    batch = [item for item in batch if item is not None]
    # Return None if batch is empty
    if len(batch) == 0:
        return None
    # Use default collate to combine valid items into tensors
    return torch.utils.data.dataloader.default_collate(batch)

# Define base directory for Celeb-DF dataset
base_dir = "/home/ghulam/Celeb-DF"
# Set test directory containing preprocessed images
test_dir = os.path.join(base_dir, "test")

# Collect paths for real images from all subdirectories
real_test_images = [os.path.join(test_dir, "real", subdir, img) 
                    for subdir in os.listdir(os.path.join(test_dir, "real")) 
                    for img in os.listdir(os.path.join(test_dir, "real", subdir))]
# Collect paths for fake images from all subdirectories
fake_test_images = [os.path.join(test_dir, "fake", subdir, img) 
                    for subdir in os.listdir(os.path.join(test_dir, "fake")) 
                    for img in os.listdir(os.path.join(test_dir, "fake", subdir))]
# Combine real and fake image paths
test_paths = real_test_images + fake_test_images
# Assign labels: 0 for real, 1 for fake
test_labels = [0] * len(real_test_images) + [1] * len(fake_test_images)

# Create dataset instance for Celeb-DF test set
cross_test_dataset = DeepfakeDataset(test_paths, test_labels, transform)
# Create DataLoader for batch processing
cross_test_loader = DataLoader(
    cross_test_dataset, 
    batch_size=4,  # Process 4 images per batch
    shuffle=False,  # Maintain order for evaluation
    num_workers=4,  # Use 4 CPU workers for loading
    pin_memory=False,  # Disable pinned memory for CPU
    collate_fn=collate_fn  # Use custom collate function
)
# Print total samples and number of batches
print(f"Cross-dataset test samples: {len(test_paths)}, Batches: {len(cross_test_loader)}")

Loaded 15457 valid images out of 15457
Cross-dataset test samples: 15457, Batches: 3865


In [None]:
# Import libraries for PyTorch, neural networks, EfficientNet, and Vision Transformer
import torch
import torch.nn as nn
from torchvision import models
from transformers import ViTConfig, ViTForImageClassification

# Define a custom model combining EfficientNet and Vision Transformer (ViT)
class CombinedModel(nn.Module):
    # Initialize with pre-trained EfficientNet and ViT models
    def __init__(self, efficientnet, vit):
        super(CombinedModel, self).__init__()
        # Extract EfficientNet layers, excluding the last two (avgpool and classifier)
        self.efficientnet = nn.Sequential(*list(efficientnet.children())[:-2])
        # Store ViT model
        self.vit = vit
        # Add adaptive average pooling to reduce EfficientNet features to 1x1
        self.efficientnet_avgpool = nn.AdaptiveAvgPool2d(1)
        # Define feature dimensions: EfficientNet (1536), ViT (from config)
        eff_features_dim = 1536
        vit_features_dim = vit.config.hidden_size
        # Create a fully connected layer to combine features and output 2 classes (real/fake)
        self.fc = nn.Linear(eff_features_dim + vit_features_dim, 2)

    # Define forward pass for input images
    def forward(self, x):
        # Extract features from EfficientNet
        eff_features = self.efficientnet(x)
        # Apply average pooling to reduce spatial dimensions
        eff_features = self.efficientnet_avgpool(eff_features)
        # Flatten features to 1D
        eff_features = eff_features.view(eff_features.size(0), -1)
        # Resize input for ViT to ensure 224x224 resolution
        vit_input = torch.nn.functional.interpolate(x, size=(224, 224), mode="bilinear", align_corners=False)
        # Pass through ViT, retrieving hidden states
        vit_outputs = self.vit(vit_input, output_hidden_states=True, return_dict=True)
        # Extract CLS token features from the last hidden state
        vit_features = vit_outputs.hidden_states[-1][:, 0]
        # Concatenate EfficientNet and ViT features
        combined_features = torch.cat((eff_features, vit_features), dim=1)
        # Pass combined features through fully connected layer for classification
        output = self.fc(combined_features)
        return output

# Set device to CPU to avoid CUDA compatibility issues
device = torch.device("cpu")

# Load pre-trained EfficientNet-B3 model with ImageNet weights
efficientnet = models.efficientnet_b3(weights="EfficientNet_B3_Weights.IMAGENET1K_V1")
# Modify the classifier to output 2 classes (real/fake)
efficientnet.classifier[1] = nn.Linear(efficientnet.classifier[1].in_features, 2)
# Move EfficientNet model to CPU
efficientnet.to(device)

# Load configuration for Vision Transformer (ViT) with 2 output labels
config = ViTConfig.from_pretrained("google/vit-base-patch16-224")
config.num_labels = 2
# Initialize ViT model with pre-trained weights, ignoring mismatched classifier sizes
vit = ViTForImageClassification.from_pretrained(
    "google/vit-base-patch16-224",
    config=config,
    ignore_mismatched_sizes=True
)
# Replace ViT classifier with a new layer for 2 classes
vit.classifier = nn.Linear(vit.config.hidden_size, 2)
# Move ViT model to CPU
vit.to(device)

# Create an instance of the combined model with EfficientNet and ViT
combined_model = CombinedModel(efficientnet, vit)
# Move the combined model to CPU
combined_model.to(device)

# Load pre-trained weights from FF++ training, mapping to CPU to avoid CUDA issues
combined_model.load_state_dict(torch.load("/home/ghulam/FF++/combined_model_epoch5.pth", map_location=torch.device('cpu')))
# Set the model to evaluation mode (disables dropout and batch normalization updates)
combined_model.eval()

# Confirm that the model was loaded successfully
print("Model loaded successfully on CPU.")

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded successfully on CPU.


In [None]:
# Import PyTorch's neural network module for loss calculation
import torch.nn as nn

# Define a function to evaluate the model on a test dataset
def evaluate_model(model, test_loader, criterion, device):
    # Set the model to evaluation mode (disables dropout and batch normalization updates)
    model.eval()
    # Initialize variables to track loss, correct predictions, and total samples
    test_loss, test_correct, test_total = 0.0, 0, 0
    # Disable gradient computation for efficiency during evaluation
    with torch.no_grad():
        # Iterate over batches in the test DataLoader
        for images, labels in test_loader:
            # Skip batches with None (invalid images)
            if images is None:
                continue
            # Move images and labels to the specified device (CPU)
            images, labels = images.to(device), labels.to(device)
            # Forward pass: get model predictions
            outputs = model(images)
            # Compute loss using the provided criterion
            loss = criterion(outputs, labels)
            # Accumulate loss (weighted by batch size)
            test_loss += loss.item() * images.size(0)
            # Count correct predictions by comparing predicted and true labels
            test_correct += (outputs.argmax(1) == labels).sum().item()
            # Track total number of samples processed
            test_total += labels.size(0)
    # Compute average test loss
    test_loss /= test_total
    # Compute test accuracy as the fraction of correct predictions
    test_accuracy = test_correct / test_total
    # Print formatted test loss and accuracy
    print(f"Test Loss: {test_loss:.4f}, Test Acc: {test_accuracy:.2%}")
    # Return computed metrics
    return test_loss, test_accuracy

# Define the loss function (cross-entropy for classification)
criterion = nn.CrossEntropyLoss()
# Evaluate the combined model on the Celeb-DF test dataset
test_loss, test_accuracy = evaluate_model(combined_model, cross_test_loader, criterion, device)

Test Loss: 1.3248, Test Acc: 50.42%


In [None]:
# Import libraries for PyTorch, confusion matrix calculation, and numerical operations
import torch
from sklearn.metrics import confusion_matrix
import numpy as np

# Define a function to compute the confusion matrix for model predictions
def get_confusion_matrix(model, test_loader, device):
    # Set the model to evaluation mode (disables dropout and batch normalization updates)
    model.eval()
    # Initialize lists to store all predictions and true labels
    all_preds = []
    all_labels = []
    # Disable gradient computation for efficiency during evaluation
    with torch.no_grad():
        # Iterate over batches in the test DataLoader
        for images, labels in test_loader:
            # Skip batches with None (invalid images)
            if images is None:
                continue
            # Move images and labels to the specified device (CPU)
            images, labels = images.to(device), labels.to(device)
            # Forward pass: get model predictions
            outputs = model(images)
            # Get predicted class indices by selecting the highest output score
            preds = outputs.argmax(1)
            # Convert predictions to numpy and append to the list
            all_preds.extend(preds.cpu().numpy())
            # Convert true labels to numpy and append to the list
            all_labels.extend(labels.cpu().numpy())
    # Compute the confusion matrix using true labels and predictions
    cm = confusion_matrix(all_labels, all_preds)
    # Print the confusion matrix in a readable format
    print("Confusion Matrix:")
    print(f"[[TN={cm[0,0]} FP={cm[0,1]}]")  # True Negatives, False Positives (Real class)
    print(f" [FN={cm[1,0]} TP={cm[1,1]}]]")  # False Negatives, True Positives (Fake class)
    # Return the confusion matrix
    return cm

# Compute and display the confusion matrix for the combined model on Celeb-DF test data
cm = get_confusion_matrix(combined_model, cross_test_loader, device)

Confusion Matrix:
[[TN=2473 FP=2851]
 [FN=4812 TP=5321]]


In [None]:
# Import library for generating classification metrics
from sklearn.metrics import classification_report

# Define a function to compute classification metrics (precision, recall, F1-score)
def get_classification_metrics(model, test_loader, device):
    # Set the model to evaluation mode (disables dropout and batch normalization updates)
    model.eval()
    # Initialize lists to store all predictions and true labels
    all_preds = []
    all_labels = []
    # Disable gradient computation for efficiency during evaluation
    with torch.no_grad():
        # Iterate over batches in the test DataLoader
        for images, labels in test_loader:
            # Skip batches with None (invalid images)
            if images is None:
                continue
            # Move images and labels to the specified device (CPU)
            images, labels = images.to(device), labels.to(device)
            # Forward pass: get model predictions
            outputs = model(images)
            # Get predicted class indices by selecting the highest output score
            preds = outputs.argmax(1)
            # Convert predictions to numpy and append to the list
            all_preds.extend(preds.cpu().numpy())
            # Convert true labels to numpy and append to the list
            all_labels.extend(labels.cpu().numpy())
    # Print a detailed classification report with precision, recall, and F1-score for Real and Fake classes
    print(classification_report(all_labels, all_preds, target_names=['Real', 'Fake']))
    # Return the classification report as a dictionary for further use
    return classification_report(all_labels, all_preds, output_dict=True)

# Compute and display classification metrics for the combined model on Celeb-DF test data
metrics = get_classification_metrics(combined_model, cross_test_loader, device)

              precision    recall  f1-score   support

        Real       0.34      0.46      0.39      5324
        Fake       0.65      0.53      0.58     10133

    accuracy                           0.50     15457
   macro avg       0.50      0.49      0.49     15457
weighted avg       0.54      0.50      0.52     15457



In [None]:
# Calculate accuracy for the Real class using confusion matrix
# - cm[0,0]: True Negatives (correctly predicted Real)
# - cm[0,1]: False Positives (Real predicted as Fake)
# - Compute accuracy as TN / (TN + FP), return 0 if denominator is 0
real_acc = cm[0,0] / (cm[0,0] + cm[0,1]) if (cm[0,0] + cm[0,1]) > 0 else 0

# Calculate accuracy for the Fake class using confusion matrix
# - cm[1,1]: True Positives (correctly predicted Fake)
# - cm[1,0]: False Negatives (Fake predicted as Real)
# - Compute accuracy as TP / (TP + FN), return 0 if denominator is 0
fake_acc = cm[1,1] / (cm[1,1] + cm[1,0]) if (cm[1,1] + cm[1,0]) > 0 else 0

# Print Real class accuracy formatted as a percentage with 2 decimal places
print(f"Real Accuracy: {real_acc:.2%}")

# Print Fake class accuracy formatted as a percentage with 2 decimal places
print(f"Fake Accuracy: {fake_acc:.2%}")

Real Accuracy: 46.45%
Fake Accuracy: 52.51%


In [3]:
with open("/home/ghulam/Celeb-DF/evaluation_results.txt", "w") as f:
    f.write(f"Test Loss: 1.3248\n")
    f.write(f"Test Acc: 50.42%\n")
    f.write(f"Confusion Matrix:\n[[TN=2473 FP=2851]\n [FN=4812 TP=5321]]\n")
    f.write(f"Real Accuracy: 46.45%\n")
    f.write(f"Fake Accuracy: 52.51%\n")
    f.write("Classification Report:\n")
    f.write("              precision    recall  f1-score   support\n\n")
    f.write("       Real       0.34      0.46      0.39      5324\n")
    f.write("       Fake       0.65      0.53      0.58     10133\n\n")
    f.write("    accuracy                           0.50     15457\n")
    f.write("   macro avg       0.50      0.49      0.49     15457\n")
    f.write("weighted avg       0.54      0.50      0.52     15457\n")
print("Results saved to /home/ghulam/Celeb-DF/evaluation_results.txt")

Results saved to /home/ghulam/Celeb-DF/evaluation_results.txt
