In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


AUDIO

In [None]:
# Install requirements
!pip install torch torchaudio tqdm scikit-learn
!apt install ffmpeg

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
from torch.utils.data import Dataset, DataLoader
import os
import numpy as np
import subprocess
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Configuration
SAMPLE_RATE = 16000
MAX_LENGTH = 5 * SAMPLE_RATE
BATCH_SIZE = 32
EPOCHS = 50
ADV_WEIGHT = 0.2

class RobustRawNet(nn.Module):
    def __init__(self):
        super().__init__()
        # Main pathway
        self.initial = nn.Sequential(
            nn.Conv1d(1, 128, kernel_size=51, padding=25),
            nn.BatchNorm1d(128),
            nn.LeakyReLU(0.1)
        )

        self.res_blocks = nn.Sequential(
            self._make_res_block(128, 128, 27),
            self._make_res_block(128, 256, 15),
            self._make_res_block(256, 256, 15)
        )

        # Enhanced temporal modeling
        self.gru = nn.GRU(
            input_size=256,
            hidden_size=1024,
            num_layers=2,
            bidirectional=True,
            batch_first=True
        )

        # Classification head
        self.classifier = nn.Sequential(
            nn.Linear(2048, 256),
            nn.LeakyReLU(0.1),
            nn.Linear(256, 2)
        )

    def _make_res_block(self, in_ch, out_ch, kernel_size):
        return nn.Sequential(
            nn.Conv1d(in_ch, out_ch, kernel_size, padding=kernel_size//2),
            nn.BatchNorm1d(out_ch),
            nn.LeakyReLU(0.1),
            nn.Conv1d(out_ch, out_ch, kernel_size, padding=kernel_size//2),
            nn.BatchNorm1d(out_ch),
            nn.LeakyReLU(0.1),
            nn.MaxPool1d(3)
        )

    def forward(self, x):
        x = self.initial(x)
        x = self.res_blocks(x)
        x = x.permute(0, 2, 1)
        x, _ = self.gru(x)
        x = x[:, -1, :]
        return self.classifier(x)

class MP4AudioDataset(Dataset):
    def __init__(self, base_path, real_dir, fake_dir):
        self.real_path = os.path.join(base_path, real_dir)
        self.fake_path = os.path.join(base_path, fake_dir)
        self.file_paths = []
        self.labels = []

        # Load MP4 files
        for dir_path, label in [(self.real_path, 1), (self.fake_path, 0)]:
            if os.path.exists(dir_path):
                for f in os.listdir(dir_path):
                    if f.lower().endswith('.mp4'):
                        self.file_paths.append(os.path.join(dir_path, f))
                        self.labels.append(label)

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        while True:
            try:
                path = self.file_paths[idx]
                label = self.labels[idx]

                # FFmpeg extraction with validation
                temp_wav = f"temp_{os.getpid()}.wav"
                cmd = [
                    'ffmpeg', '-y', '-i', path,
                    '-ac', '1', '-ar', str(SAMPLE_RATE),
                    '-t', '5', '-loglevel', 'error',
                    temp_wav
                ]
                result = subprocess.run(cmd, stderr=subprocess.PIPE)

                if result.returncode != 0:
                    raise RuntimeError(f"FFmpeg error: {result.stderr.decode()}")

                # Load and validate audio
                waveform, sr = torchaudio.load(temp_wav)
                os.remove(temp_wav)

                if waveform.nelement() == 0:
                    raise ValueError("Empty audio file")

                # Process waveform
                waveform = waveform.mean(dim=0, keepdim=True)  # Force mono
                waveform = waveform / (waveform.abs().max() + 1e-9)

                # Pad/trim
                if waveform.shape[1] > MAX_LENGTH:
                    waveform = waveform[:, :MAX_LENGTH]
                else:
                    pad = MAX_LENGTH - waveform.shape[1]
                    waveform = F.pad(waveform, (0, pad))

                return waveform, torch.tensor(label)

            except Exception as e:
                print(f"Skipping {path}: {str(e)}")
                idx = (idx + 1) % len(self)

# Initialize dataset
dataset = MP4AudioDataset(
    base_path="/content/drive/MyDrive/SDFVD Small-scale Deepfake Forgery Video Dataset/SDFVD",
    real_dir="videos_real",
    fake_dir="videos_fake"
)

# Split dataset
train_idx, test_idx = train_test_split(
    range(len(dataset)),
    test_size=0.2,
    stratify=dataset.labels,
    random_state=42
)

# Create loaders
train_loader = DataLoader(
    torch.utils.data.Subset(dataset, train_idx),
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=2,
    persistent_workers=True
)

test_loader = DataLoader(
    torch.utils.data.Subset(dataset, test_idx),
    batch_size=BATCH_SIZE,
    num_workers=2,
    persistent_workers=True
)

# Training setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = RobustRawNet().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
criterion = nn.CrossEntropyLoss()
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)

# Training loop
best_acc = 0
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    correct = 0
    total = 0

    pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}")
    for waveforms, labels in pbar:
        waveforms = waveforms.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(waveforms)
        loss = criterion(outputs, labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
        optimizer.step()

        total_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        pbar.set_postfix({'Loss': f"{loss.item():.4f}", 'Acc': f"{100*correct/total:.2f}%"})

    # Validation
    model.eval()
    test_correct = 0
    test_total = 0
    with torch.no_grad():
        for waveforms, labels in test_loader:
            waveforms = waveforms.to(device)
            labels = labels.to(device)
            outputs = model(waveforms)
            _, predicted = torch.max(outputs, 1)
            test_total += labels.size(0)
            test_correct += (predicted == labels).sum().item()

    test_acc = test_correct / test_total
    print(f"Test Acc: {100*test_acc:.2f}%")
    scheduler.step()

    if test_acc > best_acc:
        best_acc = test_acc
        torch.save(model.state_dict(), "best_deepfake_detector.pth")
        print("Saved new best model")

print(f"\nBest Test Accuracy: {100*best_acc:.2f}%")


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


Epoch 1:   0%|          | 0/3 [00:00<?, ?it/s]

Skipping /content/drive/MyDrive/SDFVD Small-scale Deepfake Forgery Video Dataset/SDFVD/videos_real/v27.mp4: FFmpeg error: Output file #0 does not contain any stream
Skipping /content/drive/MyDrive/SDFVD Small-scale Deepfake Forgery Video Dataset/SDFVD/videos_real/v28.mp4: FFmpeg error: Output file #0 does not contain any stream


Skipping /content/drive/MyDrive/SDFVD Small-scale Deepfake Forgery Video Dataset/SDFVD/videos_real/v29.mp4: FFmpeg error: Output file #0 does not contain any stream

Skipping /content/drive/MyDrive/SDFVD Small-scale Deepfake Forgery Video Dataset/SDFVD/videos_real/v15.mp4: FFmpeg error: Output file #0 does not contain any stream

Skipping /content/drive/MyDrive/SDFVD Small-scale Deepfake Forgery Video Dataset/SDFVD/videos_real/v49.mp4: FFmpeg error: Output file #0 does not contain any stream

Skipping /content/drive/MyDrive/SDFVD Small-scale Deepfake Forgery Video Dataset/SDFVD/videos_real/v31.mp4: FFmpeg error: Output file #0 does not contain any stream

Skip

Epoch 1:   0%|          | 0/3 [00:12<?, ?it/s]


KeyboardInterrupt: 

VIDEOS

In [None]:
!pip install optuna


Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.4.0-py3-none-any.whl (395 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m395.9/395.9 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.2-py3-none-any.whl (242 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.7/242.7 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.16.2 colorlog-6.9.0 optuna-4.4.0


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import ViTModel, ViTConfig
import albumentations as A
from albumentations.pytorch import ToTensorV2
import cv2
import os
import random
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import torchvision.transforms as T
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
import json

# ================== Frame Extraction Functions ==================
def extract_frames(video_path, output_dir, frames_per_video=3):
    """Extract frames from video file"""
    os.makedirs(output_dir, exist_ok=True)
    cap = cv2.VideoCapture(video_path)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    interval = max(1, total_frames // frames_per_video)

    for i in range(frames_per_video):
        cap.set(cv2.CAP_PROP_POS_FRAMES, i * interval)
        ret, frame = cap.read()
        if ret:
            frame_path = os.path.join(
                output_dir,
                f"{os.path.basename(video_path).split('.')[0]}_frame_{i:03d}.jpg"
            )
            cv2.imwrite(frame_path, frame)
    cap.release()

def process_dataset(input_dir, output_dir, frames_per_video=3):
    """Process directory of videos into frames"""
    print(f"Processing {input_dir}...")
    for video_file in os.listdir(input_dir):
        video_path = os.path.join(input_dir, video_file)
        if os.path.isfile(video_path):
            extract_frames(video_path, output_dir, frames_per_video)

# ================== Dataset and Model Classes ==================
class DeepfakeContrastiveDataset(Dataset):
    def __init__(self, real_dir, fake_dir, transform=None, mode='train'):
        self.mode = mode
        self.transform = transform
        self.real_images = []
        self.fake_images = []

        # Get unique video IDs
        real_videos = sorted({f.split('_')[0] for f in os.listdir(real_dir)})
        fake_videos = sorted({f.split('_')[0] for f in os.listdir(fake_dir)})

        # Split videos (70% train, 15% val, 15% test)
        real_train, real_temp = train_test_split(real_videos, test_size=0.3, random_state=42)
        real_val, real_test = train_test_split(real_temp, test_size=0.5, random_state=42)

        fake_train, fake_temp = train_test_split(fake_videos, test_size=0.3, random_state=42)
        fake_val, fake_test = train_test_split(fake_temp, test_size=0.5, random_state=42)

        # Select split
        if mode == 'train':
            real_vids, fake_vids = real_train, fake_train
        elif mode == 'val':
            real_vids, fake_vids = real_val, fake_val
        elif mode == 'test':
            real_vids, fake_vids = real_test, fake_test

        # Load frames
        self._load_frames(real_dir, real_vids, self.real_images)
        self._load_frames(fake_dir, fake_vids, self.fake_images)

        # Create contrastive pairs for training
        if mode == 'train':
            self.pairs = []
            # Real-Fake pairs
            for _ in range(int(len(self.real_images)*1.4)):
                self.pairs.append((
                    random.choice(self.real_images),
                    random.choice(self.fake_images),
                    0
                ))
            # Real-Real pairs
            for _ in range(int(len(self.real_images)*0.6)):
                self.pairs.append((
                    random.choice(self.real_images),
                    random.choice(self.real_images),
                    1
                ))
            random.shuffle(self.pairs)

    def _load_frames(self, root_dir, video_list, target_list):
        """Load frames for specified videos"""
        for f in os.listdir(root_dir):
            video_id = f.split('_')[0]
            if video_id in video_list:
                target_list.append(os.path.join(root_dir, f))

    def __len__(self):
        return len(self.pairs) if self.mode == 'train' else len(self.real_images) + len(self.fake_images)

    def __getitem__(self, idx):
        if self.mode != 'train':
            # Validation/Test mode
            if idx < len(self.real_images):
                img_path = self.real_images[idx]
                label = 1
            else:
                img_path = self.fake_images[idx - len(self.real_images)]
                label = 0

            img = cv2.imread(img_path)
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            if self.transform:
                img = self.transform(image=img)['image']
            return img, torch.tensor(label, dtype=torch.long)
        else:
            # Training mode with contrastive pairs
            img1_path, img2_path, label = self.pairs[idx]
            img1 = cv2.imread(img1_path)
            img1 = cv2.cvtColor(img1, cv2.COLOR_BGR2RGB)
            img2 = cv2.imread(img2_path)
            img2 = cv2.cvtColor(img2, cv2.COLOR_BGR2RGB)

            if self.transform:
                img1 = self.transform(image=img1)['image']
                img2 = self.transform(image=img2)['image']

            return img1, img2, torch.tensor(label, dtype=torch.float32)

class EnhancedDeepfakeClassifier(nn.Module):
    def __init__(self, config):
        super().__init__()
        vit_config = ViTConfig.from_pretrained("facebook/deit-tiny-distilled-patch16-224")
        vit_config.output_attentions = True
        self.vit = ViTModel.from_pretrained("facebook/deit-tiny-distilled-patch16-224", config=vit_config)

        # Freeze initial layers
        for param in self.vit.parameters():
            param.requires_grad = False

        # Progressive unfreezing setup
        self.layer_groups = {
            'final_layers': ['encoder.layer.9', 'encoder.layer.10', 'encoder.layer.11'],
            'mid_layers': ['encoder.layer.6', 'encoder.layer.7', 'encoder.layer.8'],
            'early_layers': ['encoder.layer.3', 'encoder.layer.4', 'encoder.layer.5']
        }
        self.config = config

        # Classification head
        self.classifier = nn.Sequential(
            nn.Linear(192, 256),
            nn.GELU(),
            nn.Dropout(config.get('DROPOUT_RATE', 0.2)),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(config.get('DROPOUT_RATE', 0.2)),
            nn.Linear(128, 2)
        )

    def unfreeze_layer_group(self, group_name):
        """Unfreeze specified layer group"""
        for name, param in self.vit.named_parameters():
            if any(layer in name for layer in self.layer_groups[group_name]):
                param.requires_grad = True
        print(f"Unfroze layer group: {group_name}")

    def forward(self, x, return_attn=False):
        outputs = self.vit(x)
        pooled = outputs.last_hidden_state[:, 0]  # CLS token
        logits = self.classifier(pooled)
        return (logits, outputs.attentions) if return_attn else logits

# ================== Training and Evaluation ==================
def train_with_progressive_unfreezing(model, train_loader, val_loader, config, device):
    optimizer = torch.optim.AdamW(model.parameters(), lr=config['LEARNING_RATE'], weight_decay=config['WEIGHT_DECAY'])
    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=20)
    criterion = nn.CrossEntropyLoss()

    best_val_acc = 0
    patience_counter = 0

    for epoch in range(config['NUM_EPOCHS']):
        # Progressive unfreezing
        if epoch in config['UNFREEZE_SCHEDULE']:
            groups = ['final_layers', 'mid_layers', 'early_layers']
            idx = config['UNFREEZE_SCHEDULE'].index(epoch)
            if idx < len(groups):
                model.unfreeze_layer_group(groups[idx])

        # Training loop
        model.train()
        total_loss = 0
        for batch_idx, (img1, img2, labels) in enumerate(train_loader):
            img1, img2, labels = img1.to(device), img2.to(device), labels.to(device)

            # Forward passes
            logits1 = model(img1)
            logits2 = model(img2)

            # Loss calculation
            loss1 = criterion(logits1, torch.ones_like(labels).long())
            loss2 = criterion(logits2, (labels == 1).long())
            loss = (loss1 + loss2) / 2

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # Validation
        val_acc = evaluate(model, val_loader, device)
        print(f"Epoch {epoch+1}/{config['NUM_EPOCHS']} | Loss: {total_loss/len(train_loader):.4f} | Val Acc: {val_acc:.4f}")

        # Early stopping
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), 'best_model.pth')
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= config['PATIENCE']:
                print("Early stopping triggered")
                break

        scheduler.step()

    return best_val_acc

def evaluate(model, loader, device):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    return correct / total if total > 0 else 0

# ================== Hyperparameter Search ==================
def get_search_space(trial):
    return {
        'LEARNING_RATE': trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True),
        'WEIGHT_DECAY': trial.suggest_float('weight_decay', 1e-6, 1e-2, log=True),
        'DROPOUT_RATE': trial.suggest_float('dropout_rate', 0.1, 0.5),
        'UNFREEZE_SCHEDULE': trial.suggest_categorical('unfreeze_schedule',
            [[15, 30, 45], [20, 40, 60], [25, 50, 75]])
    }

def objective(trial, train_loader, val_loader, device):
    config = {
        'NUM_EPOCHS': 50,
        'PATIENCE': 10,
        'BATCH_SIZE': 16,
        'FRAMES_PER_VIDEO': 3,
        'IMAGE_SIZE': 224
    }
    config.update(get_search_space(trial))

    model = EnhancedDeepfakeClassifier(config).to(device)

    best_val_acc = train_with_progressive_unfreezing(model, train_loader, val_loader, config, device)
    return best_val_acc

def run_hyperparameter_search(train_loader, val_loader, device, n_trials=20):
    study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=42), pruner=MedianPruner(n_startup_trials=5, n_warmup_steps=10))
    study.optimize(lambda trial: objective(trial, train_loader, val_loader, device), n_trials=n_trials)
    print(f"Best hyperparameters: {study.best_params}")
    print(f"Best validation accuracy: {study.best_value:.4f}")
    return study.best_params

# ================== Main Execution ==================
if __name__ == "__main__":
    # Configuration
    base_config = {
        'BATCH_SIZE': 16,
        'NUM_EPOCHS': 50,
        'LEARNING_RATE': 3e-4,
        'WEIGHT_DECAY': 0.01,
        'PATIENCE': 10,
        'IMAGE_SIZE': 224,
        'FRAMES_PER_VIDEO': 3,
        'UNFREEZE_SCHEDULE': [15, 30, 45],
        'DROPOUT_RATE': 0.3
    }

    # Setup paths
    base_path = "/content/drive/MyDrive/SDFVD Small-scale Deepfake Forgery Video Dataset/SDFVD"
    real_videos_dir = os.path.join(base_path, "videos_real")
    fake_videos_dir = os.path.join(base_path, "videos_fake")
    real_frames_dir = os.path.join(base_path, "real_frames")
    fake_frames_dir = os.path.join(base_path, "fake_frames")

    # Process datasets
    process_dataset(real_videos_dir, real_frames_dir, base_config['FRAMES_PER_VIDEO'])
    process_dataset(fake_videos_dir, fake_frames_dir, base_config['FRAMES_PER_VIDEO'])

    # Create datasets
    train_transform = A.Compose([
        A.RandomResizedCrop(224, 224),
        A.HorizontalFlip(p=0.5),
        A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ToTensorV2()
    ])

    test_transform = A.Compose([
        A.Resize(224, 224),
        A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ToTensorV2()
    ])

    train_dataset = DeepfakeContrastiveDataset(real_frames_dir, fake_frames_dir, train_transform, 'train')
    val_dataset = DeepfakeContrastiveDataset(real_frames_dir, fake_frames_dir, test_transform, 'val')

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=base_config['BATCH_SIZE'], shuffle=True, num_workers=2)
    val_loader = DataLoader(val_dataset, batch_size=base_config['BATCH_SIZE'], num_workers=2)

    # Initialize model
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Run hyperparameter search
    best_params = run_hyperparameter_search(train_loader, val_loader, device, n_trials=10)

    # Update config with best params
    final_config = base_config.copy()
    final_config.update(best_params)

    # Train final model
    final_model = EnhancedDeepfakeClassifier(final_config).to(device)
    train_with_progressive_unfreezing(final_model, train_loader, val_loader, final_config, device)

    # Load best model and evaluate on test set
    final_model.load_state_dict(torch.load('best_model.pth'))
    test_dataset = DeepfakeContrastiveDataset(real_frames_dir, fake_frames_dir, test_transform, 'test')
    test_loader = DataLoader(test_dataset, batch_size=final_config['BATCH_SIZE'], num_workers=2)

    def evaluate_final(model, loader, device):
        model.eval()
        all_preds = []
        all_labels = []
        with torch.no_grad():
            for imgs, labels in loader:
                imgs, labels = imgs.to(device), labels.to(device)
                outputs = model(imgs)
                _, preds = torch.max(outputs, 1)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
        acc = np.mean(np.array(all_preds) == np.array(all_labels))
        print(f"\nTest Accuracy: {acc:.4f}")
        print("Classification Report:")
        print(classification_report(all_labels, all_preds, target_names=['Fake', 'Real']))
        cm = confusion_matrix(all_labels, all_preds)
        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Fake', 'Real'])
        disp.plot(cmap='Blues')
        plt.show()

    evaluate_final(final_model, test_loader, device)


  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


Processing /content/drive/MyDrive/SDFVD Small-scale Deepfake Forgery Video Dataset/SDFVD/videos_real...
Processing /content/drive/MyDrive/SDFVD Small-scale Deepfake Forgery Video Dataset/SDFVD/videos_fake...


[I 2025-06-17 03:26:38,076] A new study created in memory with name: no-name-b00555fa-4a4f-470f-af0a-546496c20b83


Downloading config.json:   0%|          | 0.00/69.6k [00:00<?, ?B/s]

You are using a model of type deit to instantiate a model of type vit. This is not supported for all configurations of models and can yield errors.


Downloading pytorch_model.bin:   0%|          | 0.00/23.7M [00:00<?, ?B/s]

Some weights of ViTModel were not initialized from the model checkpoint at facebook/deit-tiny-distilled-patch16-224 and are newly initialized: ['encoder.layer.9.attention.attention.value.bias', 'encoder.layer.4.layernorm_before.bias', 'encoder.layer.7.attention.output.dense.weight', 'layernorm.weight', 'encoder.layer.5.intermediate.dense.bias', 'encoder.layer.2.output.dense.weight', 'encoder.layer.6.output.dense.bias', 'encoder.layer.9.intermediate.dense.bias', 'encoder.layer.0.attention.attention.query.weight', 'encoder.layer.8.attention.attention.query.weight', 'encoder.layer.2.layernorm_after.weight', 'encoder.layer.2.attention.attention.value.weight', 'encoder.layer.0.attention.attention.value.bias', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.6.attention.attention.query.bias', 'encoder.layer.9.attention.output.dense.weight', 'encoder.layer.4.output.dense.bias', 'encoder.layer.10.layernorm_before.weight', 'encoder.layer.11.attention.output.dense.bias', 'encoder.layer.

Epoch 1/50 | Loss: 0.6575 | Val Acc: 0.5015
Epoch 2/50 | Loss: 0.6508 | Val Acc: 0.5015
Epoch 3/50 | Loss: 0.6483 | Val Acc: 0.5015
Epoch 4/50 | Loss: 0.6503 | Val Acc: 0.5015
Epoch 5/50 | Loss: 0.6493 | Val Acc: 0.5015
Epoch 6/50 | Loss: 0.6479 | Val Acc: 0.5015
Epoch 7/50 | Loss: 0.6511 | Val Acc: 0.5015
Epoch 8/50 | Loss: 0.6478 | Val Acc: 0.5015
Epoch 9/50 | Loss: 0.6502 | Val Acc: 0.5015
Epoch 10/50 | Loss: 0.6471 | Val Acc: 0.5015


[I 2025-06-17 04:14:56,240] Trial 0 finished with value: 0.5014577259475219 and parameters: {'learning_rate': 5.6115164153345e-05, 'weight_decay': 0.006351221010640704, 'dropout_rate': 0.39279757672456206, 'unfreeze_schedule': [15, 30, 45]}. Best is trial 0 with value: 0.5014577259475219.


Epoch 11/50 | Loss: 0.6482 | Val Acc: 0.5015
Early stopping triggered


You are using a model of type deit to instantiate a model of type vit. This is not supported for all configurations of models and can yield errors.
Some weights of ViTModel were not initialized from the model checkpoint at facebook/deit-tiny-distilled-patch16-224 and are newly initialized: ['encoder.layer.9.attention.attention.value.bias', 'encoder.layer.4.layernorm_before.bias', 'encoder.layer.7.attention.output.dense.weight', 'layernorm.weight', 'encoder.layer.5.intermediate.dense.bias', 'encoder.layer.2.output.dense.weight', 'encoder.layer.6.output.dense.bias', 'encoder.layer.9.intermediate.dense.bias', 'encoder.layer.0.attention.attention.query.weight', 'encoder.layer.8.attention.attention.query.weight', 'encoder.layer.2.layernorm_after.weight', 'encoder.layer.2.attention.attention.value.weight', 'encoder.layer.0.attention.attention.value.bias', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.6.attention.attention.query.bias', 'encoder.layer.9.attention.output.dense.weigh

Epoch 1/50 | Loss: 0.6890 | Val Acc: 0.5015
Epoch 2/50 | Loss: 0.6671 | Val Acc: 0.5015
Epoch 3/50 | Loss: 0.6586 | Val Acc: 0.5015
Epoch 4/50 | Loss: 0.6538 | Val Acc: 0.5015
Epoch 5/50 | Loss: 0.6507 | Val Acc: 0.5015
Epoch 6/50 | Loss: 0.6488 | Val Acc: 0.5015
Epoch 7/50 | Loss: 0.6490 | Val Acc: 0.5015
Epoch 8/50 | Loss: 0.6492 | Val Acc: 0.5015
Epoch 9/50 | Loss: 0.6496 | Val Acc: 0.5015
Epoch 10/50 | Loss: 0.6519 | Val Acc: 0.5015


[I 2025-06-17 05:01:21,899] Trial 1 finished with value: 0.5014577259475219 and parameters: {'learning_rate': 1.3066739238053272e-05, 'weight_decay': 0.0029154431891537554, 'dropout_rate': 0.34044600469728353, 'unfreeze_schedule': [25, 50, 75]}. Best is trial 0 with value: 0.5014577259475219.
You are using a model of type deit to instantiate a model of type vit. This is not supported for all configurations of models and can yield errors.


Epoch 11/50 | Loss: 0.6496 | Val Acc: 0.5015
Early stopping triggered


Some weights of ViTModel were not initialized from the model checkpoint at facebook/deit-tiny-distilled-patch16-224 and are newly initialized: ['encoder.layer.9.attention.attention.value.bias', 'encoder.layer.4.layernorm_before.bias', 'encoder.layer.7.attention.output.dense.weight', 'layernorm.weight', 'encoder.layer.5.intermediate.dense.bias', 'encoder.layer.2.output.dense.weight', 'encoder.layer.6.output.dense.bias', 'encoder.layer.9.intermediate.dense.bias', 'encoder.layer.0.attention.attention.query.weight', 'encoder.layer.8.attention.attention.query.weight', 'encoder.layer.2.layernorm_after.weight', 'encoder.layer.2.attention.attention.value.weight', 'encoder.layer.0.attention.attention.value.bias', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.6.attention.attention.query.bias', 'encoder.layer.9.attention.output.dense.weight', 'encoder.layer.4.output.dense.bias', 'encoder.layer.10.layernorm_before.weight', 'encoder.layer.11.attention.output.dense.bias', 'encoder.layer.

Epoch 1/50 | Loss: 0.6532 | Val Acc: 0.5015
Epoch 2/50 | Loss: 0.6496 | Val Acc: 0.5015
Epoch 3/50 | Loss: 0.6489 | Val Acc: 0.5015
Epoch 4/50 | Loss: 0.6475 | Val Acc: 0.5015
Epoch 5/50 | Loss: 0.6476 | Val Acc: 0.5015
Epoch 6/50 | Loss: 0.6465 | Val Acc: 0.5015
Epoch 7/50 | Loss: 0.6445 | Val Acc: 0.5015
Epoch 8/50 | Loss: 0.6437 | Val Acc: 0.5015
Epoch 9/50 | Loss: 0.6429 | Val Acc: 0.5015
Epoch 10/50 | Loss: 0.6434 | Val Acc: 0.5015


[I 2025-06-17 05:47:34,308] Trial 2 finished with value: 0.5014577259475219 and parameters: {'learning_rate': 0.000462258900102083, 'weight_decay': 7.068974950624607e-06, 'dropout_rate': 0.17272998688284025, 'unfreeze_schedule': [25, 50, 75]}. Best is trial 0 with value: 0.5014577259475219.
You are using a model of type deit to instantiate a model of type vit. This is not supported for all configurations of models and can yield errors.


Epoch 11/50 | Loss: 0.6418 | Val Acc: 0.5015
Early stopping triggered


Some weights of ViTModel were not initialized from the model checkpoint at facebook/deit-tiny-distilled-patch16-224 and are newly initialized: ['encoder.layer.9.attention.attention.value.bias', 'encoder.layer.4.layernorm_before.bias', 'encoder.layer.7.attention.output.dense.weight', 'layernorm.weight', 'encoder.layer.5.intermediate.dense.bias', 'encoder.layer.2.output.dense.weight', 'encoder.layer.6.output.dense.bias', 'encoder.layer.9.intermediate.dense.bias', 'encoder.layer.0.attention.attention.query.weight', 'encoder.layer.8.attention.attention.query.weight', 'encoder.layer.2.layernorm_after.weight', 'encoder.layer.2.attention.attention.value.weight', 'encoder.layer.0.attention.attention.value.bias', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.6.attention.attention.query.bias', 'encoder.layer.9.attention.output.dense.weight', 'encoder.layer.4.output.dense.bias', 'encoder.layer.10.layernorm_before.weight', 'encoder.layer.11.attention.output.dense.bias', 'encoder.layer.

Epoch 1/50 | Loss: 0.6665 | Val Acc: 0.5015
Epoch 2/50 | Loss: 0.6530 | Val Acc: 0.5015
Epoch 3/50 | Loss: 0.6478 | Val Acc: 0.5015
Epoch 4/50 | Loss: 0.6498 | Val Acc: 0.5015
Epoch 5/50 | Loss: 0.6468 | Val Acc: 0.5015


[W 2025-06-17 06:09:37,400] Trial 3 failed with parameters: {'learning_rate': 7.309539835912905e-05, 'weight_decay': 1.461896279370496e-05, 'dropout_rate': 0.34474115788895177, 'unfreeze_schedule': [25, 50, 75]} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/optuna/study/_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "<ipython-input-1-3088107041>", line 278, in <lambda>
    study.optimize(lambda trial: objective(trial, train_loader, val_loader, device), n_trials=n_trials)
                                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "<ipython-input-1-3088107041>", line 273, in objective
    best_val_acc = train_with_progressive_unfreezing(model, train_loader, val_loader, config, device)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "<ipython-

Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-1-3088107041>", line 334, in <cell line: 0>
    best_params = run_hyperparameter_search(train_loader, val_loader, device, n_trials=10)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "<ipython-input-1-3088107041>", line 278, in run_hyperparameter_search
    study.optimize(lambda trial: objective(trial, train_loader, val_loader, device), n_trials=n_trials)
  File "/usr/local/lib/python3.11/dist-packages/optuna/study/study.py", line 489, in optimize
    _optimize(
  File "/usr/local/lib/python3.11/dist-packages/optuna/study/_optimize.py", line 64, in _optimize
    _optimize_sequential(
  File "/usr/local/lib/python3.11/dist-packages/optuna/study/_optimize.py", line 161, in _optimize_sequential
    frozen_trial = _run_trial(s

TypeError: object of type 'NoneType' has no len()

In [None]:
# Step 1: Uninstall all conflicting packages
!pip uninstall -y torch torchvision torchaudio albumentations opencv-python facenet-pytorch

# Step 2: Install PyTorch 2.2.0+ stack (CUDA 12.1)
!pip install torch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0 --index-url https://download.pytorch.org/whl/cu121 --force-reinstall

# Step 3: Install vision/audio dependencies
!pip install albumentations==1.3.1 opencv-python==4.8.0.76 facenet-pytorch==2.5.3

# Step 4: Install sklearn and matplotlib
!pip install scikit-learn==1.3.2 matplotlib==3.7.1

# Step 5: Install Hugging Face Transformers and dependencies
!pip install transformers==4.37.0 huggingface_hub==0.23.0 accelerate==0.31.0

# Step 6: Fix OpenCV system dependencies
!apt-get update -qq && apt-get install -y libgl1-mesa-glx


Found existing installation: torch 2.1.0+cu121
Uninstalling torch-2.1.0+cu121:
  Successfully uninstalled torch-2.1.0+cu121
Found existing installation: torchvision 0.16.0+cu121
Uninstalling torchvision-0.16.0+cu121:
  Successfully uninstalled torchvision-0.16.0+cu121
Found existing installation: torchaudio 2.1.0+cu121
Uninstalling torchaudio-2.1.0+cu121:
  Successfully uninstalled torchaudio-2.1.0+cu121
Found existing installation: albumentations 1.3.1
Uninstalling albumentations-1.3.1:
  Successfully uninstalled albumentations-1.3.1
Found existing installation: opencv-python 4.8.0.76
Uninstalling opencv-python-4.8.0.76:
  Successfully uninstalled opencv-python-4.8.0.76
Found existing installation: facenet-pytorch 2.5.3
Uninstalling facenet-pytorch-2.5.3:
  Successfully uninstalled facenet-pytorch-2.5.3
Looking in indexes: https://download.pytorch.org/whl/cu121
Collecting torch==2.2.0
  Downloading https://download.pytorch.org/whl/cu121/torch-2.2.0%2Bcu121-cp311-cp311-linux_x86_64.whl

Collecting albumentations==1.3.1
  Using cached albumentations-1.3.1-py3-none-any.whl.metadata (34 kB)
Collecting opencv-python==4.8.0.76
  Using cached opencv_python-4.8.0.76-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting facenet-pytorch==2.5.3
  Using cached facenet_pytorch-2.5.3-py3-none-any.whl.metadata (13 kB)
Collecting numpy>=1.11.1 (from albumentations==1.3.1)
  Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Using cached albumentations-1.3.1-py3-none-any.whl (125 kB)
Using cached opencv_python-4.8.0.76-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (61.7 MB)
Using cached facenet_pytorch-2.5.3-py3-none-any.whl (1.9 MB)
Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
Installing collected packages: numpy, opencv-python, facenet-pytorch, albumentations
  Attempting uninstall: numpy
    Found existing installation: numpy 2.1.2
    Unin

Collecting transformers==4.37.0
  Downloading transformers-4.37.0-py3-none-any.whl.metadata (129 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.4/129.4 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface_hub==0.23.0
  Downloading huggingface_hub-0.23.0-py3-none-any.whl.metadata (12 kB)
Collecting accelerate==0.31.0
  Downloading accelerate-0.31.0-py3-none-any.whl.metadata (19 kB)
Collecting tokenizers<0.19,>=0.14 (from transformers==4.37.0)
  Downloading tokenizers-0.15.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.37.0-py3-none-any.whl (8.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m71.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading huggingface_hub-0.23.0-py3-none-any.whl (401 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m401.2/401.2 kB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading accelerate-0.31.0-

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
libgl1-mesa-glx is already the newest version (23.0.4-0ubuntu1~22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 46 not upgraded.


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, Subset
import cv2
import os
import numpy as np
from tqdm import tqdm
import albumentations as A
from albumentations.pytorch import ToTensorV2
from sklearn.model_selection import StratifiedKFold
from transformers import ViTModel, ViTConfig

# Configuration
FRAME_SIZE = 224
CLIP_LENGTH = 4
BATCH_SIZE = 4  # Lowered for RAM efficiency
EPOCHS = 5     # Use more if you have time/resources
NUM_FOLDS = 5
SEED = 42

class VideoTransform:
    def __init__(self, train=True):
        self.spatial_transform = A.Compose([
            A.RandomResizedCrop(FRAME_SIZE, FRAME_SIZE, scale=(0.2, 1.0)),
            A.HorizontalFlip(p=0.5),
            A.ColorJitter(brightness=0.3, contrast=0.3, p=0.7),
            A.GaussianBlur(blur_limit=(3, 5), p=0.4),
            A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
            ToTensorV2()
        ]) if train else A.Compose([
            A.Resize(FRAME_SIZE, FRAME_SIZE),
            A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
            ToTensorV2()
        ])
    def __call__(self, frames):
        transformed = []
        for frame in frames:
            transformed.append(self.spatial_transform(image=frame)['image'])
        return torch.stack(transformed)

class VideoDeepfakeDataset(Dataset):
    def __init__(self, base_path, real_dir="videos_real", fake_dir="videos_fake", transform=None):
        self.real_path = os.path.join(base_path, real_dir)
        self.fake_path = os.path.join(base_path, fake_dir)
        self.file_paths = []
        self.labels = []
        for f in os.listdir(self.real_path):
            if f.endswith(".mp4"):
                self.file_paths.append(os.path.join(self.real_path, f))
                self.labels.append(1)
        for f in os.listdir(self.fake_path):
            if f.endswith(".mp4"):
                self.file_paths.append(os.path.join(self.fake_path, f))
                self.labels.append(0)
        self.transform = transform

    def __len__(self):
        return len(self.file_paths)
    def __getitem__(self, idx):
        path = self.file_paths[idx]
        label = self.labels[idx]
        cap = cv2.VideoCapture(path)
        frames = []
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frames.append(frame)
            if len(frames) >= CLIP_LENGTH * 2:
                break
        cap.release()
        clip = self._sample_clip(frames, CLIP_LENGTH)
        if self.transform:
            clip = self.transform(clip)
        return clip, torch.tensor(label)
    def _sample_clip(self, frames, clip_length):
        if len(frames) < clip_length:
            frames += [frames[-1]] * (clip_length - len(frames))
        start_idx = np.random.randint(0, max(1, len(frames) - clip_length))
        return frames[start_idx : start_idx + clip_length]

class MultiScaleLocalAttention(nn.Module):
    def __init__(self, in_channels=3, base_channels=32):
        super().__init__()
        self.branch1 = nn.Sequential(
            nn.Conv2d(in_channels, base_channels, 3, padding=1),
            nn.BatchNorm2d(base_channels),
            nn.ReLU()
        )
        self.branch2 = nn.Sequential(
            nn.Conv2d(in_channels, base_channels, 5, padding=2),
            nn.BatchNorm2d(base_channels),
            nn.ReLU()
        )
        self.branch3 = nn.Sequential(
            nn.Conv2d(in_channels, base_channels, 7, padding=3),
            nn.BatchNorm2d(base_channels),
            nn.ReLU()
        )
        self.fuse = nn.Conv2d(base_channels * 3, base_channels, 1)
        self.pool = nn.AdaptiveAvgPool2d((1, 1))
    def forward(self, x):
        x1 = self.branch1(x)
        x2 = self.branch2(x)
        x3 = self.branch3(x)
        x_cat = torch.cat([x1, x2, x3], dim=1)
        x_fused = self.fuse(x_cat)
        pooled = self.pool(x_fused).flatten(1)
        return pooled

class CrossAttentionFusion(nn.Module):
    def __init__(self, global_dim, local_dim, out_dim):
        super().__init__()
        self.query_proj = nn.Linear(global_dim, out_dim)
        self.key_proj = nn.Linear(local_dim, out_dim)
        self.value_proj = nn.Linear(local_dim, out_dim)
        self.softmax = nn.Softmax(dim=-1)
    def forward(self, global_feat, local_feat):
        Q = self.query_proj(global_feat).unsqueeze(1)
        K = self.key_proj(local_feat).unsqueeze(1)
        V = self.value_proj(local_feat).unsqueeze(1)
        attn_weights = self.softmax(torch.bmm(Q, K.transpose(1, 2)) / np.sqrt(Q.shape[-1]))
        fused = torch.bmm(attn_weights, V).squeeze(1)
        return fused

class HybridFrameFeature(nn.Module):
    def __init__(self, vit_ckpt="WinKawaks/vit-tiny-patch16-224", local_dim=32, fusion_dim=64):
        super().__init__()
        config = ViTConfig.from_pretrained(vit_ckpt)
        self.vit = ViTModel.from_pretrained(vit_ckpt, config=config)
        self.local_attn = MultiScaleLocalAttention(in_channels=3, base_channels=local_dim)
        self.cross_attn = CrossAttentionFusion(global_dim=config.hidden_size, local_dim=local_dim, out_dim=fusion_dim)
        self.out_dim = config.hidden_size + fusion_dim
    def forward(self, x):
        global_feat = self.vit(x).last_hidden_state[:, 0]
        local_feat = self.local_attn(x)
        fused = self.cross_attn(global_feat, local_feat)
        return torch.cat([global_feat, fused], dim=1)

class VideoDeepfakeClassifier(nn.Module):
    def __init__(self, frame_model, frame_feat_dim, num_classes=2):
        super().__init__()
        self.frame_model = frame_model
        self.temporal_attn = nn.MultiheadAttention(
            embed_dim=frame_feat_dim,
            num_heads=4,
            dropout=0.1,
            batch_first=True
        )
        self.classifier = nn.Sequential(
            nn.Linear(frame_feat_dim, 128),
            nn.ReLU(),
            nn.Linear(128, num_classes)
        )
    def forward(self, x):
        B, T = x.shape[:2]
        x = x.view(B * T, *x.shape[2:])
        frame_feats = self.frame_model(x)
        frame_feats = frame_feats.view(B, T, -1)
        attn_out, _ = self.temporal_attn(frame_feats, frame_feats, frame_feats)
        pooled = attn_out.mean(dim=1)
        return self.classifier(pooled)

import random

def set_seed(seed):
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(SEED)

# Prepare dataset and labels for splitting
base_path = "/content/drive/MyDrive/SDFVD Small-scale Deepfake Forgery Video Dataset/SDFVD"
dataset = VideoDeepfakeDataset(base_path, transform=VideoTransform(train=True))
labels = np.array(dataset.labels)

skf = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=SEED)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

for fold, (train_idx, val_idx) in enumerate(skf.split(np.zeros(len(labels)), labels)):
    print(f"\n--- Fold {fold+1}/{NUM_FOLDS} ---")
    train_loader = DataLoader(
        Subset(dataset, train_idx),
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=2,
        pin_memory=True
    )
    val_loader = DataLoader(
        Subset(dataset, val_idx),
        batch_size=BATCH_SIZE,
        num_workers=2,
        pin_memory=True
    )
    frame_model = HybridFrameFeature()
    frame_feat_dim = frame_model.out_dim
    model = VideoDeepfakeClassifier(frame_model, frame_feat_dim).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
    criterion = nn.CrossEntropyLoss()
    best_acc = 0

    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0
        correct = 0
        total = 0
        pbar = tqdm(train_loader, desc=f"Fold {fold+1} Epoch {epoch+1}")
        for clips, labels in pbar:
            clips = clips.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            outputs = model(clips)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            pbar.set_postfix({'Loss': f"{loss.item():.4f}", 'Acc': f"{100*correct/total:.2f}%"})
        # Validation
        model.eval()
        val_correct = 0
        val_total = 0
        with torch.no_grad():
            for clips, labels in val_loader:
                clips = clips.to(device)
                labels = labels.to(device)
                outputs = model(clips)
                _, predicted = torch.max(outputs, 1)
                val_total += labels.size(0)
                val_correct += (predicted == labels).sum().item()
        val_acc = val_correct / val_total
        print(f"Fold {fold+1} Epoch {epoch+1} Val Acc: {100*val_acc:.2f}%")
        if val_acc > best_acc:
            best_acc = val_acc
            torch.save(model.state_dict(), f"best_video_model_hybrid_fold{fold+1}.pth")
            print("Saved new best model for this fold")
    print(f"Best Val Accuracy for Fold {fold+1}: {100*best_acc:.2f}%")




--- Fold 1/5 ---


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of ViTModel were not initialized from the model checkpoint at WinKawaks/vit-tiny-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 21/21 [02:28<00:00,  7.08s/it, Loss=0.6164, Acc=40.48%]


Fold 1 Epoch 1 Val Acc: 54.55%
Saved new best model for this fold


Fold 1 Epoch 2: 100%|██████████| 21/21 [02:34<00:00,  7.34s/it, Loss=0.7177, Acc=47.62%]


Fold 1 Epoch 2 Val Acc: 50.00%


Fold 1 Epoch 3: 100%|██████████| 21/21 [02:37<00:00,  7.50s/it, Loss=0.7064, Acc=41.67%]


Fold 1 Epoch 3 Val Acc: 50.00%


Fold 1 Epoch 4: 100%|██████████| 21/21 [02:24<00:00,  6.88s/it, Loss=0.7331, Acc=50.00%]


Fold 1 Epoch 4 Val Acc: 50.00%


Fold 1 Epoch 5: 100%|██████████| 21/21 [02:24<00:00,  6.87s/it, Loss=0.7240, Acc=47.62%]


Fold 1 Epoch 5 Val Acc: 50.00%
Best Val Accuracy for Fold 1: 54.55%

--- Fold 2/5 ---


Some weights of ViTModel were not initialized from the model checkpoint at WinKawaks/vit-tiny-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 2 Epoch 1: 100%|██████████| 22/22 [02:25<00:00,  6.64s/it, Loss=0.7107, Acc=48.24%]


Fold 2 Epoch 1 Val Acc: 47.62%
Saved new best model for this fold


Fold 2 Epoch 2: 100%|██████████| 22/22 [02:37<00:00,  7.17s/it, Loss=0.7249, Acc=51.76%]


Fold 2 Epoch 2 Val Acc: 52.38%
Saved new best model for this fold


Fold 2 Epoch 3: 100%|██████████| 22/22 [02:26<00:00,  6.64s/it, Loss=0.7111, Acc=50.59%]


Fold 2 Epoch 3 Val Acc: 52.38%


Fold 2 Epoch 4: 100%|██████████| 22/22 [02:26<00:00,  6.66s/it, Loss=0.8868, Acc=44.71%]


Fold 2 Epoch 4 Val Acc: 52.38%


Fold 2 Epoch 5: 100%|██████████| 22/22 [02:37<00:00,  7.15s/it, Loss=0.5749, Acc=47.06%]


Fold 2 Epoch 5 Val Acc: 47.62%
Best Val Accuracy for Fold 2: 52.38%

--- Fold 3/5 ---


Some weights of ViTModel were not initialized from the model checkpoint at WinKawaks/vit-tiny-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 3 Epoch 1: 100%|██████████| 22/22 [02:24<00:00,  6.58s/it, Loss=1.1449, Acc=38.82%]


Fold 3 Epoch 1 Val Acc: 52.38%
Saved new best model for this fold


Fold 3 Epoch 2: 100%|██████████| 22/22 [02:25<00:00,  6.61s/it, Loss=1.1105, Acc=51.76%]


Fold 3 Epoch 2 Val Acc: 47.62%


Fold 3 Epoch 3: 100%|██████████| 22/22 [02:26<00:00,  6.64s/it, Loss=0.7879, Acc=44.71%]


Fold 3 Epoch 3 Val Acc: 38.10%


Fold 3 Epoch 4: 100%|██████████| 22/22 [02:37<00:00,  7.16s/it, Loss=0.8670, Acc=47.06%]


Fold 3 Epoch 4 Val Acc: 33.33%


Fold 3 Epoch 5: 100%|██████████| 22/22 [02:24<00:00,  6.57s/it, Loss=1.1281, Acc=51.76%]


Fold 3 Epoch 5 Val Acc: 47.62%
Best Val Accuracy for Fold 3: 52.38%

--- Fold 4/5 ---


Some weights of ViTModel were not initialized from the model checkpoint at WinKawaks/vit-tiny-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 4 Epoch 1: 100%|██████████| 22/22 [02:25<00:00,  6.60s/it, Loss=0.7759, Acc=54.12%]


Fold 4 Epoch 1 Val Acc: 57.14%
Saved new best model for this fold


Fold 4 Epoch 2: 100%|██████████| 22/22 [02:24<00:00,  6.57s/it, Loss=0.2796, Acc=57.65%]


Fold 4 Epoch 2 Val Acc: 47.62%


Fold 4 Epoch 3: 100%|██████████| 22/22 [02:35<00:00,  7.06s/it, Loss=0.5021, Acc=56.47%]


Fold 4 Epoch 3 Val Acc: 47.62%


Fold 4 Epoch 4: 100%|██████████| 22/22 [02:23<00:00,  6.54s/it, Loss=0.7700, Acc=45.88%]


Fold 4 Epoch 4 Val Acc: 33.33%


Fold 4 Epoch 5: 100%|██████████| 22/22 [02:25<00:00,  6.61s/it, Loss=0.5962, Acc=51.76%]


Fold 4 Epoch 5 Val Acc: 47.62%
Best Val Accuracy for Fold 4: 57.14%

--- Fold 5/5 ---


Some weights of ViTModel were not initialized from the model checkpoint at WinKawaks/vit-tiny-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 5 Epoch 1: 100%|██████████| 22/22 [02:26<00:00,  6.66s/it, Loss=0.7490, Acc=44.71%]


Fold 5 Epoch 1 Val Acc: 47.62%
Saved new best model for this fold


Fold 5 Epoch 2: 100%|██████████| 22/22 [02:39<00:00,  7.26s/it, Loss=0.5125, Acc=37.65%]


Fold 5 Epoch 2 Val Acc: 47.62%


Fold 5 Epoch 3: 100%|██████████| 22/22 [02:27<00:00,  6.69s/it, Loss=0.5943, Acc=51.76%]


Fold 5 Epoch 3 Val Acc: 33.33%


Fold 5 Epoch 4: 100%|██████████| 22/22 [02:27<00:00,  6.68s/it, Loss=0.3665, Acc=47.06%]


Fold 5 Epoch 4 Val Acc: 47.62%


Fold 5 Epoch 5: 100%|██████████| 22/22 [02:39<00:00,  7.23s/it, Loss=0.7299, Acc=43.53%]


Fold 5 Epoch 5 Val Acc: 61.90%
Saved new best model for this fold
Best Val Accuracy for Fold 5: 61.90%


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, Subset
import cv2
import os
import numpy as np
from tqdm import tqdm
import albumentations as A
from albumentations.pytorch import ToTensorV2
from sklearn.model_selection import StratifiedKFold
from transformers import ViTModel, ViTConfig

# Configuration
FRAME_SIZE = 224
CLIP_LENGTH = 4
BATCH_SIZE = 8  # Lowered for RAM efficiency
EPOCHS = 5     # Use more if you have time/resources
NUM_FOLDS = 5
SEED = 42

class VideoTransform:
    def __init__(self, train=True):
        self.spatial_transform = A.Compose([
            A.RandomResizedCrop(FRAME_SIZE, FRAME_SIZE, scale=(0.2, 1.0)),
            A.HorizontalFlip(p=0.5),
            A.ColorJitter(brightness=0.3, contrast=0.3, p=0.7),
            A.GaussianBlur(blur_limit=(3, 5), p=0.4),
            A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
            ToTensorV2()
        ]) if train else A.Compose([
            A.Resize(FRAME_SIZE, FRAME_SIZE),
            A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
            ToTensorV2()
        ])
    def __call__(self, frames):
        transformed = []
        for frame in frames:
            transformed.append(self.spatial_transform(image=frame)['image'])
        return torch.stack(transformed)

class VideoDeepfakeDataset(Dataset):
    def __init__(self, base_path, real_dir="videos_real", fake_dir="videos_fake", transform=None):
        self.real_path = os.path.join(base_path, real_dir)
        self.fake_path = os.path.join(base_path, fake_dir)
        self.file_paths = []
        self.labels = []
        for f in os.listdir(self.real_path):
            if f.endswith(".mp4"):
                self.file_paths.append(os.path.join(self.real_path, f))
                self.labels.append(1)
        for f in os.listdir(self.fake_path):
            if f.endswith(".mp4"):
                self.file_paths.append(os.path.join(self.fake_path, f))
                self.labels.append(0)
        self.transform = transform

    def __len__(self):
        return len(self.file_paths)
    def __getitem__(self, idx):
        path = self.file_paths[idx]
        label = self.labels[idx]
        cap = cv2.VideoCapture(path)
        frames = []
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frames.append(frame)
            if len(frames) >= CLIP_LENGTH * 2:
                break
        cap.release()
        clip = self._sample_clip(frames, CLIP_LENGTH)
        if self.transform:
            clip = self.transform(clip)
        return clip, torch.tensor(label)
    def _sample_clip(self, frames, clip_length):
        if len(frames) < clip_length:
            frames += [frames[-1]] * (clip_length - len(frames))
        start_idx = np.random.randint(0, max(1, len(frames) - clip_length))
        return frames[start_idx : start_idx + clip_length]

class MultiScaleLocalAttention(nn.Module):
    def __init__(self, in_channels=3, base_channels=32):
        super().__init__()
        self.branch1 = nn.Sequential(
            nn.Conv2d(in_channels, base_channels, 3, padding=1),
            nn.BatchNorm2d(base_channels),
            nn.ReLU()
        )
        self.branch2 = nn.Sequential(
            nn.Conv2d(in_channels, base_channels, 5, padding=2),
            nn.BatchNorm2d(base_channels),
            nn.ReLU()
        )
        self.branch3 = nn.Sequential(
            nn.Conv2d(in_channels, base_channels, 7, padding=3),
            nn.BatchNorm2d(base_channels),
            nn.ReLU()
        )
        self.fuse = nn.Conv2d(base_channels * 3, base_channels, 1)
        self.pool = nn.AdaptiveAvgPool2d((1, 1))
    def forward(self, x):
        x1 = self.branch1(x)
        x2 = self.branch2(x)
        x3 = self.branch3(x)
        x_cat = torch.cat([x1, x2, x3], dim=1)
        x_fused = self.fuse(x_cat)
        pooled = self.pool(x_fused).flatten(1)
        return pooled

class CrossAttentionFusion(nn.Module):
    def __init__(self, global_dim, local_dim, out_dim):
        super().__init__()
        self.query_proj = nn.Linear(global_dim, out_dim)
        self.key_proj = nn.Linear(local_dim, out_dim)
        self.value_proj = nn.Linear(local_dim, out_dim)
        self.softmax = nn.Softmax(dim=-1)
    def forward(self, global_feat, local_feat):
        Q = self.query_proj(global_feat).unsqueeze(1)
        K = self.key_proj(local_feat).unsqueeze(1)
        V = self.value_proj(local_feat).unsqueeze(1)
        attn_weights = self.softmax(torch.bmm(Q, K.transpose(1, 2)) / np.sqrt(Q.shape[-1]))
        fused = torch.bmm(attn_weights, V).squeeze(1)
        return fused

class HybridFrameFeature(nn.Module):
    def __init__(self, vit_ckpt="WinKawaks/vit-tiny-patch16-224", local_dim=32, fusion_dim=64):
        super().__init__()
        config = ViTConfig.from_pretrained(vit_ckpt)
        self.vit = ViTModel.from_pretrained(vit_ckpt, config=config)
        self.local_attn = MultiScaleLocalAttention(in_channels=3, base_channels=local_dim)
        self.cross_attn = CrossAttentionFusion(global_dim=config.hidden_size, local_dim=local_dim, out_dim=fusion_dim)
        self.out_dim = config.hidden_size + fusion_dim
    def forward(self, x):
        global_feat = self.vit(x).last_hidden_state[:, 0]
        local_feat = self.local_attn(x)
        fused = self.cross_attn(global_feat, local_feat)
        return torch.cat([global_feat, fused], dim=1)

class VideoDeepfakeClassifier(nn.Module):
    def __init__(self, frame_model, frame_feat_dim, num_classes=2):
        super().__init__()
        self.frame_model = frame_model
        self.temporal_attn = nn.MultiheadAttention(
            embed_dim=frame_feat_dim,
            num_heads=4,
            dropout=0.1,
            batch_first=True
        )
        self.classifier = nn.Sequential(
            nn.Linear(frame_feat_dim, 128),
            nn.ReLU(),
            nn.Linear(128, num_classes)
        )
    def forward(self, x):
        B, T = x.shape[:2]
        x = x.view(B * T, *x.shape[2:])
        frame_feats = self.frame_model(x)
        frame_feats = frame_feats.view(B, T, -1)
        attn_out, _ = self.temporal_attn(frame_feats, frame_feats, frame_feats)
        pooled = attn_out.mean(dim=1)
        return self.classifier(pooled)

import random

def set_seed(seed):
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(SEED)

# Prepare dataset and labels for splitting
base_path = "/content/drive/MyDrive/SDFVD Small-scale Deepfake Forgery Video Dataset/SDFVD"
dataset = VideoDeepfakeDataset(base_path, transform=VideoTransform(train=True))
labels = np.array(dataset.labels)

skf = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=SEED)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

for fold, (train_idx, val_idx) in enumerate(skf.split(np.zeros(len(labels)), labels)):
    print(f"\n--- Fold {fold+1}/{NUM_FOLDS} ---")
    train_loader = DataLoader(
        Subset(dataset, train_idx),
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=2,
        pin_memory=True
    )
    val_loader = DataLoader(
        Subset(dataset, val_idx),
        batch_size=BATCH_SIZE,
        num_workers=2,
        pin_memory=True
    )
    frame_model = HybridFrameFeature()
    frame_feat_dim = frame_model.out_dim
    model = VideoDeepfakeClassifier(frame_model, frame_feat_dim).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
    criterion = nn.CrossEntropyLoss()
    best_acc = 0

    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0
        correct = 0
        total = 0
        pbar = tqdm(train_loader, desc=f"Fold {fold+1} Epoch {epoch+1}")
        for clips, labels in pbar:
            clips = clips.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            outputs = model(clips)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            pbar.set_postfix({'Loss': f"{loss.item():.4f}", 'Acc': f"{100*correct/total:.2f}%"})
        # Validation
        model.eval()
        val_correct = 0
        val_total = 0
        with torch.no_grad():
            for clips, labels in val_loader:
                clips = clips.to(device)
                labels = labels.to(device)
                outputs = model(clips)
                _, predicted = torch.max(outputs, 1)
                val_total += labels.size(0)
                val_correct += (predicted == labels).sum().item()
        val_acc = val_correct / val_total
        print(f"Fold {fold+1} Epoch {epoch+1} Val Acc: {100*val_acc:.2f}%")
        if val_acc > best_acc:
            best_acc = val_acc
            torch.save(model.state_dict(), f"best_video_model_hybrid_fold{fold+1}.pth")
            print("Saved new best model for this fold")
    print(f"Best Val Accuracy for Fold {fold+1}: {100*best_acc:.2f}%")




--- Fold 1/5 ---


Some weights of ViTModel were not initialized from the model checkpoint at WinKawaks/vit-tiny-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1:  18%|█▊        | 2/11 [01:21<05:38, 37.65s/it, Loss=0.9953, Acc=56.25%]

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, Subset
import cv2
import os
import numpy as np
from tqdm import tqdm
import albumentations as A
from albumentations.pytorch import ToTensorV2
from sklearn.model_selection import StratifiedKFold, train_test_split
from transformers import ViTModel, ViTConfig
import random

# Configuration
FRAME_SIZE = 224
CLIP_LENGTH = 4
BATCH_SIZE = 4
EPOCHS = 5
NUM_FOLDS = 5
SEED = 42

class VideoTransform:
    def __init__(self, train=True):
        self.spatial_transform = A.Compose([
            A.RandomResizedCrop(FRAME_SIZE, FRAME_SIZE, scale=(0.2, 1.0)),
            A.HorizontalFlip(p=0.5),
            A.ColorJitter(brightness=0.3, contrast=0.3, p=0.7),
            A.GaussianBlur(blur_limit=(3, 5), p=0.4),
            A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
            ToTensorV2()
        ]) if train else A.Compose([
            A.Resize(FRAME_SIZE, FRAME_SIZE),
            A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
            ToTensorV2()
        ])
    def __call__(self, frames):
        transformed = []
        for frame in frames:
            transformed.append(self.spatial_transform(image=frame)['image'])
        return torch.stack(transformed)

class VideoDeepfakeDataset(Dataset):
    def __init__(self, base_path, real_dir="videos_real", fake_dir="videos_fake", transform=None):
        self.real_path = os.path.join(base_path, real_dir)
        self.fake_path = os.path.join(base_path, fake_dir)
        self.file_paths = []
        self.labels = []
        for f in os.listdir(self.real_path):
            if f.endswith(".mp4"):
                self.file_paths.append(os.path.join(self.real_path, f))
                self.labels.append(1)
        for f in os.listdir(self.fake_path):
            if f.endswith(".mp4"):
                self.file_paths.append(os.path.join(self.fake_path, f))
                self.labels.append(0)
        self.transform = transform

    def __len__(self):
        return len(self.file_paths)
    def __getitem__(self, idx):
        path = self.file_paths[idx]
        label = self.labels[idx]
        cap = cv2.VideoCapture(path)
        frames = []
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frames.append(frame)
            if len(frames) >= CLIP_LENGTH * 2:
                break
        cap.release()
        clip = self._sample_clip(frames, CLIP_LENGTH)
        if self.transform:
            clip = self.transform(clip)
        return clip, torch.tensor(label)
    def _sample_clip(self, frames, clip_length):
        if len(frames) < clip_length:
            frames += [frames[-1]] * (clip_length - len(frames))
        start_idx = np.random.randint(0, max(1, len(frames) - clip_length))
        return frames[start_idx : start_idx + clip_length]

class MultiScaleLocalAttention(nn.Module):
    def __init__(self, in_channels=3, base_channels=32):
        super().__init__()
        self.branch1 = nn.Sequential(
            nn.Conv2d(in_channels, base_channels, 3, padding=1),
            nn.BatchNorm2d(base_channels),
            nn.ReLU()
        )
        self.branch2 = nn.Sequential(
            nn.Conv2d(in_channels, base_channels, 5, padding=2),
            nn.BatchNorm2d(base_channels),
            nn.ReLU()
        )
        self.branch3 = nn.Sequential(
            nn.Conv2d(in_channels, base_channels, 7, padding=3),
            nn.BatchNorm2d(base_channels),
            nn.ReLU()
        )
        self.fuse = nn.Conv2d(base_channels * 3, base_channels, 1)
        self.pool = nn.AdaptiveAvgPool2d((1, 1))
    def forward(self, x):
        x1 = self.branch1(x)
        x2 = self.branch2(x)
        x3 = self.branch3(x)
        x_cat = torch.cat([x1, x2, x3], dim=1)
        x_fused = self.fuse(x_cat)
        pooled = self.pool(x_fused).flatten(1)
        return pooled

class CrossAttentionFusion(nn.Module):
    def __init__(self, global_dim, local_dim, out_dim):
        super().__init__()
        self.query_proj = nn.Linear(global_dim, out_dim)
        self.key_proj = nn.Linear(local_dim, out_dim)
        self.value_proj = nn.Linear(local_dim, out_dim)
        self.softmax = nn.Softmax(dim=-1)
    def forward(self, global_feat, local_feat):
        Q = self.query_proj(global_feat).unsqueeze(1)
        K = self.key_proj(local_feat).unsqueeze(1)
        V = self.value_proj(local_feat).unsqueeze(1)
        attn_weights = self.softmax(torch.bmm(Q, K.transpose(1, 2)) / np.sqrt(Q.shape[-1]))
        fused = torch.bmm(attn_weights, V).squeeze(1)
        return fused

class HybridFrameFeature(nn.Module):
    def __init__(self, vit_ckpt="WinKawaks/vit-tiny-patch16-224", local_dim=32, fusion_dim=64):
        super().__init__()
        config = ViTConfig.from_pretrained(vit_ckpt)
        self.vit = ViTModel.from_pretrained(vit_ckpt, config=config)
        self.local_attn = MultiScaleLocalAttention(in_channels=3, base_channels=local_dim)
        self.cross_attn = CrossAttentionFusion(global_dim=config.hidden_size, local_dim=local_dim, out_dim=fusion_dim)
        self.out_dim = config.hidden_size + fusion_dim
    def forward(self, x):
        global_feat = self.vit(x).last_hidden_state[:, 0]
        local_feat = self.local_attn(x)
        fused = self.cross_attn(global_feat, local_feat)
        return torch.cat([global_feat, fused], dim=1)

class VideoDeepfakeClassifier(nn.Module):
    def __init__(self, frame_model, frame_feat_dim, num_classes=2):
        super().__init__()
        self.frame_model = frame_model
        self.temporal_attn = nn.MultiheadAttention(
            embed_dim=frame_feat_dim,
            num_heads=4,
            dropout=0.1,
            batch_first=True
        )
        self.classifier = nn.Sequential(
            nn.Linear(frame_feat_dim, 128),
            nn.ReLU(),
            nn.Linear(128, num_classes)
        )
    def forward(self, x):
        B, T = x.shape[:2]
        x = x.view(B * T, *x.shape[2:])
        frame_feats = self.frame_model(x)
        frame_feats = frame_feats.view(B, T, -1)
        attn_out, _ = self.temporal_attn(frame_feats, frame_feats, frame_feats)
        pooled = attn_out.mean(dim=1)
        return self.classifier(pooled)

def set_seed(seed):
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(SEED)

# Prepare dataset and labels for splitting
base_path = "/content/drive/MyDrive/SDFVD Small-scale Deepfake Forgery Video Dataset/SDFVD"
dataset = VideoDeepfakeDataset(base_path, transform=VideoTransform(train=True))
labels = np.array(dataset.labels)

skf = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=SEED)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

for fold, (train_idx, val_idx) in enumerate(skf.split(np.zeros(len(labels)), labels)):
    print(f"\n--- Fold {fold+1}/{NUM_FOLDS} ---")
    train_loader = DataLoader(
        Subset(dataset, train_idx),
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=2,
        pin_memory=True
    )
    val_loader = DataLoader(
        Subset(dataset, val_idx),
        batch_size=BATCH_SIZE,
        num_workers=2,
        pin_memory=True
    )
    frame_model = HybridFrameFeature()
    frame_feat_dim = frame_model.out_dim
    model = VideoDeepfakeClassifier(frame_model, frame_feat_dim).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
    criterion = nn.CrossEntropyLoss()
    best_acc = 0

    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0
        correct = 0
        total = 0
        pbar = tqdm(train_loader, desc=f"Fold {fold+1} Epoch {epoch+1}")
        for clips, labels in pbar:
            clips = clips.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            outputs = model(clips)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            pbar.set_postfix({'Loss': f"{loss.item():.4f}", 'Acc': f"{100*correct/total:.2f}%"})
        # Validation
        model.eval()
        val_correct = 0
        val_total = 0
        with torch.no_grad():
            for clips, labels in val_loader:
                clips = clips.to(device)
                labels = labels.to(device)
                outputs = model(clips)
                _, predicted = torch.max(outputs, 1)
                val_total += labels.size(0)
                val_correct += (predicted == labels).sum().item()
        val_acc = val_correct / val_total
        print(f"Fold {fold+1} Epoch {epoch+1} Val Acc: {100*val_acc:.2f}%")
        if val_acc > best_acc:
            best_acc = val_acc
            torch.save(model.state_dict(), f"best_video_model_hybrid_fold{fold+1}.pth")
            print("Saved new best model for this fold")
    print(f"Best Val Accuracy for Fold {fold+1}: {100*best_acc:.2f}%")

# --- Small Test Block (for quick evaluation on a few samples) ---
# Select a small test subset (e.g., first 8 videos)
test_indices = list(range(min(8, len(dataset))))
test_subset = Subset(dataset, test_indices)
test_loader = DataLoader(
    test_subset,
    batch_size=1,
    shuffle=False,
    num_workers=0
)

# Load the best model from the last fold (or specify another fold if you wish)
model.load_state_dict(torch.load(f"best_video_model_hybrid_fold{NUM_FOLDS}.pth"))
model.eval()

test_correct = 0
test_total = 0
with torch.no_grad():
    for clips, labels in test_loader:
        clips = clips.to(device)
        labels = labels.to(device)
        outputs = model(clips)
        _, predicted = torch.max(outputs, 1)
        test_total += labels.size(0)
        test_correct += (predicted == labels).sum().item()
        print(f"Predicted: {predicted.item()}, Actual: {labels.item()}")

test_acc = test_correct / test_total if test_total > 0 else 0
print(f"\nTest Accuracy on {test_total} samples: {100*test_acc:.2f}%")



--- Fold 1/5 ---


Some weights of ViTModel were not initialized from the model checkpoint at WinKawaks/vit-tiny-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 21/21 [02:33<00:00,  7.29s/it, Loss=0.6164, Acc=40.48%]


Fold 1 Epoch 1 Val Acc: 54.55%
Saved new best model for this fold


Fold 1 Epoch 2: 100%|██████████| 21/21 [02:32<00:00,  7.24s/it, Loss=0.7177, Acc=47.62%]


Fold 1 Epoch 2 Val Acc: 50.00%


Fold 1 Epoch 3: 100%|██████████| 21/21 [02:33<00:00,  7.32s/it, Loss=0.7064, Acc=41.67%]


Fold 1 Epoch 3 Val Acc: 50.00%


Fold 1 Epoch 4: 100%|██████████| 21/21 [02:30<00:00,  7.16s/it, Loss=0.7331, Acc=50.00%]


Fold 1 Epoch 4 Val Acc: 50.00%


Fold 1 Epoch 5: 100%|██████████| 21/21 [02:25<00:00,  6.91s/it, Loss=0.7240, Acc=47.62%]


Fold 1 Epoch 5 Val Acc: 50.00%
Best Val Accuracy for Fold 1: 54.55%

--- Fold 2/5 ---


Some weights of ViTModel were not initialized from the model checkpoint at WinKawaks/vit-tiny-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 2 Epoch 1: 100%|██████████| 22/22 [02:27<00:00,  6.69s/it, Loss=0.7107, Acc=48.24%]


Fold 2 Epoch 1 Val Acc: 47.62%
Saved new best model for this fold


Fold 2 Epoch 2: 100%|██████████| 22/22 [02:25<00:00,  6.63s/it, Loss=0.7249, Acc=51.76%]


Fold 2 Epoch 2 Val Acc: 52.38%
Saved new best model for this fold


Fold 2 Epoch 3: 100%|██████████| 22/22 [02:26<00:00,  6.66s/it, Loss=0.7111, Acc=50.59%]


Fold 2 Epoch 3 Val Acc: 52.38%


Fold 2 Epoch 4: 100%|██████████| 22/22 [02:25<00:00,  6.61s/it, Loss=0.8868, Acc=44.71%]


Fold 2 Epoch 4 Val Acc: 52.38%


Fold 2 Epoch 5: 100%|██████████| 22/22 [02:28<00:00,  6.73s/it, Loss=0.5749, Acc=47.06%]


Fold 2 Epoch 5 Val Acc: 47.62%
Best Val Accuracy for Fold 2: 52.38%

--- Fold 3/5 ---


Some weights of ViTModel were not initialized from the model checkpoint at WinKawaks/vit-tiny-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 3 Epoch 1: 100%|██████████| 22/22 [02:27<00:00,  6.71s/it, Loss=1.1449, Acc=38.82%]


Fold 3 Epoch 1 Val Acc: 52.38%
Saved new best model for this fold


Fold 3 Epoch 2: 100%|██████████| 22/22 [02:29<00:00,  6.80s/it, Loss=1.1105, Acc=51.76%]


Fold 3 Epoch 2 Val Acc: 47.62%


Fold 3 Epoch 3: 100%|██████████| 22/22 [02:51<00:00,  7.79s/it, Loss=0.7879, Acc=44.71%]


Fold 3 Epoch 3 Val Acc: 38.10%


Fold 3 Epoch 4: 100%|██████████| 22/22 [02:40<00:00,  7.29s/it, Loss=0.8670, Acc=47.06%]


Fold 3 Epoch 4 Val Acc: 33.33%


Fold 3 Epoch 5: 100%|██████████| 22/22 [02:29<00:00,  6.81s/it, Loss=1.1281, Acc=51.76%]


Fold 3 Epoch 5 Val Acc: 47.62%
Best Val Accuracy for Fold 3: 52.38%

--- Fold 4/5 ---


Some weights of ViTModel were not initialized from the model checkpoint at WinKawaks/vit-tiny-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 4 Epoch 1: 100%|██████████| 22/22 [02:27<00:00,  6.71s/it, Loss=0.7759, Acc=54.12%]


Fold 4 Epoch 1 Val Acc: 57.14%
Saved new best model for this fold


Fold 4 Epoch 2: 100%|██████████| 22/22 [02:26<00:00,  6.64s/it, Loss=0.2796, Acc=57.65%]


Fold 4 Epoch 2 Val Acc: 47.62%


Fold 4 Epoch 3: 100%|██████████| 22/22 [02:27<00:00,  6.72s/it, Loss=0.5021, Acc=56.47%]


Fold 4 Epoch 3 Val Acc: 47.62%


Fold 4 Epoch 4: 100%|██████████| 22/22 [02:45<00:00,  7.51s/it, Loss=0.7700, Acc=45.88%]


Fold 4 Epoch 4 Val Acc: 33.33%


Fold 4 Epoch 5: 100%|██████████| 22/22 [02:52<00:00,  7.84s/it, Loss=0.5962, Acc=51.76%]


Fold 4 Epoch 5 Val Acc: 47.62%
Best Val Accuracy for Fold 4: 57.14%

--- Fold 5/5 ---


Some weights of ViTModel were not initialized from the model checkpoint at WinKawaks/vit-tiny-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 5 Epoch 1: 100%|██████████| 22/22 [02:59<00:00,  8.17s/it, Loss=0.7490, Acc=44.71%]


Fold 5 Epoch 1 Val Acc: 47.62%
Saved new best model for this fold


Fold 5 Epoch 2: 100%|██████████| 22/22 [02:58<00:00,  8.12s/it, Loss=0.5125, Acc=37.65%]


Fold 5 Epoch 2 Val Acc: 47.62%


Fold 5 Epoch 3: 100%|██████████| 22/22 [02:45<00:00,  7.53s/it, Loss=0.5943, Acc=51.76%]


Fold 5 Epoch 3 Val Acc: 33.33%


Fold 5 Epoch 4: 100%|██████████| 22/22 [02:43<00:00,  7.43s/it, Loss=0.3665, Acc=47.06%]


Fold 5 Epoch 4 Val Acc: 47.62%


Fold 5 Epoch 5: 100%|██████████| 22/22 [02:52<00:00,  7.83s/it, Loss=0.7299, Acc=43.53%]


Fold 5 Epoch 5 Val Acc: 61.90%
Saved new best model for this fold
Best Val Accuracy for Fold 5: 61.90%
Predicted: 0, Actual: 1
Predicted: 0, Actual: 1
Predicted: 0, Actual: 1
Predicted: 0, Actual: 1
Predicted: 0, Actual: 1
Predicted: 0, Actual: 1
Predicted: 0, Actual: 1
Predicted: 1, Actual: 1

Test Accuracy on 8 samples: 12.50%
