In [1]:
import sys
print(sys.version)

3.12.11 (main, Jun  4 2025, 08:56:18) [GCC 11.4.0]


In [2]:
# --- 0. Install Prerequisites ---
!pip install mamba-ssm causal-conv1d ffmpeg-python --quiet
print("✅ Libraries installed.")

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/113.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m113.8/113.8 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[0m  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for mamba-ssm (pyproject.toml) ... [?25l[?25hdone
  Building wheel for causal-conv1d (pyproject.toml) ... [?25l[?25hdone
✅ Libraries installed.


In [3]:
import os
from google.colab import drive

print("\n" + "="*80)
print("📂 STEP 1: UNPACKING DATASET FROM .ZIP FILE")
print("="*80)
try:
    # 1. Mount your Google Drive
    drive.mount('/content/drive', force_remount=True)

    # 2. Define paths
    # IMPORTANT: Make sure your new zip file is named 'AVLips.zip' in your Drive
    ZIP_FILE_PATH = "/content/drive/MyDrive/CSE400 codes - 144/AVLips.zip"
    EXTRACT_TO_DIR = "/content/AVLips_data/"
    os.makedirs(EXTRACT_TO_DIR, exist_ok=True)

    # 3. Unpack the dataset using the 'unzip' command
    if not os.path.exists(os.path.join(EXTRACT_TO_DIR, "0_real")):
        print(f"🚀 Starting to unpack '{ZIP_FILE_PATH}'...")
        # Use the 'unzip' command with -q for quiet mode and -d for destination
        !unzip -q "{ZIP_FILE_PATH}" -d "{EXTRACT_TO_DIR}"
        print("✅ Unpacking complete!")
    else:
        print("✅ Dataset already unpacked.")
except Exception as e:
    print(f"❌ An error occurred during unpacking: {e}")
    raise




📂 STEP 1: UNPACKING DATASET FROM .ZIP FILE
Mounted at /content/drive
🚀 Starting to unpack '/content/drive/MyDrive/CSE400 codes - 144/AVLips.zip'...
✅ Unpacking complete!


In [4]:
# -*- coding: utf-8 -*-

"""
V1 MAMBA ABLATION STUDY - BATCH 1
Testing V1a (64,64), V1b (96,96), V1d (160,160)
"""

# --- 1. IMPORTS ---
import os
import cv2
import time
import torch
import librosa
import numpy as np
import pandas as pd
import torch.nn as nn
from pathlib import Path
from tqdm import tqdm
import torch.optim as optim
from torch.optim import lr_scheduler
import matplotlib.pyplot as plt
import torch.nn.functional as F
import torchvision.models as models
from mamba_ssm import Mamba
from torch.cuda.amp import autocast, GradScaler
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
import warnings

warnings.filterwarnings('ignore')
torch.backends.cudnn.benchmark = True
print("✅ Libraries imported successfully.")

# --- 2. CONFIGURATION ---
class Config:
    def __init__(self, vis_d_model=128, aud_d_model=128):
        self.data_dir = "/content/AVLips_data/AVLips"
        self.model_save_dir = "/content/models/"
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        os.makedirs(self.model_save_dir, exist_ok=True)

        self.use_sampling = True
        self.num_samples_per_class = 2000

        # Visual Stream
        self.vis_image_size = (128, 128)
        self.vis_num_frames = 16
        self.vis_cnn_feature_dim = 576
        self.vis_mamba_d_model = vis_d_model  # CONFIGURABLE

        # Audio Stream
        self.aud_sample_rate = 16000
        self.aud_num_chunks = 5
        self.aud_chunk_duration = 1.0
        self.aud_n_mels = 128
        self.aud_cnn_feature_dim = 576
        self.aud_mamba_d_model = aud_d_model  # CONFIGURABLE

        # Training
        self.batch_size = 64
        self.accumulation_steps = 4
        self.epochs = 25
        self.learning_rate = 5e-4
        self.weight_decay = 0.05
        self.patience = 6

# --- 3. LABEL SMOOTHING LOSS ---
class LabelSmoothingBCELoss(nn.Module):
    def __init__(self, smoothing=0.1):
        super().__init__()
        self.smoothing = smoothing

    def forward(self, pred, target):
        target = target * (1 - self.smoothing) + 0.5 * self.smoothing
        return F.binary_cross_entropy_with_logits(pred, target)

# --- 4. DATA PROCESSING ---
def process_visual_stream(video_path: str, config: Config):
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        return None

    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    if total_frames < config.vis_num_frames:
        return None

    frame_indices = np.linspace(0, total_frames - 1, config.vis_num_frames, dtype=int)
    frames = []
    face_detector = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

    for i in frame_indices:
        cap.set(cv2.CAP_PROP_POS_FRAMES, i)
        ret, frame = cap.read()
        if not ret:
            continue

        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        faces = face_detector.detectMultiScale(gray, 1.1, 4)

        if len(faces) > 0:
            (x, y, w, h) = faces[0]
            mouth_crop = frame[y + int(h * 0.6):y + h, x + int(w * 0.25):x + int(w * 0.75)]
            if mouth_crop.size > 0:
                resized_crop = cv2.resize(mouth_crop, config.vis_image_size)
                resized_crop_rgb = cv2.cvtColor(resized_crop, cv2.COLOR_BGR2RGB)
                frames.append(resized_crop_rgb)

    cap.release()
    return np.stack(frames) if len(frames) == config.vis_num_frames else None

def process_audio_stream(video_path: str, config: Config):
    try:
        parts = Path(video_path).parts
        audio_filename = Path(video_path).stem + ".wav"
        label_folder = parts[-2]
        base_data_dir = str(Path(video_path).parent.parent)
        audio_path = os.path.join(base_data_dir, "wav", label_folder, audio_filename)

        y, sr = librosa.load(audio_path, sr=config.aud_sample_rate)
        total_samples = int(config.aud_chunk_duration * config.aud_num_chunks * sr)

        if len(y) < total_samples:
            y = np.pad(y, (0, total_samples - len(y)), mode='constant')
        else:
            y = y[:total_samples]

        samples_per_chunk = int(config.aud_chunk_duration * sr)
        mel_list = []

        for i in range(config.aud_num_chunks):
            chunk = y[i*samples_per_chunk : (i+1)*samples_per_chunk]
            mel = librosa.feature.melspectrogram(y=chunk, sr=sr, n_mels=config.aud_n_mels)
            mel_db = librosa.power_to_db(mel, ref=np.max)
            mel_db = (mel_db - mel_db.mean()) / (mel_db.std() + 1e-9)
            mel_list.append(torch.tensor(mel_db, dtype=torch.float32))

        return torch.stack(mel_list, axis=0)
    except Exception:
        return None

class DualStreamDataset(Dataset):
    def __init__(self, file_paths, labels, config):
        self.file_paths = file_paths
        self.labels = labels
        self.config = config

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        video_path = self.file_paths[idx]
        label = self.labels[idx]

        try:
            visual_frames_hwc = process_visual_stream(video_path, self.config)
            if visual_frames_hwc is None:
                return None

            visual_frames_tchw = visual_frames_hwc.transpose(0, 3, 1, 2)
            audio_mels = process_audio_stream(video_path, self.config)
            if audio_mels is None:
                return None

            audio_tensor = audio_mels.unsqueeze(1)
            return (visual_frames_tchw, audio_tensor), torch.tensor(label, dtype=torch.float32)
        except Exception:
            return None

class RAMCachedDataset(Dataset):
    def __init__(self, data, labels, transform=None):
        self.data = data
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        visual_frames_np, audio_tensor = self.data[idx]
        label = self.labels[idx]

        if self.transform:
            augmented_frames = []
            for frame_np in visual_frames_np:
                frame_hwc = frame_np.transpose(1, 2, 0)
                augmented_frames.append(self.transform(frame_hwc))
            visual_tensor = torch.stack(augmented_frames)
        else:
            visual_tensor = torch.from_numpy(visual_frames_np).float()

        return (visual_tensor, audio_tensor), label

# --- 5. MODEL ARCHITECTURE ---
class VisualStream_MobileNetV3Small(nn.Module):
    def __init__(self, config):
        super().__init__()
        mobilenet = models.mobilenet_v3_small(weights=models.MobileNet_V3_Small_Weights.DEFAULT)
        self.cnn_features = mobilenet.features
        self.avgpool = mobilenet.avgpool

        self.proj = nn.Linear(config.vis_cnn_feature_dim, config.vis_mamba_d_model)
        self.proj_dropout = nn.Dropout(0.3)
        self.mamba = Mamba(d_model=config.vis_mamba_d_model, d_state=16, d_conv=4, expand=2)
        self.mamba_dropout = nn.Dropout(0.2)

    def forward(self, x):
        b, t, c, h, w = x.shape
        x = x.view(b * t, c, h, w)

        features = self.cnn_features(x)
        features = self.avgpool(features)
        features = features.view(b, t, -1)

        projected_features = self.proj_dropout(self.proj(features))
        temporal_out = self.mamba(projected_features)
        temporal_out = self.mamba_dropout(temporal_out)

        return temporal_out[:, -1, :]

class AudioStream_MobileNetV3Small(nn.Module):
    def __init__(self, config):
        super().__init__()
        mobilenet = models.mobilenet_v3_small(weights=models.MobileNet_V3_Small_Weights.DEFAULT)
        self.cnn_features = mobilenet.features
        self.avgpool = mobilenet.avgpool

        self.proj = nn.Linear(config.aud_cnn_feature_dim, config.aud_mamba_d_model)
        self.proj_dropout = nn.Dropout(0.3)
        self.mamba = Mamba(d_model=config.aud_mamba_d_model, d_state=16, d_conv=4, expand=2)
        self.mamba_dropout = nn.Dropout(0.2)

    def forward(self, x):
        b, t, c, h, w = x.shape
        x = x.view(b * t, c, h, w).repeat(1, 3, 1, 1)

        features = self.cnn_features(x)
        features = self.avgpool(features)
        features = features.view(b, t, -1)

        projected_features = self.proj_dropout(self.proj(features))
        temporal_out = self.mamba(projected_features)
        temporal_out = self.mamba_dropout(temporal_out)

        return temporal_out[:, -1, :]

class FusionModel_V1(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.visual_stream = VisualStream_MobileNetV3Small(config)
        self.audio_stream = AudioStream_MobileNetV3Small(config)

        fusion_input_dim = config.vis_mamba_d_model + config.aud_mamba_d_model
        self.fusion_head = nn.Sequential(
            nn.Linear(fusion_input_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.6),
            nn.Linear(256, 1)
        )

    def forward(self, visual_input, audio_input):
        visual_features = self.visual_stream(visual_input)
        audio_features = self.audio_stream(audio_input)
        fused_features = torch.cat((visual_features, audio_features), dim=1)
        return self.fusion_head(fused_features)

# --- 6. UTILITY FUNCTIONS ---
def count_parameters(model):
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    return total, trainable

def get_model_size_mb(model):
    param_size = sum(p.nelement() * p.element_size() for p in model.parameters())
    buffer_size = sum(b.nelement() * b.element_size() for b in model.buffers())
    return (param_size + buffer_size) / (1024 ** 2)

# --- 7. TRAINING FUNCTIONS ---
def train_one_epoch(model, loader, optimizer, criterion, scaler, config):
    model.train()
    total_loss = 0
    pbar = tqdm(loader, desc="Training")

    for i, ((visual_data, audio_data), labels) in enumerate(pbar):
        visual_data = visual_data.to(config.device, non_blocking=True)
        audio_data = audio_data.to(config.device, non_blocking=True)
        labels = labels.to(config.device, non_blocking=True).unsqueeze(1).float()

        with autocast():
            outputs = model(visual_data, audio_data)
            loss = criterion(outputs, labels)
            loss = loss / config.accumulation_steps

        scaler.scale(loss).backward()

        if (i + 1) % config.accumulation_steps == 0:
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)

        total_loss += loss.item() * config.accumulation_steps
        pbar.set_postfix({'loss': f"{loss.item() * config.accumulation_steps:.4f}"})

    return total_loss / len(loader)

def validate_one_epoch(model, loader, criterion, config):
    model.eval()
    total_loss = 0

    with torch.no_grad():
        for (visual_data, audio_data), labels in tqdm(loader, desc="Validating"):
            visual_data = visual_data.to(config.device, non_blocking=True)
            audio_data = audio_data.to(config.device, non_blocking=True)
            labels = labels.to(config.device, non_blocking=True).unsqueeze(1).float()

            with autocast():
                outputs = model(visual_data, audio_data)
                loss = criterion(outputs, labels)

            total_loss += loss.item()

    return total_loss / len(loader)

def evaluate_model(model, test_loader, config):
    model.eval()
    all_labels, all_preds = [], []

    with torch.no_grad():
        for (visual_data, audio_data), labels in tqdm(test_loader, desc="Evaluating"):
            visual_data = visual_data.to(config.device)
            audio_data = audio_data.to(config.device)

            outputs = model(visual_data, audio_data)
            all_preds.extend(torch.sigmoid(outputs).cpu().numpy())
            all_labels.extend(labels.numpy())

    all_preds = np.array(all_preds).flatten()
    all_labels = np.array(all_labels).flatten()
    preds_binary = (all_preds > 0.5).astype(int)

    accuracy = (preds_binary == all_labels).mean()
    auc_score = roc_auc_score(all_labels, all_preds)

    return accuracy, auc_score, all_labels, preds_binary

# --- 8. MAIN EXECUTION ---
def main():
    print("\n" + "="*80)
    print("🚀 V1 MAMBA ABLATION STUDY - BATCH 1")
    print("Testing: V1a (64,64), V1b (96,96), V1d (160,160)")
    print("="*80 + "\n")

    start_time = time.time()

    # Initialize base config for data caching
    base_config = Config()

    print("\n" + "="*80 + "\nSTEP 1: PREPARING FILE LISTS\n" + "="*80)
    real_dir = os.path.join(base_config.data_dir, "0_real")
    fake_dir = os.path.join(base_config.data_dir, "1_fake")

    all_real = [os.path.join(real_dir, f) for f in os.listdir(real_dir) if f.endswith('.mp4')]
    all_fake = [os.path.join(fake_dir, f) for f in os.listdir(fake_dir) if f.endswith('.mp4')]

    if base_config.use_sampling:
        print(f"🔥 Sampling {base_config.num_samples_per_class} videos per class...")
        real_files = np.random.choice(all_real, base_config.num_samples_per_class, replace=False).tolist()
        fake_files = np.random.choice(all_fake, base_config.num_samples_per_class, replace=False).tolist()
    else:
        real_files, fake_files = all_real, all_fake

    all_files = real_files + fake_files
    labels = [0] * len(real_files) + [1] * len(fake_files)

    train_files, test_files, train_labels, test_labels = train_test_split(
        all_files, labels, test_size=0.3, random_state=42, stratify=labels)
    val_files, test_files, val_labels, test_labels = train_test_split(
        test_files, test_labels, test_size=0.5, random_state=42, stratify=test_labels)

    print(f"Total: {len(all_files)} | Train: {len(train_files)} | Val: {len(val_files)} | Test: {len(test_files)}")

    print("\n" + "="*80 + "\nSTEP 2: CACHING DATA (ONCE FOR ALL VARIANTS)\n" + "="*80)

    def collate_fn_skip_errors(batch):
        batch = list(filter(lambda x: x is not None, batch))
        return torch.utils.data.dataloader.default_collate(batch) if batch else (None, None)

    def cache_data(files, labels, desc):
        dataset = DualStreamDataset(files, labels, base_config)
        loader = DataLoader(dataset, batch_size=base_config.batch_size, num_workers=os.cpu_count(), collate_fn=collate_fn_skip_errors)
        cached_data, cached_labels = [], []

        for data, batch_labels in tqdm(loader, desc=f"Caching {desc}"):
            if data is not None:
                visual_batch, audio_batch = data
                for i in range(visual_batch.shape[0]):
                    cached_data.append((visual_batch[i].numpy(), audio_batch[i]))
                    cached_labels.append(batch_labels[i])

        return cached_data, torch.tensor(cached_labels)

    cached_train_data, cached_train_labels = cache_data(train_files, train_labels, "Train")
    cached_val_data, cached_val_labels = cache_data(val_files, val_labels, "Val")
    cached_test_data, cached_test_labels = cache_data(test_files, test_labels, "Test")

    print(f"✅ Cached - Train: {len(cached_train_data)}, Val: {len(cached_val_data)}, Test: {len(cached_test_data)}")

    # Define transforms
    val_test_transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    train_transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
        transforms.RandomRotation(10),
        transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    # Variants to test
    configs_to_test = [
        (64, 64, 'V1a'),
        (96, 96, 'V1b'),
        (160, 160, 'V1d'),
    ]

    results = []

    for idx, (vis_d, aud_d, variant_name) in enumerate(configs_to_test):
        print("\n" + "="*80)
        print(f"📊 VARIANT {idx+1}/3: {variant_name}")
        print(f"   Visual d_model={vis_d}, Audio d_model={aud_d}")
        print(f"   Started at: {time.strftime('%H:%M:%S')}")
        print("="*80 + "\n")

        # Create config for this variant
        config = Config(vis_d_model=vis_d, aud_d_model=aud_d)

        # Create dataloaders (reusing cached data!)
        train_dataset = RAMCachedDataset(cached_train_data, cached_train_labels, transform=train_transform)
        val_dataset = RAMCachedDataset(cached_val_data, cached_val_labels, transform=val_test_transform)
        test_dataset = RAMCachedDataset(cached_test_data, cached_test_labels, transform=val_test_transform)

        train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True, num_workers=os.cpu_count(), pin_memory=True)
        val_loader = DataLoader(val_dataset, batch_size=config.batch_size, shuffle=False, num_workers=os.cpu_count(), pin_memory=True)
        test_loader = DataLoader(test_dataset, batch_size=config.batch_size, shuffle=False, num_workers=os.cpu_count(), pin_memory=True)

        # Build model
        model = FusionModel_V1(config).to(config.device)
        total_params, trainable_params = count_parameters(model)
        model_size_mb = get_model_size_mb(model)

        print(f"Model: {total_params:,} params ({total_params/1e6:.3f}M), {model_size_mb:.2f} MB")

        # Training setup
        optimizer = optim.AdamW(model.parameters(), lr=config.learning_rate, weight_decay=config.weight_decay)
        criterion = LabelSmoothingBCELoss(smoothing=0.1)
        scaler = GradScaler()
        scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3, min_lr=1e-6)

        model_path = os.path.join(config.model_save_dir, f'{variant_name.lower()}_best.pth')
        best_val_loss = float('inf')
        epochs_no_improve = 0
        history = {'train_loss': [], 'val_loss': []}

        # Training loop
        for epoch in range(config.epochs):
            train_loss = train_one_epoch(model, train_loader, optimizer, criterion, scaler, config)
            val_loss = validate_one_epoch(model, val_loader, criterion, config)

            history['train_loss'].append(train_loss)
            history['val_loss'].append(val_loss)

            scheduler.step(val_loss)

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                epochs_no_improve = 0
                torch.save(model.state_dict(), model_path)
                print(f"Epoch {epoch+1}/{config.epochs} - Train: {train_loss:.4f}, Val: {val_loss:.4f} ✅")
            else:
                epochs_no_improve += 1
                print(f"Epoch {epoch+1}/{config.epochs} - Train: {train_loss:.4f}, Val: {val_loss:.4f}")
                if epochs_no_improve >= config.patience:
                    print(f"Early stopping at epoch {epoch+1}")
                    break

        # Final evaluation
        model.load_state_dict(torch.load(model_path))
        accuracy, auc_score, all_labels, preds_binary = evaluate_model(model, test_loader, config)
        loss_gap = history['train_loss'][-1] - history['val_loss'][-1]

        print(f"\n✅ {variant_name} Results:")
        print(f"   Accuracy: {accuracy*100:.2f}%, AUC: {auc_score*100:.2f}%, Loss Gap: {loss_gap:.4f}")

        # Save results
        results.append({
            'variant': variant_name,
            'vis_d_model': vis_d,
            'aud_d_model': aud_d,
            'params_M': total_params / 1e6,
            'size_MB': model_size_mb,
            'accuracy': accuracy,
            'auc': auc_score,
            'loss_gap': loss_gap,
            'completed_at': time.strftime('%Y-%m-%d %H:%M:%S')
        })

        # Save intermediate results
        df = pd.DataFrame(results)
        df.to_csv('/content/v1_ablation_batch1_results.csv', index=False)
        print(f"   Saved to: /content/v1_ablation_batch1_results.csv")

        # Cleanup
        del model
        torch.cuda.empty_cache()

    # Final summary
    total_time = (time.time() - start_time) / 3600
    print("\n" + "="*80)
    print("🎉 BATCH 1 COMPLETED!")
    print("="*80)
    print(f"Total time: {total_time:.2f} hours")
    print("\nResults Summary:")
    print(pd.DataFrame(results)[['variant', 'params_M', 'accuracy', 'auc', 'loss_gap']])
    print("\n📁 Results saved to: /content/v1_ablation_batch1_results.csv")

if __name__ == '__main__':
    main()


✅ Libraries imported successfully.

🚀 V1 MAMBA ABLATION STUDY - BATCH 1
Testing: V1a (64,64), V1b (96,96), V1d (160,160)


STEP 1: PREPARING FILE LISTS
🔥 Sampling 2000 videos per class...
Total: 4000 | Train: 2800 | Val: 600 | Test: 600

STEP 2: CACHING DATA (ONCE FOR ALL VARIANTS)


Caching Train: 100%|██████████| 44/44 [29:12<00:00, 39.83s/it]
Caching Val: 100%|██████████| 10/10 [06:26<00:00, 38.60s/it]
Caching Test: 100%|██████████| 10/10 [05:27<00:00, 32.76s/it]


✅ Cached - Train: 1375, Val: 282, Test: 278

📊 VARIANT 1/3: V1a
   Visual d_model=64, Audio d_model=64
   Started at: 12:35:17

Downloading: "https://download.pytorch.org/models/mobilenet_v3_small-047dcff4.pth" to /root/.cache/torch/hub/checkpoints/mobilenet_v3_small-047dcff4.pth


100%|██████████| 9.83M/9.83M [00:00<00:00, 189MB/s]


Model: 2,026,433 params (2.026M), 7.82 MB


Training: 100%|██████████| 22/22 [05:37<00:00, 15.35s/it, loss=0.6991]
Validating: 100%|██████████| 5/5 [00:05<00:00,  1.01s/it]


Epoch 1/25 - Train: 0.6914, Val: 0.6874 ✅


Training: 100%|██████████| 22/22 [00:09<00:00,  2.37it/s, loss=0.6689]
Validating: 100%|██████████| 5/5 [00:01<00:00,  2.69it/s]


Epoch 2/25 - Train: 0.6863, Val: 0.6857 ✅


Training: 100%|██████████| 22/22 [00:08<00:00,  2.49it/s, loss=0.6007]
Validating: 100%|██████████| 5/5 [00:01<00:00,  2.63it/s]


Epoch 3/25 - Train: 0.6460, Val: 0.6641 ✅


Training: 100%|██████████| 22/22 [00:08<00:00,  2.51it/s, loss=0.5078]
Validating: 100%|██████████| 5/5 [00:02<00:00,  2.41it/s]


Epoch 4/25 - Train: 0.5654, Val: 0.7855


Training: 100%|██████████| 22/22 [00:09<00:00,  2.43it/s, loss=0.5613]
Validating: 100%|██████████| 5/5 [00:01<00:00,  2.52it/s]


Epoch 5/25 - Train: 0.4606, Val: 1.7441


Training: 100%|██████████| 22/22 [00:08<00:00,  2.59it/s, loss=0.3851]
Validating: 100%|██████████| 5/5 [00:01<00:00,  2.56it/s]


Epoch 6/25 - Train: 0.3836, Val: 1.4326


Training: 100%|██████████| 22/22 [00:08<00:00,  2.47it/s, loss=0.3101]
Validating: 100%|██████████| 5/5 [00:01<00:00,  2.72it/s]


Epoch 7/25 - Train: 0.3259, Val: 0.9768


Training: 100%|██████████| 22/22 [00:09<00:00,  2.43it/s, loss=0.2519]
Validating: 100%|██████████| 5/5 [00:01<00:00,  2.56it/s]


Epoch 8/25 - Train: 0.2967, Val: 0.8260


Training: 100%|██████████| 22/22 [00:08<00:00,  2.46it/s, loss=0.2801]
Validating: 100%|██████████| 5/5 [00:01<00:00,  2.53it/s]


Epoch 9/25 - Train: 0.2795, Val: 1.1009
Early stopping at epoch 9


Evaluating: 100%|██████████| 5/5 [00:52<00:00, 10.50s/it]



✅ V1a Results:
   Accuracy: 52.52%, AUC: 69.55%, Loss Gap: -0.8214
   Saved to: /content/v1_ablation_batch1_results.csv

📊 VARIANT 2/3: V1b
   Visual d_model=96, Audio d_model=96
   Started at: 12:43:19

Model: 2,150,785 params (2.151M), 8.30 MB


Training: 100%|██████████| 22/22 [00:09<00:00,  2.36it/s, loss=0.6877]
Validating: 100%|██████████| 5/5 [00:01<00:00,  2.70it/s]


Epoch 1/25 - Train: 0.6909, Val: 0.6900 ✅


Training: 100%|██████████| 22/22 [00:09<00:00,  2.42it/s, loss=0.5928]
Validating: 100%|██████████| 5/5 [00:02<00:00,  2.49it/s]


Epoch 2/25 - Train: 0.6617, Val: 0.6861 ✅


Training: 100%|██████████| 22/22 [00:08<00:00,  2.52it/s, loss=0.5785]
Validating: 100%|██████████| 5/5 [00:01<00:00,  2.68it/s]


Epoch 3/25 - Train: 0.6143, Val: 0.6879


Training: 100%|██████████| 22/22 [00:08<00:00,  2.50it/s, loss=0.6062]
Validating: 100%|██████████| 5/5 [00:01<00:00,  2.60it/s]


Epoch 4/25 - Train: 0.5725, Val: 0.6700 ✅


Training: 100%|██████████| 22/22 [00:09<00:00,  2.43it/s, loss=0.4093]
Validating: 100%|██████████| 5/5 [00:01<00:00,  2.54it/s]


Epoch 5/25 - Train: 0.4796, Val: 0.7849


Training: 100%|██████████| 22/22 [00:09<00:00,  2.42it/s, loss=0.5182]
Validating: 100%|██████████| 5/5 [00:01<00:00,  2.63it/s]


Epoch 6/25 - Train: 0.4025, Val: 0.6864


Training: 100%|██████████| 22/22 [00:09<00:00,  2.39it/s, loss=0.3415]
Validating: 100%|██████████| 5/5 [00:01<00:00,  2.51it/s]


Epoch 7/25 - Train: 0.3433, Val: 0.7255


Training: 100%|██████████| 22/22 [00:09<00:00,  2.37it/s, loss=0.2781]
Validating: 100%|██████████| 5/5 [00:01<00:00,  2.63it/s]


Epoch 8/25 - Train: 0.3031, Val: 0.7386


Training: 100%|██████████| 22/22 [00:08<00:00,  2.53it/s, loss=0.2716]
Validating: 100%|██████████| 5/5 [00:01<00:00,  2.57it/s]


Epoch 9/25 - Train: 0.2732, Val: 0.7588


Training: 100%|██████████| 22/22 [00:08<00:00,  2.48it/s, loss=0.2500]
Validating: 100%|██████████| 5/5 [00:01<00:00,  2.57it/s]


Epoch 10/25 - Train: 0.2580, Val: 0.6481 ✅


Training: 100%|██████████| 22/22 [00:08<00:00,  2.52it/s, loss=0.2360]
Validating: 100%|██████████| 5/5 [00:01<00:00,  2.57it/s]


Epoch 11/25 - Train: 0.2494, Val: 0.6122 ✅


Training: 100%|██████████| 22/22 [00:08<00:00,  2.50it/s, loss=0.2359]
Validating: 100%|██████████| 5/5 [00:01<00:00,  2.59it/s]


Epoch 12/25 - Train: 0.2413, Val: 0.6409


Training: 100%|██████████| 22/22 [00:09<00:00,  2.43it/s, loss=0.2282]
Validating: 100%|██████████| 5/5 [00:01<00:00,  2.58it/s]


Epoch 13/25 - Train: 0.2374, Val: 0.5351 ✅


Training: 100%|██████████| 22/22 [00:09<00:00,  2.40it/s, loss=0.2226]
Validating: 100%|██████████| 5/5 [00:01<00:00,  2.51it/s]


Epoch 14/25 - Train: 0.2333, Val: 0.5140 ✅


Training: 100%|██████████| 22/22 [00:08<00:00,  2.49it/s, loss=0.2314]
Validating: 100%|██████████| 5/5 [00:01<00:00,  2.64it/s]


Epoch 15/25 - Train: 0.2295, Val: 0.4657 ✅


Training: 100%|██████████| 22/22 [00:08<00:00,  2.51it/s, loss=0.2192]
Validating: 100%|██████████| 5/5 [00:01<00:00,  2.52it/s]


Epoch 16/25 - Train: 0.2278, Val: 0.4602 ✅


Training: 100%|██████████| 22/22 [00:08<00:00,  2.52it/s, loss=0.2201]
Validating: 100%|██████████| 5/5 [00:01<00:00,  2.60it/s]


Epoch 17/25 - Train: 0.2250, Val: 0.4427 ✅


Training: 100%|██████████| 22/22 [00:08<00:00,  2.48it/s, loss=0.2277]
Validating: 100%|██████████| 5/5 [00:01<00:00,  2.62it/s]


Epoch 18/25 - Train: 0.2263, Val: 0.4138 ✅


Training: 100%|██████████| 22/22 [00:09<00:00,  2.42it/s, loss=0.2128]
Validating: 100%|██████████| 5/5 [00:01<00:00,  2.56it/s]


Epoch 19/25 - Train: 0.2228, Val: 0.3970 ✅


Training: 100%|██████████| 22/22 [00:09<00:00,  2.34it/s, loss=0.2164]
Validating: 100%|██████████| 5/5 [00:01<00:00,  2.61it/s]


Epoch 20/25 - Train: 0.2205, Val: 0.4325


Training: 100%|██████████| 22/22 [00:08<00:00,  2.46it/s, loss=0.2101]
Validating: 100%|██████████| 5/5 [00:01<00:00,  2.66it/s]


Epoch 21/25 - Train: 0.2200, Val: 0.4030


Training: 100%|██████████| 22/22 [00:08<00:00,  2.46it/s, loss=0.2284]
Validating: 100%|██████████| 5/5 [00:01<00:00,  2.58it/s]


Epoch 22/25 - Train: 0.2185, Val: 0.4064


Training: 100%|██████████| 22/22 [00:08<00:00,  2.53it/s, loss=0.2186]
Validating: 100%|██████████| 5/5 [00:02<00:00,  2.45it/s]


Epoch 23/25 - Train: 0.2171, Val: 0.4070


Training: 100%|██████████| 22/22 [00:08<00:00,  2.45it/s, loss=0.2213]
Validating: 100%|██████████| 5/5 [00:01<00:00,  2.67it/s]


Epoch 24/25 - Train: 0.2184, Val: 0.4067


Training: 100%|██████████| 22/22 [00:08<00:00,  2.45it/s, loss=0.2242]
Validating: 100%|██████████| 5/5 [00:01<00:00,  2.56it/s]


Epoch 25/25 - Train: 0.2160, Val: 0.4005
Early stopping at epoch 25


Evaluating: 100%|██████████| 5/5 [00:02<00:00,  2.45it/s]



✅ V1b Results:
   Accuracy: 87.05%, AUC: 96.39%, Loss Gap: -0.1845
   Saved to: /content/v1_ablation_batch1_results.csv

📊 VARIANT 3/3: V1d
   Visual d_model=160, Audio d_model=160
   Started at: 12:47:55

Model: 2,476,289 params (2.476M), 9.54 MB


Training: 100%|██████████| 22/22 [00:09<00:00,  2.42it/s, loss=0.6940]
Validating: 100%|██████████| 5/5 [00:01<00:00,  2.67it/s]


Epoch 1/25 - Train: 0.6949, Val: 0.6976 ✅


Training: 100%|██████████| 22/22 [00:09<00:00,  2.41it/s, loss=0.5948]
Validating: 100%|██████████| 5/5 [00:01<00:00,  2.64it/s]


Epoch 2/25 - Train: 0.6670, Val: 0.6927 ✅


Training: 100%|██████████| 22/22 [00:09<00:00,  2.44it/s, loss=0.5950]
Validating: 100%|██████████| 5/5 [00:01<00:00,  2.55it/s]


Epoch 3/25 - Train: 0.6052, Val: 0.6628 ✅


Training: 100%|██████████| 22/22 [00:08<00:00,  2.49it/s, loss=0.5667]
Validating: 100%|██████████| 5/5 [00:02<00:00,  2.41it/s]


Epoch 4/25 - Train: 0.5468, Val: 0.6363 ✅


Training: 100%|██████████| 22/22 [00:08<00:00,  2.51it/s, loss=0.5204]
Validating: 100%|██████████| 5/5 [00:01<00:00,  2.50it/s]


Epoch 5/25 - Train: 0.5001, Val: 0.6173 ✅


Training: 100%|██████████| 22/22 [00:09<00:00,  2.44it/s, loss=0.4110]
Validating: 100%|██████████| 5/5 [00:02<00:00,  2.48it/s]


Epoch 6/25 - Train: 0.4465, Val: 0.5794 ✅


Training: 100%|██████████| 22/22 [00:08<00:00,  2.53it/s, loss=0.2539]
Validating: 100%|██████████| 5/5 [00:01<00:00,  2.57it/s]


Epoch 7/25 - Train: 0.3553, Val: 0.5106 ✅


Training: 100%|██████████| 22/22 [00:09<00:00,  2.41it/s, loss=0.2797]
Validating: 100%|██████████| 5/5 [00:01<00:00,  2.67it/s]


Epoch 8/25 - Train: 0.2954, Val: 0.4710 ✅


Training: 100%|██████████| 22/22 [00:08<00:00,  2.50it/s, loss=0.2444]
Validating: 100%|██████████| 5/5 [00:01<00:00,  2.52it/s]


Epoch 9/25 - Train: 0.2685, Val: 0.4486 ✅


Training: 100%|██████████| 22/22 [00:08<00:00,  2.52it/s, loss=0.2633]
Validating: 100%|██████████| 5/5 [00:02<00:00,  2.46it/s]


Epoch 10/25 - Train: 0.2535, Val: 0.4222 ✅


Training: 100%|██████████| 22/22 [00:08<00:00,  2.47it/s, loss=0.2445]
Validating: 100%|██████████| 5/5 [00:01<00:00,  2.57it/s]


Epoch 11/25 - Train: 0.2348, Val: 0.4813


Training: 100%|██████████| 22/22 [00:08<00:00,  2.51it/s, loss=0.2099]
Validating: 100%|██████████| 5/5 [00:01<00:00,  2.65it/s]


Epoch 12/25 - Train: 0.2368, Val: 0.3973 ✅


Training: 100%|██████████| 22/22 [00:08<00:00,  2.48it/s, loss=0.2234]
Validating: 100%|██████████| 5/5 [00:01<00:00,  2.56it/s]


Epoch 13/25 - Train: 0.2249, Val: 0.3916 ✅


Training: 100%|██████████| 22/22 [00:08<00:00,  2.45it/s, loss=0.2135]
Validating: 100%|██████████| 5/5 [00:01<00:00,  2.63it/s]


Epoch 14/25 - Train: 0.2202, Val: 0.3403 ✅


Training: 100%|██████████| 22/22 [00:09<00:00,  2.39it/s, loss=0.2275]
Validating: 100%|██████████| 5/5 [00:01<00:00,  2.56it/s]


Epoch 15/25 - Train: 0.2187, Val: 0.3655


Training: 100%|██████████| 22/22 [00:08<00:00,  2.48it/s, loss=0.2168]
Validating: 100%|██████████| 5/5 [00:02<00:00,  2.48it/s]


Epoch 16/25 - Train: 0.2159, Val: 0.3509


Training: 100%|██████████| 22/22 [00:08<00:00,  2.50it/s, loss=0.2102]
Validating: 100%|██████████| 5/5 [00:01<00:00,  2.60it/s]


Epoch 17/25 - Train: 0.2152, Val: 0.3758


Training: 100%|██████████| 22/22 [00:08<00:00,  2.51it/s, loss=0.2117]
Validating: 100%|██████████| 5/5 [00:01<00:00,  2.64it/s]


Epoch 18/25 - Train: 0.2123, Val: 0.3700


Training: 100%|██████████| 22/22 [00:08<00:00,  2.48it/s, loss=0.2175]
Validating: 100%|██████████| 5/5 [00:02<00:00,  2.49it/s]


Epoch 19/25 - Train: 0.2104, Val: 0.3663


Training: 100%|██████████| 22/22 [00:08<00:00,  2.45it/s, loss=0.2125]
Validating: 100%|██████████| 5/5 [00:01<00:00,  2.51it/s]


Epoch 20/25 - Train: 0.2109, Val: 0.3347 ✅


Training: 100%|██████████| 22/22 [00:08<00:00,  2.51it/s, loss=0.2147]
Validating: 100%|██████████| 5/5 [00:01<00:00,  2.61it/s]


Epoch 21/25 - Train: 0.2102, Val: 0.3079 ✅


Training: 100%|██████████| 22/22 [00:08<00:00,  2.55it/s, loss=0.2062]
Validating: 100%|██████████| 5/5 [00:02<00:00,  2.45it/s]


Epoch 22/25 - Train: 0.2094, Val: 0.2919 ✅


Training: 100%|██████████| 22/22 [00:08<00:00,  2.57it/s, loss=0.2117]
Validating: 100%|██████████| 5/5 [00:02<00:00,  2.47it/s]


Epoch 23/25 - Train: 0.2110, Val: 0.2837 ✅


Training: 100%|██████████| 22/22 [00:08<00:00,  2.50it/s, loss=0.2137]
Validating: 100%|██████████| 5/5 [00:01<00:00,  2.57it/s]


Epoch 24/25 - Train: 0.2089, Val: 0.2816 ✅


Training: 100%|██████████| 22/22 [00:08<00:00,  2.47it/s, loss=0.2121]
Validating: 100%|██████████| 5/5 [00:01<00:00,  2.62it/s]


Epoch 25/25 - Train: 0.2089, Val: 0.2782 ✅


Evaluating: 100%|██████████| 5/5 [00:02<00:00,  2.45it/s]



✅ V1d Results:
   Accuracy: 94.60%, AUC: 99.12%, Loss Gap: -0.0692
   Saved to: /content/v1_ablation_batch1_results.csv

🎉 BATCH 1 COMPLETED!
Total time: 0.97 hours

Results Summary:
  variant  params_M  accuracy       auc  loss_gap
0     V1a  2.026433  0.525180  0.695465 -0.821418
1     V1b  2.150785  0.870504  0.963885 -0.184515
2     V1d  2.476289  0.946043  0.991179 -0.069248

📁 Results saved to: /content/v1_ablation_batch1_results.csv
