In [None]:
# Mount Google Drive at /content/drive; the later cells read the dataset from
# /content/drive/MyDrive/FakeAVCeleb.
try:
    # Import inside the try so a failing import is reported by the same handler.
    from google.colab import drive
    drive.mount('/content/drive')
except Exception as e:
    # Best-effort: the error is printed rather than raised, so this cell always
    # completes; later cells will fail on file access if the mount did not work.
    print("Error during drive mount:", e)


Mounted at /content/drive


In [None]:
import pandas as pd
from pathlib import Path
import subprocess
from tqdm import tqdm

# CSVs
index_df = pd.read_csv('/content/drive/MyDrive/FakeAVCeleb/index.csv')
train_df = pd.read_csv('/content/drive/MyDrive/FakeAVCeleb/train_split.csv')
test_df  = pd.read_csv('/content/drive/MyDrive/FakeAVCeleb/test_split.csv')

# Merge to get full paths for train + test.
# how='left' keeps every split row, so a sample_id that is absent from the index
# ends up with full_path == NaN; the loop below handles that explicitly.
train_df = train_df.merge(index_df[['sample_id', 'full_path']], on='sample_id', how='left')
test_df  = test_df.merge(index_df[['sample_id', 'full_path']], on='sample_id', how='left')

# Output folder for .wav files
AUDIO_ROOT = Path('/content/drive/MyDrive/FakeAVCeleb/audio_wav')
AUDIO_ROOT.mkdir(parents=True, exist_ok=True)

# Combine train + test and extract
all_df = pd.concat([train_df, test_df], ignore_index=True)
print(f"Extracting audio for {len(all_df)} videos...")

missing_ids = []  # samples with no usable source video (no path, or file not on disk)
failed_ids = []   # samples for which ffmpeg exited with an error

for _, row in tqdm(all_df.iterrows(), total=len(all_df), desc="Extracting audio"):
    sample_id = row['sample_id']
    out_path = AUDIO_ROOT / f"{sample_id}.wav"
    if out_path.exists():
        continue  # already extracted on a previous run

    # Path(nan) raises TypeError, so check for a missing path before building it.
    if pd.isna(row['full_path']):
        missing_ids.append(sample_id)
        continue
    vid_path = Path(row['full_path'])
    if not vid_path.exists():
        missing_ids.append(sample_id)
        continue

    # Write to a temporary name and rename on success, so an interrupted or failed
    # ffmpeg run never leaves a truncated .wav that later runs would skip.
    # The temp name does not end in .wav, hence the explicit "-f wav".
    tmp_path = out_path.with_name(out_path.name + ".part")
    cmd = [
        "ffmpeg", "-y", "-i", str(vid_path),
        "-vn", "-ar", "16000", "-ac", "1",
        "-f", "wav", str(tmp_path),
    ]
    result = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    if result.returncode == 0 and tmp_path.exists():
        tmp_path.replace(out_path)
    else:
        tmp_path.unlink(missing_ok=True)
        failed_ids.append(sample_id)

print("Audio extraction complete!")
if missing_ids:
    print(f"Skipped {len(missing_ids)} samples with no source video, e.g. {missing_ids[:5]}")
if failed_ids:
    print(f"ffmpeg failed for {len(failed_ids)} samples, e.g. {failed_ids[:5]}")



Extracting audio for 2000 videos...


Extracting audio: 100%|██████████| 2000/2000 [20:14<00:00,  1.65it/s]

Audio extraction complete!





In [None]:
import torch, torch.nn as nn, torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd, numpy as np
from pathlib import Path
import librosa
from tqdm import tqdm
from sklearn.metrics import accuracy_score, classification_report

# --- CONFIG ---
# Run on the GPU when one is available, otherwise fall back to the CPU.
DEVICE        = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Split CSVs; the cells below read the 'sample_id' and 'audio_fake' columns from them.
TRAIN_CSV     = '/content/drive/MyDrive/FakeAVCeleb/train_fixed_fullschema_hashsplit.csv'
TEST_CSV      = '/content/drive/MyDrive/FakeAVCeleb/test_fixed_fullschema_hashsplit.csv'
AUDIO_ROOT    = Path('/content/drive/MyDrive/FakeAVCeleb/audio_wav')          # audio files (.wav)
CACHE_DIR     = Path('/content/drive/MyDrive/FakeAVCeleb/cache/audio_cnn_mel')# cache for mels
# Optimisation hyper-parameters used by the training loop.
BATCH_SIZE    = 16
EPOCHS        = 10
LR            = 1e-4   # AdamW learning rate
WD            = 1e-5   # AdamW weight decay
GRAD_CLIP     = 1.0    # max gradient norm passed to clip_grad_norm_
# Audio front-end settings used by audio_to_mel.
SAMPLE_RATE   = 16000  # sample rate requested from librosa.load
N_MELS        = 128    # number of mel bands in the spectrogram
DURA_SEC      = 4.0  # clip duration in seconds
TARGET_LEN    = int(SAMPLE_RATE * DURA_SEC)  # clip length in samples (truncate / zero-pad target)

# Save paths (same folder as CSVs)
MODEL_DIR     = Path(TRAIN_CSV).parent
BEST_PATH     = MODEL_DIR / 'audio_cnn_best.pth'    # state_dict of the best-scoring epoch
REPORT_PATH   = MODEL_DIR / 'audio_cnn_report.txt'  # text classification report

# Create the output folders up front so later saves cannot fail on a missing directory.
CACHE_DIR.mkdir(parents=True, exist_ok=True)
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# --- Mel-Spectrogram extraction ---
def audio_to_mel(path: Path) -> torch.Tensor:
    """Load an audio file and return its normalised log-mel spectrogram.

    The waveform is loaded at ``SAMPLE_RATE``, cut or zero-padded at the end to
    exactly ``TARGET_LEN`` samples, converted to an ``N_MELS``-band mel
    spectrogram in dB (relative to its own peak) and min-max scaled per clip.

    Args:
        path: Location of the audio file to read.

    Returns:
        A float32 tensor of shape ``[N_MELS, T]`` with values in roughly [0, 1].
    """
    waveform, sr = librosa.load(path, sr=SAMPLE_RATE)

    # Fix the clip length: keep the first TARGET_LEN samples, zero-pad the tail if short.
    waveform = waveform[:TARGET_LEN]
    shortfall = TARGET_LEN - len(waveform)
    if shortfall > 0:
        waveform = np.pad(waveform, (0, shortfall))

    power_spec = librosa.feature.melspectrogram(y=waveform, sr=sr, n_mels=N_MELS)
    mel_db = librosa.power_to_db(power_spec, ref=np.max)

    # Per-clip min-max scaling; the epsilon guards against a constant spectrogram.
    lowest = mel_db.min()
    highest = mel_db.max()
    mel_norm = (mel_db - lowest) / (highest - lowest + 1e-6)
    return torch.tensor(mel_norm, dtype=torch.float32)  # [128, T]

# --- Dataset ---
class MelDataset(Dataset):
    """Dataset yielding ``(mel_image, label)`` pairs for the audio CNN.

    Each row of ``df`` must provide ``sample_id`` and ``audio_fake``. The mel
    spectrogram for a sample is computed once from ``AUDIO_ROOT/<sample_id>.wav``
    via ``audio_to_mel`` and cached as ``CACHE_DIR/<sample_id>.pt``; later
    accesses load the cached tensor instead of recomputing it.
    """

    def __init__(self, df: pd.DataFrame):
        # Positional index so that ``iloc[i]`` matches the indices a DataLoader asks for.
        self.df = df.reset_index(drop=True)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, i):
        """Return ``(img, label)``: a ``[1, 128, T]`` float tensor and a 0/1 float scalar.

        Raises:
            FileNotFoundError: if the sample is not cached and its .wav file is missing.
        """
        row = self.df.iloc[i]
        sid = str(row['sample_id'])
        pt_path = CACHE_DIR / f"{sid}.pt"

        if pt_path.exists():
            mel = torch.load(pt_path)                    # [128, T]
        else:
            wav_path = AUDIO_ROOT / f"{sid}.wav"
            if not wav_path.exists():
                raise FileNotFoundError(f"Missing audio for sample {sid}: {wav_path}")
            mel = audio_to_mel(wav_path)                 # [128, T]
            # Save under a temporary name and rename afterwards: if the write is
            # interrupted, no half-written <sid>.pt is left behind to break every
            # later torch.load of this sample.
            tmp_path = pt_path.with_name(f"{pt_path.name}.tmp")
            torch.save(mel, tmp_path)
            tmp_path.replace(pt_path)

        img = mel.unsqueeze(0)                           # [1, 128, T]
        label = torch.tensor(float(row['audio_fake']), dtype=torch.float32)  # 0/1 float
        return img, label

# --- Model ---
class AudioCNN(nn.Module):
    """Small 2-D CNN that maps a mel-spectrogram image to a single logit.

    Three 3x3 conv stages (16, 32, 64 channels) with batch-norm and ReLU; the
    first two are followed by 2x2 max-pooling. The feature map is adaptively
    average-pooled to 4x4, flattened and projected to a 128-d embedding with
    dropout, and a final linear head produces one logit per sample.
    """

    def __init__(self):
        super().__init__()

        # (output channels, whether a 2x2 max-pool follows the stage)
        stage_specs = ((16, True), (32, True), (64, False))

        layers = []
        channels_in = 1
        for channels_out, downsample in stage_specs:
            layers.append(nn.Conv2d(channels_in, channels_out, kernel_size=3, padding=1))
            layers.append(nn.BatchNorm2d(channels_out))
            layers.append(nn.ReLU())
            if downsample:
                layers.append(nn.MaxPool2d(2))
            channels_in = channels_out

        # Fixed 4x4 pooling makes the classifier independent of the time length T.
        layers.extend([
            nn.AdaptiveAvgPool2d((4, 4)),      # -> [64, 4, 4]
            nn.Flatten(),                      # -> [64*4*4]
            nn.Linear(64 * 4 * 4, 128),
            nn.ReLU(),
            nn.Dropout(0.5),
        ])

        # Same layer order as before, so state_dict keys (cnn.0 ... cnn.15) are unchanged.
        self.cnn = nn.Sequential(*layers)
        self.head = nn.Linear(128, 1)

    def forward(self, x):
        """Return one logit per sample, shape ``[B]``, for input ``[B, 1, H, W]``."""
        features = self.cnn(x)
        logits = self.head(features)
        return logits.squeeze(1)  # [B]

# --- Load Data ---
train_df = pd.read_csv(TRAIN_CSV)
test_df  = pd.read_csv(TEST_CSV)

# Sanity check: both splits must carry the id and label columns the dataset reads.
for col in ('sample_id', 'audio_fake'):
    in_both = (col in train_df.columns) and (col in test_df.columns)
    assert in_both, f"Missing column: {col}"

# Show the class balance of each split before training.
train_label_counts = train_df['audio_fake'].value_counts()
test_label_counts = test_df['audio_fake'].value_counts()
print("Train label stats:\n", train_label_counts)
print("Test  label stats:\n", test_label_counts)

# Loader settings shared by both splits; only the shuffling differs.
loader_kwargs = dict(
    batch_size=BATCH_SIZE,
    num_workers=2,
    pin_memory=(DEVICE.type == 'cuda'),
)
train_dl = DataLoader(MelDataset(train_df), shuffle=True, **loader_kwargs)
test_dl  = DataLoader(MelDataset(test_df), shuffle=False, **loader_kwargs)

# --- Training ---
# Seed the RNGs before the model is built so weight initialisation and the
# DataLoader shuffling order are repeatable from run to run (GPU kernels may
# still introduce small nondeterminism).
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

model = AudioCNN().to(DEVICE)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=LR, weight_decay=WD)

# NOTE(review): the best checkpoint is selected on test_dl, and the final report
# is computed on the same data, so the reported score is optimistic. A held-out
# validation split for model selection would fix this.

# Start below any reachable accuracy so the first epoch always writes BEST_PATH;
# the final-report cell then never loads a missing or stale checkpoint.
best_acc = float('-inf')
for epoch in range(1, EPOCHS+1):
    # train
    model.train()
    train_preds, train_labels = [], []
    train_loss_sum, train_seen = 0.0, 0
    for x,y in tqdm(train_dl, desc=f"Epoch {epoch} Train"):
        x,y = x.to(DEVICE, non_blocking=True), y.to(DEVICE, non_blocking=True)
        optimizer.zero_grad(set_to_none=True)
        logits = model(x)                           # [B]
        loss = criterion(logits, y)                 # y: [B]
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
        optimizer.step()

        # criterion returns the batch mean; weight it by the batch size so the
        # (usually smaller) last batch does not skew the epoch average.
        batch_n = y.size(0)
        train_loss_sum += loss.item() * batch_n
        train_seen += batch_n
        train_preds += (torch.sigmoid(logits) > 0.5).detach().cpu().numpy().astype(int).tolist()
        train_labels += y.detach().cpu().numpy().astype(int).tolist()

    train_loss = train_loss_sum / max(train_seen, 1)   # per-sample mean loss
    train_acc = accuracy_score(train_labels, train_preds)
    print(f"→ Train Loss: {train_loss:.4f} Acc: {train_acc:.3f}")

    # eval
    model.eval()
    preds, labels = [], []
    with torch.no_grad():
        for x,y in tqdm(test_dl, desc=f"Epoch {epoch} Test"):
            x,y = x.to(DEVICE, non_blocking=True), y.to(DEVICE, non_blocking=True)
            logits = model(x)
            preds += (torch.sigmoid(logits) > 0.5).cpu().numpy().astype(int).tolist()
            labels += y.cpu().numpy().astype(int).tolist()

    test_acc = accuracy_score(labels, preds)
    print(f"→ Test Acc: {test_acc:.3f}")

    # Keep only the weights of the best-scoring epoch (strict '>' keeps the earliest on ties).
    if test_acc > best_acc:
        best_acc = test_acc
        torch.save(model.state_dict(), BEST_PATH)
        print(f"✓ Saved new best model → {BEST_PATH}")

# --- Final classification report---
# Restore the weights of the best epoch before scoring.
best_state = torch.load(BEST_PATH, map_location=DEVICE)
model.load_state_dict(best_state)
model.eval()

# Collect thresholded predictions and ground-truth labels over the test loader.
preds, labels = [], []
with torch.no_grad():
    for x, y in tqdm(test_dl, desc="Final Test"):
        x = x.to(DEVICE, non_blocking=True)
        logits = model(x)
        is_fake = torch.sigmoid(logits) > 0.5
        preds.extend(is_fake.cpu().numpy().astype(int).tolist())
        labels.extend(y.numpy().astype(int).tolist())   # y never left the CPU

report = classification_report(labels, preds, target_names=["Real","Fake"])
print("\nClassification Report:\n", report)

# Persist the text report next to the checkpoint.
REPORT_PATH.write_text(report)
print(f" Saved report → {REPORT_PATH}")


Train label stats:
 audio_fake
0    971
1    622
Name: count, dtype: int64
Test  label stats:
 audio_fake
0    259
1    148
Name: count, dtype: int64


Epoch 1 Train: 100%|██████████| 100/100 [04:57<00:00,  2.98s/it]


→ Train Loss: 0.5662 Acc: 0.706


Epoch 1 Test: 100%|██████████| 26/26 [01:07<00:00,  2.60s/it]


→ Test Acc: 0.828
✓ Saved new best model → /content/drive/MyDrive/FakeAVCeleb/audio_cnn_best.pth


Epoch 2 Train: 100%|██████████| 100/100 [00:32<00:00,  3.09it/s]


→ Train Loss: 0.2946 Acc: 0.906


Epoch 2 Test: 100%|██████████| 26/26 [00:03<00:00,  7.12it/s]


→ Test Acc: 0.961
✓ Saved new best model → /content/drive/MyDrive/FakeAVCeleb/audio_cnn_best.pth


Epoch 3 Train: 100%|██████████| 100/100 [00:33<00:00,  3.01it/s]


→ Train Loss: 0.1191 Acc: 0.979


Epoch 3 Test: 100%|██████████| 26/26 [00:04<00:00,  6.12it/s]


→ Test Acc: 0.980
✓ Saved new best model → /content/drive/MyDrive/FakeAVCeleb/audio_cnn_best.pth


Epoch 4 Train: 100%|██████████| 100/100 [00:30<00:00,  3.32it/s]


→ Train Loss: 0.0666 Acc: 0.985


Epoch 4 Test: 100%|██████████| 26/26 [00:03<00:00,  6.85it/s]


→ Test Acc: 0.975


Epoch 5 Train: 100%|██████████| 100/100 [00:31<00:00,  3.20it/s]


→ Train Loss: 0.0477 Acc: 0.988


Epoch 5 Test: 100%|██████████| 26/26 [00:05<00:00,  5.08it/s]


→ Test Acc: 0.975


Epoch 6 Train: 100%|██████████| 100/100 [00:31<00:00,  3.13it/s]


→ Train Loss: 0.0406 Acc: 0.991


Epoch 6 Test: 100%|██████████| 26/26 [00:03<00:00,  7.32it/s]


→ Test Acc: 0.978


Epoch 7 Train: 100%|██████████| 100/100 [00:32<00:00,  3.12it/s]


→ Train Loss: 0.0334 Acc: 0.993


Epoch 7 Test: 100%|██████████| 26/26 [00:04<00:00,  5.41it/s]


→ Test Acc: 0.980


Epoch 8 Train: 100%|██████████| 100/100 [00:32<00:00,  3.07it/s]


→ Train Loss: 0.0294 Acc: 0.992


Epoch 8 Test: 100%|██████████| 26/26 [00:03<00:00,  6.79it/s]


→ Test Acc: 0.978


Epoch 9 Train: 100%|██████████| 100/100 [00:31<00:00,  3.13it/s]


→ Train Loss: 0.0284 Acc: 0.993


Epoch 9 Test: 100%|██████████| 26/26 [00:04<00:00,  5.40it/s]


→ Test Acc: 0.985
✓ Saved new best model → /content/drive/MyDrive/FakeAVCeleb/audio_cnn_best.pth


Epoch 10 Train: 100%|██████████| 100/100 [00:31<00:00,  3.21it/s]


→ Train Loss: 0.0297 Acc: 0.994


Epoch 10 Test: 100%|██████████| 26/26 [00:03<00:00,  7.37it/s]


→ Test Acc: 0.985


Final Test: 100%|██████████| 26/26 [00:03<00:00,  7.31it/s]



Classification Report:
               precision    recall  f1-score   support

        Real       1.00      0.98      0.99       259
        Fake       0.96      1.00      0.98       148

    accuracy                           0.99       407
   macro avg       0.98      0.99      0.98       407
weighted avg       0.99      0.99      0.99       407

 Saved report → /content/drive/MyDrive/FakeAVCeleb/audio_cnn_report.txt


In [None]:
def label_shuffle_test():
    """Leakage sanity check: train on shuffled labels, score on the true test labels.

    A fresh ``AudioCNN`` is fitted for one epoch on ``train_df`` features paired
    with a fixed random permutation of the ``audio_fake`` labels, then evaluated
    on ``test_df`` with its real labels. With the feature/label link destroyed,
    accuracy should be close to the test majority-class rate.

    Returns:
        Test-set accuracy of the shuffled-label model, as a float.
    """
    class ShufDS(Dataset):
        """``MelDataset`` features paired with permuted labels."""

        def __init__(self, df):
            # Reuse MelDataset for features so a missing cache entry is rebuilt
            # instead of failing on torch.load.
            self.base = MelDataset(df)
            self.shuf = (
                self.base.df['audio_fake']
                .sample(frac=1.0, random_state=123)   # fixed seed -> same permutation each run
                .to_numpy()
                .astype(int)
            )

        def __len__(self):
            return len(self.base)

        def __getitem__(self, i):
            img, _ = self.base[i]                              # [1, 128, T]; true label discarded
            return img, torch.tensor(self.shuf[i], dtype=torch.float32)

    m = AudioCNN().to(DEVICE)
    opt = torch.optim.AdamW(m.parameters(), lr=1e-4, weight_decay=1e-5)
    crit = nn.BCEWithLogitsLoss()

    # quick 1-epoch fit on shuffled labels
    dl = DataLoader(ShufDS(train_df), batch_size=16, shuffle=True, num_workers=2)
    m.train()
    for xb, yb in dl:
        xb, yb = xb.to(DEVICE), yb.to(DEVICE)
        opt.zero_grad(set_to_none=True)
        loss = crit(m(xb), yb)
        loss.backward()
        opt.step()

    # evaluate on true test labels
    # (the original referenced EvalMelDS, which is not defined anywhere in this
    # notebook; MelDataset already yields (mel, true label) pairs)
    m.eval()
    labels, preds = [], []
    eval_dl = DataLoader(MelDataset(test_df), batch_size=32, shuffle=False, num_workers=2)
    with torch.no_grad():
        for xb, yb in eval_dl:
            xb = xb.to(DEVICE)
            p = torch.sigmoid(m(xb)).cpu().numpy()             # no .detach() needed inside no_grad
            preds += (p > 0.5).astype(int).tolist()
            labels += yb.numpy().astype(int).tolist()
    return accuracy_score(labels, preds)

# Run the shuffled-label sanity check once and report its test accuracy.
shuffle_acc = label_shuffle_test()
print("Label-shuffle acc (should ~= test majority ≈ 0.636):", shuffle_acc)


Label-shuffle acc (should ~= test majority ≈ 0.636): 0.6363636363636364


In [None]:
# Class balance of whichever train/test frames are currently bound in the kernel.
for split_df in (train_df, test_df):
    print(split_df['audio_fake'].value_counts())


audio_fake
0    801
1    799
Name: count, dtype: int64
audio_fake
1    201
0    199
Name: count, dtype: int64


In [None]:
import random
import IPython.display as ipd
from pathlib import Path

# Listen to a few random clips as a quick check that extraction produced real audio.
wav_files = sorted(AUDIO_ROOT.glob("*.wav"))
# random.sample raises ValueError when asked for more items than exist,
# so cap the request at the number of files actually present.
samples = random.sample(wav_files, min(3, len(wav_files)))
if not samples:
    print("No .wav files found in", AUDIO_ROOT)
for s in samples:
    print(s.name)
    display(ipd.Audio(str(s)))


sample_01403.wav


sample_00117.wav


sample_01828.wav


In [None]:
import os
# Compare the ids expected by the split CSVs with the .wav files on disk.
# The audio folder holds clips for both splits, so checking against the train
# ids alone reports every test clip as "extra".
train_ids = set(pd.read_csv(TRAIN_CSV)['sample_id'].astype(str))
test_ids = set(pd.read_csv(TEST_CSV)['sample_id'].astype(str))
expected_ids = train_ids | test_ids
wav_ids = {p.stem for p in Path(AUDIO_ROOT).glob("*.wav")}

missing_ids = sorted(expected_ids - wav_ids)
extra_ids = sorted(wav_ids - expected_ids)

# Print the count plus a short sample instead of dumping a potentially huge set.
print("Missing in audio folder:", len(missing_ids), missing_ids[:10])
print("Extra in audio folder:", len(extra_ids), extra_ids[:10])


Missing in audio folder: set()
Extra in audio folder: {'sample_00585', 'sample_00533', 'sample_01536', 'sample_01199', 'sample_00325', 'sample_00995', 'sample_00210', 'sample_00538', 'sample_01001', 'sample_00927', 'sample_01913', 'sample_00411', 'sample_01698', 'sample_00749', 'sample_00630', 'sample_01968', 'sample_01681', 'sample_00433', 'sample_00656', 'sample_01548', 'sample_01428', 'sample_00936', 'sample_01098', 'sample_01299', 'sample_00145', 'sample_00090', 'sample_01861', 'sample_00476', 'sample_00984', 'sample_01148', 'sample_00797', 'sample_01342', 'sample_01647', 'sample_00702', 'sample_00636', 'sample_01457', 'sample_01733', 'sample_01449', 'sample_01520', 'sample_01719', 'sample_01720', 'sample_01853', 'sample_00899', 'sample_01375', 'sample_01657', 'sample_01694', 'sample_00158', 'sample_00137', 'sample_00770', 'sample_01341', 'sample_00593', 'sample_00700', 'sample_00650', 'sample_01273', 'sample_00157', 'sample_00188', 'sample_00781', 'sample_01196', 'sample_00255', '

In [None]:
import pandas as pd
from pathlib import Path

BASE = Path("/content/drive/MyDrive/FakeAVCeleb")

# Load the split used for the video model and the hash-based split used for audio.
train_vid = pd.read_csv(BASE/"train_fixed_fullschema.csv")
test_vid  = pd.read_csv(BASE/"test_fixed_fullschema.csv")
train_aud = pd.read_csv(BASE/"train_fixed_fullschema_hashsplit.csv")
test_aud  = pd.read_csv(BASE/"test_fixed_fullschema_hashsplit.csv")

# Build each id set once; every comparison below reuses them.
train_vid_ids = set(train_vid['sample_id'])
train_aud_ids = set(train_aud['sample_id'])
test_vid_ids = set(test_vid['sample_id'])
test_aud_ids = set(test_aud['sample_id'])

# Sizes of the two split schemes side by side.
print("Train video:", len(train_vid), "Train audio:", len(train_aud))
print("Test  video:", len(test_vid),  "Test  audio:", len(test_aud))

print("\n--- Train set ---")
print("Exact match:", train_vid_ids == train_aud_ids)
print("Overlap count:", len(train_vid_ids & train_aud_ids))

print("\n--- Test set ---")
print("Exact match:", test_vid_ids == test_aud_ids)
print("Overlap count:", len(test_vid_ids & test_aud_ids))

# Ids that appear in exactly one of the two schemes for a given split.
diff_train = train_vid_ids ^ train_aud_ids
diff_test  = test_vid_ids ^ test_aud_ids

print("\nTrain diff count:", len(diff_train))
print("Test diff count:", len(diff_test))


Train video: 1600 Train audio: 1593
Test  video: 400 Test  audio: 407

--- Train set ---
Exact match: False
Overlap count: 1276

--- Test set ---
Exact match: False
Overlap count: 83

Train diff count: 641
Test diff count: 641


In [None]:
import pandas as pd
from pathlib import Path

BASE = Path("/content/drive/MyDrive/FakeAVCeleb")

# Hash-based train/test split files.
train_hash = pd.read_csv(BASE/"train_fixed_fullschema_hashsplit.csv")
test_hash  = pd.read_csv(BASE/"test_fixed_fullschema_hashsplit.csv")

# Row counts per split and overall.
n_train_hash = len(train_hash)
n_test_hash = len(test_hash)
print("Hashsplit train size:", n_train_hash)
print("Hashsplit test  size:", n_test_hash)
print("Total:", n_train_hash + n_test_hash)

# check label balance
train_hash_counts = train_hash['audio_fake'].value_counts()
test_hash_counts = test_hash['audio_fake'].value_counts()
print("\nTrain label stats:\n", train_hash_counts)
print("\nTest label stats:\n", test_hash_counts)


Hashsplit train size: 1593
Hashsplit test  size: 407
Total: 2000

Train label stats:
 audio_fake
0    971
1    622
Name: count, dtype: int64

Test label stats:
 audio_fake
0    259
1    148
Name: count, dtype: int64
