# üé• FaceForensics++ (C23) Deepfake Detector ‚Äì Local KaggleHub Setup
---
This notebook lets you run the FaceForensics++ deepfake detector **locally** using your **NVIDIA RTX 4060 GPU**.

**Includes:**
- Conda environment setup commands
- KaggleHub dataset caching code
- GPU verification block
- Original model training code (unchanged)


In [None]:
# ===============================================================
# üß± 1Ô∏è‚É£ Conda Environment Setup (run these in Anaconda Prompt)
# ===============================================================
# conda create -n deepfake python=3.10 -y
# conda activate deepfake
# conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia -y
# pip install timm tqdm opencv-python pillow numpy pandas kagglehub


In [7]:
# ===============================================================
# üì• 2Ô∏è‚É£ KaggleHub Dataset Fetcher & Local Caching
# ===============================================================
import kagglehub, os

dataset_path =kagglehub.dataset_download("xdxd003/ff-c23")
os.environ["BASE_PATH"] = dataset_path
print(f"‚úÖ Dataset cached at: {dataset_path}")


‚úÖ Dataset cached at: C:\Users\Abhishek\.cache\kagglehub\datasets\xdxd003\ff-c23\versions\1


In [8]:
# ===============================================================
# ‚öôÔ∏è 3Ô∏è‚É£ GPU Verification
# ===============================================================
import torch

print(f"‚úÖ CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"üéÆ Using GPU: {torch.cuda.get_device_name(0)}")
else:
    print("‚ö†Ô∏è No GPU detected, training will run on CPU.")


‚úÖ CUDA available: True
üéÆ Using GPU: NVIDIA GeForce RTX 4060 Laptop GPU


In [9]:
# ===============================================================
# üîç 4Ô∏è‚É£ Dataset Verification Utility
# ===============================================================
import os

BASE_PATH = os.getenv("BASE_PATH", "./FaceForensics++_C23")

def verify_dataset(base_dir):
    all_videos = []
    for root, _, files in os.walk(base_dir):
        for f in files:
            if f.endswith(".mp4"):
                all_videos.append(os.path.join(root, f))
    print(f"üéû Found {len(all_videos)} videos in {base_dir}")
    real = len([v for v in all_videos if '/original/' in v.lower()])
    fake = len(all_videos) - real
    print(f"‚úÖ Real: {real} | Fake: {fake}")
    return all_videos

video_files = verify_dataset(BASE_PATH)
if not video_files:
    print("‚ö†Ô∏è No videos found. Check your dataset path or KaggleHub cache.")


üéû Found 7000 videos in C:\Users\Abhishek\.cache\kagglehub\datasets\xdxd003\ff-c23\versions\1
‚úÖ Real: 0 | Fake: 7000


In [10]:
# ===============================================================
# üé• 5Ô∏è‚É£ Main Training Script (original, unchanged)
# ===============================================================
import os, cv2, torch, timm, random
import torch.nn as nn
import numpy as np
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
from torchvision import transforms
from PIL import Image
from tqdm import tqdm
from collections import Counter

BASE_PATH = os.getenv("BASE_PATH", "./FaceForensics++_C23")
FRAME_SIZE = (224, 224)
FRAME_SKIP = 15
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def get_label_from_path(path: str) -> int:
    lower = path.lower()
    if "/original/" in lower:
        return 0
    elif any(x in lower for x in [
        "face2face", "faceswap", "deepfakes",
        "faceshifter", "neuraltextures", "deepfakedetection"
    ]):
        return 1
    else:
        return 0

def get_all_videos(base_dir):
    video_files = []
    for root, _, files in os.walk(base_dir):
        for f in files:
            if f.endswith(".mp4"):
                video_files.append(os.path.join(root, f))
    return sorted(video_files)

video_list = get_all_videos(BASE_PATH)
labels = [get_label_from_path(v) for v in video_list]
count = Counter(labels)
print("üìä Dataset Label Breakdown:")
print(f"Real (0): {count[0]} videos")
print(f"Fake (1): {count[1]} videos")
print(f"Total   : {len(video_list)} videos\n")

face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
eye_cascade  = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_eye.xml')

def extract_eyes(frame):
    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    faces = face_cascade.detectMultiScale(gray, 1.1, 4)
    eyes_list = []
    for (x,y,w,h) in faces:
        roi = frame[y:y+h, x:x+w]
        eyes = eye_cascade.detectMultiScale(roi)
        for (ex,ey,ew,eh) in eyes:
            eyes_list.append(roi[ey:ey+eh, ex:ex+ew])
    return eyes_list

transform_full_base = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize([0.5]*3, [0.5]*3)
])

transform_eye = transforms.Compose([
    transforms.Resize((64,64)),
    transforms.ToTensor()
])

transform_aug_real = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    transforms.RandomRotation(10),
    transforms.ToTensor(),
    transforms.Normalize([0.5]*3, [0.5]*3)
])

class FaceForensicsDataset(Dataset):
    def __init__(self, video_list, frame_skip=10):
        self.video_list = video_list
        self.frame_skip = frame_skip

    def __len__(self):
        return len(self.video_list)

    def __getitem__(self, idx):
        path  = self.video_list[idx]
        label = get_label_from_path(path)

        cap = cv2.VideoCapture(path)
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        target_idx = np.random.randint(0, frame_count)
        cap.set(cv2.CAP_PROP_POS_FRAMES, target_idx)
        ret, frame = cap.read()
        cap.release()
        if not ret:
            frame = np.zeros((224,224,3), np.uint8)
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frame = cv2.resize(frame, FRAME_SIZE)

        eyes = extract_eyes(frame)
        eye = eyes[0] if len(eyes)>0 else np.zeros((32,32,3),np.uint8)
        eye_pil   = Image.fromarray(eye)
        frame_pil = Image.fromarray(frame)

        frame_t = transform_aug_real(frame_pil) if label == 0 else transform_full_base(frame_pil)
        eye_t   = transform_eye(eye_pil)

        return {"frame": frame_t, "eye": eye_t, "label": torch.tensor(label, dtype=torch.long)}

class RegionCNN(nn.Module):
    def __init__(self,in_ch=3,out_dim=128):
        super().__init__()
        self.net = nn.Sequential(
            nn.Conv2d(in_ch,32,3,1,1), nn.BatchNorm2d(32), nn.ReLU(), nn.MaxPool2d(2),
            nn.Conv2d(32,64,3,1,1), nn.BatchNorm2d(64), nn.ReLU(), nn.MaxPool2d(2),
            nn.Conv2d(64,128,3,1,1), nn.BatchNorm2d(128), nn.ReLU(), nn.AdaptiveAvgPool2d(1)
        )
        self.fc = nn.Linear(128,out_dim)
    def forward(self,x):
        if x.ndim==3: x=x.unsqueeze(0)
        f=self.net(x).view(x.size(0),-1)
        return self.fc(f)

class HybridDetector(nn.Module):
    def __init__(self, swin_name="swin_tiny_patch4_window7_224", num_classes=2):
        super().__init__()
        self.swin = timm.create_model(swin_name, pretrained=True, num_classes=0)
        swin_dim = self.swin.num_features
        self.eye_net = RegionCNN(out_dim=128)
        self.classifier = nn.Sequential(
            nn.Linear(swin_dim+128,512), nn.ReLU(), nn.Dropout(0.3),
            nn.Linear(512,num_classes)
        )
    def forward(self,frame,eye):
        s = self.swin(frame)
        e = self.eye_net(eye)
        x = torch.cat([s,e],1)
        return self.classifier(x)

dataset = FaceForensicsDataset(video_list, FRAME_SKIP)
dataset_size = len(dataset)
indices = list(range(dataset_size))
split = int(0.8 * dataset_size)
np.random.shuffle(indices)
train_indices, val_indices = indices[:split], indices[split:]

train_sampler = SubsetRandomSampler(train_indices)
val_sampler   = SubsetRandomSampler(val_indices)

train_loader = DataLoader(dataset, batch_size=4, sampler=train_sampler, collate_fn=lambda x:[i for i in x if i])
val_loader   = DataLoader(dataset, batch_size=4, sampler=val_sampler, collate_fn=lambda x:[i for i in x if i])

print(f"‚úÖ Train videos: {len(train_indices)} | Val videos: {len(val_indices)}")

def accuracy(pred,lab):
    _,p=torch.max(pred,1)
    return (p==lab).float().mean().item()

class EarlyStopper:
    def __init__(self,patience=5,delta=1e-3):
        self.patience, self.delta, self.counter, self.best = patience, delta, 0, np.inf
        self.stop=False
    def check(self,loss):
        if loss < self.best - self.delta:
            self.best, self.counter = loss, 0
        else:
            self.counter += 1
            if self.counter>=self.patience: self.stop=True

model = HybridDetector().to(DEVICE)
opt   = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-4)
sch   = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=10)
crit  = nn.CrossEntropyLoss()
stopper = EarlyStopper(patience=5)
EPOCHS=30
best_acc=0

for ep in range(EPOCHS):
    model.train(); tl,ta=0,0
    for b in train_loader:
        fr=torch.stack([d["frame"] for d in b]).to(DEVICE)
        ey=torch.stack([d["eye"] for d in b]).to(DEVICE)
        lb=torch.stack([d["label"] for d in b]).long().to(DEVICE)
        opt.zero_grad()
        out=model(fr,ey)
        loss=crit(out,lb)
        loss.backward(); opt.step()
        tl+=loss.item(); ta+=accuracy(out,lb)
    tl/=len(train_loader); ta/=len(train_loader)

    model.eval(); vl,va=0,0
    with torch.no_grad():
        for b in val_loader:
            fr=torch.stack([d["frame"] for d in b]).to(DEVICE)
            ey=torch.stack([d["eye"] for d in b]).to(DEVICE)
            lb=torch.stack([d["label"] for d in b]).long().to(DEVICE)
            out=model(fr,ey)
            loss=crit(out,lb)
            vl+=loss.item(); va+=accuracy(out,lb)
    vl/=len(val_loader); va/=len(val_loader)
    sch.step()
    print(f"üìÜ Epoch {ep+1}/{EPOCHS} | Train {tl:.4f}/{ta*100:.2f}% | Val {vl:.4f}/{va*100:.2f}%")

    if va>best_acc:
        best_acc=va
        torch.save(model.state_dict(),"best_balanced_sampler_model.pth")
    stopper.check(vl)
    if stopper.stop:
        print("‚õî Early stopping.")
        break

print(f"üèÅ Training complete. Best Val Acc = {best_acc*100:.2f}%")


üìä Dataset Label Breakdown:
Real (0): 1000 videos
Fake (1): 6000 videos
Total   : 7000 videos

‚úÖ Train videos: 5600 | Val videos: 1400


KeyboardInterrupt: 

attempt at model training again

In [23]:
# ===============================================================
# üé• Deepfake Detection Training (Balanced + Augmented)
# ===============================================================
import os, cv2, torch, timm, random
import torch.nn as nn
import numpy as np
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
from torchvision import transforms
from PIL import Image
from tqdm import tqdm
from collections import Counter

# ---------------- CONFIG ----------------
BASE_PATH = os.getenv("BASE_PATH", "./FaceForensics++_C23")
FRAME_SIZE = (224, 224)
FRAME_SKIP = 15
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ---------------- LABEL LOGIC ----------------
def get_label_from_path(path: str) -> int:
    lower = path.lower()
    if "/original/" in lower:
        return 0
    elif any(x in lower for x in [
        "face2face", "faceswap", "deepfakes",
        "faceshifter", "neuraltextures", "deepfakedetection"
    ]):
        return 1
    else:
        return 0

def get_all_videos(base_dir):
    video_files = []
    for root, _, files in os.walk(base_dir):
        for f in files:
            if f.endswith(".mp4"):
                video_files.append(os.path.join(root, f))
    return sorted(video_files)

video_list = get_all_videos(BASE_PATH)
labels = [get_label_from_path(v) for v in video_list]
count = Counter(labels)
print("üìä Dataset Label Breakdown:")
print(f"Real (0): {count[0]} videos")
print(f"Fake (1): {count[1]} videos")
print(f"Total   : {len(video_list)} videos\n")

# ---------------- EYE DETECTION ----------------
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
eye_cascade  = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_eye.xml')

def extract_eyes(frame):
    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    faces = face_cascade.detectMultiScale(gray, 1.1, 4)
    eyes_list = []
    for (x,y,w,h) in faces:
        roi = frame[y:y+h, x:x+w]
        eyes = eye_cascade.detectMultiScale(roi)
        for (ex,ey,ew,eh) in eyes:
            eyes_list.append(roi[ey:ey+eh, ex:ex+ew])
    return eyes_list

# ---------------- TRANSFORMS ----------------
transform_full_base = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize([0.5]*3, [0.5]*3)
])

transform_eye = transforms.Compose([
    transforms.Resize((64,64)),
    transforms.ToTensor()
])

transform_aug_real = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    transforms.RandomRotation(10),
    transforms.ToTensor(),
    transforms.Normalize([0.5]*3, [0.5]*3)
])

# ---------------- DATASET ----------------
class FaceForensicsDataset(Dataset):
    def __init__(self, video_list, frame_skip=10):
        self.video_list = video_list
        self.frame_skip = frame_skip

    def __len__(self):
        return len(self.video_list)

    def __getitem__(self, idx):
        path  = self.video_list[idx]
        label = get_label_from_path(path)

        cap = cv2.VideoCapture(path)
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        target_idx = np.random.randint(0, frame_count)
        cap.set(cv2.CAP_PROP_POS_FRAMES, target_idx)
        ret, frame = cap.read()
        cap.release()
        if not ret:
            frame = np.zeros((224,224,3), np.uint8)
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frame = cv2.resize(frame, FRAME_SIZE)

        eyes = extract_eyes(frame)
        eye = eyes[0] if len(eyes)>0 else np.zeros((32,32,3),np.uint8)
        eye_pil   = Image.fromarray(eye)
        frame_pil = Image.fromarray(frame)

        frame_t = transform_aug_real(frame_pil) if label == 0 else transform_full_base(frame_pil)
        eye_t   = transform_eye(eye_pil)

        return {"frame": frame_t, "eye": eye_t, "label": torch.tensor(label, dtype=torch.long)}

# ---------------- MODEL ----------------
class RegionCNN(nn.Module):
    def __init__(self,in_ch=3,out_dim=128):
        super().__init__()
        self.net = nn.Sequential(
            nn.Conv2d(in_ch,32,3,1,1), nn.BatchNorm2d(32), nn.ReLU(), nn.MaxPool2d(2),
            nn.Conv2d(32,64,3,1,1), nn.BatchNorm2d(64), nn.ReLU(), nn.MaxPool2d(2),
            nn.Conv2d(64,128,3,1,1), nn.BatchNorm2d(128), nn.ReLU(), nn.AdaptiveAvgPool2d(1)
        )
        self.fc = nn.Linear(128,out_dim)
    def forward(self,x):
        if x.ndim==3: x=x.unsqueeze(0)
        f=self.net(x).view(x.size(0),-1)
        return self.fc(f)

class HybridDetector(nn.Module):
    def __init__(self, swin_name="swin_tiny_patch4_window7_224", num_classes=2):
        super().__init__()
        self.swin = timm.create_model(swin_name, pretrained=True, num_classes=0)
        swin_dim = self.swin.num_features
        self.eye_net = RegionCNN(out_dim=128)
        self.classifier = nn.Sequential(
            nn.Linear(swin_dim+128,512), nn.ReLU(), nn.Dropout(0.3),
            nn.Linear(512,num_classes)
        )
    def forward(self,frame,eye):
        s = self.swin(frame)
        e = self.eye_net(eye)
        x = torch.cat([s,e],1)
        return self.classifier(x)

# ---------------- DATA SPLIT ----------------
dataset = FaceForensicsDataset(video_list, FRAME_SKIP)
dataset_size = len(dataset)
indices = list(range(dataset_size))
split = int(0.8 * dataset_size)
np.random.shuffle(indices)
train_indices, val_indices = indices[:split], indices[split:]

train_sampler = SubsetRandomSampler(train_indices)
val_sampler   = SubsetRandomSampler(val_indices)

train_loader = DataLoader(dataset, batch_size=4, sampler=train_sampler, collate_fn=lambda x:[i for i in x if i])
val_loader   = DataLoader(dataset, batch_size=4, sampler=val_sampler, collate_fn=lambda x:[i for i in x if i])

print(f"‚úÖ Train videos: {len(train_indices)} | Val videos: {len(val_indices)}")

# ---------------- LOSS BALANCING ----------------
labels = [get_label_from_path(v) for v in video_list]
class_counts = np.bincount(labels)
weights = torch.tensor(1.0 / class_counts, dtype=torch.float32).to(DEVICE)
crit = nn.CrossEntropyLoss(weight=weights)

# ---------------- TRAINING CONFIG ----------------
model = HybridDetector().to(DEVICE)
opt   = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-4)
sch   = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=10)

class EarlyStopper:
    def __init__(self,patience=5,delta=1e-3):
        self.patience, self.delta, self.counter, self.best = patience, delta, 0, np.inf
        self.stop=False
    def check(self,loss):
        if loss < self.best - self.delta:
            self.best, self.counter = loss, 0
        else:
            self.counter += 1
            if self.counter>=self.patience: self.stop=True

stopper = EarlyStopper(patience=5)
EPOCHS=30
best_acc=0

# ---------------- TRAINING LOOP ----------------
def accuracy(pred,lab):
    _,p=torch.max(pred,1)
    return (p==lab).float().mean().item()

for ep in range(EPOCHS):
    model.train(); tl,ta=0,0
    for b in train_loader:
        fr=torch.stack([d["frame"] for d in b]).to(DEVICE)
        ey=torch.stack([d["eye"] for d in b]).to(DEVICE)
        lb=torch.stack([d["label"] for d in b]).long().to(DEVICE)
        opt.zero_grad()
        out=model(fr,ey)
        loss=crit(out,lb)
        loss.backward(); opt.step()
        tl+=loss.item(); ta+=accuracy(out,lb)
    tl/=len(train_loader); ta/=len(train_loader)

    model.eval(); vl,va=0,0
    with torch.no_grad():
        for b in val_loader:
            fr=torch.stack([d["frame"] for d in b]).to(DEVICE)
            ey=torch.stack([d["eye"] for d in b]).to(DEVICE)
            lb=torch.stack([d["label"] for d in b]).long().to(DEVICE)
            out=model(fr,ey)
            loss=crit(out,lb)
            vl+=loss.item(); va+=accuracy(out,lb)
    vl/=len(val_loader); va/=len(val_loader)
    sch.step()
    print(f"üìÜ Epoch {ep+1}/{EPOCHS} | Train {tl:.4f}/{ta*100:.2f}% | Val {vl:.4f}/{va*100:.2f}%")

    if va>best_acc:
        best_acc=va
        torch.save(model.state_dict(),"best_balanced_fixed_model.pth")
    stopper.check(vl)
    if stopper.stop:
        print("‚õî Early stopping triggered.")
        break

print(f"üèÅ Training complete. Best Val Acc = {best_acc*100:.2f}%")


üìä Dataset Label Breakdown:
Real (0): 1000 videos
Fake (1): 6000 videos
Total   : 7000 videos

‚úÖ Train videos: 5600 | Val videos: 1400
üìÜ Epoch 1/30 | Train 0.6571/84.77% | Val 0.6365/86.71%
üìÜ Epoch 2/30 | Train 0.6428/85.45% | Val 0.6207/86.71%
üìÜ Epoch 3/30 | Train 0.6411/85.46% | Val 0.6160/86.71%
üìÜ Epoch 4/30 | Train 0.6467/85.46% | Val 0.6195/86.71%
üìÜ Epoch 5/30 | Train 0.6412/85.46% | Val 0.6160/86.71%
üìÜ Epoch 6/30 | Train 0.6417/85.34% | Val 0.6185/86.71%
üìÜ Epoch 7/30 | Train 0.6505/85.38% | Val 0.6240/86.71%
üìÜ Epoch 8/30 | Train 0.6371/85.43% | Val 0.6215/86.71%
‚õî Early stopping triggered.
üèÅ Training complete. Best Val Acc = 86.71%


In [22]:
# ===============================================================
# üé• Deepfake Detection ‚Äì Balanced Hybrid Model Training
# ===============================================================

import os, cv2, torch, timm, random
import torch.nn as nn
import numpy as np
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from torchvision import transforms
from PIL import Image
from tqdm import tqdm
from collections import Counter
from sklearn.utils.class_weight import compute_class_weight

# ---------------- CONFIG ----------------
BASE_PATH = os.getenv("BASE_PATH", "./FaceForensics++_C23")   # update path if needed
FRAME_SIZE = (224, 224)
FRAME_SKIP = 15
DEVICE     = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_SIZE = 4
EPOCHS     = 30
print(f"üöÄ Using device: {DEVICE}")

# ---------------- LABEL MAPPING ----------------
def get_label_from_path(path: str) -> int:
    """Return 0 for Real videos and 1 for Fake ones."""
    lower = path.lower()
    if "original" in lower:
        return 0
    elif any(x in lower for x in [
        "face2face", "faceswap", "deepfakes",
        "faceshifter", "neuraltextures", "deepfakedetection"
    ]):
        return 1
    else:
        return 0

def get_all_videos(base_dir):
    video_files = []
    for root, _, files in os.walk(base_dir):
        for f in files:
            if f.endswith(".mp4"):
                video_files.append(os.path.join(root, f))
    return sorted(video_files)

video_list = get_all_videos(BASE_PATH)
labels = [get_label_from_path(v) for v in video_list]
count = Counter(labels)
print("üìä Dataset Label Breakdown:")
print(f"Real (0): {count[0]} videos")
print(f"Fake (1): {count[1]} videos")
print(f"Total   : {len(video_list)} videos\n")

# ---------------- HAAR CASCADE FOR EYES ----------------
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
eye_cascade  = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_eye.xml')

def extract_eyes(frame):
    """Detect eye regions using Haar Cascade."""
    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    faces = face_cascade.detectMultiScale(gray, 1.1, 4)
    eyes_list = []
    for (x,y,w,h) in faces:
        roi = frame[y:y+h, x:x+w]
        eyes = eye_cascade.detectMultiScale(roi)
        for (ex,ey,ew,eh) in eyes:
            eyes_list.append(roi[ey:ey+eh, ex:ex+ew])
    return eyes_list

# ---------------- TRANSFORMS ----------------
transform_frame = transforms.Compose([
    transforms.Resize(FRAME_SIZE),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    transforms.RandomRotation(10),
    transforms.ToTensor(),
    transforms.Normalize([0.5]*3, [0.5]*3)
])

transform_eye = transforms.Compose([
    transforms.Resize((64,64)),
    transforms.ToTensor(),
    transforms.Normalize([0.5]*3, [0.5]*3)
])

# ---------------- DATASET ----------------
class FaceForensicsDataset(Dataset):
    def __init__(self, video_list, frame_skip=10):
        self.video_list = video_list
        self.frame_skip = frame_skip

    def __len__(self):
        return len(self.video_list)

    def __getitem__(self, idx):
        path  = self.video_list[idx]
        label = get_label_from_path(path)

        cap = cv2.VideoCapture(path)
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        target_idx = np.random.randint(0, frame_count)
        cap.set(cv2.CAP_PROP_POS_FRAMES, target_idx)
        ret, frame = cap.read()
        cap.release()

        if not ret:
            frame = np.zeros((224,224,3), np.uint8)

        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frame = cv2.resize(frame, FRAME_SIZE)

        eyes = extract_eyes(frame)
        eye = eyes[0] if len(eyes) > 0 else np.zeros((32,32,3), np.uint8)
        eye_pil   = Image.fromarray(eye)
        frame_pil = Image.fromarray(frame)

        frame_t = transform_frame(frame_pil)
        eye_t   = transform_eye(eye_pil)

        return {"frame": frame_t, "eye": eye_t, "label": torch.tensor(label, dtype=torch.long)}

dataset = FaceForensicsDataset(video_list, FRAME_SKIP)

# ---------------- SPLIT & SAMPLER ----------------
dataset_size = len(dataset)
indices = list(range(dataset_size))
split = int(0.8 * dataset_size)
np.random.shuffle(indices)
train_indices, val_indices = indices[:split], indices[split:]
# ---------------- FIXED CLASS WEIGHT CALCULATION ----------------
train_labels = np.array([int(labels[i]) for i in train_indices], dtype=int)

unique_classes = np.unique(train_labels).astype(int)
class_weights = compute_class_weight(class_weight='balanced',
                                     classes=unique_classes,
                                     y=train_labels)

# Map class weights correctly to dataset indices
label_to_weight = {cls: w for cls, w in zip(unique_classes, class_weights)}
weights = [label_to_weight[int(labels[i])] for i in train_indices]

sampler = WeightedRandomSampler(weights, num_samples=len(weights), replacement=True)

print(f"‚öñÔ∏è Class Weights: {label_to_weight}")

print(f"‚úÖ Train videos: {len(train_indices)} | Val videos: {len(val_indices)}")

# ---------------- MODEL ----------------
class RegionCNN(nn.Module):
    def __init__(self,in_ch=3,out_dim=128):
        super().__init__()
        self.net = nn.Sequential(
            nn.Conv2d(in_ch,32,3,1,1), nn.BatchNorm2d(32), nn.ReLU(), nn.MaxPool2d(2),
            nn.Conv2d(32,64,3,1,1), nn.BatchNorm2d(64), nn.ReLU(), nn.MaxPool2d(2),
            nn.Conv2d(64,128,3,1,1), nn.BatchNorm2d(128), nn.ReLU(), nn.AdaptiveAvgPool2d(1)
        )
        self.fc = nn.Linear(128,out_dim)
    def forward(self,x):
        if x.ndim==3: x=x.unsqueeze(0)
        f=self.net(x).view(x.size(0),-1)
        return self.fc(f)

class HybridDetector(nn.Module):
    def __init__(self, swin_name="swin_tiny_patch4_window7_224", num_classes=2):
        super().__init__()
        self.swin = timm.create_model(swin_name, pretrained=True, num_classes=0)
        swin_dim = self.swin.num_features
        self.eye_net = RegionCNN(out_dim=128)
        self.classifier = nn.Sequential(
            nn.Linear(swin_dim+128,512), nn.ReLU(), nn.Dropout(0.3),
            nn.Linear(512,num_classes)
        )
    def forward(self,frame,eye):
        s = self.swin(frame)
        e = self.eye_net(eye)
        x = torch.cat([s,e],1)
        return self.classifier(x)

# ---------------- TRAINING UTILS ----------------
def accuracy(pred,lab):
    _,p=torch.max(pred,1)
    return (p==lab).float().mean().item()

class EarlyStopper:
    def __init__(self,patience=5,delta=1e-3):
        self.patience, self.delta, self.counter, self.best = patience, delta, 0, np.inf
        self.stop=False
    def check(self,loss):
        if loss < self.best - self.delta:
            self.best, self.counter = loss, 0
        else:
            self.counter += 1
            if self.counter>=self.patience: self.stop=True

# ---------------- INITIALIZE MODEL ----------------
model = HybridDetector().to(DEVICE)
opt   = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-4)
crit  = nn.CrossEntropyLoss(weight=torch.tensor(class_weights, dtype=torch.float).to(DEVICE))
sch   = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=10)
stopper = EarlyStopper(patience=5)

best_acc=0

# ---------------- TRAIN LOOP ----------------
for ep in range(EPOCHS):
    model.train(); tl,ta=0,0
    for b in tqdm(train_loader, desc=f"Epoch {ep+1}/{EPOCHS}"):
        fr=torch.stack([d["frame"] for d in b]).to(DEVICE)
        ey=torch.stack([d["eye"] for d in b]).to(DEVICE)
        lb=torch.stack([d["label"] for d in b]).long().to(DEVICE)

        opt.zero_grad()
        out=model(fr,ey)
        loss=crit(out,lb)
        loss.backward()
        opt.step()

        tl+=loss.item(); ta+=accuracy(out,lb)
    tl/=len(train_loader); ta/=len(train_loader)

    model.eval(); vl,va=0,0
    with torch.no_grad():
        for b in val_loader:
            fr=torch.stack([d["frame"] for d in b]).to(DEVICE)
            ey=torch.stack([d["eye"] for d in b]).to(DEVICE)
            lb=torch.stack([d["label"] for d in b]).long().to(DEVICE)
            out=model(fr,ey)
            loss=crit(out,lb)
            vl+=loss.item(); va+=accuracy(out,lb)
    vl/=len(val_loader); va/=len(val_loader)
    sch.step()

    print(f"üìÜ Epoch {ep+1}/{EPOCHS} | Train {tl:.4f}/{ta*100:.2f}% | Val {vl:.4f}/{va*100:.2f}%")

    if va>best_acc:
        best_acc=va
        torch.save(model.state_dict(),"best_balanced_model.pth")
        print(f"üíæ Saved model with Val Acc: {va*100:.2f}%")

    stopper.check(vl)
    if stopper.stop:
        print("‚õî Early stopping.")
        break

print(f"üèÅ Training complete. Best Val Acc = {best_acc*100:.2f}%")


üöÄ Using device: cuda
üìä Dataset Label Breakdown:
Real (0): 1000 videos
Fake (1): 6000 videos
Total   : 7000 videos

‚öñÔ∏è Class Weights: {np.int64(0): np.float64(3.4313725490196076), np.int64(1): np.float64(0.5852842809364549)}
‚úÖ Train videos: 5600 | Val videos: 1400


Epoch 1/30: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1400/1400 [15:22<00:00,  1.52it/s]


üìÜ Epoch 1/30 | Train 0.6555/84.64% | Val 0.6370/85.14%
üíæ Saved model with Val Acc: 85.14%


Epoch 2/30: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1400/1400 [11:17<00:00,  2.07it/s]


üìÜ Epoch 2/30 | Train 0.6405/85.55% | Val 0.6395/85.14%


Epoch 3/30: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1400/1400 [11:10<00:00,  2.09it/s]


üìÜ Epoch 3/30 | Train 0.6388/85.86% | Val 0.6347/85.14%


Epoch 4/30: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1400/1400 [11:11<00:00,  2.09it/s]


üìÜ Epoch 4/30 | Train 0.6327/85.86% | Val 0.7416/85.14%


Epoch 5/30: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1400/1400 [11:13<00:00,  2.08it/s]


üìÜ Epoch 5/30 | Train 0.6567/84.89% | Val 0.6518/85.14%


Epoch 6/30: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1400/1400 [11:20<00:00,  2.06it/s]


üìÜ Epoch 6/30 | Train 0.6373/85.36% | Val 0.6388/85.14%


Epoch 7/30: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1400/1400 [11:25<00:00,  2.04it/s]


üìÜ Epoch 7/30 | Train 0.6318/85.71% | Val 0.6429/85.14%


Epoch 8/30: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1400/1400 [11:25<00:00,  2.04it/s]


üìÜ Epoch 8/30 | Train 0.6331/85.86% | Val 0.6385/85.14%
‚õî Early stopping.
üèÅ Training complete. Best Val Acc = 85.14%


In [None]:
# ===============================================================
# üì• Load the Trained Model
# ===============================================================
!pip install scikit-learn
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, roc_auc_score
)
import numpy as np
from tqdm import tqdm

# Initialize the same architecture
model = HybridDetector().to(DEVICE)
model.load_state_dict(torch.load("best_balanced_sampler_model.pth", map_location=DEVICE))
model.eval()
print("‚úÖ Loaded model: best_balanced_sampler_model.pth")

# ===============================================================
# üìä Evaluate on Validation Set
# ===============================================================
val_loader = DataLoader(
    dataset,
    batch_size=4,
    sampler=val_sampler,
    collate_fn=lambda x: [i for i in x if i]  # FIXED: ensures each sample is a dict
)

y_true, y_pred, y_prob = [], [], []

with torch.no_grad():
    for b in tqdm(val_loader, desc="Evaluating Validation Set"):
        fr = torch.stack([d["frame"] for d in b]).to(DEVICE)
        ey = torch.stack([d["eye"] for d in b]).to(DEVICE)
        lb = torch.stack([d["label"] for d in b]).long().to(DEVICE)

        outputs = model(fr, ey)
        probs = torch.softmax(outputs, dim=1)[:, 1]
        preds = torch.argmax(outputs, dim=1)

        y_true.extend(lb.cpu().numpy())
        y_pred.extend(preds.cpu().numpy())
        y_prob.extend(probs.cpu().numpy())

# ===============================================================
# üßÆ Compute Metrics
# ===============================================================
acc  = accuracy_score(y_true, y_pred)
prec = precision_score(y_true, y_pred)
rec  = recall_score(y_true, y_pred)
f1   = f1_score(y_true, y_pred)
auc  = roc_auc_score(y_true, y_prob)
cm   = confusion_matrix(y_true, y_pred)

print("\n========== Evaluation Results ==========")
print(f"Accuracy     : {acc*100:.2f}%")
print(f"Precision    : {prec*100:.2f}%")
print(f"Recall       : {rec*100:.2f}%")
print(f"F1 Score     : {f1*100:.2f}%")
print(f"AUC Score    : {auc:.4f}")
print(f"Confusion Matrix:\n{cm}")
print("========================================")




  model.load_state_dict(torch.load("best_balanced_sampler_model.pth", map_location=DEVICE))


‚úÖ Loaded model: best_balanced_sampler_model.pth


Evaluating Validation Set: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 350/350 [02:48<00:00,  2.07it/s]


Accuracy     : 85.14%
Precision    : 85.14%
Recall       : 100.00%
F1 Score     : 91.98%
AUC Score    : 0.5646
Confusion Matrix:
[[   0  208]
 [   0 1192]]





In [12]:
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, roc_auc_score
)
import numpy as np
from tqdm import tqdm

# Initialize the same architecture
model = HybridDetector().to(DEVICE)
model.load_state_dict(torch.load("best_balanced_fixed_model.pth", map_location=DEVICE))
model.eval()
print("‚úÖ Loaded model: best_balanced_fixed_model.pth")

# ===============================================================
# üìä Evaluate on Validation Set
# ===============================================================
val_loader = DataLoader(
    dataset,
    batch_size=4,
    sampler=val_sampler,
    collate_fn=lambda x: [i for i in x if i]  # FIXED: ensures each sample is a dict
)

y_true, y_pred, y_prob = [], [], []

with torch.no_grad():
    for b in tqdm(val_loader, desc="Evaluating Validation Set"):
        fr = torch.stack([d["frame"] for d in b]).to(DEVICE)
        ey = torch.stack([d["eye"] for d in b]).to(DEVICE)
        lb = torch.stack([d["label"] for d in b]).long().to(DEVICE)

        outputs = model(fr, ey)
        probs = torch.softmax(outputs, dim=1)[:, 1]
        preds = torch.argmax(outputs, dim=1)

        y_true.extend(lb.cpu().numpy())
        y_pred.extend(preds.cpu().numpy())
        y_prob.extend(probs.cpu().numpy())

# ===============================================================
# üßÆ Compute Metrics
# ===============================================================
acc  = accuracy_score(y_true, y_pred)
prec = precision_score(y_true, y_pred)
rec  = recall_score(y_true, y_pred)
f1   = f1_score(y_true, y_pred)
auc  = roc_auc_score(y_true, y_prob)
cm   = confusion_matrix(y_true, y_pred)

print("\n========== Evaluation Results ==========")
print(f"Accuracy     : {acc*100:.2f}%")
print(f"Precision    : {prec*100:.2f}%")
print(f"Recall       : {rec*100:.2f}%")
print(f"F1 Score     : {f1*100:.2f}%")
print(f"AUC Score    : {auc:.4f}")
print(f"Confusion Matrix:\n{cm}")
print("========================================")

  model.load_state_dict(torch.load("best_balanced_fixed_model.pth", map_location=DEVICE))


‚úÖ Loaded model: best_balanced_fixed_model.pth


Evaluating Validation Set: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 350/350 [02:27<00:00,  2.37it/s]


Accuracy     : 86.57%
Precision    : 86.57%
Recall       : 100.00%
F1 Score     : 92.80%
AUC Score    : 0.5565
Confusion Matrix:
[[   0  188]
 [   0 1212]]





In [21]:
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, roc_auc_score
)
import numpy as np
from tqdm import tqdm

# Initialize the same architecture
model = HybridDetector().to(DEVICE)
model.load_state_dict(torch.load("best_balanced_model.pth", map_location=DEVICE))
model.eval()
print("‚úÖ Loaded model: best_balanced_model.pth")

# ===============================================================
# üìä Evaluate on Validation Set
# ===============================================================
val_loader = DataLoader(
    dataset,
    batch_size=4,
    sampler=val_sampler,
    collate_fn=lambda x: [i for i in x if i]  # FIXED: ensures each sample is a dict
)

y_true, y_pred, y_prob = [], [], []

with torch.no_grad():
    for b in tqdm(val_loader, desc="Evaluating Validation Set"):
        fr = torch.stack([d["frame"] for d in b]).to(DEVICE)
        ey = torch.stack([d["eye"] for d in b]).to(DEVICE)
        lb = torch.stack([d["label"] for d in b]).long().to(DEVICE)

        outputs = model(fr, ey)
        probs = torch.softmax(outputs, dim=1)[:, 1]
        preds = torch.argmax(outputs, dim=1)

        y_true.extend(lb.cpu().numpy())
        y_pred.extend(preds.cpu().numpy())
        y_prob.extend(probs.cpu().numpy())

# ===============================================================
# üßÆ Compute Metrics
# ===============================================================
acc  = accuracy_score(y_true, y_pred)
prec = precision_score(y_true, y_pred)
rec  = recall_score(y_true, y_pred)
f1   = f1_score(y_true, y_pred)
auc  = roc_auc_score(y_true, y_prob)
cm   = confusion_matrix(y_true, y_pred)

print("\n========== Evaluation Results ==========")
print(f"Accuracy     : {acc*100:.2f}%")
print(f"Precision    : {prec*100:.2f}%")
print(f"Recall       : {rec*100:.2f}%")
print(f"F1 Score     : {f1*100:.2f}%")
print(f"AUC Score    : {auc:.4f}")
print(f"Confusion Matrix:\n{cm}")
print("========================================")

  model.load_state_dict(torch.load("best_balanced_model.pth", map_location=DEVICE))


‚úÖ Loaded model: best_balanced_model.pth


Evaluating Validation Set: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 350/350 [03:48<00:00,  1.53it/s]


Accuracy     : 85.14%
Precision    : 85.14%
Recall       : 100.00%
F1 Score     : 91.98%
AUC Score    : 0.5561
Confusion Matrix:
[[   0  208]
 [   0 1192]]



