In [None]:
import json

def _id_block(start, stop):
    """Return the zero-padded, three-digit IDs for ``range(start, stop)``."""
    return [f"{idx:03d}" for idx in range(start, stop)]


# Per class: (end of the train range, end of the test range).
# Train IDs run from 0 up to the first bound; test IDs continue from there
# up to the second bound (both bounds exclusive).
_SPLIT_BOUNDS = {
    "original": (120, 148),
    "deepfakes": (40, 47),
    "face2face": (30, 37),
    "faceshifter": (30, 37),
    "faceswap": (30, 37),
}

train_test_ids = {"train": {}, "test": {}}
for _name, (_train_end, _test_end) in _SPLIT_BOUNDS.items():
    train_test_ids["train"][_name] = _id_block(0, _train_end)
    train_test_ids["test"][_name] = _id_block(_train_end, _test_end)

# Persist the split so the download script can read it via --video_ids.
with open("train_test_ids.json", "w") as f:
    json.dump(train_test_ids, f)

print("✅ train_test_ids.json is completed successfully!")

✅ train_test_ids.json is completed successfully!


In [None]:
# Prompt for a file upload through google.colab. In the recorded run the
# uploaded file was faceforensics_download_v4.py; the %%writefile cells further
# down write a file with that same name.
from google.colab import files
files.upload()

Saving faceforensics_download_v4.py to faceforensics_download_v4.py




In [None]:
%%writefile faceforensics_download_v4.py

Overwriting faceforensics_download_v4.py


In [None]:
%%writefile faceforensics_download_v4.py
import argparse
import os
import urllib.request
import tempfile
import time
import sys
import json
from tqdm import tqdm
from os.path import join
import ast

# --- Constants ---------------------------------------------------------------

# FILELIST_URL is appended to the server base URL in main(). The other two
# names are not referenced elsewhere in the visible script.
FILELIST_URL = 'misc/filelist.json'
DEEPFEAKES_DETECTION_URL = 'misc/deepfake_detection_filenames.json'
DEEPFAKES_MODEL_NAMES = ['decoder_A.h5', 'decoder_B.h5', 'encoder.h5']

# Dataset name accepted by -d/--dataset -> directory below the base URL.
DATASETS = dict(
    original='original_sequences/youtube',
    Deepfakes='manipulated_sequences/Deepfakes',
    Face2Face='manipulated_sequences/Face2Face',
    FaceShifter='manipulated_sequences/FaceShifter',
    FaceSwap='manipulated_sequences/FaceSwap',
)
ALL_DATASETS = list(DATASETS)

# Allowed values for the remaining command-line choices.
COMPRESSION = ['raw', 'c23', 'c40']
TYPE = ['videos']
SERVERS = ['EU', 'EU2', 'CA']

def parse_args():
    """Parse the downloader's command-line arguments from ``sys.argv``.

    Returns:
        argparse.Namespace with ``output_path``, ``dataset``, ``compression``,
        ``type``, ``server``, ``video_ids`` and ``split``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('output_path', type=str, help='Root directory to save data.')

    # (flags, keyword options) for each string-typed option, in help order.
    option_specs = (
        (('-d', '--dataset'), dict(required=True, choices=ALL_DATASETS, help='Dataset name')),
        (('-c', '--compression'), dict(default='c23', choices=COMPRESSION)),
        (('-t', '--type'), dict(default='videos', choices=TYPE)),
        (('--server',), dict(default='EU', choices=SERVERS)),
        (('--video_ids',), dict(default=None, help='JSON file or list of video IDs')),
        (('--split',), dict(default='train', choices=['train', 'test'],
                            help='Split name: train or test')),
    )
    for flags, options in option_specs:
        cli.add_argument(*flags, type=str, **options)

    return cli.parse_args()

def get_server_url(server):
    """Return the base URL for a server code.

    Args:
        server: One of 'EU', 'EU2' or 'CA'.

    Raises:
        ValueError: If ``server`` is not one of the known codes.
    """
    known_servers = (
        ('EU', 'http://canis.vc.in.tum.de:8100/'),
        ('EU2', 'http://kaldir.vc.in.tum.de/faceforensics/'),
        ('CA', 'http://falas.cmpt.sfu.ca:8100/'),
    )
    for code, url in known_servers:
        if server == code:
            return url
    raise ValueError("Invalid server")

def reporthook(count, block_size, total_size):
    """Progress callback for ``urllib.request.urlretrieve``.

    Rewrites a single status line on stdout with the amount downloaded so far.

    Args:
        count: Number of blocks transferred so far (0 on the initial call).
        block_size: Size of one block in bytes.
        total_size: Total size in bytes as reported by urlretrieve; values
            <= 0 are treated as "size unknown".
    """
    global start_time
    if count == 0:
        # First call: only record when the transfer started.
        start_time = time.time()
        return

    progress = int(count * block_size)
    downloaded_mb = progress // (1024 * 1024)
    if total_size > 0:
        # The final block is usually only partly filled, so count * block_size
        # can overshoot total_size; clamp instead of printing e.g. "163%".
        percent = min(100, int(progress * 100 / total_size))
        sys.stdout.write(f"\r{percent}% ({downloaded_mb} MB) downloaded")
    else:
        # Unknown size: a percentage would be negative or divide by zero.
        sys.stdout.write(f"\r{downloaded_mb} MB downloaded")
    sys.stdout.flush()

def download_file(url, dest):
    """Download ``url`` to ``dest`` unless ``dest`` already exists.

    The data is first written to a temporary file in the destination
    directory and only moved to ``dest`` once the transfer has finished, so a
    partially downloaded file never appears under the final name.

    Args:
        url: Source URL.
        dest: Destination file path; missing parent directories are created.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises on a failed transfer.
    """
    # A bare filename has an empty dirname, which os.makedirs rejects.
    dest_dir = os.path.dirname(dest) or '.'
    os.makedirs(dest_dir, exist_ok=True)

    if os.path.exists(dest):
        tqdm.write(f'Skipping already existing file: {dest}')
        return

    tmp_file, tmp_path = tempfile.mkstemp(dir=dest_dir)
    os.close(tmp_file)
    try:
        urllib.request.urlretrieve(url, tmp_path, reporthook=reporthook)
        os.replace(tmp_path, dest)
    finally:
        # After a successful move the temp path is gone; if the download
        # failed or was interrupted, do not leave the partial file behind.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
def _build_filelist(file_pairs, dataset_key):
    """Turn the server's list of ID pairs into video base names (no extension)."""
    if dataset_key == "original":
        # Original videos are named by a single ID ("000", "001", ...):
        # collect every ID that occurs in any pair. Sorted so the download
        # order is the same on every run.
        return sorted({video_id for pair in file_pairs for video_id in pair})

    # Manipulated videos are named by joining a pair with "_", and both
    # orderings of each pair are listed.
    names = []
    for pair in file_pairs:
        names.append('_'.join(pair))
        names.append('_'.join(pair[::-1]))
    return names


def _load_selected_ids(video_ids, split, dataset_key):
    """Return the requested video IDs as a set, or None if no selection was given."""
    if not video_ids:
        return None
    if video_ids.endswith('.json'):
        with open(video_ids, 'r') as f:
            ids_json = json.load(f)
        selected = ids_json[split].get(dataset_key.lower(), [])
    else:
        selected = ast.literal_eval(video_ids)
    if isinstance(selected, str):
        # A single quoted ID: treat it as a one-element selection rather
        # than as a collection of characters.
        selected = [selected]
    return set(selected)


def main(args):
    """Download the videos of one dataset/split selected by ``args``.

    Args:
        args: Namespace with ``server``, ``dataset``, ``compression``,
            ``type``, ``video_ids``, ``split`` and ``output_path``
            (as produced by ``parse_args``).

    ``video_ids`` may be a path to a JSON file shaped like
    ``{split: {lowercased dataset name: [ids]}}`` or a Python-literal list of
    IDs. When it is not given, every file of the dataset is downloaded. When
    it is given, only files whose last "_"-separated name component is one of
    the IDs are downloaded; an empty selection downloads nothing.
    """
    base_url = get_server_url(args.server) + 'v3/'
    dataset_key = args.dataset
    dataset_path = DATASETS[dataset_key]
    dataset_url = f"{base_url}{dataset_path}/{args.compression}/{args.type}/"

    # Load the file list; the context manager closes the HTTP response.
    with urllib.request.urlopen(base_url + FILELIST_URL) as response:
        file_pairs = json.loads(response.read().decode("utf-8"))

    filelist = _build_filelist(file_pairs, dataset_key)

    # Filter by IDs. Compare against None (not truthiness) so that an empty
    # selection filters everything out instead of silently falling back to
    # downloading the whole dataset.
    selected_ids = _load_selected_ids(args.video_ids, args.split, dataset_key)
    if selected_ids is not None:
        filelist = [f for f in filelist if f.split('_')[-1] in selected_ids]
        print(f"✅ Filtered: {len(filelist)} files matched for {args.split} set")

    filelist = [f + ".mp4" for f in filelist]

    # Final output path
    full_output_path = os.path.join(args.output_path, args.split, dataset_path, args.compression, args.type)
    print(f"📁 Downloading to: {full_output_path}")

    for f in tqdm(filelist):
        download_file(dataset_url + f, os.path.join(full_output_path, f))

# Script entry point: parse the command line and start the download.
if __name__ == "__main__":
    main(parse_args())

Overwriting faceforensics_download_v4.py


In [None]:
# Mount Google Drive at /content/drive; the cells below read from and write to
# paths under /content/drive/MyDrive/.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!python3 faceforensics_download_v4.py /content/drive/MyDrive/faceforensics_data \
    -d Deepfakes -c c23 --video_ids train_test_ids.json --split train --server EU2

✅ Filtered: 40 files matched for train set
📁 Downloading to: /content/drive/MyDrive/faceforensics_data/train/manipulated_sequences/Deepfakes/c23/videos
100% 40/40 [01:00<00:00,  1.50s/it]


In [None]:
!python3 faceforensics_download_v4.py /content/drive/MyDrive/faceforensics_data \
    -d Face2Face -c c23 --video_ids train_test_ids.json --split train --server EU2

✅ Filtered: 30 files matched for train set
📁 Downloading to: /content/drive/MyDrive/faceforensics_data/train/manipulated_sequences/Face2Face/c23/videos
100% 30/30 [00:37<00:00,  1.24s/it]


In [None]:
!python3 faceforensics_download_v4.py /content/drive/MyDrive/faceforensics_data \
    -d FaceShifter -c c23 --video_ids train_test_ids.json --split train --server EU2

✅ Filtered: 30 files matched for train set
📁 Downloading to: /content/drive/MyDrive/faceforensics_data/train/manipulated_sequences/FaceShifter/c23/videos
100% 30/30 [00:37<00:00,  1.26s/it]


In [None]:
!python3 faceforensics_download_v4.py /content/drive/MyDrive/faceforensics_data \
    -d FaceSwap -c c23 --video_ids train_test_ids.json --split train --server EU2

✅ Filtered: 30 files matched for train set
📁 Downloading to: /content/drive/MyDrive/faceforensics_data/train/manipulated_sequences/FaceSwap/c23/videos
100% 30/30 [00:38<00:00,  1.28s/it]


In [None]:
!python3 faceforensics_download_v4.py /content/drive/MyDrive/faceforensics_data \
    -d original -c c23 --video_ids train_test_ids.json --split train --server EU2

✅ Filtered: 120 files matched for train set
📁 Downloading to: /content/drive/MyDrive/faceforensics_data/train/original_sequences/youtube/c23/videos
100% 120/120 [02:32<00:00,  1.27s/it]


In [None]:
!python3 faceforensics_download_v4.py /content/drive/MyDrive/faceforensics_data \
    -d Deepfakes -c c23 --video_ids train_test_ids.json --split test --server EU2

✅ Filtered: 7 files matched for test set
📁 Downloading to: /content/drive/MyDrive/faceforensics_data/test/manipulated_sequences/Deepfakes/c23/videos
100% 7/7 [00:11<00:00,  1.60s/it]


In [None]:
!python3 faceforensics_download_v4.py /content/drive/MyDrive/faceforensics_data \
    -d Face2Face -c c23 --video_ids train_test_ids.json --split test --server EU2

✅ Filtered: 7 files matched for test set
📁 Downloading to: /content/drive/MyDrive/faceforensics_data/test/manipulated_sequences/Face2Face/c23/videos
100% 7/7 [00:08<00:00,  1.28s/it]


In [None]:
!python3 faceforensics_download_v4.py /content/drive/MyDrive/faceforensics_data \
    -d FaceShifter -c c23 --video_ids train_test_ids.json --split test --server EU2

✅ Filtered: 7 files matched for test set
📁 Downloading to: /content/drive/MyDrive/faceforensics_data/test/manipulated_sequences/FaceShifter/c23/videos
100% 7/7 [00:09<00:00,  1.33s/it]


In [None]:
!python3 faceforensics_download_v4.py /content/drive/MyDrive/faceforensics_data \
    -d FaceSwap -c c23 --video_ids train_test_ids.json --split test --server EU2

✅ Filtered: 7 files matched for test set
📁 Downloading to: /content/drive/MyDrive/faceforensics_data/test/manipulated_sequences/FaceSwap/c23/videos
100% 7/7 [00:08<00:00,  1.25s/it]


In [None]:
!python3 faceforensics_download_v4.py /content/drive/MyDrive/faceforensics_data \
    -d original -c c23 --video_ids train_test_ids.json --split test --server EU2

✅ Filtered: 28 files matched for test set
📁 Downloading to: /content/drive/MyDrive/faceforensics_data/test/original_sequences/youtube/c23/videos
100% 28/28 [00:35<00:00,  1.25s/it]


In [None]:
import cv2
import os
from pathlib import Path
from tqdm import tqdm

# --- Settings ---
every_n = 5  # keep one frame out of every five

# 📁 Input videos and output frames both live on Google Drive
video_root = Path("/content/drive/MyDrive/faceforensics_data/train")
frame_root = Path("/content/drive/MyDrive/face_forensics_frames/train")
frame_root.mkdir(parents=True, exist_ok=True)

# (dataset folder below video_root, class name used as the output folder)
dataset_map = dict([
    ("original_sequences/youtube", "original"),
    ("manipulated_sequences/Deepfakes", "deepfakes"),
    ("manipulated_sequences/Face2Face", "face2face"),
    ("manipulated_sequences/FaceShifter", "faceshifter"),
    ("manipulated_sequences/FaceSwap", "faceswap"),
])

# Function to extract frames from video
def extract_frames(video_path, output_dir, every_n=5):
    """Save every ``every_n``-th frame of a video as a JPEG file.

    Frames are written to ``output_dir`` as ``<video stem>_frame<index>.jpg``,
    where ``index`` is the zero-based position of the frame in the video.

    Args:
        video_path: ``pathlib.Path`` of the video to read.
        output_dir: ``pathlib.Path`` of an existing directory for the frames.
        every_n: Sampling step; frames 0, every_n, 2 * every_n, ... are kept.

    Raises:
        ValueError: If ``every_n`` is smaller than 1.

    A video that cannot be opened, or a frame that cannot be written, is
    reported with a ``RuntimeWarning`` instead of being skipped silently.
    """
    import warnings

    if every_n < 1:
        raise ValueError(f"every_n must be a positive integer, got {every_n!r}")

    cap = cv2.VideoCapture(str(video_path))
    try:
        if not cap.isOpened():
            warnings.warn(f"Could not open video: {video_path}", RuntimeWarning)
            return

        frame_idx = 0
        success, frame = cap.read()
        while success:
            if frame_idx % every_n == 0:
                out_path = output_dir / f"{video_path.stem}_frame{frame_idx}.jpg"
                # cv2.imwrite signals failure through its return value.
                if not cv2.imwrite(str(out_path), frame):
                    warnings.warn(f"Could not write frame: {out_path}", RuntimeWarning)
            success, frame = cap.read()
            frame_idx += 1
    finally:
        # Release the capture handle even if a write raises or the run is
        # interrupted part-way through a video.
        cap.release()

# Walk over every dataset folder and dump its frames into the class folder
for subfolder, label in dataset_map.items():
    output_dir = frame_root / label
    output_dir.mkdir(parents=True, exist_ok=True)

    video_dir = video_root.joinpath(subfolder, "c23", "videos")
    video_files = [*video_dir.glob("*.mp4")]
    print(f"📦 {label}: Found {len(video_files)} videos")

    progress_bar = tqdm(video_files, desc=f"Extracting {label}")
    for video_path in progress_bar:
        extract_frames(video_path, output_dir, every_n)

📦 original: Found 120 videos


Extracting original: 100%|██████████| 120/120 [13:33<00:00,  6.78s/it]


📦 deepfakes: Found 40 videos


Extracting deepfakes: 100%|██████████| 40/40 [04:24<00:00,  6.61s/it]


📦 face2face: Found 30 videos


Extracting face2face: 100%|██████████| 30/30 [02:55<00:00,  5.84s/it]


📦 faceshifter: Found 30 videos


Extracting faceshifter: 100%|██████████| 30/30 [03:12<00:00,  6.41s/it]


📦 faceswap: Found 30 videos


Extracting faceswap: 100%|██████████| 30/30 [02:20<00:00,  4.68s/it]


In [None]:
import shutil
from pathlib import Path
from tqdm import tqdm

# SOURCE base directory (on Google Drive)
base_dir = Path("/content/drive/MyDrive/face_forensics_frames/train")

# Target directory where all fakes will be merged
fake_dir = base_dir / "fake_sequences"
# Path.mkdir rather than os.makedirs: this cell does not import `os`, so the
# os call only worked when an earlier cell had already imported it.
fake_dir.mkdir(parents=True, exist_ok=True)

# List of fake classes to merge
fake_classes = ["deepfakes", "face2face", "faceshifter", "faceswap"]

# Copy files from each class into fake_sequences/
for cls in fake_classes:
    src_dir = base_dir / cls
    image_files = list(src_dir.glob("*.jpg"))
    print(f"📦 Merging {len(image_files)} images from '{cls}'")

    for img_path in tqdm(image_files, desc=f"Merging {cls}"):
        # Prefix the class name so that equally named frames coming from
        # different classes do not overwrite each other in the merged folder.
        dest_filename = f"{cls}_{img_path.name}"
        shutil.copy(img_path, fake_dir / dest_filename)

📦 Merging 3920 images from 'deepfakes'


Merging deepfakes: 100%|██████████| 3920/3920 [02:07<00:00, 30.78it/s]


📦 Merging 2938 images from 'face2face'


Merging face2face: 100%|██████████| 2938/2938 [01:39<00:00, 29.59it/s]


📦 Merging 3049 images from 'faceshifter'


Merging faceshifter: 100%|██████████| 3049/3049 [01:49<00:00, 27.74it/s]


📦 Merging 2305 images from 'faceswap'


Merging faceswap: 100%|██████████| 2305/2305 [01:17<00:00, 29.84it/s]


In [None]:
from pathlib import Path

# 📂 Base directory that holds one sub-folder of frames per class
base_path = Path("/content/drive/MyDrive/face_forensics_frames/train")

counts = {}

for class_folder in sorted(base_path.iterdir()):
    if class_folder.is_dir():
        jpg_files = list(class_folder.glob("*.jpg"))
        counts[class_folder.name] = len(jpg_files)

# ✨ Results
total = sum(counts.values())
print("📊 Frame numbers:\n")
for name, count in counts.items():
    print(f"{name:<15}: {count:6}")

print("\n🧮 Total frame numbers:", total)

# The merge cell fills fake_sequences/ with copies of the per-method fake
# frames, so while those folders exist side by side the total above counts
# the copied frames a second time. Say so next to the number.
merged_copies = counts.get("fake_sequences", 0)
if merged_copies:
    print(f"   (includes {merged_copies} merged copies stored in 'fake_sequences')")

📊 Frame numbers:

deepfakes      :   3920
face2face      :   2938
faceshifter    :   3049
faceswap       :   2305
fake_sequences :  12212
original       :  12068

🧮 Total frame numbers: 36492
