# import

In [12]:
import random
import numpy as np
import pandas as pd
from pathlib import Path
from typing import Dict, List, Optional

import cv2
import torch
import torch.nn.functional as F
from PIL import Image
from tqdm import tqdm
from transformers import ViTForImageClassification, ViTImageProcessor

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.
0it [00:00, ?it/s]


# utils

In [19]:
# File suffixes (lower-cased, including the leading dot) that read_rgb_frames
# treats as still images.
IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".jfif"}
# File suffixes that read_rgb_frames treats as videos.
VIDEO_EXTS = {".mp4", ".mov"}

# Output canvas size that preprocess_one passes to get_full_frame_padded.
TARGET_SIZE = (224, 224)
NUM_FRAMES = 10  # number of frames sampled from each video

In [20]:
def uniform_frame_indices(total_frames: int, num_frames: int) -> np.ndarray:
    """Return evenly spaced frame indices for sampling a video.

    Args:
        total_frames: Number of frames available in the video.
        num_frames: Number of frames wanted.

    Returns:
        An integer array of frame indices. Empty when ``total_frames`` is not
        positive; every index ``0..total_frames-1`` when the video has no more
        frames than requested; otherwise ``num_frames`` indices spread from the
        first frame to the last frame (inclusive).
    """
    needs_subsampling = total_frames > 0 and total_frames > num_frames
    if needs_subsampling:
        last_index = total_frames - 1
        return np.linspace(0, last_index, num_frames, dtype=int)
    # Short (or empty/invalid) videos: take every available frame, if any.
    available = max(total_frames, 0)
    return np.arange(available, dtype=int)

def get_full_frame_padded(pil_img: Image.Image, target_size=(224, 224)) -> Image.Image:
    """Fit the whole image onto a black canvas of ``target_size``, keeping its aspect ratio.

    The input is converted to RGB (the caller's image is not modified), reduced
    with ``thumbnail`` so that it fits inside ``target_size``, and pasted in the
    centre of a new black canvas.

    Args:
        pil_img: Source image in any PIL mode.
        target_size: Canvas size, indexed as ``[0]`` and ``[1]`` in PIL's size order.

    Returns:
        A new RGB image of exactly ``target_size``.
    """
    rgb = pil_img.convert("RGB")
    # thumbnail() resizes in place and preserves the aspect ratio.
    rgb.thumbnail(target_size, Image.BICUBIC)

    scaled_w, scaled_h = rgb.size
    offset = (
        (target_size[0] - scaled_w) // 2,
        (target_size[1] - scaled_h) // 2,
    )

    canvas = Image.new("RGB", target_size, (0, 0, 0))
    canvas.paste(rgb, offset)
    return canvas

def read_rgb_frames(file_path: Path, num_frames: int = NUM_FRAMES) -> List[np.ndarray]:
    """Extract RGB frames from an image or video file.

    The file type is chosen from the lower-cased suffix: suffixes in
    ``IMAGE_EXTS`` are read with PIL, suffixes in ``VIDEO_EXTS`` with OpenCV.

    Args:
        file_path: Path of the image or video to read.
        num_frames: Maximum number of frames to sample from a video
            (ignored for still images).

    Returns:
        A list of RGB arrays: one element for an image, up to ``num_frames``
        elements for a video. An empty list is returned when the suffix is
        unsupported, the file cannot be opened/decoded, or the video reports
        no frames.
    """
    ext = file_path.suffix.lower()

    # Still image
    if ext in IMAGE_EXTS:
        try:
            # The context manager closes the underlying file handle as soon as
            # the pixels have been copied, instead of waiting for garbage
            # collection (matters when looping over many files).
            with Image.open(file_path) as img:
                return [np.array(img.convert("RGB"))]
        except Exception:
            # Deliberate best-effort: an unreadable image yields no frames.
            return []

    # Video
    if ext in VIDEO_EXTS:
        cap = cv2.VideoCapture(str(file_path))
        try:
            if not cap.isOpened():
                return []

            total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            if total <= 0:
                return []

            frames = []
            for idx in uniform_frame_indices(total, num_frames):
                cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
                ret, frame = cap.read()
                if not ret:
                    # Skip frames that fail to decode rather than aborting.
                    continue
                # OpenCV decodes to BGR; callers expect RGB.
                frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            return frames
        finally:
            # Always free the capture, even if decoding/conversion raises.
            cap.release()

    return []

In [29]:
class PreprocessOutput:
    """Plain container for the result of preprocessing one input.

    Attributes:
        filename: Name used to identify the input.
        imgs: Preprocessed PIL images produced from the input.
        error: Error message, or ``None`` when no error was recorded.
    """

    def __init__(
        self,
        filename: str,
        imgs: List[Image.Image],
        error: Optional[str] = None
    ):
        # Store the three fields exactly as given; no validation or copying.
        self.filename, self.imgs, self.error = filename, imgs, error

def preprocess_one(file_or_img, num_frames: int = NUM_FRAMES) -> PreprocessOutput:
    """
    Preprocess a file (Path) or an in-memory PIL image into padded RGB frames.

    Args:
        file_or_img: ``pathlib.Path`` of an image/video file, or a
            ``PIL.Image.Image``.
        num_frames: Maximum number of frames to sample when the input is a video.

    Returns:
        A ``PreprocessOutput``. On success ``imgs`` holds one image per frame,
        each padded to ``TARGET_SIZE``, and ``error`` is ``None``. On failure
        ``imgs`` is empty and ``error`` describes the problem; this includes
        the case where no frame could be read from the input. This function
        never raises.
    """
    # Set as soon as the input type is known so that error results still
    # identify which input failed.
    filename = "unknown"
    try:
        if isinstance(file_or_img, Path):
            filename = file_or_img.name
            frames = read_rgb_frames(file_or_img, num_frames=num_frames)
        elif isinstance(file_or_img, Image.Image):
            filename = "image_obj"
            # Convert to RGB *before* the array round trip: np.array() on a
            # palette ("P") image yields palette indices, which fromarray()
            # would reinterpret as grayscale and lose the real colours.
            frames = [np.array(file_or_img.convert("RGB"))]
        else:
            raise TypeError("Expected Path or PIL.Image.Image")

        imgs: List[Image.Image] = [
            get_full_frame_padded(Image.fromarray(rgb), TARGET_SIZE)
            for rgb in frames
        ]

        if not imgs:
            # read_rgb_frames signals unreadable/unsupported files with an
            # empty list; surface that instead of reporting silent success.
            return PreprocessOutput(filename, [], "no frames could be decoded")

        return PreprocessOutput(filename, imgs, None)

    except Exception as e:
        return PreprocessOutput(filename, [], str(e))


# 데이터셋 목록
- "low_face_ratio": "Hemgg/deep-fake-detection-dfd-entire-original-dataset"
- "face_crop": "OpenRL/DeepFakeFace"
- "tall_aspect_ratio": "UniDataPro/deepfake-videos-dataset"

In [15]:
from datasets import load_dataset

In [16]:
# Dataset list: maps a short tag to the Hugging Face Hub dataset id.
DATASETS = {
    "low_face_ratio": "Hemgg/deep-fake-detection-dfd-entire-original-dataset",
    "face_crop": "OpenRL/DeepFakeFace",
    "tall_aspect_ratio": "UniDataPro/deepfake-videos-dataset"
}


In [17]:
# 개별 데이터셋 로드

from datasets import load_dataset, DatasetDict
import decord
from decord import VideoReader
from PIL import Image
import io

# Dataset list: maps a short tag to the Hugging Face Hub dataset id.
# NOTE(review): this repeats the DATASETS definition from an earlier cell with
# identical contents; consider keeping a single definition.
DATASETS = {
    "low_face_ratio": "Hemgg/deep-fake-detection-dfd-entire-original-dataset",
    "face_crop": "OpenRL/DeepFakeFace",
    "tall_aspect_ratio": "UniDataPro/deepfake-videos-dataset"
}

In [4]:
from datasets import load_dataset

# Login using e.g. `huggingface-cli login` to access this dataset
# NOTE(review): the recorded output of this cell ends with an HTTP 429
# "Too Many Requests" error at 42% of the download, so ds1 was not loaded in
# the saved run.
ds1 = load_dataset("Hemgg/deep-fake-detection-dfd-entire-original-dataset")


Downloading data:  42%|████▏     | 1450/3431 [02:44<03:44,  8.84files/s]


HfHubHTTPError: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Hemgg/deep-fake-detection-dfd-entire-original-dataset/xet-read-token/0d42e7bdcc07e7b855395c99e707a16645dfb6fe (Request ID: Root=1-696235bb-14f9e7a536bca322111237a6;48d8be62-8a79-4a7f-bafe-0e8f750c8151)

We had to rate limit you, you hit the quota of 1000 api requests per 5 minutes period. Upgrade to a PRO user or Team/Enterprise organization account (https://hf.co/pricing) to get higher limits. See https://huggingface.co/docs/hub/rate-limits

In [4]:
# Load the image dataset; the recorded run shows a single "train" split with
# 120004 rows and one feature, "image".
ds2 = load_dataset("OpenRL/DeepFakeFace")

Generating train split: 120004 examples [00:19, 6058.88 examples/s]


In [5]:
# Load the video dataset; the recorded run shows a single "train" split with
# 10 rows and the features "video" and "label".
ds3 = load_dataset("UniDataPro/deepfake-videos-dataset")

Generating train split: 100%|██████████| 10/10 [00:00<00:00, 2530.19 examples/s]


In [6]:
# Show the splits, features and row counts of ds2.
print(ds2)

DatasetDict({
    train: Dataset({
        features: ['image'],
        num_rows: 120004
    })
})


In [7]:
# Show the splits, features and row counts of ds3.
print(ds3)

DatasetDict({
    train: Dataset({
        features: ['video', 'label'],
        num_rows: 10
    })
})


In [30]:
import os

# Default output root for save_preprocess_output; a relative path, so it is
# resolved against the current working directory.
OUTPUT_DIR = Path("processed_data")
# Create it up front (no error if it already exists).
OUTPUT_DIR.mkdir(exist_ok=True)

def save_preprocess_output(output: PreprocessOutput, out_dir: Path = OUTPUT_DIR):
    """
    Write the images of a PreprocessOutput to disk as PNG files.

    Images go to ``out_dir / output.filename / NNNN.png`` where ``NNNN`` is the
    zero-padded position of the image in ``output.imgs``. If ``output.error``
    is set, the error is printed and nothing is written.

    Args:
        output: Preprocessing result to persist.
        out_dir: Root directory under which the per-input folder is created.
    """
    if not output.error:
        target_dir = out_dir / output.filename
        target_dir.mkdir(parents=True, exist_ok=True)
        for frame_no, frame in enumerate(output.imgs):
            frame.save(target_dir / f"{frame_no:04d}.png")
        return

    # Failed inputs are reported, not saved.
    print(f"[ERROR] {output.filename}: {output.error}")


In [None]:
import os
from pathlib import Path

# Base directory for all exported data.
SAVE_DIR = Path("/workspace/hecto_deepfake/data")
SAVE_DIR.mkdir(exist_ok=True, parents=True)

# Export the image-based dataset (ds2).
IMG_DIR = SAVE_DIR / "ds2_images"
IMG_DIR.mkdir(exist_ok=True, parents=True)

# Images are written under IMG_DIR/fake. PIL's save() does not create missing
# parent directories, so the sub-folder must exist before the loop starts.
FAKE_DIR = IMG_DIR / "fake"
FAKE_DIR.mkdir(exist_ok=True, parents=True)

for idx, row in enumerate(ds2["train"]):
    pil_img = row["image"]
    output = preprocess_one(pil_img)

    if output.error:
        # Report rows that failed instead of skipping them silently.
        print(f"[ERROR] ds2 row {idx}: {output.error}")
        continue

    for f_idx, img in enumerate(output.imgs):
        img_path = FAKE_DIR / f"{idx:06d}_{f_idx}.png"
        img.save(img_path)

In [None]:
# Export the video-based dataset (ds3): sampled frames are written as
# VID_DIR/<video stem>_<frame index>.png.
VID_DIR = SAVE_DIR / "ds3_videos"
VID_DIR.mkdir(exist_ok=True, parents=True)

for idx, row in enumerate(ds3["train"]):
    # NOTE(review): assumes row["video"] is a filesystem path (str/PathLike).
    # If the dataset yields a decoded video object here, Path() will raise;
    # confirm against the dataset's "video" feature type.
    video_path = Path(row["video"])
    output = preprocess_one(video_path)

    if output.error or not output.imgs:
        # Report videos that produced nothing instead of skipping them silently.
        print(f"[ERROR] {video_path.name}: {output.error or 'no frames decoded'}")
        continue

    for f_idx, img in enumerate(output.imgs):
        img_path = VID_DIR / f"{video_path.stem}_{f_idx}.png"
        img.save(img_path)