# import

In [6]:
import random
import numpy as np
import pandas as pd
from pathlib import Path
from typing import Dict, List, Optional

import cv2
import torch
import torch.nn.functional as F
from PIL import Image
from tqdm import tqdm
from transformers import ViTForImageClassification, ViTImageProcessor

# utils

In [7]:
# File suffixes (with the leading dot, lower case) that are handled as still images.
IMAGE_EXTS = {
    ".jfif",
    ".jpeg",
    ".jpg",
    ".png",
}
# File suffixes (with the leading dot, lower case) that are handled as videos.
VIDEO_EXTS = {
    ".mov",
    ".mp4",
}

# Number of frames sampled from each video.
NUM_FRAMES = 10
# Size of the padded output canvas that every frame is fitted into.
TARGET_SIZE = (224, 224)

In [8]:
def uniform_frame_indices(total_frames: int, num_frames: int) -> np.ndarray:
    """Choose evenly spaced frame positions for sampling a video.

    Args:
        total_frames: Number of frames available in the video.
        num_frames: Number of frames wanted.

    Returns:
        An integer array of frame indices:
        * empty when ``total_frames`` is zero or negative,
        * every index ``0 .. total_frames - 1`` when the video has no more
          frames than requested,
        * otherwise ``num_frames`` indices spread from the first frame to the
          last frame (both included).
    """
    if total_frames <= 0:
        picked = np.array([], dtype=int)
    elif total_frames <= num_frames:
        # Short clip: take everything rather than repeating frames.
        picked = np.arange(total_frames, dtype=int)
    else:
        last_index = total_frames - 1
        picked = np.linspace(0, last_index, num_frames, dtype=int)
    return picked

def get_full_frame_padded(pil_img: Image.Image, target_size=(224, 224)) -> Image.Image:
    """Fit a whole image onto a black canvas of ``target_size``, keeping its aspect ratio.

    The input is converted to RGB, reduced with ``Image.thumbnail`` (bicubic
    resampling) so that it fits inside ``target_size``, and then pasted in the
    centre of a new black canvas.

    Args:
        pil_img: Source image; it is not modified (the work happens on the
            converted copy).
        target_size: Size of the returned canvas.

    Returns:
        A new RGB image of size ``target_size`` containing the centred,
        padded picture.
    """
    fitted = pil_img.convert("RGB")
    # thumbnail() resizes the converted copy in place and preserves aspect ratio.
    fitted.thumbnail(target_size, Image.BICUBIC)

    canvas = Image.new("RGB", target_size, (0, 0, 0))
    fitted_w, fitted_h = fitted.size
    # Split the leftover space evenly on both sides so the picture is centred.
    left = (target_size[0] - fitted_w) // 2
    top = (target_size[1] - fitted_h) // 2
    canvas.paste(fitted, (left, top))
    return canvas

def read_rgb_frames(file_path: Path, num_frames: int = NUM_FRAMES) -> List[np.ndarray]:
    """Decode an image or video file into a list of RGB frames.

    The file type is chosen from the lower-cased suffix of ``file_path``:
    suffixes in ``IMAGE_EXTS`` are read with PIL, suffixes in ``VIDEO_EXTS``
    are read with OpenCV, anything else yields no frames.

    Args:
        file_path: Path of the media file.
        num_frames: Maximum number of frames sampled from a video (positions
            come from ``uniform_frame_indices``). Ignored for images.

    Returns:
        A list of RGB arrays: one element for an image, up to ``num_frames``
        elements for a video. The list is empty when the suffix is not
        recognised, the image cannot be read, or the video cannot be opened
        or reports no frames. Video frames that fail to decode are skipped, so
        fewer frames than requested may be returned.
    """
    ext = file_path.suffix.lower()

    # Image file
    if ext in IMAGE_EXTS:
        try:
            # The context manager closes the underlying file handle, which a
            # bare Image.open() would leave open.
            with Image.open(file_path) as img:
                return [np.array(img.convert("RGB"))]
        except Exception:
            # Best effort: an unreadable image yields no frames.
            return []

    # Video file
    if ext in VIDEO_EXTS:
        cap = cv2.VideoCapture(str(file_path))
        try:
            if not cap.isOpened():
                return []

            total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            if total <= 0:
                return []

            frames = []
            for idx in uniform_frame_indices(total, num_frames):
                # Seek to the sampled position, then decode that single frame.
                cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
                ret, frame = cap.read()
                if not ret:
                    continue
                # OpenCV decodes to BGR; convert to RGB before returning.
                frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            return frames
        finally:
            # Release the capture on every exit path, including exceptions
            # raised while seeking or converting.
            cap.release()

    return []

In [9]:
class PreprocessOutput:
    """Result of preprocessing one input file or image.

    Attributes:
        filename: Name used to identify the input.
        imgs: Preprocessed frames as PIL images.
        error: Error message, or ``None`` when no error was recorded.
    """

    def __init__(self, filename: str, imgs: List[Image.Image], error: Optional[str] = None):
        # Plain value holder: the arguments are stored as given, without
        # copying or validation.
        self.filename, self.imgs, self.error = filename, imgs, error

def preprocess_one(file_or_img, num_frames: int = NUM_FRAMES) -> PreprocessOutput:
    """Preprocess a media file or an in-memory PIL image into padded frames.

    Args:
        file_or_img: Either a ``Path`` to an image/video file or a
            ``PIL.Image.Image``.
        num_frames: Maximum number of frames sampled when the input is a video.

    Returns:
        A ``PreprocessOutput``. On success ``imgs`` holds one padded
        ``TARGET_SIZE`` image per decoded frame and ``error`` is ``None``.
        On failure ``imgs`` is empty and ``error`` describes the problem; this
        includes unsupported input types, exceptions raised while decoding,
        and files from which no frame could be read. The function itself does
        not raise for these cases.
    """
    # Fallback name, replaced as soon as the input type is known so that
    # failures are still reported against the right input.
    filename = "unknown"
    try:
        if isinstance(file_or_img, Path):
            filename = file_or_img.name
            frames = read_rgb_frames(file_or_img, num_frames=num_frames)
        elif isinstance(file_or_img, Image.Image):
            filename = "image_obj"
            # Convert before going through numpy, as read_rgb_frames does:
            # a palette ("P") image would otherwise become an array of palette
            # indices and lose its colours.
            frames = [np.array(file_or_img.convert("RGB"))]
        else:
            raise TypeError("Expected Path or PIL.Image.Image")

        imgs = [
            get_full_frame_padded(Image.fromarray(rgb), TARGET_SIZE)
            for rgb in frames
        ]
    except Exception as e:
        return PreprocessOutput(filename, [], str(e))

    if not imgs:
        # read_rgb_frames signals unreadable or unsupported files with an empty
        # list; report that instead of returning an empty "success".
        return PreprocessOutput(filename, [], f"No frames could be read from '{filename}'")

    return PreprocessOutput(filename, imgs, None)


# 데이터셋 목록
- `low_face_ratio`: `Hemgg/deep-fake-detection-dfd-entire-original-dataset`
- `face_crop`: `OpenRL/DeepFakeFace`
- `tall_aspect_ratio`: `UniDataPro/deepfake-videos-dataset`

In [11]:
from datasets import load_dataset

In [12]:
# Dataset list: short tag -> repository id of the dataset.
DATASETS = dict(
    low_face_ratio="Hemgg/deep-fake-detection-dfd-entire-original-dataset",
    face_crop="OpenRL/DeepFakeFace",
    tall_aspect_ratio="UniDataPro/deepfake-videos-dataset",
)


In [None]:
# 개별 데이터셋 로드

from datasets import load_dataset, DatasetDict
import decord
from decord import VideoReader
from PIL import Image
import io

# Dataset list: short tag -> repository id of the dataset.
# NOTE(review): this repeats, value for value, the DATASETS definition in the
# earlier cell; keep the two in sync or drop one of them.
DATASETS = dict(
    [
        ("low_face_ratio", "Hemgg/deep-fake-detection-dfd-entire-original-dataset"),
        ("face_crop", "OpenRL/DeepFakeFace"),
        ("tall_aspect_ratio", "UniDataPro/deepfake-videos-dataset"),
    ]
)

In [None]:
from datasets import load_dataset

# Login using e.g. `huggingface-cli login` to access this dataset
from datasets import load_dataset

# Load the DFD dataset (listed in DATASETS under "low_face_ratio") with
# streaming enabled.
# NOTE(review): the output captured below this cell shows a file download that
# ended in FileNotFoundError - presumably from a run without streaming; confirm
# that the cell succeeds as written.
ds = load_dataset(
    "Hemgg/deep-fake-detection-dfd-entire-original-dataset",
    streaming = True
)




Downloading data:   0%|          | 0/3431 [00:00<?, ?files/s]Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\yjneo\\.cache\\huggingface\\hub\\datasets--Hemgg--deep-fake-detection-dfd-entire-original-dataset\\snapshots\\0d42e7bdcc07e7b855395c99e707a16645dfb6fe\\DFD_manipulated_sequences\\DFD_manipulated_sequences\\01_04__walking_down_street_outside_angry__0XUW13RW.mp4'

In [None]:
# Load OpenRL/DeepFakeFace (listed in DATASETS under "face_crop"); later cells
# read it as ds2["train"].
# NOTE(review): the output captured below ends in OSError "No space left on
# device", while the later print(ds2) output shows a Dataset with num_rows -
# presumably produced by a different, non-streaming run; confirm.
ds2 = load_dataset("OpenRL/DeepFakeFace",
                   streaming = True)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but t

OSError: [Errno 28] No space left on device

In [None]:
# Load UniDataPro/deepfake-videos-dataset (listed in DATASETS under
# "tall_aspect_ratio") using an explicit cache directory; later cells read it
# as ds3["train"].
# NOTE(review): cache_dir is an absolute, user-specific Windows path and has to
# be changed to run this cell on another machine.
ds3 = load_dataset("UniDataPro/deepfake-videos-dataset",
                   cache_dir="C:/Users/yjneo/workspace/hecto_deepfake/data")

Generating train split: 100%|██████████| 10/10 [00:00<00:00, 2530.19 examples/s]


In [6]:
# Show the splits and features of ds2 (requires the ds2 load cell to have run).
print(ds2)

DatasetDict({
    train: Dataset({
        features: ['image'],
        num_rows: 120004
    })
})


In [7]:
# Show the splits and features of ds3 (requires the ds3 load cell to have run).
print(ds3)

DatasetDict({
    train: Dataset({
        features: ['video', 'label'],
        num_rows: 10
    })
})


In [30]:
import os

# Default destination for save_preprocess_output, relative to the working directory.
OUTPUT_DIR = Path("processed_data")
# Created when the cell runs; an already existing folder is accepted.
OUTPUT_DIR.mkdir(parents=False, exist_ok=True)

def save_preprocess_output(output: PreprocessOutput, out_dir: Path = OUTPUT_DIR):
    """Write the frames of a PreprocessOutput to disk as PNG files.

    Frames are stored as ``<out_dir>/<output.filename>/<index>.png`` with a
    four-digit, zero-padded index; the folder is created when missing.

    Args:
        output: Preprocessing result to save.
        out_dir: Root folder under which the per-input folder is created.

    Returns:
        None. When ``output.error`` is set, nothing is written and the error
        is printed instead.
    """
    if not output.error:
        frames_dir = out_dir / output.filename
        frames_dir.mkdir(parents=True, exist_ok=True)
        for frame_no, frame in enumerate(output.imgs):
            frame.save(frames_dir / f"{frame_no:04d}.png")
        return

    print(f"[ERROR] {output.filename}: {output.error}")


In [None]:
import os
from pathlib import Path

# Base folder for everything the export cells write.
SAVE_DIR = Path("/workspace/hecto_deepfake/data")
# Frames produced from the image-based dataset (ds2) are stored here.
IMG_DIR = SAVE_DIR.joinpath("ds2_images")

# Create both folders; existing folders and missing parents are both handled.
SAVE_DIR.mkdir(parents=True, exist_ok=True)
IMG_DIR.mkdir(parents=True, exist_ok=True)

# Every file below is written under IMG_DIR / "fake". Create that sub-folder
# first: Image.save does not create missing parent folders, so without it the
# very first save fails with FileNotFoundError.
# NOTE(review): all rows are stored as "fake" - presumably because ds2 only
# exposes an 'image' feature and no label; confirm this labelling is intended.
(IMG_DIR / "fake").mkdir(parents=True, exist_ok=True)

for idx, row in enumerate(ds2["train"]):
    pil_img = row["image"]
    output = preprocess_one(pil_img)

    # Report rows that could not be preprocessed instead of skipping them silently.
    if output.error:
        print(f"[ERROR] ds2 row {idx}: {output.error}")
        continue

    for f_idx, img in enumerate(output.imgs):
        img_path = IMG_DIR / f"fake/{idx:06d}_{f_idx}.png"
        img.save(img_path)

In [None]:
# Save the video-based dataset: sampled, padded frames of every ds3 video are
# written to VID_DIR as "<video stem>_<frame index>.png".
VID_DIR = SAVE_DIR / "ds3_videos"
VID_DIR.mkdir(exist_ok=True, parents=True)

for idx, row in enumerate(ds3["train"]):
    # NOTE(review): assumes row["video"] is a filesystem path (str or
    # os.PathLike); confirm against the installed `datasets` version.
    video_path = Path(row["video"])
    output = preprocess_one(video_path)

    # A failed or empty result would otherwise vanish without a trace; report
    # it in the same format that save_preprocess_output uses.
    if output.error or not output.imgs:
        reason = output.error or "no frames decoded"
        print(f"[ERROR] {video_path.name}: {reason}")
        continue

    for f_idx, img in enumerate(output.imgs):
        img_path = VID_DIR / f"{video_path.stem}_{f_idx}.png"
        img.save(img_path)