# Import

In [1]:
import random
import numpy as np
import pandas as pd
from pathlib import Path
from typing import Dict, List, Optional

import sklearn
import cv2
import torch
import torch.nn.functional as F
from PIL import Image
from tqdm import tqdm
from transformers import ViTForImageClassification, ViTImageProcessor
import wandb

  from .autonotebook import tqdm as notebook_tqdm


# Settings

In [2]:
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [3]:
# MODEL_ID = "prithivMLmods/Deep-Fake-Detector-v2-Model"
MODEL_ID = "buildborderless/CommunityForensics-DeepfakeDet-ViT"
TEST_DIR = Path("./test_data")  # test 데이터 경로

# Submission
OUTPUT_DIR = Path("./output")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)  # output 폴더 없으면 생성

SAFE_MODEL_ID = MODEL_ID.replace("/", "_")
OUT_CSV = OUTPUT_DIR / f"{SAFE_MODEL_ID}_submission.csv"

In [4]:
IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".jfif"}
VIDEO_EXTS = {".mp4", ".mov"}

# TARGET_SIZE = (224, 224)
TARGET_SIZE = (384, 384)
NUM_FRAMES = 10  # 비디오 샘플링 프레임 수

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {DEVICE}")

Device: cpu


# Utils

In [5]:
def uniform_frame_indices(total_frames: int, num_frames: int) -> np.ndarray:
    """비디오 프레임을 균등하게 샘플링"""
    if total_frames <= 0:
        return np.array([], dtype=int)
    if total_frames <= num_frames:
        return np.arange(total_frames, dtype=int)
    return np.linspace(0, total_frames - 1, num_frames, dtype=int)

def get_full_frame_padded(pil_img: Image.Image, target_size=(384, 384)) -> Image.Image:
    """전체 이미지를 비율 유지하며 정사각형 패딩 처리"""
    img = pil_img.convert("RGB")
    img.thumbnail(target_size, Image.BICUBIC)
    new_img = Image.new("RGB", target_size, (0, 0, 0))
    new_img.paste(img, ((target_size[0] - img.size[0]) // 2,
                        (target_size[1] - img.size[1]) // 2))
    return new_img

def read_rgb_frames(file_path: Path, num_frames: int = NUM_FRAMES) -> List[np.ndarray]:
    """이미지 또는 비디오에서 RGB 프레임 추출"""
    ext = file_path.suffix.lower()
    
    # 이미지 파일
    if ext in IMAGE_EXTS:
        try:
            img = Image.open(file_path).convert("RGB")
            return [np.array(img)]
        except Exception:
            return []
    
    # 비디오 파일
    if ext in VIDEO_EXTS:
        cap = cv2.VideoCapture(str(file_path))
        total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        
        if total <= 0:
            cap.release()
            return []
        
        frame_indices = uniform_frame_indices(total, num_frames)
        frames = []
        
        for idx in frame_indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
            ret, frame = cap.read()
            if not ret:
                continue
            frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        
        cap.release()
        return frames
    
    return []

# Data Preprocessing

In [6]:
class PreprocessOutput:
    def __init__(
        self,
        filename: str,
        imgs: List[Image.Image],
        error: Optional[str] = None
    ):
        self.filename = filename
        self.imgs = imgs
        self.error = error

def preprocess_one(file_path: Path, num_frames: int = NUM_FRAMES) -> PreprocessOutput:
    """
    파일 하나에 대한 전처리 수행
    
    Args:
        file_path: 처리할 파일 경로
        num_frames: 비디오에서 추출할 프레임 수
    
    Returns:
        PreprocessOutput 객체
    """
    try:
        frames = read_rgb_frames(file_path, num_frames=num_frames)
              
        imgs: List[Image.Image] = []
        
        for rgb in frames:     
            imgs.append(get_full_frame_padded(Image.fromarray(rgb), TARGET_SIZE))
        
        return PreprocessOutput(file_path.name, imgs, None)
    
    except Exception as e:
        return PreprocessOutput(file_path.name, [], str(e))

# Model Load

In [7]:
print("Loading model...")
model = ViTForImageClassification.from_pretrained(MODEL_ID).to(DEVICE)
processor = ViTImageProcessor.from_pretrained(MODEL_ID,size={"height": 384, "width": 384}, do_resize=True)
model.eval()

print(f"Model loaded: {MODEL_ID}")
print(f"Model config: num_labels={model.config.num_labels}")
if hasattr(model.config, 'id2label'):
    print(f"id2label: {model.config.id2label}") #real:0,fake:1,

Loading model...
Model loaded: buildborderless/CommunityForensics-DeepfakeDet-ViT
Model config: num_labels=2
id2label: {0: 'LABEL_0', 1: 'LABEL_1'}


In [8]:
def infer_fake_probs(pil_images: List[Image.Image]) -> List[float]:
    if not pil_images:
        return []

    probs: List[float] = []

    with torch.inference_mode():
        inputs = processor(images=pil_images, return_tensors="pt",do_resize = False)  # get_full_frame_padded() 가 있으므로 resize 중복 방지
        inputs = {k: v.to(DEVICE, non_blocking=True) for k, v in inputs.items()}
        logits = model(**inputs).logits
        batch_probs = F.softmax(logits, dim=1)[:, 1]
        probs.extend(batch_probs.cpu().tolist())

    return probs

# Dataset 정의
- 학습시킬 dataset 처리

In [9]:
from torch.utils.data import Dataset
from pathlib import Path
from PIL import Image
import torch

In [10]:
from torch.utils.data import Dataset
from pathlib import Path
from PIL import Image

class DeepFakeImageDataset(Dataset):
    def __init__(self, root_dir, transform=None):
        self.root_dir = Path(root_dir)
        self.transform = transform
        self.samples = []

        for folder in self.root_dir.iterdir():
            if not folder.is_dir():
                continue
            if folder.name not in LABEL_MAP:
                continue

            label = LABEL_MAP[folder.name]

            for img_path in folder.rglob("*"):
                if img_path.suffix.lower() in [".jpg", ".jpeg", ".png"]:
                    self.samples.append((img_path, label))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        img_path, label = self.samples[idx]
        image = Image.open(img_path).convert("RGB")

        if self.transform:
            image = self.transform(image)

        return {
            "pixel_values": image,
            "labels": label
        }


In [11]:
# zip 파일만 다운로드
from huggingface_hub import snapshot_download

local_dir = "./deepfakeface_raw"

snapshot_download(
    repo_id="OpenRL/DeepFakeFace",
    repo_type="dataset",          # 매우 중요
    allow_patterns=["*.zip"],
    local_dir=local_dir,
    local_dir_use_symlinks=False
)


For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.
Fetching 4 files: 100%|██████████| 4/4 [00:00<00:00, 502.66it/s]


'C:\\Users\\yjneo\\workspace\\hecto_deepfake\\notebooks\\deepfakeface_raw'

In [13]:
# 압축해제
import zipfile
from pathlib import Path

raw_dir = Path("./deepfakeface_raw")
out_dir = Path("./deepfakeface_extracted")

out_dir.mkdir(exist_ok=True)

for zip_path in raw_dir.glob("*.zip"):
    target_dir = out_dir / zip_path.stem
    target_dir.mkdir(exist_ok=True)

    with zipfile.ZipFile(zip_path, "r") as z:
        z.extractall(target_dir)


In [12]:
# 라벨링 잘 됐는지 확인
LABEL_MAP = {
     "wiki": 0, # real 
     "inpainting": 1, # fake 
     "insight": 1, # fake 
     "text2img": 1, # fake 
     }
dataset = DeepFakeImageDataset("./deepfakeface_extracted")

from collections import Counter
labels = [label for _, label in dataset.samples]
print(Counter(labels))



Counter({1: 90000, 0: 30000})


In [13]:
# train/val 분할
from sklearn.model_selection import train_test_split
from torch.utils.data import Subset
import numpy as np

indices = np.arange(len(dataset))
labels = [label for _, label in dataset.samples]

train_idx, val_idx = train_test_split(
    indices,
    test_size=0.1,
    stratify=labels,
    random_state=42
)

train_dataset = Subset(dataset, train_idx)
val_dataset   = Subset(dataset, val_idx)

In [14]:
print(len(train_dataset))
print(len(val_dataset))

108000
12000


In [15]:
# 일부 데이터만 추출
from torch.utils.data import Subset
import numpy as np

ratio = 0.05   #일부만 사용
num_samples = int(len(train_dataset) * ratio)

indices = np.random.choice(
    len(train_dataset),
    size=num_samples,
    replace=False
)

small_train_dataset = Subset(train_dataset, indices)

In [16]:
from torch.utils.data import Subset
import numpy as np

val_ratio = 0.01
num_val = int(len(val_dataset) * val_ratio)

val_indices = np.random.choice(
    len(val_dataset),
    size=num_val,
    replace=False
)

small_val_dataset = Subset(val_dataset, val_indices)


In [17]:
print(len(small_train_dataset))
print(len(small_val_dataset))

5400
120


# train 준비 

In [18]:
num_epochs = 1

In [19]:
# collate_fn 학습과 추론 입력 구조를 동일하게 유지 

def collate_fn(batch):
    images, labels = zip(*batch)
    return {
        "pixel_values": torch.stack(images),
        "labels": torch.tensor(labels)
    }



In [20]:
# DataLoader
from torch.utils.data import DataLoader

train_loader = DataLoader(
    small_train_dataset,  # Train data 비율 조절하기
    batch_size=16,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
    collate_fn=collate_fn
)

val_loader = DataLoader(
    small_val_dataset,  # val data 비율 조절하기
    batch_size=16,
    shuffle=False,
    num_workers=4,
    pin_memory=True,
    collate_fn=collate_fn
)


In [21]:
# backbone 먼저 freeze
for param in model.vit.parameters():
    param.requires_grad = False

In [22]:
# unfreeze
N = 2  # 마지막 N개 block unfreeze
for layer in model.vit.encoder.layer[-N:]:
    for param in layer.parameters():
        param.requires_grad = True


# 학습 되는 layer 확인 (선택)
trainable = sum(p.requires_grad for p in model.parameters())
total = sum(1 for _ in model.parameters())
print(f"Trainable params: {trainable} / {total}")

Trainable params: 34 / 200


In [23]:
# Oprimizer/Scheduler
Learning_Rate = 1e-3
DEVICE = "cuda"
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=Learning_Rate,
    weight_decay=1e-4
)

scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer,
    T_max=len(train_loader) * num_epochs
)


In [24]:
# early stopping
class EarlyStopping:
    def __init__(self, patience=2, min_delta=0.0):
        self.patience = patience
        self.min_delta = min_delta
        self.best_score = None
        self.counter = 0

    def step(self, score):
        if self.best_score is None:
            self.best_score = score
            return False  # stop = False

        if score < self.best_score + self.min_delta:
            self.counter += 1
            if self.counter >= self.patience:
                return True  # stop = True
        else:
            self.best_score = score
            self.counter = 0

        return False


In [29]:
# training one epoch
def train_one_epoch(model, train_loader, optimizer, epoch, num_epochs):
    model.train()
    total_loss = 0.0

    batch_bar = tqdm(
        enumerate(train_loader),
        total=len(train_loader),
        desc=f"Epoch [{epoch+1}/{num_epochs}] Train",
        leave=False,
        position=1
    )

    for step, (inputs, labels) in batch_bar:
        inputs = inputs.cuda(non_blocking=True)
        labels = labels.cuda(non_blocking=True)

        optimizer.zero_grad()

        logits = model(inputs)
        loss = torch.nn.functional.binary_cross_entropy_with_logits(
            logits, labels.float()
        )

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        avg_loss = total_loss / (step + 1)

        batch_bar.set_postfix({
            "batch_loss": f"{loss.item():.4f}",
            "avg_loss": f"{avg_loss:.4f}",
            "lr": optimizer.param_groups[0]["lr"]
        })

    return total_loss / len(train_loader)


In [30]:
import torch.nn.functional as F

from sklearn.metrics import roc_auc_score


@torch.inference_mode()

def validate(model, loader):
    model.eval()
    total_loss = 0.0
    all_labels = []
    all_probs = []

    for batch in loader:
        batch = {k: v.to(DEVICE, non_blocking=True) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.item()

        probs = torch.softmax(outputs.logits, dim=1)[:, 1]  # fake probability
        all_probs.extend(probs.cpu().tolist())
        all_labels.extend(batch["labels"].cpu().tolist())

    val_loss = total_loss / len(loader)
    val_auc = roc_auc_score(all_labels, all_probs)
    return val_loss, val_auc



# training

In [31]:
# # wandb
# num_epochs = 10

# import wandb

# run = wandb.init(
#     entity="yjneon339-kyonggi-university",   # 팀명 또는 계정명
#     project="dacon_hecto_deepfake",          # 프로젝트명
#     config={
#         "learning_rate": Learning_Rate,
#         "architecture": MODEL_ID,
#         "dataset": 'kaggle',
#         "epochs": num_epochs,
#         "batch_size": train_loader.batch_size
#     }
# )


In [None]:
best_val_auc = 0.0
early_stopper = EarlyStopping(patience=2)
epoch_bar = tqdm(range(num_epochs), desc="Training", position=0)

for epoch in epoch_bar:
    # 1. Train (batch 로그는 내부에서 처리됨)
    train_loss = train_one_epoch(
        model=model,
        train_loader=train_loader,
        optimizer=optimizer,
        epoch=epoch,
        num_epochs=num_epochs
    )

    # 2. Validation
    val_loss, val_auc = validate(model, val_loader)

    # 3. Epoch 단위 로그
    epoch_bar.set_postfix({
        "train_loss": f"{train_loss:.4f}",
        "val_loss": f"{val_loss:.4f}",
        "val_auc": f"{val_auc:.4f}"
    })

    # 4. Best model 저장
    if val_auc > best_val_auc:
        best_val_auc = val_auc
        torch.save(model.state_dict(), "best_model.pt")

    # 5. Early stopping
    if early_stopper.step(val_auc):
        epoch_bar.write(
            f"Early stopping triggered at epoch {epoch+1} "
            f"(best val_auc={early_stopper.best_score:.4f})"
        )
        break


Training:   0%|          | 0/1 [00:00<?, ?it/s]

# Inference

In [27]:
# 1. 모델 아티팩트 생성
artifact = wandb.Artifact('unfreeze_2blocks_model', type='model')
artifact.add_file("unfreeze_2blocks_model.pt")

# 2. wandb에 로그
wandb.log_artifact(artifact)

ValueError: Path is not a file: 'unfreeze_2blocks_model.pt'

In [46]:
# 아티팩트 가져오기
import wandb

run = wandb.init(project="dacon_hecto_deepfake", job_type="inference")

artifact = run.use_artifact(
    "yjneon339-kyonggi-university/dacon_hecto_deepfake/unfreeze_2blocks_model:v0",
    type="model"
)

artifact_dir = artifact.download()
print(artifact_dir)


DEVICE = torch.device("cpu")
model = ViTForImageClassification.from_pretrained(MODEL_ID)
state_dict = torch.load(
    f"{artifact_dir}/unfreeze_2blocks_model.pt",
    map_location=DEVICE
)

model.load_state_dict(state_dict)
model.to(DEVICE)
model.eval()  

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


[34m[1mwandb[0m: Downloading large artifact 'unfreeze_2blocks_model:v0', 137.36MB. 1 files...
[34m[1mwandb[0m:   1 of 1 files downloaded.  
Done. 00:00:00.4 (366.3MB/s)


c:\Users\yjneo\workspace\hecto_deepfake\notebooks\artifacts\unfreeze_2blocks_model-v0


  state_dict = torch.load(


ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 384, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=384, out_features=384, bias=True)
              (key): Linear(in_features=384, out_features=384, bias=True)
              (value): Linear(in_features=384, out_features=384, bias=True)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=384, out_features=384, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=384, out_features=3072, bias=True)
            (intermed

In [49]:
TEST_DIR = Path("C:/Users/yjneo/Downloads/open/test_data")  # test 데이터 경로

In [50]:
files = sorted([p for p in TEST_DIR.iterdir() if p.is_file()])
print(f"Test data length: {len(files)}")

results: Dict[str, float] = {}

# 전처리 및 추론
for file_path in tqdm(files, desc="Processing"):
    out = preprocess_one(file_path)
    
    # 1. 에러 로깅
    if out.error:
        print(f"[WARN] {out.filename}: {out.error}")
    
    # 2. 정상 추론
    elif out.imgs:
        probs = infer_fake_probs(out.imgs)
        results[out.filename] = float(np.mean(probs)) if probs else 0.0
    
    # 3. 둘 다 없으면 0.0 (real)
    else:
        results[out.filename] = 0.0

print(f"Inference completed. Processed: {len(results)} files")

Test data length: 500


Processing: 100%|██████████| 500/500 [23:30<00:00,  2.82s/it]

Inference completed. Processed: 500 files





# Submission

In [51]:
submission = pd.read_csv('C:/Users/yjneo/workspace/hecto_deepfake/sample_submission.csv')
submission['prob'] = submission['filename'].map(results).fillna(0.0)

# CSV 저장
submission.to_csv(OUT_CSV, encoding='utf-8-sig', index=False)
print(f"Saved submission to: {OUT_CSV}")

Saved submission to: output\buildborderless_CommunityForensics-DeepfakeDet-ViT_submission.csv
