# ViT (Vision Transformer)

https://haystar.tistory.com/95

https://ivelopalways.tistory.com/entry/Deep-Learning-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0-Vision-TransformerViT-An-Image-is-Worth-16x16-Words-Transformers-for-Image-Recognition-at-Scale#google_vignette

DATASET
https://www.kaggle.com/competitions/isic-2024-challenge/data



# Data Processing

In [None]:
import os
import shutil
import pandas as pd

# Path configuration: source JPEG folder, metadata table, and the root of the
# class-per-folder layout that torchvision's ImageFolder expects.
image_dir = '/home/juhyun/Downloads/isic-2024-challenge/train-image/image'
meta_csv = '/home/juhyun/Downloads/isic-2024-challenge/train-metadata.csv'
output_root = 'converted-dataset/train'

# Load the metadata; only the `isic_id` and `target` columns are used below.
df = pd.read_csv(meta_csv)
labels = df['target'].astype(str)

# Create every class folder once up front instead of once per row.
for label in labels.unique():
    os.makedirs(os.path.join(output_root, f"class{label}"), exist_ok=True)

# Iterate the two needed columns directly; DataFrame.iterrows() builds a
# Series per row, which is very slow on a table of this size.
for isic_id, label in zip(df['isic_id'], labels):
    # Source image path
    src_path = os.path.join(image_dir, f"{isic_id}.jpg")
    # Destination class folder
    save_dir = os.path.join(output_root, f"class{label}")
    # Destination image path
    dst_path = os.path.join(save_dir, f"{isic_id}.jpg")

    try:
        shutil.copyfile(src_path, dst_path)
    except OSError as e:
        # Best effort: report the file that could not be copied and keep going.
        # Only filesystem errors are expected here; anything else is a real bug
        # and should surface.
        print(f" 복사 실패: {src_path} → {dst_path} : {e}")


In [None]:
import h5py
import pandas as pd
import numpy as np
import os
from PIL import Image
from io import BytesIO

# Paths: HDF5 archive with the encoded test images, its metadata table, and
# the folder that receives one JPEG per image id.
hdf5_path = '/home/juhyun/Downloads/isic-2024-challenge/test-image.hdf5'
meta_csv = '/home/juhyun/Downloads/isic-2024-challenge/test-metadata.csv'
output_dir = 'converted-dataset/test'
os.makedirs(output_dir, exist_ok=True)

# Load the metadata; it supplies the image ids used as HDF5 keys.
df = pd.read_csv(meta_csv)

# Decode every image listed in the metadata and write it out as a JPEG.
with h5py.File(hdf5_path, 'r') as hdf:
    for isic_id in df['isic_id']:
        out_path = os.path.join(output_dir, f"{isic_id}.jpg")
        try:
            raw = hdf[isic_id][()]
            # The dataset value may come back wrapped in an array/list
            # (presumably [[b'\xff\xd8...']] - TODO confirm); unwrap one level.
            if isinstance(raw, (np.ndarray, list)):
                raw = raw[0]
            Image.open(BytesIO(raw)).save(out_path)
        except Exception as e:
            # Best effort: report the id that failed and continue with the rest.
            print(f"❌ {isic_id} 변환 실패: {e}")


In [None]:
#----------------------------------------------------------------------------

In [None]:
# 값만 확인
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np
from torch import nn
from torch import Tensor
from PIL import Image
from torchvision.transforms import Compose, Resize, ToTensor
from einops import rearrange, reduce, repeat
from einops.layers.torch import Rearrange, Reduce
from torchsummary import summary


# Load one training image, resize it to the 288x288 model input size and turn
# it into a tensor with a leading batch axis.
x = Image.open('/home/juhyun/Desktop/digitfinal/converted-dataset/train/class0/ISIC_0015670.jpg').resize((288, 288))
tf_toTensor = ToTensor()
x = tf_toTensor(x).unsqueeze(0)  # add the batch dimension
x.shape

In [None]:
# Replace x with a random batch of 8 three-channel 288x288 inputs for the shape checks below.
x = torch.randn(8,3,288,288)


In [None]:
# Patch side length in pixels.
P = 18
# Number of patches for a 288x288 image, derived from P instead of repeating
# the literal 18: (288 / 18) ** 2 = 256.
N = (288 // P) ** 2

# The B x C x H x W input has to be reshaped to B x N x (P*P*C).
# einops.rearrange cuts the image into patches and flattens each patch in a
# single step.
patches = rearrange(x, 'b c (h s1) (w s2) -> b (h w) (s1 s2 c)', s1=P, s2=P)  # (b, N, P*P*c)

In [None]:
# Display the flattened-patch shape (b, N, P*P*c); expected (8, 256, 972) if x is still the 8x3x288x288 batch.
patches.shape

In [1]:
# >> PatchEmbedding, 파라미터는 직접 계산해서 지정함
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np
from torch import nn
from torch import Tensor
from PIL import Image
from torchvision.transforms import Compose, Resize, ToTensor
from einops import rearrange, reduce, repeat
from einops.layers.torch import Rearrange, Reduce
from torchsummary import summary


import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

from torch import nn
from torch import Tensor
from PIL import Image
from torchvision.transforms import Compose, Resize, ToTensor
from einops import rearrange, reduce, repeat
from einops.layers.torch import Rearrange, Reduce
from torchsummary import summary

class PatchEmbedding(nn.Module):
    """Turn an image batch into a sequence of patch tokens for the transformer.

    The image is cut into non-overlapping ``patch_size`` x ``patch_size``
    patches, each patch is projected to ``emb_size`` features, a learnable
    class token is prepended and a learnable position embedding is added.

    Args:
        in_channels: Number of channels of the input image.
        patch_size: Side length of a square patch, in pixels.
        emb_size: Feature size of every output token.
        img_size: Side length of the (square) input image, in pixels; it fixes
            the number of position embeddings.
    """

    def __init__(self, in_channels: int = 3, patch_size: int = 18, emb_size: int = 972, img_size: int = 288):
        # Initialise nn.Module before assigning any attribute so parameter and
        # sub-module registration is in place first.
        super().__init__()
        self.patch_size = patch_size

        # Patch projection: a convolution whose kernel and stride both equal
        # the patch size yields one emb_size-vector per patch; the rearrange
        # flattens the patch grid into a token axis.
        self.projection = nn.Sequential(
            nn.Conv2d(in_channels, emb_size, kernel_size=patch_size, stride=patch_size),
            Rearrange('b e (h) (w) -> b (h w) e'),
        )

        # nn.Parameter registers the tensor as a learnable parameter.
        # Learnable class token, shared by every sample in the batch.
        self.cls_token = nn.Parameter(torch.randn(1, 1, emb_size))

        # One learnable position vector per patch, plus one for the class token.
        self.positions = nn.Parameter(torch.randn((img_size // patch_size) ** 2 + 1, emb_size))

    def forward(self, x: Tensor) -> Tensor:
        """Embed an image batch.

        Args:
            x: Image batch of shape (batch, in_channels, height, width).

        Returns:
            Token tensor of shape (batch, num_patches + 1, emb_size).
        """
        b, _, _, _ = x.shape
        x = self.projection(x)
        # Repeat the single class token along the batch axis.
        cls_tokens = repeat(self.cls_token, '() n e -> b n e', b=b)

        # Prepend the class token to the patch tokens.
        x = torch.cat([cls_tokens, x], dim=1)

        # Add the position embedding out of place, so no activation is
        # modified in place under autograd.
        return x + self.positions

In [None]:
# Embed the current batch with the default PatchEmbedding.
# NOTE: this rebinds x from the image batch to the token tensor, so x must be
# reset to a 4-D image batch before this cell is run again.
embed = PatchEmbedding()
x = embed(x)

In [None]:
x.shape   # expected ([8, 257, 972]): 256 patch tokens + 1 cls token

In [None]:
#----------------------------------------------------
emb= 972
num_heads =9  

# Linear projections that produce the keys, queries and values from the token embeddings
keys= nn.Linear(emb,emb)
queries = nn.Linear(emb,emb)
values =nn.Linear(emb,emb)

# Apply the projections and split the feature axis into num_heads heads
# (9 heads here, 972 / 9 = 108 features each).
# NOTE: each name is rebound from its Linear layer to that layer's output tensor.
queries = rearrange(queries(x), 'b n (h d) -> b h n d', h=num_heads)
keys = rearrange(keys(x), 'b n (h d) -> b h n d', h=num_heads)
values = rearrange(values(x), 'b n (h d) -> b h n d', h=num_heads)

queries.shape, keys.shape, values.shape

In [None]:
# Scaled dot-product attention, computed step by step on the per-head
# queries / keys / values tensors of shape (batch, heads, tokens, head_dim).
emb_size= 972   # full embedding width (all heads concatenated)

# Q @ K^T for every head
energy = torch.einsum('bhqd, bhkd -> bhqk', queries, keys)
print("energy : ",energy.shape)

# Attention weights: the logits are divided by sqrt(d_k), where d_k is the
# per-head feature size, BEFORE the softmax. Dividing after the softmax (as
# this cell did previously) leaves the logits unscaled and makes each row of
# weights sum to 1/scaling instead of 1.
scaling = queries.shape[-1] ** 0.5
att = F.softmax(energy / scaling, dim=-1)
print("att : ", att.shape)

# Weighted sum of the values
out = torch.einsum('bhal, bhlv -> bhav',att, values)
print("out : ", out.shape)

# Concatenate the heads back into one emb_size-wide feature axis
out = rearrange(out, "b h n d -> b n (h d)")
print("out2 : ", out.shape)
#---------------------------------------------------------------------

In [2]:
# >> Transformer Encoder

import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

from torch import nn
from torch import Tensor
from PIL import Image
from torchvision.transforms import Compose, Resize, ToTensor
from einops import rearrange, reduce, repeat
from einops.layers.torch import Rearrange, Reduce
from torchsummary import summary


## Multihead attention.
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Args:
        emb_size: Feature size of the input and output tokens; must be
            divisible by ``num_heads``.
        num_heads: Number of attention heads.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, emb_size: int = 972, num_heads: int = 9, dropout: float = 0):
        super().__init__()
        self.emb_size = emb_size
        self.num_heads = num_heads
        # Queries, keys and values are produced by one fused linear layer.
        self.qkv = nn.Linear(emb_size, emb_size * 3)
        self.att_drop = nn.Dropout(dropout)
        self.projection = nn.Linear(emb_size, emb_size)

    def forward(self, x : Tensor, mask: Tensor = None) -> Tensor:
        """Apply self-attention to a token sequence.

        Args:
            x: Tokens of shape (batch, tokens, emb_size).
            mask: Optional boolean tensor broadcastable to
                (batch, heads, tokens, tokens). Positions where it is False
                are excluded from attention.

        Returns:
            Tensor of shape (batch, tokens, emb_size).
        """
        # Split the fused projection into queries, keys and values per head.
        qkv = rearrange(self.qkv(x), "b n (h d qkv) -> (qkv) b h n d", h=self.num_heads, qkv=3)
        queries, keys, values = qkv[0], qkv[1], qkv[2]

        # Attention logits scaled by sqrt(d_k), with d_k the per-head feature
        # size. The scale is applied before the softmax; dividing afterwards
        # would leave the logits unscaled and the weights would not sum to 1.
        scaling = queries.shape[-1] ** 0.5
        energy = torch.einsum('bhqd, bhkd -> bhqk', queries, keys) / scaling  # batch, num_heads, query_len, key_len

        if mask is not None:
            # masked_fill is out of place, so its result must be kept. The fill
            # value follows the logits' dtype so it stays finite in half precision.
            fill_value = torch.finfo(energy.dtype).min
            energy = energy.masked_fill(~mask, fill_value)

        att = F.softmax(energy, dim=-1)
        att = self.att_drop(att)

        # Weighted sum of the values for every query position.
        out = torch.einsum('bhal, bhlv -> bhav', att, values)

        # Concatenate the heads back into one emb_size-wide feature axis.
        out = rearrange(out, "b h n d -> b n (h d)")
        return self.projection(out)
    



# Residuals
class ResidualAdd(nn.Module):
    """Wrap a module with a skip connection: ``fn(x) + x``.

    Args:
        fn: Module (or callable) whose output has the same shape as its input.
    """

    def __init__(self, fn):
        super().__init__()
        self.fn = fn

    def forward(self, x, **kwargs):
        """Return ``fn(x, **kwargs) + x``.

        The sum is computed out of place. An in-place ``+=`` would overwrite
        the caller's tensor whenever ``fn`` returns its input unchanged
        (e.g. ``nn.Identity``), and raises if that tensor is a leaf that
        requires grad.
        """
        return self.fn(x, **kwargs) + x


# MLP (feed-forward) layer.
# Note: as an nn.Sequential subclass it inherits forward(), so none is defined here.
class FeedForwardBlock(nn.Sequential):
    """Two-layer MLP: Linear -> GELU -> Dropout -> Linear.

    Args:
        emb_size: Feature size of the input and of the output.
        expansion: The hidden layer is ``expansion * emb_size`` wide.
        drop_p: Dropout probability applied after the activation.
    """

    def __init__(self, emb_size: int, expansion: int = 4, drop_p: float = 0.):
        hidden_size = expansion * emb_size
        layers = [
            nn.Linear(emb_size, hidden_size),
            nn.GELU(),
            nn.Dropout(drop_p),
            nn.Linear(hidden_size, emb_size),
        ]
        super().__init__(*layers)

In [3]:
# >> Transformer Encoder Block


## load custom module ============> 따로 불러오기
#from layers.Multihead_attention import MultiHeadAttention, ResidualAdd, FeedForwardBlock
#from layers.patch_embedding import PatchEmbedding



class TransformerEncoderBlock(nn.Sequential):
    """One pre-norm transformer encoder layer.

    Two residual branches are applied in sequence: LayerNorm -> multi-head
    attention -> dropout, then LayerNorm -> feed-forward MLP -> dropout.

    Args:
        emb_size: Token feature size.
        drop_p: Dropout probability applied at the end of each branch.
        forward_expansion: Hidden-layer expansion factor of the MLP.
        forward_drop_p: Dropout probability inside the MLP.
        **kwargs: Forwarded to ``MultiHeadAttention`` (e.g. ``num_heads``).
    """

    def __init__(self,
                 emb_size: int = 972,
                 drop_p: float = 0.,
                 forward_expansion: int = 4,
                 forward_drop_p: float = 0.,
                 **kwargs):
        attention_branch = nn.Sequential(
            nn.LayerNorm(emb_size),
            MultiHeadAttention(emb_size, **kwargs),
            nn.Dropout(drop_p),
        )
        mlp_branch = nn.Sequential(
            nn.LayerNorm(emb_size),
            FeedForwardBlock(emb_size, expansion=forward_expansion, drop_p=forward_drop_p),
            nn.Dropout(drop_p),
        )
        super().__init__(ResidualAdd(attention_branch), ResidualAdd(mlp_branch))
        
class TransformerEncoder(nn.Sequential):
    """Stack of ``depth`` identically configured ``TransformerEncoderBlock`` layers.

    Args:
        depth: Number of encoder blocks.
        **kwargs: Forwarded unchanged to every ``TransformerEncoderBlock``.
    """

    def __init__(self, depth: int = 12, **kwargs):
        blocks = (TransformerEncoderBlock(**kwargs) for _ in range(depth))
        super().__init__(*blocks)

In [None]:
#-------------------------------------------------
# Smoke test: push a random image batch through the patch embedding and one
# encoder block, then display the output shape.
x = torch.randn(8,3,288,288)
patches_embedded = PatchEmbedding()(x)
TransformerEncoderBlock()(patches_embedded).shape
#-------------------------------------------------------

In [4]:
# >> Make MLP Head
from einops.layers.torch import Rearrange, Reduce

class ClassificationHead(nn.Sequential):
    """Classification head: mean over the token axis -> LayerNorm -> Linear.

    Args:
        emb_size: Feature size of the incoming tokens.
        n_classes: Number of output logits.
    """

    def __init__(self, emb_size: int = 972, n_classes: int = 2):
        # Average every token of the sequence into one feature vector per sample.
        pool = Reduce('b n e -> b e', reduction='mean')
        norm = nn.LayerNorm(emb_size)
        classifier = nn.Linear(emb_size, n_classes)
        super().__init__(pool, norm, classifier)

In [5]:
# >> 학습 코드
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np
import torch.optim as optim
import os

from torch import nn
from torch import Tensor
from PIL import Image
from torchvision.transforms import Compose, Resize, ToTensor
from einops import rearrange, reduce, repeat
from einops.layers.torch import Rearrange, Reduce
from torchsummary import summary
from torch.utils.data import random_split


# load custom module
#from layers.patch_embedding import PatchEmbedding
#from layers.Mlp_head import ClassificationHead
#from layers.Earlystopping import EarlyStopping
#from block.Encoder_Block import TransformerEncoder
#from PIL import Image

class ViT(nn.Sequential):
    """Vision Transformer: patch embedding -> transformer encoder -> classification head.

    Args:
        in_channels: Number of channels of the input image.
        patch_size: Side length of a square patch, in pixels.
        emb_size: Token feature size used throughout the model.
        img_size: Side length of the (square) input image, in pixels.
        depth: Number of encoder blocks.
        n_classes: Number of output logits.
        **kwargs: Forwarded to ``TransformerEncoder``.
    """

    def __init__(self,
                in_channels: int = 3,
                patch_size: int = 18,
                emb_size: int = 972,
                img_size: int = 288,
                depth: int = 12,
                n_classes: int = 2,
                **kwargs):
        embedding = PatchEmbedding(in_channels, patch_size, emb_size, img_size)
        encoder = TransformerEncoder(depth, emb_size=emb_size, **kwargs)
        head = ClassificationHead(emb_size, n_classes)
        super().__init__(embedding, encoder, head)

In [6]:
# Print a layer-by-layer summary (output shapes and parameter counts) of the
# default ViT for a 3x288x288 input, on the CPU.
summary(ViT(), (3,288,288), device='cpu')

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1          [-1, 972, 16, 16]         945,756
         Rearrange-2             [-1, 256, 972]               0
    PatchEmbedding-3             [-1, 257, 972]               0
         LayerNorm-4             [-1, 257, 972]           1,944
            Linear-5            [-1, 257, 2916]       2,837,268
           Dropout-6          [-1, 9, 257, 257]               0
            Linear-7             [-1, 257, 972]         945,756
MultiHeadAttention-8             [-1, 257, 972]               0
           Dropout-9             [-1, 257, 972]               0
      ResidualAdd-10             [-1, 257, 972]               0
        LayerNorm-11             [-1, 257, 972]           1,944
           Linear-12            [-1, 257, 3888]       3,783,024
             GELU-13            [-1, 257, 3888]               0
          Dropout-14            [-1, 25

# 데이터 학습

In [None]:
import os
import torch
import torchvision
import torch.nn as nn
from torch.utils.data import DataLoader, random_split, WeightedRandomSampler
from torchvision import datasets, transforms
import torch.optim as optim
from collections import Counter
from early import *  # 조기 종료 콜백

# 1. Preprocessing: resize to the model input size, convert to a tensor and
#    normalise every channel with mean 0.5 / std 0.5.
transform = transforms.Compose([
    transforms.Resize((288, 288), antialias=True),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

batch_size = 64
data_dir = '/home/juhyun/Desktop/digitfinal/converted-dataset/train'  # contains class0/ and class1/

# 2. Load the whole dataset with ImageFolder (one sub-folder per class).
dataset = datasets.ImageFolder(root=data_dir, transform=transform)
classes = dataset.classes
print(f"클래스: {classes}")  # e.g. ['class0', 'class1']

# 3. Split 7:2:1 into train / validation / test.
total_size = len(dataset)
train_size = int(0.7 * total_size)
val_size = int(0.2 * total_size)
test_size = total_size - train_size - val_size

# A fixed seed keeps the three subsets identical across sessions. Without it,
# a checkpoint reloaded in a new session would be evaluated on a "test" split
# that contains images it was trained on.
split_seed = 42
trainset, valset, testset = random_split(
    dataset,
    [train_size, val_size, test_size],
    generator=torch.Generator().manual_seed(split_seed),
)
print(f"➡️ Train: {len(trainset)}, Val: {len(valset)}, Test: {len(testset)}")

# 4. WeightedRandomSampler, built from the training subset only.
train_targets = [dataset.targets[i] for i in trainset.indices]
train_class_counts = Counter(train_targets)
print(f"Train Class Count: {train_class_counts}")

# Inverse-frequency weight per class, for however many classes ImageFolder found.
class_counts = torch.tensor(
    [train_class_counts[c] for c in range(len(classes))], dtype=torch.float
)
class_weights = 1. / class_counts
# One weight per training sample, looked up with a single indexing operation
# instead of a Python loop over every sample.
sample_weights = class_weights[torch.tensor(train_targets, dtype=torch.long)]
sampler = WeightedRandomSampler(weights=sample_weights, num_samples=len(sample_weights), replacement=True)

# 5. DataLoaders: the sampler rebalances the training batches only;
#    validation and test are read in order.
trainloader = DataLoader(trainset, batch_size=batch_size, sampler=sampler, num_workers=2)
valloader = DataLoader(valset, batch_size=batch_size, shuffle=False, num_workers=2)
testloader = DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=2)


In [64]:
from early import * 

# ✅ 모델/손실/최적화 설정2
# Use the first GPU when one is available and fall back to the CPU otherwise,
# so the notebook still runs (slowly) on a machine without CUDA.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
vit = ViT(in_channels=3, patch_size=18, emb_size=972, img_size=288, depth=12, n_classes=2).to(device)

# Training hyper-parameters.
epochs = 1000
lr = 0.001
patience = 10
# EarlyStopping comes from the local `early` module; presumably it tracks the
# validation loss passed to it each epoch - confirm against that module.
early_stopping = EarlyStopping(patience=patience, verbose=True)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(vit.parameters(), lr=lr, momentum=0.9)

# Checkpoint folder and the best validation loss seen so far.
os.makedirs('./pt', exist_ok=True)
best_val_loss = float('inf')



def train(model, train_loader, optimizer, log_interval):
    """Run one training epoch over ``train_loader``.

    Relies on the module-level ``device`` and ``criterion``, and reads the
    module-level ``epoch`` for the progress message.

    Args:
        model: Network to train; switched to training mode.
        train_loader: Iterable yielding ``(image, label)`` batches.
        optimizer: Optimizer that updates ``model``'s parameters.
        log_interval: Progress is printed every ``log_interval`` batches.
    """
    model.train()
    for step, (images, targets) in enumerate(train_loader):
        images = images.to(device)
        targets = targets.to(device).long()

        optimizer.zero_grad()
        loss = criterion(model(images), targets)
        loss.backward()
        optimizer.step()

        if step % log_interval != 0:
            continue

        # Progress figures for the log line.
        seen = step * len(images)
        total = len(train_loader.dataset)
        percent = 100 * step / len(train_loader)
        print(f"Train Epoch: {epoch}[{seen}/{total}({percent:.0f}%)]\t Train Loss : {loss.item():.6f}")

def evaluate(model, test_loader):
    """Compute the average loss and the accuracy of ``model`` on ``test_loader``.

    Relies on the module-level ``device`` and ``criterion``.

    Args:
        model: Network to evaluate; switched to evaluation mode.
        test_loader: Iterable yielding ``(image, label)`` batches.

    Returns:
        Tuple ``(loss, accuracy)``: the per-sample average loss and the
        accuracy in percent.
    """
    model.eval()
    total_loss, correct = 0.0, 0
    with torch.no_grad():
        for image, label in test_loader:
            image, label = image.to(device), label.to(device).long()
            output = model(image)
            # criterion returns the batch MEAN (assuming its default 'mean'
            # reduction). Weight it by the batch size so that dividing by the
            # sample count below gives a true per-sample average; summing the
            # raw means made the result too small by about the batch size.
            total_loss += criterion(output, label).item() * label.size(0)
            prediction = output.argmax(dim=1)
            correct += prediction.eq(label).sum().item()

    n_samples = len(test_loader.dataset)
    test_loss = total_loss / n_samples
    test_accuracy = 100. * correct / n_samples
    return test_loss, test_accuracy



In [None]:
# ✅ 학습 루프
# Main training loop: one training pass and one validation pass per epoch,
# a checkpoint whenever the validation loss improves, and early stopping.
for epoch in range(1, epochs + 1):
    train(vit, trainloader, optimizer, log_interval=5)
    val_loss, val_acc = evaluate(vit, valloader)
    print(f"\n[Epoch: {epoch}],\t Val Loss : {val_loss:.4f},\t Val Accuracy : {val_acc:.2f} %\n")

    improved = val_loss < best_val_loss
    if improved:
        best_val_loss = val_loss
        checkpoint_path = f'./pt/model_epoch_{epoch}_Accuracy_{val_acc:.2f}.pt'
        torch.save(vit.state_dict(), checkpoint_path)

    early_stopping(val_loss, vit)
    if not early_stopping.early_stop:
        continue
    print("Early stopping triggered.")
    break


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

def evaluate_2(model, test_loader):
    """Evaluate ``model`` on ``test_loader`` and return a dict of metrics.

    Relies on the module-level ``device`` and ``criterion``.

    Args:
        model: Network to evaluate; switched to evaluation mode.
        test_loader: Iterable yielding ``(image, label)`` batches.

    Returns:
        Dict with keys ``'loss'`` (per-sample average), ``'accuracy'``
        (percent), ``'precision'``, ``'recall'``, ``'f1'`` and ``'auroc'``.
        ``'auroc'`` is NaN when scikit-learn cannot compute it.
    """
    model.eval()
    total_loss, correct = 0.0, 0
    all_preds = []
    all_labels = []
    all_probs = []

    with torch.no_grad():
        for image, label in test_loader:
            image, label = image.to(device), label.to(device).long()
            output = model(image)
            # criterion returns the batch MEAN (assuming its default 'mean'
            # reduction). Weight it by the batch size so that dividing by the
            # sample count below gives a true per-sample average.
            total_loss += criterion(output, label).item() * label.size(0)

            prob = torch.softmax(output, dim=1)[:, 1]  # probability of class 1
            pred = torch.argmax(output, dim=1)

            all_probs.extend(prob.cpu().numpy())
            all_preds.extend(pred.cpu().numpy())
            all_labels.extend(label.cpu().numpy())
            correct += pred.eq(label).sum().item()

    n_samples = len(test_loader.dataset)
    test_loss = total_loss / n_samples
    test_accuracy = 100. * correct / n_samples

    # Additional classification metrics.
    precision = precision_score(all_labels, all_preds, zero_division=0)
    recall = recall_score(all_labels, all_preds, zero_division=0)
    f1 = f1_score(all_labels, all_preds, zero_division=0)
    try:
        auroc = roc_auc_score(all_labels, all_probs)
    except ValueError:
        # Raised e.g. when only one class is present in the labels.
        auroc = float('nan')

    # Return every metric.
    return {
        'loss': test_loss,
        'accuracy': test_accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'auroc': auroc
    }


In [None]:
# Final evaluation on the held-out test split.
# evaluate_2 already returns the loss and the accuracy alongside the other
# metrics, so one pass over the test set is enough; previously evaluate() ran a
# second full pass to get the same two numbers.
print("🔍 Final Test Evaluation...")
re= evaluate_2(vit, testloader)
test_loss, test_acc = re['loss'], re['accuracy']
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.2f}%")
print(re)
