In [1]:
from os import path, rename, mkdir, listdir, makedirs

import torch
from torch import nn, optim
from torch.utils.data import DataLoader

from torchvision import datasets, utils, transforms, models

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import pygwalker as pyg
import wandb

# Replace the `tqdm` name inside torchvision.datasets.utils with the notebook
# tqdm imported above (presumably so download progress renders as a widget).
datasets.utils.tqdm = tqdm
%matplotlib inline

# WandB Initialization
# NOTE: this starts a run in the given project as soon as the cell executes.
wandb.init(project="dAiv-ai-competition-2024-pro")

# Set CUDA Device Number 0~7
# NOTE(review): the index is hard-coded; torch.cuda.set_device is called with it
# whenever CUDA is available, so it must be a valid index on the host machine.
DEVICE_NUM = 7

# Default to CPU; switch to the selected CUDA device when CUDA is available.
device = torch.device("cpu")
if torch.cuda.is_available():
    torch.cuda.set_device(DEVICE_NUM)
    # Plain "cuda" (no index) is used after set_device(DEVICE_NUM) above.
    device = torch.device("cuda")
print("INFO: Using device -", device)

from typing import Callable, Optional
from sklearn.model_selection import train_test_split


class ImageDataset(datasets.ImageFolder):
    """ImageFolder dataset over one sub-folder of the competition archive.

    The archive is fetched with :meth:`download` and then one sub-folder of
    ``root`` is opened as an ``ImageFolder``:

    * ``train=True`` or ``valid=True`` -> ``<root>/train``; the samples are then
      reduced to the training or the validation part of a stratified split
      (``train`` wins when both flags are set),
    * ``test=True`` -> ``<root>/test``,
    * ``unlabeled=True`` -> ``<root>/unlabeled``.

    Args:
        root: Directory that holds (or will hold) ``archive.zip`` and its contents.
        force_download: Passed to :meth:`download` as ``force``.
        train: Use the training part of the train/validation split.
        valid: Use the validation part of the train/validation split.
        split_ratio: Fraction of ``<root>/train`` kept for training; the split
            is made with ``test_size = 1 - split_ratio``.
        test: Use ``<root>/test``.
        unlabeled: Use ``<root>/unlabeled``.
        transform: Forwarded to ``ImageFolder``.
        target_transform: Forwarded to ``ImageFolder``.

    Raises:
        ValueError: If none of ``train``, ``valid``, ``test``, ``unlabeled`` is set.
    """

    download_url = "https://daiv-cnu.duckdns.org/contest/ai_competition[2024]_pro/dataset/archive.zip"
    random_state = 20241028  # seed handed to train_test_split so the split is repeatable

    def __init__(
            self, root: str, force_download: bool = True,
            train: bool = False, valid: bool = False, split_ratio: float = 0.8,
            test: bool = False, unlabeled: bool = False,
            transform: Optional[Callable] = None, target_transform: Optional[Callable] = None
    ):
        # Fail early with a readable message. Previously this case reached
        # path.join(root, None) and raised a TypeError after the download ran.
        if not (train or valid or test or unlabeled):
            raise ValueError(
                "No subset selected: set one of train, valid, test or unlabeled to True."
            )

        self.download(root, force=force_download)  # Download Dataset from server

        # Same precedence as before: train/valid, then test, then unlabeled.
        if train or valid:
            subset = "train"
        elif test:
            subset = "test"
        else:
            subset = "unlabeled"
        root = path.join(root, subset)

        # Initialize ImageFolder
        super().__init__(root=root, transform=transform, target_transform=target_transform)

        if train or valid:  # Split Train and Validation Set
            # train_test_split returns [samples_a, samples_b, targets_a, targets_b];
            # the "a" part is the training part, the "b" part the validation part.
            seperated = train_test_split(
                self.samples, self.targets, test_size=1-split_ratio, stratify=self.targets, random_state=self.random_state
            )
            self.samples, self.targets = (seperated[0], seperated[2]) if train else (seperated[1], seperated[3])
            self.imgs = self.samples  # keep the ImageFolder alias in sync with samples

    @property
    def df(self) -> pd.DataFrame:
        """Return a DataFrame with one row per sample: file ``path`` and class-name ``label``."""
        return pd.DataFrame(dict(path=[d[0] for d in self.samples], label=[self.classes[lb] for lb in self.targets]))

    @classmethod
    def download(cls, root: str, force: bool = False):
        """Download and extract the archive, then arrange ``test``/``unlabeled``.

        Every regular file found directly inside ``<root>/test`` and
        ``<root>/unlabeled`` is moved into a folder of its own, named after the
        file with ``".jpg"`` removed (presumably because ``ImageFolder`` needs
        one sub-folder per class).

        The arrangement step is safe to repeat: folders created by an earlier
        run are skipped, and a file that is extracted again replaces the copy
        that was moved before.

        Args:
            root: Directory used for both the download and the extraction.
            force: Run the download/extract step even when ``archive.zip`` exists.
        """
        # Local import: os.replace overwrites an existing destination on every platform.
        from os import replace as replace_file

        if force or not path.isfile(path.join(root, "archive.zip")):
            # Download and Extract Dataset
            datasets.utils.download_and_extract_archive(cls.download_url, download_root=root, extract_root=root, filename="archive.zip")

            # Arrange Dataset Directory
            for target_dir in [path.join(root, "test"), path.join(root, "unlabeled")]:
                for file in listdir(target_dir):
                    source = path.join(target_dir, file)
                    if not path.isfile(source):
                        # A per-image folder from a previous run; nothing to move.
                        continue
                    image_dir = path.join(target_dir, file.replace(".jpg", ""))
                    makedirs(image_dir, exist_ok=True)
                    replace_file(source, path.join(image_dir, file))

            print("INFO: Dataset archive downloaded and extracted.")
        else:
            print("INFO: Dataset archive found in the root directory. Skipping download.")

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mrnoro5122[0m ([33mrnoro5122-chungnam-national-university[0m). Use [1m`wandb login --relogin`[0m to force relogin


INFO: Using device - cuda


In [2]:
# Import libraries and load Datasets has already been completed.
# Image Resizing and Tensor Conversion
# Target resolution and the ImageNet channel statistics used for normalisation
IMG_SIZE = (256, 256)
IMG_NORM = {
    "mean": [0.485, 0.456, 0.406],
    "std": [0.229, 0.224, 0.225],
}

# Deterministic preprocessing: resize -> tensor -> normalise
resizer = transforms.Compose([
    transforms.Resize(IMG_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(mean=IMG_NORM["mean"], std=IMG_NORM["std"]),
])

DATA_ROOT = path.join(".", "data")

# All four subsets are first loaded with the deterministic pipeline.
train_dataset = ImageDataset(DATA_ROOT, force_download=False, train=True, transform=resizer)
valid_dataset = ImageDataset(DATA_ROOT, force_download=False, valid=True, transform=resizer)

test_dataset = ImageDataset(DATA_ROOT, force_download=False, test=True, transform=resizer)
unlabeled_dataset = ImageDataset(DATA_ROOT, force_download=False, unlabeled=True, transform=resizer)

print(f"INFO: Dataset loaded successfully. Number of samples - Train({len(train_dataset)}), Valid({len(valid_dataset)}), Test({len(test_dataset)}), Unlabeled({len(unlabeled_dataset)})")

# Augmentation strength (rotation in degrees, colour jitter factor)
ROTATE_ANGLE = 20
COLOR_TRANSFORM = 0.1

# Random pipeline used for training only: crop/flip/rotate/jitter -> tensor -> normalise
augmenter = transforms.Compose([
    transforms.RandomResizedCrop(IMG_SIZE, scale=(0.8, 1.0), ratio=(0.75, 1.333)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(ROTATE_ANGLE),
    # The same jitter factor is applied to every colour property.
    transforms.ColorJitter(
        **dict.fromkeys(("brightness", "contrast", "saturation", "hue"), COLOR_TRANSFORM)
    ),
    transforms.ToTensor(),
    transforms.Normalize(mean=IMG_NORM["mean"], std=IMG_NORM["std"]),
])

# The training subset is rebuilt so that it uses the augmenting pipeline.
train_dataset = ImageDataset(DATA_ROOT, force_download=False, train=True, transform=augmenter)

print(f"INFO: Train dataset has been overridden with augmented state. Number of samples - Train({len(train_dataset)})")

# Label constants: one more label than there are class folders, combined in pairs
COMBINATION_AXIS = 2
CLASS_LABELS = 1 + len(train_dataset.classes)

import itertools

class LabelTransformer:
    """Lookup table from a combination id to a tuple of label ids.

    The table is laid out as follows:

    * first one pair ``(-1, n)`` for every ``n`` in ``0 .. num_classes - 1``,
      followed by ``(-1, -2)``;
    * then every ``comb_axis``-sized combination drawn from
      ``(-2, 0, 1, ..., num_classes - 1)``, in ``itertools.combinations`` order.

    Attributes:
        num_classes: Value passed to the constructor.
        comb_axis: Size of the tuples in the second part of the table.
        combinations: The table itself (a list of tuples).
        num_combinations: ``len(combinations)``.
    """

    def __init__(self, num_classes: int, comb_axis: int):
        self.num_classes = num_classes
        self.comb_axis = comb_axis

        class_ids = list(range(num_classes))

        table = []
        for second in class_ids + [-2]:
            table.append((-1, second))
        table.extend(itertools.combinations([-2] + class_ids, comb_axis))

        self.combinations = table
        self.num_combinations = len(table)

    def find(self, comb_id):
        """Return the label tuple stored at index ``comb_id``."""
        return self.combinations[comb_id]

# Shared label lookup table built from the label constants
label_transformer = LabelTransformer(num_classes=CLASS_LABELS, comb_axis=COMBINATION_AXIS)

# DataLoader settings
BATCH_SIZE = 300  # samples per batch

MULTI_PROCESSING = True  # switch to False when worker processes cause trouble

from platform import system
if not MULTI_PROCESSING or system() == "Windows":
    # Explicit opt-out, or Windows: load batches in the main process.
    cpu_cores = 0
    print("INFO: Using DataLoader without multi-processing.")
else:
    import multiprocessing
    cpu_cores = multiprocessing.cpu_count()
    print(f"INFO: Number of CPU cores - {cpu_cores}")

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=BATCH_SIZE, num_workers=cpu_cores)
valid_loader = DataLoader(dataset=valid_dataset, shuffle=False, batch_size=BATCH_SIZE, num_workers=cpu_cores)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=BATCH_SIZE, num_workers=cpu_cores)

#Define Model
# class SelfAttention(nn.Module):
#     def __init__(self, hidden_dim):
#         super().__init__()
#         self.query = nn.Linear(hidden_dim, hidden_dim)
#         self.key = nn.Linear(hidden_dim, hidden_dim)
#         self.value = nn.Linear(hidden_dim, hidden_dim)
#         self.scale = 1 / math.sqrt(hidden_dim)
#         self.softmax = nn.Softmax(dim=-1)
# 
#     def forward(self, x):
#         # x의 형태: (batch_size, seq_len, hidden_dim)
#         Q = self.query(x)  # (batch_size, seq_len, hidden_dim)
#         K = self.key(x)    # (batch_size, seq_len, hidden_dim)
#         V = self.value(x)  # (batch_size, seq_len, hidden_dim)
# 
#         # Scaled Dot-Product Attention
#         scores = torch.matmul(Q, K.transpose(-2, -1)) * self.scale  # (batch_size, seq_len, seq_len)
#         attention_weights = self.softmax(scores)  # (batch_size, seq_len, seq_len)
#         context = torch.matmul(attention_weights, V)  # (batch_size, seq_len, hidden_dim)
#         return context

# class VisualEmbedding(nn.Module):
#     """ Visual Embedding Model """
# 
#     def __init__(self, embedding_dim: int):
#         super().__init__()
# 
#         self.resnet = models.resnet34(pretrained=True)
#         self.resnet.avgpool = nn.AdaptiveMaxPool2d((1, 1))
#         self.resnet.fc = nn.Linear(self.resnet.fc.in_features, embedding_dim)
# 
#     def forward(self, x):
#         return self.resnet(x)

# class VisualEmbedding(nn.Module):
#     """ Visual Embedding Model """
#     def __init__(self, embedding_dim: int):
#         super().__init__()
#         resnet = models.resnet34(pretrained=True)
#         self.features = nn.Sequential(*list(resnet.children())[:-2])  # 마지막 두 레이어 제거
#         self.conv = nn.Conv2d(512, embedding_dim, kernel_size=1)  # 채널 수 조정
# 
#     def forward(self, x):
#         x = self.features(x)  # (batch_size, 512, H, W)
#         x = self.conv(x)      # (batch_size, embedding_dim, H, W)
#         return x

# class ImageClassifier(nn.Module):
#     def __init__(self, embedding_dim: int, comb_axis: int, num_combinations: int, num_classes: int):
#         super().__init__()
# 
#         # Visual Embedding
#         self.visual_embedding = VisualEmbedding(embedding_dim)
#         self.hidden_size = embedding_dim
#         self.semantic_embedding = nn.Sequential(
#             nn.LayerNorm(self.hidden_size),
#             nn.Linear(self.hidden_size, self.hidden_size // 2),
#             nn.ReLU(),
#             nn.Dropout(0.3),
#             nn.Linear(self.hidden_size // 2, comb_axis),
#             nn.Sigmoid()
#         )
#         self.converter = nn.Linear(comb_axis, num_combinations)
# 
#     def forward(self, x) -> torch.Tensor:
#         embedding = self.visual_embedding(x)
#         embedding = self.semantic_embedding(embedding)
#         logits = self.converter(embedding)
#         return logits

# class ImageClassifier(nn.Module):
#     def __init__(self, embedding_dim: int, comb_axis: int, num_combinations: int, num_classes: int):
#         super().__init__()
#         self.visual_embedding = VisualEmbedding(embedding_dim)
#         self.self_attention = SelfAttention(embedding_dim)
#         self.flatten = nn.Flatten()
#         self.hidden_size = embedding_dim
#         self.semantic_embedding = nn.Sequential(
#             nn.LayerNorm(self.hidden_size),
#             nn.Linear(self.hidden_size, self.hidden_size // 2),
#             nn.ReLU(),
#             nn.Dropout(0.3),
#             nn.Linear(self.hidden_size // 2, comb_axis),
#             nn.Sigmoid()
#         )
#         self.converter = nn.Linear(comb_axis, num_combinations)
# 
#     def forward(self, x):
#         feature_map = self.visual_embedding(x)  # (batch_size, embedding_dim, H, W)
#         batch_size, embedding_dim, H, W = feature_map.size()
# 
#         # 피처 맵을 시퀀스로 변환
#         feature_seq = feature_map.view(batch_size, embedding_dim, -1).permute(0, 2, 1)  # (batch_size, seq_len, embedding_dim)
# 
#         # Self-Attention 적용
#         attended_features = self.self_attention(feature_seq)  # (batch_size, seq_len, embedding_dim)
# 
#         # 시퀀스를 하나의 벡터로 변환 (평균 풀링)
#         embedding = torch.mean(attended_features, dim=1)  # (batch_size, embedding_dim)
# 
#         # 기존의 semantic_embedding과 converter 적용
#         embedding = self.semantic_embedding(embedding)
#         logits = self.converter(embedding)
#         return logits

INFO: Dataset archive found in the root directory. Skipping download.
INFO: Dataset archive found in the root directory. Skipping download.
INFO: Dataset archive found in the root directory. Skipping download.
INFO: Dataset archive found in the root directory. Skipping download.
INFO: Dataset loaded successfully. Number of samples - Train(7478), Valid(1870), Test(1110), Unlabeled(380)
INFO: Dataset archive found in the root directory. Skipping download.
INFO: Train dataset has been overridden with augmented state. Number of samples - Train(7478)
INFO: Number of CPU cores - 48


In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from torchvision import models

# Multi-Head Self-Attention 클래스 정의
class MultiHeadSelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Input and output both have shape ``(batch_size, seq_len, hidden_dim)``.
    ``hidden_dim`` is split evenly over ``num_heads`` heads; scores are scaled
    by ``1 / sqrt(head_dim)`` before the softmax, and the concatenated heads
    pass through a final linear projection.

    Args:
        hidden_dim: Feature size of the input and the output.
        num_heads: Number of attention heads; must divide ``hidden_dim``.
    """

    def __init__(self, hidden_dim, num_heads):
        super().__init__()
        assert hidden_dim % num_heads == 0, "hidden_dim은 num_heads로 나누어 떨어져야 합니다."
        self.num_heads = num_heads
        self.head_dim = hidden_dim // num_heads

        # Projections are created in the order query, key, value, out_proj.
        self.query = nn.Linear(hidden_dim, hidden_dim)
        self.key = nn.Linear(hidden_dim, hidden_dim)
        self.value = nn.Linear(hidden_dim, hidden_dim)
        self.out_proj = nn.Linear(hidden_dim, hidden_dim)
        self.scale = 1 / math.sqrt(self.head_dim)

    def _split_heads(self, projected, batch_size, seq_len):
        """Reshape (batch, seq, hidden) into (batch, heads, seq, head_dim)."""
        per_head = projected.view(batch_size, seq_len, self.num_heads, self.head_dim)
        return per_head.transpose(1, 2)

    def forward(self, x):
        batch_size, seq_len, _ = x.shape

        # Project the input and split each projection into heads.
        q, k, v = (
            self._split_heads(projection(x), batch_size, seq_len)
            for projection in (self.query, self.key, self.value)
        )

        # Scaled dot-product attention: weights are (batch, heads, seq, seq).
        weights = ((q @ k.transpose(-2, -1)) * self.scale).softmax(dim=-1)
        per_head_context = weights @ v  # (batch, heads, seq, head_dim)

        # Put the heads back next to each other: (batch, seq, hidden_dim).
        merged = per_head_context.transpose(1, 2).contiguous().view(batch_size, seq_len, -1)
        return self.out_proj(merged)

# 2D 위치 인코딩 클래스 정의
class PositionalEncoding2D(nn.Module):
    """Coordinate-based 2D positional encoding.

    For an input of shape ``(batch_size, channels, height, width)`` the module
    returns a tensor of shape ``(batch_size, hidden_dim, height, width)``:

    * the first ``ceil(hidden_dim / 2)`` channels hold the normalised column
      coordinate (``0`` at the left edge, ``1`` at the right edge),
    * the remaining ``hidden_dim // 2`` channels hold the normalised row
      coordinate (``0`` at the top, ``1`` at the bottom).

    Every channel of a group carries the same values. Only the shape and the
    device of the input are used; its values are ignored.

    Args:
        hidden_dim: Number of channels of the returned encoding.
    """

    def __init__(self, hidden_dim):
        super().__init__()
        self.hidden_dim = hidden_dim

    def forward(self, x):
        batch_size, _, height, width = x.size()
        device = x.device

        # Split the channels so that they always add up to hidden_dim. The
        # previous ``hidden_dim // 2`` for both halves lost one channel when
        # hidden_dim was odd. For an even hidden_dim both halves are unchanged.
        x_channels = (self.hidden_dim + 1) // 2
        y_channels = self.hidden_dim - x_channels

        # Normalised coordinates along each axis
        rows = torch.linspace(0, 1, steps=height, device=device)
        cols = torch.linspace(0, 1, steps=width, device=device)

        # expand() creates broadcast views, so no copies are made before the cat.
        y_pos = rows.view(1, 1, height, 1).expand(batch_size, y_channels, height, width)
        x_pos = cols.view(1, 1, 1, width).expand(batch_size, x_channels, height, width)

        # Column coordinates first, then row coordinates
        pos_encoding = torch.cat([x_pos, y_pos], dim=1)
        return pos_encoding  # (batch_size, hidden_dim, height, width)

# Visual Embedding 클래스 정의
class VisualEmbedding(nn.Module):
    """ResNet-34 feature extractor followed by a 1x1 projection.

    The output is a feature map of shape ``(batch_size, embedding_dim, H, W)``.

    Args:
        embedding_dim: Number of channels of the returned feature map.
    """

    def __init__(self, embedding_dim: int):
        super().__init__()
        backbone = models.resnet34(pretrained=True)
        # Keep every child module except the last two (presumably the pooling
        # and classification layers), so the output stays a spatial map.
        trunk = list(backbone.children())[:-2]
        self.features = nn.Sequential(*trunk)
        # 1x1 convolution that maps the 512 backbone channels to embedding_dim
        self.conv = nn.Conv2d(512, embedding_dim, kernel_size=1)
        self.bn = nn.BatchNorm2d(embedding_dim)
        self.relu = nn.ReLU()

    def forward(self, x):
        backbone_map = self.features(x)  # (batch_size, 512, H, W)
        projected = self.bn(self.conv(backbone_map))
        return self.relu(projected)  # (batch_size, embedding_dim, H, W)

# ImageClassifier 클래스 정의
class ImageClassifier(nn.Module):
    """Image classifier: visual embedding -> self-attention -> small bottleneck.

    Pipeline of ``forward``:

    1. ``VisualEmbedding`` produces a ``(batch, embedding_dim, H, W)`` map.
    2. ``PositionalEncoding2D`` is added to that map.
    3. The map is flattened to ``(batch, H * W, embedding_dim)`` tokens and sent
       through LayerNorm -> 8-head self-attention -> LayerNorm, with a residual
       connection around that stack.
    4. Tokens are max-pooled over the sequence axis.
    5. ``semantic_embedding`` squeezes the pooled vector to ``comb_axis`` values
       in ``(0, 1)`` (sigmoid), and ``converter`` maps them to logits.

    Args:
        embedding_dim: Channel size of the visual embedding; it has to be
            divisible by the 8 attention heads.
        comb_axis: Width of the sigmoid bottleneck.
        num_combinations: Number of output logits.
        num_classes: Accepted for interface compatibility; not used here.
    """

    def __init__(self, embedding_dim: int, comb_axis: int, num_combinations: int, num_classes: int):
        super().__init__()
        self.visual_embedding = VisualEmbedding(embedding_dim)
        self.position_encoding = PositionalEncoding2D(embedding_dim)

        attention_stack = [
            nn.LayerNorm(embedding_dim),
            MultiHeadSelfAttention(embedding_dim, num_heads=8),
            nn.LayerNorm(embedding_dim),
        ]
        self.self_attention = nn.Sequential(*attention_stack)

        self.hidden_size = embedding_dim
        half_hidden = self.hidden_size // 2
        self.semantic_embedding = nn.Sequential(
            nn.Linear(self.hidden_size, half_hidden),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(half_hidden, comb_axis),
            nn.Sigmoid(),
        )
        self.converter = nn.Linear(comb_axis, num_combinations)

    def forward(self, x):
        # Feature map with positional information: (batch_size, embedding_dim, H, W)
        feature_map = self.visual_embedding(x)
        feature_map = feature_map + self.position_encoding(feature_map)

        # Flatten the spatial grid into a token sequence: (batch_size, H * W, embedding_dim)
        batch_size, embedding_dim, _, _ = feature_map.size()
        tokens = feature_map.view(batch_size, embedding_dim, -1).permute(0, 2, 1)

        # Attention stack with a residual connection around it
        tokens = self.self_attention(tokens) + tokens

        # Global max pooling over the sequence axis: (batch_size, embedding_dim)
        pooled = tokens.max(dim=1).values

        # Bottleneck and final projection: (batch_size, num_combinations)
        return self.converter(self.semantic_embedding(pooled))

In [None]:
# Channel size of the visual embedding. The original note suggested 8~16
# (about log(labels)); 24 is used here and is divisible by the 8 attention heads.
EMBEDDING_DIM = 24

MODEL_PARAMS = {
    "embedding_dim": EMBEDDING_DIM,
    "comb_axis": COMBINATION_AXIS,
    "num_combinations": label_transformer.num_combinations,
    "num_classes": CLASS_LABELS,
}

# Build the model and move it to the selected device
model_id = "visual_embedding_attention"
model = ImageClassifier(**MODEL_PARAMS).to(device)

# Optimisation hyper-parameters
LEARNING_RATE = 0.0001
EPOCH = 400

criterion = nn.CrossEntropyLoss()
# criterion = nn.MSELoss()
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)
# One-cycle schedule sized for EPOCH epochs of len(train_loader) steps each;
# it is stepped once per batch in the training loop.
lr_scheduler = optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=LEARNING_RATE,
    epochs=EPOCH,
    steps_per_epoch=len(train_loader),
)

# Let WandB hook into the model (log="all", every 10 steps)
wandb.watch(model, criterion, log="all", log_freq=10)

# 모델 저장 및 불러오는 함수 정의
def save_checkpoint(epoch, model, optimizer, loss, PATH):
    """Write a training checkpoint to ``PATH``.

    The checkpoint is a dict with the keys ``'epoch'``, ``'model_state_dict'``,
    ``'optimizer_state_dict'`` and ``'loss'``.

    When ``PATH`` is a filesystem path, the data is first written to
    ``<PATH>.tmp`` and then moved over ``PATH``. An interruption during the
    write therefore leaves the previous checkpoint intact instead of a
    truncated file. Any other ``PATH`` (e.g. a file-like object) is handed to
    ``torch.save`` directly.

    Args:
        epoch: Epoch index stored in the checkpoint.
        model: Module whose ``state_dict()`` is stored.
        optimizer: Optimizer whose ``state_dict()`` is stored.
        loss: Loss value stored in the checkpoint.
        PATH: Destination of the checkpoint.
    """
    import os  # local import: the notebook only imports selected names from os

    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss,
    }

    if isinstance(PATH, (str, os.PathLike)):
        tmp_path = f"{os.fspath(PATH)}.tmp"
        try:
            torch.save(checkpoint, tmp_path)
            os.replace(tmp_path, PATH)  # replaces an existing checkpoint in one step
        finally:
            # Only present when the save or the move failed.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
    else:
        torch.save(checkpoint, PATH)

    print(" Model saved.")

def load_checkpoint(PATH, model, optimizer):
    """Restore model and optimizer state from a checkpoint file, if it exists.

    The checkpoint is expected to hold the keys ``'epoch'``,
    ``'model_state_dict'``, ``'optimizer_state_dict'`` and ``'loss'``.
    Tensors are mapped onto the device of the model's first parameter (CPU for
    a model without parameters), so a checkpoint written on one device can be
    loaded on a machine where that device is not available.

    Args:
        PATH: Location of the checkpoint file.
        model: Module that receives the stored model state.
        optimizer: Optimizer that receives the stored optimizer state.

    Returns:
        ``(start_epoch, loss)`` where ``start_epoch`` is the stored epoch plus
        one, or ``(0, None)`` when ``PATH`` is not an existing file.
    """
    if not path.isfile(PATH):
        print(f"체크포인트 '{path.basename(PATH)}'를 찾을 수 없습니다. 새로 훈련을 시작합니다.")
        return 0, None

    # Load straight onto the device the model lives on, not the one recorded
    # in the file.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")
    checkpoint = torch.load(PATH, map_location=target_device)

    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    start_epoch = checkpoint['epoch'] + 1  # continue with the epoch after the saved one
    loss = checkpoint['loss']
    print(f"체크포인트 '{path.basename(PATH)}'에서 모델 로드 완료 (시작 에포크: {start_epoch})")
    return start_epoch, loss

# Training loop
train_length, valid_length = map(len, (train_loader, valid_loader))

PATH = path.join('checkpoints', f"{model_id}_checkpoint.pt.tar")  # checkpoint file of this model
# Create the checkpoint folder when it does not exist yet
makedirs('checkpoints', exist_ok=True)
save_cycle = 5  # a checkpoint is written every save_cycle epochs

# Resume from the checkpoint when one exists (start_epoch is 0 otherwise)
start_epoch, _ = load_checkpoint(PATH, model, optimizer)

# The checkpoint stores no scheduler state. Without this, a resumed run would
# restart the one-cycle schedule from step 0. The scheduler is stepped once per
# batch, so start_epoch * train_length steps were already taken; the learning
# rate restored with the optimizer state belongs to exactly that step.
if start_epoch > 0:
    lr_scheduler.last_epoch = start_epoch * train_length

# Number of validation samples; never 0 so the averages below stay defined.
num_valid_samples = max(len(valid_dataset), 1)

epochs = tqdm(range(start_epoch, EPOCH), desc="Running Epochs")
with (tqdm(total=train_length, desc="Training") as train_progress,
      tqdm(total=valid_length, desc="Validation") as valid_progress):  # Set up Progress Bars

    for epoch in epochs:
        train_progress.reset(total=train_length)
        valid_progress.reset(total=valid_length)

        # Training
        model.train()
        for i, (inputs, targets) in enumerate(train_loader):
            optimizer.zero_grad()

            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)

            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()  # Update Learning Rate

            train_progress.update(1)
            #if i != train_length-1: wandb.log({'Loss': loss.item()})
            print(f"\rEpoch [{epoch+1:2}/{EPOCH}], Step [{i+1:2}/{train_length}], Loss: {loss.item():.6f}", end="")

        # Running totals over the validation set, kept as plain Python numbers
        val_loss_sum, val_correct = 0.0, 0

        # Validation
        model.eval()
        with torch.no_grad():
            for inputs, targets in valid_loader:
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = model(inputs)

                # Weight each batch by its size so the smaller last batch does
                # not count as much as a full one.
                val_loss_sum += criterion(outputs, targets).item() * targets.size(0)
                val_correct += (outputs.argmax(dim=1) == targets).sum().item()
                valid_progress.update(1)

        val_loss = val_loss_sum / num_valid_samples
        val_acc = val_correct / num_valid_samples

        #wandb.log({'Loss': loss.item(), 'Val Acc': val_acc, 'Val Loss': val_loss})
        print(f"\rEpoch [{epoch+1:2}/{EPOCH}], Step [{train_length}/{train_length}], Loss: {loss.item():.6f}, Valid Acc: {val_acc:.6%}, Valid Loss: {val_loss:.6f}", end="\n" if (epoch+1) % 5 == 0 or (epoch+1) == EPOCH else "")

        # Write a checkpoint every save_cycle epochs
        if (epoch + 1) % save_cycle == 0:
            save_checkpoint(epoch, model, optimizer, loss.item(), PATH)



체크포인트 'visual_embedding_attention_checkpoint.pt.tar'를 찾을 수 없습니다. 새로 훈련을 시작합니다.


Running Epochs:   0%|          | 0/400 [00:00<?, ?it/s]

Training:   0%|          | 0/25 [00:00<?, ?it/s]

Validation:   0%|          | 0/7 [00:00<?, ?it/s]

Epoch [ 4/400], Step [20/25], Loss: 9.178624, Valid Acc: 0.000000%, Valid Loss: 9.114741

In [None]:
# Create the output folder if needed. makedirs(..., exist_ok=True) has no
# check-then-create gap and matches how the checkpoint folder is created.
makedirs(path.join(".", "models"), exist_ok=True)

# Model Save: only the model weights (state_dict) are written.
save_path = path.join(".", "models", "visual_embedding.pt")
torch.save(model.state_dict(), save_path)
print(f"Model saved to {save_path}")