In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from trainer import EarlyStopping,model_delete,mytrainsform, AugmentedDataset, dir_clear
import torch, torchvision
import torch.nn.functional as F
from torch import nn, optim
from torchvision import transforms, datasets
import os
from tqdm import trange

### 세팅 

In [2]:
# Computational device
# Device will be set to GPU if it is available.(you should install valid Pytorch version with CUDA. Otherwise, it will be computed using CPU)
from model import MODEL_CLASS2
# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise CPU.
USE_CUDA = torch.cuda.is_available()
DEVICE = torch.device("cuda" if USE_CUDA else "cpu")

# Root directory for this run's outputs (logs, checkpoints, CSV summaries).
DIR = './results/temp_2'
# exist_ok=True makes the cell idempotent and avoids the check-then-create
# race of `if not os.path.exists(...): os.makedirs(...)`.
os.makedirs(DIR, exist_ok=True)
print("Using Device:", DEVICE)

Using Device: cuda


In [3]:
# Fashion-MNIST: build the train split first, then the test split.
# Both are downloaded into ./.data/ on demand and converted with ToTensor().
trainset, testset = (
    datasets.FashionMNIST(
        root='./.data/',
        train=use_train_split,
        download=True,
        transform=transforms.ToTensor(),
    )
    for use_train_split in (True, False)
)

In [4]:
# Class label kept in the training set; every other class is dropped from it.
SELECT_NORMAL = 2

# Compute the selection mask once and apply it to both images and labels.
normal_mask = trainset.targets == SELECT_NORMAL
trainset.data, trainset.targets = trainset.data[normal_mask], trainset.targets[normal_mask]

# Classes that make up the actual test set.
test_label = [2, 4, 6]
actual_testdata = torch.isin(testset.targets, torch.tensor(test_label))
testset.data, testset.targets = testset.data[actual_testdata], testset.targets[actual_testdata]

# The test loader yields one sample per batch, in dataset order.
test_loader = torch.utils.data.DataLoader(
    dataset=testset,
    batch_size=1,
    shuffle=False,
    num_workers=2,
)

train_data_size, test_data_size = len(trainset), len(testset)

print("Train data size:", train_data_size, "Test data size:", test_data_size)

Train data size: 6000 Test data size: 3000


#### 데이터 증강 기법 사용 class 

In [5]:
# Split the filtered training set into train/val first, then augment the
# train part only; the validation part is left un-augmented.
BATCH_SIZE = 1024
n_val = int(len(trainset) * 0.2)
n_train = len(trainset) - n_val

# Fixed-seed generator so the train/val split is reproducible across runs.
split_generator = torch.Generator().manual_seed(2025)
augset, valset = torch.utils.data.random_split(trainset, [n_train, n_val], generator=split_generator)

# augmentation_factor=5 is the setting used while searching for a baseline model.
augset = AugmentedDataset(augset, transform=mytrainsform, augmentation_factor=5)

# Train batches are shuffled; validation batches keep dataset order.
train_loader, val_loader = (
    torch.utils.data.DataLoader(
        dataset=split_dataset,
        batch_size=BATCH_SIZE,
        shuffle=shuffle_batches,
        num_workers=0,
    )
    for split_dataset, shuffle_batches in ((augset, True), (valset, False))
)

# Data size check.
print("Train data size:", len(augset),"Val data size:", len(valset),"Test data size:", len(testset))

Train data size: 24000 Val data size: 1200 Test data size: 3000


### 모델 및 loss 불러오기 

In [6]:
# Candidate models for the grid search, taken from model.MODEL_CLASS2.
# NOTE(review): this binds the same object rather than copying it — confirm
# that nothing downstream mutates it in place if MODEL_CLASS2 is reused.
model_classes  = MODEL_CLASS2

In [7]:
# Loss functions: the recommended combinations, grouped by loss family.
from loss.losses import FlexibleLoss, FlexibleDiffusionLoss

# Outer key = loss family, inner key = display name of the loss.
loss_functions = {
    "reconstruction": {
        "MSE": FlexibleLoss(mode="mse"),
    },
    "diffusion": {
        "MSE": FlexibleDiffusionLoss(mode="mse"),
        "MSE+Gradient": FlexibleDiffusionLoss(mode="mse+gradient", beta=1.0, alpha=0.1),
    },
}

# Per-family aliases; these are the same dict objects stored in loss_functions.
reconstruction_loss = loss_functions["reconstruction"]
diffusion_loss = loss_functions["diffusion"]

print("Available reconstruction loss functions:", reconstruction_loss.keys())
print("Available diffusion loss functions:", diffusion_loss.keys())

Available reconstruction loss functions: dict_keys(['MSE'])
Available diffusion loss functions: dict_keys(['MSE', 'MSE+Gradient'])


In [8]:
# Filter the model / loss-function combinations with model_delete before training.
# NOTE(review): presumably this drops combinations that already have results
# under DIR — confirm against trainer.model_delete. The cell rebinds
# model_classes, so re-running it feeds the already-filtered value back in.
model_classes = model_delete(model_classes, loss_functions, DIR)
print("Remaining models:", model_classes.keys())

✅ 0 models and loss functions will be deleted: []
✅ 2 models and loss functions will be kept: ['SlimDeepCAE_Bottleneck32_Dropout', 'SlimDeepCAE_Combo']
Remaining models: dict_keys(['SlimDeepCAE_Bottleneck32_Dropout', 'SlimDeepCAE_Combo'])


### Trainer 실행 

In [9]:
# Sanity check: pull one batch from the train and test loaders and report
# the image / label tensor shapes.
train_images, train_labels = next(iter(train_loader))
test_images, test_labels = next(iter(test_loader))

for caption, batch_tensor in (
    ("Train batch image shape:", train_images),
    ("Train batch label shape:", train_labels),
    ("Test batch image shape:", test_images),
    ("Test batch label shape:", test_labels),
):
    print(caption, batch_tensor.shape)

Train batch image shape: torch.Size([1024, 1, 28, 28])
Train batch label shape: torch.Size([1024])
Test batch image shape: torch.Size([1, 1, 28, 28])
Test batch label shape: torch.Size([1])


In [10]:
# Clear the previous run's artifacts under DIR, in this order:
# logs, checkpoints, evaluation results.
for output_subdir in ("/logs", "/checkpoints", "/eval_results"):
    dir_clear(DIR + output_subdir)

✅ ./results/temp_2/logs cleared.
✅ ./results/temp_2/checkpoints cleared.
✅ ./results/temp_2/eval_results cleared.


In [11]:
# Grid-search training over every remaining model / loss combination.
# NOTE(review): the recorded run of this cell aborted with
# "AttributeError: 'GridSearchTrainerFP16' object has no attribute '_step'";
# that has to be fixed in the trainer module before the results can be trusted.
EPOCHS = 500    # passed to the trainer as n_epochs
PATIENCE = 40   # passed to the trainer as patience
from trainer import GridSearchTrainerFP16

# Collect the trainer configuration in one place, then unpack it.
trainer_kwargs = dict(
    models=model_classes,
    criterions_dict=loss_functions,
    train_loader=train_loader,
    val_loader=val_loader,
    n_epochs=EPOCHS,
    patience=PATIENCE,
    save_dir=f'{DIR}/checkpoints',
    verbose=False,
    device=DEVICE,
    # 1e-3 base rate (the original comment calls it the AdamW default),
    # scaled by BATCH_SIZE / 256.
    lr=1e-3 * BATCH_SIZE / 256,
    log_dir=f'{DIR}/logs',
)
trainer = GridSearchTrainerFP16(**trainer_kwargs)
results = trainer.run()

# Tabulate the results and save them to a CSV file next to the other run outputs.
results_df = pd.DataFrame(results)
results_df.to_csv(f'{DIR}/training.csv', index=False)

Total Models: 2
Reconstruction Losses: 1 Diffusion Losses: 2
Total Combinations: 2
▶ Training [SlimDeepCAE_Bottleneck32_Dropout] with [MSE] (FP16)


SlimDeepCAE_Bottleneck32_Dropout | MSE (FP16):   0%|          | 0/500 [00:08<?, ?it/s]


AttributeError: 'GridSearchTrainerFP16' object has no attribute '_step'

In [None]:
# Display the training summary table built by the training cell.
# NOTE(review): the output recorded under this cell lists models that are not
# in the current grid, and the training cell's recorded run raised an
# AttributeError, so the saved table is likely stale — re-run to refresh it.
results_df

Unnamed: 0,model,loss,best_val_loss,best_epoch,gpu_peak_usage(MB),save_path
0,SlimDeepCAE_Bottleneck16_Dropout,MSE,0.02114,41,70.255371,./results/temp_2/checkpoints/SlimDeepCAE_Bottl...
1,SlimDeepCAE_Bottleneck8_Dropout,MSE,0.042504,19,63.175781,./results/temp_2/checkpoints/SlimDeepCAE_Bottl...


In [None]:
# Print the shell command that points TensorBoard at this run's log directory.
print(f"tensorboard --logdir {DIR}/logs") # e.g. tensorboard --logdir ./results/temp_2/logs

tensorboard --logdir ./results/temp_2/logs
