### torchvision의 models 모듈로 pretrained 모델 생성

In [3]:
import torch
import torch.nn as nn
from torchvision import models

def create_resnet_model(model_name, num_classes=10, weights='DEFAULT'):
    """Create a torchvision ResNet-family model whose final layer matches num_classes.

    Args:
        model_name: Either 'resnet50' or 'resnext50_32x4d'.
        num_classes: Number of output features of the replacement ``fc`` layer.
        weights: Forwarded unchanged as the ``weights`` argument of the torchvision builder.

    Returns:
        The constructed model with its ``fc`` attribute replaced by a new ``nn.Linear``.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    if model_name == 'resnet50':
        model = models.resnet50(weights=weights)
    elif model_name == 'resnext50_32x4d':
        model = models.resnext50_32x4d(weights=weights)
    else:
        # Fail early with a clear message instead of an AttributeError on None below.
        raise ValueError(
            f"Unsupported model_name: {model_name!r}. "
            "Expected 'resnet50' or 'resnext50_32x4d'."
        )

    # Keep the backbone but swap the classification head for one sized to this task.
    num_in_features = model.fc.in_features
    model.fc = nn.Linear(in_features=num_in_features, out_features=num_classes)

    return model
    
model = create_resnet_model('resnet50', num_classes=2, weights='DEFAULT') # model_name options: resnet50, resnext50_32x4d

Downloading: "https://download.pytorch.org/models/resnet50-11ad3fa6.pth" to /root/.cache/torch/hub/checkpoints/resnet50-11ad3fa6.pth
100%|██████████| 97.8M/97.8M [00:01<00:00, 78.0MB/s]


### 개와 고양이 데이터 세트 다운로드 후 메타 데이터 생성

In [4]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from sklearn.model_selection import train_test_split

def create_cnd_meta_df(file_dir):
    """Scan file_dir recursively and build a metadata DataFrame of the .jpg files found.

    Args:
        file_dir: Root directory to walk.

    Returns:
        A DataFrame with one row per image and the columns
        'path' (file path), 'dataset' ('train', 'test' or 'N/A'),
        'label' ('DOG', 'CAT' or 'N/A') and 'target' (0 for DOG, 1 for CAT,
        NaN when the label is 'N/A').
    """
    paths = []           # image file paths
    dataset_gubuns = []  # train / test split marker per file
    label_gubuns = []    # dog / cat label per file
    # os.walk() visits every sub-directory below file_dir.
    for dirname, _, filenames in os.walk(file_dir):
        for filename in filenames:
            # The directories also hold non-image files; accept only names that END in
            # .jpg (any letter case) so that e.g. 'x.jpg.txt' is not picked up.
            if not filename.lower().endswith('.jpg'):
                continue

            file_path = os.path.join(dirname, filename)
            paths.append(file_path)
            # Match against a '/'-separated form so the checks do not depend on os.sep.
            match_path = file_path.replace(os.sep, '/')

            # A path containing /training_set/ or /test_set/ decides the split.
            if '/training_set/' in match_path:
                dataset_gubuns.append('train')
            elif '/test_set/' in match_path:
                dataset_gubuns.append('test')
            else:
                dataset_gubuns.append('N/A')

            # A path containing 'dogs' is a dog image; one containing 'cats' is a cat image.
            if 'dogs' in match_path:
                label_gubuns.append('DOG')
            elif 'cats' in match_path:
                label_gubuns.append('CAT')
            else:
                label_gubuns.append('N/A')

    # Assemble the metadata DataFrame.
    data_df = pd.DataFrame({'path': paths,
                            'dataset': dataset_gubuns,
                            'label': label_gubuns})
    # Encode the label as the integer target 0 / 1.
    label_mapping = {'DOG': 0, 'CAT': 1}
    data_df['target'] = data_df['label'].map(label_mapping)

    return data_df

### 학습, 검증, 테스트 데이터로 나누고 Dataset 및 DataLoader 생성
* 학습 시간을 줄이기 위해서 학습과 검증 데이터를 줄임

In [12]:
!ls /kaggle/input

test_set  training_set


In [5]:
# Build the image metadata DataFrame (path, dataset, label, target) from /kaggle/input/.
data_df = create_cnd_meta_df(file_dir='/kaggle/input/')
print(data_df.shape)

(10028, 4)


In [6]:
# Split the full metadata into the train and test DataFrames.
train_df = data_df[data_df['dataset']=='train']
test_df = data_df[data_df['dataset']=='test']

# Keep a stratified 50% of train_df as train_df_temp; the other half is discarded.
train_df_temp, _ = train_test_split(train_df, test_size=0.5, stratify=train_df['target'], random_state=2025)

# Split train_df_temp again, 50% each, into tr_df (training) and val_df (validation).
tr_df, val_df = train_test_split(train_df_temp, test_size=0.5, stratify=train_df_temp['target'], random_state=2025)

print(data_df.shape, train_df.shape, tr_df.shape, val_df.shape)

(10028, 4) (8005, 4) (2001, 4) (2001, 4)


#### Dataset 및 학습/검증용 DataLoader 생성

In [7]:
import torch
from torch.utils.data import Dataset

from PIL import Image

class CnD_Dataset(Dataset):
    """Dataset that loads an image from disk and pairs it with an optional target.

    Args:
        image_paths: Sequence of image file paths.
        targets: Optional sequence of target values aligned with ``image_paths``.
        transform: Optional callable applied to the loaded PIL image. When omitted,
            the RGB PIL image itself is returned.
    """

    def __init__(self, image_paths, targets=None, transform=None):
        self.image_paths = image_paths
        self.targets = targets
        self.transform = transform

    def __len__(self):
        """Return the number of images."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return ``(image, target)`` for idx; target is None when no targets were given."""
        # Image.open is lazy, so convert() is what actually reads the pixels; doing it
        # inside the with-block lets the file handle be closed immediately afterwards.
        # Converting to RGB guarantees 3 channels for grayscale / RGBA / CMYK files.
        with Image.open(self.image_paths[idx]) as pil_image:
            image = pil_image.convert('RGB')

        # transform defaults to None, so only call it when one was supplied.
        if self.transform is not None:
            image = self.transform(image)

        # Test data may come without targets.
        # NOTE(review): a None target cannot be batched by the default DataLoader
        # collate function; callers without targets need their own collate_fn.
        if self.targets is None:
            return image, None

        # Convert the individual target value to a tensor.
        return image, torch.tensor(self.targets[idx])

In [8]:
from torch.utils.data import DataLoader
from torchvision import transforms as T

BATCH_SIZE = 16
IMG_SIZE = 224
IMG_MEANS = [0.485, 0.456, 0.406] # per-channel mean of the ImageNet dataset images
IMG_STD = [0.229, 0.224, 0.225] # per-channel standard deviation of the ImageNet dataset images

def create_tr_val_loader(tr_df, val_df, transform):
    """Build the training and validation DataLoaders from two metadata DataFrames.

    Both DataFrames must provide 'path' and 'target' columns, and the same
    transform is applied to both datasets. The training loader shuffles and uses
    BATCH_SIZE; the validation loader keeps order and uses twice that batch size.

    Returns:
        A ``(tr_loader, val_loader)`` tuple.
    """
    def _to_dataset(meta_df):
        # One CnD_Dataset per DataFrame, fed from its path/target columns.
        return CnD_Dataset(image_paths=meta_df['path'].to_list(),
                           targets=meta_df['target'].to_list(), transform=transform)

    train_ds = _to_dataset(tr_df)
    valid_ds = _to_dataset(val_df)

    # Worker / memory-pinning options shared by both loaders.
    shared_opts = dict(num_workers=4, pin_memory=True)
    train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, **shared_opts)
    valid_dl = DataLoader(valid_ds, batch_size=2*BATCH_SIZE, shuffle=False, **shared_opts)

    return train_dl, valid_dl

# Preprocessing shared by the training and validation data: resize to IMG_SIZE x IMG_SIZE,
# convert to a tensor, then normalize with the ImageNet channel statistics.
transform_01 = T.Compose([
            T.Resize(size=(IMG_SIZE, IMG_SIZE)),
            T.ToTensor(), T.Normalize(mean=IMG_MEANS, std=IMG_STD)
])

tr_loader, val_loader = create_tr_val_loader(tr_df=tr_df, val_df=val_df, transform=transform_01)

### Trainer 클래스 모듈화
* 실습 github에서 utils.py를 다운로드 받고, 이를 kaggle 파일시스템에 저장한 뒤 해당 파일을 import하여 Trainer 클래스 로딩
* Predictor, ModelCheckpoint, EarlyStopping 모두 utils.py에 클래스로 생성됨. 

In [17]:
!pwd

/kaggle/working


In [23]:
# /kaggle/working/modular/v1 디렉토리에 utils.py 파일 다운로드
!rm -rf ./modular/v1
!mkdir -p ./modular/v1
!wget -O ./modular/v1/utils.py https://raw.githubusercontent.com/chulminkw/CNN_PG_Torch/main/modular/v1/utils.py?raw=true
!ls ./modular/v1

--2025-01-31 04:42:39--  https://raw.githubusercontent.com/chulminkw/CNN_PG_Torch/main/modular/v1/utils.py?raw=true
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13594 (13K) [text/plain]
Saving to: ‘./modular/v1/utils.py’


2025-01-31 04:42:39 (23.3 MB/s) - ‘./modular/v1/utils.py’ saved [13594/13594]

utils.py


In [24]:
!cat /kaggle/working/modular/v1/utils.py

import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
import os

class Trainer:
    def __init__(self, model, loss_fn, optimizer, train_loader, val_loader, scheduler=None, 
                 callbacks=None, device=None):
        self.model = model.to(device)
        self.loss_fn = loss_fn
        self.optimizer = optimizer
        self.train_loader = train_loader
        self.val_loader = val_loader
        # scheduler 추가
        self.scheduler = scheduler
        self.device = device
        # 현재 learning rate 변수 추가
        self.current_lr = self.optimizer.param_groups[0]['lr']
        #checkpoint와 early stopping 클래스들을 list로 받음. 
        self.callbacks = callbacks
        
    def train_epoch(self, epoch):
        self.model.train()

        # running 평균 loss 계산.
        accu_loss = 0.0
        running_avg_loss = 0.0
        # 정확도, 정확도 계산을 위한 전체 건수 및 누적 정확건수
        num_total = 0.0
        accu_num_correct = 0.0
        accuracy = 0.0
        # tqdm으로

In [25]:
import sys

# The working directory must be added to the system path so the downloaded package is importable.
sys.path.append('/kaggle/working')

# Make sure the import below actually succeeds.
from modular.v1.utils import Trainer, ModelCheckpoint, EarlyStopping

checkpoint_cb = ModelCheckpoint('checkpoints', monitor='val_loss', mode='min', save_interval=1, verbose=1)
earlystop_cb = EarlyStopping(monitor='val_loss', mode='min', early_patience=2, verbose=1)

#### Trainer 클래스를 utils.py에서 import하여 생성 및 모델 학습

In [19]:
import torch
import torch.nn as nn
from torch.optim import SGD, Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau
# 다운로드된 utils.py에서 class import
from modular.v1.utils import Trainer, ModelCheckpoint, EarlyStopping

model = create_resnet_model('resnet50', num_classes=2)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
optimizer = Adam(model.parameters(), lr=0.001) # alternatives noted by the author: model.fc.parameters(), weight_decay=0.9
loss_fn = nn.CrossEntropyLoss()
# Halve the learning rate when the monitored value has not improved for 3 epochs (patience=3).
scheduler = ReduceLROnPlateau(
            optimizer=optimizer, mode='min', factor=0.5, patience=3, threshold=0.01, min_lr=0.00001)

# Both callbacks monitor validation accuracy (higher is better).
checkpoint_cb = ModelCheckpoint('checkpoints', monitor='val_acc', mode='max', save_interval=1, verbose=1)
earlystop_cb = EarlyStopping(monitor='val_acc', mode='max', early_patience=5, verbose=1)
callbacks = [checkpoint_cb, earlystop_cb]
trainer = Trainer(model=model, loss_fn=loss_fn, optimizer=optimizer,
                  train_loader=tr_loader, val_loader=val_loader, scheduler=scheduler, 
                  callbacks=callbacks,
                  device=device)
# Train and evaluate.
history = trainer.fit(30)

Epoch 1 [Training..]: 100%|██████████| 126/126 [00:13<00:00,  9.44it/s, Loss=0.267, Accuracy=0.901]
Epoch 1 [Validating]: 100%|██████████| 63/63 [00:05<00:00, 11.35it/s, Loss=1.39, Accuracy=0.866] 


Epoch 1/30, Train Loss: 0.2673 Train Accuracy: 0.9005 , Val Loss: 1.3874 Val Accuracy: 0.8661 , Current lr:0.001000
Saved model checkpoint at checkpoints/checkpoint_epoch_1_val_acc_0.8661.pt


Epoch 2 [Training..]: 100%|██████████| 126/126 [00:11<00:00, 10.62it/s, Loss=0.276, Accuracy=0.884]
Epoch 2 [Validating]: 100%|██████████| 63/63 [00:04<00:00, 14.28it/s, Loss=0.228, Accuracy=0.913]


Epoch 2/30, Train Loss: 0.2764 Train Accuracy: 0.8836 , Val Loss: 0.2277 Val Accuracy: 0.9130 , Current lr:0.001000
Saved model checkpoint at checkpoints/checkpoint_epoch_2_val_acc_0.9130.pt


Epoch 3 [Training..]: 100%|██████████| 126/126 [00:11<00:00, 10.58it/s, Loss=0.182, Accuracy=0.929]
Epoch 3 [Validating]: 100%|██████████| 63/63 [00:04<00:00, 13.27it/s, Loss=0.21, Accuracy=0.918] 


Epoch 3/30, Train Loss: 0.1824 Train Accuracy: 0.9290 , Val Loss: 0.2104 Val Accuracy: 0.9175 , Current lr:0.001000
Saved model checkpoint at checkpoints/checkpoint_epoch_3_val_acc_0.9175.pt


Epoch 4 [Training..]: 100%|██████████| 126/126 [00:12<00:00, 10.47it/s, Loss=0.16, Accuracy=0.944] 
Epoch 4 [Validating]: 100%|██████████| 63/63 [00:04<00:00, 14.94it/s, Loss=0.191, Accuracy=0.922]


Epoch 4/30, Train Loss: 0.1600 Train Accuracy: 0.9435 , Val Loss: 0.1905 Val Accuracy: 0.9215 , Current lr:0.001000
Saved model checkpoint at checkpoints/checkpoint_epoch_4_val_acc_0.9215.pt


Epoch 5 [Training..]: 100%|██████████| 126/126 [00:11<00:00, 10.63it/s, Loss=0.147, Accuracy=0.951]
Epoch 5 [Validating]: 100%|██████████| 63/63 [00:04<00:00, 14.88it/s, Loss=0.683, Accuracy=0.811]


Epoch 5/30, Train Loss: 0.1471 Train Accuracy: 0.9510 , Val Loss: 0.6831 Val Accuracy: 0.8111 , Current lr:0.001000
EarlyStopping counter: 1/5


Epoch 6 [Training..]: 100%|██████████| 126/126 [00:11<00:00, 10.62it/s, Loss=0.259, Accuracy=0.901]
Epoch 6 [Validating]: 100%|██████████| 63/63 [00:04<00:00, 14.96it/s, Loss=0.265, Accuracy=0.878]


Epoch 6/30, Train Loss: 0.2595 Train Accuracy: 0.9010 , Val Loss: 0.2651 Val Accuracy: 0.8781 , Current lr:0.001000
EarlyStopping counter: 2/5


Epoch 7 [Training..]: 100%|██████████| 126/126 [00:11<00:00, 10.63it/s, Loss=0.173, Accuracy=0.936]
Epoch 7 [Validating]: 100%|██████████| 63/63 [00:04<00:00, 14.42it/s, Loss=0.228, Accuracy=0.917]


Epoch 7/30, Train Loss: 0.1733 Train Accuracy: 0.9355 , Val Loss: 0.2277 Val Accuracy: 0.9165 , Current lr:0.001000
EarlyStopping counter: 3/5


Epoch 8 [Training..]: 100%|██████████| 126/126 [00:11<00:00, 10.64it/s, Loss=0.0725, Accuracy=0.978]
Epoch 8 [Validating]: 100%|██████████| 63/63 [00:04<00:00, 14.97it/s, Loss=0.266, Accuracy=0.918]


Epoch 8/30, Train Loss: 0.0725 Train Accuracy: 0.9775 , Val Loss: 0.2656 Val Accuracy: 0.9180 , Current lr:0.000500
EarlyStopping counter: 4/5


Epoch 9 [Training..]: 100%|██████████| 126/126 [00:11<00:00, 10.63it/s, Loss=0.0477, Accuracy=0.984]
Epoch 9 [Validating]: 100%|██████████| 63/63 [00:04<00:00, 14.81it/s, Loss=0.159, Accuracy=0.944]


Epoch 9/30, Train Loss: 0.0477 Train Accuracy: 0.9840 , Val Loss: 0.1595 Val Accuracy: 0.9440 , Current lr:0.000500
Saved model checkpoint at checkpoints/checkpoint_epoch_9_val_acc_0.9440.pt


Epoch 10 [Training..]: 100%|██████████| 126/126 [00:11<00:00, 10.63it/s, Loss=0.0142, Accuracy=0.998]
Epoch 10 [Validating]: 100%|██████████| 63/63 [00:04<00:00, 14.45it/s, Loss=0.15, Accuracy=0.951] 


Epoch 10/30, Train Loss: 0.0142 Train Accuracy: 0.9980 , Val Loss: 0.1502 Val Accuracy: 0.9505 , Current lr:0.000500
Saved model checkpoint at checkpoints/checkpoint_epoch_10_val_acc_0.9505.pt


Epoch 11 [Training..]: 100%|██████████| 126/126 [00:11<00:00, 10.64it/s, Loss=0.0203, Accuracy=0.995]
Epoch 11 [Validating]: 100%|██████████| 63/63 [00:04<00:00, 14.87it/s, Loss=0.121, Accuracy=0.955]


Epoch 11/30, Train Loss: 0.0203 Train Accuracy: 0.9945 , Val Loss: 0.1212 Val Accuracy: 0.9550 , Current lr:0.000500
Saved model checkpoint at checkpoints/checkpoint_epoch_11_val_acc_0.9550.pt


Epoch 12 [Training..]: 100%|██████████| 126/126 [00:11<00:00, 10.55it/s, Loss=0.00698, Accuracy=0.999]
Epoch 12 [Validating]: 100%|██████████| 63/63 [00:04<00:00, 14.72it/s, Loss=0.139, Accuracy=0.953]


Epoch 12/30, Train Loss: 0.0070 Train Accuracy: 0.9985 , Val Loss: 0.1388 Val Accuracy: 0.9525 , Current lr:0.000500
EarlyStopping counter: 1/5


Epoch 13 [Training..]: 100%|██████████| 126/126 [00:11<00:00, 10.64it/s, Loss=0.00787, Accuracy=0.999]
Epoch 13 [Validating]: 100%|██████████| 63/63 [00:04<00:00, 14.76it/s, Loss=0.148, Accuracy=0.955]


Epoch 13/30, Train Loss: 0.0079 Train Accuracy: 0.9985 , Val Loss: 0.1480 Val Accuracy: 0.9545 , Current lr:0.000500
EarlyStopping counter: 2/5


Epoch 14 [Training..]: 100%|██████████| 126/126 [00:11<00:00, 10.54it/s, Loss=0.0346, Accuracy=0.989]
Epoch 14 [Validating]: 100%|██████████| 63/63 [00:04<00:00, 14.76it/s, Loss=0.157, Accuracy=0.946]


Epoch 14/30, Train Loss: 0.0346 Train Accuracy: 0.9890 , Val Loss: 0.1567 Val Accuracy: 0.9460 , Current lr:0.000500
EarlyStopping counter: 3/5


Epoch 15 [Training..]: 100%|██████████| 126/126 [00:11<00:00, 10.65it/s, Loss=0.0207, Accuracy=0.994]
Epoch 15 [Validating]: 100%|██████████| 63/63 [00:04<00:00, 13.71it/s, Loss=0.23, Accuracy=0.942] 


Epoch 15/30, Train Loss: 0.0207 Train Accuracy: 0.9940 , Val Loss: 0.2303 Val Accuracy: 0.9420 , Current lr:0.000250
EarlyStopping counter: 4/5


Epoch 16 [Training..]: 100%|██████████| 126/126 [00:11<00:00, 10.61it/s, Loss=0.00825, Accuracy=0.999]
Epoch 16 [Validating]: 100%|██████████| 63/63 [00:04<00:00, 15.04it/s, Loss=0.17, Accuracy=0.947] 

Epoch 16/30, Train Loss: 0.0082 Train Accuracy: 0.9985 , Val Loss: 0.1696 Val Accuracy: 0.9465 , Current lr:0.000250
EarlyStopping counter: 5/5
Early stopping happens and train stops





#### Predictor 클래스를 import 하여 테스트 데이터 평가 예측


In [20]:
from modular.v1.utils import Predictor

# Image paths and targets of the held-out test split.
test_image_paths = test_df['path'].to_list()
test_targets = test_df['target'].to_list()

# Same preprocessing as used for training/validation (resize, tensor conversion, ImageNet normalization).
IMG_SIZE=224
test_transform = T.Compose([
                        T.Resize(size=(IMG_SIZE, IMG_SIZE)),
                        T.ToTensor(), 
                        T.Normalize(mean=[0.485, 0.456, 0.406], 
                                    std=[0.229, 0.224, 0.225])
])

test_dataset = CnD_Dataset(image_paths=test_image_paths, 
                            targets=test_targets, transform=test_transform)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=4, pin_memory=True)

# Evaluate the model held by the trainer on the test loader.
trained_model = trainer.get_trained_model()

predictor = Predictor(model=trained_model, device=device)
eval_metric = predictor.evaluate(test_loader)
print(f'test dataset evaluation:{eval_metric:.4f}')

[Evaluating]: 100%|██████████| 64/64 [00:05<00:00, 10.97it/s, Accuracy=0.948]

test dataset evaluation:0.9476





### torchmetrics를 활용한 평가
* 모델의 성능을 측정하기 위한 다양한 평가 지표가(정확도, F1 Score, ROC-AUC등) 있지만 pytorch framework은 이들 성능 지표를 위한 라이브러리를 제공하지 않음. 
* torchmetrics(https://lightning.ai/docs/torchmetrics/stable/)는 다양한 유형의 Metric을 라이브러리로 제공하므로 직접 Metric을 작성할 필요가 없음
* 분류, 클러스터링 뿐만 아니라 Object Detection에 관련된 다양한 Metric을 제공.
* metric객체의 주요 메소드
    * update() 메소드는 train/valid/test loader의 loop내에서 호출되어 지표 계산에 필요한 메트릭 내부 변수의 값을 update함(counter나 현재까지 정확한 예측 건수등)
    * compute() 메소드는 메트릭 값을 계산하여 반환.
    * reset() 메소드는 지금까지 update된 메트릭 내부 변수의 값을 초기화

In [None]:
#이미 캐글 커널에 torchmetrics가 설치되어 있음. 
#!pip install torchmetrics

In [10]:
import torchmetrics
print(torchmetrics.__version__)

1.6.1


#### Accuracy Class 활용하기
* 정확도는 torchmetrics.classification.Accuracy로 제공. 이진 분류인 경우는 BinaryAccuracy, 다중 분류인 경우는 MulticlassAccuracy 클래스를 활용할 수도 있지만, 보통은 Accuracy 클래스의 생성 인자(task)로 이들을 구분하여 적용함. MulticlassAccuracy 적용 시에는 average의 default값이 micro가 아니라 macro이므로 이에 유의해야 함.
* torchmetrics의 Metric 클래스(예를 들어 Accuracy)는 device를 입력 tensor의 device와 동일하게 설정해 줘야 함. to(device)로 설정하며, 입력 tensor의 device와 Metric 객체의 device가 다르면 오류 발생.
* update() 메소드의 예측 Tensor 인자로는 argmax()가 적용된 클래스값을 입력하거나, 클래스 개수 차원을 가진 logit 또는 class probability를 입력할 수 있음. 후자의 경우 Accuracy 클래스가 내부적으로 argmax를 적용하여 계산함.

In [14]:
from torch import tensor
from torchmetrics.classification import Accuracy, MulticlassAccuracy

target = tensor([2, 1, 0, 0])
preds = tensor([2, 1, 0, 1])

# Accuracy with task="multiclass" behaves as a multiclass accuracy; its default average is micro.
acc_metric = Accuracy(task="multiclass", num_classes=3) #average='micro'
acc_tensor = acc_metric(preds, target)
print('Accuracy 적용:', acc_tensor)

# The default average of MulticlassAccuracy is macro.
m_acc_metric = MulticlassAccuracy(num_classes=3)
print('MulticlassAccuracy 적용:', m_acc_metric(preds, target).item())

Accuracy 적용: tensor(0.7500)
MulticlassAccuracy 적용: 0.8333333730697632


In [18]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Inputs and the metric object are placed on the same device.
target = tensor([2, 1, 0, 0]).to(device)
preds = tensor([2, 1, 0, 1]).to(device)

accuracy_metric = Accuracy(task="multiclass", num_classes=3).to(device) # author's note: try removing .to(device)
accuracy_metric.update(preds, target)

print(accuracy_metric.compute()) # alternatively: accuracy_metric.compute().item()

tensor(0.7500, device='cuda:0')


### torchmetrics 를 적용한 TrainerWithTM 클래스 생성

In [20]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
import os

class _RunningAccuracy:
    """Fallback accuracy accumulator used when no metric object is supplied.

    Implements only the ``to`` / ``reset`` / ``update`` / ``compute`` calls that
    TrainerWithTM makes on its metric.
    """

    def __init__(self):
        self.reset()

    def to(self, device):
        # The counters are plain Python numbers, so there is nothing to move.
        return self

    def reset(self):
        self.num_correct = 0
        self.num_total = 0

    def update(self, preds, targets):
        # Per-class scores carry one more dimension than the targets; reduce them to class ids.
        if preds.ndim == targets.ndim + 1:
            preds = preds.argmax(-1)
        self.num_correct += (preds == targets).sum().item()
        self.num_total += targets.numel()

    def compute(self):
        # Returned as a tensor so callers can use .item() exactly as with a metric object.
        if self.num_total == 0:
            return torch.tensor(0.0)
        return torch.tensor(self.num_correct / self.num_total)


class TrainerWithTM:
    """Train/validate loop that reports accuracy through a metric object.

    Args:
        model: Model to train; moved to ``device``.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning the batch loss.
        optimizer: Optimizer over the model parameters.
        train_loader: Iterable of ``(inputs, targets)`` training batches.
        val_loader: Iterable of validation batches, or None to skip validation.
        scheduler: Optional learning-rate scheduler. ReduceLROnPlateau is stepped
            with the validation loss; any other scheduler is stepped once per
            training epoch.
        callbacks: Optional list of ModelCheckpoint / EarlyStopping instances.
        metric: Object providing ``to``, ``reset``, ``update(outputs, targets)``
            and ``compute()`` (e.g. a torchmetrics metric). When None, a simple
            built-in accuracy accumulator is used.
        device: Device the model, metric and batches are moved to.
    """

    def __init__(self, model, loss_fn, optimizer, train_loader, val_loader, scheduler=None,
                 callbacks=None, metric=None, device=None):
        self.model = model.to(device)
        self.loss_fn = loss_fn
        self.optimizer = optimizer
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.scheduler = scheduler
        self.device = device
        # Learning rate currently in effect (first parameter group).
        self.current_lr = self.optimizer.param_groups[0]['lr']
        # Checkpoint and early-stopping objects, passed as a list.
        self.callbacks = callbacks
        # metric defaults to None, so fall back to the built-in accumulator rather than
        # failing on None.to(device).
        if metric is None:
            metric = _RunningAccuracy()
        self.metric = metric.to(device)

    def train_epoch(self, epoch):
        """Run one training epoch and return ``(running average loss, metric value)``."""
        self.model.train()

        # Accumulated batch losses and their running average.
        accu_loss = 0.0
        running_avg_loss = 0.0
        # Start the metric from a clean state for this epoch.
        self.metric.reset()

        # tqdm visualizes the progress of the training loop.
        with tqdm(total=len(self.train_loader), desc=f"Epoch {epoch+1} [Training..]", leave=True) as progress_bar:
            for batch_idx, (inputs, targets) in enumerate(self.train_loader):
                # Use self.device here, not a global `device`.
                inputs = inputs.to(self.device)
                targets = targets.to(self.device)

                # Forward pass
                outputs = self.model(inputs)
                loss = self.loss_fn(outputs, targets)

                # Backward pass
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

                # Running average = accumulated batch loss / number of batches so far.
                accu_loss += loss.item()
                running_avg_loss = accu_loss / (batch_idx + 1)

                # The metric accepts either the raw outputs or outputs.argmax(-1).
                self.metric.update(outputs, targets)

                progress_bar.update(1)
                # Refresh the postfix every 20 batches and on the last batch.
                if batch_idx % 20 == 0 or (batch_idx + 1) == progress_bar.total:
                    progress_bar.set_postfix({"Loss": running_avg_loss,
                                              "Accuracy": self.metric.compute().item()})

        # ReduceLROnPlateau needs the validation loss, so it is stepped in validate_epoch instead.
        if (self.scheduler is not None) and (not isinstance(self.scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau)):
            self.scheduler.step()
            self.current_lr = self.scheduler.get_last_lr()[0]

        return running_avg_loss, self.metric.compute().item()

    def validate_epoch(self, epoch):
        """Run one validation epoch.

        Returns:
            ``(running average loss, metric value)``, or ``(None, None)`` when there
            is no validation loader, so callers can always unpack two values.
        """
        if not self.val_loader:
            return None, None

        self.model.eval()

        accu_loss = 0
        running_avg_loss = 0
        # Start the metric from a clean state for this epoch.
        self.metric.reset()
        with tqdm(total=len(self.val_loader), desc=f"Epoch {epoch+1} [Validating]", leave=True) as progress_bar:
            with torch.no_grad():
                for batch_idx, (inputs, targets) in enumerate(self.val_loader):
                    inputs = inputs.to(self.device)
                    targets = targets.to(self.device)

                    outputs = self.model(inputs)

                    loss = self.loss_fn(outputs, targets)
                    # Running average = accumulated batch loss / number of batches so far.
                    accu_loss += loss.item()
                    running_avg_loss = accu_loss / (batch_idx + 1)

                    self.metric.update(outputs, targets)

                    progress_bar.update(1)
                    # Refresh the postfix every 20 batches and on the last batch.
                    if batch_idx % 20 == 0 or (batch_idx + 1) == progress_bar.total:
                        progress_bar.set_postfix({"Loss": running_avg_loss,
                                                  "Accuracy": self.metric.compute().item()})

        # Feed the epoch-level validation loss to ReduceLROnPlateau.
        if isinstance(self.scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
            self.scheduler.step(running_avg_loss)
            # Read the rate from the optimizer, which works whether or not this
            # scheduler class exposes get_last_lr().
            self.current_lr = self.optimizer.param_groups[0]['lr']

        return running_avg_loss, self.metric.compute().item()

    def fit(self, epochs):
        """Train for up to ``epochs`` epochs and return the per-epoch history dict.

        The history has the keys 'train_loss', 'train_acc', 'val_loss', 'val_acc'
        and 'lr'; the validation entries are None when there is no validation loader.
        """
        history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': [], 'lr': []}
        for epoch in range(epochs):
            train_loss, train_acc = self.train_epoch(epoch)
            val_loss, val_acc = self.validate_epoch(epoch)
            print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f} Train Accuracy: {train_acc:.4f}",
                  f", Val Loss: {val_loss:.4f} Val Accuracy: {val_acc:.4f}" if val_loss is not None else "",
                  f", Current lr:{self.current_lr:.6f}")
            # Record this epoch's results, including the learning rate.
            history['train_loss'].append(train_loss); history['train_acc'].append(train_acc)
            history['val_loss'].append(val_loss); history['val_acc'].append(val_acc)
            history['lr'].append(self.current_lr)

            # The callbacks monitor validation values, so they only run when validation produced some.
            if self.callbacks and val_loss is not None:
                is_epoch_loop_break = self._execute_callbacks(self.callbacks, self.model, epoch, val_loss, val_acc)
                if is_epoch_loop_break:
                    break

        return history

    def _execute_callbacks(self, callbacks, model, epoch, val_loss, val_acc):
        """Run each ModelCheckpoint / EarlyStopping callback; return True if training should stop."""
        is_early_stopped = False
        monitored = {'val_loss': val_loss, 'val_acc': val_acc}

        for callback in callbacks:
            # Callbacks monitoring anything other than val_loss / val_acc are left untouched.
            if callback.monitor not in monitored:
                continue
            value = monitored[callback.monitor]
            if isinstance(callback, ModelCheckpoint):
                callback.save(model, epoch, value)
            if isinstance(callback, EarlyStopping):
                # Always call the check, and keep an earlier stop decision from being overwritten.
                is_early_stopped = callback.check_early_stop(value) or is_early_stopped

        return is_early_stopped

    def get_trained_model(self):
        """Return the model held by this trainer."""
        return self.model

In [26]:
import torch
import torch.nn as nn
from torch.optim import SGD, Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torchmetrics.classification import Accuracy

model = create_resnet_model('resnet50', num_classes=2)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
optimizer = Adam(model.parameters(), lr=0.001) # alternatives noted by the author: model.fc.parameters(), weight_decay=0.9
loss_fn = nn.CrossEntropyLoss()
# Create the torchmetrics Accuracy object; the trainer moves it to the device.
accuracy_metric = Accuracy(task='multiclass', num_classes=2)
# Halve the learning rate when the monitored value has not improved for 3 epochs (patience=3).
scheduler = ReduceLROnPlateau(
            optimizer=optimizer, mode='min', factor=0.5, patience=3, threshold=0.01, min_lr=0.00001)

# Both callbacks monitor validation accuracy (higher is better).
checkpoint_cb = ModelCheckpoint('checkpoints', monitor='val_acc', mode='max', save_interval=1, verbose=1)
earlystop_cb = EarlyStopping(monitor='val_acc', mode='max', early_patience=5, verbose=1)
callbacks = [checkpoint_cb, earlystop_cb]

trainer = TrainerWithTM(model=model, loss_fn=loss_fn, optimizer=optimizer,
                  train_loader=tr_loader, val_loader=val_loader, scheduler=scheduler, 
                  callbacks=callbacks, metric=accuracy_metric,
                  device=device)
# Train and evaluate.
history = trainer.fit(5)

Epoch 1 [Training..]: 100%|██████████| 126/126 [00:13<00:00,  9.52it/s, Loss=0.373, Accuracy_01=0.856, Accuracy_02=0.856]
Epoch 1 [Validating]: 100%|██████████| 63/63 [00:05<00:00, 10.62it/s, Loss=0.386, Accuracy_01=0.826, Accuracy_02=0.826]


Epoch 1/10, Train Loss: 0.3727 Train Accuracy: 0.8556 , Val Loss: 0.3862 Val Accuracy: 0.8261 , Current lr:0.001000
Saved model checkpoint at checkpoints/checkpoint_epoch_1_val_acc_0.8261.pt


Epoch 2 [Training..]: 100%|██████████| 126/126 [00:11<00:00, 10.53it/s, Loss=0.234, Accuracy_01=0.908, Accuracy_02=0.908]
Epoch 2 [Validating]: 100%|██████████| 63/63 [00:04<00:00, 13.97it/s, Loss=0.224, Accuracy_01=0.907, Accuracy_02=0.907]


Epoch 2/10, Train Loss: 0.2341 Train Accuracy: 0.9075 , Val Loss: 0.2236 Val Accuracy: 0.9070 , Current lr:0.001000
Saved model checkpoint at checkpoints/checkpoint_epoch_2_val_acc_0.9070.pt


Epoch 3 [Training..]: 100%|██████████| 126/126 [00:11<00:00, 10.56it/s, Loss=0.247, Accuracy_01=0.905, Accuracy_02=0.905]
Epoch 3 [Validating]: 100%|██████████| 63/63 [00:04<00:00, 14.15it/s, Loss=0.363, Accuracy_01=0.879, Accuracy_02=0.879]


Epoch 3/10, Train Loss: 0.2471 Train Accuracy: 0.9050 , Val Loss: 0.3634 Val Accuracy: 0.8791 , Current lr:0.001000
EarlyStopping counter: 1/5


Epoch 4 [Training..]: 100%|██████████| 126/126 [00:11<00:00, 10.54it/s, Loss=0.164, Accuracy_01=0.943, Accuracy_02=0.943]
Epoch 4 [Validating]: 100%|██████████| 63/63 [00:04<00:00, 14.28it/s, Loss=0.288, Accuracy_01=0.875, Accuracy_02=0.875]


Epoch 4/10, Train Loss: 0.1640 Train Accuracy: 0.9425 , Val Loss: 0.2879 Val Accuracy: 0.8751 , Current lr:0.001000
EarlyStopping counter: 2/5


Epoch 5 [Training..]:   4%|▍         | 5/126 [00:00<00:19,  6.22it/s, Loss=0.206, Accuracy_01=0.875, Accuracy_02=0.875]


KeyboardInterrupt: 