## 12.4 성능 개선
* 에폭 늘리기
* 스케줄러 추가
* TTA(테스트 단계 데이터 증강) 기법
* 레이블 스무딩 적용

In [None]:
import os
import random
import numpy as np
import pandas as pd
import cv2
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import albumentations as A
from albumentations.pytorch import ToTensorV2
import timm
from timm import create_model
from tqdm.auto import tqdm

In [None]:
# Fix every random number source to one seed so repeated runs are comparable.
seed = 50
os.environ['PYTHONHASHSEED'] = str(seed)
for _seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_fn(seed)

# cuDNN settings: deterministic mode on, benchmark mode off, and cuDNN itself disabled.
torch.backends.cudnn.enabled = False
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Device setup: use the GPU when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Load the competition tables: training labels, test ids and the submission template.
data_path = '../../data/12_plant/'
train, test, submission = (
    pd.read_csv(data_path + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Hold out 10% of the training rows for validation, stratified on the four label columns.
train, valid = train_test_split(
    train,
    stratify=train[['healthy', 'multiple_diseases', 'rust', 'scab']],
    test_size=0.1,
    random_state=50,
)

class ImageDataSet(Dataset):
    """Dataset that reads ``<img_dir><image id>.jpg`` files listed in a DataFrame.

    Column 0 of ``df`` holds the image id. Columns 1-4 hold the per-class
    label columns; the position of their maximum is used as the class index.

    Args:
        df: DataFrame with the image id in column 0 (and labels in columns
            1-4 unless ``is_test`` is true).
        img_dir: Directory prefix prepended to every image id.
        transform: Callable invoked as ``transform(image=...)`` that returns a
            mapping with an ``'image'`` entry. ``None`` leaves the image
            unchanged.
        is_test: When true, ``__getitem__`` returns only the image.
    """

    def __init__(self, df, img_dir=data_path+'images/', transform=None, is_test=False):
        super().__init__()
        self.df = df
        self.img_dir = img_dir
        # The transform is always called with the keyword ``image=`` and its
        # result is indexed with ['image'], so the no-op default has to follow
        # the same protocol (a plain ``lambda x: x`` would raise TypeError).
        self.transform = transform if transform is not None else self._identity
        self.is_test = is_test

    @staticmethod
    def _identity(image):
        """No-op transform that mimics the ``transform(image=...)`` protocol."""
        return {'image': image}

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return the transformed image, plus its class index unless ``is_test``.

        Raises:
            FileNotFoundError: If the image file cannot be read.
        """
        img_id = self.df.iloc[idx, 0]
        img_path = self.img_dir + img_id + '.jpg'
        image = cv2.imread(img_path)
        if image is None:
            # Fail with the offending path instead of an opaque cvtColor error.
            raise FileNotFoundError(f'Could not read image: {img_path}')
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = self.transform(image=image)['image']
        if self.is_test:
            return image
        # Work on the raw values so the result is always a position (0-3),
        # independent of how pandas implements Series.argmax.
        label = np.argmax(self.df.iloc[idx, 1:5].to_numpy())
        return image, label

# Training-time augmentation pipeline; transforms are applied in the order listed.
transform_train = A.Compose([
    A.Resize(450, 650), # resize the image
    A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2, p=0.3), # adjust brightness and contrast
    A.VerticalFlip(p=0.2), # flip top-to-bottom
    A.HorizontalFlip(p=0.5), # flip left-to-right
    A.ShiftScaleRotate(shift_limit=0.1, scale_limit=0.2, rotate_limit=30, p=0.3), # shift, scale, rotate
    A.OneOf([
        A.Emboss(p=1), # emboss
        A.Sharpen(p=1), # sharpen
        A.Blur(p=1) # blur
    ], p=0.3),
    A.PiecewiseAffine(p=0.3), # affine transform
    A.Normalize(),
    ToTensorV2()
])

# Validation/test pipeline: same resize, normalisation and tensor conversion, no random augmentation.
transform_test  = A.Compose([
    A.Resize(450, 650),
    A.Normalize(),
    ToTensorV2()
])

# Both splits read from the same image folder; only the training split is augmented.
img_dir = data_path+'images/'
dataset_train, dataset_valid = (
    ImageDataSet(frame, img_dir=img_dir, transform=pipeline)
    for frame, pipeline in ((train, transform_train), (valid, transform_test))
)

def seed_worker(worker_id):
    """Seed Python's and NumPy's global RNGs from the current torch seed.

    Passed to the DataLoaders as ``worker_init_fn``. ``worker_id`` is accepted
    for that call signature but is not used.
    """
    # Reduce the torch seed modulo 2**32 before handing it to the other RNGs.
    derived_seed = torch.initial_seed() % (1 << 32)
    random.seed(derived_seed)
    np.random.seed(derived_seed)
# One seeded generator is shared by every DataLoader in this notebook.
g = torch.Generator().manual_seed(0)

batch_size = 4

# Settings common to the training and validation loaders.
_loader_opts = dict(
    batch_size=batch_size,
    worker_init_fn=seed_worker,
    generator=g,
    num_workers=2,
)
# Only the training loader shuffles its samples.
loader_train = DataLoader(dataset_train, shuffle=True, **_loader_opts)
loader_valid = DataLoader(dataset_valid, shuffle=False, **_loader_opts)

### 12.4.1 모델 훈련 및 성능 검증

In [None]:
from torch.optim import lr_scheduler

epochs = 5

# Pretrained EfficientNet-B4 with a 4-class head, moved to the selected device.
model = create_model('efficientnet_b4', pretrained=True, num_classes=4)
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=6e-5, weight_decay=1e-4)

# Create the learning-rate scheduler (cosine annealing with warm restarts).
scheduler = lr_scheduler.CosineAnnealingWarmRestarts(
    optimizer,
    T_0=20,
    T_mult=1,
    eta_min=1e-6,
)

In [None]:
for epoch in range(epochs):
    # ---- Training pass ----
    model.train()
    train_iter_loss = []
    # Wrap the loader itself so the progress bar can show the total batch count.
    for images, labels in tqdm(loader_train):
        images = images.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        train_iter_loss.append(loss.item())
        loss.backward()
        optimizer.step()
        # The learning rate is updated after every batch, not once per epoch.
        scheduler.step()
    print(f'Epoch [{epoch+1}/{epochs}] - train loss : {sum(train_iter_loss)/len(loader_train):.4f}')

    # ---- Validation pass ----
    model.eval()
    valid_iter_loss = []
    preds_list = []
    true_onehot_list = []
    with torch.no_grad():
        # Evaluate on the held-out split: iterating the training loader here
        # would report metrics on augmented training data instead.
        for images, labels in tqdm(loader_valid):
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            valid_iter_loss.append(loss.item())
            # Class probabilities and one-hot targets, collected for ROC AUC.
            preds = torch.softmax(outputs.cpu(), dim=1).numpy()
            true_onehot = torch.eye(4)[labels.cpu()].numpy()
            preds_list.extend(preds)
            true_onehot_list.extend(true_onehot)
    print(f'Epoch [{epoch+1}/{epochs}] - valid loss : {sum(valid_iter_loss)/len(loader_valid):.4f}')
    print(f'Epoch [{epoch+1}/{epochs}] - valid roc auc : {roc_auc_score(true_onehot_list, preds_list):.4f}')

### 12.4.2 예측
* TTA(테스트 단계 데이터 증강)
- 테스트 단계에서 활용하는 데이터 증강 기법
- 진행 절차
    - 테스트 데이터에 여러 변환 적용
    - 변환된 테스트 데이터별로 타깃 확률값을 예측
    - 타깃 예측 확률의 평균을 구함

In [None]:
# Settings shared by both test-time loaders: fixed order, same batch size and seeding.
_test_loader_opts = dict(
    batch_size=batch_size,
    shuffle=False,
    worker_init_fn=seed_worker,
    generator=g,
    num_workers=2,
)

# Dataset and loader for the test images without random augmentation.
dataset_test = ImageDataSet(test, img_dir=data_path+'images/', transform=transform_test, is_test=True)
loader_test = DataLoader(dataset_test, **_test_loader_opts)

# Dataset and loader for TTA: the same test images, run through the training augmentations.
dataset_TTA = ImageDataSet(test, img_dir=data_path+'images/', transform=transform_train, is_test=True)
loader_TTA = DataLoader(dataset_TTA, **_test_loader_opts)

In [None]:
# Predict class probabilities for the un-augmented test images.
model.eval()
preds_test = np.zeros((len(test), 4))
with torch.no_grad():
    for i, images in enumerate(loader_test):
        images = images.to(device)
        outputs = model(images)
        # softmax over the class dimension (the original `torch.sortmax` does not exist).
        preds_part = torch.softmax(outputs.cpu(), dim=1).numpy()
        # Size the slice from the batch actually received, so a short final batch lines up.
        start = i * batch_size
        preds_test[start:start + len(preds_part)] += preds_part

In [None]:
# Copy the sample submission and fill its four class columns with the plain test predictions.
submission_test = submission.copy()
_test_prob_columns = ['healthy', 'multiple_diseases', 'rust', 'scab']
submission_test[_test_prob_columns] = preds_test

In [None]:
# Number of TTA rounds
num_TTA = 7

# Make sure the model is in inference mode even if this cell is run on its own.
model.eval()
preds_tta = np.zeros((len(test), 4))
with torch.no_grad():
    for _ in range(num_TTA):
        # Each pass over loader_TTA applies the augmentation pipeline again.
        # A separate batch index avoids reusing the outer loop's variable.
        for batch_idx, images in enumerate(loader_TTA):
            images = images.to(device)
            outputs = model(images)
            preds_part = torch.softmax(outputs.cpu(), dim=1).numpy()
            start = batch_idx * batch_size
            preds_tta[start:start + len(preds_part)] += preds_part
# Average the accumulated probabilities over all rounds.
preds_tta /= num_TTA

In [None]:
# Copy the sample submission and fill its four class columns with the TTA-averaged predictions.
submission_tta = submission.copy()
_tta_prob_columns = ['healthy', 'multiple_diseases', 'rust', 'scab']
submission_tta[_tta_prob_columns] = preds_tta

In [None]:
# Write the submission files (plain prediction and TTA) into the data directory.
for _frame, _file_name in (
    (submission_test, 'submission_test.csv'),
    (submission_tta, 'submission_tta.csv'),
):
    _frame.to_csv(data_path+_file_name, index=False)

### 레이블 스무딩
* 과잉 확신한 예측값을 보정
* $(1-\alpha) * preds + \frac{\alpha}{K}$
* $\alpha$ : 레이블 스무딩 강도
* K : 타깃값 개수
* preds : 예측 확률값
* ex> preds : (0,0,1,0), $\alpha$ : 0.1 => (0.025, 0.025, 0.925, 0.025)

In [None]:
def apply_label_smoothing(df, target, alpha, threshold):
    """Smooth over-confident rows of the target probability columns.

    A row is smoothed when any of its target values exceeds ``threshold``;
    its values become ``(1 - alpha) * value + alpha / K``, where ``K`` is the
    number of target columns. Other rows are returned unchanged.

    Args:
        df: DataFrame containing the target columns. It is not modified.
        target: List of target column names.
        alpha: Smoothing strength.
        threshold: Rows with any target value above this are smoothed.

    Returns:
        A new DataFrame with only the ``target`` columns, keeping the index
        of ``df``.
    """
    df_target = df[target].copy()
    k = len(target)  # number of target columns

    # Select rows by boolean mask rather than by position, so the result is
    # correct for any index (not only the default 0..n-1 range).
    overconfident = (df_target > threshold).any(axis=1)
    if overconfident.any():
        smoothed = (1 - alpha) * df_target.loc[overconfident] + alpha / k
        # Assign raw values to bypass label alignment on the right-hand side.
        df_target.loc[overconfident] = smoothed.to_numpy()
    return df_target

In [None]:
# Label smoothing settings and the columns they apply to.
target = ['healthy', 'multiple_diseases', 'rust', 'scab']
alpha = 0.001
threshold = 0.999

# Smoothed copy of the plain test submission.
submission_test_ls = submission_test.copy()
submission_test_ls[target] = apply_label_smoothing(submission_test_ls, target, alpha, threshold)

# Smoothed copy of the TTA submission.
submission_tta_ls = submission_tta.copy()
submission_tta_ls[target] = apply_label_smoothing(submission_tta_ls, target, alpha, threshold)

# Both smoothed files are written with a bare file name, i.e. relative to the working directory.
for _frame, _file_name in (
    (submission_test_ls, 'submission_test_ls.csv'),
    (submission_tta_ls, 'submission_tta_ls.csv'),
):
    _frame.to_csv(_file_name, index=False)