In [None]:
import numpy as np
import pandas as pd
import json
from PIL import Image
import os

import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import sys
sys.path.append('../input/pytorch-image-models/pytorch-image-models-master/')
sys.path.append('../input/adamp-optimizer/AdamP-master')
import timm
from adamp import AdamP
!conda install -c conda-forge nvidia-apex --yes
from apex import amp

# 파일들 불러오기

In [None]:
# Locations of the competition images.
TRAIN_DIR = '../input/cassava-leaf-disease-classification/train_images/'
TEST_DIR = '../input/cassava-leaf-disease-classification/test_images/'

# Label map shipped with the dataset (presumably label number -> disease name,
# judging by the file name). Opened via a context manager so the file handle
# is closed deterministically.
with open("../input/cassava-leaf-disease-classification/label_num_to_disease_map.json") as label_map_file:
    labels = json.load(label_map_file)

# Training metadata; the `image_id` and `label` columns are read further down.
train = pd.read_csv('../input/cassava-leaf-disease-classification/train.csv')

# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Kept as the final expression of the cell so the preview is actually rendered.
train.head()

# 학습 파라미터 설정

In [None]:
class CFG:
    """Hyper-parameters and model settings shared by the cells below."""

    # --- model ---
    model_name = 'tf_efficientnet_b3_ns'  # architecture name handed to timm.create_model
    target_size = 5                       # class count (only referenced by commented-out code here)

    # --- data ---
    IMG_SIZE = 256                        # images are resized to IMG_SIZE x IMG_SIZE
    BATCH = 32                            # batch size of the training DataLoader

    # --- optimisation ---
    EPOCHS = 20                           # number of passes over the training loader
    LR = 1e-4                             # optimizer learning rate
    weight_decay = 1e-6                   # optimizer weight decay

# 학습 이미지와 라벨 구분

In [None]:
# Training inputs: image file names and their labels, taken from train.csv.
X_Train = train['image_id'].values
Y_Train = train['label'].values

# Test inputs: every entry found in the test image directory.
X_Test = list(os.listdir(TEST_DIR))

# 데이터 불러오는 클래스 선언

In [None]:
class GetData(Dataset):
    """Dataset that loads images by file name from a single directory.

    Args:
        Dir: Directory containing the image files.
        FNames: Sequence of image file names inside ``Dir``.
        Labels: Sequence of labels aligned with ``FNames``, or ``None`` when
            no labels exist (e.g. the test split).
        Transform: Callable applied to the loaded PIL image.

    Each item is ``(transformed_image, label)`` for labelled data and
    ``(transformed_image, file_name)`` for unlabelled data.
    """

    def __init__(self, Dir, FNames, Labels, Transform):
        self.dir = Dir
        self.fnames = FNames
        self.transform = Transform
        self.labels = Labels

    def __len__(self):
        """Return the number of image files in the dataset."""
        return len(self.fnames)

    def __getitem__(self, index):
        """Load, transform and return the sample at ``index``."""
        fname = self.fnames[index]

        # The context manager closes the underlying file as soon as the pixel
        # data has been copied; convert("RGB") guarantees three channels,
        # which the 3-channel normalisation in the transform relies on.
        with Image.open(os.path.join(self.dir, fname)) as img:
            x = img.convert("RGB")
        x = self.transform(x)

        # Directory-name convention used by this notebook.
        if "train" in self.dir:
            return x, self.labels[index]
        if "test" in self.dir:
            return x, fname

        # Any other directory used to fall through and return None; decide by
        # whether labels were supplied instead.
        if self.labels is not None:
            return x, self.labels[index]
        return x, fname

# Transform 정의 (Augmentation? 그럼 test에선 왜하지? TTA?)

In [None]:
# Preprocessing + augmentation pipeline, applied in the order listed.
Transform = transforms.Compose([
    # PIL image -> tensor
    transforms.ToTensor(),
    # Resize to the square resolution configured in CFG
    transforms.Resize(size=(CFG.IMG_SIZE, CFG.IMG_SIZE)),
    # Random augmentation: rotation drawn from [-90, 90] degrees, then a
    # horizontal flip with probability 0.5
    transforms.RandomRotation(degrees=90),
    transforms.RandomHorizontalFlip(p=0.5),
    # Per-channel normalisation with fixed mean / std
    transforms.Normalize(
        mean=(0.485, 0.456, 0.406),
        std=(0.229, 0.224, 0.225),
    ),
])

In [None]:
# Training split: file names paired with their labels, shuffled by the loader.
trainset = GetData(Dir=TRAIN_DIR, FNames=X_Train, Labels=Y_Train, Transform=Transform)
trainloader = DataLoader(
    dataset=trainset,
    batch_size=CFG.BATCH,
    shuffle=True,
    num_workers=4,
)

# Test split: no label file exists for the test images, so Labels is None and
# the dataset yields file names instead.
# NOTE(review): the test set reuses the same Transform as training, including
# the random rotation/flip — confirm this is intended (e.g. for TTA); otherwise
# a deterministic pipeline should be used for inference.
testset = GetData(Dir=TEST_DIR, FNames=X_Test, Labels=None, Transform=Transform)
testloader = DataLoader(
    dataset=testset,
    batch_size=1,
    shuffle=False,
    num_workers=4,
)


# 모델 선언

In [None]:
class EfficientNet(nn.Module):
    """timm backbone whose classification head is replaced by a new linear layer.

    Args:
        model_arch: Architecture name passed to ``timm.create_model``.
        n_class: Number of output classes of the new head.
        pretrained: Forwarded to ``timm.create_model``.
    """

    def __init__(self, model_arch, n_class, pretrained=False):
        super().__init__()
        # Honour the constructor argument rather than reading the global
        # CFG.model_name, so the class works for any architecture name.
        self.model = timm.create_model(model_arch, pretrained=pretrained)

        # The head is read from and written to `classifier` here. Per the
        # original author's notes the ResNeXt variant of this code used `fc`,
        # so architectures without a `classifier` attribute will not work
        # with this wrapper as written.
        n_features = self.model.classifier.in_features
        self.model.classifier = nn.Linear(n_features, n_class)

    def forward(self, x):
        """Return the backbone's output (class logits from the new head) for ``x``."""
        return self.model(x)

# SymmetricCrossEntropy

In [None]:
class SymmetricCrossEntropy(nn.Module):
    """Weighted sum of cross-entropy and a clamped log-probability term.

    The loss is ``alpha * ce + beta * rce`` where ``ce`` is
    ``F.cross_entropy(logits, targets)`` and ``rce`` is
    ``-sum(onehot(targets) * log(clamp(softmax(logits), 1e-7, 1)))``.

    NOTE(review): as written, ``rce`` is the forward cross-entropy again (with
    clamped probabilities), not the reverse term ``-sum(p * log(onehot))`` from
    the Symmetric Cross Entropy paper. The numerics are kept unchanged here;
    confirm which formulation is intended.

    Args:
        alpha: Weight of the cross-entropy term.
        beta: Weight of the second term.
        num_classes: Number of classes (width of the one-hot encoding).
    """

    def __init__(self, alpha=0.1, beta=1.0, num_classes=5):
        super().__init__()
        self.alpha = alpha
        self.beta = beta
        self.num_classes = num_classes

    def forward(self, logits, targets, reduction='mean'):
        """Compute the loss.

        Args:
            logits: Raw class scores; softmax is taken over dim 1.
            targets: Class indices used to index the one-hot table and passed
                to ``F.cross_entropy``.
            reduction: ``'mean'`` or ``'sum'`` reduce both terms accordingly;
                any other value leaves the second term unreduced (per sample)
                and is forwarded as-is to ``F.cross_entropy``.

        Returns:
            ``alpha * ce_loss + beta * rce_loss``.
        """
        # Build the one-hot table on the logits' device instead of creating it
        # on the CPU and calling .cuda(): that failed on CPU-only machines and
        # indexed a CPU tensor with (possibly) GPU-resident targets.
        onehot_targets = torch.eye(self.num_classes, device=logits.device)[targets]

        ce_loss = F.cross_entropy(logits, targets, reduction=reduction)

        # Clamp probabilities away from zero so the log stays finite.
        log_probs = logits.softmax(1).clamp(1e-7, 1.0).log()
        rce_loss = (-onehot_targets * log_probs).sum(1)

        if reduction == 'mean':
            rce_loss = rce_loss.mean()
        elif reduction == 'sum':
            rce_loss = rce_loss.sum()
        return self.alpha * ce_loss + self.beta * rce_loss

In [None]:
# Derive the class count once so the model head and the loss agree on it
# (the loss previously relied on its default of 5 classes).
n_classes = train.label.nunique()

# Build the network and move it to the selected device (once is enough).
model = EfficientNet(CFG.model_name, n_classes, pretrained=True).to(DEVICE)

criterion = SymmetricCrossEntropy(num_classes=n_classes).to(DEVICE)
optimizer = AdamP(model.parameters(), lr=CFG.LR, weight_decay=CFG.weight_decay)

# Apex mixed precision (opt_level "O1"), used to speed up training.
# amp.initialize returns the wrapped model and optimizer that must be used
# from here on.
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")


In [None]:
print('Training Start!\n')
for epoch in range(CFG.EPOCHS):
    train_loss = 0.0
    n_batches = 0

    model.train()

    # The batch labels are bound to `targets` so the module-level `labels`
    # mapping loaded from the JSON file is not overwritten by a tensor.
    for images, targets in trainloader:
        images = images.to(DEVICE)
        targets = targets.to(DEVICE)

        logits = model(images)
        loss = criterion(logits, targets)
        optimizer.zero_grad()

        # Apex mixed precision: backpropagate through the scaled loss.
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        optimizer.step()

        # .item() yields a plain Python float, so no graph is kept alive by
        # the running total.
        train_loss += loss.item()
        n_batches += 1

    # No validation runs in this cell; eval mode is left set after each epoch
    # (and re-enabled training mode follows at the top of the next one).
    model.eval()

    # Average over the number of batches actually processed. Dividing by the
    # last enumerate index under-counted by one and raised ZeroDivisionError
    # when the loader produced a single batch.
    print('Epoch: %d | Loss: %.4f' % (epoch, train_loss / max(n_batches, 1)))

apex의 AMP를 사용했을 때 
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to~~~ 에 대한 답변 (깃헙 이슈)

Occasionally seeing a message like “overflow detected, skipping step, reducing loss scale” is normal behavior with dynamic loss scaling, and it usually happens in the first few iterations because Amp begins by trying a high loss scale.

Seeing nan loss values (ie, the loss scalar resulting from the forward pass is nan or inf) is NOT normal, and indicates something has gone wrong. As Piotr says, if this is the case, a minimal repro would be helpful.