##  데이터 준비

In [None]:
import pandas as pd


# Root folder of the competition files (note the trailing slash).
data_path = '/kaggle/input/plant-pathology-2020-fgvc7/'

# Read the three CSV tables that live directly under data_path.
train, test, submission = (
    pd.read_csv(data_path + file_name)
    for file_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

## EDA

In [None]:
# Row/column counts of the training and test tables.
train.shape, test.shape

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# Preview the first rows of the test table.
test.head()

In [None]:
# Preview the first rows of the sample submission table.
submission.head()

## 데이터 시각화

### 타깃값 분포

In [None]:
# One sub-frame per target column, keeping only the rows where it equals 1.
healthy, multiple_diseases, rust, scab = (
    train[train[target] == 1]
    for target in ('healthy', 'multiple_diseases', 'rust', 'scab')
)

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

# Pie chart of how many training rows fall into each target subset.
mpl.rc('font', size=15)
plt.figure(figsize=(7, 7))

label = ['healthy', 'multiple diseases', 'rust', 'scab']
class_counts = [len(subset) for subset in (healthy, multiple_diseases, rust, scab)]

# The trailing semicolon keeps the notebook from echoing the return value.
plt.pie(class_counts, labels=label, autopct='%.1f%%');

In [None]:
import matplotlib.gridspec as gridspec
import cv2

def show_image(img_ids, rows=2, cols=3):
    """Plot the images named by ``img_ids`` on a ``rows`` x ``cols`` grid.

    Args:
        img_ids: Sized iterable of image ids; each id is resolved to
            ``{data_path}/images/{img_id}.jpg``.
        rows: Number of grid rows.
        cols: Number of grid columns.

    Raises:
        AssertionError: If there are more ids than grid cells.
        FileNotFoundError: If an image file cannot be read.
    """
    assert len(img_ids) <= rows*cols, 'more images than grid cells'

    plt.figure(figsize=(15, 8))
    grid = gridspec.GridSpec(rows, cols)

    for idx, img_id in enumerate(img_ids):
        img_path = f'{data_path}/images/{img_id}.jpg'
        image = cv2.imread(img_path)
        # cv2.imread reports failure by returning None rather than raising;
        # fail here with the path instead of with an opaque cvtColor error.
        if image is None:
            raise FileNotFoundError(f'Could not read image: {img_path}')
        # OpenCV loads channels as BGR; convert to RGB before plotting.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        ax = plt.subplot(grid[idx])
        ax.imshow(image)

## 이미지 출력

In [None]:
# How many sample images to take from each class subset.
num_of_imgs = 6

# Image ids of the last `num_of_imgs` rows of each subset (positional slice).
last_healthy_img_ids = healthy['image_id'].iloc[-num_of_imgs:]
last_multiple_diseases_img_ids = multiple_diseases['image_id'].iloc[-num_of_imgs:]
last_rust_img_ids = rust['image_id'].iloc[-num_of_imgs:]
last_scab_img_ids = scab['image_id'].iloc[-num_of_imgs:]

In [None]:
# Plot the sample images selected from the healthy subset.
show_image(last_healthy_img_ids)

In [None]:
# Plot the sample images selected from the multiple_diseases subset.
show_image(last_multiple_diseases_img_ids) 

In [None]:
# Plot the sample images selected from the rust subset.
show_image(last_rust_img_ids) 

In [None]:
# Plot the sample images selected from the scab subset.
show_image(last_scab_img_ids) 

# 모델

##  시드값 고정 및 GPU 장비 설정
### 시드값 고정

In [None]:
import torch 
import random
import numpy as np
import os


# Fix the seed of every random number generator used by the notebook.
seed = 50
# PYTHONHASHSEED set at runtime only affects interpreters started afterwards.
os.environ['PYTHONHASHSEED'] = str(seed)
random.seed(seed)
np.random.seed(seed)
# The model is trained with PyTorch, so its generators must be seeded too;
# seeding only Python and NumPy leaves PyTorch's own randomness unseeded.
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)  # silently ignored when CUDA is unavailable
# Ask cuDNN for deterministic kernels and turn off its autotuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

### GPU 장비 설정

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

device

### 훈련 데이터, 검증 데이터 분리

In [None]:
from sklearn.model_selection import train_test_split


# Hold out 10% of the rows as a validation set, stratified on the four target
# columns, with a fixed random_state so the split repeats across runs.
# Note that `train` is rebound to the remaining (training) part.
train, valid = train_test_split(train, 
                                test_size=0.1,
                                stratify=train[['healthy', 'multiple_diseases', 'rust', 'scab']],
                                random_state=50)

### 데이터셋 클래스 정의

In [None]:
import cv2
from torch.utils.data import Dataset 
import numpy as np

class ImageDataset(Dataset):
    """Dataset that loads ``<img_dir>/<image_id>.jpg`` for each row of ``df``.

    Args:
        df: Table whose first column holds the image id; unless ``is_test``
            is set, columns 1-4 are read to derive the label.
        img_dir: Directory that contains the ``.jpg`` files.
        transform: Optional callable invoked as ``transform(image=image)``
            whose result is indexed with ``'image'``.
        is_test: When true, items are images only (no label).
    """

    def __init__(self, df, img_dir='./', transform=None, is_test=False):
        super().__init__()
        self.df = df
        self.img_dir = img_dir
        self.transform = transform
        self.is_test = is_test

    def __len__(self):
        """Return the number of rows in the backing table."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the image at position ``idx``, plus its label unless ``is_test``.

        The label is the position of the largest value among columns 1-4.

        Raises:
            FileNotFoundError: If the image file cannot be read.
        """
        img_id = self.df.iloc[idx, 0]
        # os.path.join also works when img_dir has no trailing separator,
        # where plain string concatenation would build a wrong path.
        img_path = os.path.join(self.img_dir, img_id + '.jpg')
        image = cv2.imread(img_path)
        # cv2.imread reports failure by returning None rather than raising.
        if image is None:
            raise FileNotFoundError(f'Could not read image: {img_path}')
        # OpenCV loads channels as BGR; convert to RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        if self.transform is not None:
            image = self.transform(image=image)['image']

        if self.is_test:
            return image

        label = np.argmax(self.df.iloc[idx, 1:5])
        return image, label

### 이미지 변환기

In [None]:

import albumentations as A
from albumentations.pytorch import ToTensorV2

# Training pipeline: random crop/rotate/flip/transpose augmentations, then a
# resize, normalisation and conversion to a tensor.
# NOTE(review): the final Resize(512,512) follows a RandomResizedCrop(512,512),
# so it looks redundant -- confirm before removing.
transform_train = A.Compose([
    A.RandomResizedCrop(512,512),
    A.Rotate(20),
    A.Flip(),
    A.Transpose(),
    A.Resize(512,512),
    A.Normalize(p=1.0),
    ToTensorV2(),
])

# Evaluation pipeline: resize, normalise and convert to a tensor only.
# NOTE(review): this resizes to 450x650 while training uses 512x512 --
# confirm the mismatch is intended.
transform_test = A.Compose([
    A.Resize(450, 650), 
    A.Normalize(),      
    ToTensorV2()       
])

### 데이터셋 및 데이터 로더 생성

In [None]:
# Folder passed to ImageDataset as the location of the .jpg files.
img_dir = '/kaggle/input/plant-pathology-2020-fgvc7/images/'

# The training split uses transform_train; the validation split uses transform_test.
dataset_train = ImageDataset(train, img_dir=img_dir, transform=transform_train)
dataset_valid = ImageDataset(valid, img_dir=img_dir, transform=transform_test)

In [None]:
def seed_worker(worker_id):
    """Re-seed NumPy and ``random`` from PyTorch's current initial seed.

    The seed is reduced modulo 2**32 before use. ``worker_id`` is accepted
    so the function can serve as a ``worker_init_fn`` but is not used.
    """
    derived_seed = torch.initial_seed() % (1 << 32)
    for reseed in (np.random.seed, random.seed):
        reseed(derived_seed)


# Generator with a fixed seed, handed to the DataLoaders.
g = torch.Generator().manual_seed(0)

In [None]:
from torch.utils.data import DataLoader 

# Number of samples per mini-batch.
batch_size = 4

# Settings shared by the training and validation loaders.
loader_options = dict(batch_size=batch_size, worker_init_fn=seed_worker,
                      generator=g, num_workers=2)

# Only the training loader shuffles its samples.
loader_train = DataLoader(dataset_train, shuffle=True, **loader_options)
loader_valid = DataLoader(dataset_valid, shuffle=False, **loader_options)

##  모델 생성

### EfficientNet 모델 생성

In [None]:
!pip install efficientnet-pytorch==0.7.1

In [None]:
from efficientnet_pytorch import EfficientNet 

In [None]:

# Build a pretrained EfficientNet-B7 with four output classes and move it to
# the selected device in one step.
model = EfficientNet.from_pretrained('efficientnet-b7', num_classes=4).to(device)

## 모델 훈련 및 성능 검증

### 손실 함수와 옵티마이저 설정

In [None]:
import torch.nn as nn 


# Cross-entropy loss; the labels it receives are integer class indices.
criterion = nn.CrossEntropyLoss()

In [None]:

# AdamW over all model parameters (learning rate 6e-5, weight decay 1e-4).
optimizer = torch.optim.AdamW(model.parameters(), lr=0.00006, weight_decay=0.0001)

### 훈련 및 성능 검증

In [None]:
from sklearn.metrics import roc_auc_score 
from tqdm.notebook import tqdm 

# Number of passes over the training data.
epochs = 3

for epoch in range(epochs):
    # ---- Training phase ----
    model.train()
    epoch_train_loss = 0  # running sum of per-batch losses

    for images, labels in tqdm(loader_train):
        images = images.to(device)
        labels = labels.to(device)

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        outputs = model(images)
        loss = criterion(outputs, labels)

        epoch_train_loss += loss.item()
        loss.backward()
        optimizer.step()

    print(f'에폭 [{epoch+1}/{epochs}] - 훈련 데이터 손실값 : {epoch_train_loss/len(loader_train):.4f}')

    # ---- Validation phase ----
    model.eval()
    epoch_valid_loss = 0
    preds_list = []        # predicted probabilities, one row per sample
    true_onehot_list = []  # one-hot encoded true labels, one row per sample

    with torch.no_grad():  # no gradients are needed for evaluation
        for images, labels in loader_valid:
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)
            epoch_valid_loss += loss.item()

            # Predicted class probabilities.
            preds = torch.softmax(outputs.cpu(), dim=1).numpy()
            # torch.eye(4) is created on the CPU, so it must be indexed with
            # CPU labels; indexing it with labels that live on the GPU raises
            # a device-mismatch RuntimeError.
            true_onehot = torch.eye(4)[labels.cpu()].numpy()

            preds_list.extend(preds)
            true_onehot_list.extend(true_onehot)

    print(f'에폭 [{epoch+1}/{epochs}] - 검증 데이터 손실값 : {epoch_valid_loss/len(loader_valid):.4f} / 검증 데이터 ROC AUC : {roc_auc_score(true_onehot_list, preds_list):.4f}')

## 예측 및 결과 제출

In [None]:
# Test dataset: is_test=True makes each item an image without a label.
dataset_test = ImageDataset(test, img_dir=img_dir, 
                            transform=transform_test, is_test=True)
# Unshuffled loader over the test dataset.
loader_test = DataLoader(dataset_test, batch_size=batch_size, 
                         shuffle=False, worker_init_fn=seed_worker,
                         generator=g, num_workers=2)

### 예측

In [None]:
# Switch the model to evaluation mode before predicting.
model.eval()

# One row of four class probabilities per test row.
preds = np.zeros((len(test), 4))

with torch.no_grad():
    for batch_idx, images in enumerate(loader_test):
        logits = model(images.to(device))
        batch_probs = torch.softmax(logits.cpu(), dim=1).squeeze().numpy()

        # Rows of `preds` that correspond to this batch.
        start = batch_idx * batch_size
        preds[start:start + batch_size] += batch_probs

### 결과 제출

In [None]:
# Copy the predicted probabilities into the four target columns and write the
# table to disk without the index column.
submission[['healthy', 'multiple_diseases', 'rust', 'scab']] = preds
submission.to_csv('sampleSubmission.csv', index=False)