In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import torch
import random
import numpy as np
import os

seed = 50
os.environ['PYTHONHASHSEED']=str(seed)
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.enabled = False

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [4]:
import pandas as pd

data_path = '/home/baebro/nipa_ws/plant_pathology/data/'

train = pd.read_csv(data_path + 'train.csv')
test = pd.read_csv(data_path + 'test.csv')
submission = pd.read_csv(data_path + 'sample_submission.csv')

In [5]:
from sklearn.model_selection import train_test_split

train, valid = train_test_split(train,
                               test_size=0.1,
                                stratify=train[['healthy', 'multiple_diseases', 'rust', 'scab']],
                               random_state=50)

In [6]:
import cv2
from torch.utils.data import Dataset
import numpy as np

class ImageDataset(Dataset):
    def __init__(self, df, img_dir, transform=None, is_test=False):
        super().__init__()
        self.df = df
        self.img_dir = img_dir
        self.transform = transform
        self.is_test = is_test
    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        img_id = self.df.iloc[idx, 0]
        img_path = self.img_dir + img_id + '.jpg'
        image = cv2.imread(img_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        if self.transform is not None:
            image = self.transform(image=image)['image']
        if self.is_test:
            return image
        else:
            label = np.argmax(self.df.iloc[idx, 1:5])
            return image, label

In [7]:
import albumentations as A
from albumentations.pytorch import ToTensorV2

transform_train = A.Compose([
    A.Resize(450, 650),
    A.RandomBrightnessContrast(brightness_limit=0.2,contrast_limit=0.2, p=0.3),
    A.VerticalFlip(p=0.2),
    A.HorizontalFlip(p=0.5),
    A.ShiftScaleRotate(
        shift_limit=0.1,
        scale_limit=0.2,
        rotate_limit=30,
        p=0.3
    ),
    A.OneOf([A.Emboss(p=1),
            A.Sharpen(p=1),
            A.Blur(p=1)], p=0.3),
    A.PiecewiseAffine(p=0.3),
    A.Normalize(),
    ToTensorV2()
])

  from .autonotebook import tqdm as notebook_tqdm
  original_init(self, **validated_kwargs)


In [8]:
transform_test = A.Compose([
    A.Resize(450, 650),
    A.Normalize(),
    ToTensorV2()
])

In [9]:
img_dir = '/home/baebro/nipa_ws/plant_pathology/data/images/'

dataset_train = ImageDataset(train, img_dir=img_dir, transform=transform_train)
dataset_valid = ImageDataset(valid, img_dir=img_dir, transform=transform_test)

In [10]:
def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

g = torch.Generator()
g.manual_seed(0)

<torch._C.Generator at 0x7f87661d6050>

In [11]:
from torch.utils.data import DataLoader

batch_size = 2

loader_train = DataLoader(dataset_train, batch_size=batch_size,
                         shuffle=True, worker_init_fn=seed_worker,
                         generator=g, num_workers=2)
loader_valid = DataLoader(dataset_valid, batch_size=batch_size,
                         shuffle=False, worker_init_fn=seed_worker,
                         generator=g, num_workers=2)

In [12]:
# !pip install efficientnet-pytorch==0.7.1

In [13]:
from efficientnet_pytorch import EfficientNet

# pretrained num_classes 1000 -> transfer learning num_classes 4

# method 1 : define num_classes
model = EfficientNet.from_pretrained('efficientnet-b7', num_classes=4)

model.to(device)

# method 2 : modify fc layer output
"""
model = EfficientNet.from_pretrained('efficientnet-b7')
model._fc = nn.Sequential(
    nn.Linear(model._fc.in_features, fc.out_features),
    nn.ReLU(),
    nn.Dropout(p=0.5),
    nn.Linear(model._fc.out_features=4)
)
"""

Loaded pretrained weights for efficientnet-b7


"\nmodel = EfficientNet.from_pretrained('efficientnet-b7')\nmodel._fc = nn.Sequential(\n    nn.Linear(model._fc.in_features, fc.out_features),\n    nn.ReLU(),\n    nn.Dropout(p=0.5),\n    nn.Linear(model._fc.out_features=4)\n)\n"

In [14]:
import torch.nn as nn

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=0.00006, weight_decay=0.0001)

In [15]:
from transformers import get_cosine_schedule_with_warmup
epochs = 39

scheduler = get_cosine_schedule_with_warmup(optimizer,
                                           num_warmup_steps=len(loader_train)*3,
                                           num_training_steps=len(loader_train)*epochs)

In [16]:
from sklearn.metrics import roc_auc_score
# from tqdm.notebook import tqdm

for epoch in range(epochs):
    
    model.train()
    epoch_train_loss = 0
    
    for images, labels in loader_train:

        images = images.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)

        loss = criterion(outputs, labels)
        epoch_train_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()

    print(f'epoch [{epoch+1}/{epochs}] train_loss : {epoch_train_loss/len(loader_train):.4f}')
    
    # validation loop
    model.eval()
    epoch_valid_loss = 0
    pred_lists = []
    true_onehot_list = []
        
    with torch.no_grad():
        for images, labels in loader_valid:
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)
            epoch_valid_loss += loss.item()

            preds = torch.softmax(outputs.cpu(), dim=1).numpy()
            true_onehot = torch.eye(4, device=device)[labels].cpu().numpy() 
            
            pred_lists.extend(preds)
            true_onehot_list.extend(true_onehot)

    print(f'epoch [{epoch+1}/{epochs}] epoch_valid_loss : {epoch_valid_loss/len(loader_valid):.4f} / ROC AUC : {roc_auc_score(true_onehot_list, pred_lists):.4f}')

epoch [1/39] train_loss : 1.2810
epoch [1/39] epoch_valid_loss : 0.8565 / ROC AUC : 0.9158
epoch [2/39] train_loss : 0.7274
epoch [2/39] epoch_valid_loss : 0.2891 / ROC AUC : 0.9441
epoch [3/39] train_loss : 0.4096
epoch [3/39] epoch_valid_loss : 0.3950 / ROC AUC : 0.9307
epoch [4/39] train_loss : 0.2878
epoch [4/39] epoch_valid_loss : 0.2162 / ROC AUC : 0.9623
epoch [5/39] train_loss : 0.2107
epoch [5/39] epoch_valid_loss : 0.2138 / ROC AUC : 0.9566
epoch [6/39] train_loss : 0.1727
epoch [6/39] epoch_valid_loss : 0.2667 / ROC AUC : 0.9528
epoch [7/39] train_loss : 0.1667
epoch [7/39] epoch_valid_loss : 0.2147 / ROC AUC : 0.9833
epoch [8/39] train_loss : 0.1324
epoch [8/39] epoch_valid_loss : 0.2522 / ROC AUC : 0.9670
epoch [9/39] train_loss : 0.0979
epoch [9/39] epoch_valid_loss : 0.1828 / ROC AUC : 0.9744
epoch [10/39] train_loss : 0.0997
epoch [10/39] epoch_valid_loss : 0.2937 / ROC AUC : 0.9452
epoch [11/39] train_loss : 0.1025
epoch [11/39] epoch_valid_loss : 0.4356 / ROC AUC : 0.

**TTA setting**

In [17]:
# original test dataset & dataloader

dataset_test = ImageDataset(test, img_dir=img_dir,
                           transform=transform_test, is_test=True)
loader_test = DataLoader(dataset_test, batch_size=batch_size,
                        shuffle=False, worker_init_fn=seed_worker,
                        generator=g, num_workers=2)


# TTA test dataset & dataloader
dataset_TTA = ImageDataset(test, img_dir=img_dir,
                           transform=transform_train, is_test=True)
loader_TTA = DataLoader(dataset_TTA, batch_size=batch_size,
                        shuffle=False, worker_init_fn=seed_worker,
                        generator=g, num_workers=2)

evaluation with Normal_test_dataset

In [18]:
model.eval()
preds_test = np.zeros((len(test), 4))

with torch.no_grad():
    for i, images in enumerate(loader_test):
        images = images.to(device)
        outputs = model(images)
        
        # check = torch.softmax(outputs.cpu(), dim=1)
        preds_part = torch.softmax(outputs.cpu(), dim=1).squeeze().numpy()
        preds_test[i*batch_size:(i+1)*batch_size] += preds_part


In [19]:
submission_test = submission.copy()
submission_test[['healthy', 'mutiple_diseases', 'rust', 'scab']] = preds_test

evaluation with TTA test dataset

In [20]:
num_TTA = 3

preds_tta = np.zeros((len(test), 4))

for i in range(num_TTA):
    with torch.no_grad():
        for i, images in enumerate(loader_TTA):
            images = images.to(device)
            outputs = model(images)

            preds_part = torch.softmax(outputs.cpu(), dim=1).squeeze().numpy()
            preds_tta[i*batch_size:(i+1)*batch_size] +=preds_part

In [31]:
preds_tta /= num_TTA

submission_tta = submission.copy()
submission_tta[['healthy','multiple_diseases', 'rust', 'scab']] = preds_tta


In [32]:
submission_tta.head()

Unnamed: 0,image_id,healthy,multiple_diseases,rust,scab
0,Test_0,2.036825e-07,8.513529e-05,0.03695158,1.183408e-07
1,Test_1,2.175818e-06,8.728873e-05,0.03694642,1.151385e-06
2,Test_2,1.183294e-06,3.049645e-05,9.292826e-08,0.03700527
3,Test_3,0.037037,3.962494e-09,2.826887e-08,1.045393e-08
4,Test_4,3.447594e-06,5.525732e-05,0.03697741,9.194543e-07


In [33]:
output_path = '/home/baebro/nipa_ws/plant_pathology/output/'

submission_test.to_csv(output_path + 'submission_test.csv', index=False)
submission_tta.to_csv(output_path + 'submission_tta.csv', index=False)

label smoothing

In [42]:
def apply_label_smoothing(df, target, alpha, threshold):
    df_target = df[target].copy()
    k = len(target)

    for idx, row in df_target.iterrows():
        if (row>threshold).any():
            row = (1-alpha)*row + alpha/k
            df_target.iloc[idx] = row
    return df_target

In [43]:
alpha = 0.001
threshold = 0.999

submission_test_ls = submission_test.copy()
submission_tta_ls = submission_tta.copy()

target = ['healthy','multiple_diseases', 'rust', 'scab']

submission_test_ls[target] = apply_label_smoothing(submission_test_ls, target, alpha, threshold)
submission_tta_ls[target] = apply_label_smoothing(submission_tta_ls, target, alpha, threshold)

submission_test_ls.to_csv(output_path + 'submission_test_ls.csv', index=False)
submission_tta_ls.to_csv(output_path + 'submission_tta_ls.csv', index=False)

In [44]:
submission_tta.head()

Unnamed: 0,image_id,healthy,multiple_diseases,rust,scab
0,Test_0,2.036825e-07,8.513529e-05,0.03695158,1.183408e-07
1,Test_1,2.175818e-06,8.728873e-05,0.03694642,1.151385e-06
2,Test_2,1.183294e-06,3.049645e-05,9.292826e-08,0.03700527
3,Test_3,0.037037,3.962494e-09,2.826887e-08,1.045393e-08
4,Test_4,3.447594e-06,5.525732e-05,0.03697741,9.194543e-07


In [45]:
submission_tta_ls.head()

Unnamed: 0,image_id,healthy,multiple_diseases,rust,scab
0,Test_0,2.036825e-07,8.513529e-05,0.03695158,1.183408e-07
1,Test_1,2.175818e-06,8.728873e-05,0.03694642,1.151385e-06
2,Test_2,1.183294e-06,3.049645e-05,9.292826e-08,0.03700527
3,Test_3,0.037037,3.962494e-09,2.826887e-08,1.045393e-08
4,Test_4,3.447594e-06,5.525732e-05,0.03697741,9.194543e-07


In [46]:
submission_test.head()

Unnamed: 0,image_id,healthy,multiple_diseases,rust,scab,mutiple_diseases
0,Test_0,9e-06,0.25,0.998174,4.824771e-06,0.001812098
1,Test_1,4.6e-05,0.25,0.9971003,1.85742e-05,0.002835458
2,Test_2,5.2e-05,0.25,3.791062e-06,0.9992241,0.0007202495
3,Test_3,0.999999,0.25,4.642465e-07,1.606875e-07,6.438761e-08
4,Test_4,0.000112,0.25,0.9976721,2.785815e-05,0.002187973


In [47]:
submission_test_ls.head()

Unnamed: 0,image_id,healthy,multiple_diseases,rust,scab,mutiple_diseases
0,Test_0,9e-06,0.25,0.998174,5e-06,0.001812098
1,Test_1,4.6e-05,0.25,0.9971,1.9e-05,0.002835458
2,Test_2,0.000302,0.25,0.000254,0.998475,0.0007202495
3,Test_3,0.999249,0.25,0.00025,0.00025,6.438761e-08
4,Test_4,0.000112,0.25,0.997672,2.8e-05,0.002187973
