In [None]:
pip install pretrainedmodels

# Plant Pathology
- Dataset: https://arxiv.org/abs/2004.11958
- Approach from: https://isaac-flath.github.io/fastblog/deep%20learning/2021/02/15/PlantPathology.html (author: [Isaac Flath](https://github.com/Isaac-Flath))
- General Hints: https://twitter.com/abhi1thakur/status/1360954451104829441
- Alternative Approach: https://hamonk.github.io/2020/12/05/plant_pathology.html#what-worked (not implemented here)
- Evaluation hint: report accuracy together with its variance, see https://www.youtube.com/watch?v=0LIACHcxpHU

In [None]:
pip install torchsummary

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# `os.walk` yields (directory, sub-directories, file names); the sub-directory list is not needed here.
# Every file below /kaggle/input is printed with its full path.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Recreate ./data from scratch (an existing directory is removed first).
!rm -rf data
!mkdir data

# Copy the competition images and CSV files into ./data.
# A writable copy is needed because a later cell re-saves some images in place.
!cp -r /kaggle/input/plant-pathology-2020-fgvc7/images data
!cp /kaggle/input/plant-pathology-2020-fgvc7/train.csv data
!cp /kaggle/input/plant-pathology-2020-fgvc7/test.csv data
!cp /kaggle/input/plant-pathology-2020-fgvc7/sample_submission.csv data

# List the contents of ./data to confirm the copy.
!ls data

In [None]:
# Copy resume files into the working directory.
# NOTE(review): presumably these are artefacts written by earlier sessions of this notebook
# (distilled labels, fold indices, accumulated predictions/targets) - confirm their provenance.
!cp /kaggle/input/plant-pathology-resume-files/distilled_labels.csv .
# Disabled: the corresponding files without the `_sl` suffix (only used by the commented-out first training stage).
#!cp /kaggle/input/plant-pathology-resume-files/fold_idx.pkl .
#!cp /kaggle/input/plant-pathology-resume-files/pred2.pkl .
#!cp /kaggle/input/plant-pathology-resume-files/true2.pkl .
# Files loaded by the soft-label training stage further down.
!cp /kaggle/input/plant-pathology-resume-files/fold_idx_sl.pkl .
!cp /kaggle/input/plant-pathology-resume-files/pred_sl2.pkl .
!cp /kaggle/input/plant-pathology-resume-files/true_sl2.pkl .

# List the current working directory to confirm the copy.
!ls

In [None]:
import platform
import fastai
from fastai.vision.all import *
import torch
import torchvision
from torchsummary import summary
import pretrainedmodels
import albumentations
from albumentations import (
    Compose,GaussianBlur,HorizontalFlip,MedianBlur,MotionBlur,OneOf,
    RandomBrightness,RandomContrast,Resize,ShiftScaleRotate,VerticalFlip
)
import sklearn
from sklearn.model_selection import StratifiedKFold
import os
import cv2
import pandas as pd
import pickle

# Print the versions of the interpreter and the main libraries used in this notebook.
print('python version:           {}'.format(platform.python_version()))
print('fastai version:           {}'.format(fastai.__version__))
print('torch version:            {}'.format(torch.__version__))
print('torchvision version:      {}'.format(torchvision.__version__))
print('pretrainedmodels version: {}'.format(pretrainedmodels.__version__))
print('albumentations version:   {}'.format(albumentations.__version__))
print('sklearn version:          {}'.format(sklearn.__version__))
print('opencv version:           {}'.format(cv2.__version__))
print('pandas version:           {}'.format(pd.__version__))

# Pick the GPU when one is available; `device` is used later when summarising the model.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu") 
print('CUDA available:           {}'.format(use_cuda))
print('cuDNN enabled:            {}'.format(torch.backends.cudnn.enabled))
print('num gpus:                 {}'.format(torch.cuda.device_count()))

if use_cuda:
    # Name of the first visible GPU.
    print('gpu:                      {}'.format(torch.cuda.get_device_name(0)))

    print()
    print('------------------------- CUDA -------------------------')
    # Shell escape: print the version of the CUDA compiler.
    ! nvcc --version

In [None]:
seed = 42
# fastai's `set_seed` seeds Python's `random`, NumPy and PyTorch (CPU and CUDA) in one call.
# Seeding only torch and numpy left Python's `random` module unseeded, which random
# augmentations may draw from, so runs were not repeatable.
set_seed(seed)

## Data

In [None]:
# Working copy of the competition data and its image sub-directory.
data_dir = Path('./data')
img_dir = data_dir/'images'
# Names of the class columns; `load_data` derives the `label` column from them.
labels = ['healthy', 'multiple_diseases', 'rust', 'scab']
# [height, width] handed to albumentations' `Resize` in the augmentation pipelines below.
image_size = [480, 768]

### Load Data

In [None]:
def load_data(path):
    """Read a competition CSV and prepare it for the dataloaders.

    Args:
        path: Location of the CSV file; handed unchanged to `pd.read_csv`.

    Returns:
        The loaded DataFrame in which `image_id` has been rewritten to
        `data/images/<id>.jpg` and a new `label` column holds, for every row,
        the name of the column from the module-level `labels` list that has
        the largest value.
    """
    frame = pd.read_csv(path)
    frame['image_id'] = 'data/images/' + frame['image_id'] + '.jpg'
    # Column-wise arg-max over the class columns gives the class name per row.
    frame['label'] = frame[labels].idxmax(axis=1)
    return frame

In [None]:
train_df = load_data(data_dir/'train.csv')
print('dataset size: {}'.format(len(train_df.index)))
train_df.sample(n=5)

### EDA

In [None]:
train_df[labels].sum().plot(kind='bar');

In [None]:
train_df['label'].value_counts()

### Clean Data

In [None]:
# Print the path and shape of every training image whose shape differs from (1365, 2048).
# NOTE(review): `.shape` is presumably the (height, width) property fastai adds to PIL images - confirm.
for img in train_df.image_id:
    img_loaded= Image.open(img)
    if img_loaded.shape == (1365, 2048): continue
    print(img,img_loaded.shape)

In [None]:
img = Image.open(img_dir/'Train_245.jpg'); img.size

In [None]:
img = Image.open(img_dir/'Train_1156.jpg'); img.size

In [None]:
# Transpose every training image whose shape is not (1365, 2048) and overwrite the file in place.
# NOTE(review): any off-size image is transposed, whatever its actual size; presumably the only
# outliers are the rotated (2048, 1365) images reported above - confirm. Re-running this cell after
# a successful run is a no-op only if all images then have the expected shape.
for img in train_df.image_id:
    img_loaded= Image.open(img)
    if img_loaded.shape == (1365, 2048): continue
    img_loaded.transpose(Image.TRANSPOSE).save(img)

In [None]:
img = Image.open(img_dir/'Train_245.jpg'); img.size

In [None]:
img = Image.open(img_dir/'Train_1156.jpg'); img.size

### Define Transforms

In [None]:
class AlbumentationsTransform(RandTransform):
    "Run one `Albumentation` pipeline on training items and another one on all other items"
    split_idx,order=None,2

    def __init__(self, train_aug, valid_aug):
        # keeps both pipelines as `self.train_aug` / `self.valid_aug`
        store_attr()

    def before_call(self, b, split_idx):
        # remember which subset the upcoming item belongs to (0 selects the training pipeline)
        self.idx = split_idx

    def encodes(self, img: PILImage):
        # choose the pipeline for the current subset, run it on the image as a numpy array
        pipeline = self.train_aug if self.idx == 0 else self.valid_aug
        augmented = pipeline(image=np.array(img))
        return PILImage.create(augmented['image'])

In [None]:
def get_train_aug(image_size):
    """Build the albumentations pipeline used for training images.

    Args:
        image_size: Sequence whose first entry is the target height and whose
            second entry is the target width of the initial resize.

    Returns:
        A `Compose` applying, in order: resize, one of brightness/contrast
        jitter, one of three blurs (group probability 0.5), vertical flip,
        horizontal flip and a shift/scale/rotate that is always applied.
    """
    target_h, target_w = image_size[0], image_size[1]

    resize = Resize(height=target_h, width=target_w)
    # photometric jitter: brightness or contrast (group uses OneOf's default probability)
    photometric = OneOf([RandomBrightness(limit=0.1, p=1), RandomContrast(limit=0.1, p=1)])
    # one of three small-kernel blurs, applied half of the time
    blur = OneOf(
        [MotionBlur(blur_limit=3), MedianBlur(blur_limit=3), GaussianBlur(blur_limit=3)],
        p=0.5,
    )
    flip_v = VerticalFlip(p=0.5)
    flip_h = HorizontalFlip(p=0.5)
    # geometric jitter, always applied; borders are filled by reflection
    geometric = ShiftScaleRotate(
        shift_limit=0.2,
        scale_limit=0.2,
        rotate_limit=20,
        interpolation=cv2.INTER_LINEAR,
        border_mode=cv2.BORDER_REFLECT_101,
        p=1,
    )

    return Compose([resize, photometric, blur, flip_v, flip_h, geometric])

def get_valid_aug(image_size):
    """Build the albumentations pipeline used for non-training images.

    Args:
        image_size: Sequence whose first entry is the target height and whose
            second entry is the target width.

    Returns:
        A `Compose` containing only the resize to `image_size`.
    """
    resize_only = [Resize(height=image_size[0], width=image_size[1])]
    return Compose(resize_only)

In [None]:
# Per-item transform: the albumentations pipelines defined above (training vs. non-training).
item_tfms = [AlbumentationsTransform(get_train_aug(image_size), get_valid_aug(image_size))]
# Per-batch transform: normalisation with `imagenet_stats`.
batch_tfms = [Normalize.from_stats(*imagenet_stats)]

### Create Dataloader

In [None]:
# Dataloaders over the whole training frame (batch size 16, split seeded with 2020).
# NOTE(review): `label_col=5` selects the column at position 5; presumably that is the `label`
# column appended by `load_data` - confirm against the column count of train.csv.
dls = ImageDataLoaders.from_df(
    train_df, bs=16, seed=2020, item_tfms=item_tfms, batch_tfms=batch_tfms, label_col=5
)

In [None]:
dls.show_batch(max_n=16)

## Model

In [None]:
def l2_norm(input, axis=1, eps=1e-12):
    """L2-normalise `input` along `axis`.

    Args:
        input: Tensor to normalise.
        axis: Dimension along which the Euclidean norm is computed (the
            dimension is kept so the division broadcasts).
        eps: Lower bound applied to the norm. It prevents a 0/0 division, so an
            all-zero slice yields zeros instead of NaN.

    Returns:
        Tensor with the shape of `input` whose slices along `axis` have unit
        L2 norm; all-zero slices stay zero.
    """
    norm = torch.norm(input, 2, axis, True).clamp_min(eps)
    output = torch.div(input, norm)
    return output


class BinaryHead(nn.Module):
    """Linear classification head applied to L2-normalised embeddings.

    Args:
        num_class: Number of output logits.
        emb_size: Size of the incoming embedding vector.
        s: Factor the logits are multiplied by.
    """

    def __init__(self, num_class=4, emb_size=2048, s=16.0):
        super().__init__()
        self.s = s
        # The linear layer stays wrapped in nn.Sequential, so its parameters are named `fc.0.*`.
        classifier = nn.Linear(emb_size, num_class)
        self.fc = nn.Sequential(classifier)

    def forward(self, fea):
        """Return the scaled logits for the embeddings `fea`."""
        normalised = l2_norm(fea)
        return self.fc(normalised) * self.s


class se_resnext50_32x4d(nn.Module):
    """SE-ResNeXt50 (32x4d) feature extractor followed by BatchNorm and a 4-logit `BinaryHead`.

    The ImageNet-pretrained network from `pretrainedmodels` is truncated by dropping its last two
    child modules. Its output is average-pooled to 1x1, flattened, passed through
    `BatchNorm1d(2048)` and finally through `BinaryHead`.
    """
    def __init__(self):
        super(se_resnext50_32x4d, self).__init__()

        # Pretrained network without its final two children.
        # NOTE(review): presumably these are the original pooling and classifier layers - confirm.
        self.model_ft = nn.Sequential(
            *list(pretrainedmodels.__dict__["se_resnext50_32x4d"](num_classes=1000, pretrained="imagenet").children())[
                :-2
            ]
        )
        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
        # NOTE(review): `model_ft` is the nn.Sequential built above, so this only sets an attribute
        # called `last_linear` on it; looks like a leftover - confirm it can be removed.
        self.model_ft.last_linear = None
        self.fea_bn = nn.BatchNorm1d(2048)
        # The BatchNorm bias is excluded from training.
        self.fea_bn.bias.requires_grad_(False)
        # s=1: the head's logits are not rescaled.
        self.binary_head = BinaryHead(4, emb_size=2048, s=1)
        # Defined but not applied: the dropout call in `forward` is commented out.
        self.dropout = nn.Dropout(p=0.2)

    def forward(self, x):
        """Return the 4 class logits for the image batch `x`."""

        img_feature = self.model_ft(x)
        img_feature = self.avg_pool(img_feature)
        # flatten the pooled 1x1 feature map to (batch, features)
        img_feature = img_feature.view(img_feature.size(0), -1)
        fea = self.fea_bn(img_feature)
        # fea = self.dropout(fea)
        output = self.binary_head(fea)

        return output

In [None]:
model = se_resnext50_32x4d()

In [None]:
model

In [None]:
# Layer-by-layer summary for a 3x224x224 input. This also moves `model` to `device`.
# Note that the dataloaders resize images to `image_size` (480x768), not 224x224.
summary(model.to(device), (3, 224, 224))

## Train

### Loss Function

In [None]:
class CrossEntropyLossOneHot(nn.Module):
    """Cross-entropy against one-hot or soft (probability-vector) targets.

    Per sample the loss is `sum(-labels * log_softmax(preds))` over the last
    dimension; the per-sample values are then averaged.
    """

    def __init__(self):
        super().__init__()
        self.log_softmax = nn.LogSoftmax(dim=-1)

    def forward(self, preds, labels):
        """Return the mean cross-entropy of the logits `preds` w.r.t. the targets `labels`."""
        log_probs = self.log_softmax(preds)
        per_sample = (-labels * log_probs).sum(dim=-1)
        return per_sample.mean()

### One-Hot-Label Callback

In [None]:
class OneHotLabelCB(Callback):
    """Replace every batch's targets with the label columns stored in the dataloader's items.

    Before each training or validation pass the items frame of the active
    dataloader is cached. Before each batch the rows belonging to the batch
    are looked up by image path, and all of their columns except the last one
    are installed as `learn.yb`, so the loss sees one-hot/soft label vectors.
    """

    def _cache_items(self):
        "Cache the active dataloader's image list and an `image_id`-indexed copy of its items"
        items = self.dl.items
        self.imgs_list = L(o for o in items.iloc[:,0].values)  # first column holds the image paths
        self.df = items.set_index('image_id')

    # Training and validation need the same preparation; share it instead of duplicating it.
    def before_train(self): self._cache_items()

    def before_validate(self): self._cache_items()

    def before_batch(self):
        df = self.df
        bs = self.dl.bs
        start = self.iter * bs
        # item indices of this batch, in the order the dataloader drew them for the current pass
        batch_idxs = self.dl._DataLoader__idxs[start:start + bs]
        imgs = self.imgs_list[batch_idxs]
        # every column except the last one is treated as part of the label vector
        one_hot_yb = df.loc[imgs, df.columns[:-1]].values
        # Put the targets on the device of the inputs instead of hard-coding `.cuda()`,
        # so the callback also works when training or predicting on CPU.
        self.learn.yb = (Tensor(one_hot_yb).to(self.xb[0].device),)

### Metric

In [None]:
def accuracy(inp, targ, axis=-1):
    "Fraction of samples where the arg-max over `axis` of `inp` equals the arg-max over `axis` of `targ`"
    predicted_cls = inp.argmax(dim=axis)
    target_cls = targ.argmax(dim=axis)
    predicted_cls, target_cls = flatten_check(predicted_cls, target_cls)
    hits = (predicted_cls == target_cls).float()
    return hits.mean()

### Inference

In [None]:
def test_predict(cnt,msg):
    """Predict the test set with the current `learn` and write `submission{cnt}.csv`.

    Relies on the notebook-level `learn` (the trained Learner) and `load_data`.

    Args:
        cnt: Value inserted into the output file name `submission{cnt}.csv`.
        msg: Not used by this function; kept so existing calls keep working.
    """
    sample_csv = 'data/sample_submission.csv'

    # Create the test dataloader from the learner's own dataloaders, so the transforms
    # and the vocab used below always come from the same object.
    test = load_data(sample_csv)
    test_dl = learn.dls.test_dl(test)

    # predict with test time augmentation, then convert the outputs to probabilities
    preds, _ = learn.tta(dl=test_dl)
    p = preds.softmax(dim=1)

    # format submission file: the original (un-prefixed) image ids next to the class probabilities;
    # the tensor is converted to numpy explicitly instead of relying on pandas to interpret it
    image_ids = pd.read_csv(sample_csv)['image_id']
    probs = pd.DataFrame(p.detach().cpu().numpy(), columns=learn.dls.vocab)
    out_a = pd.concat([image_ids, probs], axis=1)[['image_id','healthy','multiple_diseases','rust','scab']]

    # write the submission file
    out_a.to_csv(f'submission{cnt}.csv',index=False)

Stratified K-Fold: https://towardsdatascience.com/stratified-k-fold-what-it-is-how-to-use-it-cf3d107d3ea2

In [None]:
#skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
#fold_idx = list(skf.split(train_df.image_id,train_df.label))
#
#with open('fold_idx.pkl', 'wb') as f:
#    pickle.dump(fold_idx, f)

In [None]:
#with open('fold_idx.pkl', 'rb') as f:
#    fold_idx = pickle.load(f)
#    
#fold_idx = fold_idx[3:]; len(fold_idx)

In [None]:
#true = pd.DataFrame(columns = L(o for o in train_df.columns))
#pred = pd.DataFrame(columns = L(o for o in train_df.columns))

In [None]:
#with open('true2.pkl', 'rb') as f:
#    true = pickle.load(f)
#    
#with open('pred2.pkl', 'rb') as f:
#    pred = pickle.load(f)

In [None]:
#splits, preds, targs, preds_c,  = [],[],[],[]
##i = 0
#i = 3
#
#for _, val_idx in fold_idx:
#    splitter = IndexSplitter(val_idx)
#
#    # Create dataloaders splittin on indexes defined by StratifiedKFold
#    db = DataBlock(
#        blocks=(ImageBlock,CategoryBlock),
#        get_x=ColReader(0),get_y=ColReader(5),
#        item_tfms=item_tfms,batch_tfms=batch_tfms,
#        splitter=splitter
#    )
#    dls = db.dataloaders(train_df, bs=24)
#
#    #train model with fastai dataloaders, pytorch model, pytorch loss function, fastai gradient clipping, custom callback, on fp16 precision 
#    learn = Learner(dls,se_resnext50_32x4d(),loss_func=CrossEntropyLossOneHot(),cbs=[GradientClip,OneHotLabelCB()], metrics=[accuracy]).to_fp16()
#    learn.fine_tune(80,reset_opt=True) # Train freeze epoch then unfreeze for 80 epochs   
#
#    p, _ = learn.tta() # test time augmentation
#    p = p.softmax(axis=1) # Convert to probabilities
#
#    # Format dataframe to save
#    items_pred = pd.DataFrame(p, columns=dls.vocab)
#    items_pred['label'] = [dls.vocab[int(o)] for o in p.argmax(dim=1)]
#    items_pred['image_id'] = dls.valid.items.image_id.values
#    items_pred = items_pred[train_df.columns]
#    
#    true = pd.concat([true,dls.valid.items])
#    with open(f'true{i}.pkl', 'wb') as f:
#        pickle.dump(true, f)
#    
#    pred = pd.concat([pred,items_pred])
#    with open(f'pred{i}.pkl', 'wb') as f:
#        pickle.dump(pred, f)
#    
#    # predict and submit to kaggle
#    test_predict(i,f'distilling labels fold count {i}') 
#    i += 1

In [None]:
#pred.to_csv('distilled_labels.csv',index=False)

## Train (Soft Labeling)

In [None]:
# Reload the training labels and sort by image_id, so the rows can be matched
# position by position with the distilled labels loaded in the next cell.
train_df = load_data(data_dir/'train.csv')
train_df = train_df.sort_values('image_id')

In [None]:
distilled_labels = pd.read_csv('distilled_labels.csv')
distilled_labels = distilled_labels.sort_values('image_id')

# Get one hot encoded labels (zeros and ones) from the predicted class.
# `reindex` guarantees one column per class in the order of `labels`, even when a class was
# never predicted; `astype(float)` gives numeric 0.0/1.0 whatever dtype `get_dummies` returns;
# `.values` together with name-based assignment avoids any dependence on column positions.
distilled_labels[labels] = (
    pd.get_dummies(distilled_labels.label)
    .reindex(columns=labels, fill_value=0)
    .astype(float)
    .values
)

In [None]:
# Both frames were sorted by image_id; make sure they describe the same images in the same
# order before blending them row by row.
assert len(train_df) == len(distilled_labels)
assert (train_df.image_id.values==distilled_labels.image_id.values).all()
distilled_labels = distilled_labels.reset_index(drop=True)
train_df = train_df.reset_index(drop=True)

# get soft labels: 70% original label + 30% distilled one-hot label.
# The class columns are replaced by name, so the result does not depend on column positions
# or on how `.iloc` assignment would cast float values into the existing columns.
# Run this cell once per reload of `train_df`; running it again blends the labels a second time.
train_df[labels] = distilled_labels[labels] * .3 + train_df[labels] * .7
train_df.loc[train_df.healthy == 0.3][:5]

In [None]:
train_df.head()

### Train (5 Folds)

In [None]:
#skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
#fold_idx = list(skf.split(train_df.image_id,train_df.label))
#
#with open('fold_idx_sl.pkl', 'wb') as f:
#    pickle.dump(fold_idx, f)

In [None]:
# Load the stored cross-validation split indices.
# Security note: unpickling executes code from the file, so only load pickles you trust.
with open('fold_idx_sl.pkl', 'rb') as f:
    fold_idx = pickle.load(f)
    
# Resume: drop the first three folds and keep the rest; the training loop below starts its
# fold counter at 3 to match. The cell displays the number of remaining folds.
fold_idx = fold_idx[3:]; len(fold_idx)

In [None]:
#true = pd.DataFrame(columns = L(o for o in train_df.columns))
#pred = pd.DataFrame(columns = L(o for o in train_df.columns))

In [None]:
# Restore the frames the training loop appends to.
# NOTE(review): presumably these hold the validation items/predictions accumulated up to fold 2
# of an earlier session (the loop writes `true_sl{i}.pkl` / `pred_sl{i}.pkl`) - confirm.
# Security note: unpickling executes code from the file, so only load pickles you trust.
with open('true_sl2.pkl', 'rb') as f:
    true = pickle.load(f)
    
with open('pred_sl2.pkl', 'rb') as f:
    pred = pickle.load(f)

In [None]:
# NOTE(review): these four lists are not used anywhere in this cell.
splits, preds, targs, preds_c,  = [],[],[],[]
# Fold counter used in the output file names. It starts at 3 because `fold_idx` was cut to
# `fold_idx[3:]` above; use 0 together with the full `fold_idx` for a fresh run.
#i = 0
i = 3

for _, val_idx in fold_idx:
    # Only the validation indices of the fold are used; all other rows form the training set.
    splitter = IndexSplitter(val_idx)

    # Create dataloaders, splitting on the indexes stored in `fold_idx`.
    # Column 0 is read as the image path and column 5 as the category.
    db = DataBlock(
        blocks=(ImageBlock,CategoryBlock),
        get_x=ColReader(0),get_y=ColReader(5),
        item_tfms=item_tfms,batch_tfms=batch_tfms,
        splitter=splitter
    )
    dls = db.dataloaders(train_df, bs=24)

    # Train a fresh model with the fastai dataloaders, the custom loss, gradient clipping and the
    # callback that swaps in the label columns of `train_df` as targets, using fp16 precision.
    learn = Learner(dls,se_resnext50_32x4d(),loss_func=CrossEntropyLossOneHot(),cbs=[GradientClip,OneHotLabelCB()], metrics=[accuracy]).to_fp16()
    learn.fine_tune(80,reset_opt=True) # frozen phase first (fine_tune default), then 80 unfrozen epochs

    p, _ = learn.tta() # test time augmentation; no dataloader is passed here
    p = p.softmax(axis=1) # Convert to probabilities

    # Format the predictions as a frame with the same columns as `train_df`.
    items_pred = pd.DataFrame(p, columns=dls.vocab)
    items_pred['label'] = [dls.vocab[int(o)] for o in p.argmax(dim=1)]
    items_pred['image_id'] = dls.valid.items.image_id.values
    items_pred = items_pred[train_df.columns]
    
    # Append this fold's validation items and checkpoint the accumulated frame.
    true = pd.concat([true,dls.valid.items])
    with open(f'true_sl{i}.pkl', 'wb') as f:
        pickle.dump(true, f)
    
    # Append this fold's validation predictions and checkpoint the accumulated frame.
    pred = pd.concat([pred,items_pred])
    with open(f'pred_sl{i}.pkl', 'wb') as f:
        pickle.dump(pred, f)
    
    # Predict the test set with this fold's model and write `submission{i}.csv`
    # (`test_predict` only writes the file and ignores its second argument).
    test_predict(i,f'distilling labels fold count {i}') 
    i += 1

In [None]:
# Image ids of the test set, in the order of sample_submission.csv.
test = pd.read_csv('data/sample_submission.csv')['image_id']
# NOTE(review): `pred` is accumulated in the fold loop from each fold's validation items, so it
# appears to hold out-of-fold predictions for *training* images, and it already carries its own
# `image_id` column. Confirm that combining it with the test ids is intended. `out` is not used
# by any later cell visible in this notebook.
out = pd.concat([test,pred],axis=1)[['image_id','healthy','multiple_diseases','rust','scab']]

In [None]:
# NOTE(review): despite the file name, `pred` appears to contain the out-of-fold predictions for
# the training images (see the fold loop), including the `label` column - confirm before using
# this file as a submission.
pred.to_csv('submission_final.csv',index=False)

In [None]:
pred

In [None]:
test.shape

In [None]:
pred2 = pred.copy()

In [None]:
# NOTE(review): replaces the image ids stored in the copy of `pred` with the test-set ids.
# pandas matches an assigned Series by index label, and `pred` was built by concatenating
# per-fold frames - verify that rows and ids are paired as intended.
pred2['image_id'] = test

In [None]:
pred2[['image_id','healthy','multiple_diseases','rust','scab']].to_csv('submission_final_real.csv',index=False)

In [None]:
distilled_labels = pd.read_csv('distilled_labels.csv'); distilled_labels

In [None]:
# NOTE(review): replaces the image ids of the distilled labels with the test-set ids.
# The distilled labels appear to be predictions for training images (they are blended with
# `train_df` above) - confirm this relabelling is intended.
distilled_labels['image_id'] = test

In [None]:
distilled_labels[['image_id','healthy','multiple_diseases','rust','scab']].to_csv('submission_distilled_real.csv',index=False)