In [None]:
from typing import List, Dict

import random
import os

import numpy as np
import pandas as pd
import PIL

import albumentations as A
from albumentations.pytorch import ToTensorV2
import matplotlib.pyplot as plt
import seaborn as sns
import torchvision
import torch.onnx
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import transforms as T

import skimage.io as io
from tqdm.notebook import tqdm

In [None]:
# Run on the first CUDA GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [None]:
def to_numpy(tensor):
    """Return the contents of `tensor` as a NumPy array in host memory.

    Tensors that track gradients are detached first, because `.numpy()`
    cannot be called on a tensor that requires grad.
    """
    host_ready = tensor.detach() if tensor.requires_grad else tensor
    return host_ready.cpu().numpy()

In [None]:
def read_image_labels():
    """Load the competition's train.csv and index it by the `image` column."""
    csv_path = '../input/plant-pathology-2021-fgvc8/train.csv'
    labels_table = pd.read_csv(csv_path)
    return labels_table.set_index('image')

In [None]:
# Shuffle every row once with a fixed seed so the row order is reproducible.
img_labels = read_image_labels().sample(frac=1.0, random_state=42)

img_labels.head()

In [None]:
def get_image_infos(img_labels):
    """Summarise how often each value of the `labels` column occurs.

    Returns a frame indexed by `disease` with a `count` column and a `%`
    column (the count's share of all rows, rounded to two decimals and
    scaled by 100), sorted by `count` in descending order.
    """
    total_rows = img_labels.shape[0]

    summary = (
        img_labels
        .reset_index()
        .groupby(by='labels')
        .count()
        .reset_index()
    )
    summary.columns = ['disease', 'count']

    share = summary['count'] / total_rows
    summary['%'] = np.round(share, 2) * 100

    return summary.set_index('disease').sort_values(by='count', ascending=False)

In [None]:
# Show the number of images per `labels` value and each value's share of the data set.
get_image_infos(img_labels)

In [None]:
# Locations of the data used in this notebook, keyed by the `kind` argument
# that get_image() and PlantDataset accept.  'train' and 'val' point at the
# same directory.
folders = {
    'data': '../input/plant-pathology-2021-fgvc8',
    'train': '../input/resized-plant2021/img_sz_256',
    'val': '../input/resized-plant2021/img_sz_256',
    'test': '../input/plant-pathology-2021-fgvc8/test_images',
    'submiss': '../input/plant-pathology-2021-fgvc8/sample_submission.csv',
}

def get_image(image_id, kind='train'):
    """Open an image file from one of the configured folders.

    Parameters
    ----------
    image_id : str
        File name of the image, relative to the folder selected by `kind`.
    kind : str
        Key into the module-level `folders` mapping (default 'train').

    Returns
    -------
    The object returned by `PIL.Image.open` for the joined path.

    Raises
    ------
    KeyError
        If `kind` is not a key of `folders`.
    """
    # `import PIL` alone does not load the `Image` submodule; importing it
    # here avoids depending on another package having imported it first.
    from PIL import Image

    fname = os.path.join(folders[kind], image_id)
    return Image.open(fname)

def plot_image_counts(img_labels):
    """Draw a bar chart of how many rows carry each value of `labels`.

    Bars are ordered from the most to the least frequent value.

    Parameters
    ----------
    img_labels : pandas.DataFrame
        Table with a `labels` column.
    """
    # The style has to be active before the figure is created, otherwise the
    # axes built by plt.subplots() do not pick it up.
    sns.set_style("whitegrid")
    fig, ax = plt.subplots(figsize=(18, 7))
    palette = sns.color_palette("Blues_r", 12)

    # Draw on the axes created above instead of relying on the implicit
    # "current axes" of the pyplot state machine.
    sns.countplot(
        x='labels',
        palette=palette,
        data=img_labels,
        order=img_labels['labels'].value_counts().index,
        ax=ax,
    )

    ax.set_ylabel("# of observations", size=20)
    ax.set_xlabel("Class names", size=20)
    ax.tick_params(axis='x', rotation=45)

    fig.tight_layout()
    plt.show()

In [None]:
# Visualise how many images fall under each value of the `labels` column.
plot_image_counts(img_labels)  

In [None]:
# Preview the first rows of the shuffled label table.
img_labels.head()

In [None]:
def get_single_labels(unique_labels) -> List[str]:
    """Return the distinct whitespace-separated tokens found in `unique_labels`.

    Every string is split on whitespace and duplicates are removed; the
    order of the returned list is not defined.
    """
    distinct_tokens = {
        token
        for label in unique_labels
        for token in label.split()
    }
    return list(distinct_tokens)

In [None]:
def get_one_hot_encoded_labels(dataset_df) -> pd.DataFrame:
    """Add one 0/1 indicator column per disease name to a copy of `dataset_df`.

    The `labels` column holds one or more disease names separated by
    whitespace (e.g. ``'scab frog_eye_leaf_spot'``).  For every row, each
    disease that appears in that string gets a 1 in the column of the same
    name; all other indicator columns are 0.

    The previous implementation only matched rows whose whole label string
    was equal to a single disease name, so multi-label rows were left
    all-zero.

    Parameters
    ----------
    dataset_df : pandas.DataFrame
        Table with a `labels` column.  It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of `dataset_df` with the six integer indicator columns added.
    """
    df = dataset_df.copy()

    column_names = [
        'rust',
        'complex',
        'healthy',
        'powdery_mildew',
        'scab',
        'frog_eye_leaf_spot',
    ]
    print(column_names)

    # Split every label string once; missing labels count as "no disease listed".
    label_sets = df['labels'].fillna('').astype(str).str.split().map(set)

    for name in column_names:
        # astype(int) keeps an integer dtype even when the frame is empty.
        df[name] = label_sets.map(lambda present, name=name: name in present).astype(int)

    return df


In [None]:
# Add the per-disease indicator columns to the training label table.
one_hot_encoded_labels = get_one_hot_encoded_labels(img_labels)
# Preview the encoded table.
one_hot_encoded_labels.head()

In [None]:
# Values shared by the training and validation pipelines.  The mean/std pair
# is the one commonly used with ImageNet-pretrained torchvision models.
_NORM_MEAN = (0.485, 0.456, 0.406)
_NORM_STD = (0.229, 0.224, 0.225)
_INPUT_SIDE = 224

# Training pipeline: random photometric/geometric augmentations, then resize,
# random small cut-outs, normalisation and conversion to a tensor.  The order
# of the steps is the order in which they are applied.
_train_steps = [
    A.Rotate(limit=(-68, 178), interpolation=1, border_mode=0, value=(0, 0, 0),
             mask_value=None, always_apply=False, p=0.1),
    A.RandomShadow(num_shadows_lower=1, num_shadows_upper=1, shadow_dimension=3,
                   shadow_roi=(0, 0.6, 1, 1), p=0.4),
    A.ShiftScaleRotate(shift_limit=0.05, scale_limit=0.05, rotate_limit=15, p=0.6),
    A.RandomFog(fog_coef_lower=0.2, fog_coef_upper=0.2, alpha_coef=0.2, p=0.3),
    A.RGBShift(r_shift_limit=15, g_shift_limit=15, b_shift_limit=15, p=0.3),
    A.RandomBrightnessContrast(p=0.3),
    A.GaussNoise(var_limit=(50, 70), always_apply=False, p=0.3),
    A.Resize(height=_INPUT_SIDE, width=_INPUT_SIDE),
    A.CoarseDropout(max_holes=5, max_height=5, max_width=5,
                    min_holes=3, min_height=5, min_width=5,
                    always_apply=False, p=0.2),
    A.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
    ToTensorV2(),
]
train_transform = A.Compose(_train_steps)

# Validation pipeline: deterministic resize + normalisation only.
_val_steps = [
    A.Resize(height=_INPUT_SIDE, width=_INPUT_SIDE),
    A.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
    ToTensorV2(),
]
val_transform = A.Compose(_val_steps)

In [None]:
from scipy.stats import bernoulli
from torch.utils.data import Dataset

class PlantDataset(Dataset):
    """Map-style dataset that pairs image files with their target vectors.

    Parameters
    ----------
    image_ids : pandas.Series
        Image file names; rows are addressed positionally with `.iloc`.
    targets : indexable
        Targets addressed with `targets[idx]`, aligned with `image_ids`.
    transform : callable, optional
        Called as `transform(image=img)`; the value under the `'image'` key
        of its result replaces the image (the albumentations convention).
    target_transform : callable, optional
        Applied to the target before it is returned.
    kind : str
        Forwarded to `get_image` to choose the folder the files are read from.
    """
    def __init__(self,
                 image_ids,
                 targets,
                 transform=None,
                 target_transform=None,
                 kind='train'):
        self.image_ids = image_ids
        self.targets = targets
        self.transform = transform
        self.target_transform = target_transform
        self.kind = kind

    def __len__(self):
        """Number of images in the dataset."""
        return len(self.image_ids)

    def __getitem__(self, idx):
        """Return the `(image, target)` pair at position `idx`."""
        # Force three RGB channels: a grayscale, palette or RGBA file would
        # otherwise produce an array whose channel count does not match the
        # three-channel normalisation used by the transforms in this notebook.
        pil_image = get_image(self.image_ids.iloc[idx], kind=self.kind).convert('RGB')
        img = np.array(pil_image)

        if self.transform:
            img = self.transform(image=img)['image']

        # get image target
        target = self.targets[idx]
        if self.target_transform:
            target = self.target_transform(target)

        return img, target

In [None]:
from sklearn.model_selection import train_test_split

# Image ids and the matching 0/1 target matrix, in the same row order
# (one_hot_encoded_labels was derived from img_labels).
_target_columns = [
    'rust',
    'complex',
    'healthy',
    'powdery_mildew',
    'scab',
    'frog_eye_leaf_spot',
]
_image_ids = pd.Series(img_labels.index)
_target_matrix = np.array(one_hot_encoded_labels[_target_columns])

# Hold out 30% of the images for validation, with a fixed seed.
X_train, X_vaild, y_train, y_vaild = train_test_split(
    _image_ids,
    _target_matrix,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Wrap each split in a PlantDataset; the training split uses the augmenting
# pipeline, the validation split the deterministic one.
train_set = PlantDataset(X_train, y_train, transform=train_transform, kind='train')
val_set = PlantDataset(X_vaild, y_vaild, transform=val_transform, kind='val')
# Preview the image ids that ended up in the training split.
X_train.head()

In [None]:
# Report how many images each split contains.
print(f'Train size: {len(train_set)}')
print(f'Validation size: {len(val_set)}')

In [None]:
from torch.utils.data import DataLoader
from torch.nn import BatchNorm2d

batch_size = 32

# Training batches are reshuffled every epoch.
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
# Validation is iterated in a fixed order: the metrics in this notebook are
# averaged per batch, so shuffling would make them vary from run to run
# without any benefit for evaluation.
valid_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False)

In [None]:
# Inspect the image ids of the training split.
X_train

In [None]:
def create_model(pretrained=True):
    """Build a ResNet-101 with a 6-output sigmoid head, placed on `device`.

    The first four child modules of the network have their parameters
    frozen (`requires_grad = False`).  The original `fc` layer is replaced
    by `Linear(in_features, 6)` followed by `Sigmoid`.

    Parameters
    ----------
    pretrained : bool
        Forwarded to `torchvision.models.resnet101`.
    """
    model = torchvision.models.resnet101(pretrained=pretrained).to(device)

    # Freeze children 1..4; everything from the fifth child on stays trainable.
    for position, child in enumerate(model.children(), start=1):
        if position >= 5:
            continue
        for param in child.parameters():
            param.requires_grad = False

    head = torch.nn.Sequential(
        torch.nn.Linear(in_features=model.fc.in_features, out_features=6),
        torch.nn.Sigmoid(),
    )
    model.fc = head.to(device)

    return model

In [None]:
# Build the network.  create_model() already moves it to `device`, so the
# trailing .to(device) does not change where the model lives.
model = create_model(pretrained=True).to(device)

In [None]:
class MetricMonitor:
    """Keeps running histories of loss, accuracy and F1 values in plain lists."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Discard everything recorded so far and start with empty histories."""
        self.losses, self.accuracies, self.scores = [], [], []
        # `metrics` refers to the same list objects as the attributes above,
        # so appending through either name is visible through the other.
        self.metrics = {
            'loss': self.losses,
            'acc': self.accuracies,
            'f1': self.scores,
        }

    def update(self, metric_name, value):
        """Append `value` to the history stored under `metric_name`.

        Raises KeyError when `metric_name` is not 'loss', 'acc' or 'f1'.
        """
        history = self.metrics[metric_name]
        history.append(value)

In [None]:
from sklearn.metrics import f1_score, accuracy_score

def get_metrics(
    y_pred_proba,
    y_test,
    threshold=0.25,
    labels=[
        'rust',
        'complex',
        'healthy',
        'powdery_mildew',
        'scab',
        'frog_eye_leaf_spot'
    ]) -> tuple:
    """Compute subset accuracy and micro-averaged F1 for multi-label predictions.

    Parameters
    ----------
    y_pred_proba : array-like
        Predicted scores; entries strictly greater than `threshold` count
        as positive.
    y_test : array-like
        Ground-truth indicators, same shape as `y_pred_proba`.
    threshold : float
        Decision threshold applied to `y_pred_proba`.
    labels : list of str
        Not used by the computation; kept so existing callers keep working.

    Returns
    -------
    (acc, f1) : tuple
        `accuracy_score` (a sample counts as correct only when every label
        matches) and `f1_score` with ``average='micro'``.
    """
    # The `np.float` alias no longer exists in current NumPy releases; the
    # builtin `float` is the exact type it used to stand for.
    y_pred = (np.asarray(y_pred_proba) > threshold).astype(float)
    y_true = np.asarray(y_test).round().astype(float)

    # scikit-learn's convention is (y_true, y_pred).  Both metrics used here
    # give the same value when the arguments are swapped, so this only makes
    # the call read correctly.
    f1 = f1_score(y_true, y_pred, average='micro')
    acc = accuracy_score(y_true, y_pred, normalize=True)

    return acc, f1

In [None]:
def training_loop(
    dataloader,
    model,
    loss_fn,
    optimizer,
    epoch,
    monitor=None,
    is_train=True,
    num_epochs=7,
) -> None:
    """Run one pass over `dataloader`, either training or evaluating `model`.

    Parameters
    ----------
    dataloader : iterable
        Yields `(X, y)` batches of tensors.
    model : torch.nn.Module
        Network to train or evaluate.
    loss_fn : callable
        Called as `loss_fn(model(X), y)`; must return a scalar tensor.
    optimizer : torch.optim.Optimizer
        Used only when `is_train` is true.
    epoch : int
        Current epoch number, shown in the progress bar.
    monitor : MetricMonitor, optional
        Receives the epoch's mean loss, accuracy and F1.  A new monitor is
        created when omitted (a `MetricMonitor()` default in the signature
        would be built once and silently shared by every call).
    is_train : bool
        True: train mode with back-propagation.  False: eval mode with
        gradient tracking disabled.
    num_epochs : int
        Total number of epochs, used only in the progress-bar text.

    Raises
    ------
    ValueError
        If `dataloader` yields no batches.

    Notes
    -----
    The recorded values are means over batches, so a smaller final batch
    carries the same weight as a full one.
    """
    if monitor is None:
        monitor = MetricMonitor()

    if is_train:
        model.train()
    else:
        model.eval()

    loss_val = 0.0
    accuracy = 0.0
    f1score = 0.0
    num_batches = 0
    phase = 'Train' if is_train else 'Val'

    stream = tqdm(dataloader)
    # Autograd bookkeeping is only needed when backward() will be called;
    # turning it off during validation saves memory and time.
    with torch.set_grad_enabled(is_train):
        for batch, (X, y) in enumerate(stream, start=1):
            X = X.to(device)
            y = y.to(device)

            # compute prediction and loss
            pred_prob = model(X)
            loss = loss_fn(pred_prob, y)

            if is_train:
                # backpropagation
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            loss_val += loss.item()
            acc, f1 = get_metrics(to_numpy(pred_prob), to_numpy(y))

            accuracy += acc
            f1score += f1
            num_batches = batch

            stream.set_description(
                f'Epoch {epoch:3d}/{num_epochs} - {phase} - Loss: {loss_val/batch:.4f}, ' +
                f'Acc: {accuracy/batch:.4f}, F1: {f1score/batch:.4f}'
            )

    if num_batches == 0:
        raise ValueError('dataloader yielded no batches; there is nothing to record')

    monitor.update('loss', loss_val / num_batches)
    monitor.update('acc', accuracy / num_batches)
    monitor.update('f1', f1score / num_batches)

In [None]:
# Separate metric histories for the training passes and the validation passes.
train_monitor = MetricMonitor()
test_monitor = MetricMonitor()

In [None]:
# initialize the loss function
# The network built by create_model() already ends in a Sigmoid layer, so its
# outputs are probabilities.  nn.MultiLabelSoftMarginLoss applies its own
# sigmoid to the input, which would squash those probabilities a second time.
# Binary cross-entropy on probabilities (nn.BCELoss) is the matching criterion.
_bce = nn.BCELoss()


def loss_fn(pred_prob, target):
    """Mean binary cross-entropy between predicted probabilities and 0/1 targets."""
    # BCELoss needs a floating-point target; the one-hot targets produced in
    # this notebook are integers, so cast them to the prediction's dtype.
    return _bce(pred_prob, target.to(pred_prob.dtype))


optimizer = torch.optim.Adam(
    model.parameters(),
    lr=0.0001
)

In [None]:
%%time

epoch = 7

for epoch in range(1, epoch + 1):
        
    if(epoch == 5):
        ct = 0
        for child in model.children():
            ct += 1
            if ct < 5:
                for param in child.parameters():
                    param.requires_grad = True
        
    
    # training loop
    training_loop(
        train_loader, 
        model, 
        loss_fn, 
        optimizer, 
        epoch, 
        train_monitor,
        is_train=True
    )
    
    # validation loop
    training_loop(
        valid_loader, 
        model, 
        loss_fn, 
        optimizer, 
        epoch, 
        test_monitor,
        is_train=False
    )

In [None]:
from matplotlib.ticker import MaxNLocator 

def plot_result(
    train_losses,
    test_losses,
    train_accuracies,
    test_accuracies,
    train_scores,
    test_scores
) -> None:
    """Plot training vs. validation curves for loss, accuracy and F1.

    Three panels are drawn side by side.  Each sequence is expected to hold
    one value per epoch; the x axis runs from 1 to `len(train_losses)`.
    """
    epochs = range(1, len(train_losses) + 1)
    fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(22, 5))

    # (panel title / y label, training values, training legend,
    #  validation values, validation legend)
    panels = [
        ('Loss', train_losses, 'Training loss', test_losses, 'Validation loss'),
        ('Accuracy', train_accuracies, 'Training Accuracy', test_accuracies, 'Validation accuracy'),
        ('F1-Score', train_scores, 'Training F1-Score', test_scores, 'Validation F1-Score'),
    ]

    for axis, (title, train_values, train_legend, val_values, val_legend) in zip(ax, panels):
        axis.plot(epochs, train_values, label=train_legend, marker='o')
        axis.plot(epochs, val_values, label=val_legend, marker='o')
        axis.legend(frameon=False, fontsize=14)

        # Epochs are whole numbers, so only integer ticks make sense.
        axis.get_xaxis().set_major_locator(MaxNLocator(integer=True))
        axis.set_title(title, fontsize=18)
        axis.set_xlabel('Epoch', fontsize=14)
        axis.set_ylabel(title, fontsize=14)

    plt.show()

In [None]:
# Learning curves collected by the two monitors during training.
plot_result(
    train_losses=train_monitor.losses,
    test_losses=test_monitor.losses,
    train_accuracies=train_monitor.accuracies,
    test_accuracies=test_monitor.accuracies,
    train_scores=train_monitor.scores,
    test_scores=test_monitor.scores,
)

In [None]:
# Persist the trained weights (the state_dict only, not the module object)
# to 'v5.pkl' in the current working directory.
torch.save(model.state_dict(), 'v5.pkl')

In [None]:
# Collect ground truth and predicted probabilities for the whole validation set.
# Changes from the earlier version of this cell:
#   * the removed NumPy alias `np.int` is no longer used,
#   * the model is put in eval mode and run without gradient tracking,
#   * batches are gathered in lists and joined once instead of re-stacking
#     the growing arrays on every iteration.
model.eval()

_true_batches = []
_proba_batches = []

with torch.no_grad():
    for X, y in tqdm(valid_loader):
        _proba_batches.append(to_numpy(model(X.to(device))))
        _true_batches.append(to_numpy(y))

# Fall back to empty (0, 6) arrays when the loader produced nothing.
y_true = np.concatenate(_true_batches) if _true_batches else np.empty(shape=(0, 6), dtype=int)
y_pred_proba = np.concatenate(_proba_batches) if _proba_batches else np.empty(shape=(0, 6), dtype=float)
    

In [None]:
from sklearn.metrics import multilabel_confusion_matrix

def plot_confusion_matrix(
    y_test,
    y_pred_proba,
    threshold=0.25,
    label_names=[
        'rust',
        'complex',
        'healthy',
        'powdery_mildew',
        'scab',
        'frog_eye_leaf_spot'
    ]
)-> None:
    """Draw one 2x2 confusion matrix per label on a 2x3 grid of heatmaps.

    Scores in `y_pred_proba` strictly above `threshold` count as positive
    predictions.  Panels are titled with `label_names`, in the order of the
    matrices returned by `multilabel_confusion_matrix`.
    """
    predicted = np.where(y_pred_proba > threshold, 1, 0)
    per_label_matrices = multilabel_confusion_matrix(y_test, predicted)

    blues = plt.get_cmap('Blues')
    fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 8))
    panels = axes.flatten()

    for matrix, title, panel in zip(per_label_matrices, label_names, panels):
        sns.heatmap(matrix, annot=True, fmt='g', ax=panel, cmap=blues)

        panel.set_xlabel('Predicted labels')
        panel.set_ylabel('True labels')
        panel.set_title(f'{title}')

    plt.tight_layout()
    plt.show()

In [None]:
# Per-label confusion matrices for the validation predictions collected above.
plot_confusion_matrix(y_true, y_pred_proba)    

In [None]:
# Binarise the validation probabilities at the same 0.25 threshold that
# get_metrics() uses by default; get_metrics() thresholds its input again,
# which leaves an already 0/1 array unchanged.
y_pred = np.where(y_pred_proba > 0.25, 1, 0)
accuracy, f1 = get_metrics(y_pred, y_true)

# Small summary table (the column was previously misspelled 'sorce').
pd.DataFrame({
    'name': ['F1', 'Accuracy'],
    'score': [f1, accuracy]
}).set_index('name')


In [None]:
# Read the sample submission through the path registered in `folders`
# instead of a hard-coded absolute '/kaggle/input/...' path, so this cell
# resolves the file the same way as the rest of the notebook.
submission_df = pd.read_csv(folders['submiss']).set_index('image')
submission_df.head()

In [None]:
# Encode the labels of the sample submission.
# NOTE: this rebinds `one_hot_encoded_labels`, which until here held the
# encoded *training* labels.
one_hot_encoded_labels = get_one_hot_encoded_labels(submission_df)
one_hot_encoded_labels.head()

In [None]:

# Test image ids plus the target matrix taken from the sample submission's
# labels, using the same column order as the training targets.
_test_target_columns = [
    'rust',
    'complex',
    'healthy',
    'powdery_mildew',
    'scab',
    'frog_eye_leaf_spot',
]
X_test = pd.Series(submission_df.index)
y_test = np.array(one_hot_encoded_labels[_test_target_columns])
print(len(y_test))


In [None]:
# Deterministic preprocessing for test images: resize, normalise, to tensor.
_test_steps = [
    A.Resize(height=224, width=224),
    A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    ToTensorV2(),
]
test_transform = A.Compose(_test_steps)

In [None]:
# Dataset over the test images; kind='test' makes get_image read from the
# folder registered under 'test'.
test_set = PlantDataset(X_test, y_test, transform=test_transform, kind='test')

batch_size = 32
# shuffle=False keeps predictions in the same order as the rows of X_test.
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False)
# Inspect the test image ids.
X_test

In [None]:
# Predict probabilities for every test image, in loader order.
# Changes from the earlier version of this cell:
#   * the removed NumPy alias `np.int` is no longer used,
#   * the model is put in eval mode and run without gradient tracking,
#   * batches are gathered in lists and joined once instead of re-stacking
#     the growing arrays on every iteration.
model.eval()

_true_batches = []
_proba_batches = []

with torch.no_grad():
    for X, y in tqdm(test_loader):
        _proba_batches.append(to_numpy(model(X.float().to(device))))
        _true_batches.append(to_numpy(y))

# Fall back to empty (0, 6) arrays when the loader produced nothing.
y_true = np.concatenate(_true_batches) if _true_batches else np.empty(shape=(0, 6), dtype=int)
y_pred_proba = np.concatenate(_proba_batches) if _proba_batches else np.empty(shape=(0, 6), dtype=float)

In [None]:
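# NOTE: the columns must follow the order of the training targets (rust, complex, healthy, powdery_mildew, scab, frog_eye_leaf_spot).
# y_pred_proba_df = pd.DataFrame(y_pred_proba, columns=['rust', 'complex', 'healthy', 'powdery_mildew', 'scab', 'frog_eye_leaf_spot'])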

In [None]:
# Inspect the predicted probabilities for the test images.
y_pred_proba

In [None]:
# For every test image keep the positions of all classes whose probability
# reaches 0.25; when none does, fall back to the single most likely class.
#
# Positions come from enumerate() rather than list.index(): index() returns
# the first position holding a given value, so two classes with the same
# probability would both be mapped to the first one.  y_pred_proba is also
# no longer rebound to a list, so the cell can be run again.
indices = []
for pred in np.asarray(y_pred_proba).tolist():
    selected = [position for position, proba in enumerate(pred) if proba >= 0.25]
    if not selected:
        selected = [int(np.argmax(pred))]
    indices.append(selected)

print(indices)

In [None]:
# Class names in the column order used for the training targets.
labels = [
    'rust',
    'complex',
    'healthy',
    'powdery_mildew',
    'scab',
    'frog_eye_leaf_spot',
]

# Turn each list of class positions into a space-separated label string.
testlabels = [
    ' '.join(str(labels[position]) for position in image)
    for image in indices
]

print(testlabels)

In [None]:
# Fill the sample submission's `labels` column with the predicted label
# strings and write it out without the row index.
sub = pd.read_csv('../input/plant-pathology-2021-fgvc8/sample_submission.csv')
sub = sub.assign(labels=testlabels)
sub.to_csv('submission.csv', index=False)