# Imports

In [None]:
import sys
sys.path.append('../input/d/kozodoi/timm-pytorch-image-models/pytorch-image-models-master')
import timm

In [None]:
import os
import numpy as np
import pytorch_lightning as pl
import torch
import pandas as pd
import torch.nn as nn

from PIL import Image
from sklearn.model_selection import KFold
from torchvision import transforms as tsfm
from torch.utils.data import Dataset, DataLoader
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.loggers import CSVLogger
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.metrics import Metric
from typing import List, Dict
import albumentations as A
from albumentations.pytorch import ToTensorV2

import matplotlib.pyplot as plt
from scipy.optimize import minimize

from glob import glob

# Config

In [None]:
class CFG:
    """Static configuration for the inference notebook: paths, model and loader settings."""

    # data config
    # Original competition data and a separately uploaded, pre-resized copy of the training images.
    root_dir_origin = "../input/plant-pathology-2021-fgvc8"
    root_dir_resized = "../input/resized-plantpathology2021fgvc8-train-data/resized_plant-pathology-2021-fgvc8_train_data"

    train_csv_path = os.path.join(root_dir_origin, 'train.csv')
    folds_csv_path = "../input/pp2021-kfold-tfrecords-0/folds.csv"
#     folds_csv_path = "../input/pp2021-dataset-gnueih/6folds_pp2021.csv"

    train_imgs_dir = os.path.join(root_dir_resized, 'resized_train_images_360_512')
    test_imgs_dir = os.path.join(root_dir_origin, 'test_images')

    # Output columns of the network, in this order. 'healthy' is not a column:
    # the submission cell emits it when no class is predicted.
    num_classes = 5
    labels = np.array(['powdery_mildew',
                     'scab',
                     'complex',
                     'frog_eye_leaf_spot',
                     'rust',])

    # model config
    model_name = 'tf_efficientnet_b4_ns'

    # glob() yields paths in arbitrary, filesystem-dependent order; sorting keeps the
    # ensemble order (and therefore models[i]) stable from one run to the next.
    model_paths = sorted(glob('../input/pp2021-models/ef4_ns_v18_5fold5/*'))
#     model_paths = [ '../input/final-pp2021-training/ckpt/tf_efficientnet_b4_ns_kag_final_v18/ftf_efficientnet_b4_ns_epoch=02-valid_f1=0.9091.ckpt',
#                     '../input/final-pp2021-training/ckpt/tf_efficientnet_b4_ns_kag_final_v18/ftf_efficientnet_b4_ns_epoch=02-valid_f1=0.9053.ckpt',
#                     '../input/final-pp2021-training/ckpt/tf_efficientnet_b4_ns_kag_final_v18/ftf_efficientnet_b4_ns_epoch=02-valid_f1=0.9160.ckpt',
#                     '../input/final-pp2021-training/ckpt/tf_efficientnet_b4_ns_kag_final_v18/ftf_efficientnet_b4_ns_epoch=02-valid_f1=0.9081.ckpt',
#                     '../input/final-pp2021-training/ckpt/tf_efficientnet_b4_ns_kag_final_v18/ftf_efficientnet_b4_ns_epoch=02-valid_f1=0.9086.ckpt',]
    # training hyper-parameters
    seed = 42
    batch_size = 32
    n_fold = 5
    num_workers = 4
    # Use the GPU when one is visible, otherwise fall back to the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Show the checkpoint paths that the model-loading cell will iterate over.
# CFG.model_paths.sort()
CFG.model_paths

# Helper functions

In [None]:
# Run inference with a single model.
def predict_one_model(model, dataloader, device, tta=1, valid=False):
    """Run ``model`` over every batch of ``dataloader`` and return its raw outputs.

    Args:
        model: Module to evaluate; it is switched to eval mode and moved to ``device``.
        dataloader: Iterable of batches.
        device: Device the model and each image batch are moved to.
        tta: Accepted for interface compatibility; not used by this function.
        valid: When True each batch is indexed and its first element is taken as
            the images; when False the batch itself is the image tensor.

    Returns:
        The per-batch model outputs concatenated along dim 0. No sigmoid is applied.
    """
    model.eval()
    model.to(device)

    collected = []
    with torch.no_grad():
        for batch in dataloader:
            images = batch[0] if valid else batch
            batch_out = model(images.to(device))
            collected.append(batch_out.detach())

    return torch.cat(collected, dim=0)

# Predict with several models, then average their predictions.
def predict_multi_model(models, dataloader, device, tta=1, valid=False):
    """Average the sigmoid probabilities of several models over one dataloader.

    Args:
        models: Iterable of modules to ensemble; must contain at least one model.
        dataloader: Iterable of batches, passed unchanged to ``predict_one_model``.
        device: Device every model is evaluated on.
        tta: Forwarded to ``predict_one_model``.
        valid: Forwarded to ``predict_one_model``; True when batches also carry
            targets after the images.

    Returns:
        Tensor of probabilities: the mean over models of ``sigmoid(model output)``.

    Raises:
        ValueError: If ``models`` is empty.
    """
    models = list(models)
    if not models:
        # Previously this fell through to `None / 0` and failed with an opaque TypeError.
        raise ValueError("predict_multi_model requires at least one model")

    preds = None
    for model in models:
        # Honour the caller's `device` instead of silently using the global CFG.device.
        pred = predict_one_model(model, dataloader, device, tta=tta, valid=valid)
        probs = torch.sigmoid(pred)
        if preds is None:
            preds = probs
        else:
            # Running sum keeps only one accumulator tensor alive.
            preds += probs
    return preds / len(models)

In [None]:
class ImageDataset(Dataset):
    """Leaf-disease image dataset backed by files in a single directory.

    Args:
        image_names: Sequence of file names relative to ``image_dir``.
        labels: Per-image targets indexable by position, or ``None`` for
            unlabeled data.
        image_dir: Directory that contains the image files.
        transforms: Callable invoked as ``transforms(image=array)`` that returns
            a mapping with the transformed image under the ``'image'`` key.
    """

    def __init__(self, image_names, labels, image_dir, transforms):
        self.image_names = image_names
        self.labels = labels
        self.image_dir = image_dir
        self.transforms = transforms

    def __len__(self) -> int:
        """Number of images in the dataset."""
        return len(self.image_names)

    def get_orig_img(self, idx: int):
        """Open the ``idx``-th image with PIL, without applying any transform."""
        image_path = os.path.join(self.image_dir, self.image_names[idx])
        return Image.open(image_path)

    def __getitem__(self, idx: int):
        """Return the transformed image, paired with its target when labels exist."""
        pixels = np.array(self.get_orig_img(idx))
        augmented = self.transforms(image=pixels)
        tensor = augmented['image']
        if self.labels is None:
            return tensor
        return tensor, self.labels[idx]

In [None]:
# Preprocessing with no random augmentation, used for validation and test images:
# resize to 360x512, normalise with the library's default statistics, then
# convert to a tensor.
valid_transform = A.Compose(
    [
        A.Resize(height=360, width=512, p=1.0),
        A.Normalize(),
        ToTensorV2(),
    ]
)

In [None]:
def get_val_loader(valid_df):
    """Build a non-shuffling DataLoader over the rows of ``valid_df``.

    Args:
        valid_df: DataFrame with an ``image`` column of file names and one column
            per entry of ``CFG.labels``.

    Returns:
        DataLoader yielding ``(image, target)`` batches read from
        ``CFG.train_imgs_dir`` and preprocessed with ``valid_transform``.
    """
    dataset = ImageDataset(
        image_names=valid_df.image.values,
        labels=valid_df[CFG.labels].values,
        image_dir=CFG.train_imgs_dir,
        transforms=valid_transform,
    )
    # shuffle=False keeps predictions aligned with the row order of valid_df.
    return DataLoader(
        dataset,
        batch_size=CFG.batch_size,
        num_workers=CFG.num_workers,
        shuffle=False,
        pin_memory=True,
    )

In [None]:
"""
Define F1 score metric
"""
class F1Score(Metric):
    """F1 score pooled over all samples and columns, with a derived "healthy" column.

    Predictions are binarised at ``threshold``. An extra column is appended to both
    predictions and targets that is 1 exactly when every other column of that row
    is 0. True positives, false positives and false negatives are summed over the
    whole batch and accumulated across ``update`` calls.

    Args:
        threshold: Cut-off applied to the (optionally sigmoid-ed) predictions; a
            scalar, or anything that broadcasts against ``preds`` in ``preds > threshold``.
        dist_sync_on_step: Forwarded to the ``Metric`` base class.
    """

    def __init__(self, threshold: float = 0.5, dist_sync_on_step=False):
        super().__init__(dist_sync_on_step=dist_sync_on_step)
        self.threshold = threshold
        self.add_state("tp", default=torch.tensor(0), dist_reduce_fx="sum")
        self.add_state("fp", default=torch.tensor(0), dist_reduce_fx="sum")
        self.add_state("fn", default=torch.tensor(0), dist_reduce_fx="sum")

    def update(self, preds: torch.Tensor, target: torch.Tensor, sigmoid=True):
        """Accumulate tp/fp/fn counts for one batch.

        Args:
            preds: Model outputs with the same shape as ``target``.
            target: Binary ground truth (0/1) of any integer, bool or float dtype.
            sigmoid: Apply a sigmoid to ``preds`` before thresholding; pass False
                when ``preds`` are already probabilities.
        """
        assert preds.shape == target.shape
        with torch.no_grad():
            if sigmoid:
                preds = torch.sigmoid(preds)
            preds = (preds > self.threshold).type(torch.long)
            # The counters are long tensors; with float targets the sums below become
            # Python floats and the in-place `+=` on a long tensor raises a dtype-cast
            # error. Casting once keeps every count integral.
            target = target.type(torch.long)

            # A row is "healthy" when none of its label columns is set.
            target_healthy = 1 - torch.clip(target.sum(dim=-1, keepdim=True), 0, 1)
            pred_healthy = 1 - torch.clip(preds.sum(dim=-1, keepdim=True), 0, 1)
            preds = torch.cat([preds, pred_healthy], -1)
            target = torch.cat([target, target_healthy], -1)

            tp = (preds * target).sum()
            fp = preds.sum() - tp
            fn = ((1 - preds) * target).sum()

        self.tp += tp.item()
        self.fp += fp.item()
        self.fn += fn.item()

    def compute(self):
        """Return ``2*tp / (2*tp + fn + fp)`` from the accumulated counts.

        Returns 0 instead of NaN when nothing has been accumulated.
        """
        denominator = 2.0 * self.tp + self.fn + self.fp
        if denominator == 0:
            return torch.zeros_like(denominator)
        return 2.0 * self.tp / denominator

In [None]:
from sklearn.metrics import multilabel_confusion_matrix
import seaborn as sns

def plot_confusion_matrix(
    y_test, 
    y_pred_proba, 
    threshold, 
    label_names=CFG.labels
)-> None:
    """Plot one 2x2 confusion matrix per label as a grid of heatmaps.

    Args:
        y_test: Binary ground-truth indicator matrix, one column per label.
        y_pred_proba: Predicted scores with the same layout as ``y_test``.
        threshold: Cut-off used to binarise ``y_pred_proba``; a scalar or an array
            that broadcasts against it (e.g. one value per label).
        label_names: Panel titles, in the same order as the label columns.
    """
    y_pred = np.where(y_pred_proba > threshold, 1, 0)
    c_matrices = multilabel_confusion_matrix(y_test, y_pred)

    # Size the grid from the number of panels actually drawn instead of a fixed
    # 2x3 layout, so that more than six labels are not silently dropped by zip().
    n_panels = min(len(c_matrices), len(label_names))
    n_cols = 3
    n_rows = max(1, int(np.ceil(n_panels / n_cols)))

    cmap = plt.get_cmap('Blues')
    # squeeze=False always returns a 2-D array of axes, whatever the grid shape.
    fig, axes = plt.subplots(
        nrows=n_rows, ncols=n_cols, figsize=(5 * n_cols, 4 * n_rows), squeeze=False
    )
    flat_axes = axes.flatten()

    for cm, label, ax in zip(c_matrices, label_names, flat_axes):
        sns.heatmap(cm, annot=True, fmt='g', ax=ax, cmap=cmap)

        ax.set_xlabel('Predicted labels')
        ax.set_ylabel('True labels')
        ax.set_title(f'{label}')

    # Hide grid cells that have no matrix (e.g. the sixth cell for five labels).
    for ax in flat_axes[n_panels:]:
        ax.axis('off')

    plt.tight_layout()
    plt.show()

# Model

Load các model có path trong `CFG.model_paths`

In [None]:
# Build one network per checkpoint in CFG.model_paths and load its weights.
# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
models = []
# Weight keys in the checkpoint carry this prefix, which must be removed before
# loading into the bare timm model — presumably the network was stored as the
# `model` attribute of a wrapper module; verify against the training code.
prefix = 'model.'
for path in CFG.model_paths:
    print(path)
    pretrain = torch.load(path, map_location=CFG.device)
    # Match the prefix exactly: the old `'model' in k` substring test also accepted
    # keys that merely contain "model" and then cut six arbitrary characters off them.
    state_dict = {
        k[len(prefix):]: v
        for k, v in pretrain['state_dict'].items()
        if k.startswith(prefix)
    }
    model = timm.create_model(CFG.model_name, pretrained=False, num_classes=CFG.num_classes)
    model.load_state_dict(state_dict)
    models.append(model)
print(len(models))

# Threshold configuration
Một số phương pháp tìm threshold cho các class

In [None]:
def one_hot_encoded_df(dataset_df):
    """Return a copy of ``dataset_df`` with one 0/1 column per class name.

    The ``labels`` column holds whitespace-separated class names. Each distinct
    class name becomes a new integer column that is 1 on the rows whose ``labels``
    string contains it.

    Args:
        dataset_df: DataFrame with a string ``labels`` column. It is not modified.

    Returns:
        A new DataFrame: the original columns followed by the class columns in
        alphabetical order.
    """
    # Work on a copy so that the caller's frame is left untouched.
    encoded_df = dataset_df.copy()
    unique_labels = encoded_df['labels'].unique()
    # sorted() rather than a bare set() so that column order is reproducible.
    class_names = sorted(set(' '.join(unique_labels).split()))
    # Initialise every class column with zero.
    encoded_df = encoded_df.assign(**{name: 0 for name in class_names})
    # Set the columns named by each distinct label string on all rows carrying it.
    for label_string in unique_labels:
        row_mask = encoded_df['labels'] == label_string
        encoded_df.loc[row_mask, label_string.split()] = 1
    return encoded_df

# Read the fold assignment, then attach the one-hot encoded training labels to it
# by joining on the image file name.
folds_df = pd.read_csv(CFG.folds_csv_path)
df = folds_df.merge(
    one_hot_encoded_df(pd.read_csv(CFG.train_csv_path)),
    on='image',
)
df.head()

### Predict trên validation dataset

In [None]:
# pred_df = df.drop([*CFG.labels, 'healthy', 'labels'], axis=1)
# pred_df[CFG.labels] = 0
# pred_df.head()

#### Out of fold predictions

In [None]:
# for fold_num in range(0, 5):
# #     fold_num = 5
#     print(fold_num)
#     valid_df = df[df.fold == fold_num].reset_index()
#     valid_loader = get_val_loader(valid_df)
#     valid_pred = predict_one_model(models[fold_num], valid_loader, CFG.device, valid=True)
#     pred_df.loc[pred_df.fold == fold_num, CFG.labels] = torch.sigmoid(valid_pred).cpu().numpy()    

#### Sử dụng scipy optimizer tìm threshold

In [None]:
# def my_metric(thresholds):
#     torch_metric = F1Score()
#     torch_metric.threshold = torch.tensor(thresholds)
#     return 1 - torch_metric(torch.tensor(pred_df[pred_df.fold == 4][CFG.labels].values), 
#                       torch.tensor(df[df.fold == 4][CFG.labels].values, 
#                                    dtype=torch.long), 
#                                    False).cpu().numpy()
# opt_thresh = minimize(my_metric, np.array([0.22 for i in range(CFG.num_classes)]),method='POWELL', bounds=[(0.2, 0.7) for i in range(CFG.num_classes)])
# print(opt_thresh)

#### Plot Confusion matrix

In [None]:
# y_true, y_pred_proba = df[CFG.labels].values, pred_df[CFG.labels].values
# plot_confusion_matrix(y_true, y_pred_proba, threshold=opt_thresh.x)

### Một cách khác tìm threshold

In [None]:
# import tensorflow_addons as tfa
# y_true, y_pred = df[CFG.labels].values, pred_df[CFG.labels].values
# thresholds = np.arange(.01, 1., .01)
# scores = []

# for threshold in thresholds:
#     m = tfa.metrics.F1Score(
#         num_classes=5, 
#         average=None, 
#         threshold=threshold)
#     m.update_state(y_true, y_pred)
#     scores.append(m.result().numpy())
    
# pdf = pd.DataFrame(columns=CFG.labels, data=scores, index=pd.Index(thresholds, name='threshold'))

In [None]:
# pdf.head()

In [None]:
# thresholds2 = []#lưu threshold mà class có giá trị lớn nhất
# scores = []#lưu giá trị f1 lớn nhất của classs

# for x in CFG.labels:
#     thresholds2.append(pdf[x].idxmax())#tìm threshold mà x có giá trị lớn nhất
#     scores.append(pdf[x].max())#tìm scores có giá tị lớn nhất
#     print(f'{x}: {pdf.loc[.5, x]:.4f} >>> {pdf.loc[thresholds2[-1], x]:.4f} ({thresholds2[-1]:.2f})')#lấy threshold 0.5 để so sánh giữa khách quan nhất với lớn nhất
# # print(df.loc[0.39])
# # print(df.complex.sort_values())
# # df.loc[thresholds[-1]]
# print(f'\nmean score: {pdf.loc[.5].mean():.4f} >>> {np.mean(scores):.4f}')

In [None]:
# plot_confusion_matrix(y_true, y_pred_proba, threshold=np.array(thresholds2))

### Một cách khác nữa

In [None]:
# thresholds = np.linspace(0.2, 0.7, 31)
# scores = [1 - my_metric(np.ones(5)*t) for t in thresholds]

# threshold_best_index = np.argmax(scores) 
# score_best = scores[threshold_best_index]
# threshold_best = thresholds[threshold_best_index]

# plt.plot(thresholds, scores)
# plt.plot(threshold_best, score_best, "xr", label="Best threshold")
# plt.xlabel("Threshold")
# plt.ylabel("IoU")
# plt.title("Threshold vs IoU ({}, {})".format(threshold_best, score_best))
# plt.legend()

In [None]:
# plot_confusion_matrix(y_true, y_pred_proba, threshold=np.ones((5,))*threshold_best)

In [None]:
# plot_confusion_matrix(y_true, y_pred_proba, threshold=np.ones((5,))*0.4333)

## Pred test

In [None]:

# Unlabeled dataset over every file in the test directory.
# os.listdir returns entries in arbitrary order; sorting makes the order of the
# submission rows (and the preview below) reproducible from one run to the next.
test_dataset = ImageDataset(image_names=sorted(os.listdir(CFG.test_imgs_dir)), 
                            labels=None, 
                            image_dir=CFG.test_imgs_dir, 
                            transforms=valid_transform,)
# shuffle=False keeps predictions aligned with test_dataset.image_names.
test_loader = DataLoader(
    test_dataset,
    batch_size=CFG.batch_size,
    num_workers=CFG.num_workers,
    shuffle=False,
    pin_memory=True,
)

In [None]:
# Preview the first three test images, untransformed, side by side.
figure, axes = plt.subplots(nrows=1, ncols=3, figsize=(20, 10))

for position, axis in enumerate(axes):
    axis.imshow(test_dataset.get_orig_img(position))
    axis.axis('off')

plt.show()

In [None]:
# Ensemble prediction over the test set.
# NOTE: despite its name, `logits` holds probabilities: predict_multi_model applies
# a sigmoid to each model's output and returns the mean over the models.
logits = predict_multi_model(models, test_loader, CFG.device, valid=False)

## Create submission.csv

In [None]:
# thresholds = torch.round(torch.FloatTensor([0.49692652, 0.47886734, 0.45990339, 0.50891365, 0.50]) * 10000) / 10000
# thresholds = torch.round(torch.FloatTensor([0.47, 0.47886734, 0.45990339, 0.50891365, 0.50]) * 10000) / 10000

# print(thresholds)
# tensor([0.5398, 0.5112, 0.5599, 0.5087, 0.5776])
# powdery_mildew: 0.9641 >>> 0.9670 (0.54)
# scab: 0.9118 >>> 0.9131 (0.53)
# complex: 0.7404 >>> 0.7459 (0.56)
# frog_eye_leaf_spot: 0.8815 >>> 0.8834 (0.56)
# rust: 0.9297 >>> 0.9350 (0.58)

# mean score: 0.8855 >>> 0.8889

In [None]:
# Binarise the ensemble scores with a single cut-off shared by all classes
# (a per-class alternative would be: logits > thresholds.to(CFG.device)).
predicts = (logits > 0.4333).type(torch.bool).cpu().numpy()

# For every image, join the names of the classes whose score passed the cut-off.
labels = [' '.join(CFG.labels[row_mask]) for row_mask in predicts]

# An image with no predicted class is reported as 'healthy'.
labels = [x if (x != '' and 'healthy' not in x) else 'healthy' for x in labels]

sdf = pd.DataFrame(
    {
        'image': test_dataset.image_names,
        'labels': labels,
    }
)

sdf.to_csv('submission.csv', index=False)
display(sdf.head())