# NOTE!

Making this notebook public just to receive some feedback on my approach. I am facing some issues with the validation f1_score during training and after training.

## Import Packages

In [None]:
from typing import List, Dict
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import tqdm

import cv2
import albumentations as A
from albumentations.core.composition import Compose
from albumentations.pytorch import ToTensorV2

from torch.utils.data import Dataset, TensorDataset, DataLoader
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics

import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.metrics import FBeta

import torch
import torchvision.models as models
from torch import nn
from torch.optim import AdamW, Adam
import torch.nn.functional as F

## Directories

In [None]:
# Dataset root and the file / folder names that live underneath it.
ROOT_DIR = '../input/plant-pathology-2021-fgvc8/'
TRAIN_CSV = 'train.csv'
SAMPLE_SUBMISSION_CSV = 'sample_submission.csv'
TRAIN_IMAGES_FOLDER = 'train_images'
TEST_IMAGES_FOLDER = 'test_images'

# Checkpoints to evaluate. Although the file name contains "resnet18",
# the checkpoint is actually a resnet34.
CKPT_PATHS = [
    '../input/multi-label-classification-training-with-pl/'
    'lightning_logs/version_0/checkpoints/'
    'resnet18-foldnum-0_epoch=5_valid_loss_epoch=0.1809_f1_valid_epoch=0.7877.ckpt',
]

## Configurations

In [None]:
from pytorch_lightning import seed_everything

# Single seed handed to seed_everything so that the random number generators
# it covers (numpy, torch and python) all start from the same value.
RANDOM_SEED = 42
seed_everything(RANDOM_SEED)

In [None]:
# Settings looked up by key (via ``configurations.get``) elsewhere in the notebook.
configurations = dict(
    BATCH_SIZE=32,
    NUM_WORKERS=4,
    IMAGE_HEIGHT=334,
    IMAGE_WIDTH=334,
    LEARNING_RATE=0.003,
    MAX_EPOCHS=6,
    FOLD_NUM=0,
)

## Prepare Dataset

In [None]:
# Read the training CSV into a dataframe and preview its first rows.
train_csv_path = os.path.join(ROOT_DIR, TRAIN_CSV)
dataset_df = pd.read_csv(train_csv_path)
dataset_df.head()

In [None]:
def get_single_labels(unique_labels) -> List[str]:
    """Return the distinct class names found in whitespace-separated label strings.

    Args:
        unique_labels: Iterable of label strings. Each string may contain
            several class names separated by whitespace (e.g. ``"scab rust"``).

    Returns:
        The distinct class names, sorted alphabetically. An empty input
        yields an empty list.

    The result is sorted because callers use it as column names, and the
    column order becomes the order of the target vector. A plain
    ``list(set(...))`` has no guaranteed order, so the target layout could
    differ from one interpreter session to the next.
    """
    single_labels = set()
    for label in unique_labels:
        single_labels.update(label.split())

    return sorted(single_labels)


def get_one_hot_encoded_dataframe(dataset_df):
    """Return a copy of ``dataset_df`` with one 0/1 column per class name.

    The ``labels`` column holds whitespace-separated class names. A new column
    is added for every class name returned by ``get_single_labels``; a row gets
    1 in each column named in its ``labels`` value and 0 elsewhere. The input
    dataframe itself is not modified.
    """
    encoded_df = dataset_df.copy()
    distinct_label_strings = encoded_df.labels.unique()

    # Create every class column up front, filled with zeros.
    encoded_df[get_single_labels(distinct_label_strings)] = 0

    # Rows that share a label string are switched on together.
    for label_string in distinct_label_strings:
        matching_rows = encoded_df.index[encoded_df['labels'] == label_string]
        encoded_df.loc[matching_rows, label_string.split()] = 1

    return encoded_df

In [None]:
# Add one 0/1 column per class to a copy of the dataframe, then preview it.
dataset_df_copy = get_one_hot_encoded_dataframe(dataset_df)
dataset_df_copy.head()

## Prepare Dataset Class

In [None]:
class ImageDataset(Dataset):
    """Leaf Disease Dataset: loads images from disk and pairs them with targets.

    Args:
        image_names: File names of the images, relative to ``image_dir``.
        labels: One target per image, indexed in the same order as
            ``image_names``.
        image_dir: Directory that contains the image files.
        transforms: Callable invoked as ``transforms(image=...)``; the
            transformed image is read from the ``'image'`` key of its result.
    """
    def __init__(self,
                image_names: List[str],
                labels: List[List[int]],
                image_dir: str,
                transforms):
        self.image_names = image_names
        self.image_dir = image_dir
        self.transforms = transforms
        self.labels = labels

    def __len__(self) -> int:
        """Return the number of images in the dataset."""
        return len(self.image_names)

    def __getitem__(self, idx: int):
        """Load, convert and transform the image at position ``idx``.

        Returns:
            A dict with the keys ``'image_path'``, ``'image'`` (the transformed
            image) and ``'target'`` (``labels[idx]``).

        Raises:
            FileNotFoundError: If the image file cannot be read.
        """
        image_path = os.path.join(self.image_dir, self.image_names[idx])
        image = cv2.imread(image_path, cv2.IMREAD_COLOR)
        if image is None:
            # cv2.imread reports a missing/unreadable file by returning None
            # instead of raising; fail here with the offending path rather
            # than with an opaque error inside cvtColor.
            raise FileNotFoundError(f"Could not read image: {image_path}")
        # OpenCV loads images in BGR channel order; convert to RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        target = self.labels[idx]

        transformed_image = self.transforms(image=image)['image']
        sample = {'image_path': image_path, 'image': transformed_image, 'target': target}

        return sample

In [None]:
class ImageDataModule(pl.LightningDataModule):
    """LightningDataModule that serves one stratified fold of the image dataframe.

    Args:
        df: Dataframe with an ``image`` column (file names), a ``labels``
            column (used for stratification) and the target columns from the
            third column onward.
        train_transforms: Transform pipeline given to the training dataset.
        valid_transforms: Transform pipeline given to the validation dataset.
        image_dir: Directory that contains the image files.
        fold_num: Index (0-4) of the stratified split to use.
        configurations: Settings mapping; ``BATCH_SIZE`` and ``NUM_WORKERS``
            are read from it when the validation loader is built.
    """
    def __init__(self,
                 df: pd.DataFrame,
                 train_transforms,
                 valid_transforms,
                 image_dir: str,
                 fold_num: int,
                 configurations: Dict[str, int]):
        super().__init__()
        self.df = df
        self.train_transforms = train_transforms
        self.valid_transforms = valid_transforms
        self.image_dir = image_dir
        self.fold_num = fold_num
        # Keep the settings passed in, so the loaders use them rather than
        # whatever module-level variable happens to share the same name.
        self.configurations = configurations

    def setup(self, stage=None) -> None:
        """Split ``df`` into 5 stratified folds and build both datasets."""
        # NOTE(review): no random_state is given, so the shuffled split
        # depends on NumPy's global random state at call time. The fold only
        # matches the one used for training if that state is the same - confirm.
        folds = StratifiedKFold(n_splits=5, shuffle=True)

        train_indexes, valid_indexes = list(folds.split(self.df, self.df['labels']))[self.fold_num]

        print(f"Size of Train Dataset: {len(train_indexes)}")
        print(f"Size of Validation Dataset: {len(valid_indexes)}")

        train_df = self.df.iloc[train_indexes]
        valid_df = self.df.iloc[valid_indexes]

        # Every column from the third onward is used as the target vector.
        self.train_dataset = ImageDataset(image_names=train_df.image.values,
                                        labels=train_df.iloc[:, 2:].values,
                                        image_dir=self.image_dir,
                                        transforms=self.train_transforms,
                                        )

        self.valid_dataset = ImageDataset(image_names=valid_df.image.values,
                                        labels=valid_df.iloc[:, 2:].values,
                                        image_dir=self.image_dir,
                                        transforms=self.valid_transforms,
                                        )

    def train_dataloader(self):
        """No training loader is provided by this module; returns None."""
        return None

    def val_dataloader(self):
        """Return an unshuffled DataLoader over the validation dataset."""
        valid_loader = DataLoader(
            self.valid_dataset,
            batch_size=self.configurations.get("BATCH_SIZE"),
            num_workers=self.configurations.get("NUM_WORKERS"),
            shuffle=False,
        )
        return valid_loader

    def test_dataloader(self):
        """No test loader is provided by this module; returns None."""
        return None

## Augmentation

In [None]:
def _build_resize_normalize_pipeline():
    """Build a fresh Compose: Resize to the configured size, Normalize, ToTensorV2."""
    return A.Compose([
        A.Resize(
            height=configurations.get("IMAGE_HEIGHT"),
            width=configurations.get("IMAGE_WIDTH"),
            p=1.0,
        ),
        A.Normalize(),
        ToTensorV2(),
    ])


# Both splits use the same steps; each one gets its own Compose instance.
train_augs = _build_resize_normalize_pipeline()

valid_augs = _build_resize_normalize_pipeline()

## DataModule Object Creation

In [None]:
# Images are read from the training-image folder under the dataset root.
train_image_dir = os.path.join(ROOT_DIR, TRAIN_IMAGES_FOLDER)

data_module = ImageDataModule(
    df=dataset_df_copy,
    image_dir=train_image_dir,
    fold_num=configurations.get("FOLD_NUM"),
    train_transforms=train_augs,
    valid_transforms=valid_augs,
    configurations=configurations,
)

# Create the train / validation datasets for the configured fold.
data_module.setup()

## Preparing Model

In [None]:
class ClassifierModule(pl.LightningModule):
    """ResNet-34 with a replaced final layer, wrapped as a LightningModule.

    ``forward`` returns sigmoid outputs with one value per class. The loss and
    the training / validation step hooks are empty placeholders here.
    """

    def __init__(self, learning_rate=0.003, num_classes=6):
        super().__init__()
        self.metric = FBeta(num_classes=num_classes, beta=0.5, multilabel=True)
        self.learning_rate = learning_rate
        # Try different architectures
        backbone = models.resnet34(pretrained=False)
        # Swap the final fully connected layer for one sized to num_classes.
        backbone.fc = nn.Linear(in_features=backbone.fc.in_features, out_features=num_classes)
        self.model = backbone

    def forward(self, x):
        """Run the network on a 4-D batch and return sigmoid outputs per sample."""
        batch_size, _channels, _height, _width = x.shape
        probabilities = torch.sigmoid(self.model(x))

        return probabilities.reshape(batch_size, -1)

    def configure_optimizers(self):
        """Return an AdamW optimizer over the backbone's parameters."""
        return AdamW(self.model.parameters(), lr=self.learning_rate, weight_decay=0.001)

    def _get_loss(self, y_hat, y):
        """Placeholder: not implemented, returns None."""
        return None

    def training_step(self, batch, batch_idx):
        """Placeholder: not implemented, returns None."""
        return None

    def validation_step(self, batch, batch_idx):
        """Placeholder: not implemented, returns None."""
        return None

## Loading Trained Model

In [None]:
# load_from_checkpoint is a classmethod: it builds and RETURNS a new module
# holding the checkpoint weights and does not modify an instance it is called
# on. The returned module must be kept; otherwise evaluation runs on the
# randomly initialised network (resnet34 with pretrained=False) created by
# ClassifierModule().
lightning = ClassifierModule.load_from_checkpoint(checkpoint_path=CKPT_PATHS[0])
lightning.freeze()
lightning.cuda();

In [None]:
# Metric: F-beta over 6 classes in multilabel mode.
# NOTE(review): beta=0.5 makes this an F0.5 score (an F1 score corresponds to
# beta=1), yet the result is reported as "F1" later in the notebook - confirm
# which of the two is intended.
metric = FBeta(num_classes=6, beta=0.5, multilabel=True)

In [None]:
from tqdm import tqdm

# Per-batch accumulators filled by the validation pass below.
image_paths_batch, targets, predictions, f1_scores = [], [], [], []
batch_num = 1

for batch in tqdm(data_module.val_dataloader()):
    # Run the model on the GPU, then bring its outputs back to the CPU.
    pred = lightning(batch['image'].cuda()).detach().cpu()
    target = batch['target'].detach().cpu()

    # Each batch is scored on its own and the score is stored.
    f1_scores.append(metric(pred, target))

    # Keep paths, targets and predictions for later inspection.
    image_paths_batch.append(batch['image_path'])
    targets.append(target.numpy())
    predictions.append(pred.numpy())

    batch_num += 1

In [None]:
# Unweighted mean of the per-batch scores collected in f1_scores.
avg_f1_score = sum(f1_scores) / len(f1_scores)
print(f"AVG F1 Score: {avg_f1_score}")

In [None]:
# Class names in the same order as the target vectors. The targets are taken
# from the dataframe's columns from the third one onward, so reading the names
# from those same columns keeps name i aligned with target position i; a
# hard-coded list can silently disagree with the column order.
# NOTE(review): this must also be the class order the checkpoint was trained
# with - verify against the training notebook.
labels_str = np.array(dataset_df_copy.columns[2:], dtype=str)

In [None]:
predicted_labels, actual_labels, image_paths = [], [], []

# Walk the stored batches in lockstep and turn every prediction / target
# vector into a space-separated string of class names.
for batch_idx, (batch_preds, batch_targets, batch_paths) in enumerate(
        zip(predictions, targets, image_paths_batch)):
    # Raw vectors are printed for the first batch only.
    show_raw = batch_idx == 0

    for pred, target_row, path in zip(batch_preds, batch_targets, batch_paths):
        if show_raw:
            print(f"\033[1mPredicted Label: {pred}\033[0m")
            print(f"Target Label: {target_row}")

        # A class is predicted when its rounded output exceeds 0.5.
        predicted_labels.append(" ".join(labels_str[np.around(pred) > 0.5]))
        actual_labels.append(" ".join(labels_str[target_row == 1]))
        image_paths.append(path)

In [None]:
# One row per validation image: its path, its true labels and its predicted labels.
val_pred_columns = {
    "image_path": image_paths,
    "actual_label": actual_labels,
    "predicted_label": predicted_labels,
}
val_pred_df = pd.DataFrame(val_pred_columns)
val_pred_df.head()

In [None]:
# Count how many validation images received each predicted label string.
val_pred_df.predicted_label.value_counts()