In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import glob
import os.path as osp

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import pickle

In [None]:
import torch
import torch.nn as nn
from torch.optim import Adam, SGD
from torch.utils.data import DataLoader, Dataset

from torchvision import datasets, models
from torchvision.utils import make_grid

In [None]:
import copy
import os
import time

import tqdm
from IPython.display import display
from PIL import Image

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Use the GPU when one is visible to PyTorch, otherwise run on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
num_classes = 12
pretrained = False
criterion = nn.CrossEntropyLoss()


class Config:
    """Named hyper-parameter presets.

    Every preset is a dict with the keys ``img_size``, ``batch_size``,
    ``epoch`` and ``optimizer`` (in that order); ``optimizer`` holds an
    optimizer *class*, not an instance.
    """

    ep1 = dict(img_size=224, batch_size=64, epoch=20, optimizer=Adam)
    ep2 = dict(img_size=224, batch_size=64, epoch=20, optimizer=SGD)


# Activate the ``ep2`` preset, reading each setting by its key.
img_size, batch_size, epoch, optimizer = (
    Config.ep2[key] for key in ("img_size", "batch_size", "epoch", "optimizer")
)

## **1. Load dataset**

In [None]:
df_train = pd.read_csv("/kaggle/input/plant-pathology-2021-fgvc8/train.csv")
df_sub = pd.read_csv("/kaggle/input/plant-pathology-2021-fgvc8/sample_submission.csv")

In [None]:
#The number of labels
len(df_train.labels.unique())

In [None]:
#The no.values per label
df_train.labels.value_counts()

> ### 1.1   Encode labels

In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

In [None]:
def encode_label(df):
    """Add an integer ``encoded_label`` column derived from ``df.labels``.

    The module-level encoder ``le`` is (re)fitted on the label values, the
    resulting codes are written into ``df`` in place, and the same DataFrame
    object is returned.
    """
    label_codes = le.fit_transform(df["labels"].values)
    df["encoded_label"] = label_codes
    return df

# Fit the encoder on the training labels; this adds `encoded_label` to df_train in place.
encode_label(df_train)

# Keep the first row of every distinct (label, code) pair and index the result
# by the code, sorted, so a code can be mapped back to its label string.
df_labels_idx = (
    df_train.loc[~df_train.duplicated(["labels", "encoded_label"]), ["encoded_label", "labels"]]
    .set_index("encoded_label")
    .sort_index()
)
display(df_labels_idx)

> ### 1.2 Make datapath list for training, validation, testing sets

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
def make_datapath_list(phase='train', val_size=0.25):
    """Return the list of ``.jpg`` paths belonging to ``phase``.

    Parameters
    ----------
    phase : {'train', 'val', 'test'}
        ``'train'`` and ``'val'`` are two disjoint parts of the resized
        training images; ``'test'`` is every image in the competition's
        test folder.
    val_size : float
        Fraction of the training images assigned to the validation part.

    Returns
    -------
    list of str
        Image paths for the requested phase.

    Raises
    ------
    ValueError
        If ``phase`` is not one of the three supported values.

    Notes
    -----
    The globbed paths are sorted before splitting. ``glob`` returns files in
    an arbitrary, filesystem-dependent order, so without sorting the seeded
    split is not guaranteed to be the same between the ``'train'`` and
    ``'val'`` calls (which would let the two parts overlap) or between runs.
    """
    if phase in ("train", "val"):
        rootpath = "/kaggle/input/resized-plant2021/img_sz_256/"
    elif phase == "test":
        rootpath = "/kaggle/input/plant-pathology-2021-fgvc8/test_images/"
    else:
        # Previously this only printed a message and then fell through to the
        # test folder, silently returning the wrong file list.
        raise ValueError(f"Unknown phase {phase!r}; expected 'train', 'val' or 'test'")

    path_list = sorted(glob.glob(osp.join(rootpath, "*.jpg")))

    if phase == "test":
        return path_list

    # Fixed seed: the 'train' and 'val' calls reproduce the same partition.
    train, val = train_test_split(path_list, test_size=val_size, random_state=0, shuffle=True)
    return train if phase == "train" else val

In [None]:
# Build the file list of each phase and report how many images it holds.
train_list = make_datapath_list('train')
print('The length of training set: {}'.format(len(train_list)))
val_list = make_datapath_list('val')
print('The length of valuation set: {}'.format(len(val_list)))
test_list = make_datapath_list('test')
print('The length of testing set: {}'.format(len(test_list)))

> ### 1.4 Augmentation

In [None]:
import albumentations as A
from albumentations import Compose
from albumentations.pytorch import ToTensorV2
import cv2

In [None]:
# Per-phase albumentations pipelines, keyed by phase name ('train' / 'val' / 'test').
# Each pipeline ends with the same three steps: resize to img_size x img_size,
# normalize with the given per-channel mean/std, and ToTensorV2.
# NOTE(review): the mean/std values look like the usual ImageNet statistics — confirm.
transform = {
    # Training: random augmentations first (each applied with its own
    # probability `p`), then the common resize / normalize / to-tensor steps.
    'train': Compose([
        A.Rotate(p=0.1, limit=(-85, 80)),
        A.RandomShadow(
            num_shadows_lower=2, 
            num_shadows_upper=3, 
            shadow_dimension=3, 
            shadow_roi=(0, 0.7, 0.4, 0.8), 
            p=0.4
        ),
        A.ShiftScaleRotate(
            shift_limit=0.055, 
            scale_limit=0.065, 
            rotate_limit=35, 
            p=0.6
        ),
        # Lower and upper fog coefficients are both 0.2.
        A.RandomFog(
            fog_coef_lower=0.2, 
            fog_coef_upper=0.2, 
            alpha_coef=0.2, 
            p=0.3
        ),
        A.RGBShift(
            r_shift_limit=25, 
            g_shift_limit=15, 
            b_shift_limit=15, 
            p=0.3
        ),
        A.RandomBrightnessContrast(p=0.3),
        A.GaussNoise(
            var_limit=(50, 70),  
            always_apply=False, 
            p=0.3
        ),
        A.Resize(height=img_size, width=img_size),
        A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
        ToTensorV2()
    ]),
    # Validation: no random augmentation, only the common steps.
    'val': Compose([
        A.Resize(img_size, img_size),
        A.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),            
        ToTensorV2()
    ]),
    # Test: same steps as validation.
    'test': Compose([
        A.Resize(img_size, img_size),
        A.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ToTensorV2()
    ])
}

> ### 1.5 Create dataset class

In [None]:
class PlantDataset(Dataset):
    """
    Dataset that yields ``(transformed, label, image_name)`` triples.

    Attributes
    ----------
    df_train : DataFrame
        DataFrame with an ``image`` column (file name) and an
        ``encoded_label`` column (integer class); used for 'train' and 'val'.
    file_list : list
        A list containing the paths to the images
    transform : callable or None
        The pipeline selected for ``phase`` from the ``transform`` dict passed
        to the constructor; called as ``transform(image=img)``. ``None`` when
        no transform dict was given.
    phase : 'train' or 'val' or 'test'
        Specify whether to use train, validation, or test
    """

    _PHASES = ("train", "val", "test")

    def __init__(self, df_train, file_list, transform=None, phase='train'):
        if phase not in self._PHASES:
            raise ValueError(f"Unknown phase {phase!r}; expected one of {self._PHASES}")

        self.df_train = df_train
        # Not used by this class; kept so existing code reading the attribute
        # still works. Taken from the notebook global when it exists.
        self.df_labels_idx = globals().get("df_labels_idx")
        self.file_list = file_list
        # The original default (transform=None) crashed on `None[phase]`.
        self.transform = transform[phase] if transform is not None else None
        self.phase = phase

        # Build the file-name -> label table once, instead of scanning the
        # whole DataFrame for every sample. keep="first" mirrors the previous
        # `.values[0]` behaviour when a file name occurs more than once.
        if phase == "test":
            self._label_by_image = {}
        else:
            unique_rows = df_train.drop_duplicates(subset="image", keep="first")
            self._label_by_image = dict(zip(unique_rows["image"], unique_rows["encoded_label"]))

    def __len__(self):
        """
        Returns the number of images.
        """
        return len(self.file_list)

    def _label_for(self, image_name):
        """Return the encoded label of ``image_name`` (``-1`` in the test phase).

        Raises ``KeyError`` when a train/val image has no row in ``df_train``.
        """
        if self.phase == "test":
            return -1
        try:
            return self._label_by_image[image_name]
        except KeyError:
            raise KeyError(f"No label found for image {image_name!r}") from None

    def __getitem__(self, index):
        """
        Load one image and return ``(transformed, label, image_name)``.

        ``transformed`` is whatever the phase's pipeline returns for
        ``transform(image=img)`` (the training loop reads its ``'image'``
        entry); without a pipeline it is ``{"image": img}`` holding the raw
        RGB array.
        """
        img_path = self.file_list[index]

        # cv2.imread returns None instead of raising for a missing/corrupt file.
        img = cv2.imread(img_path)
        if img is None:
            raise FileNotFoundError(f"Could not read image: {img_path}")
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        if self.transform is not None:
            img_transformed = self.transform(image=img)
        else:
            img_transformed = {"image": img}

        # File name without its directory (robust to any name length).
        image_name = osp.basename(img_path)
        label = self._label_for(image_name)

        return img_transformed, label, image_name

In [None]:
train_dataset = PlantDataset(df_train, train_list, transform=transform, phase='train')
val_dataset = PlantDataset(df_train, val_list, transform=transform, phase='val')
test_dataset = PlantDataset(df_train, test_list, transform=transform, phase='test')

index = 0

# Smoke test: print the size of each dataset plus the label and file name of
# one sample. The sample is fetched once per dataset, so each image is decoded
# and augmented only once.
for heading, dataset in (
    ("【train dataset】", train_dataset),
    ("\n【validation dataset】", val_dataset),
    ("\n【test dataset】", test_dataset),
):
    print(heading)
    print(f"img num : {len(dataset)}")
    sample = dataset[index]
    print(f"label : {sample[1]}")
    print(f"image name : {sample[2]}")

> ### 1.6 Create Dataloader

In [None]:
# One loader per phase; only the training loader shuffles its samples.
dataloaders_dict = {
    "train": DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2),
    "val": DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=2),
    "test": DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=2),
}

# Individual handles to the same loader objects.
train_dataloader = dataloaders_dict["train"]
val_dataloader = dataloaders_dict["val"]
test_dataloader = dataloaders_dict["test"]

In [None]:
for i, image_data in enumerate(train_dataloader):
    break

In [None]:
plt.figure(figsize=(20, 20))

im = make_grid(image_data[0]['image'], nrow=16)
plt.imshow(np.transpose(im.numpy(), (1, 2, 0)))

## 2. **Define model**

> ### 2.1 Load model if exists

In [None]:
# # Load the Model back from file
# Pkl_Filename = '../input/pickle-test/Resmodel50_trained_1fc.pkl'
# with open(Pkl_Filename, 'rb') as file:  
#     model = pickle.load(file)

> ### 2.2 Define new model

In [None]:
# Experiments to run, one dict per configuration. Every entry provides, in
# this order:
#   name       - label printed before training and passed to save_model()
#   classifier - module installed as the network's `fc` head (2048 inputs, 12 outputs)
#   lr         - learning rate handed to the optimizer
#   epoch      - number of training epochs
# train_model() later adds the recorded loss / accuracy / F1 lists to the same dict.
models_config = [
    {
        "name": "2 FCs, 0.0001 Lr, 30 Epochs",
        # Two Linear layers back to back, with no activation in between.
        "classifier": torch.nn.Sequential(
                        torch.nn.Linear(2048, 512),
                        torch.nn.Linear(512, 12)),
        "lr": 0.0001,
        "epoch": 30
    },
    {
        "name": "2 FCs, 0.001 Lr, 50 Epochs",
        "classifier": torch.nn.Sequential(
                        torch.nn.Linear(2048, 512),
                        torch.nn.Linear(512, 12)),
        "lr": 0.001,
        "epoch": 50
    },
    {
        "name": "1 FC, 0.0001 Lr, 30 Epochs",
        "classifier": torch.nn.Linear(2048, 12),
        "lr": 0.0001,
        "epoch": 30
    },
    {
        "name": "1 FCs, 0.001 Lr, 50 Epochs",
        "classifier": torch.nn.Linear(2048, 12),
        "lr": 0.001,
        "epoch": 50
    }
]

In [None]:
use_pretrained = True
pretrained_model = models.resnet50(pretrained=use_pretrained)

# for param in model.layer1.parameters():
#     param.requires_grad = False
        
# for param in model.layer2.parameters():
#     param.requires_grad = False  
        
# for param in model.layer3.parameters():
#     param.requires_grad = False 

In [None]:
from sklearn.metrics import f1_score, accuracy_score

## **3. Train, validate model**

In [None]:
def append_list(list, appended):
    """Add every element of ``appended`` to ``list`` in place and return ``list``."""
    list.extend(appended)
    return list

In [None]:
def plot_result(train_losses, train_accuracy, train_f1, val_losses, val_accuracy, val_f1, time):
    """Plot training vs. validation curves in three side-by-side panels.

    Parameters
    ----------
    train_losses, val_losses : sequence of float
        Values drawn in the 'Loss' panel.
    train_accuracy, val_accuracy : sequence of float
        Values drawn in the 'Accuracy' panel.
    train_f1, val_f1 : sequence of float
        Values drawn in the 'F1 Score' panel.
    time : object
        Accepted for backward compatibility; not used.
    """
    panels = (
        ('Loss', train_losses, val_losses),
        ('Accuracy', train_accuracy, val_accuracy),
        ('F1 Score', train_f1, val_f1),
    )

    fig, axes = plt.subplots(1, 3, figsize=(18, 7))
    # Each panel gets its own title; previously the F1 title was written onto
    # the accuracy axis, mislabelling it and leaving the F1 panel untitled.
    for ax, (title, train_values, val_values) in zip(axes, panels):
        ax.plot(train_values, label='Train')
        ax.plot(val_values, label='Validation')
        ax.set_title(title)
        ax.set_xlabel('Epoch')
        ax.legend()

In [None]:
def train_model(model_config, model, criterion, optimizer, num_epochs=3, is_inception=False):
    """Train ``model`` and evaluate it on the validation set after every epoch.

    Reads the notebook globals ``device`` and ``dataloaders_dict`` (keys
    ``'train'`` and ``'val'``), and calls ``f1_score`` and ``plot_result``.

    Parameters
    ----------
    model_config : dict
        Receives the per-epoch history under the keys ``train_losses``,
        ``train_accuracy``, ``train_f1``, ``val_losses``, ``val_accuracy``
        and ``val_f1`` (lists of plain Python floats).
    model : torch.nn.Module
        Network to train; moved to ``device`` in place.
    criterion : callable
        Loss called as ``criterion(outputs, labels)``.
    optimizer : torch.optim.Optimizer
        Optimizer over the model's parameters.
    num_epochs : int
        Number of passes over the training data.
    is_inception : bool
        Accepted for backward compatibility; not used.

    Returns
    -------
    torch.nn.Module
        The same ``model`` object, with the weights of the final epoch.
    """
    history = {
        'train': {'losses': [], 'accuracy': [], 'f1': []},
        'val': {'losses': [], 'accuracy': [], 'f1': []},
    }

    print(f"Devices to be used : {device}")
    model.to(device)
    torch.backends.cudnn.benchmark = True

    start_time = time.time()

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            is_train = phase == 'train'
            model.train(is_train)  # train(False) is the same as eval()

            dataloader = dataloaders_dict[phase]
            running_loss = 0.0
            running_corrects = 0
            epoch_targets = []
            epoch_predictions = []

            for i, data in enumerate(dataloader):
                # Each batch is (transform output dict, labels, image names).
                inputs = data[0]['image'].to(device)
                labels = data[1].to(device)

                optimizer.zero_grad()

                # Track gradients only while training.
                with torch.set_grad_enabled(is_train):
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)
                    _, preds = torch.max(outputs, 1)

                    if is_train:
                        loss.backward()
                        optimizer.step()

                # Accumulate plain Python numbers so the recorded history never
                # holds (possibly CUDA) tensors, which cannot be plotted.
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels).item()

                batch_targets = labels.cpu().tolist()
                batch_predictions = preds.cpu().tolist()
                epoch_targets.extend(batch_targets)
                epoch_predictions.extend(batch_predictions)

                if i % 100 == 0 and i != 0:
                    # Only computed when it is actually printed.
                    batch_f1 = f1_score(batch_targets, batch_predictions, average='weighted')
                    print(f'Batch: {i}  |  Loss: {loss.item():.4f}   |   F1-score: {batch_f1:.4f}')

            n_samples = len(dataloader.dataset)
            epoch_loss = running_loss / n_samples
            epoch_acc = running_corrects / n_samples

            # f1_score expects (y_true, y_pred); with average='weighted' the
            # class weights come from y_true, so the argument order matters.
            epoch_f1 = f1_score(epoch_targets, epoch_predictions, average='weighted')

            history[phase]['losses'].append(epoch_loss)
            history[phase]['accuracy'].append(epoch_acc)
            history[phase]['f1'].append(epoch_f1)

            print('{} Loss: {:.4f} Acc: {:.4f} F1_score: {:.4f}'.format('----> ' + phase.capitalize(), epoch_loss, epoch_acc, epoch_f1))

    duration = time.time() - start_time
    print(f'\nDuration: {duration:.0f} seconds') # print the time elapsed

    model_config['train_losses'] = history['train']['losses']
    model_config['train_accuracy'] = history['train']['accuracy']
    model_config['train_f1'] = history['train']['f1']
    model_config['val_losses'] = history['val']['losses']
    model_config['val_accuracy'] = history['val']['accuracy']
    model_config['val_f1'] = history['val']['f1']

    plot_result(
        model_config['train_losses'], model_config['train_accuracy'], model_config['train_f1'],
        model_config['val_losses'], model_config['val_accuracy'], model_config['val_f1'],
        duration,
    )

    return model

In [None]:
def save_model(model, filename):
    """Pickle ``model`` to ``<filename>.pkl`` and return the path written.

    Parameters
    ----------
    model : object
        Any picklable object (here: the trained network).
    filename : str
        Target path without extension; ``".pkl"`` is always appended.

    Notes
    -----
    The file is a plain pickle: only load it back from sources you trust,
    because unpickling can execute arbitrary code.
    """
    # Use the argument; the previous version read the global `name` instead.
    pkl_filename = filename + ".pkl"

    with open(pkl_filename, 'wb') as file:
        pickle.dump(model, file)

    return pkl_filename

In [None]:
models_trained = []

for model_config in models_config:
    # Read the settings by key: train_model() adds history entries to
    # model_config, so positional unpacking of .values() fails on a re-run.
    name = model_config["name"]
    lr = model_config["lr"]
    print(f'Model name: {name}')

    # Give every configuration its own copy of the pretrained backbone.
    # Reusing one instance would let each run continue from the weights the
    # previous configuration had already trained, so the runs would not be
    # comparable.
    config_model = copy.deepcopy(pretrained_model)
    config_model.fc = model_config["classifier"]

    optimizer = Adam(config_model.parameters(), lr=lr)

    trained_model = train_model(model_config, config_model, criterion, optimizer,
                                num_epochs=model_config["epoch"])

    save_model(trained_model, name)

    # Keep the result, moved to the CPU so finished models do not hold GPU memory.
    models_trained.append(trained_model.cpu())

## **4. Predict test data**

In [None]:
class PlantPredictor():
    """
    Class for predicting labels from output results

    Attributes
    ----------
    model : torch.nn.Module
        Network used for prediction.
    df_labels_idx: DataFrame
        DataFrame that associates INDEX with a label name; row ``i`` (by
        position) must hold the ``labels`` value of class ``i``.
    dataloaders_dict : dict
        Must contain a ``'test'`` entry yielding
        ``(transform output dict, label, image names)`` batches.
    df_submit : DataFrame
        Filled by ``inference()`` with the columns ``image`` and ``labels``.
    """

    def __init__(self, model, df_labels_idx, dataloaders_dict):
        self.model = model
        self.df_labels_idx = df_labels_idx
        self.dataloaders_dict = dataloaders_dict
        self.df_submit = pd.DataFrame()

    def __predict_max(self, out):
        """
        Get the label rows with the highest score for each sample.

        Parameters
        ----------
        out : torch.Tensor
            Model output of shape (batch, classes), on any device.

        Returns
        -------
        DataFrame
            One row of ``df_labels_idx`` per sample, in batch order.
        """
        maxid = np.argmax(out.detach().cpu().numpy(), axis=1)
        return self.df_labels_idx.iloc[maxid]

    def inference(self):
        """
        Predict a label for every test image and store the result in
        ``self.df_submit`` (columns ``image`` and ``labels``).

        The model runs in evaluation mode with gradients disabled; its
        previous train/eval mode is restored afterwards.
        """
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # Move the model once. The earlier version reassigned `device` to the
        # CPU inside the loop, so every batch after the first ran on the CPU.
        self.model.to(device)

        was_training = self.model.training
        # Evaluation mode: dropout off, batch-norm uses its running statistics.
        self.model.eval()

        df_pred_list = []
        try:
            with torch.no_grad():
                for data in self.dataloaders_dict['test']:
                    image_name = data[2]
                    inputs = data[0]['image'].to(device)
                    out = self.model(inputs)
                    df_pred = self.__predict_max(out).reset_index(drop=True)
                    df_pred["image"] = list(image_name)
                    df_pred_list.append(df_pred)
        finally:
            self.model.train(was_training)

        if df_pred_list:
            df_submit = pd.concat(df_pred_list, axis=0)
        else:
            # pd.concat raises on an empty list; return an empty, well-formed table.
            df_submit = pd.DataFrame(columns=["image", "labels"])
        self.df_submit = df_submit[["image", "labels"]].reset_index(drop=True)

In [None]:
# predictor = PlantPredictor(model, df_labels_idx, dataloaders_dict)
# predictor.inference()

In [None]:
# df_submit = predictor.df_submit.copy()

# df_submit.to_csv('submission.csv', index=False)
# df_submit