## This is the demo code for HKUST AIAA5023 Spring2022.

### Based on a simple CNN (resnet34).

### 1. Take a glance at the data
You may delve deeper into the data (**Exploratory Data Analysis**) to build a foundation and gain insights for your later methods.

In [None]:
import os
# Root of the competition data; both image folders sit directly beneath it.
data_path = '../input/cassava-leaf-disease-classification'
# Report how many files each image folder contains.
print(len(os.listdir(os.path.join(data_path, 'train_images'))))
print(len(os.listdir(os.path.join(data_path, 'test_images'))))

There is only one image in the 'test_images' folder. Hence, you need to set aside some of the data in the 'train_images' folder as validation data yourself.

In [None]:
import pandas as pd
# Load the training annotations; later cells read the 'image_id' and 'label'
# columns of this CSV.
train_df = pd.read_csv('../input/cassava-leaf-disease-classification/train.csv')
train_df.head()  # last expression of the cell: previews the first rows

Meaning of the labels and the number of samples in each class.

**Hint**: Class imbalance problem. Methods such as resampling, reweighting and focal loss may help.

In [None]:
import json
import seaborn as sns
# Pretty-print the label mapping file, then show how many samples each label has.
with open('../input/cassava-leaf-disease-classification/label_num_to_disease_map.json') as file:
    label_map = json.load(file)
print(json.dumps(label_map, indent=4))
sns.countplot(x='label', data=train_df)
train_df['label'].value_counts()

### 2. Data preprocessing

#### 2.1 Read image

In [None]:
import cv2
import matplotlib.pyplot as plt
from PIL import Image

# Alternative loader kept for reference: read with OpenCV and reverse the
# channel axis (BGR -> RGB).
# def get_img(path):
#     im_bgr = cv2.imread(path)
#     im_rgb = im_bgr[:, :, ::-1]
#     return im_rgb

# a = get_img('../input/cassava-leaf-disease-classification/train_images/1000015157.jpg')
# Open one training image with PIL; as the cell's only expression it is what
# the notebook displays.
Image.open('../input/cassava-leaf-disease-classification/train_images/1000015157.jpg')
# print(a.shape)
# plt.imshow(a)

#### 2.2 Data split
Split the data in the 'train_images' folder into training data and validation data.

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the annotated rows for validation; the fixed seed keeps the
# split reproducible between runs.
data_list = pd.read_csv('../input/cassava-leaf-disease-classification/train.csv')
train_list, valid_list = train_test_split(
    data_list,
    test_size=0.2,
    random_state=42,
)
print(data_list)
# One shape per line: training subset first, validation subset second.
print(train_list.shape, valid_list.shape, sep='\n')

#### 2.3 Prepare Dataset and Dataloader class for using pytorch

Unlike datasets such as MNIST and CIFAR-10/100, for which ready-made packaged loaders already exist, this new dataset has none. We need to write our own Dataset class in order to use a DataLoader.

In [None]:
from torch.utils.data import Dataset,DataLoader
from PIL import Image

class CassavaDataset(Dataset):
    """Dataset yielding ``(image, label)`` pairs for the cassava images.

    Args:
        df: DataFrame with an ``image_id`` column (file name inside
            ``data_root``) and a ``label`` column. It is copied and given a
            fresh positional index, so rows can be addressed by position
            even after a train/validation split.
        transforms: Optional callable applied to the RGB PIL image.
        target_transform: Optional callable applied to the label.
        data_root: Directory holding the image files. ``None`` (the default)
            selects ``DEFAULT_DATA_ROOT``.
    """

    # Folder used when the caller does not pass ``data_root``.
    DEFAULT_DATA_ROOT = "../input/cassava-leaf-disease-classification/train_images"

    def __init__(self, df, transforms=None, target_transform=None, data_root=None):
        super().__init__()
        self.data_root = self.DEFAULT_DATA_ROOT if data_root is None else data_root
        self.df = df.reset_index(drop=True).copy()
        self.transforms = transforms
        self.target_transform = target_transform

    def __getitem__(self, index: int):
        """Return ``(img, target)`` for the row at position ``index``."""
        # Fetch the row once instead of once per column.
        row = self.df.iloc[index]
        img_path = "{}/{}".format(self.data_root, row['image_id'])

        # convert() decodes the pixels into a new image, so the file can be
        # closed as soon as the ``with`` block ends.
        with Image.open(img_path) as raw:
            img = raw.convert('RGB')

        target = row['label']

        if self.transforms is not None:
            img = self.transforms(img)

        if self.target_transform is not None:
            target = self.target_transform(target)

        return img, target

    def __len__(self):
        """Return the number of rows (samples) in the dataset."""
        return len(self.df)

In [None]:
import torch
import torchvision
import torchvision.transforms as transforms


def Get_Dataloader(train_list, valid_list, batch_size=64, num_workers=2):
    """Build the training and validation data loaders.

    Args:
        train_list: DataFrame of training rows, wrapped in ``CassavaDataset``.
        valid_list: DataFrame of validation rows, wrapped in ``CassavaDataset``.
        batch_size: Number of samples per batch, used for both loaders.
        num_workers: Number of loader worker processes, used for both loaders.

    Returns:
        ``(train_loader, valid_loader)``. Only the training loader shuffles.
    """
    # Training images get extra augmentation (colour jitter, random flip)
    # that validation images do not.
    train_transform = transforms.Compose([
        transforms.Resize((256, 256)),
        # Crop to 224x224x3, the input size this notebook feeds to resnet34;
        # you can also change the network structure to accommodate the
        # original image size.
        transforms.CenterCrop(224),
        transforms.ColorJitter(0.2, 0.2, 0.2),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
    ])

    valid_transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
    ])

    # training and validation datasets
    train_data = CassavaDataset(train_list, transforms=train_transform, target_transform=None)
    valid_data = CassavaDataset(valid_list, transforms=valid_transform, target_transform=None)

    # Settings shared by both loaders. Pinned memory only helps host-to-GPU
    # copies, so request it only when CUDA is available.
    loader_kwargs = dict(
        batch_size=batch_size,
        num_workers=num_workers,
        pin_memory=torch.cuda.is_available(),
    )

    train_loader = torch.utils.data.DataLoader(train_data, shuffle=True, **loader_kwargs)
    valid_loader = torch.utils.data.DataLoader(valid_data, shuffle=False, **loader_kwargs)

    return train_loader, valid_loader

# Build the loaders from the train/validation split made earlier.
train_loader, valid_loader = Get_Dataloader(train_list, valid_list)

### 3. Model training

#### 3.1 Some useful tools

In [None]:
class AverageMeter:
    """Keep the most recent value together with a running weighted mean.

    Attributes:
        val: The value passed to the latest ``update`` call.
        sum: Weighted sum of all values seen since the last ``reset``.
        count: Total weight seen since the last ``reset``.
        avg: ``sum / count`` as of the latest ``update`` call.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero every statistic."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` and refresh the running mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count
        
def save_model(model, save_file):
    """Save ``model.state_dict()`` to ``save_file`` under the ``'model'`` key."""
    torch.save({'model': model.state_dict()}, save_file)
    

def adjust_learning_rate(optimizer, epoch, lr, milestones, gamma=0.1):
    """Apply a step-decayed learning rate to every parameter group.

    The rate is ``lr`` multiplied by ``gamma`` once for each milestone the
    current epoch has reached. With the default ``gamma`` and two milestones
    this gives ``lr``, then ``0.1*lr`` from ``milestones[0]``, then
    ``0.01*lr`` from ``milestones[1]``.

    Args:
        optimizer: Object exposing ``param_groups``, a list of dicts whose
            ``'lr'`` entry is overwritten.
        epoch: Current epoch number.
        lr: Base (undecayed) learning rate.
        milestones: Epoch numbers at which the rate is decayed; any number
            of milestones is accepted.
        gamma: Multiplicative decay applied per reached milestone.

    Returns:
        The learning rate that was written to the optimizer.
    """
    lr_1 = lr
    for milestone in milestones:
        # An unreached milestone multiplies by 1.0 rather than being skipped,
        # so the result is built by the same chain of float multiplications
        # regardless of how many milestones have been passed.
        lr_1 = lr_1 * (gamma if epoch >= milestone else 1.0)

    for param_group in optimizer.param_groups:
        param_group['lr'] = lr_1

    return lr_1

#### 3.2 Train for one epoch

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score
def train(model, criterion, optimizer, train_loader, device, epoch):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        model: Network to optimise; switched to training mode.
        criterion: Loss callable taking ``(output, labels)``.
        optimizer: Optimizer stepped once per batch.
        train_loader: Iterable of ``(images, labels)`` batches.
        device: Device the batches are moved to.
        epoch: Current epoch number; not used inside the function.

    Returns:
        ``(accuracy, loss)`` averaged over the samples of the epoch.
    """
    losses = AverageMeter()
    accs = AverageMeter()

    model.train()
    for images, labels in train_loader:
        images = images.to(device)
        labels = labels.to(device)
        bsz = labels.shape[0]

        output = model(images)
        loss = criterion(output, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Predicted class = index of the largest logit; detach() keeps the
        # bookkeeping out of the autograd graph.
        _, predicted = torch.max(output.detach(), 1)
        acc = (predicted == labels).sum().item() / bsz

        # Weight each batch by its size so a smaller final batch does not
        # skew the epoch averages.
        # NOTE(review): assumes criterion returns a per-batch mean -- confirm
        # if a different reduction is used.
        accs.update(acc, bsz)
        losses.update(loss.item(), bsz)

    return accs.avg, losses.avg

#### 3.3 Valid for one epoch

In [None]:
def valid(model, criterion, valid_loader, device, epoch):
    """Evaluate ``model`` on ``valid_loader`` without updating its weights.

    Args:
        model: Network to evaluate; switched to evaluation mode.
        criterion: Loss callable taking ``(output, labels)``.
        valid_loader: Iterable of ``(images, labels)`` batches.
        device: Device the batches are moved to.
        epoch: Current epoch number; not used inside the function.

    Returns:
        ``(accuracy, loss)`` averaged over the validation samples.
    """
    losses = AverageMeter()
    accs = AverageMeter()

    model.eval()
    with torch.no_grad():
        for images, labels in valid_loader:
            images = images.to(device)
            labels = labels.to(device)
            bsz = labels.shape[0]

            output = model(images)
            loss = criterion(output, labels)

            # Predicted class = index of the largest logit.
            _, predicted = torch.max(output, 1)
            acc = (predicted == labels).sum().item() / bsz

            # Weight each batch by its size so a smaller final batch does
            # not skew the averages.
            # NOTE(review): assumes criterion returns a per-batch mean --
            # confirm if a different reduction is used.
            accs.update(acc, bsz)
            losses.update(loss.item(), bsz)

    return accs.avg, losses.avg

### 4. Parameters and run for some epochs
For your own training, you should run for more epochs. There are also many other methods for adjusting the learning rate. You can also save the model that achieves the best accuracy on the validation dataset.

You can also split the data into training and validation sets in other ways, for example with K-fold cross-validation.

There is much room for improvement!

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import models

num_classes = 5
model = models.resnet34()  # we use resnet34 as the backbone
# Replace the last fc layer of the predefined resnet34 so it outputs one
# score per class of our classification problem.
model.fc = nn.Linear(model.fc.in_features, num_classes)
lr = 0.1
optimizer = optim.SGD(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()
n_epochs = 50  # just a demo
milestones = [30, 40]  # epochs at which the learning rate is decayed
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model.to(device)
train_loader, valid_loader = Get_Dataloader(train_list, valid_list)

# exist_ok=True lets this cell be re-run without failing on the folder that
# a previous run already created.
os.makedirs('cassava/model', exist_ok=True)

best_acc = 0
for epoch in range(1, n_epochs+1):
    lr_now = adjust_learning_rate(optimizer, epoch, lr, milestones) # adjust learning rate
    acc_train, loss_train = train(model, criterion, optimizer, train_loader, device, epoch)
    print("Epoch[{}], Train acc: {:.3f}, Train loss: {:.3f}, lr {:.3f}.".format(epoch, acc_train, loss_train, lr_now))
    acc_val, loss_val = valid(model, criterion, valid_loader, device, epoch)
    print("Epoch[{}], Valid acc: {:.3f}, Valid loss: {:.3f}.".format(epoch, acc_val, loss_val))
    # Keep only the checkpoint with the best validation accuracy so far.
    if acc_val > best_acc:
        save_model(model, 'cassava/model/best.pth') # save model
        best_acc = acc_val

If you use a Kaggle notebook, I recommend downloading the saved model to your own computer, then opening another notebook to test your code and submit it to Kaggle.

### 5. Test and submit results to Kaggle
You may have noticed that there is only one image in the 'test_images' folder. This image lets you check that your code runs. For this Kaggle contest, you submit your test code, and Kaggle runs it on their private test dataset to measure the performance of your model. Hence, the following code is what you should submit.

In [None]:
from glob import glob
import pandas as pd
import torch
import torch.nn as nn
from torchvision import models
from torch.utils.data import Dataset,DataLoader
from PIL import Image
import torchvision.transforms as transforms

# def a new dataset function
class CassavaDataset_test(Dataset):
    """Dataset yielding ``(image_id, image, label)`` triples for test images.

    Args:
        df: DataFrame with an ``image_id`` column (file name inside
            ``data_root``) and a ``label`` column. It is copied and given a
            fresh positional index.
        transforms: Optional callable applied to the RGB PIL image.
        target_transform: Optional callable applied to the label.
        data_root: Directory holding the image files. ``None`` (the default)
            selects ``DEFAULT_DATA_ROOT``.
    """

    # Folder used when the caller does not pass ``data_root``.
    DEFAULT_DATA_ROOT = "/kaggle/input/cassava-leaf-disease-classification/test_images"

    def __init__(self, df, transforms=None, target_transform=None, data_root=None):
        super().__init__()
        self.data_root = self.DEFAULT_DATA_ROOT if data_root is None else data_root
        self.df = df.reset_index(drop=True).copy()
        self.transforms = transforms
        self.target_transform = target_transform

    def __getitem__(self, index: int):
        """Return ``(img_id, img, target)`` for the row at position ``index``."""
        # Fetch the row once instead of once per column.
        row = self.df.iloc[index]
        img_id = row['image_id']
        img_path = "{}/{}".format(self.data_root, img_id)

        # convert() decodes the pixels into a new image, so the file can be
        # closed as soon as the ``with`` block ends.
        with Image.open(img_path) as raw:
            img = raw.convert('RGB')

        target = row['label']

        if self.transforms is not None:
            img = self.transforms(img)

        if self.target_transform is not None:
            target = self.target_transform(target)

        # The image id is returned too so predictions can be keyed by file name.
        return img_id, img, target

    def __len__(self):
        """Return the number of rows (samples) in the dataset."""
        return len(self.df)

    
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# load model
model_file = 'cassava/model/best.pth'
model = models.resnet34()
model.fc = nn.Linear(model.fc.in_features, 5)
# map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
checkpoint = torch.load(model_file, map_location=device)
state = checkpoint['model']
model.load_state_dict(state)
model.to(device)

# test image loader
test_images = glob('/kaggle/input/cassava-leaf-disease-classification/test_images/*.jpg')
# Key by file name; 99 is only a placeholder so the frame has the 'label'
# column the dataset reads. It is overwritten by the prediction below.
result = {img.split('/')[-1]: 99 for img in test_images}
test_list = pd.DataFrame.from_dict(result, orient='index', columns=['label']).reset_index().rename(columns={'index': 'image_id'})

valid_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])
test_data = CassavaDataset_test(test_list, transforms=valid_transform, target_transform=None)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=64, num_workers=2, shuffle=False, pin_memory=True)

# use the model to predict each test image and produce a 'submission.csv' file
model.eval()
with torch.no_grad():
    for image_id, images, _ in test_loader:
        images = images.to(device)
        output = model(images)
        _, predicted = torch.max(output, 1)
        for name, label in zip(image_id, predicted):
            result[name] = label.item()  # record predicted label

result = pd.DataFrame.from_dict(result, orient='index', columns=['label']).reset_index().rename(columns={'index': 'image_id'})
result.to_csv('submission.csv', index=False)


**Try to do some coding!** **Good luck~**