In [None]:
### notes
#1. dataset not balanced
#2. transforms not chosen carefully
#3. ensembling not tried
#4. FOLDS not tried out
#5. Cosine annealing / reduce-on-plateau LR scheduling
#6. Mixed precision training  -- DONE
#check https://www.kaggle.com/c/cassava-leaf-disease-classification/discussion/203111#1111578

# IMPORTS

In [None]:
# !pip install efficientnet-pytorch
!pip install --no-deps imagededup==0.2.2 > /dev/null

import sys
sys.path = [
    '../input/efficientnet-pytorch/EfficientNet-PyTorch/EfficientNet-PyTorch-master',
] + sys.path
sys.path = [
    '../input/ttach-kaggle/ttach/',
] + sys.path

import numpy as np 
import pandas as pd 
from collections import Counter
from sklearn.model_selection import train_test_split
from keras.preprocessing.image import ImageDataGenerator
from torch.utils.data import Dataset, DataLoader
from torchvision.utils import make_grid
from PIL import Image
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import torchvision.models as models
import torchvision.transforms as T
import torch.nn.functional as F
import torch.nn as nn
import json
import torch
import torchvision
import cv2
import PIL
import os
import random
from efficientnet_pytorch import EfficientNet
from imagededup.methods import PHash

# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

def seed_torch(seed=42):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: integer seed applied to `random`, `PYTHONHASHSEED`, NumPy and
            PyTorch (CPU and every visible GPU).
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Also seed GPUs other than the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning selects kernels by timing them, which can differ between runs.
    torch.backends.cudnn.benchmark = False

seed_torch()

# READ FILES

In [None]:
train_csv_path = "../input/cassavapreprocessed/merged_data.csv"
train_images_path = "../input/cassavapreprocessed/train_images/train_images"

# get image labels
data_csv = pd.read_csv(train_csv_path)
print(data_csv.head())
print("\ntotal label types = ", Counter(data_csv['label']))

# get disease names corresponding to labels
# (context manager so the file handle is closed once the JSON is parsed)
with open('../input/cassava-leaf-disease-classification/label_num_to_disease_map.json') as f:
    real_labels = json.load(f)
# JSON object keys are always strings; the label column is mapped with int keys
real_labels = {int(k):v for k,v in real_labels.items()}
real_labels

In [None]:
# attach the human-readable disease name for every numeric label
data_csv['class_name'] = data_csv['label'].map(real_labels)
data_csv.head()

In [None]:
# read image given path
def get_image(path):
    """Load the image stored at `path` and return it as an RGB PIL image.

    `Image.open` only reads the header and keeps the file open until the pixel
    data is needed. Opening inside a context manager and converting right away
    decodes the pixels and releases the file handle before returning. The
    conversion also guarantees three channels, which the 3-channel
    `T.Normalize` used by the transforms in this notebook requires.

    Args:
        path: file-system path of the image.

    Returns:
        A fully loaded `PIL.Image.Image` in mode "RGB".
    """
    with Image.open(path) as img:
        return img.convert("RGB")

# sample image
img = get_image("../input/cassavapreprocessed/train_images/train_images/1000015157.jpg")
img

# Perceptual hashing (PHash) to find and remove duplicate images

In [None]:
# takes around 10 mins, skippable when iterating interactively
# Hashes every image under train_images_path and then looks up, for each file,
# the other files whose hash matches.
phasher = PHash()

# encodings: one hash per file found in the image directory
encodings = phasher.encode_images(image_dir=train_images_path)
# max_distance_threshold=0 -> presumably only identical hashes are paired; confirm in the imagededup docs
duplicates = phasher.find_duplicates(encoding_map=encodings, max_distance_threshold=0)

In [None]:
TRAIN_CLUSTERS = []
TRAIN_CLUSTERING_CACHE = set()
BAD_CASES_DBSCAN_CACHE = set()

# `duplicates` maps every file name to the list of files reported as its
# duplicates. Each group is turned into a sorted list of ids (extension
# stripped) and every distinct group is kept once.
for image_id, values in tqdm(duplicates.items(), total=len(duplicates)):
    if not values:
        # no match for this image -> it does not belong to any cluster
        continue

    members = sorted(
        [image_id.split('.')[0]] + [value.split('.')[0] for value in values]
    )

    # the joined, sorted ids are the key that identifies a cluster
    cluster_name = '.'.join(members)
    if cluster_name in TRAIN_CLUSTERING_CACHE:
        continue
    TRAIN_CLUSTERING_CACHE.add(cluster_name)
    TRAIN_CLUSTERS.append(members)

# largest clusters first; the stable sort keeps discovery order among equal sizes
TRAIN_CLUSTERS.sort(key=len, reverse=True)

# Preview the duplicate clusters: one row per cluster, up to five images per row.
margin = 0
count = 20

draw_clusters = TRAIN_CLUSTERS[margin:margin+count]

if draw_clusters:
    # TRAIN_CLUSTERS is sorted largest-first, so the first cluster bounds the column count
    size = min([5, len(draw_clusters[0])])
    n_rows = len(draw_clusters)

    # squeeze=False keeps `ax` two-dimensional even with a single row or column
    fig, ax = plt.subplots(n_rows, size, figsize=(size*3, 4*n_rows), squeeze=False)

    for j, image_ids in enumerate(draw_clusters):
        for i, image_id in enumerate(image_ids[:size]):
            image_id = image_id.split('.')[0]
            image = cv2.imread(f'{train_images_path}/{image_id}.jpg', cv2.IMREAD_COLOR)
            if image is None:
                # cv2.imread returns None instead of raising when the file cannot be read
                ax[j][i].set_title(f'{image_id} (unreadable)')
                continue
            # interpolation must be passed by keyword: the third positional slot of cv2.resize is `dst`
            image = cv2.resize(image, (256, 256), interpolation=cv2.INTER_AREA)
            # OpenCV decodes to BGR while matplotlib expects RGB
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            ax[j][i].imshow(image);
            ax[j][i].set_title(f'{image_id}')
else:
    print("no duplicate clusters to draw")

In [None]:
# removing duplicates: keep exactly one image of every duplicate cluster

print("total duplicate images = ", len(TRAIN_CLUSTERS))
# Drop every member but the last one. For a two-image cluster this removes the
# same file as before (the first id); for larger clusters it also removes the
# extra copies that previously stayed in the data.
duplicate_files = {
    f'{image_id}.jpg'
    for group in TRAIN_CLUSTERS
    for image_id in group[:-1]
}
# a single vectorised filter instead of re-filtering the frame once per cluster
data_csv = data_csv[~data_csv.image_id.isin(duplicate_files)]
print(data_csv.shape)

# Loading data

In [None]:
#read dataset train/val/test
class GetDataset(Dataset):
    """Dataset that serves images listed in a dataframe.

    Args:
        df: frame with an 'image_id' column (file name inside `data_root`) and,
            when `output_label` is true, a 'label' column.
        data_root: directory that contains the image files.
        transforms: optional callable applied to every loaded image.
        output_label: when true items are `(image, label)`, otherwise only the image.
    """

    def __init__(self, df, data_root, transforms = None, output_label = True):
        super().__init__()
        # private copy with a fresh 0..n-1 index, so positional lookups do not
        # depend on the caller's index
        self.df = df.reset_index(drop=True).copy()
        self.data_root = data_root
        self.transforms = transforms
        self.output_label = output_label

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index: int):
        row = self.df.iloc[index]
        img = get_image(f"{self.data_root}/{row['image_id']}")

        if self.transforms:
            img = self.transforms(img)

        # unlabeled mode: the image alone is the sample
        if not self.output_label:
            return img
        return img, row['label']
        
        
# --- training configuration -------------------------------------------------
IMG_SIZE = 512
IMG_SHAPE = (IMG_SIZE, IMG_SIZE)   # (height, width) handed to T.Resize
BATCH_SIZE = 16

epochs = 6
max_lr = 1e-3                      # same value as the former 10e-4 spelling
weight_decay = 1e-3                # same value as the former 10e-4 spelling
grad_clip = 0.1
opt_func = torch.optim.Adam

In [None]:
# Hold out 10% of the rows for validation. Stratifying on the class name keeps
# the label ratios the same in both splits.
train_csv, val_csv = train_test_split(
    data_csv,
    test_size=0.1,
    random_state=23,
    stratify=data_csv['class_name'],
)

# Tensor conversion + per-channel normalisation, shared by both pipelines.
_to_normalized_tensor = [
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]

# Training pipeline: resize, then random flips / colour jitter / rotation.
train_transforms = T.Compose([
    T.Resize(IMG_SHAPE),
    T.RandomHorizontalFlip(p=0.5),
    T.RandomVerticalFlip(p=0.5),
    T.ColorJitter(hue=.05, saturation=.05),
    T.RandomRotation(20, resample=PIL.Image.BILINEAR),
    *_to_normalized_tensor,
])

# Validation pipeline: deterministic, resize only.
validation_transforms = T.Compose([
    T.Resize(IMG_SHAPE),
    *_to_normalized_tensor,
])

# Datasets: same image folder, different row subsets and transforms.
train_ds = GetDataset(train_csv, train_images_path, transforms = train_transforms, output_label = True)
val_ds = GetDataset(val_csv, train_images_path, transforms = validation_transforms, output_label = True)

# weights = [1.62, 8.31, 8.96, 9.77, 19.68]
# sampler = torch.utils.data.WeightedRandomSampler(weights, len(weights))

train = torch.utils.data.DataLoader(
    train_ds,
    batch_size = BATCH_SIZE,
    num_workers = 2,
#     sampler = sampler,
    # Training batches must be reshuffled every epoch; with shuffle=False the
    # model saw the samples in the same fixed order each time.
    # (shuffle has to go back to False if a sampler is enabled.)
    shuffle = True,
    pin_memory = False
)
# validation order does not affect the metrics, so it stays deterministic
val = torch.utils.data.DataLoader(
    val_ds,
    batch_size = BATCH_SIZE,
    num_workers = 2,
    shuffle = False,
    pin_memory = False
)


In [None]:
def show_images(train_dl, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)):
    """Show up to 32 images from the first batch of `train_dl` as a grid.

    The loaders in this notebook yield normalised tensors, whose values fall
    outside the [0, 1] range `imshow` expects. The normalisation is undone
    before plotting. Batches that already live on the GPU are copied to the
    CPU first.

    Args:
        train_dl: iterable yielding `(images, labels)` batches, images shaped
            (batch, channels, height, width).
        mean: per-channel mean used by the normalisation; None to skip denormalising.
        std: per-channel std used by the normalisation; None to skip denormalising.
    """
    for images, _ in train_dl:
        images = images.detach().cpu()
        if mean is not None and std is not None:
            mean_t = torch.tensor(mean).view(1, -1, 1, 1)
            std_t = torch.tensor(std).view(1, -1, 1, 1)
            # invert (x - mean) / std and clamp away rounding overshoot
            images = (images * std_t + mean_t).clamp(0, 1)
        fig, ax = plt.subplots(figsize=(20, 20))
        ax.set_xticks([]); ax.set_yticks([])
        ax.imshow(make_grid(images[:32], nrow=8).permute(1,2,0))
        # only the first batch is displayed
        break

show_images(train)

#to get 1 batch i.e. batchsize
# dataiter = iter(train)
# images, labels = dataiter.next()

# plt.imshow(torchvision.utils.make_grid(images[2]))
# print(labels)

# DATA TO DEVICE

In [None]:
def get_device():
    """Return the CUDA device when one is available, otherwise the CPU device."""
    return torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
def to_device(data, device):
    """Move `data` to `device`.

    Lists and tuples are processed element by element (recursively) and always
    come back as lists; anything else is moved with its own `.to(...)`.
    """
    if not isinstance(data, (list, tuple)):
        return data.to(device, non_blocking=True)
    return [to_device(item, device) for item in data]

class DeviceDataLoader():
    """Wrap a data loader so that every batch it yields is moved to `device`."""

    def __init__(self, dl, device):
        self.dl, self.device = dl, device

    def __len__(self):
        # number of batches, delegated to the wrapped loader
        return len(self.dl)

    def __iter__(self):
        # lazy: each batch is transferred only when the consumer asks for it
        yield from (to_device(batch, self.device) for batch in self.dl)

# Pick the compute device once and make both loaders deliver their batches on it.
device = get_device()
# NOTE: `train` / `val` are rebound to the wrappers, so running this cell twice
# wraps the already wrapped loaders again.
train = DeviceDataLoader(train, device)
val = DeviceDataLoader(val, device)
print("Model running on", device)

In [None]:
def accuracy(out, labels):
    """Fraction of rows of `out` whose highest-scoring column equals `labels`.

    Returns a 0-d tensor created on the CPU from a Python float.
    """
    preds = torch.max(out, dim=1).indices
    n_correct = (preds == labels).sum().item()
    return torch.tensor(n_correct / len(preds))

class ImageClassificationBase(nn.Module):
    """Training / validation helpers shared by the classifiers in this notebook.

    Subclasses provide `forward`; batches are `(images, labels)` pairs.
    """

    def training_step(self, batch):
        """Return the cross-entropy loss of one training batch."""
        images, labels = batch
        return F.cross_entropy(self(images), labels)#,  weight= torch.tensor([1.62, 8.31, 8.96, 9.77, 19.68]).to(device))

    def validation_step(self, batch):
        """Return the detached loss and the accuracy of one validation batch."""
        images, labels = batch
        logits = self(images)
        return {
            "val_loss": F.cross_entropy(logits, labels).detach(),
            "val_acc": accuracy(logits, labels),
        }

    def validation_epoch_end(self, outputs):
        """Average the per-batch metrics (every batch counts equally) into floats."""
        mean_loss = torch.stack([step["val_loss"] for step in outputs]).mean()
        mean_acc = torch.stack([step["val_acc"] for step in outputs]).mean()
        return {"val_loss": mean_loss.item(), "val_acc": mean_acc.item()}

    def epoch_end(self, epoch, epochs, result):
        """Print a one-line summary of the epoch described by `result`."""
        summary = "Epoch: [{}/{}], last_lr: {:.6f}, train_loss: {:.4f}, val_loss: {:.4f}, val_acc: {:.4f}".format(
            epoch,
            epochs,
            result["lrs"][-1],
            result["train_loss"],
            result["val_loss"],
            result["val_acc"],
        )
        print(summary)

# DEFINE MODEL

In [None]:
class Classifier(ImageClassificationBase):
    """EfficientNet-B4 backbone whose final layer is replaced by a 5-way linear head."""

    def __init__(self):
        super().__init__()
        # NOTE(review): from_name presumably builds the architecture without
        # pretrained weights - confirm that training from scratch is intended.
        backbone = EfficientNet.from_name('efficientnet-b4')
        backbone._fc = nn.Linear(backbone._fc.in_features, 5)
        self.network = backbone

    def forward(self, xb):
        return self.network(xb)

    def freeze(self):
        """Make only the classification head trainable."""
        head_param_ids = {id(p) for p in self.network._fc.parameters()}
        for p in self.network.parameters():
            p.requires_grad = id(p) in head_param_ids

    def unfreeze(self):
        """Make every parameter of the network trainable."""
        for p in self.network.parameters():
            p.requires_grad = True

In [None]:
# Build the classifier and move its parameters to the selected device
# (to_device falls through to model.to(device, non_blocking=True)).
model = to_device(Classifier(), device)


In [None]:
@torch.no_grad()
def evaluate(model, val_dl):
    """Run `model` over `val_dl` in eval mode and return its aggregated metrics.

    Gradients are disabled for the whole call. The per-batch results of
    `model.validation_step` are collected and handed to
    `model.validation_epoch_end`, whose return value is returned unchanged.
    """
    model.eval()
    step_outputs = []
    for batch in val_dl:
        step_outputs.append(model.validation_step(batch))
    return model.validation_epoch_end(step_outputs)

def get_lr(optimizer):
    """Return the learning rate of the optimizer's first parameter group.

    Returns None when the optimizer has no parameter groups.
    """
    first_group = next(iter(optimizer.param_groups), None)
    return None if first_group is None else first_group["lr"]
    
def fit_one_cycle(epochs, max_lr, model, train_dl, val_dl, weight_decay=0, grad_clip=None,
                 opt_func=torch.optim.Adam):
    """Train `model` with the one-cycle LR policy and mixed precision.

    Args:
        epochs: number of passes over `train_dl`.
        max_lr: peak learning rate of the one-cycle schedule.
        model: module exposing `training_step`, `validation_step`,
            `validation_epoch_end` and `epoch_end`.
        train_dl: training batches; `len(train_dl)` must be the number of
            batches per epoch.
        val_dl: validation batches, evaluated after every epoch.
        weight_decay: weight decay handed to the optimizer.
        grad_clip: when truthy, every gradient element is clipped to
            [-grad_clip, grad_clip] before the optimizer step.
        opt_func: optimizer constructor called as
            `opt_func(params, lr, weight_decay=...)`.

    Returns:
        One dict per epoch: the metrics returned by `evaluate` plus
        "train_loss" (mean batch loss) and "lrs" (learning rate used for every
        batch of that epoch).
    """
    torch.cuda.empty_cache()

    history = []
    opt = opt_func(model.parameters(), max_lr, weight_decay=weight_decay)
    sched = torch.optim.lr_scheduler.OneCycleLR(opt, max_lr, epochs=epochs,
                                                steps_per_epoch=len(train_dl))
    # Mixed precision only applies on CUDA; on CPU the scaler and autocast become no-ops.
    use_amp = torch.cuda.is_available()
    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

    for epoch in range(epochs):
        model.train()
        train_loss = []
        lrs = []
        for batch in tqdm(train_dl):
            opt.zero_grad()

            with torch.cuda.amp.autocast(enabled=use_amp):
                loss = model.training_step(batch)

            scaler.scale(loss).backward()

            if grad_clip:
                # Clipping has to happen before the optimizer step and on the
                # real (unscaled) gradients; after the step it has no effect.
                scaler.unscale_(opt)
                nn.utils.clip_grad_value_(model.parameters(), grad_clip)

            scaler.step(opt)
            scaler.update()

            # detach so the stored value does not keep the autograd graph alive
            train_loss.append(loss.detach())

            lrs.append(get_lr(opt))
            sched.step()

        result = evaluate(model, val_dl)
        result["train_loss"] = torch.stack(train_loss).mean().item()
        result["lrs"] = lrs
        model.epoch_end(epoch, epochs, result)
        history.append(result)
    return history

In [None]:
# Baseline: validation metrics of the model before any training step.
# This entry has no "train_loss" / "lrs" keys.
history = [evaluate(model, val)]
history

# TRAIN

In [None]:
# model.freeze()

In [None]:
# train the whole network: every parameter gets requires_grad=True
model.unfreeze()

In [None]:
!nvidia-smi

In [None]:
%%time
# First training run with the hyper-parameters from the config cell.
# NOTE: history is reset here, so the baseline evaluation stored earlier is discarded.
history=[]
history += fit_one_cycle(epochs, max_lr, model, train, val, weight_decay=weight_decay,
                        grad_clip=grad_clip, opt_func=opt_func)

In [None]:
!nvidia-smi

In [None]:
%%time
# Second run: another full one-cycle schedule with a 10x lower peak learning rate.
# NOTE: this rebinds the global max_lr, so re-running the first training cell
# afterwards would also use 10e-5 (= 1e-4).
max_lr = 10e-5
history += fit_one_cycle(epochs, max_lr, model, train, val, weight_decay=weight_decay,
                        grad_clip=grad_clip, opt_func=opt_func)

In [None]:
!nvidia-smi

# PLOT

In [None]:
# Validation accuracy after every epoch.
# The list is deliberately not called `accuracy`: that name belongs to the
# metric function used by validation_step, and rebinding it to a list made
# every later evaluate()/fit_one_cycle() call fail.
val_acc_history = [x["val_acc"] for x in history]
plt.plot(val_acc_history, "-rx")
plt.title("Accuracy vs number of epochs")
plt.xlabel("Epochs")
plt.ylabel("Accuracy");

In [None]:
# Validation and training loss after every epoch.
val_loss = [x["val_loss"] for x in history]
# .get: entries produced by a bare evaluate() call carry no "train_loss"
train_loss = [x.get("train_loss") for x in history]
# labels are attached to the curves themselves so the legend cannot get out of
# sync with the order of the plot calls
plt.plot(val_loss, "-bx", label="Validation loss")
plt.plot(train_loss, "-gx", label="Train loss")
plt.title("Losses vs number of epochs")
plt.legend()
plt.xlabel("Epochs")
plt.ylabel("Loss");

# SAVING MODEL

In [None]:
# Save the trained model and load it back for inference.
# The same path is used for saving and loading (previously the load relied on
# the working directory being /kaggle/working), and the checkpoint is mapped to
# whichever device the notebook runs on instead of assuming CUDA.
MODEL_PATH = "/kaggle/working/mod.pth"
torch.save(model, MODEL_PATH)
# NOTE: torch.load unpickles the file - only load checkpoints you created yourself.
model = torch.load(MODEL_PATH, map_location=device)

# inference only: no parameter needs gradients any more
for parameter in model.parameters():
    parameter.requires_grad = False

model.eval()

# TESTING


In [None]:
# Test-time preprocessing must match what the model saw during training and
# validation: a square IMG_SIZE x IMG_SIZE resize (a bare int only rescales the
# shorter edge and keeps the aspect ratio) followed by the same normalisation.
test_transforms = T.Compose([
    T.Resize((IMG_SIZE, IMG_SIZE)),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

def predict_image(image):
    """Return the index of the highest-scoring class for one PIL image.

    Args:
        image: PIL image; it is converted to RGB so the network always
            receives three channels.

    Returns:
        The arg-max over the model output as a NumPy integer.
    """
    image_tensor = test_transforms(image.convert("RGB")).float()
    # add the batch dimension and move the sample next to the model
    batch = image_tensor.unsqueeze(0).to(device)
    # no autograd bookkeeping is needed for inference
    with torch.no_grad():
        output = model(batch)
    index = output.detach().cpu().numpy().argmax()
    return index

# Predict every file in the test folder and write the submission file.
TEST_DIR = '../input/cassava-leaf-disease-classification/test_images/'
test_images = os.listdir(TEST_DIR)

predictions = []
for image in test_images:
    # TEST_DIR already ends with a separator, so join() yields TEST_DIR + image
    img = Image.open(os.path.join(TEST_DIR, image))
    output = predict_image(img)
    predictions.append(output)

# one row per test file, in the order returned by os.listdir
sub = pd.DataFrame({'image_id': test_images, 'label': predictions})
display(sub)
sub.to_csv('submission.csv', index = False)
