# Introduction : Libraries, Data & Processing

## Import de libraries

In [None]:
import os # data importation

# data structure/handling
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt # ploting
import cv2 # image visualisation
from PIL import Image

# Histogramme
import seaborn as sns
import torch.nn as nn
import torch.nn.functional as F

## Import des données

In [None]:
# Folder holding the competition files (CSV label tables and image folders).
root = "/kaggle/input/aptos2019-blindness-detection"


def _read_competition_csv(csv_name):
    """Read one CSV located under `root` and drop fully duplicated rows."""
    csv_path = os.path.join(root, csv_name)
    return pd.read_csv(csv_path).drop_duplicates()


# NOTE(review): a later cell defines a function that is also called `train`.
# After that cell has run, `train` no longer refers to this DataFrame, so the
# cells that use it must be executed before the training-function cell.
train = _read_competition_csv('train.csv')
test = _read_competition_csv('test.csv')

## Processing : Recadrage

In [None]:
def remove_black(img,tol=7):
    """Drop every row and column of an image that contains only dark pixels.

    A pixel counts as dark when its grayscale intensity is <= `tol`. Rows and
    columns are removed wherever they are entirely dark (not only on the outer
    border), which crops the black frame around the picture.

    Args:
        img: 2-D grayscale array, or 3-D colour array with channels last. A
            colour image is converted with cv2.COLOR_BGR2GRAY to build the
            mask, so it is expected in OpenCV's BGR channel order.
        tol: intensity threshold; pixels strictly above it are kept.

    Returns:
        The cropped image. For a colour input only the first three channels
        are returned. If no pixel is brighter than `tol`, the input image is
        returned unchanged (for both grayscale and colour inputs), so callers
        never receive an empty array.

    Raises:
        ValueError: if `img` is neither 2-D nor 3-D.
    """
    if img.ndim == 2:
        gray_img = img
    elif img.ndim == 3:
        # The darkness test is done on a single-channel version of the image.
        gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    else:
        raise ValueError(f"remove_black expects a 2-D or 3-D image, got ndim={img.ndim}")

    mask = gray_img > tol

    # Entirely dark image: cropping would leave zero pixels, keep it as is.
    if not mask.any():
        return img

    # mask.any(1): rows holding at least one bright pixel.
    # mask.any(0): columns holding at least one bright pixel.
    kept = np.ix_(mask.any(1), mask.any(0))

    if img.ndim == 2:
        return img[kept]

    # Index rows/columns once for all channels; ascontiguousarray guarantees
    # a plain C-ordered array for the OpenCV calls that follow in the caller.
    return np.ascontiguousarray(img[:, :, :3][kept])

# Déséquilibre de classes & Data augmentation

## Déséquilibre de classes

In [None]:
# Pie chart of the class distribution in the training labels.
n_classes = 5

# Count each diagnosis once, by column name; classes that never occur get 0.
class_counts = train['diagnosis'].value_counts().reindex(range(n_classes), fill_value=0)
eff = class_counts.tolist()

# Percentages use the actual number of rows: the table went through
# drop_duplicates(), so a hard-coded total could be wrong.
n_train = len(train)
percentage = [round(100 * count / n_train) for count in eff]

palette = sns.color_palette('hls', n_colors = n_classes)
plt.pie(eff, labels = [str(i) for i in range(n_classes)], autopct = '%1.0f%%', colors = palette)
plt.title('Proportion des différents stades de la maladie');

## Data augmentation

In [None]:
#Importing torch
import torch
from torch.utils.data import WeightedRandomSampler

# Number of rows drawn (with replacement) to build the rebalanced training table.
size_data_augmented = 6000

# Inverse-frequency weights: every row is drawn with a probability proportional
# to 1 / (size of its class), so classes end up roughly equally represented.
# NOTE(review): weight[targets] assumes the labels are the integers 0..K-1 and
# that every class occurs at least once in `train` - confirm.
targets = train['diagnosis'].values
class_sample_count = np.unique(targets, return_counts=True)[1]
weight = 1. / class_sample_count
samples_weight = weight[targets]
samples_weight = torch.from_numpy(samples_weight).double()
augmented_sampler = WeightedRandomSampler(samples_weight, size_data_augmented, replacement=True)

# Sampling with replacement repeats rows, so the index inherited from `train`
# contains duplicates and no longer matches row positions. Reset it so that
# index labels and positions coincide (later code indexes rows by position).
train_augmented = train.iloc[list(augmented_sampler)].reset_index(drop=True)

# Pie chart of the class distribution after rebalancing.
n_classes = 5
class_counts = train_augmented['diagnosis'].value_counts().reindex(range(n_classes), fill_value=0)
eff = class_counts.tolist()
percentage = [round(100 * count / len(train_augmented)) for count in eff]
palette = sns.color_palette('hls', n_colors = n_classes)
plt.pie(eff, labels = [str(i) for i in range(n_classes)], autopct = '%1.0f%%', colors = palette)
plt.title('Proportion des différents stades de la maladie');

In [None]:
from torchvision import transforms # For image transformations.

import random
class BenColorTransform:
    """Transform that applies the "Ben" colour filter with probability `p`.

    When the filter is applied the image is replaced by
    4 * image - 4 * GaussianBlur(image, sigmaX) + 128 (computed with
    cv2.addWeighted); otherwise the image is returned untouched.

    Args:
        sigmaX: standard deviation passed to cv2.GaussianBlur.
        p: probability of applying the filter; 0 never applies it and 1
            always applies it.
    """
    def __init__(self, sigmaX=10, p = 0.25):
        self.varX = sigmaX
        self.prob = p

    def __call__(self, image):
        # random.random() is uniform on [0, 1), so the filter fires with
        # probability exactly `prob`. (An inclusive integer draw in 0..100
        # compared with `<=` would fire slightly too often, even for p = 0.)
        if random.random() < self.prob:
            blurred = cv2.GaussianBlur(image, (0, 0), self.varX)
            image = cv2.addWeighted(image, 4, blurred, -4, 128)
        return image

# Channel statistics used by Normalize in both pipelines
# (normalize param based on imagenet, pytorch default).
_NORM_MEAN = (0.485, 0.456, 0.406)
_NORM_STD = (0.229, 0.224, 0.225)


def _with_probability(transform, p):
    """Wrap a single transform so that it is only applied with probability `p`."""
    return transforms.RandomApply([transform], p = p)


# Stage 1 - works on the raw array, then converts it to a PIL image.
_array_stage = [
    BenColorTransform(sigmaX=10, p = 0.05),  # "Ben" colour filter, rarely applied
    transforms.ToPILImage(mode='RGB'),
]

# Stage 2 - PIL-level augmentations, each one fired independently with p = 0.1.
_pil_stage = [
    transforms.RandomHorizontalFlip(p=0.1),                       # horizontal flip
    transforms.RandomVerticalFlip(p=0.1),                         # vertical flip
    transforms.RandomInvert(p=0.1),                               # colour inversion
    transforms.RandomPosterize(bits=2, p=0.1),                    # keep 2 bits per colour channel
    transforms.RandomSolarize(threshold=192.0, p=0.1),            # invert pixel values above the threshold
    transforms.RandomAdjustSharpness(sharpness_factor=2, p=0.1),  # sharpen
    transforms.RandomAutocontrast(p=0.1),                         # autocontrast
    transforms.RandomPerspective(distortion_scale=0.6, p=0.1),    # random perspective warp
]

# Stage 3 - tensor-level augmentations, then the final crop and normalisation.
_tensor_stage = [
    transforms.ToTensor(),  # the following transforms operate on tensors
    # Random brightness / hue change.
    _with_probability(transforms.ColorJitter(brightness=.5, hue=.3), 0.05),
    # Crop a random portion of the image and resize it to 256.
    _with_probability(transforms.RandomResizedCrop(256), 0.1),
    # Random affine transformation keeping the centre invariant.
    _with_probability(transforms.RandomAffine(degrees=(30, 70), translate=(0.1, 0.3), scale=(0.7, 0.9)), 0.1),
    # Random rotation between 0 and 180 degrees.
    _with_probability(transforms.RandomRotation(degrees=(0, 180)), 0.1),
    transforms.CenterCrop(224),
    transforms.Normalize(_NORM_MEAN, _NORM_STD),
]

# Transformations to apply to training images.
augmentation_transforms = transforms.Compose(_array_stage + _pil_stage + _tensor_stage)

# Transformations to apply to validation / test images: no random step at all.
basic_transforms = transforms.Compose([
    transforms.ToPILImage(mode='RGB'),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(_NORM_MEAN, _NORM_STD),
])

## Datasets

In [None]:
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler #For working with data.

class EyeDataset(Dataset):
    """Dataset that loads one eye image per row of a label table.

    Each item is read from `<root_dir>/<train|test>_images/<id_code>.png`,
    cropped with remove_black, converted from BGR to RGB, resized to 256x256
    and finally passed through `transform` (when one is given).

    Args:
        df_from_csv: DataFrame with an 'id_code' column and, when `train` is
            truthy, a 'diagnosis' column. Rows are addressed by position.
        train: truthy to read from the train image folder and return labels;
            falsy to read from the test image folder and return image codes.
        transform: optional callable applied to the 256x256 RGB array.
        root_dir: folder that contains the `train_images` / `test_images`
            sub-folders.
    """
    def __init__(self, 
                 df_from_csv,
                 train = None, 
                 transform = None,
                 root_dir = '../input/aptos2019-blindness-detection'):
        
        self.data = df_from_csv
        self.train = train
        self.transform = transform
        self.root_dir = root_dir

    def __len__(self):
        """Number of rows in the label table."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return (image, label) in train mode, (image, id_code) otherwise.

        Raises:
            FileNotFoundError: if the image file is missing or unreadable.
        """
        # Find path and get the image (idx is a row position, not an index label)
        code = self.data['id_code'].values[idx]
        datatype = "train" if self.train else "test"
        file_path = f'{self.root_dir}/{datatype}_images/{code}.png'
        img = cv2.imread(file_path)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(f'Could not read image file: {file_path}')
        
        # Apply essential process : remove black + RGB
        img = remove_black(img,tol=7) # Remove black
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) # RGB
        image = cv2.resize(img, (256, 256)) # Resize image to 256
        if self.transform is not None:
            image = self.transform(image)
        
        if self.train:
            label = self.data['diagnosis'].values[idx]
            return image, label
        return image, code
        
batch_size = 64
num_workers = 0
      
###### Prepare data loaders with data augmentation

# Both datasets wrap the same table; they only differ by the transform applied.
dataset_augmented = EyeDataset(df_from_csv = train_augmented, 
                     train = True, 
                     transform = augmentation_transforms)
dataset_validation = EyeDataset(df_from_csv = train_augmented, 
                     train = True, 
                     transform = basic_transforms)

# The oversampled table contains the same image several times. Splitting its
# rows directly would put copies of one image on both sides of the split, so
# the split is done on distinct images (stratified by diagnosis) and every
# copy of an image follows it to the same side.
unique_images = train_augmented.drop_duplicates(subset='id_code')
tr_codes, val_codes = train_test_split(unique_images['id_code'],
                                       stratify=unique_images['diagnosis'],
                                       test_size=0.2)
in_validation = train_augmented['id_code'].isin(val_codes).to_numpy()

# Diagnosis labels of each side (kept under their original names).
tr = train_augmented.diagnosis[~in_validation]
val = train_augmented.diagnosis[in_validation]

# EyeDataset addresses rows by position, so the samplers must receive row
# positions - index labels are not valid here because the table may carry a
# duplicated index inherited from the resampling step.
train_positions = np.flatnonzero(~in_validation).tolist()
valid_positions = np.flatnonzero(in_validation).tolist()
assert not set(train_positions) & set(valid_positions), "train/validation rows overlap"

train_sampler = SubsetRandomSampler(train_positions)
valid_sampler = SubsetRandomSampler(valid_positions)

train_loader_augmented = DataLoader(dataset_augmented, batch_size=batch_size, sampler=train_sampler, num_workers=num_workers)
valid_loader_augmented = DataLoader(dataset_validation, batch_size=batch_size, sampler=valid_sampler, num_workers=num_workers)

###### Prepare data test loaders (combine dataset and sampler)

test_set = EyeDataset(df_from_csv = test, 
                      train = False, 
                      transform = basic_transforms)
test_loader = DataLoader(test_set, batch_size=batch_size, num_workers=num_workers)

# Modèle

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device) # Show which device was selected.

In [None]:
from torchvision import models # For template models

# Build a ResNet-50 backbone.
model = models.resnet50()

# NOTE(review): resnet50() is called without any weights argument, and the two
# lines below write the model's own state_dict to disk and read it straight
# back, so they do not change the weights. If pretrained weights were intended,
# a pretrained checkpoint has to be loaded here instead - confirm.
initial_weights_file = 'resnet50.pth'
torch.save(model.state_dict(), initial_weights_file)
model.load_state_dict(torch.load(initial_weights_file))

# Swap the classification head for a 5-output linear layer (one per class).
num_ftrs = model.fc.in_features # num_ftrs = 2048
model.fc = nn.Sequential(nn.Linear(num_ftrs, 5))

# Move the model to the selected device.
model = model.to(device)

# Training

In [None]:
def train(training_dataset, model, criterion, optimizer):
    '''
    Run one pass over the training batches and update the model weights.

    For every batch: move it to `device`, reset the gradients, compute the
    loss of the model output against the labels, back-propagate and take one
    optimizer step.

    Args :
         training_dataset: iterable yielding (inputs, labels) batches.
         model: network to train; it is switched to training mode.
         criterion: loss function called as criterion(output, labels).
         optimizer: optimizer bound to the model parameters.

    Returns :
         Mean of the per-batch losses over the whole pass (also printed).
    '''
    model.train() # training mode

    batch_losses = [] # one loss value per batch

    for batch in training_dataset:
        inputs, labels = batch
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Gradients accumulate by default: clear them before each batch.
        optimizer.zero_grad()

        # forward pass, loss, backward pass, weight update
        loss = criterion(model(inputs), labels)
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    avg_loss = np.mean(batch_losses) # mean loss per batch
    print(f'\nTraining Loss = {avg_loss:.6f}',end='\t')
    return avg_loss

In [None]:
def validate(validation_dataset, model, criterion, final_layer):
    '''
    Evaluate the model on the validation batches without updating it.

    Args :
         validation_dataset: iterable yielding (inputs, labels) batches.
         model: network to evaluate; it is switched to evaluation mode.
         criterion : loss function called as criterion(output, labels).
         final_layer: callable applied to the model output before taking the
                      arg-max over dimension 1 (usually a softmax).

    Returns :
         (avg_loss, accuracy): mean of the per-batch losses and percentage of
         correctly predicted samples. Both are also printed.
    '''

    model.eval() # evaluation mode

    batch_losses = [] # one loss value per batch
    n_seen = 0        # number of samples evaluated
    n_correct = 0     # number of samples predicted correctly

    with torch.no_grad(): # gradients are not needed for evaluation
        for batch in validation_dataset:
            inputs, labels = batch
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            batch_losses.append(criterion(outputs, labels).item())

            # Predicted class = index of the largest value along dimension 1.
            predicted = torch.max(final_layer(outputs).data, 1)[1]
            n_seen += labels.size(0)
            n_correct += (predicted == labels).sum().item()

    avg_loss = np.mean(batch_losses) # mean loss per batch
    accuracy = 100*(n_correct/n_seen)

    print(f'\nValidation Loss = {avg_loss:.6f}',end='\t')
    print(f'Accuracy on Validation set = {accuracy:.6f}')

    return avg_loss, accuracy

In [None]:
from tqdm import tqdm

# Scoring functions
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, cohen_kappa_score

def _predict_labels(data_loader, model, final_layer):
    '''Run the model over data_loader and return (true labels, predicted labels) as two lists.'''
    model.eval() # evaluation mode
    labels_actual = []
    labels_pred = []
    with torch.no_grad():
        for (inputs, labels) in data_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            # Predicted class = index of the largest value along dimension 1.
            _, predicted = torch.max(final_layer(model(inputs)), 1)
            labels_pred.extend(predicted.tolist())
            labels_actual.extend(labels.tolist())
    return labels_actual, labels_pred


def optimize(training_dataset, validation_dataset, model, criterion, optimizer, final_layer, nb_epochs, n_epoch_no_change = 3):
    '''
    Train the model with early stopping, then report its validation performance.

    Each epoch calls train() then validate(). Whenever the validation loss is
    the best seen so far the weights are saved to 'model.param'; training
    stops once n_epoch_no_change consecutive epochs bring no improvement. The
    best saved weights are then reloaded, the learning curves are plotted and
    the confusion matrix, accuracy and kappa are computed on the validation
    data.

    Args :
        training_dataset: DataLoader for the train_set.
        validation_dataset: DataLoader for the valid_set.
        model: network to train (modified in place).
        criterion: loss function called as criterion(output, labels).
        optimizer: optimizer bound to the model parameters.
        final_layer: from logits to prob (usually softmax)
        nb_epochs: maximum number of epochs.
        n_epoch_no_change: number of consecutive epochs without improvement
            of the validation loss after which training stops.

    Returns :
        (train_losses, valid_losses, valid_accuracies, kaggle_score): the
        per-epoch histories plus the quadratic weighted kappa measured on the
        validation data.
    '''
    # Per-epoch histories.
    train_losses = []
    valid_losses = []
    valid_accuracies = []

    # Early stopping state.
    checkpoint_path = 'model.param'
    checkpoint_saved = False
    best_valid_loss = float('inf')
    iter_wout_change = 0

    for epoch in tqdm(range(nb_epochs)):
        train_loss = train(training_dataset, model, criterion, optimizer)
        train_losses.append(train_loss)
        valid_loss, valid_accuracy = validate(validation_dataset, model, criterion, final_layer)
        valid_losses.append(valid_loss)
        valid_accuracies.append(valid_accuracy)
        print('| Epoch: {}/{} | Train: Loss {:.4f} '\
              '| Val: Loss {:.4f} Accuracy : {:.4f}\n'.format(epoch+1, nb_epochs, train_loss, valid_loss, valid_accuracy))

        # "<=" keeps the behaviour of saving again on a tie with the best loss.
        if valid_loss <= best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_saved = True
            iter_wout_change = 0
        else:
            iter_wout_change += 1
            if iter_wout_change == n_epoch_no_change: # no improvement for n_epoch_no_change epochs
                break

    print('\nTraining has completed!')

    if checkpoint_saved:
        # Restore the weights with the best validation loss.
        model.load_state_dict(torch.load(checkpoint_path))
    else:
        # Happens when no epoch ran or when every validation loss was NaN.
        # Loading here would fail, or silently pick up a file from an earlier run.
        print('No checkpoint was saved during this run: keeping the current weights.')

    print("################ Following : Losses & Accuracy ################## ")

    # Accuracy is a bounded percentage: a linear axis is the readable choice
    # (and an accuracy of 0 cannot be drawn on a log axis).
    plt.plot(valid_accuracies)
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.title('Accuracy along the epochs')
    plt.show()

    plt.semilogy(train_losses, label='Training loss')
    plt.semilogy(valid_losses, label='Validation loss')
    plt.xlabel('Epoch')
    plt.ylabel('Losses')
    plt.title('Losses comparison : Training vs Evaluation')
    plt.legend()
    plt.show()

    print("################ Performance : Accuracy, Confusion Matrix ################## ")

    labels_actual, labels_pred = _predict_labels(validation_dataset, model, final_layer)

    # scikit-learn metrics take (y_true, y_pred): with this order the rows of
    # the matrix are the actual classes and the columns the predicted ones,
    # which is what the axis labels below announce.
    matrix = confusion_matrix(labels_actual, labels_pred)
    score = accuracy_score(labels_actual, labels_pred)
    sns.heatmap(matrix, annot = True, fmt = "d", linewidths = .5, square = True, cmap = 'Blues_r')
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    plt.title('Accuracy Score: {0}'.format(score), size = 15)
    plt.show()

    # Share of each actual class that was predicted correctly (row-wise).
    # A class that is predicted but never present gives 0/0 -> nan, silently.
    with np.errstate(divide='ignore', invalid='ignore'):
        per_class_accuracy = matrix.diagonal()/matrix.sum(axis=1)
    print("Accuracy by classes :", per_class_accuracy)

    # The APTOS 2019 competition is scored with the quadratic weighted kappa,
    # so the weighting is required for this value to be comparable with the
    # leaderboard.
    kaggle_score = cohen_kappa_score(labels_actual, labels_pred, weights='quadratic')
    print("Kaggle score on validation set :", kaggle_score)

    return train_losses, valid_losses, valid_accuracies, kaggle_score

In [None]:
### Training of ResNet50 model (with data augmentation)

import torch.optim as optim

# The loss works on the raw model outputs; the softmax below is only handed to
# optimize() as `final_layer`, where it is applied before the arg-max.
criterion = nn.CrossEntropyLoss()
output_fn = torch.nn.Softmax(dim=1) # final_layer

optimizer = optim.Adam(model.parameters(), lr=0.001)

# At most 10 epochs, stopping after 3 epochs without a better validation loss.
losst, lossv, accuracyv, kaggle = optimize(
    training_dataset = train_loader_augmented,
    validation_dataset = valid_loader_augmented,
    model = model,
    criterion = criterion,
    optimizer = optimizer,
    final_layer = output_fn,
    nb_epochs = 10,
    n_epoch_no_change = 3,
)

In [None]:
# Report the validation kappa returned by optimize() for the trained model.
summary_prefix = "We are going to submit the best model : ResNet50. With a Cohen Kappa score equal to :"
print(summary_prefix, kaggle)

# Submissions

In [None]:
def make_submission(model_submitted, test_dataset, final_layer,
                    sample_submission_path='../input/aptos2019-blindness-detection/sample_submission.csv'):
    """Predict a diagnosis for every test image and write 'submission.csv'.

    Args:
        model_submitted: trained network; it is switched to evaluation mode.
        test_dataset: iterable yielding (inputs, names) batches, where names
            are the image id codes.
        final_layer: callable applied to the model output before taking the
            arg-max over dimension 1 (usually a softmax).
        sample_submission_path: CSV template with 'id_code' and 'diagnosis'
            columns.

    Returns:
        The submission DataFrame that was written to 'submission.csv'. Rows
        whose id_code received no prediction keep the template's value.
    """
    submission = pd.read_csv(sample_submission_path)
    model_submitted.eval()
    all_names = []
    labels_pred = []

    # Pure inference: disabling autograd avoids building a graph per batch.
    with torch.no_grad():
        for (inputs, names) in test_dataset:
            all_names.extend(list(names))

            outputs = model_submitted(inputs.to(device))
            # Predicted class = index of the largest value along dimension 1.
            _, predicted = torch.max(final_layer(outputs), 1)
            labels_pred.extend(predicted.tolist())

    # One vectorised lookup instead of one full-column scan per prediction.
    # If an id appears twice, the last prediction wins.
    prediction_by_code = dict(zip(all_names, labels_pred))
    predicted_column = submission['id_code'].map(prediction_by_code)

    n_missing = int(predicted_column.isna().sum())
    if n_missing:
        print(f'Warning: {n_missing} id_code(s) of the template received no prediction.')

    template_dtype = submission['diagnosis'].dtype
    submission['diagnosis'] = predicted_column.fillna(submission['diagnosis']).astype(template_dtype)

    submission.to_csv('submission.csv', index=False)
    return submission

# Predict the test set with the trained model and write the submission file.
submission = make_submission(model_submitted=model, test_dataset=test_loader, final_layer=output_fn)

# Sanity check: how many test images were assigned to each diagnosis.
predicted_class_counts = submission['diagnosis'].value_counts()
print(predicted_class_counts)
submission.head()