# About this kernel

In this kernel, we will explore the complete workflow for the APTOS 2019 competition. We will go through:

1. Loading & Exploration: A quick overview of the dataset
2. Resize Images: We will resize both the training and test images to 224x224, so that it matches the ImageNet format.
3. Mixup & Data Generator: We show how to create a data generator that will perform random transformations on our datasets (vertical/horizontal flips, rotation, zooming). This will help our model generalize better, since the dataset is fairly small (only ~3000 images).
4. Quadratic Weighted Kappa: A thorough overview of the metric used for this competition, with an intuitive example. Check it out!
5. Model: We will use an Xception network initialised from pre-trained weights. We will fine-tune it using Adam for up to 200 epochs (with early stopping), and evaluate it on a held-out validation set.
6. Training & Evaluation: We take a look at the change in loss and QWK score through the epochs.

### Unused Methods

Throughout V15-V18 of this kernel, I ablated a few methods that I presented in this kernel. The highest LB score was achieved after I removed:
* Mixup
* Optimized Threshold

I decided to keep them in the kernel in case they ever become useful to you.

### Citations & Resources

* I had the idea of using mixup from [KeepLearning's ResNet50 baseline](https://www.kaggle.com/mathormad/aptos-resnet50-baseline). Since the implementation was in PyTorch, I instead used an [open-sourced keras implementation](https://github.com/yu4u/mixup-generator).
* The transfer learning procedure is mostly inspired by my [previous kernel for iWildCam](https://www.kaggle.com/xhlulu/densenet-transfer-learning-iwildcam-2019). The workflow has, however, been heavily modified since then.
* Used similar [method as Abhishek](https://www.kaggle.com/abhishek/optimizer-for-quadratic-weighted-kappa) to find the optimal threshold.
* [Lex's kernel](https://www.kaggle.com/lextoumbourou/blindness-detection-resnet34-ordinal-targets) prompted me to try using Multilabel instead of multiclass classification, which slightly improved the kappa score.

In [None]:
import json
import math
import os

import cv2
from PIL import Image
import numpy as np
from keras import layers
from keras import callbacks
from keras.applications import DenseNet121
from keras.applications.xception import Xception
from keras.callbacks import Callback, ModelCheckpoint
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.optimizers import Adam
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score, accuracy_score
import scipy
from tqdm import tqdm
import imgaug as ia
from imgaug import augmenters as iaa

%matplotlib inline

In [None]:
TRAINING = True

# Loading & Exploration

In [None]:
test_df = pd.read_csv('../input/aptos2019-blindness-detection/test.csv')
print(test_df.shape)
if TRAINING:
    train_df = pd.read_csv('../input/aptos2019-blindness-detection/train.csv')
    print(train_df.shape)
    train_df.head()

In [None]:
# # test_df_19 = pd.read_csv('../input/aptos2019-blindness-detection/test.csv')
# # print(test_df_19.shape)
# if TRAINING:
#     train_df_15 = pd.read_csv('../input/resized-2015-2019-blindness-detection-images/')
#     print(train_df_19.shape)
#     train_df_19.head()

In [None]:
!head -n5 ../input/resized-2015-2019-blindness-detection-images/labels/trainLabels15.csv

In [None]:
if TRAINING:
    train_df['diagnosis'].hist()
    train_df['diagnosis'].value_counts()

### Displaying some Sample Images

In [None]:
def display_samples(df, columns=4, rows=3):
    """Plot the first ``columns * rows`` training images in a grid.

    Each subplot is titled with the image's ``diagnosis`` value.

    Args:
        df: DataFrame with ``id_code`` and ``diagnosis`` columns.
        columns: Number of grid columns.
        rows: Number of grid rows.
    """
    fig = plt.figure(figsize=(5 * columns, 4 * rows))

    # Select rows by position so the function also works when ``df`` has a
    # non-default index or fewer rows than the grid has cells.
    shown = df.iloc[:columns * rows]
    for position, (id_code, diagnosis) in enumerate(
            zip(shown['id_code'], shown['diagnosis']), start=1):
        img = cv2.imread(f'../input/aptos2019-blindness-detection/train_images/{id_code}.png')
        # The file is read in BGR order; convert so matplotlib shows true colours.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        fig.add_subplot(rows, columns, position)
        plt.title(diagnosis)
        plt.imshow(img)

    plt.tight_layout()
    
if TRAINING:
    display_samples(train_df)

# Resize Images

We will resize the images to 224x224, then create a single numpy array to hold the data.

In [None]:
IMG_SIZE = 224
NB_CHANNELS = 3

In [None]:
def get_pad_width(im, new_shape, is_rgb=True):
    """Return per-axis (before, after) padding that grows ``im`` to a square.

    The difference between ``new_shape`` and each of the first two dimensions
    of ``im`` is split in half; when the difference is odd, the extra unit
    goes to the "after" side (bottom / right).

    Args:
        im: Object exposing ``shape`` with at least two entries.
        new_shape: Target size for both height and width.
        is_rgb: When true, a ``(0, 0)`` entry is appended for a channel axis.

    Returns:
        A tuple of ``(before, after)`` pairs, one per axis.
    """
    def split_evenly(extra):
        return math.floor(extra / 2), math.ceil(extra / 2)

    height, width = im.shape[0], im.shape[1]
    spatial = (split_evenly(new_shape - height), split_evenly(new_shape - width))
    if not is_rgb:
        return spatial
    return spatial + ((0, 0),)

def standardize(x):
    """Scale ``x`` by its maximum, then shift and scale to zero mean, unit std.

    The input is first converted to float32; the statistics are computed over
    the whole array.
    """
    as_float = x.astype(np.float32)
    scaled = as_float / np.max(as_float)
    centered = scaled - np.mean(scaled)
    return centered / np.std(scaled)

"""
normalizes the image
"""
def normalize(img):
    """Min-max rescale ``img`` to the 0-255 range and return it as uint8.

    A constant image (max == min) has no range to stretch; it is returned as
    all zeros instead of dividing by zero.

    Args:
        img: Numeric array of any shape.

    Returns:
        A uint8 array of the same shape as ``img``.
    """
    lowest = np.min(img)
    span = np.max(img) - lowest
    if span == 0:
        return np.zeros(np.shape(img), dtype=np.uint8)
    return (((img - lowest) / span) * 255).astype(np.uint8)

"""
crops black parts around the image (intensity is <= tol)
"""
def crop_image(img, tol=10):
    """Crop away the dark border (intensity <= ``tol``) around an image.

    A 2-D image is cropped to the rows and columns that contain at least one
    pixel brighter than ``tol``. For a 3-D image the first three channels are
    each cropped independently and resized back to the original height and
    width, so the result keeps the input's shape.

    The input array is never modified.

    Args:
        img: 2-D (single-channel) or 3-D (height, width, channels) array.
        tol: Intensity threshold; pixels at or below it count as background.

    Returns:
        The cropped 2-D image, or a new 3-D array with the input's shape. If
        OpenCV cannot resize a cropped channel (for example because nothing
        exceeds ``tol`` and the crop is empty), an unmodified copy of the
        input is returned.

    Raises:
        ValueError: If ``img`` is neither 2-D nor 3-D.
    """
    def crop_channel(channel):
        mask = channel > tol
        return channel[np.ix_(mask.any(1), mask.any(0))]

    if img.ndim == 2:
        return crop_channel(img)

    if img.ndim != 3:
        raise ValueError(
            f"crop_image expects a 2-D or 3-D array, got {img.ndim} dimension(s)"
        )

    height, width = img.shape[:2]
    try:
        # Resize every channel before writing any of them, so a failure on a
        # later channel cannot leave a half-updated result.
        resized = [
            cv2.resize(crop_channel(img[:, :, index]), (width, height))
            for index in range(3)
        ]
    except cv2.error:
        return img.copy()

    cropped = img.copy()
    for index, channel in enumerate(resized):
        cropped[:, :, index] = channel
    return cropped


"""
crops black parts and enhances image (Ben Graham's method)
"""
# def preprocess_image(img):
#     # Added this line
#     img = normalize(img)
    
#     img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
#     img = crop_image(img)
#     img = cv2.resize(img, (IMG_SIZE, IMG_SIZE))
#     img = cv2.addWeighted(img, 4, cv2.GaussianBlur(img, (0, 0), IMG_SIZE/10), -4, 128)
    
#     return img.astype(np.float32)


def preprocess_image(im):
    """Convert a BGR image array into an enhanced ``IMG_SIZE`` square RGB image.

    Steps, in order: BGR -> RGB conversion, min-max normalisation, dark-border
    cropping, resize to ``IMG_SIZE`` x ``IMG_SIZE``, then a weighted blend of
    the image with a Gaussian-blurred copy of itself (weights 4 and -4,
    offset 128).

    Returns:
        The processed image as a uint8 array.
    """
    rgb = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
    cropped = crop_image(normalize(rgb))
    resized = cv2.resize(cropped, (IMG_SIZE, IMG_SIZE))

    blurred = cv2.GaussianBlur(resized, (0, 0), IMG_SIZE / 10)
    enhanced = cv2.addWeighted(resized, 4, blurred, -4, 128)

    return enhanced.astype(np.uint8)

In [None]:
if TRAINING:
    fig, ax = plt.subplots(1, 4, figsize=[20, 5])
    for i, image_id in enumerate(train_df['id_code']):
    #     img = preprocess_image(f'../input/aptos2019-blindness-detection/train_images/{image_id}.png')
        im = preprocess_image(cv2.imread(f'../input/aptos2019-blindness-detection/train_images/{image_id}.png'))
#         im = crop_image(im)
#         im = cv2.resize(im, (IMG_SIZE, IMG_SIZE))
        ax[i].imshow(im)
        ax[i].axis("off")
        if i == 3:
            break

In [None]:
if TRAINING:
    # Pre-allocate one uint8 array for every preprocessed training image.
    # The shape is derived from IMG_SIZE / NB_CHANNELS so it always agrees
    # with the size preprocess_image() resizes to.
    N = train_df.shape[0]
    x_train = np.empty((N, IMG_SIZE, IMG_SIZE, NB_CHANNELS), dtype=np.uint8)

    for i, image_id in enumerate(tqdm(train_df['id_code'])):
        x_train[i, :, :, :] = preprocess_image(cv2.imread(
            f'../input/aptos2019-blindness-detection/train_images/{image_id}.png'
        ))

In [None]:
# Pre-allocate one uint8 array for every preprocessed test image. The shape
# is derived from IMG_SIZE / NB_CHANNELS so it always agrees with the size
# preprocess_image() resizes to.
N = test_df.shape[0]
x_test = np.empty((N, IMG_SIZE, IMG_SIZE, NB_CHANNELS), dtype=np.uint8)

for i, image_id in enumerate(tqdm(test_df['id_code'])):
    x_test[i, :, :, :] = preprocess_image(cv2.imread(
        f'../input/aptos2019-blindness-detection/test_images/{image_id}.png'
    ))

In [None]:
if TRAINING:
    y_train = pd.get_dummies(train_df['diagnosis']).values

    print(x_train.shape)
    print(y_train.shape)
print(x_test.shape)

Now we can split it into a training and validation set.

In [None]:
# x_train, x_val, y_train, y_val = train_test_split(
#     x_train, y_train_multi, 
#     test_size=0.15, 
#     random_state=2019
# )

if TRAINING:
    x_train, x_val, y_train, y_val = train_test_split(
#         x_train, y_train_multi, 
        x_train, y_train, 
        test_size=0.2
    )

## Oversampling

In [None]:
# if TRAINING:
#     x_train_0 = x_train[[y_train == 1][0][:, 0]]
#     x_train_1 = x_train[[y_train == 1][0][:, 1]]
#     x_train_2 = x_train[[y_train == 1][0][:, 2]]
#     x_train_3 = x_train[[y_train == 1][0][:, 3]]
#     x_train_4 = x_train[[y_train == 1][0][:, 4]]

In [None]:
# def oversample(x):
#     indexes_asc = np.arange(len(x))
#     indexes_ran = np.random.permutation(len(x))
    
#     return (0.5 * x[indexes_asc] + 0.5 * x[indexes_ran]).clip(0, 255).astype(np.uint8)

In [None]:
# if TRAINING:
#     x_train_1 = np.concatenate([x_train_1, oversample(x_train_1)])
#     x_train_1 = np.concatenate([x_train_1, oversample(x_train_1)])

#     x_train_2 = np.concatenate([x_train_2, oversample(x_train_2)])

#     x_train_3 = np.concatenate([x_train_3, oversample(x_train_3)])
#     x_train_3 = np.concatenate([x_train_3, oversample(x_train_3)])
#     x_train_3 = np.concatenate([x_train_3, oversample(x_train_3)])

#     x_train_4 = np.concatenate([x_train_4, oversample(x_train_4)])
#     x_train_4 = np.concatenate([x_train_4, oversample(x_train_4)])

In [None]:
# if TRAINING:
#     print(x_train_0.shape)
#     print(x_train_1.shape)
#     print(x_train_2.shape)
#     print(x_train_3.shape)
#     print(x_train_4.shape)

In [None]:
# if TRAINING:
#     y_train_0 = np.zeros((len(x_train_0), 5), dtype=np.bool)
#     y_train_0[:, 0] = True

#     y_train_1 = np.zeros((len(x_train_1), 5), dtype=np.bool)
#     y_train_1[:, 1] = True

#     y_train_2 = np.zeros((len(x_train_2), 5), dtype=np.bool)
#     y_train_2[:, 2] = True

#     y_train_3 = np.zeros((len(x_train_3), 5), dtype=np.bool)
#     y_train_3[:, 3] = True

#     y_train_4 = np.zeros((len(x_train_4), 5), dtype=np.bool)
#     y_train_4[:, 4] = True

In [None]:
# if TRAINING:
#     x_train = np.concatenate([x_train_0, x_train_1, x_train_2, x_train_3, x_train_4])
#     y_train = np.concatenate([y_train_0, y_train_1, y_train_2, y_train_3, y_train_4]).astype(np.uint8)

#     del x_train_0
#     del x_train_1
#     del x_train_2
#     del x_train_3
#     del x_train_4

#     del y_train_0
#     del y_train_1
#     del y_train_2
#     del y_train_3
#     del y_train_4

In [None]:
if TRAINING:
    indexes = np.random.permutation(len(x_train))

    x_train = x_train[indexes]
    y_train = y_train[indexes]

In [None]:
# if TRAINING:
#     print(x_train.dtype, x_train.shape)
#     print(y_train.dtype, y_train.shape)

## Creating multilabels

Instead of predicting a single label, we will change our target to be a multilabel problem; i.e., if the target is a certain class, then it encompasses all the classes before it. E.g. a class 4 retinopathy would usually be encoded as `[0, 0, 0, 0, 1]`, but in our case we will predict `[1, 1, 1, 1, 1]`. For more details, please check out [Lex's kernel](https://www.kaggle.com/lextoumbourou/blindness-detection-resnet34-ordinal-targets).

In [None]:
if TRAINING:
    # Convert the one-hot targets into cumulative multilabels, in place.
    # Walking from column 3 down to column 0, each column is OR-ed with the
    # (already updated) column to its right, so a sample whose one-hot bit is
    # in column k ends up with ones in columns 0..k.
    for i in range(3, -1, -1):
        y_train[:, i] = np.logical_or(y_train[:, i], y_train[:, i + 1])
        y_val[:, i] = np.logical_or(y_val[:, i], y_val[:, i + 1])

    # Per-column totals after the conversion.
    print("Multilabel version:", y_train.sum(axis=0))

In [None]:
if TRAINING:
    print(x_train.shape, x_train.dtype, np.min(x_train), np.max(x_train))
    print(x_val.shape, x_val.dtype, np.min(x_val), np.max(x_val))
    
print(x_test.shape, x_test.dtype, np.min(x_test), np.max(x_test))

In [None]:
# Trim a 20-pixel border from every validation and training image and resize
# the remainder back to IMG_SIZE x IMG_SIZE, overwriting the arrays in place.
# NOTE(review): the original comment said this is done "as test data", but
# x_test is not cropped in this cell — confirm the intent.
if TRAINING:
    for i in tqdm(range(len(x_val))):
        x_val[i] = cv2.resize(x_val[i, 20: -20, 20: -20, :], (IMG_SIZE, IMG_SIZE))
        
    for i in tqdm(range(len(x_train))):
        x_train[i] = cv2.resize(x_train[i, 20: -20, 20: -20, :], (IMG_SIZE, IMG_SIZE))

In [None]:
if TRAINING:
    fig, ax = plt.subplots(5, 5, figsize=(10, 10))
    indexes = np.random.randint(len(x_train), size=25)
    for i in range(5):
        for j in range(5):
            ax[i][j].imshow(x_train[indexes[i * 5 + j]])

In [None]:
if TRAINING:
    fig, ax = plt.subplots(5, 5, figsize=(10, 10))
    indexes = np.random.randint(len(x_val), size=25)
    for i in range(5):
        for j in range(5):
            ax[i][j].imshow(x_val[indexes[i * 5 + j]])

In [None]:
fig, ax = plt.subplots(5, 5, figsize=(10, 10))
indexes = np.random.randint(len(x_test), size=25)
for i in range(5):
    for j in range(5):
        ax[i][j].imshow(x_test[indexes[i * 5 + j]])

In [None]:
if TRAINING:
    print(x_train.shape, x_train.dtype, np.min(x_train), np.max(x_train))
    print(x_val.shape, x_val.dtype, np.min(x_val), np.max(x_val))
    
print(x_test.shape, x_test.dtype, np.min(x_test), np.max(x_test))

# Mixup & Data Generator

Please Note: Although I show how to construct Mixup, **it is currently unused**. Please see notice at the top of the kernel.

In [None]:
class MixupGenerator():
    """Endless batch generator that blends pairs of samples (mixup).

    Every batch is built from ``2 * batch_size`` samples: the first half is
    blended with the second half using per-sample weights drawn from a
    ``Beta(alpha, alpha)`` distribution. Labels are blended with the same
    weights.

    Args:
        X_train: Array of images with shape (samples, height, width, channels).
        y_train: Label array, or a list of label arrays for multi-output use.
        seq: Optional augmenter exposing ``augment_images``; when set, each
            blended batch is rescaled to 0-255 uint8 and passed through it.
        batch_size: Number of blended samples per yielded batch.
        alpha: Parameter of the Beta distribution the weights are drawn from.
        shuffle: Whether to shuffle the sample order on every pass.
        datagen: Optional object exposing ``random_transform`` and
            ``standardize``, applied to every blended image.
    """

    def __init__(self, X_train, y_train, seq=None, batch_size=32, alpha=0.2, shuffle=True, datagen=None):
        self.X_train = X_train
        self.y_train = y_train
        self.batch_size = batch_size
        self.alpha = alpha
        self.shuffle = shuffle
        self.sample_num = len(X_train)
        self.datagen = datagen
        self.seq = seq

    def __call__(self):
        """Yield ``(X, y)`` batches forever, re-drawing the order each pass."""
        pair_size = self.batch_size * 2

        while True:
            indexes = self.__get_exploration_order()
            # Samples that do not fill a complete pair of half-batches are
            # skipped for this pass.
            itr_num = len(indexes) // pair_size

            for i in range(itr_num):
                batch_ids = indexes[i * pair_size:(i + 1) * pair_size]
                X, y = self.__data_generation(batch_ids)

                yield X, y

    def __get_exploration_order(self):
        """Return all sample indices, shuffled when ``shuffle`` is set."""
        indexes = np.arange(self.sample_num)

        if self.shuffle:
            np.random.shuffle(indexes)

        return indexes

    def __data_generation(self, batch_ids):
        """Blend the two halves of ``batch_ids`` into one ``(X, y)`` batch."""
        lam = np.random.beta(self.alpha, self.alpha, self.batch_size)
        # Reshape the weights so they broadcast over images and label rows.
        X_l = lam.reshape(self.batch_size, 1, 1, 1)
        y_l = lam.reshape(self.batch_size, 1)

        first_ids = batch_ids[:self.batch_size]
        second_ids = batch_ids[self.batch_size:]

        X1 = self.X_train[first_ids]
        X2 = self.X_train[second_ids]
        X = X1 * X_l + X2 * (1 - X_l)

        if self.datagen:
            for i in range(self.batch_size):
                X[i] = self.datagen.random_transform(X[i])
                X[i] = self.datagen.standardize(X[i])

        if self.seq:
            # Rescale the blended batch to 0-255 and cast to uint8 before
            # handing it to the augmenter.
            X = X.astype(np.float32)
            peak = np.max(X)
            if peak > 0:  # an all-zero batch would otherwise divide by zero
                X /= peak
            X *= 255
            X = X.astype(np.uint8)
            X = self.seq.augment_images(X)

        if isinstance(self.y_train, list):
            y = []

            for y_train_ in self.y_train:
                y1 = y_train_[first_ids]
                y2 = y_train_[second_ids]
                y.append(y1 * y_l + y2 * (1 - y_l))
        else:
            y1 = self.y_train[first_ids]
            y2 = self.y_train[second_ids]
            y = y1 * y_l + y2 * (1 - y_l)

        return X, y

    def get_sample(self):
        """Return a single blended ``(X, y)`` batch.

        The batch is built from the first ``2 * batch_size`` indices of a
        freshly drawn exploration order.

        Raises:
            ValueError: If there are fewer than ``2 * batch_size`` samples.
        """
        pair_size = self.batch_size * 2
        if self.sample_num < pair_size:
            raise ValueError(
                f"need at least {pair_size} samples to build a batch, "
                f"got {self.sample_num}"
            )

        batch_ids = self.__get_exploration_order()[:pair_size]
        return self.__data_generation(batch_ids)

In [None]:
# Wrap an augmenter so that it is applied with probability 0.5.
sometimes = lambda aug: iaa.Sometimes(0.5, aug)
# imgaug pipeline used for training-time augmentation: flips, a mild affine
# transform, and a random subset of weaker pixel-level / geometric augmenters,
# all applied in random order.
seq = iaa.Sequential(
        [
            # apply the following augmenters to most images
            iaa.Fliplr(0.5), # horizontally flip 50% of all images
            iaa.Flipud(0.2), # vertically flip 20% of all images
            sometimes(iaa.Affine(
                scale={"x": (0.9, 1.1), "y": (0.9, 1.1)}, # scale images to 90-110% of their size, individually per axis
                translate_percent={"x": (-0.1, 0.1), "y": (-0.1, 0.1)}, # translate by -10 to +10 percent (per axis)
                rotate=(-10, 10), # rotate by -10 to +10 degrees
                shear=(-5, 5), # shear by -5 to +5 degrees
                order=[0, 1], # use nearest neighbour or bilinear interpolation (fast)
                cval=(0, 255), # if mode is constant, use a cval between 0 and 255
                mode=ia.ALL # use any of scikit-image's warping modes
            )),
            # execute 0 to 5 of the following (less important) augmenters per image
            # don't execute all of them, as that would often be way too strong
            iaa.SomeOf((0, 5),
                [
                    sometimes(iaa.Superpixels(p_replace=(0, 1.0), n_segments=(20, 200))), # convert images into their superpixel representation
                    iaa.OneOf([
                        iaa.GaussianBlur((0, 1.0)), # blur images with a sigma between 0 and 1.0
                        iaa.AverageBlur(k=(3, 5)), # blur image using local means with kernel sizes between 3 and 5
                        iaa.MedianBlur(k=(3, 5)), # blur image using local medians with kernel sizes between 3 and 5
                    ]),
                    iaa.Sharpen(alpha=(0, 1.0), lightness=(0.9, 1.1)), # sharpen images
                    iaa.Emboss(alpha=(0, 1.0), strength=(0, 2.0)), # emboss images
                    # search either for all edges or for directed edges,
                    # blend the result with the original image using a blobby mask
                    iaa.SimplexNoiseAlpha(iaa.OneOf([
                        iaa.EdgeDetect(alpha=(0.5, 1.0)),
                        iaa.DirectedEdgeDetect(alpha=(0.5, 1.0), direction=(0.0, 1.0)),
                    ])),
                    iaa.AdditiveGaussianNoise(loc=0, scale=(0.0, 0.01*255), per_channel=0.5), # add gaussian noise to images
                    iaa.OneOf([
                        iaa.Dropout((0.01, 0.05), per_channel=0.2), # randomly remove 1% to 5% of the pixels
                        iaa.CoarseDropout((0.01, 0.03), size_percent=(0.01, 0.02), per_channel=0.2),
                    ]),
                    iaa.Invert(0.01, per_channel=True), # invert color channels
                    iaa.Add((-2, 2), per_channel=0.5), # change brightness of images (add a value between -2 and 2)
                    iaa.AddToHueAndSaturation((-1, 1)), # change hue and saturation
                    # either change the brightness of the whole image (sometimes
                    # per channel) or change the brightness of subareas
                    iaa.OneOf([
                        iaa.Multiply((0.9, 1.1), per_channel=0.5),
                        iaa.FrequencyNoiseAlpha(
                            exponent=(-1, 0),
                            first=iaa.Multiply((0.9, 1.1), per_channel=True),
                            second=iaa.ContrastNormalization((0.9, 1.1))
                        )
                    ]),
                    sometimes(iaa.ElasticTransformation(alpha=(0.5, 3.5), sigma=0.25)), # move pixels locally around (with random strengths)
                    sometimes(iaa.PiecewiseAffine(scale=(0.01, 0.05))), # sometimes move parts of the image around
                    sometimes(iaa.PerspectiveTransform(scale=(0.01, 0.1)))
                ],
                random_order=True
            )
        ],
        random_order=True)

In [None]:
BATCH_SIZE = 32

def create_datagen():
    """Build an ``ImageDataGenerator`` that runs the ``seq`` pipeline per image.

    No built-in generator transforms are configured; all augmentation comes
    from ``seq.augment_image``, passed as the preprocessing function.
    """
    augment_one = seq.augment_image
    return ImageDataGenerator(preprocessing_function=augment_one)
if TRAINING:
    # Using original generator
    data_generator = create_datagen().flow(x_train, y_train, batch_size=BATCH_SIZE)
    # Using Mixup
    # mixup_generator = MixupGenerator(x_train, y_train, batch_size=BATCH_SIZE, alpha=0.2, datagen=create_datagen())()
    mixup_generator = MixupGenerator(x_train, y_train, seq=seq, batch_size=BATCH_SIZE, alpha=0.2)()

In [None]:
if TRAINING:
    fig, ax = plt.subplots(5, 5, figsize=(15, 15))
    indexes = np.random.randint(len(x_train), size=25)
    for i in range(5):
        for j in range(5):
            ax[i][j].imshow(x_train[indexes[i * 5 + j]])

In [None]:
# Image and its augmentations
if TRAINING:
    index = np.random.randint(len(x_train), size=1)[0]

    fig, ax = plt.subplots(1, 5, figsize=(15, 3))
    for i in range(5):
        if i == 0:
            ax[i].imshow(x_train[index])
            ax[i].axis("off")
        else:
#             img = x_train[index] * 255
#             img = img.astype(np.uint8)
            img = seq.augment_image(x_train[index])
            ax[i].imshow(img)
            ax[i].axis("off")

# Quadratic Weighted Kappa

Quadratic Weighted Kappa (QWK, written with the Greek letter $\kappa$), a weighted variant of Cohen's Kappa, is the official evaluation metric. For our kernel, we will use a custom callback to monitor the score, and plot it at the end.

### What is Cohen Kappa?

According to the [wikipedia article](https://en.wikipedia.org/wiki/Cohen%27s_kappa), we have
> The definition of $\kappa$ is:
> $$\kappa \equiv \frac{p_o - p_e}{1 - p_e}$$
> where $p_o$ is the relative observed agreement among raters (identical to accuracy), and $p_e$ is the hypothetical probability of chance agreement, using the observed data to calculate the probabilities of each observer randomly seeing each category.

### How is it computed?

Let's take the example of a binary classification problem. Say we have:

In [None]:
true_labels = np.array([1, 0, 1, 1, 0, 1])
pred_labels = np.array([1, 0, 0, 0, 0, 1])

We can construct the following table:

| true | pred | agreement      |
|------|------|----------------|
| 1    | 1    | true positive  |
| 0    | 0    | true negative  |
| 1    | 0    | false negative |
| 1    | 0    | false negative |
| 0    | 0    | true negative  |
| 1    | 1    | true positive  |


Then the "observed proportionate agreement" is calculated exactly the same way as accuracy:

$$
p_o = acc = \frac{tp + tn}{all} = \frac{2 + 2}{6} = 0.666
$$

This can be confirmed using scikit-learn:

In [None]:
accuracy_score(true_labels, pred_labels)

Additionally, we also need to compute `p_e`:

$$p_{yes} = \frac{tp + fp}{all} \frac{tp + fn}{all} = \frac{2}{6} \frac{4}{6} = 0.222$$

$$p_{no} = \frac{fn + tn}{all} \frac{fp + tn}{all} = \frac{4}{6} \frac{2}{6} = 0.222$$

$$p_{e} = p_{yes} + p_{no} = 0.222 + 0.222 = 0.444$$

Finally,

$$
\kappa = \frac{p_o - p_e}{1-p_e} = \frac{0.666 - 0.444}{1 - 0.444} = 0.4
$$

Let's verify with scikit-learn:

In [None]:
cohen_kappa_score(true_labels, pred_labels)

### What is the weighted kappa?

The Wikipedia page offers a very concise explanation: 
> The weighted kappa allows disagreements to be weighted differently and is especially useful when **codes are ordered**. Three matrices are involved, the matrix of observed scores, the matrix of expected scores based on chance agreement, and the weight matrix. Weight matrix cells located on the diagonal (upper-left to bottom-right) represent agreement and thus contain zeros. Off-diagonal cells contain weights indicating the seriousness of that disagreement.

Simply put, if two scores disagree, then the penalty will depend on how far apart they are. That means that our score will be higher if (a) the real value is 4 but the model predicts a 3, and the score will be lower if (b) the model instead predicts a 0. This metric makes sense for this competition, since the labels 0-4 indicate how severe the illness is. Intuitively, a model that predicts a severe retinopathy (3) when it is in reality a proliferative retinopathy (4) is probably better than a model that predicts a mild retinopathy (1).

### Creating keras callback for QWK

In [None]:
class Metrics(Callback):
    """Callback that tracks validation quadratic weighted kappa per epoch.

    After every epoch the validation targets and the thresholded model
    predictions are decoded into integer grades, their quadratic weighted
    kappa is appended to ``val_kappas``, and the model is saved to
    ``model.h5`` whenever the new score is the best seen so far (ties
    included).
    """

    @staticmethod
    def _to_grade(multilabel):
        """Decode cumulative multilabel rows into integer grades 0..4.

        Rows are cast to a signed integer type before summing, so that
        subtracting one cannot wrap around on unsigned or boolean input.
        """
        return (np.asarray(multilabel).astype(int).sum(axis=1) - 1).clip(0, 4)

    def on_train_begin(self, logs=None):
        self.val_kappas = []

    def on_epoch_end(self, epoch, logs=None):
        X_val, y_val = self.validation_data[:2]
        y_true = self._to_grade(y_val)
        # An output counts as "on" when its predicted value exceeds 0.5.
        y_pred = self._to_grade(self.model.predict(X_val) > 0.5)

        val_kappa = cohen_kappa_score(
            y_true,
            y_pred,
            weights='quadratic'
        )

        self.val_kappas.append(val_kappa)

        print(f"val_kappa: {val_kappa:.4f}")

        # The new score is already in the list, so equality with the maximum
        # means "at least as good as every previous epoch".
        if val_kappa == max(self.val_kappas):
            print("Validation Kappa has improved. Saving model.")
            self.model.save('model.h5')

# Model: Xception

In [None]:
# Build the Xception backbone without its classification head. Weights are
# not requested from Keras (weights=None); they are loaded from a local
# file in the input directory right below.
base_model = Xception(
    weights=None, #'imagenet',
    include_top=False,
    input_shape=(224,224,3)
)

base_model.load_weights("../input/keras-pretrained-models/xception_weights_tf_dim_ordering_tf_kernels_notop.h5")

In [None]:
def build_model():
    """Assemble and compile the classifier on top of ``base_model``.

    The backbone is followed by global average pooling, 50% dropout and a
    5-unit sigmoid layer. The model is compiled with binary cross-entropy,
    Adam (learning rate 5e-5) and the accuracy metric.

    Returns:
        The compiled ``Sequential`` model.
    """
    stack = [
        base_model,
        layers.GlobalAveragePooling2D(),
        layers.Dropout(0.5),
        layers.Dense(5, activation='sigmoid'),
    ]

    model = Sequential()
    for layer in stack:
        model.add(layer)

    optimizer = Adam(lr=0.00005)
    model.compile(
        loss='binary_crossentropy',
        optimizer=optimizer,
        metrics=['accuracy']
    )

    return model

In [None]:
model = build_model()
model.summary()

# Training & Evaluation

In [None]:
if TRAINING:
    kappa_metrics = Metrics()
    # Halve the learning rate after 4 epochs without improvement and stop
    # training after 10; neither callback overrides its default monitored
    # quantity.
    rlr = callbacks.ReduceLROnPlateau(factor=0.5, patience=4, verbose=1)
    es  = callbacks.EarlyStopping(patience=10, verbose=1, mode="min")

    history = model.fit_generator(
        data_generator,
        # A whole number of batches that covers every training sample once
        # (the last batch may be partial).
        steps_per_epoch=math.ceil(x_train.shape[0] / BATCH_SIZE),
        epochs=200,
        validation_data=(x_val, y_val),
        callbacks=[kappa_metrics, rlr, es]
    )

In [None]:
if TRAINING:
    # Write the training history as real JSON (a mapping of metric name to a
    # list of numbers). Values are converted with float() because they may
    # be NumPy scalars, which the json module cannot serialise.
    with open('history.json', 'w') as f:
        json.dump(
            {
                name: [float(value) for value in values]
                for name, values in history.history.items()
            },
            f,
        )

    history_df = pd.DataFrame(history.history)
    history_df[['loss', 'val_loss']].plot()
    history_df[['acc', 'val_acc']].plot()

In [None]:
if TRAINING:
    plt.plot(kappa_metrics.val_kappas)

## Find best threshold

Please Note: Although I show how to construct a threshold optimizer, **it is currently unused**. Please see notice at the top of the kernel.

In [None]:
# if TRAINING:
#     model.load_weights('model.h5')
#     y_val_pred = model.predict(x_val)

#     def compute_score_inv(threshold):
#         y1 = y_val_pred > threshold
#         y1 = y1.astype(int).sum(axis=1) - 1
#         y2 = y_val.sum(axis=1) - 1
#         score = cohen_kappa_score(y1, y2, weights='quadratic')

#         return 1 - score

#     simplex = scipy.optimize.minimize(
#         compute_score_inv, 0.5, method='nelder-mead'
#     )

#     best_threshold = simplex['x'][0]
    
#     print(best_threshold)

## Submit

In [None]:
# if x_test.dtype == np.uint8:
#     x_test = x_test.astype(np.float32)
    
# x_test = x_test / np.max(x_test)

In [None]:
# Use the checkpoint written during this run when training; otherwise load
# a previously saved one from the input directory.
if TRAINING:
    weights_path = 'model.h5'
else:
    weights_path = '../input/weights/model.h5'
model.load_weights(weights_path)

# Decode the model outputs into a grade: count the outputs above 0.5,
# subtract one, and clamp the result to 0..4.
y_test = model.predict(x_test) > 0.5
y_test = (y_test.astype(int).sum(axis=1) - 1).clip(0, 4)
# y_test = np.argmax(model.predict(x_test), axis=1).astype(int)

# Store the predicted grades and write the submission file.
test_df['diagnosis'] = y_test
test_df.to_csv('submission.csv',index=False)

In [None]:
test_df.hist()