## Classifying diabetic retinopathy

In [None]:
# Put these at the top of every notebook, to get automatic reloading and inline plotting
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
from fastai.imports import *
from fastai.transforms import *
from fastai.conv_learner import *

from fastai.dataset import *
from fastai.sgdr import *
from fastai.plots import *

In [None]:
# Dataset location plus the scratch directories handed to the learners below
# as `tmp_name` / `models_name`.
# NOTE(review): PATH is not referenced by the visible cells (the loading cell
# rebuilds the same directory as `base_image_dir`) - confirm it is still needed.
PATH = "../input/diabetic-retinopathy-detection/"
TMP_PATH = "/tmp/tmp"
MODEL_PATH = "/tmp/model/"
sz=224  # image size given to tfms_from_model / get_data

arch=resnet34  # architecture given to tfms_from_model and ConvLearner.pretrained
# sz = 64 was considered ("because medical images") but the author is no longer
# sure about it, so 224 stays in effect.
bs = 4  # small batch size because the sample is very limited


In [None]:
print('Make sure cuda is installed:', torch.cuda.is_available())
print('Make sure cudnn is enabled:', torch.backends.cudnn.enabled)

## First look at DR pictures

In [None]:
# Read the label file and derive one row per labelled image.
base_image_dir = os.path.join('..', 'input', 'diabetic-retinopathy-detection')
labels_csv = os.path.join(base_image_dir, 'trainLabels.csv')
retina_df = pd.read_csv(labels_csv)

# Image ids are split on '_': the first token becomes the patient id and the
# last token decides the eye flag (1 when it is 'left', 0 otherwise).
id_tokens = [image_id.split('_') for image_id in retina_df['image']]
retina_df['PatientId'] = [tokens[0] for tokens in id_tokens]
retina_df['path'] = [os.path.join(base_image_dir, '{}.jpeg'.format(image_id))
                     for image_id in retina_df['image']]
retina_df['exists'] = [os.path.exists(image_path) for image_path in retina_df['path']]
print(retina_df['exists'].sum(), 'images found of', retina_df.shape[0], 'total')
retina_df['eye'] = [1 if tokens[-1] == 'left' else 0 for tokens in id_tokens]

# Keep only complete rows whose image file is present on disk.
retina_df = retina_df.dropna()
retina_df = retina_df[retina_df['exists']]
retina_df.sample(3)

# Examine the distribution of eye and severity

In [None]:
retina_df[['level', 'eye']].hist(figsize = (10, 5))

# Check the number of images in each class

In [None]:
# Keep only the columns used from here on; no duplicate rows are expected.
kept_columns = ['PatientId', 'level', 'eye', 'path']
retina_df = retina_df[kept_columns].drop_duplicates()

# Number of rows per severity level, largest class first.
level_counts = retina_df.pivot_table(index='level', aggfunc=len)
level_counts.sort_values('PatientId', ascending=False)

# Optional: only keep images of type 0 and 2 (2 being the second most present class in this sample)

In [None]:
# retina_df = retina_df.drop(retina_df[[(x in [1, 3, 4]) for x in retina_df.level]].index)

# Balance the distribution based on the smallest set

In [None]:
def balance_data(class_size, random_state=None):
    """Resample `retina_df` so that every severity level has `class_size` rows.

    Rows are drawn *with replacement* inside each `level` group, so a level
    with fewer than `class_size` rows is over-sampled (rows are repeated) and a
    larger level is down-sampled. A histogram of `level` and `eye` for the
    resampled frame is drawn as a side effect.

    Args:
        class_size: number of rows to draw for each level.
        random_state: optional seed forwarded to `DataFrame.sample`. Pass an
            int to get the same sample after a kernel restart. The default
            (None) keeps the previous, unseeded behaviour.

    Returns:
        A new DataFrame with a fresh 0..n-1 index; `retina_df` is not modified.

    NOTE(review): because rows can be repeated, the same image may later end up
    on both sides of a random train/validation split - verify how the split is
    made before trusting validation metrics.
    """
    train_df = (
        retina_df
        .groupby(['level'])
        .apply(lambda grp: grp.sample(class_size, replace = True, random_state = random_state))
        .reset_index(drop = True)
    )
    print('New Data Size:', train_df.shape[0], 'Old Size:', retina_df.shape[0])
    train_df[['level', 'eye']].hist(figsize = (10, 5))
    return train_df

train_df = balance_data(148)

In [None]:
fnames = train_df['path'].values
labels = train_df['level'].values

# Making sure fnames and labels are in order

In [None]:
test_label = train_df.level.unique()[-1]

# To shuffle rows:
# train_df = train_df.sample(frac=1).reset_index(drop=True)

# Use the *position* of the first row carrying `test_label`, so the very same
# row is checked in `train_df`, `labels` and `fnames`. Looking the row up again
# by PatientId could land on another row of that patient (other eye, or a
# repeated sample) and only worked by accident of row ordering.
patient_example_index = np.flatnonzero(train_df['level'].values == test_label)[0]
patient_example = train_df.iloc[patient_example_index]
print(patient_example)
assert labels[patient_example_index] == test_label, f"Check that the label of row {patient_example_index} is equal to {test_label}"
assert fnames[patient_example_index] == patient_example['path'], f"Check that the file name of row {patient_example_index} matches train_df"

img = plt.imread(f'{fnames[patient_example_index]}')
plt.imshow(img);

In [None]:
img.shape

## Exploring our dataset images size

In [None]:
data = ImageClassifierData.from_names_and_array(
    path='./', 
    fnames=fnames, 
    y=labels, 
    classes=sorted(retina_df.level.unique()), 
    test_name=None, 
    tfms=tfms_from_model(arch, sz)
)

In [None]:
img_name = data.trn_ds.fnames[0]; img_name

In [None]:
img = PIL.Image.open(img_name); img

In [None]:
img.size

In [None]:
# Map every training file name to the `.size` tuple PIL reports for it, then
# split the tuples into two arrays (first components, second components).
# NOTE(review): PIL's `.size` is (width, height), so `row_sz` presumably holds
# widths and `col_sz` heights - confirm before reading the histograms below.
size_d = {k: PIL.Image.open(k).size for k in data.trn_ds.fnames}
row_sz, col_sz = list(zip(*size_d.values()))
row_sz = np.array(row_sz); col_sz = np.array(col_sz)

In [None]:
plt.hist(row_sz)

In [None]:
plt.hist(col_sz[col_sz < 2000])

We can't really balance the size of our dataset by down-sampling, because almost all images are very large. Instead, we are going to resize our images.

In [None]:
def get_data(sz, bs=4): # sz: image size, bs: batch size
    """Build the `ImageClassifierData` for the balanced sample (`fnames` / `labels`).

    If the training split would leave a remainder of exactly one sample for the
    given batch size, trailing samples are dropped (a few at most) until it no
    longer does.

    Args:
        sz: image size passed to `tfms_from_model`.
        bs: batch size passed to `ImageClassifierData.from_names_and_array`.

    Returns:
        The `ImageClassifierData` object.

    Raises:
        AssertionError: if a last batch of size 1 could not be avoided.
    """
    tfms = tfms_from_model(arch, sz)
    classes = sorted(retina_df.level.unique())

    def _build(n_drop):
        # Single place that builds the data object, ignoring the last `n_drop` samples.
        keep = len(fnames) - n_drop
        return ImageClassifierData.from_names_and_array(
            path='./',
            fnames=fnames[:keep],
            y=labels[:keep],
            classes=classes,
            test_name=None,
            tfms=tfms,
            bs=bs
        )

    # Dropping exactly one sample is not always enough: the library derives the
    # validation split from the sample count, so the training-set size may stay
    # the same after one removal (presumably a 20% split - TODO confirm).
    # Retry with a few more dropped samples before giving up.
    max_drop = max(0, min(5, len(fnames) - 1))
    for n_drop in range(max_drop + 1):
        data = _build(n_drop)
        if len(data.trn_ds) % bs != 1:
            break
    assert len(data.trn_ds) % bs != 1, 'This condition makes sure that we never have a batch size of 1, which could cause issues with lr_find for instance.'
    return data

In [None]:
# Use the batch size from the configuration cell instead of repeating the literal.
data = get_data(sz=sz, bs=bs)

In [None]:
from sklearn.metrics import cohen_kappa_score
from fastai.metrics import accuracy, recall, precision, fbeta

def _to_numpy(x):
    """Best-effort conversion of a tensor or array-like to a NumPy array."""
    # Tensors are detached / moved to the CPU when they offer those methods;
    # NumPy arrays and lists have none of them and go straight to np.asarray.
    for attr in ('detach', 'cpu', 'numpy'):
        if hasattr(x, attr):
            x = getattr(x, attr)()
    return np.asarray(x)

def quadratic_kappa(y_hat, y):
    """Quadratic-weighted Cohen's kappa between predictions and targets.

    Args:
        y_hat: either class labels (1-D) or per-class scores such as log
            probabilities (2-D, one row per sample), in which case the arg-max
            of each row is used as the predicted label.
        y: target class labels.

    Returns:
        The value returned by `cohen_kappa_score(..., weights='quadratic')`.

    `cohen_kappa_score` compares two label vectors, so per-class scores have to
    be reduced to labels first; without that the function cannot be used as a
    metric for `learn.fit`.
    NOTE(review): assumes metrics receive tensors or arrays - confirm against
    the fastai version in use.
    """
    y_hat, y = _to_numpy(y_hat), _to_numpy(y)
    if y_hat.ndim > 1:
        y_hat = y_hat.argmax(axis=1)
    return cohen_kappa_score(y_hat, y, weights='quadratic')

def f2(log_preds, targs):
    """F-beta score with beta=2, delegating to fastai's `fbeta`.

    NOTE(review): check that `fbeta` is meaningful for single-label targets
    with more than two classes before relying on this number.
    """
    return fbeta(log_preds, targs, 2)

In [None]:
print(f'Sample classes: {retina_df.level.unique()}')

learn = ConvLearner.pretrained(arch, data, precompute=True, tmp_name=TMP_PATH, models_name=MODEL_PATH)

In [None]:
learn.fit(1e-2, 3, metrics=[accuracy, recall, precision, f2])
# learn.fit(1e-2, 3, metrics=[quadratic_kappa])

In [None]:
def get_80percent_accuracy_with_sample_bias():
    """Build a learner on the *unbalanced* `retina_df` sample.

    Unlike `get_data`, this uses every row of `retina_df` (minus the last one)
    rather than the class-balanced `train_df`, and enables top-down
    augmentation. The per-level row counts are printed so the imbalance is
    visible next to the resulting metrics.

    Returns:
        A `ConvLearner` created with `precompute=True`.

    NOTE(review): the dataset built here is local to this function. The global
    `data` still refers to the balanced dataset, so cells that compare this
    learner's predictions with `data.val_y` are looking at a different
    validation set - verify before interpreting them.
    """
    # `.values` rather than the deprecated `.as_matrix()`, matching how
    # `fnames` / `labels` are extracted from `train_df` earlier on.
    # NOTE(review): the last row is dropped, presumably to avoid a trailing
    # batch of one sample - confirm.
    fnames2 = retina_df['path'].values[:-1]
    labels2 = retina_df['level'].values[:-1]

    biased_data = ImageClassifierData.from_names_and_array(
        path='./',
        fnames=fnames2,
        y=labels2,
        classes=sorted(retina_df.level.unique()),
        test_name=None,
        tfms=tfms_from_model(arch, sz, aug_tfms=transforms_top_down, max_zoom=1.05)
    )

    print(retina_df.pivot_table(index='level', aggfunc=len).sort_values('PatientId', ascending=False))

    return ConvLearner.pretrained(arch, biased_data, precompute=True, tmp_name=TMP_PATH, models_name=MODEL_PATH)

learn = get_80percent_accuracy_with_sample_bias()

In [None]:
learn.fit(0.01, 2, metrics=[accuracy, recall, precision, f2])

## Analyzing results: looking at pictures

As well as looking at the overall metrics, it's also a good idea to look at examples of each of:
1. A few correct labels at random
2. A few incorrect labels at random
3. The most correct labels of each class (i.e. those with highest probability that are correct)
4. The most incorrect labels of each class (i.e. those with highest probability that are incorrect)
5. The most uncertain labels (i.e. those with probability closest to 0.5).

In [None]:
# This is the label for a val data
data.val_y

In [None]:
# The class labels of the dataset: the sorted retinopathy levels that were
# passed as `classes=` when `data` was built (there are no cats or dogs here).
data.classes

In [None]:
# this gives prediction for validation set. Predictions are in log scale
log_preds = learn.predict()
log_preds.shape

In [None]:
log_preds[:10]

In [None]:
preds = np.argmax(log_preds, axis=1)  # index of the highest-scoring class for each row
probs = np.exp(log_preds[:,1])        # exp of column 1 only, i.e. the class at index 1 (not "pr(dog)")

In [None]:
def rand_by_mask(mask, n=4):
    """Pick up to `n` distinct random indices at which `mask` is true.

    Args:
        mask: boolean array.
        n: maximum number of indices to return (default 4).

    Returns:
        An array of indices; shorter than `n` (possibly empty) when fewer than
        `n` entries of `mask` are true, instead of raising.
    """
    idxs = np.where(mask)[0]
    if len(idxs) == 0:
        return idxs
    # Sampling without replacement cannot return more items than are available.
    return np.random.choice(idxs, min(n, len(idxs)), replace=False)

def rand_by_correct(is_correct):
    """Random validation indices whose prediction is correct (or incorrect).

    Reads the globals `preds` and `data.val_y`.
    NOTE(review): both must describe the same validation set - `learn` is
    rebuilt from a different dataset by get_80percent_accuracy_with_sample_bias(),
    so verify they still line up.
    """
    return rand_by_mask((preds == data.val_y)==is_correct)

In [None]:
def plots(ims, figsize=(12,6), rows=1, titles=None):
    """Draw `ims` on a grid with `rows` rows inside one new figure.

    Args:
        ims: sequence of images accepted by `imshow`.
        figsize: figure size passed to `plt.figure`.
        rows: number of grid rows.
        titles: optional sequence with one title per image.
    """
    f = plt.figure(figsize=figsize)
    # Ceiling division so the grid always has room for every image, even when
    # the number of images is not a multiple of `rows`.
    cols = (len(ims) + rows - 1) // rows
    for i, im in enumerate(ims):
        sp = f.add_subplot(rows, cols, i+1)
        sp.axis('Off')
        if titles is not None: sp.set_title(titles[i], fontsize=16)
        # Draw on the subplot that was just created rather than on pyplot's
        # implicit "current axes".
        sp.imshow(im)

In [None]:
def load_img_id(ds, idx):
    """Open the file `ds.fnames[idx]` with PIL and return it as a NumPy array."""
    img_path = ds.fnames[idx]
    return np.array(PIL.Image.open(img_path))

def plot_val_with_title(idxs, title):
    """Print `title`, then plot the validation images at `idxs` in one row.

    Each image is titled with its entry of the global `probs`.
    """
    val_ds = data.val_ds
    imgs = []
    for i in idxs:
        imgs.append(load_img_id(val_ds, i))
    title_probs = []
    for i in idxs:
        title_probs.append(probs[i])
    print(title)
    return plots(imgs, rows=1, titles=title_probs, figsize=(16,8))

In [None]:
# 1. A few correct labels at random
plot_val_with_title(rand_by_correct(True), "Correctly classified")

In [None]:
# 2. A few incorrect labels at random
plot_val_with_title(rand_by_correct(False), "Incorrectly classified")

In [None]:
def most_by_mask(mask, mult):
    """Up to four indices selected by `mask`, ordered by ascending `mult * probs`."""
    candidates = np.where(mask)[0]
    order = np.argsort(mult * probs[candidates])
    return candidates[order[:4]]

def most_by_correct(y, is_correct):
    """Top four validation rows of true class `y` that were (in)correctly predicted.

    Reads the globals `preds`, `probs` and `data.val_y`.
    NOTE(review): the ranking uses the single `probs` column and the `y==1`
    test below, which looks tailored to a two-class problem - confirm it is
    meaningful for the classes used here.
    """
    # Descending `probs` when (y == 1) agrees with `is_correct`, ascending otherwise.
    if (y==1) == is_correct:
        mult = -1
    else:
        mult = 1
    hit = (preds == data.val_y)
    selected = (hit == is_correct) & (data.val_y == y)
    return most_by_mask(selected, mult)

In [None]:
plot_val_with_title(most_by_correct(0, True), "Most correct 0")

In [None]:
plot_val_with_title(most_by_correct(2, True), "Most correct 2")

In [None]:
plot_val_with_title(most_by_correct(0, False), "Most incorrect 0")

In [None]:
plot_val_with_title(most_by_correct(2, False), "Most incorrect 2")

In [None]:
# Indices of the four rows whose `probs` value is closest to 0.5.
# NOTE(review): `probs` holds a single class column, so "closest to 0.5" is a
# two-class notion of uncertainty - confirm it is what is wanted here.
most_uncertain = np.argsort(np.abs(probs -0.5))[:4]
plot_val_with_title(most_uncertain, "Most uncertain predictions")

## Choosing a learning rate

The *learning rate* determines how quickly or how slowly you want to update the *weights* (or *parameters*). Learning rate is one of the most difficult parameters to set, because it significantly affects model performance.

The method `learn.lr_find()` helps you find an optimal learning rate. It uses the technique developed in the 2015 paper [Cyclical Learning Rates for Training Neural Networks](http://arxiv.org/abs/1506.01186), where we simply keep increasing the learning rate from a very small value, until the loss stops decreasing. We can plot the learning rate across batches to see what this looks like.

We first create a new learner, since we want to know how to set the learning rate for a new (untrained) model.

In [None]:
# Reuse the image size and batch size from the configuration cell instead of
# repeating the literals 224 and 4.
data = get_data(sz=sz, bs=bs)

In [None]:
learn = ConvLearner.pretrained(arch, data, precompute=True, tmp_name=TMP_PATH, models_name=MODEL_PATH)

In [None]:
lrf=learn.lr_find(start_lr=1e-7, end_lr=1e-1)

We can see the plot of loss versus learning rate to see where our loss stops decreasing:

In [None]:
learn.sched.plot()

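Pick the highest learning rate at which the loss is still clearly improving. (This text cites lr=1e-2 (0.01), but the next section sets `lr = 3e-3`; check the plot above to decide which value applies to this dataset.) Note that the optimal learning rate can change as we train the model, so you may want to re-run this function from time to time.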

## Improving our model

In [None]:
lr = 3e-3

In [None]:
learn = ConvLearner.pretrained(arch, data, precompute=True, tmp_name=TMP_PATH, models_name=MODEL_PATH)

In [None]:
learn.fit(lr, 3, cycle_len=1, cycle_mult=2)

In [None]:
lrs = np.array([lr/9,lr/3,lr])

In [None]:
# Fine-tune the whole network. Precomputed activations have to be switched off
# first, otherwise only the head is trained and unfreezing the earlier layers
# has no effect. The differential learning rates in `lrs` (defined in the
# previous cell) give the earlier layer groups smaller steps than the head.
learn.precompute = False
learn.unfreeze()
learn.fit(lrs, 3, cycle_len=1, cycle_mult=2)

By default when we create a learner, it sets all but the last layer to *frozen*. That means that it's still only updating the weights in the last layer when we call `fit`.

What is that `cycle_len` parameter? What we've done here is used a technique called *stochastic gradient descent with restarts (SGDR)*, a variant of *learning rate annealing*, which gradually decreases the learning rate as training progresses. This is helpful because as we get closer to the optimal weights, we want to take smaller steps.

In [None]:
learn.sched.plot_lr()

Our validation loss isn't improving much, so there's probably no point further training the last layer on its own.

Since we've got a pretty good model at this point, we might want to save it so we can load it again later without training it from scratch.

In [None]:
learn.save('224')

In [None]:
learn.load('224')

There is something else we can do with data augmentation: use it at *inference* time (also known as *test* time). Not surprisingly, this is known as *test time augmentation*, or just *TTA*.

TTA simply makes predictions not just on the images in your validation set, but also makes predictions on a number of randomly augmented versions of them too (by default, it uses the original image along with 4 randomly augmented versions). It then takes the average prediction from these images, and uses that. To use TTA on the validation set, we can use the learner's `TTA()` method.

In [None]:
log_preds,y = learn.TTA()
probs = np.mean(np.exp(log_preds),0)

In [None]:
accuracy_np(probs, y)

I generally see about a 10-20% reduction in error on this dataset when using TTA at this point, which is an amazing result for such a quick and easy technique!

## Analyzing results

### Confusion matrix 

In [None]:
# Predicted class per row, taken before `probs` is narrowed below.
preds = np.argmax(probs, axis=1)
# Keep only column 1 (the class at index 1) for the plotting helpers.
# NOTE(review): this rebinds `probs`, so the cell cannot simply be re-run
# without first re-running the TTA cell - verify.
probs = probs[:,1]

A common way to analyze the result of a classification model is to use a [confusion matrix](http://www.dataschool.io/simple-guide-to-confusion-matrix-terminology/). Scikit-learn has a convenient function we can use for this purpose:

In [None]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y, preds)
cm

We can just print out the confusion matrix, or we can show a graphical view (which is mainly useful for dependent variables with a larger number of categories).

In [None]:
plot_confusion_matrix(cm, data.classes)

### Looking at pictures again

In [None]:
plot_val_with_title(most_by_correct(0, False), "Most incorrect 0")

In [None]:
plot_val_with_title(most_by_correct(1, False), "Most incorrect 1")

## Review: easy steps to train a world-class image classifier

1. precompute=True
1. Use `lr_find()` to find highest learning rate where loss is still clearly improving
1. Train last layer from precomputed activations for 1-2 epochs
1. Train last layer with data augmentation (i.e. precompute=False) for 2-3 epochs with cycle_len=1
1. Unfreeze all layers
1. Set earlier layers to 3x-10x lower learning rate than next higher layer
1. Use `lr_find()` again
1. Train full network with cycle_mult=2 until over-fitting

`ConvLearner.pretrained` builds a *learner* that contains a pre-trained model. The last layer of the model needs to be replaced with a layer of the right dimensions. The pretrained model was trained for 1000 classes, therefore the final layer predicts a vector of 1000 probabilities. A model for cats and dogs needs to output a two-dimensional vector; in this notebook the classes are the retinopathy levels listed in `data.classes`. The diagram below shows in an example how this was done in one of the earliest successful CNNs. The layer "FC8" here would get replaced with a new layer with 2 outputs.

<img src="https://image.slidesharecdn.com/practicaldeeplearning-160329181459/95/practical-deep-learning-16-638.jpg" width="500">
[original image](https://image.slidesharecdn.com/practicaldeeplearning-160329181459/95/practical-deep-learning-16-638.jpg)

```python
learn = ConvLearner.pretrained(resnet34, data, precompute=True, tmp_name=TMP_PATH, models_name=MODEL_PATH)
```
*Parameters*  are learned by fitting a model to the data. *Hyperparameters* are another kind of parameter, that cannot be directly learned from the regular training process. These parameters express “higher-level” properties of the model such as its complexity or how fast it should learn. Two examples of hyperparameters are the *learning rate* and the *number of epochs*.

During iterative training of a neural network, a *batch* or *mini-batch* is a subset of training samples used in one iteration of Stochastic Gradient Descent (SGD). An *epoch* is a single pass through the entire training set which consists of multiple iterations of SGD.

We can now *fit* the model; that is, use *gradient descent* to find the best parameters for the fully connected layer we added. We need to pass two hyperparameters: the *learning rate* (generally 1e-2 or 1e-3 is a good starting point, we'll look more at this next) and the *number of epochs* (you can pass in a higher number and just stop training when you see it's no longer improving, then re-run it with the number of epochs you found works well.)