### Imports

In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import glob
%matplotlib inline
from fastai.vision.all import *
from fastai import *

# Fix the random seed at a constant value (42) so that repeated runs of the
# notebook are intended to give reproducible results.
set_seed(42)


### Set paths

In [None]:
# Root folder of the competition data; every other file is addressed relative to it.
competition_dir = '../input/cassava-leaf-disease-classification'
path = Path(competition_dir)

# List the folder contents as a quick sanity check that the data is mounted.
path.ls()

### Read training data csv

In [None]:
# Load the training labels (one row per image) and preview the first rows.
train_csv = path/'train.csv'
train_df = pd.read_csv(train_csv)
train_df.head()

### Add 'train_images/' to image_id column to easily access directory

In [None]:
# Prefix each file name with its folder so that the value is a path relative to `path`.
# Only rows that do not already carry the prefix are changed, so re-running this
# cell cannot produce 'train_images/train_images/...'.
missing_prefix = ~train_df['image_id'].str.startswith('train_images/')
train_df.loc[missing_prefix, 'image_id'] = 'train_images/' + train_df.loc[missing_prefix, 'image_id']
train_df.head()

### Add qualitative labels

In [None]:
import json

# Read the competition's label-number -> disease-name mapping and print it for reference.
label_map_file = path/'label_num_to_disease_map.json'
with open(label_map_file) as fh:
    data = json.load(fh)
print(data)

In [None]:
# Short, human-readable class names; the position in the list is the numeric label.
labels_dic = dict(enumerate([
    'Bacterial Blight',
    'Brown Streak Disease',
    'Green Mottle',
    'Mosaic Disease',
    'Healthy',
]))

# Add a readable label column next to the numeric one and preview the result.
train_df['qual_label'] = train_df['label'].map(labels_dic)
train_df.head()

### Datablock

Functions to obtain x and y - image paths and labels.

In [None]:
def get_x(row):
    """Return the image location for a dataframe row: `path` joined with the row's 'image_id'."""
    image_id = row['image_id']
    return path/image_id

def get_y(row):
    """Return the target for a dataframe row: the value stored under 'label'."""
    label = row['label']
    return label

Create data block with validation set of 20%, transforming each item to 448x448px and then randomly cropping batches to 224x224px. Other data augmentation also applied to batches, which should improve the accuracy.

In [None]:
# Batch-level pipeline (data augmentation): random resized crop to 224px, the default
# `aug_transforms()` set, then normalisation with `imagenet_stats`.
batch_augmentations = [
    RandomResizedCropGPU(224),
    *aug_transforms(),
    Normalize.from_stats(*imagenet_stats),
]

# Image -> category problem; 20% of rows held out for validation with a fixed seed;
# each item is resized to 448 before the batch-level transforms run.
CassavaBlock = DataBlock(
    blocks=(ImageBlock, CategoryBlock),
    get_x=get_x,
    get_y=get_y,
    splitter=RandomSplitter(valid_pct=0.2, seed=42),
    item_tfms=Resize(448),
    batch_tfms=batch_augmentations,
)

Data loaders. Show 4 images.

In [None]:
# Build the data loaders from the training dataframe with 64 items per batch.
dls = CassavaBlock.dataloaders(train_df, batch_size=64)

# Preview up to four validation images in a single row.
dls.valid.show_batch(nrows=1, max_n=4)

### Training

The competition does not allow internet access. Normally fastai downloads the pretrained ResNet-50 weights from the Internet, but here that must be done offline: take the weights from a Kaggle dataset and copy the file to the directory where fastai expects to find it.

In [None]:
if not os.path.exists('/root/.cache/torch/hub/checkpoints/'):
        os.makedirs('/root/.cache/torch/hub/checkpoints/')
!cp '../input/resnet50/resnet50.pth' '/root/.cache/torch/hub/checkpoints/resnet50-19c8e357.pth'

Create the model and fine-tune it on our data: `fine_tune(10)` trains for 10 epochs after an initial frozen warm-up phase (one epoch by default in fastai).

In [None]:
# ResNet-50 learner on our data loaders, tracking accuracy, trained with
# label-smoothing cross-entropy and the `ranger` optimizer function.
learn = cnn_learner(
    dls,
    resnet50,
    loss_func=LabelSmoothingCrossEntropy(),
    opt_func=ranger,
    metrics=accuracy,
)

# Fine-tune for 10 epochs.
learn.fine_tune(10)

Plot confusion matrix.

In [None]:
# Build an interpretation object from the trained learner and plot its confusion matrix.
interp = ClassificationInterpretation.from_learner(learn)
interp.plot_confusion_matrix()

Show worst 5 images in terms of loss - i.e. the images which the model is not predicting well on.

In [None]:
# Plot the 5 highest-loss images, laid out over 5 rows.
interp.plot_top_losses(5, nrows=5)

From the above, the predictor is confusing Mosaic Disease with other diseases, namely Brown Streak Disease and Green Mottle. An expert in cassava plants, or in plants in general, would be able to provide insight into this; further reading is required.

### Predictions

In [None]:
# Read the sample submission; `sample_df` keeps the original image ids for the upload.
submission_template = path/'sample_submission.csv'
sample_df = pd.read_csv(submission_template)

# Work on a copy so that the ids in `sample_df` stay unchanged.
sample_df_copy = sample_df.copy()
sample_df_copy.head()

In [None]:
# Add the folder prefix to the image ids. The paths are derived from the untouched
# ids in `sample_df` rather than from the copy itself, so re-running this cell
# cannot produce 'test_images/test_images/...'.
sample_df_copy['image_id'] = 'test_images/' + sample_df['image_id']

In [None]:
# Data loader over the test images listed in `sample_df_copy`, built from the
# training `dls`; then display a batch from it as a visual check.
test_dl = dls.test_dl(sample_df_copy)
test_dl.show_batch()

In [None]:
# Test-time-augmentation predictions on the test loader: per-class probabilities.
# NOTE(review): n=8 and beta=0 presumably mean 8 augmented passes with no weight given
# to the un-augmented prediction - confirm against the fastai `Learner.tta` docs.
preds = learn.tta(dl=test_dl, n=8, beta=0)

In [None]:
# The first element of `preds` holds the per-class scores; take the highest-scoring class per row
# and store it in the original dataframe.
# NOTE(review): assumes the class index equals the numeric label - confirm via `dls.vocab`.
class_probs = preds[0]
sample_df['label'] = np.argmax(class_probs, axis=1)

In [None]:
# Preview the submission frame with the predicted labels filled in.
sample_df.head()

In [None]:
# Write the dataframe to 'submission.csv' without the index column.
sample_df.to_csv('submission.csv', index=False)