# fastai v2 example

How far does one lecture of fastai take you? Or two? This notebook will be extended with new material as the lectures go on.

In [None]:
!pip install fastai2>=0.0.11 graphviz ipywidgets matplotlib nbdev>=0.2.12 pandas scikit_learn azure-cognitiveservices-search-imagesearch sentencepiece

In [None]:
# Upgrade fastprogress to the latest available release.
!pip install --upgrade fastprogress

In [None]:
from fastai2.vision.all import *
from sklearn.metrics import roc_auc_score

In [None]:
# Directory with the competition files; train.csv and the images/ folder are read from here.
data_path = Path("../input/plant-pathology-2020-fgvc7/")

In [None]:
# Load the training table.
df = pd.read_csv(data_path/"train.csv")

In [None]:
# Peek at the first rows of the training table.
df.head()

Interestingly, this is a single-label classification task, not a multi-label one, which can be checked like this:

In [None]:
# Sum all columns after the first for every row, then count how often each row total occurs.
# If the only total is 1, every image carries exactly one label.
df.iloc[:, 1:].sum(axis=1).value_counts()

In [None]:
# Label names = every column name after the first one.
imglabels = list(df.columns[1:])

In [None]:
# Label of a row = name of the label column holding that row's largest value
# (the first such column on ties). Selecting the columns through `imglabels`
# instead of slicing "everything after the first column" keeps this cell
# re-runnable: the "labels" column it adds is never part of the lookup, as long
# as `imglabels` still holds only the original label columns.
df["labels"] = df[imglabels].idxmax(axis=1)

In [None]:
# Peek at the table again, now including the "labels" column.
df.head()

In [None]:
# Build the train/validation data loaders from the labelled table.
dls = ImageDataLoaders.from_df(df,
                               path=data_path,
                               folder="images",    # images are read from <path>/images
                               suff=".jpg",        # file extension added to the ids
                               label_col="labels",
                               item_tfms=RandomResizedCrop(512, min_scale=0.5), # note that we use a bigger image size
                               batch_tfms=aug_transforms(),
                               valid_pct=0.05,
                               seed=42,            # fixed seed: the random validation split is the same on every run
                               bs=16,
                               val_bs=16
                               )

In [None]:
# Display a sample batch from the data loaders as a sanity check.
dls.show_batch()

In [None]:
def mean_roc_auc(preds, targets, num_cols=4):
    """Mean column-wise ROC AUC (the competition metric).

    The competition description reads: 'Submissions are evaluated on mean
    column-wise ROC AUC. In other words, the score is the average of the
    individual AUCs of each predicted column. '

    Args:
        preds: tensor of scores; column ``i`` (``preds[:, i]``) is scored
            against the items whose target equals ``i``.
        targets: tensor of class indices.
        num_cols: number of columns (classes) to average over.

    Returns:
        The sum of the per-column ``roc_auc_score`` values divided by
        ``num_cols``.

    Note:
        This is not used as a validation metric in this notebook: when all
        items of a batch share the same label, ROC AUC is undefined.
    """
    scores = preds.detach().cpu().numpy()
    labels = targets.detach().cpu().numpy()

    total = 0
    for col in range(num_cols):
        # one-vs-rest: does each item carry label `col`?
        is_col = [label == col for label in labels]
        # score the matching output column against that binary target
        total += roc_auc_score(is_col, scores[:, col])
    return total / num_cols

In [None]:
# ResNet-50 based learner that reports accuracy. model_dir points at /kaggle/working
# (presumably because the input directory is read-only on Kaggle; confirm).
learn = cnn_learner(dls, resnet50, metrics=[accuracy], model_dir="/kaggle/working")

We now know a bit more about setting a correct learning rate, so let's do it by finding a good LR with the learning rate finder technique.

In [None]:
# Run the learning rate finder on the freshly created learner.
learn.lr_find()

In [None]:
# Train for 4 epochs with a learning rate of 1e-3.
learn.fit(4, lr=1e-3)

Great. Now let's unfreeze the lower layers and look at the suggested LR again.

In [None]:
# Make the previously frozen layers trainable as well.
learn.unfreeze()

In [None]:
# Run the learning rate finder again, now on the unfrozen model.
learn.lr_find()

In [None]:
# Checkpoint the current weights under the name "model" before training further.
learn.save("model")

In [None]:
# Fine-tune the whole network for 16 epochs with discriminative learning rates.
# SaveModelCallback in fastai2 has no `every=` argument (that is the fastai v1 API;
# the fastai2 flag is `every_epoch=`). In its default mode it keeps only the
# checkpoint with the best monitored value (here: highest accuracy) under `fname`
# and reloads it when training ends, so the name "model" refers to the best
# fine-tuned weights afterwards.
learn.fit_one_cycle(16, lr_max=slice(1e-6,1e-5), cbs=[SaveModelCallback(monitor="accuracy", fname="model")])

In [None]:
# Restore the weights stored under the name "model".
learn.load("model")

Aaand prediction time!

In [None]:
# Test images share the images/ folder with the training images; their file names start with "Test".
test_dir = data_path/"images"
# Image id = file name up to the first dot.
test_image_ids = [fname.partition(".")[0] for fname in os.listdir(test_dir) if fname.startswith("Test")]
# Rebuild the full path of every test image, in the same order as the ids.
test_images = [test_dir/f"{image_id}.jpg" for image_id in test_image_ids]
# Predict with an unshuffled loader that keeps the last (possibly partial) batch,
# so that the predictions line up with test_image_ids.
test_dl = dls.test_dl(test_images, shuffle=False, drop_last=False)
preds = learn.get_preds(dl=test_dl)

In [None]:
# ensure that the order of columns in preds matches the imglabels
preds = preds[0].cpu().numpy()
vocab = list(dls[0].dataset.vocab)
column_permutation = [vocab.index(l) for l in imglabels]
preds = preds[:, column_permutation]

submission = pd.DataFrame()
submission["image_id"] = test_image_ids
for i in range(len(imglabels)):
    submission[imglabels[i]] = preds[:, i]
submission.to_csv("submission.csv", index=False)

In [None]:
# Inspect the first 10 rows of the submission.
submission.head(10)