# fastai training with the data-block API
fastai is a great tool for creating a strong baseline quickly. I use a pretty much out-of-the-box approach for multilabel classification, with a resnet50 backbone, one-cycle training, the LR finder, etc. The data block API is a great way to prepare the data, and it comes with a default set of augmentations that I use as well.

Solution overview: https://www.kaggle.com/c/hpa-single-cell-image-classification/discussion/221550

### I will smile for every upvote :) 

Forked from "fastai cell tile prototyping [training]". Credit is due to its author.

In [None]:
#! pip list

 

In [None]:
!pip install /kaggle/input/iterative-stratification/iterative-stratification-master/

In [None]:
#!pip install git+https://github.com/fastai/fastcore > /dev/null
#!pip install git+https://github.com/fastai/fastai2 > /dev/null
#!pip install iterative-stratification > /dev/null

In [None]:
import sys
package_path = '../input/efficientnet-pytorch/EfficientNet-PyTorch/EfficientNet-PyTorch-master'
sys.path.append(package_path)



In [None]:
!ls ../input/efficientnet-pytorch/EfficientNet-PyTorch

In [None]:
%cd /kaggle/input/efficientnet-pytorch/EfficientNet-PyTorch/EfficientNet-PyTorch-master
from efficientnet_pytorch import EfficientNet
%cd -

In [None]:
import pandas as pd
import numpy as np
from fastai.vision.all import *
import pickle
import os

In [None]:
# Making pretrained weights work without needing to find the default filename
if not os.path.exists('/root/.cache/torch/hub/checkpoints/'):
        os.makedirs('/root/.cache/torch/hub/checkpoints/')
!cp '../input/resnet50/resnet50.pth' '/root/.cache/torch/hub/checkpoints/resnet50-19c8e357.pth'
!cp '../input/resnet101/resnet101.pth' '/root/.cache/torch/hub/checkpoints/resnet101-5d3b4d8f.pth'

## cp efficientnet pretrained weights
!cp '../input/efficientnet-pytorch-pretrained/adv-efficientnet-b7-4652b6dd.pth' '/root/.cache/torch/hub/checkpoints/'

!cp '../input/efficientnet-pytorch-pretrained/adv-efficientnet-b6-ac80338e.pth' '/root/.cache/torch/hub/checkpoints/'

 
!cp '../input/efficientnet-pytorch-pretrained/adv-efficientnet-b5-86493f6b.pth' '/root/.cache/torch/hub/checkpoints/'

!cp '../input/efficientnet-pytorch-pretrained/adv-efficientnet-b4-44fb3a87.pth' '/root/.cache/torch/hub/checkpoints/'



!cp '../input/efficientnet-pytorch/efficientnet-b1-dbc7070a.pth' '/root/.cache/torch/hub/checkpoints/'
!cp '../input/efficientnet-pytorch/efficientnet-b2-27687264.pth' '/root/.cache/torch/hub/checkpoints/'
!cp '../input/efficientnet-pytorch/efficientnet-b3-c8376fa2.pth' '/root/.cache/torch/hub/checkpoints/'
!cp '../input/efficientnet-pytorch/efficientnet-b4-e116e8b3.pth' '/root/.cache/torch/hub/checkpoints/'
!cp '../input/efficientnet-pytorch/efficientnet-b5-586e6cc6.pth' '/root/.cache/torch/hub/checkpoints/'
!cp '../input/efficientnet-pytorch/efficientnet-b6-c76e70fd.pth' '/root/.cache/torch/hub/checkpoints/'
!cp '../input/efficientnet-pytorch/efficientnet-b7-dcc49843.pth' '/root/.cache/torch/hub/checkpoints/'

!cp '../input/vgg16weight/vgg16_bn-6c64b313.pth' '/root/.cache/torch/hub/checkpoints/'

#!cp '../input/vgg19-bnmodels/vgg19_bn-c79401a0.pth' '/root/.cache/torch/hub/checkpoints/'

#!cp '../input/squeezenet/squeezenet1_0-a815701f.pth' '/root/.cache/torch/hub/checkpoints/'

#!cp '../input/squeezenet/squeezenet1_0-a815701f.pth' '/root/.cache/torch/hub/checkpoints/'

#!cp '../input/pytorch-model-zoo/alexnet-owt-4df8aa71.pth' '/root/.cache/torch/hub/checkpoints/'

# !cp '../input/resnet34/resnet34.pth' '/root/.cache/torch/hub/checkpoints/resnet34-333f7ec4.pth'

In [None]:
'''
if not os.path.exists('/root/.cache/torch/hub/checkpoints/'):
        os.makedirs('/root/.cache/torch/hub/checkpoints/')
!cp '../input/resnet101/resnet101.pth' '/root/.cache/torch/hub/checkpoints/resnet101-5d3b4d8f.pth'

'''

In [None]:
path = Path('../input/hpa-cell-tiles-sample-balanced-dataset')

In [None]:
df = pd.read_csv(path/'cell_df.csv')

In [None]:
df.head()

In [None]:
len(df)

In [None]:
# One 0/1 indicator column per class id ('0'..'18'): 1 when that id appears
# among the '|'-separated entries of `image_labels`.
labels = list(map(str, range(19)))
label_lists = df['image_labels'].apply(lambda joined: joined.split('|'))
for lbl in labels:
    df[lbl] = label_lists.apply(lambda parts: int(lbl in parts))

## The cell below uses `frac=1` to run on the whole training sample; lower it (e.g. `frac=0.1`) for a quick experiment

In [None]:
# Shuffle the training table reproducibly and renumber the rows from 0.
# frac=1 keeps every row; use a smaller fraction (e.g. 0.1) for quick experiments.
# Note from earlier runs: EfficientNet-b5 is fine on the full sample,
# anything above b5 ran out of memory.
dfs = df.sample(frac=1, random_state=42).reset_index(drop=True)
len(dfs)

In [None]:
# Per-class frequency table:
#   unique_count - rows whose `image_labels` is exactly this single label
#   full_count   - rows where this label appears among the '|'-separated labels
label_lists = dfs['image_labels'].str.split('|')

unique_counts = {lbl: int((dfs['image_labels'] == lbl).sum()) for lbl in labels}
full_counts = {
    lbl: int(label_lists.apply(lambda parts, lbl=lbl: lbl in parts).sum())
    for lbl in labels
}

# Sort by descending full_count (stable, so ties keep label order) and build the
# frame straight from the tuples: going through np.array would coerce the
# integer counts to strings.
count_rows = [(lbl, full_counts[lbl], unique_counts[lbl]) for lbl in labels]
count_rows.sort(key=lambda row: -row[1])
counts = pd.DataFrame(count_rows, columns=['label', 'full_count', 'unique_count'])
counts.set_index('label').T


In [None]:
len(dfs)

## Using multilabel stratification for the train-validation split.

There is some leakage in the code below (cells belonging to the same image should be in the same split). However, when I fixed that, I got a lower score... coincidence? 

In [None]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

nfold = 5
seed = 42

# Stratify on the one-hot label matrix; X only has to provide the row count.
y = dfs[labels].values
X = dfs[['image_id', 'cell_id']].values

# random_state only takes effect together with shuffle=True (newer scikit-learn
# versions raise if a seed is given while shuffle is False).
mskf = MultilabelStratifiedKFold(n_splits=nfold, shuffle=True, random_state=seed)

# Collect the fold ids in a standalone array and assign the column by name.
# Writing through `iloc[..., -1]` would hit whichever column happens to be last,
# which is `is_valid` when this cell is re-run after the next one.
fold_ids = np.full(len(dfs), -1, dtype=int)
for i, (_, test_index) in enumerate(mskf.split(X, y)):
    fold_ids[test_index] = i

assert (fold_ids >= 0).all(), "every row must be assigned to exactly one fold"
dfs['fold'] = fold_ids

In [None]:
# Fold 0 is the validation split. Assign the whole column in one step: the
# chained form `dfs['is_valid'][mask] = True` writes into a temporary object and
# is silently ignored when pandas copy-on-write is active.
dfs['is_valid'] = dfs['fold'] == 0

In [None]:
dfs.is_valid.value_counts()

## Using fastai data block API with item and batch transforms

Read more: https://docs.fast.ai/tutorial.datablock.html

In [None]:
def get_x(r):
    """Return the path of the cell tile for row `r`: `<path>/cells/<image_id>_<cell_id>.jpg`."""
    file_name = r['image_id'] + '_' + str(r['cell_id']) + '.jpg'
    return path/'cells'/file_name

# Sanity check: open and display the tile of one arbitrary row.
sample_tile_path = get_x(dfs.loc[12])
img = PILImage.create(sample_tile_path)
img.show();

In [None]:
def get_y(r):
    """Return the labels of row `r` as a list of strings, split from `image_labels` on '|'."""
    joined_labels = r['image_labels']
    return joined_labels.split('|')
get_y(dfs.loc[12])

In [None]:
sample_stats = ([0.07237246, 0.04476176, 0.07661699], [0.17179589, 0.10284516, 0.14199627])

In [None]:
# Batch size used when building the dataloaders.
bs = 256

# Per-item: random square crop (at least 75% of the image), resized to 224.
item_tfms = RandomResizedCrop(224, min_scale=0.75, ratio=(1., 1.))

# Per-batch: fastai's default augmentations (vertical flips enabled, warping
# disabled, output size 128) followed by normalization with the dataset stats.
augmentations = aug_transforms(flip_vert=True, size=128, max_warp=0)
normalize = Normalize.from_stats(*sample_stats)
batch_tfms = [*augmentations, normalize]

In [None]:
# Image -> multi-label target pipeline; the train/validation split is read from
# the boolean `is_valid` column of the dataframe.
dblock = DataBlock(
    blocks=(ImageBlock, MultiCategoryBlock(vocab=labels)),
    get_x=get_x,
    get_y=get_y,
    splitter=ColSplitter(col='is_valid'),
    item_tfms=item_tfms,
    batch_tfms=batch_tfms,
)
dls = dblock.dataloaders(dfs, bs=bs)

In [None]:
# dblock.summary(dfs)

In [None]:
dls.show_batch(nrows=3, ncols=3)

## Let's train!

In [None]:
#learn = cnn_learner(dls, resnet50, metrics=[accuracy_multi, PrecisionMulti()]).to_fp16()

In [None]:
#learn = cnn_learner(dls, resnet101, metrics=[accuracy_multi, PrecisionMulti()]).to_fp16()

In [None]:
#learn = cnn_learner(dls, vgg16_bn, metrics=[accuracy_multi, PrecisionMulti()]).to_fp16()#

In [None]:
#learn = cnn_learner(dls, vgg19_bn, metrics=[accuracy_multi, PrecisionMulti()]).to_fp16()

In [None]:
#learn = cnn_learner(dls,squeezenet1_0, metrics=[accuracy_multi, PrecisionMulti()]).to_fp16()#

In [None]:
#learn = cnn_learner(dls,alexnet, metrics=[accuracy_multi, PrecisionMulti()]).to_fp16()#

In [None]:
'''
def get_learner(fold_num, lr=1e-3):
    opt_func = partial(Adam, lr=lr, wd=0.01, eps=1e-8)

    data = get_data(fold_num)
    
    model = EfficientNet.from_pretrained("efficientnet-b7", advprop=True)
    #model = EfficientNet.from_name("efficientnet-b7")
    #model = EfficientNet.from_pretrained("efficientnet-b8", advprop=True) # weights run to NaN
    #model = EfficientNet.from_name('efficientnet-b4') 
    #model._fc = nn.Linear(1280, data.c)# the last layer... # works for b0,b1
    #model._fc = nn.Linear(1536, data.c)# the last layer... B3
    #model._fc = nn.Linear(1792, data.c)# the last layer... B4
    #model._fc = nn.Linear(2048, data.c)# the last layer... B5
    #model._fc = nn.Linear(2304, data.c)# the last layer... B6
    model._fc = nn.Linear(2560, data.c)# the last layer... B7
    #model._fc = nn.Linear(2816, data.c)# the last layer... B8

    learn = Learner(
        dls, model, opt_func=opt_func,
        loss_func=LabelSmoothingCrossEntropy(),
        #callback_fns = [partial(OverSamplingCallback)],  
        metrics=[
            AccumMetric(healthy_roc_auc, flatten=False),
            AccumMetric(multiple_diseases_roc_auc, flatten=False),
            AccumMetric(rust_roc_auc, flatten=False),
            AccumMetric(scab_roc_auc, flatten=False),
            AccumMetric(comp_metric, flatten=False)]
        ).to_fp16()
    return learn
'''

In [None]:
def get_learner(lr=1e-3, arch="efficientnet-b5", advprop=True):
    """Build an fp16 fastai `Learner` around a pretrained EfficientNet.

    Args:
        lr: learning rate baked into the Adam optimizer factory.
        arch: efficientnet_pytorch model name. Notes from earlier runs:
            b5 fits in memory, b6/b7 ran out of memory, b8 weights ran to NaN.
        advprop: passed through to `EfficientNet.from_pretrained`.

    Returns:
        A `Learner` on the notebook-level `dls`, reporting `accuracy_multi`
        and `PrecisionMulti`, converted with `.to_fp16()`.
    """
    opt_func = partial(Adam, lr=lr, wd=0.01, eps=1e-8)

    model = EfficientNet.from_pretrained(arch, advprop=advprop)

    # Replace the classifier head. Its input width is read from the backbone's
    # own head (1280 for b0/b1, 1536 b3, 1792 b4, 2048 b5, 2304 b6, 2560 b7,
    # 2816 b8), so changing `arch` needs no manual edit here.
    model._fc = nn.Linear(model._fc.in_features, dls.c)

    # No loss_func is passed, so the Learner uses its default for these
    # dataloaders (LabelSmoothingCrossEntropy was tried and dropped).
    learn = Learner(
        dls, model, opt_func=opt_func,
        metrics=[accuracy_multi, PrecisionMulti()]
        ).to_fp16()
    return learn


In [None]:
learn=get_learner()

In [None]:
learn.lr_find()

In [None]:
#SuggestedLRs(lr_min=0.017378008365631102, lr_steep=0.001737800776027143)

In [None]:
#learn.lr_find()#
# SuggestedLRs(lr_min=0.03630780577659607, lr_steep=0.02754228748381138)

In [None]:
#learn.fit(16)
#each epoch around 20minuts

In [None]:
lr=3e-2

I trained for 10 epochs in the 0.342 leaderboard submission. 

In [None]:
#learn.fine_tune(2,base_lr=lr)

In [None]:
#learn.fine_tune(4,base_lr=lr)

In [None]:
learn.fine_tune(6,base_lr=lr)

In [None]:
learn.recorder.plot_loss()

## Where are the mistakes? 

In [None]:
from sklearn.metrics import multilabel_confusion_matrix as cm

In [None]:
# val_targ = torch.stack([x[1] for x in learn.dls.valid_ds], dim=0).numpy()
# val_targ.shape

In [None]:
val_targ = dfs[labels][dfs.is_valid == True].values

In [None]:
val_targ.shape

In [None]:
val_preds_all = learn.get_preds(dl=learn.dls.valid)

In [None]:
val_preds = val_preds_all[0].numpy()

In [None]:
val_preds = val_preds > 0.5

In [None]:
full_preds = val_preds_all[0].numpy()

In [None]:
vis_arr = cm(val_targ, val_preds)

In [None]:
# i = 60
# print(learn.dls.valid.dataset[i][1])
# print(val_preds[i])
# print(full_preds[i])
# learn.dls.valid.dataset[i][0]

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


def print_confusion_matrix(confusion_matrix, axes, class_label, class_names, fontsize=14):
    """Draw one confusion matrix as an annotated heatmap on `axes`.

    Args:
        confusion_matrix: square array of integer counts (rows = true, columns = predicted).
        axes: matplotlib Axes to draw on.
        class_label: name of the class, used in the panel title.
        class_names: tick labels used for both rows and columns.
        fontsize: font size of the tick labels.

    Raises:
        ValueError: if the heatmap cannot be drawn with integer formatting;
            the underlying error is attached as the cause.
    """
    df_cm = pd.DataFrame(
        confusion_matrix, index=class_names, columns=class_names,
    )

    try:
        # fmt="d" makes the annotations require integer cell values
        heatmap = sns.heatmap(df_cm, annot=True, fmt="d", cbar=False, ax=axes)
    except ValueError as err:
        # Chain explicitly so the original failure stays visible in the traceback
        raise ValueError("Confusion matrix values must be integers.") from err
    heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=fontsize)
    heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right', fontsize=fontsize)
    axes.set_ylabel('True label')
    axes.set_xlabel('Predicted label')
    axes.set_title("Confusion Matrix for the class - " + class_label)

In [None]:
# One confusion-matrix panel per class on a 5x4 grid.
fig, ax = plt.subplots(5, 4, figsize=(12, 16))
flat_axes = ax.flatten()

for axes, cfs_matrix, label in zip(flat_axes, vis_arr, labels):
    print_confusion_matrix(cfs_matrix, axes, label, ["0", "1"])

# The grid has more panels than classes; hide the unused ones so that no empty
# axes frame is drawn.
n_used = min(len(vis_arr), len(labels))
for unused_axes in flat_axes[n_used:]:
    unused_axes.set_visible(False)

fig.tight_layout()
plt.show()

In [None]:
val = dfs[dfs.is_valid==True]
len(val[val['16'] == 1])

In [None]:
from sklearn.metrics import average_precision_score

# Average precision is a ranking metric, so score the raw probabilities
# (`full_preds`) rather than the 0.5-thresholded booleans in `val_preds`.
average_precision = average_precision_score(val_targ, full_preds)
average_precision

In [None]:
from sklearn.metrics import precision_recall_curve

# Precision-recall curves and average precision need continuous scores, so the
# raw probabilities (`full_preds`) are used. The thresholded `val_preds` would
# collapse every curve to a single operating point.
precision = dict()
recall = dict()
average_precision = dict()
for i in range(len(labels)):
    precision[i], recall[i], _ = precision_recall_curve(val_targ[:, i], full_preds[:, i])
    average_precision[i] = average_precision_score(val_targ[:, i], full_preds[:, i])

# A "micro-average": quantifying score on all classes jointly
precision["micro"], recall["micro"], _ = precision_recall_curve(val_targ.ravel(), full_preds.ravel())
average_precision["micro"] = average_precision_score(val_targ, full_preds, average="micro")
print('Average precision score, micro-averaged over all classes: {0:0.2f}'.format(average_precision["micro"]))

In [None]:
average_precision

# Inference
This is running on the public test data preprocessed in the same way as train. We will save both regular preds and preds with TTA so that we can use them later in a separate submission notebook. 

In [None]:
path = Path('../input/hpa-cell-tiles-test-with-enc-dataset')

In [None]:
df = pd.read_csv(path/'cell_df.csv')

In [None]:
df.to_csv('cell_df.csv', index=False)

In [None]:
test_dl = learn.dls.test_dl(df)

In [None]:
test_dl.show_batch()

In [None]:
preds, _ = learn.get_preds(dl=test_dl)

In [None]:
preds.shape

In [None]:
with open('preds.pickle', 'wb') as handle:
    pickle.dump(preds, handle)

In [None]:
tta, _ = learn.tta(dl=test_dl)

In [None]:
tta.shape

In [None]:
with open('tta.pickle', 'wb') as handle:
    pickle.dump(tta, handle)

In [None]:
cls_prds = torch.argmax(preds, dim=-1)
len(cls_prds), cls_prds

In [None]:
sample_submission = pd.read_csv('../input/hpa-single-cell-image-classification/sample_submission.csv')
sample_submission.head()

In [None]:
# Store the arg-max class per cell as plain integers: assigning the torch
# tensor directly leaves the conversion to pandas.
df['cls'] = cls_prds.numpy()

# "<class> <confidence> <mask encoding>" per cell, with confidence fixed at 1.
# Built column-wise by name; positional `r[0]` / `r[1]` on a labelled row is
# deprecated in pandas.
df['pred'] = df['cls'].astype(str) + ' 1 ' + df['enc']
df.head()

In [None]:
# Concatenate the per-cell prediction strings of each image into one
# space-separated string per `image_id`.
subm = (
    df.groupby(['image_id'])['pred']
      .agg(' '.join)
      .reset_index()
)
subm.head()

In [None]:
sub = pd.merge(
    sample_submission,
    subm,
    how="left",
    left_on='ID',
    right_on='image_id',
)

In [None]:
sub.head()

In [None]:
def isNaN(num):
    """Return True when `num` is NaN, detected by NaN being unequal to itself."""
    return num != num

In [None]:
# Overwrite the sample PredictionString wherever the merge found predictions;
# rows without a match (NaN in `pred`) keep the sample value.
# A single `.loc` assignment replaces the chained `sub.PredictionString.loc[i] = ...`,
# which is not guaranteed to write back into `sub`.
has_pred = sub['pred'].notna()
sub.loc[has_pred, 'PredictionString'] = sub.loc[has_pred, 'pred']

In [None]:
sub = sub[sample_submission.columns]
sub.head()

In [None]:
sub.to_csv('submission_1.csv', index=False)

Thank you for your attention! Looking forward to questions and comments!

## Submission

In [None]:
cell_df = pd.read_csv('cell_df.csv')
cell_df.head()
cell_df['cls'] = ''

In [None]:
# Keep every class whose predicted score exceeds `threshold`; when none does,
# fall back to the single highest-scoring class.
threshold = 0.0

cell_classes = []
for i in range(preds.shape[0]):
    # flatten() always yields a 1-D index tensor, so tolist() is a list even
    # when exactly one class passes the threshold.
    p = torch.nonzero(preds[i] > threshold).flatten().tolist()
    if len(p) == 0:
        cls = [(preds[i].argmax().item(), preds[i].max().item())]
    else:
        cls = [(x, preds[i][x].item()) for x in p]
    cell_classes.append(cls)

# Assign the whole column at once: the chained `cell_df['cls'].loc[i] = cls`
# is not guaranteed to write back into `cell_df`. Building the Series against
# cell_df's index also fails loudly if predictions and rows do not line up.
cell_df['cls'] = pd.Series(cell_classes, index=cell_df.index, dtype=object)

In [None]:
def combine(r):
    """Build the prediction string for one cell.

    Args:
        r: a row (or any iterable) whose first element is a list of
            `(class_id, confidence)` pairs and whose second element is the
            cell's mask encoding string.

    Returns:
        "<class_id> <confidence> <enc>" for each pair, joined by single spaces
        (an empty string when there are no pairs).
    """
    # Unpack by iteration rather than r[0] / r[1]: integer keys on a labelled
    # pandas Series are deprecated as positional lookups.
    cls, enc, *_ = r
    classes = [str(c[0]) + ' ' + str(c[1]) + ' ' + enc for c in cls]
    return ' '.join(classes)

combine(cell_df[['cls', 'enc']].loc[24])

In [None]:
cell_df['pred'] = cell_df[['cls', 'enc']].apply(combine, axis=1)
cell_df.head()

In [None]:
subm = cell_df.groupby(['image_id'])['pred'].apply(lambda x: ' '.join(x)).reset_index()
# subm = subm.loc[3:]
subm.head()

In [None]:
sample_submission = pd.read_csv('../input/hpa-single-cell-image-classification/sample_submission.csv')
sample_submission.head()

In [None]:
sub = pd.merge(
    sample_submission,
    subm,
    how="left",
    left_on='ID',
    right_on='image_id',
)
sub.head()

In [None]:
def isNaN(num):
    """Return True when `num` is NaN, detected by NaN being unequal to itself."""
    return num != num

# Overwrite the sample PredictionString wherever the merge found predictions;
# rows without a match (NaN in `pred`) keep the sample value.
# A single `.loc` assignment replaces the chained `sub.PredictionString.loc[i] = ...`,
# which is not guaranteed to write back into `sub`.
has_pred = sub['pred'].notna()
sub.loc[has_pred, 'PredictionString'] = sub.loc[has_pred, 'pred']

In [None]:
sub = sub[sample_submission.columns]
sub.head()

In [None]:
sub.to_csv('submission.csv', index=False)