In [None]:
import numpy as np
import pandas as pd
import os
import cv2
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import random
from sklearn.utils import shuffle
from tqdm import tqdm_notebook

# Root folder of the competition dataset. The trailing slash matters: this
# cell and later ones build file names by plain string concatenation.
path = "../input/histopathologic-cancer-detection/"

# Label table; the cells below read its `id` and `label` columns.
data = pd.read_csv(path + "train_labels.csv")

# Folders holding the training and test images.
train_path, test_path = path + 'train/', path + 'test/'

# Quick look at the label stats: number of rows per label value.
data['label'].value_counts()

In [None]:
def readImage(path):
    """Load an image file and return it with channels in RGB order.

    OpenCV decodes images in BGR channel order; the channels are swapped to
    RGB so the image shows correct colours when passed to matplotlib.

    Args:
        path: Location of the image file (string or path-like object).

    Returns:
        The decoded image as returned by OpenCV, converted from BGR to RGB.

    Raises:
        FileNotFoundError: If OpenCV could not read the file. `cv2.imread`
            reports a missing or undecodable file by returning None instead
            of raising, which would otherwise surface later as an unrelated
            error.
    """
    # str() lets callers pass pathlib.Path objects as well as plain strings.
    bgr_img = cv2.imread(str(path))
    if bgr_img is None:
        raise FileNotFoundError(f"Could not read image: {path}")
    # Single-call equivalent of splitting into b, g, r and merging as r, g, b.
    return cv2.cvtColor(bgr_img, cv2.COLOR_BGR2RGB)

Visualisation code from: https://www.kaggle.com/qitvision/a-complete-ml-pipeline-fast-ai

In [None]:
def _plot_class_row(axes_row, image_ids, edgecolor, row_title):
    """Fill one subplot row with training images and outline a box on each.

    Args:
        axes_row: Sequence of matplotlib axes, one per image to show.
        image_ids: Image ids (file names without the '.tif' extension)
            located inside `train_path`.
        edgecolor: Colour of the dotted outline drawn on every image.
        row_title: Text used as the y-label of the first subplot in the row.
    """
    for axis, image_id in zip(axes_row, image_ids):
        # Local name on purpose: the global `path` holds the dataset root and
        # is read by other cells, so it must not be overwritten here.
        image_file = os.path.join(train_path, image_id) + '.tif'
        axis.imshow(readImage(image_file))
        # Dotted 32x32 px outline whose top-left corner sits at pixel (32, 32).
        box = patches.Rectangle((32,32),32,32,linewidth=4,edgecolor=edgecolor,facecolor='none', linestyle=':', capstyle='round')
        axis.add_patch(box)
    axes_row[0].set_ylabel(row_title, size='large')


# Random sampling: shuffle the label table so each run shows different images.
shuffled_data = shuffle(data)

fig, ax = plt.subplots(2,5, figsize=(20,8))
fig.suptitle('Histopathologic scans of lymph node sections',fontsize=20)
# Top row: five images with label 0; bottom row: five images with label 1.
_plot_class_row(ax[0], shuffled_data[shuffled_data['label'] == 0]['id'][:5], 'b', 'Negative samples')
_plot_class_row(ax[1], shuffled_data[shuffled_data['label'] == 1]['id'][:5], 'r', 'Tumor tissue samples')

In [None]:
# Preview the first rows of the training label table.
data.head()

In [None]:
!pip install -Uqq fastbook
import fastbook
from fastai.vision.all import *

In [None]:
# (Re)define `path` as the dataset root so the cells below can build file
# names from it regardless of what earlier cells left in this variable.
path = "../input/histopathologic-cancer-detection/"


In [None]:
def get_x(r):
    """Return the training-image file name for label-table row `r`.

    The name is the global dataset root `path`, the 'train/' folder, the
    row's `id` and the '.tif' extension concatenated as strings.
    """
    return ''.join((path, 'train/', r['id'], '.tif'))


def get_y(r):
    """Return the value stored under 'label' in label-table row `r`."""
    return r['label']


# Start by creating a DataBlock: images as inputs, categories as targets,
# a seeded random 80/20 train/validation split, and a random resized crop
# to 128 px applied to every item.
dblock = DataBlock(
    blocks=(ImageBlock, CategoryBlock),
    get_x=get_x,
    get_y=get_y,
    splitter=RandomSplitter(valid_pct=0.2, seed=42),
    item_tfms=RandomResizedCrop(128, min_scale=0.35),
)

# Build the DataLoaders from the label table.
dls = dblock.dataloaders(data)



In [None]:
# Display a 1x3 grid of samples taken from one batch.
dls.show_batch(nrows=1, ncols=3)

So we are able to create a dataloader; however, the images are too big right now, so we need a way to reduce their size. Let's try a 48x48 crop at the center, combined with flips and brightness changes. See https://docs.fast.ai/vision.augment#PadMode

In [None]:
# Same DataBlock as before, but with different item transforms: CropPad to
# 48 px (zero padding) followed by DihedralItem.
dblock = DataBlock(
    blocks=(ImageBlock, CategoryBlock),
    get_x=get_x,
    get_y=get_y,
    splitter=RandomSplitter(valid_pct=0.2, seed=42),
    item_tfms=(
        CropPad(48, pad_mode='zeros'),
        DihedralItem(p=1.0, nm=None, before_call=None),
    ),
)

# Rebuild the DataLoaders with the new transforms.
dls = dblock.dataloaders(data)

In [None]:
# Display samples from one batch produced with the new item transforms.
dls.show_batch()

In [None]:
# Create a learner on the current DataLoaders with a resnet34 architecture,
# reporting the error rate as the metric.
learn = cnn_learner(dls, resnet34, metrics=error_rate)
# Train via fastai's fine_tune with 4 epochs.
learn.fine_tune(4)

In [None]:
# Build an interpretation object from the trained learner and plot its
# confusion matrix.
interp = ClassificationInterpretation.from_learner(learn)
interp.plot_confusion_matrix()

In [None]:
# Show the 16 samples with the highest loss, laid out over 8 rows.
interp.plot_top_losses(16, nrows=8)

In [None]:
# Run the learner on a single file from the test folder.
learn.predict('../input/histopathologic-cancer-detection/test/00006537328c33e284c973d7b39d340809f7271b.tif')

### Problems with testing data

The submission code is again taken from: https://www.kaggle.com/qitvision/a-complete-ml-pipeline-fast-ai. Maybe you should consider a different way to evaluate; any ideas?

In [None]:
# Dataset root used by the evaluation and submission cells below.
path = "../input/histopathologic-cancer-detection/"

Based on https://www.kaggle.com/mamamot/fastai-v2-example and https://www.kaggle.com/mentalwanderer/image-classification-workflow-with-fast-ai

For more information on TTA (test-time augmentation), see
https://docs.fast.ai/learner#Learner.tta

In [None]:
# Predict with test-time augmentation. Called without arguments; presumably
# this evaluates the validation split (the results are reported as validation
# metrics further down) — TODO confirm.
preds, y = learn.tta()
# Accuracy of those predictions against the returned targets.
acc = accuracy(preds, y)


In [None]:
from sklearn.metrics import roc_auc_score
def auc_score(y_pred,y_true,tens=True):
    """Compute the ROC AUC of the column-1 scores in `y_pred`.

    Args:
        y_pred: 2-D tensor of per-class scores; column 1 is used as the score
            of the positive class.
        y_true: Ground-truth labels, one per row of `y_pred`.
        tens: When True (default) the result is wrapped with `tensor`;
            otherwise the value returned by `roc_auc_score` is passed through.

    Returns:
        The ROC AUC, as a tensor or as the raw `roc_auc_score` result
        depending on `tens`.
    """
    # ROC AUC depends only on the ranking of the scores, so column 1 is used
    # as-is. An extra sigmoid cannot change the ranking, but on scores that
    # are already probabilities it squeezes them into roughly [0.5, 0.73],
    # where float32 rounding merges distinct small values into ties.
    score = roc_auc_score(y_true, y_pred[:, 1])
    if tens:
        score = tensor(score)
    return score

In [None]:
# Report the validation accuracy as a percentage.
print('The validation accuracy is {} %.'.format(acc * 100))
# auc_score returns a tensor by default; .item() unwraps it to a Python number.
pred_score = auc_score(preds,y).item()
print('The validation AUC is {}.'.format(pred_score))

Since the definition of `tta` has changed, I am currently unable to make it work on the test data; however, creating a dataloader for the test data (below) works.

In [None]:
# # doesnt work
# tf_fns = get_image_files(path + 'test')
# test_data = DataBlock(get_items=get_image_files,
#                  item_tfms=(CropPad(48, pad_mode='zeros'),DihedralItem(p=1.0, nm=None, before_call=None)))
# dl_test = test_data.dataloaders(path+'test')
# dl_test.show_batch()


In [None]:

# Collect the image files found under the test folder.
test_images = get_image_files(path + 'test')
# Build a test dataloader from the training DataLoaders (no shuffling, no
# dropped last batch, so predictions stay aligned with `test_images`) and
# predict on it.
# NOTE(review): this rebinds `preds` and `y`, which earlier held the
# validation results.
preds,y = learn.get_preds(dl=dls.test_dl(test_images, shuffle=False, drop_last=False))

In [None]:
# Keep column 1 of the predictions (presumably the score of label 1 — TODO
# confirm) as a Python list with one entry per test image.
pred_list = list(preds[:,1])

In [None]:
# The two lengths should be equal: one prediction per test image.
len(pred_list), len(test_images)

In [None]:
# Load the sample submission; its `id` column is used below to order the
# predictions in the submission file.
submissions = pd.read_csv(path + 'sample_submission.csv')
id_list = list(submissions.id)
# Preview only the first few ids: echoing the whole list floods the cell
# output and bloats the saved notebook.
id_list[:5]

In [None]:
# Every test image needs exactly one prediction; a mismatch would silently
# pair files with the wrong scores.
assert len(test_images) == len(pred_list), "number of predictions differs from number of test images"

# Map each test image id (file name up to the first '.', without its folder)
# to its predicted score as a plain float. os.path.basename strips the folder
# with the platform's own separator rules instead of assuming '/'.
test_images_dict = {
    os.path.basename(str(image_file)).split('.')[0]: float(score)
    for image_file, score in zip(test_images, pred_list)
}

In [None]:
# Spot-check the score stored for one test image id.
test_images_dict['88a12685148c0d876fed1fba8228afc6e7ee937f']

In [None]:
# Look up the score of every id, keeping the order of `id_list`.
prediction_list = [test_images_dict[image_id] for image_id in id_list]

In [None]:
# Preview the first five ordered predictions.
prediction_list[:5]

In [None]:

# Assemble the submission table: one row per id with its predicted score.
submissions = pd.DataFrame({'id':id_list,'label':prediction_list})
# Write it without the index column. The file name is a fixed string, so this
# cell does not depend on any value computed in the validation cells.
submissions.to_csv("submission.csv", index=False)

The baseline model achieved a Kaggle score of **0.9269** (`submission.csv`, submitted just now, scored in 1 second).

![Screenshot (119).png](attachment:Screenshot%20%28119%29.png)

The rest of the notebook (below) is not needed,
but you can upload your own Kaggle score image if you want!

In [None]:
from fastai.vision.widgets import *

In [None]:
# Create a file-upload widget and display it as the cell output.
btn_upload = widgets.FileUpload()
btn_upload

In [None]:
# Open the last uploaded file as an image.
# NOTE(review): indexing with [-1] presumably fails if nothing has been
# uploaded yet — run this cell only after using the upload button.
img =   PILImage.create(btn_upload.data[-1])

In [None]:
# Output widget that hosts the image preview.
out_pl = widgets.Output()
# Clear anything rendered previously so re-running the cell does not stack images.
out_pl.clear_output()
# Render a thumbnail (at most 600x600) of the uploaded image inside the widget.
with out_pl: display(img.to_thumb(600,600))
out_pl