# Prostate cANcer graDe Assessment (PANDA) Challenge
### Prostate cancer diagnosis using the Gleason grading system

In [None]:
import sys
sys.path = [
    '../input/efficientnet-pytorch/EfficientNet-PyTorch/EfficientNet-PyTorch-master',
] + sys.path

In [None]:
# !pip install torchsummary

In [None]:
from fastai import *
from fastai.vision import *
from fastai.callbacks.hooks import *
from fastai.callbacks import *
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score,confusion_matrix
import matplotlib.image as image
from tqdm.notebook import tqdm
import os
import gc
import zipfile
import openslide
import cv2
import skimage.io as sk
import warnings
import albumentations as A
from torchvision import transforms
# from torchsummary import summary
from sys import getsizeof
warnings.filterwarnings("ignore")

In [None]:
# Prefer the GPU, but fall back to the CPU so the notebook can still be
# executed in a session where CUDA is not available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Shared configuration for tiling and data loading.
tile_size = 256      # edge length, in pixels, of one square tile
image_size = 256
sz = image_size      # short alias used by the fastai item classes
n_tiles = 36         # number of tiles kept per slide
N = n_tiles          # short alias used by the fastai item classes
batch_size = 8
num_workers = 4
# Directory holding the competition's training slides (.tiff).
TRAIN = '/kaggle/input/prostate-cancer-grade-assessment/train_images/'

In [None]:
# Root of the pre-tiled dataset; in this notebook it is only referenced by the
# commented-out stock `ImageList` cell further down.
path1 = Path('/kaggle/input/panda-36-tiles')

In [None]:
# Slide ids present on disk: every directory entry with its last five
# characters (the '.tiff' extension) stripped.
sld = [entry[:-5] for entry in os.listdir(TRAIN)]

In [None]:
# Table of duplicated slides; only its 'file2' column is read afterwards.
df_duplicates = pd.read_csv('../input/duplicates-panda/duplicates.csv')
# df_duplicates.head()

In [None]:
# Ids taken from the 'file2' column; rows with these ids are dropped from the
# training table when it is loaded below.
duplicate_files = df_duplicates['file2'].tolist()

In [None]:
# Load the competition labels and keep only the slides that exist on disk and
# are not listed as duplicates.
df = pd.read_csv('/kaggle/input/prostate-cancer-grade-assessment/train.csv')
df = df[df['image_id'].isin(sld) & ~df['image_id'].isin(duplicate_files)]
# Positional rename (the first column becomes 'fn'); assumes train.csv has
# exactly these four columns in this order.
df.columns = ['fn', 'data_provider', 'isup_grade', 'gleason_score']

In [None]:
# Slide-level augmentation, applied by TiffImageItemList.open before tiling.
wsi_aug = A.Compose([
    # NOTE(review): with p=0.2 this looks like it crops the whole slide down to
    # a 10x10-pixel window — confirm that such a small crop is intended.
    A.RandomCrop(height=10, width=10, p=0.2),
    # Small random rotation (limit=5), applied with p=0.2.
    A.Rotate(limit=5, p=0.2)
])

In [None]:
# Tile-level augmentation, applied to each tile by TiffImageItemList.open.
# Four stages, each drawn independently:
#   1. one colour transform (RGB shift or gamma), p=0.5
#   2. brightness/contrast jitter, p=0.7
#   3. one geometric transform (90-degree rotation, flip, small rotation or
#      shift/scale/rotate), p=0.25
#   4. one Cutout variant (many small holes filled with 0), p=0.2
# NOTE(review): the geometric transforms fill borders with (255, 255, 255)
# while Cutout fills with 0 — confirm which one matches the tile background.
tile_aug = A.Compose([A.OneOf([
                            A.RGBShift(p=1),
                            A.RandomGamma(p=1),
                        ], p=0.5),
                        A.RandomBrightnessContrast(p=0.7),
                        A.OneOf([
                            A.RandomRotate90(p=1),
                            A.Flip(p=1),
                            A.Rotate(limit=10, border_mode=0, value=(255, 255, 255), p=1),
                            A.ShiftScaleRotate(shift_limit=0.15, scale_limit=0.1, rotate_limit=10, border_mode=0, value=(255, 255, 255), p=1),
                        ], p=0.25),
                        A.OneOf([
                            A.Cutout(num_holes=50, max_h_size=10, max_w_size=10, fill_value=0, p=1),
                            A.Cutout(num_holes=70, max_h_size=7, max_w_size=7, fill_value=0, p=1),
                            A.Cutout(num_holes=100, max_h_size=5, max_w_size=5, fill_value=0, p=1),
                        ], p=0.2)])

## Stratified Kfold

In [None]:
# Assign every slide to one of five folds, stratified on the ISUP grade.
df['kfold'] = -1
# Shuffle once with a fixed seed; the splitter itself is created without
# shuffling, so the fold assignment follows this row order.
df = df.sample(frac=1., random_state=2020).reset_index(drop=True)
kf = StratifiedKFold(n_splits=5)
y = df.isup_grade.values
for fold_no, (_, valid_rows) in enumerate(kf.split(X=df, y=y)):
    # Rows in the validation part of split `fold_no` belong to that fold.
    df.loc[valid_rows, 'kfold'] = fold_no

In [None]:
# df.head()

In [None]:
# Visual check of the stratification: grade distribution inside fold 1.
import seaborn as sns
sns.countplot(x=df[df.kfold==1].isup_grade);
plt.title('Fold - 1: Images count');

# Data Processing for fastai 
* We have 2 options: either we write a custom ImageList class, or
* we convert all the images first and then use them as we like.

The latter takes time up front but speeds up the process afterwards, as the fastai data block will not have to process the large **.tiff** files every time.

* I have converted the tiff files; they can be found [**here**](https://www.kaggle.com/ianmoone0617/panda-36-tiles-resize)
* Let's start with the custom ImageItemList first

In [None]:
def get_tiles(img, mode=0, tile_sz=None, num_tiles=None):
    """Cut an image into square tiles and keep the darkest ones.

    The image is padded (with the value 1) so that both sides are multiples of
    the tile size, split into non-overlapping tiles, and the tiles with the
    lowest pixel sum are kept.  Because padding uses 1 and every returned tile
    is ``1 - tile``, the input is presumably a float image scaled so that the
    white background is 1.

    Args:
        img: array of shape (height, width, 3).
        mode: adds ``(tile size * mode) // 2`` extra padding per axis.  Only
            even values keep the padded image divisible by the tile size.
        tile_sz: tile edge length; defaults to the module-level ``tile_size``.
        num_tiles: number of tiles to return; defaults to the module-level
            ``n_tiles``.

    Returns:
        A list of exactly ``num_tiles`` dicts ``{'img': inverted tile,
        'idx': rank}``, ordered from the lowest to the highest pixel sum.
    """
    ts = tile_size if tile_sz is None else tile_sz
    keep = n_tiles if num_tiles is None else num_tiles

    h, w, _ = img.shape
    pad_h = (ts - h % ts) % ts + ((ts * mode) // 2)
    pad_w = (ts - w % ts) % ts + ((ts * mode) // 2)
    padded = np.pad(
        img,
        [[pad_h // 2, pad_h - pad_h // 2], [pad_w // 2, pad_w - pad_w // 2], [0, 0]],
        constant_values=1,
    )

    # (rows, ts, cols, ts, 3) -> (rows * cols, ts, ts, 3)
    tiles = padded.reshape(padded.shape[0] // ts, ts, padded.shape[1] // ts, ts, 3)
    tiles = tiles.transpose(0, 2, 1, 3, 4).reshape(-1, ts, ts, 3)

    # Compare the number of *tiles* (not the image height) with the target, so
    # that small slides are topped up with blank tiles and large ones are never
    # given a negative pad width.
    if len(tiles) < keep:
        tiles = np.pad(tiles, [[0, keep - len(tiles)], [0, 0], [0, 0], [0, 0]], constant_values=1)

    # Lowest pixel sum first, i.e. the tiles furthest from the white background.
    order = np.argsort(tiles.reshape(tiles.shape[0], -1).sum(-1))[:keep]
    return [{'img': 1 - tile, 'idx': rank} for rank, tile in enumerate(tiles[order])]

## Custom Fastai TiffImageList to Directly Process Slides

In [None]:
class TiffImageItemList(ImageList):
    """ImageList that reads a slide straight from its .tiff file and tiles it."""

    def open(self, fn):
        """Load slide `fn`, augment it, and return its tiles.

        Args:
            fn: slide id (file name without the '.tiff' extension) under TRAIN.

        Returns:
            A list of `n_tiles` float tensors, one per tile, each divided by
            255.  When `get_tiles` yields fewer tiles they are repeated
            cyclically.

        NOTE(review): `get_tiles` pads with 1 and returns `1 - tile`, which
        looks designed for images already scaled to [0, 1]; here it receives
        the raw array from `sk.MultiImage` — confirm the intended value range.
        """
        fl = TRAIN + str(fn) + '.tiff'
        img = sk.MultiImage(fl)[1]          # second image of the multi-image file
        img = wsi_aug(image=img)['image']
        res = get_tiles(img)
        imgs = []
        for i in range(n_tiles):
            im = tile_aug(image=res[i % len(res)]['img'])['image']
            # Scale each tile on its own: torch.div cannot take a Python list.
            # ascontiguousarray guards against views that the augmentations
            # may return (e.g. flipped arrays).
            imgs.append(torch.tensor(np.ascontiguousarray(im)).float().div(255.0))
        return imgs

In [None]:
# Normalisation statistics for the tiles returned by get_tiles(), which are
# inverted (1 - pixel).
mean, std = imagenet_stats
# Inverting an image maps its mean m to 1 - m ...
mean = 1.0 - torch.tensor(mean)
# ... but leaves its spread untouched (std(1 - x) == std(x)), so the standard
# deviation must be used as is rather than inverted.
std = torch.tensor(std)
def open_image(fn:PathOrStr, div:bool=True, convert_mode:str='RGB', cls:type=Image,
        after_open:Callable=None)->Image:
    """Read the second image of multi-image file `fn` and return it as a float tensor.

    The array is converted to a PIL image in `convert_mode`, passed through
    `after_open` when one is given, turned into a float32 tensor with
    `pil2tensor`, and divided by 255 in place when `div` is true.  `cls` is
    accepted but not used.  No inversion is done here; `get_tiles` does that.
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", UserWarning) # EXIF warning from TiffPlugin
        level = sk.MultiImage(fn)[1]
        pil_img = transforms.ToPILImage()(level).convert(convert_mode)
    if after_open:
        pil_img = after_open(pil_img)
    tensor = pil2tensor(pil_img, np.float32)
    # div_ works in place and returns the same tensor
    return tensor.div_(255) if div else tensor

class MImage(ItemBase):
    """Item holding all tiles of one slide.

    `obj` is the list of per-tile images and `data` the matching list of
    tensors, normalised with the module-level `mean` and `std`.
    """

    def __init__(self, imgs):
        self.obj = imgs
        self.data = [self._normalize(img.data) for img in imgs]

    @staticmethod
    def _normalize(t):
        """Return (t - mean) / std, with the stats broadcast over the last two axes."""
        return (t - mean[...,None,None])/std[...,None,None]

    def apply_tfms(self, tfms,*args, **kwargs):
        """Apply `tfms` to every tile in place, refresh `data`, and return self."""
        for i, img in enumerate(self.obj):
            self.obj[i] = img.apply_tfms(tfms, *args, **kwargs)
            self.data[i] = self._normalize(self.obj[i].data)
        return self

    def __repr__(self):
        # Build a list: a bare generator expression inside the f-string would
        # print as "<generator object ...>" instead of the shapes.
        return f'{self.__class__.__name__} {[img.shape for img in self.obj]}'

    def to_one(self):
        """Return all tiles laid side by side as one de-normalised Image."""
        img = torch.stack(self.data,1)
        # (3, N, sz, sz) -> (3, 1, sz, N, sz) -> (3, sz, sz*N): one row of tiles
        img = img.view(3,-1,N,sz,sz).permute(0,1,3,2,4).contiguous().view(3,-1,sz*N)
        # Undo the normalisation, then undo the inversion for display.
        return Image(1.0 - (mean[...,None,None]+img*std[...,None,None]))

class MImageItemList(ImageList):
    """ImageList whose items are whole slides, each returned as an MImage of tiles."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def __len__(self)->int:
        # An empty list still reports length 1.
        return len(self.items) or 1

    def get(self, i):
        """Open slide `i`, tile it, and wrap the tiles in an MImage."""
        slide_id = Path(self.items[i])
        tiff_path = TRAIN + str(slide_id)+ '.tiff'
        slide = open_image(tiff_path, convert_mode=self.convert_mode, after_open=self.after_open)
        # get_tiles unpacks the shape as (height, width, channels)
        tiles = get_tiles(slide.permute(1, 2, 0).numpy())
        return MImage([Image(torch.tensor(tile['img']).permute(2, 0, 1)) for tile in tiles])

    def reconstruct(self, t):
        """Rebuild an MImage from normalised tile tensors by undoing the normalisation."""
        offset, scale = mean[...,None,None], std[...,None,None]
        return MImage([offset+tile*scale for tile in t])

    def show_xys(self, xs, ys, figsize:Tuple[int,int]=(300,50), **kwargs):
        """Plot up to eight items, one per row, each as a single strip of tiles."""
        rows = min(len(xs),8)
        fig, axs = plt.subplots(rows,1,figsize=figsize)
        # plt.subplots returns a bare Axes (not an array) when rows == 1
        axes = axs.flatten() if rows > 1 else [axs]
        for idx, ax in enumerate(axes):
            xs[idx].to_one().show(ax=ax, y=ys[idx], **kwargs)
        plt.tight_layout()
        

#collate function to combine multiple images into one tensor
def MImage_collate(batch:ItemsList)->Tensor:
    """Collate a batch; when the inputs come back as a list, stack them along dim 1.

    Returns whatever `default_collate` produced, unless its first element is a
    list, in which case `[stacked inputs, targets]` is returned instead.
    """
    collated = torch.utils.data.dataloader.default_collate(to_data(batch))
    inputs = collated[0]
    if not isinstance(inputs, list):
        return collated
    return [torch.stack(inputs,1), collated[1]]

In [None]:
# x = sk.MultiImage('../input/prostate-cancer-grade-assessment/train_images/0005f7aaab2800f6170c399693a96917.tiff')[1]
# trans = transforms.ToPILImage()
# x = trans(x).convert('RGB')
# x = pil2tensor(x,np.float32)
# x.div_(255)
# z = x #invert image for zero padding
# img = z.permute(1, 2, 0).numpy()

In [None]:
# mode=0
# result = []
# h, w, c = img.shape
# pad_h = (tile_size - h % tile_size) % tile_size + ((tile_size * mode) // 2)
# pad_w = (tile_size - w % tile_size) % tile_size + ((tile_size * mode) // 2)
# img2 = np.pad(img,[[pad_h // 2, pad_h - pad_h // 2], [pad_w // 2,pad_w - pad_w//2], [0,0]], constant_values=1)
# img3 = img2.reshape(
#     img2.shape[0] // tile_size,
#     tile_size,
#     img2.shape[1] // tile_size,
#     tile_size,
#     3
# )

# img3 = img3.transpose(0,2,1,3,4).reshape(-1, tile_size, tile_size,3)
# if len(img) < n_tiles:
#     img3 = np.pad(img3,[[0,n_tiles-len(img3)],[0,0],[0,0],[0,0]], constant_values=1)
# idxs = np.argsort(img3.reshape(img3.shape[0],-1).sum(-1))[:n_tiles]
# img3 = img3[idxs]
# for i in range(len(img3)):
#     result.append({'img':1 - img3[i], 'idx':i})

In [None]:
# Preview the tiles selected from one sample slide.  `result` is built here
# because the cells that used to create it are commented out, which left the
# name undefined.
sample_fn = TRAIN + '0005f7aaab2800f6170c399693a96917.tiff'
result = get_tiles(open_image(sample_fn).permute(1, 2, 0).numpy())
fig, ax = plt.subplots(6, 6, figsize = (22, 22))
# zip stops at the shorter sequence, so a slide that yields fewer tiles than
# there are axes does not raise an IndexError.
for axis, tile in zip(ax.flatten(), result):
    axis.imshow(tile['img'])

* Train and validation split

In [None]:
# Fold 4 is held out for validation; every other fold is used for training.
trn_idx = list(df[df.kfold!=4].index)
val_idx = list(df[df.kfold==4].index)
# Shuffle both index lists in place (training first, then validation).
for idx_list in (trn_idx, val_idx):
    random.shuffle(idx_list)

## DataBunch of Custom MImageItemList

In [None]:
# Build the DataBunch: items are slide ids from the 'fn' column, split by the
# index lists computed above and labelled with the ISUP grade.  Normalisation
# is done inside MImage, so the fastai `.normalize` step stays commented out.
data = (MImageItemList.from_df(df,path='',cols='fn')
                          .split_by_idxs(trn_idx,val_idx)
                          .label_from_df(cols='isup_grade')
                          .transform(get_transforms(flip_vert=True,max_rotate=15),size=sz,padding_mode='zeros')
                          .databunch(num_workers=4,bs=batch_size)
#                           .normalize(imagenet_stats)
       )

In [None]:
# Display a sample batch (presumably rendered through MImageItemList.show_xys).
data.show_batch()

## Databunch of Processed Images: Using fastai's own ImageList

In [None]:
# stats = ([0.785946], [0.45007266])
# data_img = (ImageList.from_df(df,path1,folder='.',suffix='.png',cols='fn')
#                 .split_by_idxs(trn_idx,val_idx)
#                 .label_from_df(cols='isup_grade',)
#                 .transform(get_transforms(do_flip=True), size=300)
#                 .databunch(bs=batch_size).normalize(imagenet_stats))

In [None]:
# data_img.show_batch(rows=3,figsize=(20,8),seed=2020)

In [None]:
# Sanity-check the DataBunch: split sizes, classes, shape of the first tile of
# the first training item, and number of classes.  `data_img` is only created
# in a commented-out cell, so inspect `data`, the DataBunch that exists.
len(data.train_ds), len(data.valid_ds), data.classes, data.train_ds[0][0].data[0].shape, data.c

## Model: EfficientNet-B3

In [None]:
from efficientnet_pytorch import model as enet

In [None]:
# Backbone name -> path of the weights file loaded by enetv2.__init__.
pretrained_model = {
    'efficientnet-b3': '../input/efficientnet-pytorch/efficientnet-b3-c8376fa2.pth'
}

enet_type = 'efficientnet-b3'   # key into pretrained_model
out_dim = 6                     # size of the model's output layer

In [None]:
# Build the bare architecture; as the last expression of the cell its repr is
# displayed.  The instance itself is not kept.
enet.EfficientNet.from_name('efficientnet-b3')

In [None]:
class enetv2(nn.Module):
    """EfficientNet backbone applied to every tile, followed by one pooled head.

    Each tile goes through the backbone on its own.  The resulting feature
    maps of a slide are concatenated along the height axis and classified
    together by `head`.
    """

    def __init__(self, backbone, out_dim):
        """Build the model.

        Args:
            backbone: key of `pretrained_model`, also passed to
                `EfficientNet.from_name`.
            out_dim: number of outputs of the head.
        """
        super(enetv2, self).__init__()
        self.enet = enet.EfficientNet.from_name(backbone)
        self.enet.load_state_dict(torch.load(pretrained_model[backbone]))
        nc = self.enet._fc.in_features
        self.head = nn.Sequential(AdaptiveConcatPool2d(),Flatten(),nn.Linear(2*nc,512),
                            nn.ReLU(),nn.BatchNorm1d(512), nn.Dropout(0.5),nn.Linear(512,out_dim))
        # Not used by forward(); kept so the set of parameters (and therefore
        # any saved state dict) stays the same.
        self.myfc = nn.Linear(nc, out_dim)
        self.enet._fc = nn.Identity()

    def extract(self, x):
        """Run the full backbone (its classifier layer is replaced by Identity)."""
        return self.enet(x)

    def forward(self, x):
        """Classify a batch of slides.

        Args:
            x: sequence of per-tile tensors, each of shape (bs, C, H, W).

        Returns:
            The output of `head` for the batch.
        """
        # Number of tiles per slide; taken from the input instead of relying on
        # a global that is not defined anywhere.
        n = len(x)
        shape = x[0].shape
        x = torch.stack(x,1).view(-1,shape[1],shape[2],shape[3])
        #x: bs*n x C x H x W
        # The head pools a 4-D map, so use the backbone's convolutional
        # features instead of its already pooled and flattened forward() output.
        x = self.enet.extract_features(x)
        #x: bs*n x C' x h x w
        shape = x.shape
        #concatenate the output for tiles into a single map
        x = x.view(-1,n,shape[1],shape[2],shape[3]).permute(0,2,1,3,4).contiguous()\
          .view(-1,shape[1],shape[2]*n,shape[3])
        #x: bs x C' x n*h x w
        return self.head(x)

In [None]:
# Instantiate the model; enetv2.__init__ loads the weights listed in
# pretrained_model[enet_type].
arch = enetv2(enet_type, out_dim=out_dim)

### Metric: Quadratic Weighted Kappa Score

In [None]:
# Kappa metric configured with quadratic weights.
kp = KappaScore()
kp.weights = 'quadratic'

In [None]:
# Train on `data`, the DataBunch built above; `data_img` is only defined in a
# commented-out cell and would raise a NameError here.
learn = Learner(data, arch , metrics = [kp] , model_dir = '/kaggle/working/')

In [None]:
# Run the learning-rate finder and plot what it recorded.
learn.lr_find()
learn.recorder.plot()

In [None]:
# Force a garbage-collection pass before training starts.
gc.collect()

In [None]:
# Both callbacks monitor 'kappa_score' with mode='max' (higher is better).
# cb2 saves the model as 'best_model_ft' on every improvement.
cb2 = SaveModelCallback(learn, monitor = 'kappa_score', every = 'improvement', mode='max', name = 'best_model_ft' )
# cb3 is configured with factor=0.2, patience=4 and min_delta=0.01.
cb3 = ReduceLROnPlateauCallback(learn,  monitor = 'kappa_score', mode = 'max',factor = 0.2,patience = 4, min_delta = 0.01)

In [None]:
#learn.split([arch.myfc])

In [None]:
# epochs = 4
# learn.fit_one_cycle(epochs ,max_lr = 1e-3, callbacks = [cb2,cb3])

In [None]:
# Unfreeze the learner, then train for 6 epochs with the one-cycle policy and
# the two callbacks defined above.
learn.unfreeze()
learn.fit_one_cycle(6 ,max_lr = 1e-3, callbacks = [cb2,cb3])

In [None]:
# Plot the losses recorded during training.
learn.recorder.plot_losses()

In [None]:
# Reload the checkpoint written by SaveModelCallback (name='best_model_ft').
learn.load('best_model_ft');

In [None]:
# Export the learner to a single file in the working directory.
learn.export('/kaggle/working/panda.pkl')

## Inference Kernel can be found [**here**](https://www.kaggle.com/ianmoone0617/panda-effnet-b3-inference-fastai-custom-imagelist)

In [None]:
# Build an interpretation object from the learner and plot its confusion matrix.
interp = ClassificationInterpretation.from_learner(learn)
interp.plot_confusion_matrix()

In [None]:
# learn = learn.to_fp32()

In [None]:
# test_df = pd.read_csv('/kaggle/input/prostate-cancer-grade-assessment/test.csv')
# df.drop('kfold', axis=1, inplace=True)
# df.columns = ['image_id', 'data_provider', 'isup_grade', 'gleason_score']
# data_dir = '../input/prostate-cancer-grade-assessment'
# image_folder = os.path.join(data_dir, 'test_images')
# is_test = os.path.exists(image_folder)  # IF test_images is not exists, we will use some train images.
# image_folder = image_folder if is_test else os.path.join(data_dir, 'train_images')

# test = test_df if is_test else df.sample(n=100)

In [None]:
# def image_test(fn,image_folder):     
#     path2 = image_folder +'/'
#     fl = path2 + str(fn)+'.tiff'
#     img = sk.MultiImage(fl)[1]
#     res = get_tiles(img)
#     imgs = []
#     for i in range(36):
#         im = res[i%len(res)]['img']
#         imgs.append(im)
#     imgs = np.array(imgs)
#     final_image = np.concatenate(np.array([np.concatenate(imgs[j:j+6],axis=1).astype(np.uint8) for j in range(0,36,6)]),axis=0)
#     final_image = cv2.resize(final_image, dsize=(300, 300), interpolation=cv2.INTER_CUBIC)
#     return vision.Image(pil2tensor(final_image,np.float32).div_(255))

In [None]:
# ts_name = test.image_id.values
# pred = np.zeros(len(ts_name))
    
# for j in tqdm(range(len(ts_name))):
#     ans = int(learn.predict(image_test(ts_name[j],image_folder))[0])
#     pred[j] = ans
        
# out = pd.DataFrame({'image_id':ts_name,'isup_grade':pred.astype(int)})
# out.to_csv('submission.csv',index=False)