In [None]:
import copy
import glob
import os
import shutil
import time

import cv2
import matplotlib.pyplot as plt
import numpy as np
import openslide
import pandas as pd
import PIL.Image
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler

import torchvision
from torchvision import datasets, models, transforms

In [None]:
# print(torch.__version__)
# print(torch.cuda.is_available())

In [None]:
BASE_FOLDER = "../input/prostate-cancer-grade-assessment/"
sample = BASE_FOLDER+"sample_submission.csv"

In [None]:
plt.ion()
df = pd.read_csv(BASE_FOLDER+"train.csv")

labels = list(dict.fromkeys(df['isup_grade']))

In [None]:
def merge_url(z, y):
    """Add a column with each slide's file path to the global ``df``.

    Parameters
    ----------
    z : str
        Directory holding the slide files; a file belongs to an image when the
        part of its name before the first ``.`` equals the ``image_id``.
    y : str
        Name of the column to create (or overwrite) on ``df``.

    Returns
    -------
    pandas.DataFrame
        The global ``df`` itself, with column ``y`` filled in. Rows without a
        matching file get an empty string; several matches are space-joined.
    """
    # Index the directory once so the lookup is aligned with df's row order
    # (the listing order of os.listdir is unrelated to the order of df).
    files_by_id = {}
    for file_name in os.listdir(z):
        files_by_id.setdefault(file_name.split('.')[0], []).append(os.path.join(z, file_name))

    df[y] = [" ".join(files_by_id.get(str(image_id), [])) for image_id in df['image_id']]
    return df

In [None]:
df = merge_url('../input/prostate-cancer-grade-assessment/train_images/', 'image_url')

In [None]:
def merge_mask(mask_dir='../input/prostate-cancer-grade-assessment/train_label_masks/'):
    """Pair every label-mask file with the matching row of the global ``df``.

    A mask matches a row when the part of its file name before the first ``.``
    and the first ``_`` equals the row's ``image_id``.

    Parameters
    ----------
    mask_dir : str
        Directory that contains the mask files.

    Returns
    -------
    pandas.DataFrame
        One row per (mask file, matching ``df`` row) pair, in directory-listing
        order, with columns ``image_url``, ``mask_url``, ``image_id``,
        ``data_provider``, ``isup_grade`` and ``gleason_score``.
    """
    columns = ['image_url', 'mask_url', 'image_id', 'data_provider', 'isup_grade', 'gleason_score']

    # Group the rows of df by image id once instead of rescanning df per mask.
    rows_by_id = {}
    for url, image_id, provider, grade, gleason in zip(
            df['image_url'], df['image_id'], df['data_provider'], df['isup_grade'], df['gleason_score']):
        rows_by_id.setdefault(str(image_id), []).append((url, image_id, provider, grade, gleason))

    # Collect plain records and build the frame in one go; growing a DataFrame
    # row by row is quadratic and DataFrame.append no longer exists in pandas 2.
    records = []
    for mask_name in os.listdir(mask_dir):
        mask_id = mask_name.split('.')[0].split('_')[0]
        for url, image_id, provider, grade, gleason in rows_by_id.get(str(mask_id), []):
            records.append({
                'image_url': url,
                'mask_url': os.path.join(mask_dir, mask_name),
                'image_id': image_id,
                'data_provider': provider,
                'isup_grade': grade,
                'gleason_score': gleason,
            })

    return pd.DataFrame(records, columns=columns)

In [None]:
df_mask = merge_mask()

In [None]:
df_mask

In [None]:
df_mask.loc[(df_mask['isup_grade'] == 2) & (df_mask['gleason_score'] == '4+3')]

In [None]:
# Relabel the rows displayed in the previous cell (ISUP 2 recorded together with
# Gleason 4+3) as ISUP 3. Selecting by condition instead of a fixed row label keeps
# the fix valid when the row order of df_mask changes between runs.
df_mask.loc[(df_mask['isup_grade'] == 2) & (df_mask['gleason_score'] == '4+3'), 'isup_grade'] = 3

In [None]:
# Spell the 'negative' Gleason score as '0+0' so the column uses one notation.
df_mask['gleason_score'] = df_mask['gleason_score'].replace('negative', '0+0')

In [None]:
# One sub-frame per ISUP grade (0-5), followed by the size of each class.
isup_0, isup_1, isup_2, isup_3, isup_4, isup_5 = (
    df_mask[df_mask.isup_grade == grade] for grade in range(6)
)

print(f'isup_0: {len(isup_0)}, isup_1: {len(isup_1)}, isup_2: {len(isup_2)}, isup_3: {len(isup_3)}, isup_4: {len(isup_4)}, isup_5: {len(isup_5)}')

In [None]:
# Draw the same number of slides from every ISUP grade to get a class-balanced frame.
# The seed makes the subset identical on every Restart & Run All.
SAMPLES_PER_GRADE = 1215
SAMPLE_SEED = 42

isup_sam0 = isup_0.sample(n=SAMPLES_PER_GRADE, random_state=SAMPLE_SEED)
isup_sam1 = isup_1.sample(n=SAMPLES_PER_GRADE, random_state=SAMPLE_SEED)
isup_sam2 = isup_2.sample(n=SAMPLES_PER_GRADE, random_state=SAMPLE_SEED)
isup_sam3 = isup_3.sample(n=SAMPLES_PER_GRADE, random_state=SAMPLE_SEED)
isup_sam4 = isup_4.sample(n=SAMPLES_PER_GRADE, random_state=SAMPLE_SEED)
isup_sam5 = isup_5.sample(n=SAMPLES_PER_GRADE, random_state=SAMPLE_SEED)

frames = [isup_sam0, isup_sam1, isup_sam2, isup_sam3, isup_sam4, isup_sam5]
balanced_df = pd.concat(frames)


In [None]:
def overlay_mask_on_slide(df, center='radboud', alpha=0.8, max_size=(800, 800)):
    """Render the label mask of the first row of ``df`` on top of its slide.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first column is the slide path and whose second column is
        the mask path. Only the first row is rendered.
    center : {'radboud', 'karolinska'}
        Selects the label palette and the label value below which the slide
        stays visible.
    alpha : float
        Mask opacity in the range 0-1.
    max_size : tuple of int
        Maximum (width, height) of the returned thumbnail.

    Returns
    -------
    PIL.Image.Image or None
        The overlay image, or ``None`` when ``df`` has no rows.

    Raises
    ------
    ValueError
        If ``center`` is not one of the two supported names.
    """
    if center not in ('radboud', 'karolinska'):
        raise ValueError(f"center must be 'radboud' or 'karolinska', got {center!r}")
    if len(df) == 0:
        return None

    first_row = df.iloc[0]
    slide_path, mask_path = first_row.iloc[0], first_row.iloc[1]

    slide = openslide.OpenSlide(slide_path)
    try:
        mask = openslide.OpenSlide(mask_path)
        try:
            # Read the lowest-resolution level of both pyramids.
            slide_data = slide.read_region((0, 0), slide.level_count - 1, slide.level_dimensions[-1])
            mask_data = mask.read_region((0, 0), mask.level_count - 1, mask.level_dimensions[-1])
        finally:
            mask.close()
    finally:
        slide.close()

    # The label values are read from the first channel of the mask image.
    mask_data = mask_data.split()[0]

    # Create alpha mask: labels below the threshold show the slide at strength
    # `alpha`, all other labels show mostly the coloured mask.
    alpha_int = int(round(255 * alpha))
    threshold = 2 if center == 'radboud' else 1
    alpha_content = np.less(np.asarray(mask_data), threshold).astype('uint8') * alpha_int + (255 - alpha_int)
    alpha_content = PIL.Image.fromarray(alpha_content)

    preview_palette = np.zeros(shape=768, dtype=int)
    if center == 'radboud':
        # Mapping: {0: background, 1: stroma, 2: benign epithelium, 3: Gleason 3, 4: Gleason 4, 5: Gleason 5}
        preview_palette[0:18] = (np.array([0, 0, 0, 0.5, 0.5, 0.5, 0, 1, 0, 1, 1, 0.7, 1, 0.5, 0, 1, 0, 0]) * 255).astype(int)
    else:
        # Mapping: {0: background, 1: benign, 2: cancer}
        preview_palette[0:9] = (np.array([0, 0, 0, 0, 1, 0, 1, 0, 0]) * 255).astype(int)

    mask_data.putpalette(data=preview_palette.tolist())
    mask_rgb = mask_data.convert(mode='RGB')
    overlayed_image = PIL.Image.composite(image1=slide_data, image2=mask_rgb, mask=alpha_content)
    overlayed_image.thumbnail(size=max_size, resample=0)

    return overlayed_image

In [None]:
DEBUG = True
import os
import sys
sys.path = [
    '../input/efficientnet-pytorch/EfficientNet-PyTorch/EfficientNet-PyTorch-master',
] + sys.path

In [None]:
import time
import skimage.io
import numpy as np
import pandas as pd
import cv2
import PIL.Image
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.sampler import SubsetRandomSampler, RandomSampler, SequentialSampler
from warmup_scheduler import GradualWarmupScheduler
from efficientnet_pytorch import model as enet
import albumentations
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
from sklearn.metrics import cohen_kappa_score
from tqdm import tqdm_notebook as tqdm

from efficientnet_pytorch import EfficientNet

In [None]:
data_dir = '../input/prostate-cancer-grade-assessment'
# Work on a copy with a fresh 0..n-1 index: the fold assignment below addresses rows
# through .loc with the positions returned by StratifiedKFold, and the copy keeps
# balanced_df free of the extra 'fold' column.
df_train = balanced_df.reset_index(drop=True)
image_folder = os.path.join(data_dir, 'train_images')

# Both names are needed by later cells (checkpoint/log file names, model selection).
kernel_type = 'how_to_train_effnet_b0_to_get_LB_0.86'
enet_type = 'efficientnet-b0'

fold = 0              # fold used for validation
tile_size = 256       # side of the square tiles cut from a slide
image_size = 256      # side of one tile slot in the assembled input image
n_tiles = 36          # tiles kept per slide
batch_size = 2
num_workers = 4
out_dim = 5           # number of ordinal output bins
init_lr = 3e-4
warmup_factor = 10

warmup_epo = 1
n_epochs = 1 if DEBUG else 30
# Seeded so that the debug subset is the same on every run.
df_train = df_train.sample(100, random_state=42).reset_index(drop=True) if DEBUG else df_train

# Fall back to the CPU when no GPU is visible instead of failing at the first .to(device).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(image_folder)

In [None]:
# Give every row a validation-fold id (0-4) from a stratified 5-fold split on isup_grade.
skf = StratifiedKFold(5, shuffle=True, random_state=42)
df_train['fold'] = -1  # placeholder value, overwritten for every row in the loop below
for i, (train_idx, valid_idx) in enumerate(skf.split(df_train, df_train['isup_grade'])):
    # valid_idx comes from the splitter but is used as row labels through .loc.
    # NOTE(review): presumably relies on df_train having a default 0..n-1 index — confirm.
    df_train.loc[valid_idx, 'fold'] = i
df_train.head()

In [None]:
pretrained_model = {
    'efficientnet-b0': '../input/efficientnet-pytorch/efficientnet-b0-08094119.pth'
}

In [None]:
class enetv2(nn.Module):
    """EfficientNet backbone whose classifier is replaced by a new linear head.

    ``backbone`` selects both the architecture and the entry of the global
    ``pretrained_model`` dict from which the weights are loaded; ``out_dim`` is
    the number of outputs of the new head.
    """

    def __init__(self, backbone, out_dim):
        super().__init__()
        network = enet.EfficientNet.from_name(backbone)
        network.load_state_dict(torch.load(pretrained_model[backbone]))
        self.enet = network

        # Size the new head from the original classifier before replacing it.
        feature_width = network._fc.in_features
        self.myfc = nn.Linear(feature_width, out_dim)
        self.enet._fc = nn.Identity()

    def extract(self, x):
        """Return the backbone output for ``x`` (its own classifier is an identity)."""
        return self.enet(x)

    def forward(self, x):
        """Run the backbone and then the new linear head."""
        return self.myfc(self.extract(x))

In [None]:
def get_tiles(img, mode=0):
    """Cut ``img`` into square tiles and keep the ``n_tiles`` darkest ones.

    The image is padded with 255 so both sides become multiples of the global
    ``tile_size`` (plus half a tile per unit of ``mode``), split into
    ``tile_size`` x ``tile_size`` x 3 tiles and sorted by ascending pixel sum.

    Returns a pair: a list of ``{'img': tile, 'idx': rank}`` dicts of length
    ``n_tiles`` (padded with all-255 tiles when the image yields fewer), and a
    flag telling whether at least ``n_tiles`` tiles were not entirely 255.
    """
    height, width, _ = img.shape
    shift = (tile_size * mode) // 2
    pad_h = (tile_size - height % tile_size) % tile_size + shift
    pad_w = (tile_size - width % tile_size) % tile_size + shift
    top, left = pad_h // 2, pad_w // 2

    padded = np.pad(img, [[top, pad_h - top], [left, pad_w - left], [0, 0]], constant_values=255)
    n_rows = padded.shape[0] // tile_size
    n_cols = padded.shape[1] // tile_size
    grid = padded.reshape(n_rows, tile_size, n_cols, tile_size, 3)
    tiles = grid.transpose(0, 2, 1, 3, 4).reshape(-1, tile_size, tile_size, 3)

    # A tile whose pixel sum reaches this value consists of 255 only.
    blank_sum = tile_size ** 2 * 3 * 255
    n_tiles_with_info = (tiles.reshape(tiles.shape[0], -1).sum(1) < blank_sum).sum()

    if len(tiles) < n_tiles:
        tiles = np.pad(tiles, [[0, n_tiles - len(tiles)], [0, 0], [0, 0], [0, 0]], constant_values=255)
    order = np.argsort(tiles.reshape(tiles.shape[0], -1).sum(-1))[:n_tiles]
    tiles = tiles[order]

    result = [{'img': tile, 'idx': rank} for rank, tile in enumerate(tiles)]
    return result, n_tiles_with_info >= n_tiles


class PANDADataset(Dataset):
    """Dataset that turns one slide into a square mosaic of its selected tiles.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``image_id`` and ``isup_grade``.
    image_size : int
        Side of one tile slot in the mosaic. Tiles come from ``get_tiles`` with
        side ``tile_size``, so the two values have to be equal.
    n_tiles : int
        Number of slots in the mosaic; ``int(sqrt(n_tiles))`` slots per side.
        ``get_tiles`` itself selects tiles using the global ``n_tiles``.
    tile_mode : int
        Forwarded to ``get_tiles`` as ``mode``.
    rand : bool
        Shuffle the order in which tiles are placed in the mosaic.
    transform : callable or None
        Albumentations-style transform, applied to every tile and then once
        more to the assembled mosaic.
    """

    def __init__(self,
                 df,
                 image_size,
                 n_tiles=n_tiles,
                 tile_mode=0,
                 rand=False,
                 transform=None,
                ):

        self.df = df.reset_index(drop=True)
        self.image_size = image_size
        self.n_tiles = n_tiles
        self.tile_mode = tile_mode
        self.rand = rand
        self.transform = transform

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, index):
        """Return ``(image, label)``: a float32 CHW tensor scaled by 1/255 and a
        length-5 float tensor whose first ``isup_grade`` entries are 1."""
        row = self.df.iloc[index]
        img_id = row.image_id

        tiff_file = os.path.join(image_folder, f'{img_id}.tiff')
        image = skimage.io.MultiImage(tiff_file)[-2]
        tiles, OK = get_tiles(image, self.tile_mode)

        if self.rand:
            idxes = np.random.choice(list(range(self.n_tiles)), self.n_tiles, replace=False)
        else:
            idxes = list(range(self.n_tiles))

        # Use the size this dataset was built with everywhere, not the notebook global.
        size = self.image_size
        n_row_tiles = int(np.sqrt(self.n_tiles))
        images = np.zeros((size * n_row_tiles, size * n_row_tiles, 3))
        for h in range(n_row_tiles):
            for w in range(n_row_tiles):
                i = h * n_row_tiles + w

                if len(tiles) > idxes[i]:
                    this_img = tiles[idxes[i]]['img']
                else:
                    this_img = np.ones((size, size, 3)).astype(np.uint8) * 255
                # Invert so that the 255 padding becomes 0.
                this_img = 255 - this_img
                if self.transform is not None:
                    this_img = self.transform(image=this_img)['image']
                h1 = h * size
                w1 = w * size
                images[h1:h1+size, w1:w1+size] = this_img

        if self.transform is not None:
            images = self.transform(image=images)['image']
        images = images.astype(np.float32)
        images /= 255
        images = images.transpose(2, 0, 1)

        # Ordinal encoding: grade g -> the first g entries set to 1. The cast keeps the
        # slice valid when the grade column holds floats or NumPy scalars.
        label = np.zeros(5).astype(np.float32)
        label[:int(row.isup_grade)] = 1.
        return torch.tensor(images), torch.tensor(label)

In [None]:
transforms_train = albumentations.Compose([
    albumentations.Transpose(p=0.5),
    albumentations.VerticalFlip(p=0.5),
    albumentations.HorizontalFlip(p=0.5),
])
transforms_val = albumentations.Compose([])

In [None]:
dataset_show = PANDADataset(df_train, image_size, n_tiles, 0, transform=transforms_train)
from pylab import rcParams
rcParams['figure.figsize'] = 20,10
for i in range(2):
    f, axarr = plt.subplots(1,5)
    for p in range(5):
        idx = np.random.randint(0, len(dataset_show))
        img, label = dataset_show[idx]
        axarr[p].imshow(1. - img.transpose(0, 1).transpose(1,2).squeeze())
        axarr[p].set_title(str(sum(label)))

In [None]:
criterion = nn.BCEWithLogitsLoss()

In [None]:
def train_epoch(loader, optimizer):
    """Train the global ``model`` for one pass over ``loader``.

    Uses the global ``criterion`` and ``device``. Returns the list of per-batch
    loss values; the progress bar shows the latest loss and the mean of the
    most recent 100.
    """
    model.train()
    train_loss = []
    progress = tqdm(loader)
    for batch, labels in progress:
        batch = batch.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        batch_loss = criterion(model(batch), labels)
        batch_loss.backward()
        optimizer.step()

        loss_value = batch_loss.detach().cpu().numpy()
        train_loss.append(loss_value)
        recent = train_loss[-100:]
        running_mean = sum(recent) / min(len(train_loss), 100)
        progress.set_description('loss: %.5f, smth: %.5f' % (loss_value, running_mean))
    return train_loss


def val_epoch(loader, get_output=False):
    """Evaluate the global ``model`` on ``loader``.

    Predictions are the rounded sum of the sigmoid outputs; targets are the sum
    of the label vector. Prints the quadratic-weighted kappa overall and per
    data provider (read from the global ``df_valid``).

    Returns the raw logits when ``get_output`` is true, otherwise the tuple
    ``(mean validation loss, accuracy in percent, overall kappa)``.
    """
    model.eval()
    batch_losses = []
    logit_chunks, pred_chunks, target_chunks = [], [], []

    with torch.no_grad():
        for batch, labels in tqdm(loader):
            batch = batch.to(device)
            labels = labels.to(device)
            batch_logits = model(batch)

            batch_losses.append(criterion(batch_logits, labels).detach().cpu().numpy())
            logit_chunks.append(batch_logits)
            pred_chunks.append(batch_logits.sigmoid().sum(1).detach().round())
            target_chunks.append(labels.sum(1))
    val_loss = np.mean(batch_losses)

    LOGITS = torch.cat(logit_chunks).cpu().numpy()
    PREDS = torch.cat(pred_chunks).cpu().numpy()
    TARGETS = torch.cat(target_chunks).cpu().numpy()
    acc = (PREDS == TARGETS).mean() * 100.

    is_karolinska = df_valid['data_provider'] == 'karolinska'
    is_radboud = df_valid['data_provider'] == 'radboud'
    qwk = cohen_kappa_score(PREDS, TARGETS, weights='quadratic')
    qwk_k = cohen_kappa_score(PREDS[is_karolinska], df_valid[is_karolinska].isup_grade.values, weights='quadratic')
    qwk_r = cohen_kappa_score(PREDS[is_radboud], df_valid[is_radboud].isup_grade.values, weights='quadratic')
    print('qwk', qwk, 'qwk_k', qwk_k, 'qwk_r', qwk_r)

    if get_output:
        return LOGITS
    return val_loss, acc, qwk

In [None]:
# Rows of the selected fold form the validation set, all other rows the training set.
train_idx = np.where((df_train['fold'] != fold))[0]
valid_idx = np.where((df_train['fold'] == fold))[0]

df_this  = df_train.loc[train_idx]
df_valid = df_train.loc[valid_idx]

dataset_train = PANDADataset(df_this , image_size, n_tiles, transform=transforms_train)
dataset_valid = PANDADataset(df_valid, image_size, n_tiles, transform=transforms_val)

train_loader = torch.utils.data.DataLoader(dataset_train, batch_size=batch_size, sampler=RandomSampler(dataset_train), num_workers=num_workers)
valid_loader = torch.utils.data.DataLoader(dataset_valid, batch_size=batch_size, sampler=SequentialSampler(dataset_valid), num_workers=num_workers)

# The head must have out_dim outputs: the dataset yields 5-element targets and
# BCEWithLogitsLoss requires logits and targets of the same shape. The default
# pretrained head has 1000 outputs.
model = EfficientNet.from_pretrained('efficientnet-b0', num_classes=out_dim)
model = model.to(device)

# Start at init_lr / warmup_factor; the warm-up scheduler multiplies the rate by
# warmup_factor over warmup_epo epochs and then hands over to cosine annealing.
optimizer = optim.Adam(model.parameters(), lr=init_lr/warmup_factor)
scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, n_epochs-warmup_epo)
scheduler = GradualWarmupScheduler(optimizer, multiplier=warmup_factor, total_epoch=warmup_epo, after_scheduler=scheduler_cosine)

print(len(dataset_train), len(dataset_valid))

In [None]:
# Train for n_epochs, log one line per epoch and keep the weights with the best validation kappa.
qwk_max = 0.
# NOTE(review): kernel_type is only assigned in a later cell of this notebook — confirm it is
# defined before this cell runs on a fresh kernel.
best_file = f'{kernel_type}_best_fold{fold}.pth'
for epoch in range(1, n_epochs+1):
    print(time.ctime(), 'Epoch:', epoch)
    # The scheduler is stepped with the 0-based epoch number before the epoch is trained.
    scheduler.step(epoch-1)

    train_loss = train_epoch(train_loader, optimizer)
    val_loss, acc, qwk = val_epoch(valid_loader)

    content = time.ctime() + ' ' + f'Epoch {epoch}, lr: {optimizer.param_groups[0]["lr"]:.7f}, train loss: {np.mean(train_loss):.5f}, val loss: {np.mean(val_loss):.5f}, acc: {(acc):.5f}, qwk: {(qwk):.5f}'
    print(content)
    # Append, so earlier runs of this cell stay in the log file.
    with open(f'log_{kernel_type}.txt', 'a') as appender:
        appender.write(content + '\n')

    # Checkpoint only when the validation kappa improves on the best value so far.
    if qwk > qwk_max:
        print('score2 ({:.6f} --> {:.6f}).  Saving model ...'.format(qwk_max, qwk))
        torch.save(model.state_dict(), best_file)
        qwk_max = qwk

# Always keep the weights of the last epoch as well.
torch.save(model.state_dict(), os.path.join(f'{kernel_type}_final_fold{fold}.pth'))

In [None]:
# def train_validate_test_split(df, train_percent=.8, validate_percent=.15, seed=None):
#     np.random.seed(seed)
#     perm = np.random.permutation(df.index)
#     m = len(df.index)
#     train_end = int(train_percent * m)
#     validate_end = int(validate_percent * m) + train_end
#     train = df.iloc[perm[:train_end]]
#     validate = df.iloc[perm[train_end:validate_end]]
#     test = df.iloc[perm[validate_end:]]
#     return train, validate, test

In [None]:
# def save_folder(df):
#     save_mask_dir = '/kaggle/pc_detection/CoAtNet/'
#     os.makedirs(save_mask_dir, exist_ok=True)
#     for i, img_id in tqdm(enumerate(df.image_id)):
#         src_dir = BASE_FOLDER + "train_images"
#         dst_dir = save_mask_dir + "df"
#         for jpgfile in glob.iglob(os.path.join(src_dir, img_id+".tiff")):
#             shutil.copy(jpgfile, dst_dir)

In [None]:
import os
import sys
import time
import skimage.io
import numpy as np
import pandas as pd
import cv2
import PIL.Image
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.sampler import SubsetRandomSampler, RandomSampler, SequentialSampler
from warmup_scheduler import GradualWarmupScheduler
from efficientnet_pytorch import model as enet
import albumentations
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
from sklearn.metrics import cohen_kappa_score
from tqdm import tqdm
import pickle
from PIL import Image
from skimage.io import imread
import skimage.feature as skfeature
import sklearn
import os
from efficientnet_pytorch import EfficientNet

# The GPU id is taken from the first command-line argument and exported before CUDA is used.
# NOTE(review): this looks like a standalone script pasted into a cell; under a Jupyter
# kernel sys.argv[1] is presumably a launcher option rather than a GPU id — confirm.
if len(sys.argv) == 1:
    print('Specify GPU via parameter.')
    exit(1)

os.environ['CUDA_VISIBLE_DEVICES'] = sys.argv[1]

# Input tables and the slide folder.
data_dir = '../input/prostate-cancer-grade-assessment'
df_train = pd.read_csv(os.path.join(data_dir, 'train.csv'))
df_test = pd.read_csv(os.path.join(data_dir, 'test.csv'))
df_sub = pd.read_csv(os.path.join(data_dir, 'sample_submission.csv'))
image_folder = os.path.join(data_dir, 'train_images')

# Prefix of the log file and of the final checkpoint written by the training loop.
kernel_type = 'model'

enet_type = 'efficientnet-b0'
fold = 0  # fold used for validation
tile_size = 256  # side of the square tiles cut from a slide
image_size = 256  # side of one tile slot in the assembled input image
n_tiles = 36  # tiles kept per slide
batch_size = 2
num_workers = 4
out_dim = 5
init_lr = 3e-4
warmup_factor = 10
warmup_epo = 1
n_epochs = 5 
# Unseeded 300-row subsample: the selected rows differ from run to run.
df_train = df_train.sample(300).reset_index(drop=True)
print(df_train.shape)
eps = np.finfo(np.float32).eps

device = torch.device('cuda')

print(image_folder)

# Keep only rows whose slide file is present in image_folder
# (file name up to the first '.' equals image_id).
raw_image_ids = [s[:s.find('.')] for s in os.listdir(image_folder)]
df_train = df_train[df_train['image_id'].isin(raw_image_ids)].reset_index(drop=True)
print(df_train.shape)

# Stratified 5-fold split on isup_grade; 'fold' stores the fold in which a row is validated.
skf = StratifiedKFold(5, shuffle=True, random_state=42)
df_train['fold'] = -1
for i, (train_idx, valid_idx) in enumerate(skf.split(df_train, df_train['isup_grade'])):
    df_train.loc[valid_idx, 'fold'] = i

# Local weight file per backbone name, read by enetv2.
pretrained_model = {
    'efficientnet-b0': 'efficientnet-b0-08094119.pth'
}

class enetv2(nn.Module):
    """EfficientNet feature extractor followed by a freshly initialised linear layer.

    Weights for ``backbone`` are read from the path stored in the global
    ``pretrained_model`` dict; the network's own classifier becomes an identity
    and ``myfc`` maps its input features to ``out_dim`` outputs.
    """

    def __init__(self, backbone, out_dim):
        super().__init__()
        weights = torch.load(pretrained_model[backbone]) if False else None
        self.enet = enet.EfficientNet.from_name(backbone)
        weights = torch.load(pretrained_model[backbone])
        self.enet.load_state_dict(weights)

        head_inputs = self.enet._fc.in_features
        self.myfc = nn.Linear(head_inputs, out_dim)
        self.enet._fc = nn.Identity()

    def extract(self, x):
        """Backbone output for ``x``."""
        features = self.enet(x)
        return features

    def forward(self, x):
        """Backbone output passed through the linear head."""
        features = self.extract(x)
        return self.myfc(features)

def get_tiles(image_id, mode=0):
    """Load the last page of a slide and return its ``n_tiles`` darkest tiles.

    The slide ``<image_folder>/<image_id>.tiff`` is padded with 255 to a
    multiple of the global ``tile_size`` (plus half a tile per unit of
    ``mode``), cut into square tiles and sorted by ascending pixel sum.

    Returns a list of ``{'img': tile, 'idx': rank}`` dicts of length
    ``n_tiles`` and a flag telling whether at least ``n_tiles`` tiles were not
    entirely 255.
    """
    slide_path = os.path.join(image_folder, f'{image_id}.tiff')
    img = skimage.io.MultiImage(slide_path)[-1]

    height, width, _ = img.shape
    extra = (tile_size * mode) // 2
    pad_h = (tile_size - height % tile_size) % tile_size + extra
    pad_w = (tile_size - width % tile_size) % tile_size + extra
    padding = [[pad_h // 2, pad_h - pad_h // 2], [pad_w // 2, pad_w - pad_w // 2], [0, 0]]
    padded = np.pad(img, padding, constant_values=255)

    blocks = padded.reshape(
        padded.shape[0] // tile_size, tile_size,
        padded.shape[1] // tile_size, tile_size,
        3,
    )
    tiles = blocks.transpose(0, 2, 1, 3, 4).reshape(-1, tile_size, tile_size, 3)

    # Tiles whose pixel sum stays below the all-255 sum carry some content.
    tile_sums = tiles.reshape(tiles.shape[0], -1).sum(1)
    n_tiles_with_info = (tile_sums < tile_size ** 2 * 3 * 255).sum()

    if len(tiles) < n_tiles:
        missing = n_tiles - len(tiles)
        tiles = np.pad(tiles, [[0, missing], [0, 0], [0, 0], [0, 0]], mode='constant', constant_values=255)
    darkest_first = np.argsort(tiles.reshape(tiles.shape[0], -1).sum(-1))[:n_tiles]
    tiles = tiles[darkest_first]

    result = [{'img': tiles[rank], 'idx': rank} for rank in range(len(tiles))]
    return result, n_tiles_with_info >= n_tiles

class PANDADataset(Dataset):
    """Dataset that turns one slide into a square mosaic of its selected tiles.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``image_id`` and ``isup_grade``.
    image_size : int
        Side of one tile slot in the mosaic. Tiles come from ``get_tiles`` with
        side ``tile_size``, so the two values have to be equal.
    n_tiles : int
        Number of slots in the mosaic; ``int(sqrt(n_tiles))`` slots per side.
        ``get_tiles`` itself selects tiles using the global ``n_tiles``.
    tile_mode : int
        Forwarded to ``get_tiles`` as ``mode``.
    rand : bool
        Shuffle the order in which tiles are placed in the mosaic.
    transform : callable or None
        Albumentations-style transform, applied to every tile and then once
        more to the assembled mosaic.
    """

    def __init__(self,
                 df,
                 image_size,
                 n_tiles=n_tiles,
                 tile_mode=0,
                 rand=False,
                 transform=None,
                ):

        self.df = df.reset_index(drop=True)
        self.image_size = image_size
        self.n_tiles = n_tiles
        self.tile_mode = tile_mode
        self.rand = rand
        self.transform = transform

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, index):
        """Return ``(image, label)``: a float32 CHW tensor scaled by 1/255 and a
        length-5 float tensor whose first ``isup_grade`` entries are 1."""
        row = self.df.iloc[index]
        img_id = row.image_id

        tiles, OK = get_tiles(img_id, self.tile_mode)

        if self.rand:
            idxes = np.random.choice(list(range(self.n_tiles)), self.n_tiles, replace=False)
        else:
            idxes = list(range(self.n_tiles))

        # Use the size this dataset was built with everywhere, not the module-level global.
        size = self.image_size
        n_row_tiles = int(np.sqrt(self.n_tiles))
        images = np.zeros((size * n_row_tiles, size * n_row_tiles, 3))
        for h in range(n_row_tiles):
            for w in range(n_row_tiles):
                i = h * n_row_tiles + w
                h1 = h * size
                w1 = w * size
                if len(tiles) > idxes[i]:
                    this_img = tiles[idxes[i]]['img']
                else:
                    this_img = np.ones((size, size, 3)).astype(np.uint8) * 255
                # Invert so that the 255 padding becomes 0.
                this_img = 255 - this_img
                if self.transform is not None:
                    this_img = self.transform(image=this_img)['image']

                images[h1:h1+size, w1:w1+size] = this_img

        if self.transform is not None:
            images = self.transform(image=images)['image']
        images = images.astype(np.float32)
        images /= 255
        images = images.transpose(2, 0, 1)

        # Ordinal encoding: grade g -> the first g entries set to 1. The cast keeps the
        # slice valid when the grade column holds floats or NumPy scalars.
        label = np.zeros(5).astype(np.float32)
        label[:int(row.isup_grade)] = 1.
        return torch.tensor(images), torch.tensor(label)


transforms_train = albumentations.Compose([
    albumentations.Transpose(p=0.5),
    albumentations.VerticalFlip(p=0.5),
    albumentations.HorizontalFlip(p=0.5),
])
transforms_val = albumentations.Compose([])

criterion = nn.BCEWithLogitsLoss()
def train_epoch(loader, optimizer):
    """Run one optimisation pass of the global ``model`` over ``loader``.

    Every batch is moved to the global ``device`` and scored with the global
    ``criterion``. The per-batch losses are returned as a list; the progress
    bar reports the current loss and the average of the last 100 batches.
    """
    model.train()
    train_loss = []
    bar = tqdm(loader)
    for inputs, targets in bar:
        inputs, targets = inputs.to(device), targets.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        step_loss = criterion(outputs, targets)
        step_loss.backward()
        optimizer.step()

        step_loss_np = step_loss.detach().cpu().numpy()
        train_loss.append(step_loss_np)
        window = min(len(train_loss), 100)
        smoothed = sum(train_loss[-100:]) / window
        bar.set_description('loss: %.5f, smth: %.5f' % (step_loss_np, smoothed))
    return train_loss


def val_epoch(loader, confusion_matrix=False, get_output=False):
    """Evaluate the global ``model`` on ``loader``.

    Predictions are the rounded sum of the sigmoid outputs; targets are the sum
    of the label vector. Prints the quadratic-weighted kappa overall and per
    data provider (read from the global ``df_valid``).

    Parameters
    ----------
    loader : iterable
        Yields ``(data, target)`` batches.
    confusion_matrix : bool
        Accepted for compatibility with existing callers; currently not used.
    get_output : bool
        Return the raw logits instead of the metrics.

    Returns
    -------
    numpy.ndarray or tuple
        The logits when ``get_output`` is true, otherwise
        ``(mean validation loss, accuracy as a fraction, overall kappa)``.
    """
    model.eval()
    val_loss = []
    LOGITS = []
    PREDS = []
    TARGETS = []

    with torch.no_grad():
        for (data, target) in tqdm(loader):
            data, target = data.to(device), target.to(device)
            logits = model(data)

            loss = criterion(logits, target)

            pred = logits.sigmoid().sum(1).detach().round()
            LOGITS.append(logits)
            PREDS.append(pred)
            TARGETS.append(target.sum(1))

            val_loss.append(loss.detach().cpu().numpy())
        val_loss = np.mean(val_loss)

    LOGITS = torch.cat(LOGITS).cpu().numpy()
    PREDS = torch.cat(PREDS).cpu().numpy()
    TARGETS = torch.cat(TARGETS).cpu().numpy()

    # PREDS already holds the rounded predictions; the clip keeps them in the grade range 0-5.
    acc = (np.clip(PREDS, 0, 5) == TARGETS).mean()

    qwk = cohen_kappa_score(PREDS, TARGETS, weights='quadratic')
    qwk_k = cohen_kappa_score(PREDS[df_valid['data_provider'] == 'karolinska'], df_valid[df_valid['data_provider'] == 'karolinska'].isup_grade.values, weights='quadratic')
    qwk_r = cohen_kappa_score(PREDS[df_valid['data_provider'] == 'radboud'], df_valid[df_valid['data_provider'] == 'radboud'].isup_grade.values, weights='quadratic')
    print('qwk', qwk, 'qwk_k', qwk_k, 'qwk_r', qwk_r)

    if get_output:
        return LOGITS
    else:
        return val_loss, acc, qwk

# Rows of the selected fold form the validation set, all other rows the training set.
train_idx = np.where((df_train['fold'] != fold))[0]
valid_idx = np.where((df_train['fold'] == fold))[0]

df_this  = df_train.loc[train_idx]
df_valid = df_train.loc[valid_idx]

dataset_train = PANDADataset(df_this , image_size, n_tiles, transform=transforms_train)
dataset_valid = PANDADataset(df_valid, image_size, n_tiles, transform=transforms_val)

train_loader = torch.utils.data.DataLoader(dataset_train, batch_size=batch_size, sampler=RandomSampler(dataset_train), num_workers=num_workers)
valid_loader = torch.utils.data.DataLoader(dataset_valid, batch_size=batch_size, sampler=SequentialSampler(dataset_valid), num_workers=num_workers)

# The head must have out_dim outputs: the dataset yields 5-element targets and
# BCEWithLogitsLoss requires logits and targets of the same shape. The default
# pretrained head has 1000 outputs.
model = EfficientNet.from_pretrained('efficientnet-b0', num_classes=out_dim)
# model = nn.DataParallel(model)
model = model.to(device)
# Start at init_lr / warmup_factor; the warm-up scheduler multiplies the rate by
# warmup_factor over warmup_epo epochs and then hands over to cosine annealing.
optimizer = optim.Adam(model.parameters(), lr=init_lr/warmup_factor)
scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, n_epochs-warmup_epo)
scheduler = GradualWarmupScheduler(optimizer, multiplier=warmup_factor, total_epoch=warmup_epo, after_scheduler=scheduler_cosine)

print(len(dataset_train), len(dataset_valid))

# Train for n_epochs, log one line per epoch and save a checkpoint whenever the validation kappa improves.
qwk_max = 0.
# Filled with (first command-line argument, kappa) when a checkpoint is written,
# so every improvement ends up in its own file.
best_file = 'model_{}_{}.pth'
for epoch in range(1, n_epochs+1):
    print(time.ctime(), 'Epoch:', epoch)
    # The scheduler is stepped with the 0-based epoch number before the epoch is trained.
    scheduler.step(epoch-1)

    train_loss = train_epoch(train_loader, optimizer)
    val_loss, acc, qwk = val_epoch(valid_loader, confusion_matrix=True)

    content = time.ctime() + ' ' + f'Epoch {epoch}, lr: {optimizer.param_groups[0]["lr"]:.7f}, train loss: {np.mean(train_loss):.5f}, val loss: {np.mean(val_loss):.5f}, acc: {(acc):.5f}, qwk: {(qwk):.5f}'
    print(content)
    # Append, so earlier runs stay in the log file.
    with open(f'log_{kernel_type}.txt', 'a') as appender:
        appender.write(content + '\n')

    if qwk > qwk_max:
        print('score2 ({:.6f} --> {:.6f}).  Saving model ...'.format(qwk_max, qwk))
        torch.save(model.state_dict(), best_file.format(sys.argv[1], qwk))
        qwk_max = qwk

# Always keep the weights of the last epoch as well.
torch.save(model.state_dict(), os.path.join(f'{kernel_type}_final_fold{fold}.pth'))

In [None]:
!pip install warmup-scheduler
!pip install efficientnet-pytorch

# Successful Run

In [None]:
import numpy as np
import pandas as pd
import json
import math
import cv2
import PIL
from PIL import Image
import numpy as np
from keras import layers
from tensorflow.keras.applications import DenseNet121
from keras.callbacks import Callback, ModelCheckpoint
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score, accuracy_score
import scipy
import tensorflow as tf
from tqdm import tqdm
%matplotlib inline

In [None]:
import os
# There are two ways to load the data from the PANDA dataset:
# Option 1: Load images using openslide
import openslide
# Option 2: Load images using skimage (requires that tifffile is installed)
import skimage.io
# General packages
from IPython.display import display
# Plotly for the interactive viewer (see last section)
import plotly.graph_objs as go
# read images
import rasterio

import gc
from random import randint

In [None]:
# Pick the TensorFlow distribution strategy that matches the available hardware.
try:
    # The resolver needs no arguments when the TPU_NAME environment variable is set;
    # it raises ValueError when no TPU can be found.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if not tpu:
    # No TPU: the default strategy covers CPU and single-GPU execution.
    strategy = tf.distribute.get_strategy()
else:
    # TPU found: connect to it, initialise it and distribute across its cores.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
BATCH_SIZE = 15
TRAIN_VAL_RATIO = 0.27
EPOCHS = 5
LR = 0.00010409613402110064

In [None]:
train_df = pd.read_csv('../input/prostate-cancer-grade-assessment/train.csv')
test_df = pd.read_csv('../input/prostate-cancer-grade-assessment/test.csv')
print(train_df.shape)
print(test_df.shape)
train_df.head()

In [None]:
def train_validate_test_split(df, train_percent=.8, seed=None):
    """Randomly split ``df`` into a training part and a test part.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split; any index is supported.
    train_percent : float
        Fraction of the rows that goes into the training part.
    seed : int or None
        Seed of the shuffle; ``None`` gives a different split on every call.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)``; together they contain every row exactly once.
    """
    # A private generator keeps the global NumPy random state untouched.
    rng = np.random.RandomState(seed)
    m = len(df.index)
    # Shuffle row positions (not index labels), because the rows are picked with .iloc.
    perm = rng.permutation(m)
    train_end = int(train_percent * m)
    train = df.iloc[perm[:train_end]]
    test = df.iloc[perm[train_end:]]
    return train, test

In [None]:
df_train, df_test = train_validate_test_split(train_df)

In [None]:
def preprocess_image(image_path, desired_size=224):
    """Load a slide as a square ``desired_size`` x ``desired_size`` array.

    The slide's thumbnail (which fits inside the requested box) is resized to
    exactly ``desired_size`` on both sides, so the aspect ratio is not kept.

    Parameters
    ----------
    image_path : str
        Path of a slide file readable by OpenSlide.
    desired_size : int
        Side length of the returned image in pixels.

    Returns
    -------
    numpy.ndarray
        The resized thumbnail as an array.
    """
    biopsy = openslide.OpenSlide(image_path)
    try:
        thumbnail = biopsy.get_thumbnail(size=(desired_size, desired_size))
    finally:
        # Release the file handle even when reading the thumbnail fails.
        biopsy.close()
    return np.array(thumbnail.resize((desired_size, desired_size)))

In [None]:
from tqdm import tqdm

In [None]:
# get the number of training images from the target\id dataset
N = df_train.shape[0]
# create an empty matrix for storing the images
x_train = np.empty((N, 224, 224, 3), dtype=np.uint8)
# loop through the images from the images ids from the target\id dataset
# then grab the cooresponding image from disk, pre-process, and store in matrix in memory
for i, image_id in enumerate(tqdm(df_train['image_id'])):
    x_train[i, :, :, :] = preprocess_image(
        f'../input/prostate-cancer-grade-assessment/train_images/{image_id}.tiff'
    )

In [None]:
if os.path.exists('/kaggle/pc_detection/test_data/'):
    # do the same thing as the last cell but on the test\holdout set
    N = df_test[:3].shape[0]
    x_test = np.empty((N, 224, 224, 3), dtype=np.uint8)
    for i, image_id in enumerate(tqdm(df_test['image_id'][:3])):
        x_test[i, :, :, :] = preprocess_image(
            f'/kaggle/pc_detection/test_data/{image_id}.tiff'
        )
else:
    print("test images not found")

In [None]:
# pre-processing the target (i.e. one-hot encoding the target)
y_train = pd.get_dummies(df_train['isup_grade']).values

print(x_train.shape)
print(y_train.shape)
if os.path.exists('/kaggle/pc_detection/test_data/'):
    print(x_test.shape)
else:
    print("test images not found")

In [None]:
y_train_multi = np.empty(y_train.shape, dtype=y_train.dtype)
y_train_multi[:, 5] = y_train[:, 5]

for i in range(4, -1, -1):
    y_train_multi[:, i] = np.logical_or(y_train[:, i], y_train_multi[:, i+1])

# print("Original y_train:", y_train.sum(axis=0))
# print("Multilabel version:", y_train_multi.sum(axis=0))

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train_multi, 
    test_size=TRAIN_VAL_RATIO, 
    random_state=2020
)

In [None]:
from keras.preprocessing.image import ImageDataGenerator

In [None]:
def create_datagen():
    """Build the augmenting image generator used for training.

    Random zoom of up to 15 %, random horizontal and vertical flips; points
    outside the input boundaries are filled with the constant value 0.
    """
    augmentation = dict(
        zoom_range=0.15,        # range for the random zoom
        fill_mode='constant',   # how points outside the input boundaries are filled
        cval=0.,                # fill value used with fill_mode='constant'
        horizontal_flip=True,   # randomly flip images
        vertical_flip=True,     # randomly flip images
    )
    return ImageDataGenerator(**augmentation)

# Using original generator
data_generator = create_datagen().flow(x_train, y_train, batch_size=BATCH_SIZE, seed=2019)

In [None]:
from tensorflow.keras.applications.densenet import DenseNet121

In [None]:
densenet = DenseNet121(
    weights='../input/input/DenseNet-BC-121-32-no-top.h5',
    include_top=False,
    input_shape=(224,224,3)
)

In [None]:
def build_model(LR=LR):
    """Assemble and compile the DenseNet-based classifier.

    The global ``densenet`` backbone is followed by global average pooling,
    80 % dropout and six sigmoid outputs, trained with binary cross-entropy.

    Parameters
    ----------
    LR : float
        Learning rate of the Adam optimiser. The default is the value the
        global ``LR`` had when this function was defined.

    Returns
    -------
    keras Sequential model, already compiled.
    """
    model = Sequential()
    model.add(densenet)
    model.add(layers.GlobalAveragePooling2D())
    model.add(layers.Dropout(0.80))
    model.add(layers.Dense(6, activation='sigmoid'))

    model.compile(
        loss='binary_crossentropy',
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        optimizer=Adam(learning_rate=LR),
        metrics=['accuracy']
    )

    return model

In [None]:
from keras.models import Sequential
from tensorflow.keras.optimizers import Adam

In [None]:
model = build_model()
model.summary()

In [None]:
# Model.fit accepts generators directly (fit_generator is deprecated). One epoch is one
# full pass over the generator, so the step count is its length — an integer, unlike
# the plain division of sample count by batch size.
history = model.fit(
    data_generator,
    steps_per_epoch=len(data_generator),
    epochs=EPOCHS,
    validation_data=(x_val, y_val)
)

In [None]:
history_df = pd.DataFrame(history.history)
history_df[['loss', 'val_loss']].plot()
history_df[['accuracy', 'val_accuracy']].plot()

In [None]:
BASE_FOLDER = "../input/prostate-cancer-grade-assessment/"
sample = BASE_FOLDER+"sample_submission.csv"

In [None]:
import os

import cv2
import shutil
import glob

In [None]:
def save_folder(df, limit=3):
    """Copy the .tiff slides of the first ``limit`` rows of ``df`` into the scratch test folder.

    Args:
        df: frame with an ``image_id`` column.
        limit: number of leading rows to copy (``None`` copies all rows).

    Returns:
        The destination directory, so callers can keep the path they assign.
    """
    save_mask_dir = '/kaggle/pc_detection/test_data/'
    os.makedirs(save_mask_dir, exist_ok=True)
    src_dir = BASE_FOLDER + "train_images"
    for img_id in tqdm(df.image_id[:limit]):
        # iglob yields nothing when the slide is missing, so absent files are skipped silently.
        for tiff_path in glob.iglob(os.path.join(src_dir, img_id + ".tiff")):
            shutil.copy(tiff_path, save_mask_dir)
    # Print the directory itself: the old loop-local name was unbound for an empty frame.
    print(save_mask_dir)
    return save_mask_dir

In [None]:
# Copy the hold-out slides into the scratch folder and keep whatever save_folder returns.
test_data = save_folder(df_test)

In [None]:
# First three hold-out rows; an explicit copy because a column is added to this frame later on.
df_test1 = df_test[:3].copy()

In [None]:
# Display the three-row hold-out subset.
df_test1

In [None]:
from random import randint

test_img_path = '/kaggle/pc_detection/test_data/'
# Cut-off applied to each of the model's sigmoid outputs before counting them.
DECISION_THRESHOLD = 0.37757874193797547

n_rows = len(df_test1)
if os.path.exists(test_img_path):
    # x_test is built from all of df_test while df_test1 only holds its leading rows,
    # so predict on the matching prefix to keep predictions and rows aligned.
    y_test = model.predict(x_test[:n_rows])
    y_test = y_test > DECISION_THRESHOLD
    # Grade = number of outputs above the threshold, minus one.
    y_test = y_test.astype(int).sum(axis=1) - 1
else:
    # No test images available: fall back to one random grade per row.
    y_test = [randint(0, 5) for _ in range(n_rows)]

# assign() returns a new frame, which avoids writing a column onto a slice of df_test.
df_test1 = df_test1.assign(isup_grade_1=y_test)[["image_id", "isup_grade", "isup_grade_1"]]
df_test1.to_csv("submission.csv", index=False)

In [None]:
# Show the subset with true and predicted grades side by side.
df_test1

In [None]:
# if os.path.exists('/kaggle/pc_detection/test_data/'):
#     print("test images found.")
#     y_test = model.predict(x_test)
#     y_test = y_test > 0.37757874193797547
#     y_test = y_test.astype(int).sum(axis=1) - 1
#     df_test1['isup_grade'] = y_test
#     df_test1 = df_test[["image_id","isup_grade"]]
#     df_test1.to_csv('submission.csv',index=False)
# else: # if test is not available, just submit some random values
#     print("test images not found, submitting random values.")
#     rand_preds = []
#     for i in range(len(df_test1)):
#         rand_preds.append(randint(0,5))
#     df_test1['isup_grade_1'] = rand_preds
#     df_test1 = df_test[["image_id","isup_grade", "isup_grade_1"]]
#     df_test1.to_csv('submission.csv',index=False)

In [None]:
# Display the subset again.
df_test1

In [None]:
import os
# List the files currently present in the scratch test folder.
print(os.listdir('/kaggle/pc_detection/test_data/'))

In [None]:
import glob
# Delete every copied .tiff from the scratch test folder.
removing_files = glob.glob('/kaggle/pc_detection/test_data/*.tiff')
for stale_tiff in removing_files:
    os.remove(stale_tiff)

In [None]:
# Save the trained Keras model to an HDF5 file, then reload it and print the reloaded summary.
# NOTE(review): `tf` is only imported in other cells of this notebook — confirm it is in scope here.
filepath = 'kagle/Denset/Dmodel/my_model.h5'
model.save(filepath)
new_model = tf.keras.models.load_model(filepath)
new_model.summary()
# tensorflow.keras.models.save_model(
#     model,
#     filepath
# )

In [None]:
# Save the model again to the same path.
model.save(filepath)

In [None]:
# Reload the saved model from disk.
new_model = tf.keras.models.load_model(filepath)

In [None]:
# Print the layer summary of the reloaded model.
new_model.summary()

In [None]:
# Debug switch: when True the training config below uses 1 epoch and a 50-row sample.
DEBUG = True

In [None]:
!pip install git+https://github.com/ildoonet/pytorch-gradual-warmup-lr.git


In [None]:
import os
import sys

# Put the bundled EfficientNet-PyTorch sources first on the import path.
sys.path.insert(0, '../input/efficientnet-pytorch/EfficientNet-PyTorch/EfficientNet-PyTorch-master')

In [None]:
import time
import skimage.io
import numpy as np
import pandas as pd
import cv2
import PIL.Image
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.sampler import SubsetRandomSampler, RandomSampler, SequentialSampler
from warmup_scheduler import GradualWarmupScheduler
from efficientnet_pytorch import model as enet
import albumentations
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
from sklearn.metrics import cohen_kappa_score
from tqdm import tqdm_notebook as tqdm

In [None]:
# --- Data locations ---
data_dir = '../input/prostate-cancer-grade-assessment'
df_train = pd.read_csv(os.path.join(data_dir, 'train.csv'))
image_folder = os.path.join(data_dir, 'train_images')

# Prefix used for the checkpoint and log file names.
kernel_type = 'how_to_train_effnet_b0_to_get_LB_0.86'

# --- Model / tiling / optimisation settings ---
enet_type = 'efficientnet-b0'
fold = 0            # fold held out for validation
tile_size = 256     # side of each square tile cut from a slide
image_size = 256    # side of each tile inside the assembled mosaic
n_tiles = 36        # tiles kept per slide
batch_size = 64
num_workers = 2
out_dim = 5         # number of model outputs
init_lr = 1e-1
warmup_factor = 10

warmup_epo = 1
n_epochs = 1 if DEBUG else 30
# Seed the DEBUG subsample so a fresh run picks the same 50 rows.
df_train = df_train.sample(50, random_state=42).reset_index(drop=True) if DEBUG else df_train

#device = torch.device('cuda')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(image_folder)

In [None]:
# Show which device (cuda or cpu) was selected.
print(device)

In [None]:
# Assign each row to one of 5 folds, stratified by ISUP grade; 'fold' holds the fold
# in which the row is used for validation.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
df_train['fold'] = -1
fold_splits = skf.split(df_train, df_train['isup_grade'])
for i, (train_idx, valid_idx) in enumerate(fold_splits):
    df_train.loc[valid_idx, 'fold'] = i
df_train.head()

In [None]:
 # Maps a backbone name to the local checkpoint file that enetv2 loads for it.
 pretrained_model = {
    'efficientnet-b0': '../input/efficientnet-pytorch/efficientnet-b0-08094119.pth'
}

In [None]:
class enetv2(nn.Module):
    """EfficientNet backbone followed by a fresh linear head with ``out_dim`` outputs.

    The backbone weights come from the checkpoint listed in ``pretrained_model``;
    its own ``_fc`` layer is replaced by an identity so it returns features.
    """

    def __init__(self, backbone, out_dim):
        super().__init__()
        checkpoint_path = pretrained_model[backbone]
        self.enet = enet.EfficientNet.from_name(backbone)
        self.enet.load_state_dict(torch.load(checkpoint_path))

        # Read the feature width before the original classifier is swapped out.
        n_features = self.enet._fc.in_features
        self.myfc = nn.Linear(n_features, out_dim)
        self.enet._fc = nn.Identity()

    def extract(self, x):
        """Return the backbone features for ``x``."""
        return self.enet(x)

    def forward(self, x):
        """Return the ``out_dim`` logits for ``x``."""
        return self.myfc(self.extract(x))

In [None]:
def get_tiles(img, mode=0):
    """Cut ``img`` into square tiles and keep the ``n_tiles`` with the lowest pixel sum.

    The image is padded with 255 so both sides are multiples of the global
    ``tile_size`` (plus ``tile_size * mode // 2`` extra pixels per axis), split
    into tiles, topped up with all-255 tiles if fewer than the global ``n_tiles``
    exist, and sorted by ascending pixel sum.

    Returns:
        (tiles, enough): ``tiles`` is a list of ``{'img': array, 'idx': rank}``
        dicts; ``enough`` tells whether at least ``n_tiles`` tiles contained a
        pixel below 255.
    """
    h, w, c = img.shape
    extra = (tile_size * mode) // 2
    pad_h = (tile_size - h % tile_size) % tile_size + extra
    pad_w = (tile_size - w % tile_size) % tile_size + extra
    top, left = pad_h // 2, pad_w // 2

    padded = np.pad(
        img,
        [[top, pad_h - top], [left, pad_w - left], [0, 0]],
        constant_values=255,
    )
    rows, cols = padded.shape[0] // tile_size, padded.shape[1] // tile_size
    grid = padded.reshape(rows, tile_size, cols, tile_size, 3)
    tiles = grid.transpose(0, 2, 1, 3, 4).reshape(-1, tile_size, tile_size, 3)

    blank_sum = tile_size ** 2 * 3 * 255
    n_tiles_with_info = (tiles.reshape(tiles.shape[0], -1).sum(1) < blank_sum).sum()

    missing = n_tiles - len(tiles)
    if missing > 0:
        tiles = np.pad(tiles, [[0, missing], [0, 0], [0, 0], [0, 0]], constant_values=255)

    order = np.argsort(tiles.reshape(tiles.shape[0], -1).sum(-1))[:n_tiles]
    tiles = tiles[order]
    result = [{'img': tile_img, 'idx': rank} for rank, tile_img in enumerate(tiles)]
    return result, n_tiles_with_info >= n_tiles


class PANDADataset(Dataset):
    """Training dataset: one tile mosaic and one cumulative grade label per slide.

    Args:
        df: frame with ``image_id`` and ``isup_grade`` columns.
        image_size: side of each tile inside the mosaic.
        n_tiles: number of tiles placed in the mosaic.
        tile_mode: padding mode forwarded to ``get_tiles``.
        rand: shuffle the tile order inside the mosaic when True.
        transform: optional albumentations transform, applied to every tile
            and then once more to the assembled mosaic.
    """

    def __init__(self,
                 df,
                 image_size,
                 n_tiles=n_tiles,
                 tile_mode=0,
                 rand=False,
                 transform=None,
                ):

        self.df = df.reset_index(drop=True)
        self.image_size = image_size
        self.n_tiles = n_tiles
        self.tile_mode = tile_mode
        self.rand = rand
        self.transform = transform

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, index):
        """Return ``(mosaic, label)`` tensors for the slide at ``index``."""
        row = self.df.iloc[index]
        img_id = row.image_id

        tiff_file = os.path.join(image_folder, f'{img_id}.tiff')
        # Last page of the multi-page TIFF.
        image = skimage.io.MultiImage(tiff_file)[-1]
        tiles, OK = get_tiles(image, self.tile_mode)

        if self.rand:
            idxes = np.random.choice(list(range(self.n_tiles)), self.n_tiles, replace=False)
        else:
            idxes = list(range(self.n_tiles))

        # Use the instance's image_size throughout, not the notebook-level global,
        # so the dataset honours the value passed to the constructor.
        size = self.image_size
        n_row_tiles = int(np.sqrt(self.n_tiles))
        images = np.zeros((size * n_row_tiles, size * n_row_tiles, 3))
        for h in range(n_row_tiles):
            for w in range(n_row_tiles):
                i = h * n_row_tiles + w

                if len(tiles) > idxes[i]:
                    this_img = tiles[idxes[i]]['img']
                else:
                    # Not enough tiles: fall back to a blank (all-255) tile.
                    this_img = np.ones((size, size, 3)).astype(np.uint8) * 255
                # Invert so the 255 padding becomes 0.
                this_img = 255 - this_img
                if self.transform is not None:
                    this_img = self.transform(image=this_img)['image']
                h1 = h * size
                w1 = w * size
                images[h1:h1+size, w1:w1+size] = this_img

        if self.transform is not None:
            images = self.transform(image=images)['image']
        images = images.astype(np.float32)
        images /= 255
        # HWC -> CHW.
        images = images.transpose(2, 0, 1)

        # Cumulative encoding: the first `isup_grade` entries are 1, the rest 0.
        label = np.zeros(5).astype(np.float32)
        label[:row.isup_grade] = 1.
        return torch.tensor(images), torch.tensor(label)

In [None]:
# Training augmentation: transpose, vertical flip and horizontal flip, each applied with p=0.5.
transforms_train = albumentations.Compose([
    augmentation(p=0.5)
    for augmentation in (
        albumentations.Transpose,
        albumentations.VerticalFlip,
        albumentations.HorizontalFlip,
    )
])
# Validation uses an empty pipeline.
transforms_val = albumentations.Compose([])

In [None]:
# Preview: two rows of five randomly chosen augmented mosaics, titled with the label sum.
dataset_show = PANDADataset(df_train, image_size, n_tiles, 0, transform=transforms_train)
from pylab import rcParams
rcParams['figure.figsize'] = 20,10
for i in range(2):
    f, axarr = plt.subplots(1,5)
    for ax in axarr:
        idx = np.random.randint(0, len(dataset_show))
        img, label = dataset_show[idx]
        # CHW -> HWC for imshow; 1 - img undoes the inversion done by the dataset.
        ax.imshow(1. - img.permute(1, 2, 0).squeeze())
        ax.set_title(str(sum(label)))

In [None]:
# Binary cross-entropy on raw logits, used by train_epoch and val_epoch.
criterion = nn.BCEWithLogitsLoss()

In [None]:
def train_epoch(loader, optimizer):
    """Run one optimisation pass over ``loader`` with the notebook-level ``model``.

    Args:
        loader: iterable of ``(data, target)`` batches.
        optimizer: optimiser stepping the model's parameters.

    Returns:
        List with one loss value (0-d NumPy array) per batch.
    """
    model.train()
    train_loss = []
    progress = tqdm(loader)
    for data, target in progress:
        data = data.to(device)
        target = target.to(device)

        optimizer.zero_grad()
        loss = criterion(model(data), target)
        loss.backward()
        optimizer.step()

        loss_np = loss.detach().cpu().numpy()
        train_loss.append(loss_np)
        # Running mean over at most the last 100 batches, shown in the progress bar.
        recent = train_loss[-100:]
        smooth_loss = sum(recent) / len(recent)
        progress.set_description('loss: %.5f, smth: %.5f' % (loss_np, smooth_loss))
    return train_loss


def val_epoch(loader, get_output=False):
    """Evaluate the notebook-level ``model`` on ``loader`` without gradients.

    Predicted grade = rounded sum of the sigmoid outputs; target grade = sum of
    the target vector. Prints quadratic-weighted kappa overall and per data
    provider (the per-provider scores read the notebook-level ``df_valid``).

    Args:
        loader: iterable of ``(data, target)`` batches.
        get_output: return the raw logits instead of the metrics.

    Returns:
        ``LOGITS`` (NumPy array) when ``get_output`` is True, otherwise
        ``(val_loss, acc, qwk)``.
    """
    model.eval()
    batch_losses, LOGITS, PREDS, TARGETS = [], [], [], []

    with torch.no_grad():
        for data, target in tqdm(loader):
            data = data.to(device)
            target = target.to(device)
            logits = model(data)

            batch_losses.append(criterion(logits, target).detach().cpu().numpy())
            LOGITS.append(logits)
            PREDS.append(logits.sigmoid().sum(1).detach().round())
            TARGETS.append(target.sum(1))
    val_loss = np.mean(batch_losses)

    LOGITS = torch.cat(LOGITS).cpu().numpy()
    PREDS = torch.cat(PREDS).cpu().numpy()
    TARGETS = torch.cat(TARGETS).cpu().numpy()
    acc = (PREDS == TARGETS).mean() * 100.

    is_karolinska = df_valid['data_provider'] == 'karolinska'
    is_radboud = df_valid['data_provider'] == 'radboud'
    qwk = cohen_kappa_score(PREDS, TARGETS, weights='quadratic')
    qwk_k = cohen_kappa_score(PREDS[is_karolinska], df_valid[is_karolinska].isup_grade.values, weights='quadratic')
    qwk_r = cohen_kappa_score(PREDS[is_radboud], df_valid[is_radboud].isup_grade.values, weights='quadratic')
    print('qwk', qwk, 'qwk_k', qwk_k, 'qwk_r', qwk_r)

    if get_output:
        return LOGITS
    return val_loss, acc, qwk

In [None]:
# Positions of the training rows (every fold except `fold`) and of the validation rows.
train_idx = np.where((df_train['fold'] != fold))[0]
valid_idx = np.where((df_train['fold'] == fold))[0]

# np.where yields positions, so select with iloc (position-based) rather than loc (label-based).
df_this  = df_train.iloc[train_idx]
df_valid = df_train.iloc[valid_idx]

dataset_train = PANDADataset(df_this , image_size, n_tiles, transform=transforms_train)
dataset_valid = PANDADataset(df_valid, image_size, n_tiles, transform=transforms_val)

# Random order for training, fixed order for validation (val_epoch aligns predictions with df_valid).
train_loader = torch.utils.data.DataLoader(dataset_train, batch_size=batch_size, sampler=RandomSampler(dataset_train), num_workers=num_workers)
valid_loader = torch.utils.data.DataLoader(dataset_valid, batch_size=batch_size, sampler=SequentialSampler(dataset_valid), num_workers=num_workers)

model = enetv2(enet_type, out_dim=out_dim)
model = model.to(device)

# Take the learning rate from the config cell instead of repeating the literal.
optimizer = optim.Adam(model.parameters(), lr=init_lr)
#scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, n_epochs-warmup_epo)
#scheduler = GradualWarmupScheduler(optimizer, multiplier=warmup_factor, total_epoch=2, after_scheduler=scheduler_cosine)

print(len(dataset_train), len(dataset_valid))

In [None]:
# Number of samples in the training dataset.
len(train_loader.dataset)

In [None]:
# model.parameters

In [None]:
# Train for n_epochs, log one line per epoch, and checkpoint whenever validation QWK improves.
qwk_max = 0.
best_file = f'{kernel_type}_best_fold{fold}.pth'
log_file = f'log_{kernel_type}.txt'
for epoch in range(1, n_epochs+1):
    print(time.ctime(), 'Epoch:', epoch)
    #scheduler.step(epoch-1)

    train_loss = train_epoch(train_loader, optimizer)
    val_loss, acc, qwk = val_epoch(valid_loader)

    current_lr = optimizer.param_groups[0]["lr"]
    stats = f'Epoch {epoch}, lr: {current_lr:.7f}, train loss: {np.mean(train_loss):.5f}, val loss: {np.mean(val_loss):.5f}, acc: {(acc):.5f}, qwk: {(qwk):.5f}'
    content = time.ctime() + ' ' + stats
    print(content)
    with open(log_file, 'a') as appender:
        appender.write(content + '\n')

    if qwk > qwk_max:
        print('score2 ({:.6f} --> {:.6f}).  Saving model ...'.format(qwk_max, qwk))
        torch.save(model.state_dict(), best_file)
        qwk_max = qwk

# Always keep the weights of the last epoch as well.
torch.save(model.state_dict(), f'{kernel_type}_final_fold{fold}.pth')

In [None]:
# Switch debug mode off for the inference part of the notebook.
DEBUG = False

In [None]:
import os
import sys

# Put the bundled EfficientNet-PyTorch sources first on the import path.
sys.path.insert(0, '../input/efficientnet-pytorch/EfficientNet-PyTorch/EfficientNet-PyTorch-master')

In [None]:
import skimage.io
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from efficientnet_pytorch import model as enet

import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm

In [None]:
# --- Data locations ---
data_dir = '../input/prostate-cancer-grade-assessment'
df_train = pd.read_csv(os.path.join(data_dir, 'train.csv'))
df_test = pd.read_csv(os.path.join(data_dir, 'test.csv'))
df_sub = pd.read_csv(os.path.join(data_dir, 'sample_submission.csv'))

model_dir = '../input/panda-public-models'
image_folder = os.path.join(data_dir, 'test_images')
is_test = os.path.exists(image_folder)  # If test_images does not exist, fall back to some train images.
image_folder = image_folder if is_test else os.path.join(data_dir, 'train_images')

# .loc[:10] is label-based and inclusive, i.e. the first 11 rows of the default index.
df = df_test if is_test else df_train.loc[:10]

# --- Tiling / loader settings ---
tile_size = 256
image_size = 256
n_tiles = 36
batch_size = 8
num_workers = 4

# Same device selection as the training config: fall back to CPU when CUDA is unavailable.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(image_folder)

In [None]:
import albumentations

# Training augmentation: each operation applied with probability 0.5.
transforms_train = albumentations.Compose([
    augmentation(p=0.5)
    for augmentation in (
        albumentations.Transpose,
        albumentations.VerticalFlip,
        albumentations.HorizontalFlip,
    )
])

# Deterministic pipelines: identity, then each operation on its own (p=1), then all three chained.
transforms_val = albumentations.Compose([])
transforms_val1 = albumentations.Compose([albumentations.Transpose(p=1)])
transforms_val2 = albumentations.Compose([albumentations.VerticalFlip(p=1)])
transforms_val3 = albumentations.Compose([albumentations.HorizontalFlip(p=1)])
transforms_val4 = albumentations.Compose([
    albumentations.Transpose(p=1),
    albumentations.VerticalFlip(p=1),
    albumentations.HorizontalFlip(p=1),
])

In [None]:
def get_tiles(img, mode=0):
    """Cut ``img`` into square tiles and keep the ``n_tiles`` with the lowest pixel sum.

    The image is padded with 255 so both sides are multiples of the global
    ``tile_size`` (plus ``tile_size * mode // 2`` extra pixels per axis), split
    into tiles, topped up with all-255 tiles if fewer than the global ``n_tiles``
    exist, and sorted by ascending pixel sum.

    Args:
        img: H x W x 3 array.
        mode: padding mode; adds ``tile_size * mode // 2`` pixels of padding per axis.

    Returns:
        (tiles, enough): ``tiles`` is a list of ``{'img': array, 'idx': rank}``
        dicts; ``enough`` tells whether at least ``n_tiles`` tiles contained a
        pixel below 255.
    """
    h, w, c = img.shape
    pad_h = (tile_size - h % tile_size) % tile_size + ((tile_size * mode) // 2)
    pad_w = (tile_size - w % tile_size) % tile_size + ((tile_size * mode) // 2)

    img2 = np.pad(img,[[pad_h // 2, pad_h - pad_h // 2], [pad_w // 2,pad_w - pad_w//2], [0,0]], constant_values=255)
    img3 = img2.reshape(
        img2.shape[0] // tile_size,
        tile_size,
        img2.shape[1] // tile_size,
        tile_size,
        3
    )

    img3 = img3.transpose(0,2,1,3,4).reshape(-1, tile_size, tile_size,3)
    n_tiles_with_info = (img3.reshape(img3.shape[0],-1).sum(1) < tile_size ** 2 * 3 * 255).sum()
    # Compare the number of TILES (not the image height) with n_tiles: the old
    # `len(img)` check skipped this top-up for small slides and could request a
    # negative pad width for very short images.
    if len(img3) < n_tiles:
        img3 = np.pad(img3,[[0,n_tiles-len(img3)],[0,0],[0,0],[0,0]], constant_values=255)
    idxs = np.argsort(img3.reshape(img3.shape[0],-1).sum(-1))[:n_tiles]
    img3 = img3[idxs]
    result = [{'img': img3[i], 'idx': i} for i in range(len(img3))]
    return result, n_tiles_with_info >= n_tiles


class PANDADataset(Dataset):
    """Inference dataset: one tile mosaic per slide, no label.

    Args:
        df: frame with an ``image_id`` column.
        image_size: side of each tile inside the mosaic.
        n_tiles: number of tiles placed in the mosaic.
        tile_mode: padding mode forwarded to ``get_tiles``.
        rand: shuffle the tile order inside the mosaic when True.
        sub_imgs: when True, tile indices are shifted by ``n_tiles``.
        transform: optional albumentations transform applied to the mosaic.
    """

    def __init__(self,
                 df,
                 image_size,
                 n_tiles=n_tiles,
                 tile_mode=0,
                 rand=False,
                 sub_imgs=False,
                 transform=None
                ):

        self.df = df.reset_index(drop=True)
        self.image_size = image_size
        self.n_tiles = n_tiles
        self.tile_mode = tile_mode
        self.rand = rand
        self.sub_imgs = sub_imgs
        self.transform = transform

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, index):
        """Return the mosaic tensor for the slide at ``index``."""
        row = self.df.iloc[index]
        img_id = row.image_id

        tiff_file = os.path.join(image_folder, f'{img_id}.tiff')
        # Last page of the multi-page TIFF.
        image = skimage.io.MultiImage(tiff_file)[-1]
        tiles, OK = get_tiles(image, self.tile_mode)

        if self.rand:
            idxes = np.random.choice(list(range(self.n_tiles)), self.n_tiles, replace=False)
        else:
            idxes = list(range(self.n_tiles))
        idxes = np.asarray(idxes) + self.n_tiles if self.sub_imgs else idxes

        # Use the instance's image_size throughout, not the notebook-level global,
        # so the dataset honours the value passed to the constructor.
        size = self.image_size
        n_row_tiles = int(np.sqrt(self.n_tiles))
        images = np.zeros((size * n_row_tiles, size * n_row_tiles, 3))
        for h in range(n_row_tiles):
            for w in range(n_row_tiles):
                i = h * n_row_tiles + w

                if len(tiles) > idxes[i]:
                    this_img = tiles[idxes[i]]['img']
                else:
                    # Index beyond the available tiles: use a blank (all-255) tile.
                    this_img = np.ones((size, size, 3)).astype(np.uint8) * 255
                # Invert so the 255 padding becomes 0.
                this_img = 255 - this_img
                h1 = h * size
                w1 = w * size
                images[h1:h1+size, w1:w1+size] = this_img

        if self.transform is not None:
            images = self.transform(image=images)['image']

        images = images.astype(np.float32)
        images /= 255
        # HWC -> CHW.
        images = images.transpose(2, 0, 1)

        return torch.tensor(images)

In [None]:
# Preview (only when running on train images): two rows of five random mosaics, titled with their index.
if not is_test:
    dataset_show = PANDADataset(df, image_size, n_tiles, 0)
    from pylab import rcParams
    rcParams['figure.figsize'] = 20,10
    for i in range(2):
        f, axarr = plt.subplots(1,5)
        for ax in axarr:
            idx = np.random.randint(0, len(dataset_show))
            img = dataset_show[idx]
            # CHW -> HWC for imshow; 1 - img undoes the inversion done by the dataset.
            ax.imshow(1. - img.permute(1, 2, 0).squeeze())
            ax.set_title(str(idx))

In [None]:
# Two views of every slide: tile_mode 0 and tile_mode 2 (extra padding, shifted tile grid).
dataset = PANDADataset(df, image_size, n_tiles, tile_mode=0, rand=False, sub_imgs=False, transform=transforms_val)  # mode == 0
loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False)

dataset2 = PANDADataset(df, image_size, n_tiles, tile_mode=2, rand=False, sub_imgs=False, transform=transforms_val)  # mode == 2
loader2 = DataLoader(dataset2, batch_size=batch_size, num_workers=num_workers, shuffle=False)

# LOGITS / LOGITS2: models[0] on mode 0 / mode 2; LOGITS3 / LOGITS4: models2[0] on mode 0 / mode 2.
LOGITS, LOGITS2, LOGITS3, LOGITS4 = [], [], [], []
with torch.no_grad():
    for data in tqdm(loader):
        data = data.to(device)
        logits = models[0](data)
        LOGITS.append(logits)
        logits = models2[0](data)
        LOGITS3.append(logits)

    for data in tqdm(loader2):
        data = data.to(device)
        logits = models[0](data)
        LOGITS2.append(logits)
        logits = models2[0](data)
        LOGITS4.append(logits)

# Average the four sigmoid outputs, then sum over the outputs to get a continuous grade.
probabilities = [torch.cat(chunks).sigmoid().cpu() for chunks in (LOGITS, LOGITS2, LOGITS3, LOGITS4)]
LOGITS = (probabilities[0] + probabilities[1] + probabilities[2] + probabilities[3]) / 4
PREDS = LOGITS.sum(1).numpy()

In [None]:
!pip install ../input/kaggle-efficientnet-repo/efficientnet-1.0.0-py3-none-any.whl

import cv2
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import argparse
import os
import skimage.io
from scipy.ndimage import measurements
import os
import numpy as np
import pandas as pd
import argparse
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.metrics import categorical_accuracy, top_k_categorical_accuracy
from kaggle_datasets import KaggleDatasets
from tensorflow.keras import layers as L
import efficientnet.tfkeras as efn
from tensorflow.keras.utils import to_categorical
import gc
import albumentations
gc.enable()



sz = 256  # side of each square tile
N = 48    # number of tiles kept per image


def tile(img, mode=0):
    """Split ``img`` into ``sz`` x ``sz`` tiles and return the ``N`` with the lowest pixel sum.

    Args:
        img: H x W x 3 array.
        mode: adds ``sz * mode // 2`` pixels of 255 padding per axis on top of
            the padding needed to reach a multiple of ``sz``; ``mode=2`` shifts
            the tile grid by half a tile.

    Returns:
        Array of shape (N, sz, sz, 3), ordered by ascending pixel sum; all-255
        tiles are appended first when the image yields fewer than ``N`` tiles.
    """
    extra = (sz * mode) // 2
    shape = img.shape
    pad0 = (sz - shape[0] % sz) % sz + extra
    pad1 = (sz - shape[1] % sz) % sz + extra
    img = np.pad(img, [[pad0 // 2, pad0 - pad0 // 2], [pad1 // 2, pad1 - pad1 // 2], [0, 0]], constant_values=255)
    img = img.reshape(img.shape[0] // sz, sz, img.shape[1] // sz, sz, 3)
    img = img.transpose(0, 2, 1, 3, 4).reshape(-1, sz, sz, 3)
    if len(img) < N:
        img = np.pad(img, [[0, N - len(img)], [0, 0], [0, 0], [0, 0]], constant_values=255)
    idxs = np.argsort(img.reshape(img.shape[0], -1).sum(-1))[:N]
    return img[idxs]


def tile2(img):
    """Same as ``tile`` with one extra tile of padding per axis (``mode=2``)."""
    return tile(img, mode=2)


In [None]:
# Dataset locations, all derived from the competition root.
BASE_PATH = '../input/prostate-cancer-grade-assessment/'
TRAIN = BASE_PATH + 'train_images/'
MASKS = BASE_PATH + 'train_label_masks/'
TEST = BASE_PATH + 'test_images/'

# Metadata tables.
train = pd.read_csv(BASE_PATH + "train.csv")
train.head()

sub = pd.read_csv(BASE_PATH + "sample_submission.csv")
sub.head()

test = pd.read_csv(BASE_PATH + "test.csv")
test.head()

# Predict on the test images by default, writing into the sample-submission frame.
PRED_PATH = TEST
df = sub
t_df = test

In [None]:
# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-05), loss=tf.nn.sigmoid_cross_entropy_with_logits)
model.load_weights('../input/pandaenetb042x256x256x3/DenseNet121-48-full-epochs60.h5')


def _predict_isup(frame, image_dir, tiler):
    """Return one continuous ISUP score per row of ``frame`` using 4-view test-time augmentation.

    For every slide: read page 1 of the TIFF, cut it into tiles with ``tiler``,
    build four copies of the tile stack (one per transforms_val0..3 pipeline),
    run the model on the stacked copies and take 0.25 * the sum of all outputs.

    NOTE(review): N_TILES and transforms_val0 are not defined in the visible
    part of this notebook — confirm they exist before this cell runs.
    """
    tta_pipelines = (transforms_val0, transforms_val1, transforms_val2, transforms_val3)
    scores = []
    for _, row in tqdm(frame.iterrows(), total=frame.shape[0]):
        img = skimage.io.MultiImage(image_dir + row['image_id'] + '.tiff')[1]
        patches = tiler(img)

        views = [patches.copy() for _ in tta_pipelines]
        for view, pipeline in zip(views, tta_pipelines):
            for k in range(N_TILES):
                view[k] = pipeline(image=view[k])['image']

        batch = np.stack(views) / 255.0
        pred = model.predict(batch)
        scores.append(0.25 * np.sum(pred))

        # Release the large per-slide arrays before loading the next slide.
        del patches, img, views, batch
        gc.collect()
    return scores


# Without test images, fall back to the first rows of the training set.
if not os.path.exists(PRED_PATH):
    PRED_PATH = TRAIN
    df_train = pd.read_csv(os.path.join(data_dir, 'train.csv'))
    df = df_train.loc[:10]
    df = df[['image_id','isup_grade']].copy()

# Same model on the two tilings: the plain grid (tile) and the shifted grid (tile2).
predictions40 = _predict_isup(df, PRED_PATH, tile)
predictions42 = _predict_isup(df, PRED_PATH, tile2)

# Free the model and frames that are no longer needed.
del model, dummy_data, sub, train
# The per-slide temporaries now live inside _predict_isup; drop any stale notebook-level leftovers.
for _stale in ('pred', 'isup', 'image', 'patches1', 'patches2', 'patches3', 'patches4'):
    globals().pop(_stale, None)

gc.collect()

In [None]:
# Average the five PyTorch prediction vectors with equal weight.
# NOTE(review): PREDS1..PREDS4 and most predictionsNN lists are not produced in the visible part of
# this notebook — confirm they exist before this cell runs.
PREDS = (1/5)*PREDS + (1/5)*PREDS1 + (1/5)*PREDS2 + (1/5)*PREDS3 + (1/5)*PREDS4

# Weighted blend of all model outputs, rounded to the nearest integer grade.
FINAL = np.round( (6/10)*PREDS +
                  (2/60)*np.array(predictions10) + (2/60)*np.array(predictions12) + 
                  (2/60)*np.array(predictions20) + (2/60)*np.array(predictions22) +
                  (2/60)*np.array(predictions30) + (2/60)*np.array(predictions32) +
                  (0.5/10)*np.array(predictions40) + (0.5/10)*np.array(predictions42) +
                  (1/60)*np.array(predictions50) + (1/60)*np.array(predictions52) +
                  (1/60)*np.array(predictions60) + (1/60)*np.array(predictions62) +
                  (1/60)*np.array(predictions70) + (1/60)*np.array(predictions72) )


# Write the submission file and show the resulting grade distribution.
df['isup_grade'] = FINAL.astype(int)
df[['image_id', 'isup_grade']].to_csv('submission.csv', index=False)
print(df.head())
print()
print(df.isup_grade.value_counts())

In [None]:
import pandas as pd
import os

In [None]:
# Load the competition's test.csv for inspection.
df_00 = pd.read_csv('../input/prostate-cancer-grade-assessment/test.csv')

In [None]:
# Display the test metadata.
df_00

In [None]:
import numpy as np
import pandas as pd
import json
import math
import cv2
import PIL
from PIL import Image
import numpy as np
from keras import layers
from tensorflow.keras.applications.densenet import DenseNet121
from keras.callbacks import Callback, ModelCheckpoint
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score, accuracy_score
import scipy
import tensorflow as tf
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold

import tensorflow_addons as tfa

%matplotlib inline

In [None]:
import os
# There are two ways to load the data from the PANDA dataset:
# Option 1: Load images using openslide
import openslide
# Option 2: Load images using skimage (requires that tifffile is installed)
import skimage.io
# General packages
from IPython.display import display
# Plotly for the interactive viewer (see last section)
import plotly.graph_objs as go
# read images
import rasterio

import gc
from random import randint

import cv2
import shutil
import glob


# NOTE(review): tf.device() is documented as a context manager that takes TensorFlow device names
# such as '/GPU:0'; 'cuda' is the PyTorch spelling — confirm this does what is intended.
# This also rebinds the `device` name that other cells set to a torch.device.
device = tf.device('cuda')
device


In [None]:
# Hyper-parameters of the Keras DenseNet pipeline.
BATCH_SIZE = 32          # batch size of the augmentation generator
TRAIN_VAL_RATIO = 0.20   # share of the samples held out for validation
EPOCHS = 5
LR = 1e-1                # default learning rate of build_model
image_size = 256         # side of the square thumbnails loaded into x_train / x_test
n_classes = 6

In [None]:
# Root folder of the competition data (ends with a slash).
BASE_FOLDER = "../input/prostate-cancer-grade-assessment/"

In [None]:
# Load the metadata tables. BASE_FOLDER already ends with '/', so no extra separator is added.
train_df1 = pd.read_csv(BASE_FOLDER + 'train.csv')
test_df = pd.read_csv(BASE_FOLDER + 'test.csv')
print(train_df1.shape)
print(test_df.shape)
train_df1.head()

In [None]:
# train_df is another name for the same frame as train_df1 (no copy is made).
train_df = train_df1
print(train_df.shape)
train_df.head()

In [None]:
#train_df.loc[8724,'isup_grade'] = 3

In [None]:
# Spell the 'negative' Gleason score as '0+0'; every other value is kept as is.
gleason = train_df['gleason_score']
train_df['gleason_score'] = gleason.mask(gleason == 'negative', '0+0')

In [None]:
# Stratified 5-fold assignment by ISUP grade; fold `fold` becomes the hold-out set.
fold = 0
skf = StratifiedKFold(5, shuffle=True, random_state=42)
train_df['fold'] = -1
fold_column = train_df.columns.get_loc('fold')
for i, (train_idx, test_idx) in enumerate(skf.split(train_df, train_df['isup_grade'])):
    # split() yields positions, so write with iloc (position-based) rather than loc (label-based).
    train_df.iloc[test_idx, fold_column] = i

train_idx = np.where((train_df['fold'] != fold))[0]
test_idx = np.where((train_df['fold'] == fold))[0]

df_train  = train_df.iloc[train_idx]
df_test = train_df.iloc[test_idx]

In [None]:
# def train_validate_test_split(df, train_percent=.8, seed=None):
#     np.random.seed(seed)
#     perm = np.random.permutation(df.index)
#     m = len(df.index)
#     train_end = int(train_percent * m)
#     train = df.iloc[perm[:train_end]]
#     test = df.iloc[perm[train_end:]]
#     return train, test

# df_train, df_test = train_validate_test_split(train_df)dd

In [None]:
def preprocess_image(image_path, desired_size=image_size):
    """Load a slide and return it as a ``desired_size`` x ``desired_size`` array.

    A thumbnail no larger than the requested size is read first, then resized
    to exactly ``desired_size`` on both sides.

    Args:
        image_path: path of the slide file to open with OpenSlide.
        desired_size: side of the square output image in pixels.

    Returns:
        NumPy array holding the resized thumbnail.
    """
    biopsy = openslide.OpenSlide(image_path)
    try:
        thumbnail = biopsy.get_thumbnail(size=(desired_size, desired_size))
    finally:
        # Release the slide handle: this function is called once per slide in long loops.
        biopsy.close()
    return np.array(thumbnail.resize((desired_size, desired_size)))

In [None]:
def save_folder(df, src_dir="../input/panda-16x128x128-tiles-data/train",
                dst_dir='/kaggle/pc_detection/test_data/'):
    """Copy the ``.tiff`` file of every image listed in ``df`` into ``dst_dir``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``image_id`` column; each id names ``<image_id>.tiff``
        inside ``src_dir``.
    src_dir : str, optional
        Directory the files are copied from.
    dst_dir : str, optional
        Directory the files are copied to; created when missing.

    Returns
    -------
    str
        ``dst_dir``, so the caller can keep a handle on the output location.
    """
    os.makedirs(dst_dir, exist_ok=True)
    for img_id in tqdm(df.image_id):
        src_path = os.path.join(src_dir, img_id + ".tiff")
        # Ids without a matching file are skipped silently, as before.
        if os.path.isfile(src_path):
            shutil.copy(src_path, dst_dir)
    # dst_dir is a parameter, so this also works for an empty frame (the
    # previous version bound it inside the loop and failed in that case).
    print(dst_dir)
    return dst_dir

In [None]:
# Copy the slides of the held-out fold (df_test) into the working test directory.
test_data = save_folder(df_test)

In [None]:
# get the number of training images from the target\id dataset
# Number of training rows, taken from the id/target frame.
N = len(df_train)
# Pre-allocate one uint8 array holding every training image.
x_train = np.empty((N, image_size, image_size, 3), dtype=np.uint8)
# Walk the image ids in frame order: read each slide from disk, pre-process
# it and store it in the row matching its position.
for i, image_id in enumerate(tqdm(df_train['image_id'])):
    slide_path = f'../input/panda-16x128x128-tiles-data/train/{image_id}.tiff'
    x_train[i] = preprocess_image(slide_path)

In [None]:
if not os.path.exists('/kaggle/pc_detection/test_data/'):
    print("test images not found")
else:
    # Same loading procedure as for the training images, applied to the
    # held-out rows and reading from the copied test directory.
    N = len(df_test)
    x_test = np.empty((N, image_size, image_size, 3), dtype=np.uint8)
    for i, image_id in enumerate(tqdm(df_test['image_id'])):
        slide_path = f'/kaggle/pc_detection/test_data/{image_id}.tiff'
        x_test[i] = preprocess_image(slide_path)

In [None]:
# pre-processing the target (i.e. one-hot encoding the target)
# One-hot encode the target: one column per distinct 'isup_grade' value.
# dtype is pinned to uint8 so the targets are numeric on every pandas version
# (newer releases would otherwise return booleans).
y_train = pd.get_dummies(df_train['isup_grade'], dtype=np.uint8).values
# y_train is a NumPy array and has no PyTorch-style `.to(device)`; that call
# raised AttributeError and has been dropped.
print(x_train.shape)
print(y_train.shape)
if os.path.exists('/kaggle/pc_detection/test_data/'):
    print(x_test.shape)
else:
    print("test images not found")

In [None]:
# Turn the one-hot targets into cumulative multi-label targets: column i is
# set when column i or any column to its right is set in the one-hot row.
y_train_multi = np.empty(y_train.shape, dtype=y_train.dtype)
last_col = y_train.shape[1] - 1
y_train_multi[:, last_col] = y_train[:, last_col]

# Sweep right-to-left so each column can reuse the one already filled to its
# right. (NumPy arrays have no `.to(device)`; the former call inside this loop
# raised AttributeError and has been dropped.)
for i in range(last_col - 1, -1, -1):
    y_train_multi[:, i] = np.logical_or(y_train[:, i], y_train_multi[:, i+1])
# print("Original y_train:", y_train.sum(axis=0))
# print("Multilabel version:", y_train_multi.sum(axis=0))

In [None]:
# Hold out a TRAIN_VAL_RATIO fraction of the training images for validation,
# using the multi-label targets and a fixed random seed.
# NOTE(review): this rebinds x_train / y_train to the training partition, so
# after running it x_train no longer has the same length as y_train_multi;
# re-run the cells above before re-running this one.
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train_multi, 
    test_size=TRAIN_VAL_RATIO, 
    random_state=2020
)

In [None]:
def create_datagen():
    """Return the ``ImageDataGenerator`` used to augment training batches.

    The generator applies a random zoom (range 0.15) and random horizontal
    and vertical flips; points outside the input boundaries are filled with
    the constant value 0.
    """
    augmentation = dict(
        zoom_range=0.15,
        fill_mode='constant',
        cval=0.,
        horizontal_flip=True,
        vertical_flip=True,
    )
    return ImageDataGenerator(**augmentation)

# Using original generator: yields augmented (x_train, y_train) batches of
# BATCH_SIZE samples, seeded with 2019.
data_generator = create_datagen().flow(x_train, y_train, batch_size=BATCH_SIZE, seed=2019)

In [None]:
# DenseNet-121 base without its top (classification) layers, initialised
# from a local weights file and sized for image_size x image_size RGB input.
densenet = DenseNet121(
    weights='../input/input/DenseNet-BC-121-32-no-top.h5',
    include_top=False,
    input_shape=(image_size,image_size,3)
)

# Keras models have no PyTorch-style `.to(device)`; the former
# `densenet.to(device)` call raised AttributeError and has been dropped.

In [None]:
def build_model(LR=LR):
    """Assemble and compile the classifier built on top of ``densenet``.

    The network is the module-level ``densenet`` base followed by global
    average pooling, dropout (rate 0.80) and a sigmoid layer with
    ``n_classes`` outputs. It is compiled with binary cross-entropy, Adam,
    and accuracy plus quadratic-weighted Cohen's kappa as metrics.

    Parameters
    ----------
    LR : float, optional
        Learning rate for Adam. Defaults to the value of the module-level
        ``LR`` at the time this function is defined.

    Returns
    -------
    The compiled Keras model.
    """
    model = Sequential()
    model.add(densenet)
    model.add(layers.GlobalAveragePooling2D())
    model.add(layers.Dropout(0.80))
    # Output width follows n_classes so it stays in sync with the metric below.
    model.add(layers.Dense(n_classes, activation='sigmoid'))

    model.compile(
        loss='binary_crossentropy',
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        optimizer=Adam(learning_rate=LR),
        # NOTE(review): the targets fed to this model are multi-label vectors;
        # confirm that CohenKappa interprets them as intended.
        metrics=['accuracy',tfa.metrics.CohenKappa(num_classes=n_classes,weightage='quadratic')]
    )

    return model

In [None]:
# Build the compiled model and print its layer summary.
# Keras models have no PyTorch-style `.to(device)`; the former
# `model.to(device)` call raised AttributeError and has been dropped.
model = build_model()
model.summary()

In [None]:
# Train on augmented batches and validate on the untouched validation split.
# steps_per_epoch must be a whole number of batches; rounding up makes the
# final, partial batch count as a step.
# NOTE(review): fit_generator is deprecated in recent TensorFlow releases in
# favour of Model.fit -- confirm against the installed version.
history = model.fit_generator(
    data_generator,
    steps_per_epoch=int(np.ceil(x_train.shape[0] / BATCH_SIZE)),
    epochs=EPOCHS,
    validation_data=(x_val, y_val)
)
# The returned History object has no `.to(device)`; that call raised
# AttributeError and has been dropped.

In [None]:
# Learning curves: training vs. validation loss, then training vs. validation
# accuracy, one figure each.
loss_curves = ['loss', 'val_loss']
accuracy_curves = ['accuracy', 'val_accuracy']
history_df = pd.DataFrame(history.history)
history_df.loc[:, loss_curves].plot()
history_df.loc[:, accuracy_curves].plot()

In [None]:
# Ground-truth grades of the held-out rows, kept as an array for the
# evaluation cells.
y_true = df_test['isup_grade'].to_numpy()
df_test.head()

In [None]:
from random import randint

test_img_path = '/kaggle/pc_detection/test_data/'
# Decision threshold applied to each sigmoid output.
# NOTE(review): the provenance of this value is not shown here -- presumably
# tuned on validation data; confirm.
PRED_THRESHOLD = 0.37757874193797547

if os.path.exists(test_img_path):
    y_prob = model.predict(x_test)
    # Count the outputs above the threshold; with cumulative targets that
    # count minus one is the predicted grade.
    y_test = (y_prob > PRED_THRESHOLD).astype(int).sum(axis=1) - 1
    # A row with no output above the threshold would give -1; clamp it to the
    # lowest valid grade.
    y_test = np.maximum(y_test, 0)
else:
    # Fallback when no test images are available: random grades in [0, 5].
    y_test = [randint(0, 5) for _ in range(len(df_test))]

# assign() returns a new frame, so the prediction column is added without
# writing into a slice of train_df (no SettingWithCopyWarning).
df_test = df_test.assign(isup_grade_1=y_test)[["image_id", "isup_grade", "isup_grade_1"]]
df_test.to_csv("submission.csv", index=False)

In [None]:
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import confusion_matrix
# Confusion matrix of the held-out fold: rows are true grades, columns are
# predicted grades (scikit-learn convention).
confusion_matrix(y_true, y_test)

In [None]:
# Quadratic-weighted kappa, matching the weighting of the CohenKappa metric
# used during training (the default call computes the unweighted kappa).
cohen_kappa_score(y_true, y_test, weights='quadratic')

In [None]:
# Show the held-out rows with the true grade ('isup_grade') next to the
# predicted grade ('isup_grade_1').
df_test

In [None]:
import os
# List what is currently stored in the working test directory.
print(os.listdir('/kaggle/pc_detection/test_data/'))

In [None]:
import glob
# Delete every .tiff file from the working test directory.
removing_files = glob.glob('/kaggle/pc_detection/test_data/*.tiff')
for stale_tiff in removing_files:
    os.remove(stale_tiff)

In [None]:
# Persist the trained model and reload it to check that the file is usable.
# NOTE(review): the path is relative to the current working directory and
# 'kagle' looks like a typo -- confirm the intended location.
filepath = 'kagle/Denset/Dmodel/my_model.h5'
# Make sure the parent directories exist before writing the HDF5 file.
os.makedirs(os.path.dirname(filepath), exist_ok=True)
model.save(filepath)
new_model = tf.keras.models.load_model(filepath)
new_model.summary()