In [109]:
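# Colab version of this notebook.
#
# Changes from the previous version:
# - replaced train_test_split with the precomputed 8-fold `split` column
# - switched to my own cropped dataset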

In [1]:
DEBUG = False

In [2]:
import numpy as np
import pandas as pd
import os, random, glob, cv2, gc
from PIL import Image
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from termcolor import colored
import sys

from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import train_test_split

import torch 
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

import albumentations as A

from functools import partial
import scipy as sp

from zipfile import ZipFile

from PIL import Image, ImageChops
import time


import numpy as np
import pandas as pd
import skimage.io
from tqdm.notebook import tqdm
import glob
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from zipfile import ZipFile
import cv2
import albumentations as A
import os

In [3]:
Image.MAX_IMAGE_PIXELS = 933120000

In [None]:
!rm -rf /root/.kaggle
!mkdir /root/.kaggle

In [None]:
!mv kaggle.json /root/.kaggle/kaggle.json
!chmod 600 /root/.kaggle/kaggle.json

In [85]:
!kaggle datasets download muerbingsha/pandacroppedlevel11
!kaggle datasets download muerbingsha/pandacroppedlevel12
!kaggle datasets download muerbingsha/pandacroppedlevel13

Downloading pandacroppedlevel11.zip to /home/jupyter
100%|█████████████████████████████████████▉| 4.82G/4.84G [01:54<00:00, 72.7MB/s]
100%|██████████████████████████████████████| 4.84G/4.84G [01:54<00:00, 45.2MB/s]


In [4]:
# test zip file
# Smoke test: open one archive, decode its second entry, and print the image
# size, mode and pixel count.
# NOTE(review): neither `myzip` nor `f` is closed here; both handles stay open
# for the lifetime of the kernel.
myzip = ZipFile('pandacroppedlevel13.zip')
files = myzip.infolist()
f = myzip.open(files[1])
img = Image.open(f)
print(img.size, img.mode, len(img.getdata()))

(5348, 5900) RGB 31553200


In [5]:
class config:
    """Namespace of run-wide hyper-parameters; read as `config.NAME`, never instantiated."""
    # SZ, N and LEVEL are not referenced by the code in this notebook; they only
    # record the starting point. find_tiles()/get_tiles() carry their own
    # defaults (SZ=256, N=36).
    SZ = 256    # starting tile edge length
    N = 36      # starting number of tiles
    LEVEL = 1   # not used below
    
    IMG_SIZE = 1536          # side length the assembled tile mosaic is resized to
    BS = 2                   # batch_size passed to every DataLoader
    SEED = 2020              # passed to seed_everything()
    LR = 0.00001             # learning rate passed to Adam
    LOG = './efnb1-log.txt'  # text file that train() appends one entry to per epoch
    
# GRAY_TR is not referenced anywhere in the code visible in this notebook.
GRAY_TR = 235
# find_tiles() keeps halving the tile size while the number of non-white tiles
# is <= int(N_TR * N).
N_TR = 0.85
# get_tiles() counts a tile as non-white when its pixel sum is below
# WHITE_TR times the sum of an all-255 tile.
WHITE_TR = 0.95

# EfficientNet variant number: used for the default backbone name
# ('efficientnet-b{V}') and for the checkpoint file names.
# NOTE(review): config.LOG hard-codes 'efnb1' and does not follow V.
V = 1

In [6]:
# seed
def seed_everything(seed=2020):
    """Seed every random number generator this notebook relies on.

    Seeds Python's `random`, NumPy's global generator and PyTorch (CPU and all
    CUDA devices), and puts cuDNN into deterministic mode.

    Args:
        seed: integer seed applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    # Only affects interpreters started after this point (e.g. subprocesses);
    # the hash seed of the running interpreter was fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.manual_seed(seed)
    # Seeds every visible GPU, not just the current one; safe without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different kernels from run
    # to run, which defeats the deterministic flag above.
    torch.backends.cudnn.benchmark = False
    
seed_everything(config.SEED)


In [7]:
torch.cuda.empty_cache()
gc.collect()

80

In [8]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

# Data

In [9]:
df = pd.read_csv('train.csv')
df = df.iloc[:1000] if DEBUG else df

In [10]:
# shuffle
# Shuffle into a new frame with an explicit seed instead of `df = shuffle(df)`.
# The original reassigned `df` and drew from the global NumPy RNG, so running
# this cell a second time silently produced a different val/test partition.
# NOTE(review): on a fresh Restart & Run All this should give the same
# permutation as before (the global RNG was seeded with config.SEED and not
# consumed in between) - confirm before comparing against old checkpoints.
shuffled_df = shuffle(df, random_state=config.SEED)

# split
# Fold 0 of the precomputed `split` column is held out and cut in half:
# first half -> test, second half -> validation. All other folds -> training.
train_df = shuffled_df[shuffled_df.split != 0]
holdout_df = shuffled_df[shuffled_df.split == 0]
n = len(holdout_df) // 2
test_df = holdout_df.iloc[:n]
val_df = holdout_df.iloc[n:]

# The three parts must cover every row exactly once.
assert len(train_df) + len(val_df) + len(test_df) == len(df)

len(train_df), len(val_df), len(test_df)

(9289, 664, 663)

In [11]:
train_df.head()

Unnamed: 0,image_id,data_provider,isup_grade,gleason_score,split,folder
805,25a27922edaf3f338e8693ededcbe1c2,radboud,4,4+4,6,1
3218,f232dc705893be350eaf1afd474818aa,karolinska,2,3+4,3,1
9513,f0a6026f3c1dafdf2496721f3721c039,karolinska,1,3+3,2,2
8682,647946e65654e3e71e42eea5cc95d01a,karolinska,2,3+4,7,1
7508,e4bbe0282fbed7b4c2e82a36fec5246b,radboud,2,3+4,4,2


# Augmentation

In [12]:
# Training augmentation: transpose and vertical/horizontal flips, each applied
# with probability 0.5. MyTrainDataset applies this pipeline to every tile and
# then once more to the assembled mosaic.
train_transforms = A.Compose([A.Transpose(p=0.5), 
                             A.VerticalFlip(p=0.5),
                             A.HorizontalFlip(p=0.5)])
# Validation/test: empty pipeline (no transforms listed).
val_transforms = A.Compose([])

# Dataset

In [13]:
def get_tiles(img, SZ=256, N=36, white_tr=None):
    '''
    Cut an image into SZ x SZ tiles and keep the darkest ones.

    The image is padded with 255 (split evenly on both sides) so that height
    and width are multiples of SZ, then cut into a grid of tiles. Tiles are
    ranked by pixel sum, lowest (darkest) first.

    img: numpy array of shape (h, w, c)
    SZ: tile edge length in pixels
    N: minimum number of tiles to return; raised to the largest perfect
       square not exceeding the number of non-white tiles when that is larger
    white_tr: a tile is "non-white" when its pixel sum is below
       white_tr * SZ*SZ*c*255; defaults to the module-level WHITE_TR
    Return
        tiles, SZ, N, info
        tiles: array (k, SZ, SZ, c) with k <= N (fewer when the grid is small)
        SZ: the tile size that was used (unchanged)
        N: the possibly enlarged tile count
        info: number of non-white tiles in the whole grid
    '''
    if white_tr is None:
        white_tr = WHITE_TR

    # pad img
    h, w, c = img.shape
    pad_h = (SZ - h % SZ) % SZ
    pad_w = (SZ - w % SZ) % SZ
    padded = np.pad(img, [[pad_h // 2, pad_h - pad_h // 2],
                          [pad_w // 2, pad_w - pad_w // 2],
                          [0, 0]], constant_values=255)

    # cut into a (rows * cols, SZ, SZ, c) stack of tiles
    rows, cols = padded.shape[0] // SZ, padded.shape[1] // SZ
    all_tiles = (padded.reshape(rows, SZ, cols, SZ, c)
                       .transpose(0, 2, 1, 3, 4)
                       .reshape(-1, SZ, SZ, c))

    # one pixel sum per tile, reused for both the count and the ranking
    tile_sums = all_tiles.reshape(all_tiles.shape[0], -1).sum(-1)
    info = (tile_sums < white_tr * SZ * SZ * c * 255).sum()  # how many tiles are not white

    # get new N
    possible_N = int(np.sqrt(info)) ** 2
    if N < possible_N:
        N = possible_N

    idxs = np.argsort(tile_sums)[:N]
    tiles = all_tiles[idxs]

    return tiles, SZ, N, info

In [14]:
def find_tiles(img, SZ=256, N=36):
    '''
    Tile an image, shrinking the tile size until enough tiles contain tissue.

    Starts from get_tiles(img, SZ, N). While the number of non-white tiles
    (info) is at most int(N_TR * N) and SZ is still above 64, SZ is halved and
    the image is re-tiled. If fewer than N tiles come back, the stack is
    topped up with all-255 tiles so exactly N are returned.

    Return
        tiles, SZ, N, info  (as produced by the last get_tiles call)
    '''
    tiles, SZ, N, info = get_tiles(img, SZ=SZ, N=N)

    # too many white tiles: retry with tiles half the size
    while SZ > 64 and info <= int(N_TR * N):
        half = SZ // 2
        tiles, SZ, N, info = get_tiles(img, SZ=half, N=N)

    # the grid may hold fewer than N tiles; fill the remainder with white
    missing = N - len(tiles)
    if missing > 0:
        tiles = np.pad(tiles, [[0, missing], [0, 0], [0, 0], [0, 0]], constant_values=255)

    return tiles, SZ, N, info

In [15]:
class MyTrainDataset(Dataset):
    """Dataset that turns one slide image into a square mosaic of its darkest tiles.

    Each item is read from `pandacroppedlevel1{folder}.zip`, tiled with
    find_tiles(), augmented, assembled into an n x n mosaic, inverted,
    normalised and resized to config.IMG_SIZE.

    Args:
        df: frame with `image_id`, `folder` and `isup_grade` columns.
        split: 'train' (augment tiles and mosaic) or 'val' (val_transforms only).
        shuffle_df: shuffle the rows once at construction time.
        shuffle_tiles: randomise tile order inside the mosaic on every access.

    Raises:
        ValueError: if `split` is neither 'train' nor 'val'.
    """

    _SPLITS = ('train', 'val')

    def __init__(self, df, split='train', shuffle_df=False, shuffle_tiles=False):
        super().__init__()

        # Fail at construction time: previously an unknown split only surfaced
        # as an UnboundLocalError on the first __getitem__ call.
        if split not in self._SPLITS:
            raise ValueError(f'split must be one of {self._SPLITS}, got {split!r}')

        if shuffle_df:
            df = shuffle(df)
        self.df = df.reset_index(drop=True)

        self.split = split
        self.shuffle_tiles = shuffle_tiles

    def __len__(self):
        return len(self.df)

    def _read_image(self, idx):
        """Decode row `idx`'s JPEG from its zip archive into a numpy array."""
        name = self.df.image_id[idx]
        folder = self.df.folder[idx]
        # Context managers close the archive and the member handle; they used
        # to be leaked on every access. The pixels are materialised by
        # np.array() before the handles are closed.
        with ZipFile(f'pandacroppedlevel1{folder}.zip') as archive:
            with archive.open(f'train{folder}/{name}.jpeg') as member:
                return np.array(Image.open(member))

    def __getitem__(self, idx):
        """Return (image tensor of shape (3, IMG_SIZE, IMG_SIZE), label tensor)."""
        img = self._read_image(idx)

        tiles, SZ, N, _ = find_tiles(img)  # find suitable tiles, SZ, N

        # apply transform to each tile
        tile_transform = train_transforms if self.split == 'train' else val_transforms
        imgs = [tile_transform(image=t)['image'] for t in tiles]

        # shuffle tiles
        if self.shuffle_tiles:
            imgs = shuffle(imgs)

        # concat: lay the N tiles out row-major on an n x n grid
        n = int(np.sqrt(N))
        images = np.zeros((SZ*n, SZ*n, 3), dtype=np.int32)
        for i in range(n):
            for j in range(n):
                images[i*SZ : (i+1)*SZ, j*SZ : (j+1)*SZ, :] = imgs[i*n+j]

        # normalize
        images = 255 - images  # reverse: white background becomes 0
        if self.split == 'train':
            images = train_transforms(image=images)['image']
        images = A.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])(image=images)['image']

        # resize
        images = cv2.resize(images, (config.IMG_SIZE, config.IMG_SIZE))
        images = torch.tensor(images).permute(2, 0, 1)
        label = torch.tensor(self.df.isup_grade[idx])
        return images, label

In [16]:
# test dataset
# DEBUG-only check: build a dataset, fetch one item, print its tensor shape and
# display it.
if DEBUG:
    train_ds = MyTrainDataset(train_df, 'train', shuffle_tiles=True)
    x, y = train_ds[6]
    print(x.shape)
    # x*0.5+0.5 undoes the mean=0.5/std=0.5 normalisation; the image stays
    # colour-inverted because the dataset applies `255 - images`.
    plt.imshow((x*0.5+0.5).permute(1, 2, 0).numpy())
    plt.title(y)
    
    # drop the temporaries; the real train_ds is created in the next cell
    del train_ds, x, y

In [17]:
# Training loader: augmented items with shuffled tile order; sample order is
# reshuffled every epoch and the last partial batch is kept.
train_ds = MyTrainDataset(train_df, 'train', shuffle_tiles=True)
train_dl = DataLoader(train_ds, batch_size=config.BS, shuffle=True, drop_last=False) # config.BS images per batch, shuffled every epoch

# Validation and test loaders: 'val' mode (no augmentation), fixed tile and
# sample order.
val_ds = MyTrainDataset(val_df, 'val', shuffle_tiles=False)
val_dl = DataLoader(val_ds, batch_size=config.BS, shuffle=False, drop_last=False)

test_ds = MyTrainDataset(test_df, 'val', shuffle_tiles=False)
test_dl = DataLoader(test_ds, batch_size=config.BS, shuffle=False, drop_last=False)

In [18]:
# test dl
# DEBUG-only check: pull one batch from the training loader and display its
# first image (de-normalised) with its label as the title.
if DEBUG:
    x, y = next(iter(train_dl))
    plt.imshow((x[0]*0.5+0.5).permute(1, 2, 0).numpy())
    plt.title(y[0])

# Model

In [19]:
!pip install efficientnet_pytorch



In [19]:
from efficientnet_pytorch import EfficientNet

class MyModel(nn.Module):
    """Pretrained EfficientNet backbone followed by a single-output linear head.

    The backbone's own `_fc` classifier is swapped for an identity layer so it
    emits pooled features; `self.fc` maps those features to one scalar per
    image.
    """

    def __init__(self, backbone=f'efficientnet-b{V}'):
        super().__init__()

        self.base = EfficientNet.from_pretrained(backbone)
        # read the feature width before the original classifier is replaced
        feature_dim = self.base._fc.in_features
        self.fc = nn.Linear(feature_dim, 1)
        self.base._fc = nn.Identity()

    def forward(self, x):
        features = self.base(x)
        return self.fc(features)

In [20]:
# DEBUG-only check: run a random batch through a fresh model on the CPU and
# print the output shape.
if DEBUG:
    model = MyModel()
    x = torch.randn(2, 3, 1536, 1536) # author's note: with the b4 backbone only 512x512 inputs were usable (reason not recorded)
    y = model(x)
    print(y.shape)

In [21]:
# Build the model with the default backbone and move it to the selected device.
model = MyModel()
model = model.to(device)

# Adam over all parameters. The scheduler multiplies the learning rate by 0.2
# once the monitored value (train() passes the validation loss) has stopped
# improving for more than `patience` epochs.
optimizer = optim.Adam(model.parameters(), lr=config.LR)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.2, patience=5, verbose=1)

Loaded pretrained weights for efficientnet-b1


In [22]:
# freeze 
# def freeze_until(model, name):
#     flag = False
#     for n, p in model.named_parameters():
#         if n == name:
#             flag = True
#         p.requires_grad = flag
# freeze_until(model, 'model.layer4.2.conv1.weight')

# Training

In [23]:
class OptimizedRounder():
    """Turn continuous regression outputs into grades 0-5 using five thresholds.

    The thresholds can be tuned with `fit`, which searches (Nelder-Mead) for
    the cut points that maximise quadratic-weighted Cohen's kappa.
    """

    def __init__(self):
        # Holds the default thresholds until fit() replaces it with the
        # scipy OptimizeResult of the search.
        self.coef_ = [0.5, 1.5, 2.5, 3.5, 4.5]

    @staticmethod
    def _bucketize(X, coef):
        """Map each value in X to a grade 0..5 given five thresholds.

        Same rules as the if/elif chain this replaces: 0 if x < coef[0];
        k if coef[k-1] <= x < coef[k] for k = 1..4; otherwise 5 (including
        NaN). The first matching rule wins, which matters when an optimiser
        proposes thresholds that are not sorted. The result keeps X's dtype.
        """
        preds = np.asarray(X)
        labels = np.full(preds.shape, 5, dtype=preds.dtype)
        # Assign from the last rule to the first so that earlier rules
        # overwrite later ones, reproducing first-match-wins.
        for k in range(4, 0, -1):
            labels[(preds >= coef[k - 1]) & (preds < coef[k])] = k
        labels[preds < coef[0]] = 0
        return labels

    def _kappa_loss(self, coef, X, y):
        """Negative quadratic-weighted kappa of y against X bucketed with coef."""
        X_p = self._bucketize(X, coef)
        ll = cohen_kappa_score(y, X_p, weights='quadratic')
        return -ll

    def fit(self, X, y):
        """Search for thresholds maximising kappa; the result is stored in coef_."""
        # Import the submodule explicitly: `import scipy as sp` alone does not
        # guarantee that `sp.optimize` has been loaded.
        from scipy.optimize import minimize

        loss_partial = partial(self._kappa_loss, X=X, y=y)
        initial_coef = [0.5, 1.5, 2.5, 3.5, 4.5]
        self.coef_ = minimize(loss_partial, initial_coef, method='nelder-mead')

    def predict(self, X, coef=[0.5, 1.5, 2.5, 3.5, 4.5]):
        """Bucket X into grades using `coef`.

        The default is the fixed initial thresholds, NOT the fitted ones; pass
        `coefficients()` explicitly to use fitted thresholds. The default list
        is never mutated.
        """
        return self._bucketize(X, coef)

    def coefficients(self):
        '''Return the fitted thresholds, or the initial ones if fit() has not run yet.'''
        fitted = self.coef_
        if isinstance(fitted, dict):  # OptimizeResult is a dict subclass
            return fitted['x']
        return np.asarray(fitted, dtype=float)

In [24]:
rounder = OptimizedRounder()

In [25]:
def train_on(epoch):
    """Run one training epoch over `train_dl` and return (mean loss, kappa).

    Uses the module-level `model`, `optimizer`, `train_dl`, `device` and
    `rounder`. Predictions are bucketed with the rounder's default thresholds
    for the reported quadratic-weighted Cohen's kappa.

    Args:
        epoch: epoch number, used only in the printed summary.

    Returns:
        (mean batch MSE loss, epoch-level quadratic-weighted kappa)
    """
    # Release cached memory once per epoch. Doing this after every batch (as
    # before) forces a full garbage-collection pass and a CUDA allocator flush
    # on each step without making more memory available to PyTorch itself.
    torch.cuda.empty_cache()
    gc.collect()

    model.train()
    criterion = nn.MSELoss()  # built once instead of once per batch

    loss_epoch = []
    preds_epoch = []
    y_epoch = []
    bar = tqdm(enumerate(train_dl), total=len(train_dl))
    for i, (x, y) in bar:
        x = x.to(device, dtype=torch.float32)
        y = y.to(device, dtype=torch.float32)

        y_preds = model(x).view(-1)  # [batch, 1] -> [batch,]

        # get metrics
        loss = criterion(y_preds, y)

        # update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # move results to the host for bookkeeping
        y_np = y.detach().cpu().numpy()
        y_preds_np = rounder.predict(y_preds.detach().cpu().numpy())
        loss_value = loss.item()

        preds_epoch.append(y_preds_np)
        y_epoch.append(y_np)
        loss_epoch.append(loss_value)

        # Per-batch kappa is only a rough progress indicator; with very small
        # batches it can be undefined (NaN).
        c = cohen_kappa_score(y_np, y_preds_np, weights='quadratic')
        bar.set_description('loss: %.4f, cohen: %.4f' % (loss_value, c))

        # drop tensor references before the next forward pass
        del x, y, y_preds, loss

    cohen = cohen_kappa_score(np.concatenate(preds_epoch),
                              np.concatenate(y_epoch),
                              weights='quadratic')

    print('Epoch: %d, Loss: %.4f, Cohen: %.4f' % (epoch, np.mean(loss_epoch), cohen))
    return np.mean(loss_epoch), cohen


In [26]:
torch.cuda.empty_cache()
gc.collect()

66

In [27]:
if DEBUG:
    train_on(1)

In [28]:
def val_on(epoch, dl, fit_rounder=True):
    """Evaluate the module-level `model` on `dl` and return (loss, kappa, thresholds).

    Args:
        epoch: epoch number, used only in the printed summary.
        dl: DataLoader yielding (image, label) batches.
        fit_rounder: when True (default, the previous behaviour) the global
            `rounder` is re-fitted on this loader's predictions before
            scoring, so the reported kappa is optimistic for `dl`. Pass False
            to score a held-out set with thresholds fitted earlier; the
            rounder must already have been fitted in that case.

    Returns:
        (mean batch MSE loss, quadratic-weighted kappa, threshold array)
    """
    # Release cached memory once per call rather than after every batch.
    torch.cuda.empty_cache()
    gc.collect()

    model.eval()
    criterion = nn.MSELoss()

    loss_epoch = []
    preds_epoch = []
    y_epoch = []
    bar = tqdm(enumerate(dl), total=len(dl))
    with torch.no_grad():
        for i, (x, y) in bar:
            x = x.to(device, dtype=torch.float32)
            y = y.to(device, dtype=torch.float32)

            y_preds = model(x).view(-1)

            # get metrics
            loss = criterion(y_preds, y)

            # keep raw predictions; thresholds are applied after the loop
            y_preds_np = y_preds.detach().cpu().numpy()
            y_np = y.detach().cpu().numpy()
            loss_value = loss.item()

            loss_epoch.append(loss_value)
            preds_epoch.append(y_preds_np)
            y_epoch.append(y_np)

            # progress-bar kappa uses the rounder's default thresholds
            c = cohen_kappa_score(rounder.predict(y_preds_np), y_np, weights='quadratic')
            bar.set_description('loss: %.4f, cohen: %.4f' % (loss_value, c))

            # clean
            del x, y, y_preds, loss

    # cohen
    preds_epoch = np.concatenate(preds_epoch)
    y_epoch = np.concatenate(y_epoch)

    if fit_rounder:
        rounder.fit(preds_epoch, y_epoch)
    coef = rounder.coefficients()
    preds_epoch = rounder.predict(preds_epoch, coef)
    cohen = cohen_kappa_score(preds_epoch, y_epoch, weights='quadratic')

    print('Epoch: %d, Loss: %.4f, Cohen: %.4f' % (epoch, np.mean(loss_epoch), cohen))

    return np.mean(loss_epoch), cohen, coef
        
            

In [29]:
if DEBUG:
    val_on(1, val_dl)

In [31]:
# Kept at module level (outside train()) so the history survives repeated
# train() calls. train() appends each epoch's validation kappa and saves a
# "best" checkpoint when the new value is the maximum of this list.
best_cohen = [0.8644] # best validation kappa from the previous run; the bar a new "best" must reach

In [32]:
def train(epochs):
    """Train for `epochs` epochs, logging and checkpointing after each one.

    Per epoch: one pass of train_on, one pass of val_on over `val_dl`, a
    scheduler step on the validation loss, one entry appended to config.LOG,
    a "best" checkpoint when the validation kappa matches or beats everything
    in `best_cohen`, and a per-epoch checkpoint. A final checkpoint is written
    after the last epoch.
    """
    for e in range(epochs):
        train_loss, train_cohen = train_on(e)
        val_loss, val_cohen, coef = val_on(e, val_dl)

        # adjust lr
        scheduler.step(val_loss)

        # write to log
        with open(config.LOG, 'a') as log_file:
            entry = time.ctime() + f' Epoch: {e}, Train loss: {(train_loss):.4f}, Train cohen: {(train_cohen):.4f}\n   Val loss: {(val_loss):.4f}, Val cohen:{(val_cohen):.4f}, Coef: {coef} \n\n'
            log_file.write(entry)

        # save best: record the score first, then check whether it is the max
        best_cohen.append(val_cohen)
        is_best = val_cohen >= max(best_cohen)
        if is_best:
            print('save best model')
            torch.save(model.state_dict(), f'efnb{V}-best.pth')

        # always keep a per-epoch checkpoint
        torch.save(model.state_dict(), f'./efnb{V}-{e}.pth')

    torch.save(model.state_dict(), f'./efnb{V}-final.pth')

In [None]:
train(30)

HBox(children=(FloatProgress(value=0.0, max=4645.0), HTML(value='')))

  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)



Epoch: 0, Loss: 0.3134, Cohen: 0.9342


HBox(children=(FloatProgress(value=0.0, max=332.0), HTML(value='')))


Epoch: 0, Loss: 0.9448, Cohen: 0.8615


HBox(children=(FloatProgress(value=0.0, max=4645.0), HTML(value='')))


Epoch: 1, Loss: 0.2799, Cohen: 0.9415


HBox(children=(FloatProgress(value=0.0, max=332.0), HTML(value='')))


Epoch: 1, Loss: 0.9073, Cohen: 0.8676
save best model


HBox(children=(FloatProgress(value=0.0, max=4645.0), HTML(value='')))


Epoch: 2, Loss: 0.2459, Cohen: 0.9483


HBox(children=(FloatProgress(value=0.0, max=332.0), HTML(value='')))


Epoch: 2, Loss: 0.8608, Cohen: 0.8656


HBox(children=(FloatProgress(value=0.0, max=4645.0), HTML(value='')))


Epoch: 3, Loss: 0.3768, Cohen: 0.9235


HBox(children=(FloatProgress(value=0.0, max=332.0), HTML(value='')))


Epoch: 3, Loss: 0.8292, Cohen: 0.8560


HBox(children=(FloatProgress(value=0.0, max=4645.0), HTML(value='')))


Epoch: 4, Loss: 0.3574, Cohen: 0.9277


HBox(children=(FloatProgress(value=0.0, max=332.0), HTML(value='')))


Epoch: 4, Loss: 0.8751, Cohen: 0.8627


HBox(children=(FloatProgress(value=0.0, max=4645.0), HTML(value='')))


Epoch: 5, Loss: 0.3551, Cohen: 0.9276


HBox(children=(FloatProgress(value=0.0, max=332.0), HTML(value='')))


Epoch: 5, Loss: 0.8755, Cohen: 0.8617


HBox(children=(FloatProgress(value=0.0, max=4645.0), HTML(value='')))


Epoch: 6, Loss: 0.3447, Cohen: 0.9294


HBox(children=(FloatProgress(value=0.0, max=332.0), HTML(value='')))


Epoch: 6, Loss: 0.8124, Cohen: 0.8667


HBox(children=(FloatProgress(value=0.0, max=4645.0), HTML(value='')))

In [33]:
# Restore the weights saved after epoch 0 of the run above.
# NOTE(review): this loads the epoch-0 checkpoint, not `efnb{V}-best.pth` -
# confirm that is the intended model for the test evaluation below.
model.load_state_dict(torch.load(f'efnb{V}-0.pth'))

<All keys matched successfully>

In [35]:
# Evaluate the loaded checkpoint on the held-out test half of fold 0.
# NOTE(review): val_on() re-fits the rounding thresholds on whatever loader it
# is given, so the kappa reported here uses thresholds tuned on the test set.
val_on(1, test_dl) 
# Values noted from an earlier run (kappa, then thresholds):
# 0.8858
# [0.51885345, 1.53662393, 2.44524873, 3.61225587, 4.29971003]

HBox(children=(FloatProgress(value=0.0, max=332.0), HTML(value='')))

  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)



Epoch: 1, Loss: 1.1410, Cohen: 0.8296


(1.1410031,
 0.8296214644835782,
 array([0.53895498, 1.44433128, 2.58725179, 3.29614676, 4.30293131]))