In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Install the libraries this notebook depends on that are not part of the Kaggle image.
# NOTE(review): the original note read "in kaggle need to -> Dataset" — presumably these
# packages must be attached as a Kaggle Dataset when the kernel has no internet; confirm.
!pip install pretrainedmodels
!pip install segmentation_models_pytorch

In [None]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import pathlib, sys, os, random, time
import numba, cv2, gc
import glob

from sklearn.model_selection import KFold

import matplotlib.pyplot as plt
%matplotlib inline

from tqdm.notebook import tqdm
import albumentations as A
import rasterio
from rasterio.windows import Window

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as D

import torchvision
from torchvision import transforms as T

# set up seed for reproduction
def set_seeds(seed=42):
    """Seed every random number generator used by this notebook.

    Seeds Python's `random`, NumPy and PyTorch (CPU and all CUDA devices) and
    configures cuDNN for deterministic behaviour.

    Args:
        seed: integer seed shared by all generators.
    """
    random.seed(seed)
    # Only affects Python processes started after this point; the hash seed of
    # the running interpreter is fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark mode auto-tunes convolution algorithms at run time and may pick
    # different kernels between runs, so it must be off for reproducibility.
    torch.backends.cudnn.benchmark = False

set_seeds()

In [None]:

# Root directory of the competition data; 'train.csv', 'train/' and 'test/' are
# resolved relative to it by the cells below.
DATA_PATH = '../input/hubmap-kidney-segmentation/'

# Run on the GPU when one is available, otherwise fall back to the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' 

import logging
# Send INFO-and-above log records to 'log.log' in the working directory.
logging.basicConfig(filename='log.log',
                    format='%(asctime)s - %(name)s - %(levelname)s -%(module)s:  %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S ',
                    level=logging.INFO)

In [None]:
# used for converting the decoded image to rle mask
def rle_encode(im):
    """Run-length encode a binary mask.

    im: numpy array where 1 marks the mask and 0 the background.
    Returns a space-separated string of "start length" pairs; starts are
    1-indexed positions in the column-major (Fortran order) flattening.
    """
    flat = im.flatten(order='F')
    # Pad with background on both sides so runs touching either end are closed.
    padded = np.concatenate([[0], flat, [0]])
    # 1-indexed positions where the value changes: run starts and run ends alternate.
    runs = np.flatnonzero(padded[1:] != padded[:-1]) + 1
    # Convert every end position into a run length.
    runs[1::2] -= runs[::2]
    return ' '.join(map(str, runs))

def rle_decode(mask_rle, shape=(256, 256)):
    """Decode a run-length string into a binary mask.

    mask_rle: space-separated "start length" pairs, starts being 1-indexed
              positions in the column-major flattening of the mask.
    shape: (height, width) of the array to return.
    Returns a uint8 numpy array, 1 - mask, 0 - background.
    """
    tokens = mask_rle.split()
    # Even tokens are starts (shifted to 0-indexed), odd tokens are lengths.
    starts = np.asarray(tokens[0::2], dtype=int) - 1
    lengths = np.asarray(tokens[1::2], dtype=int)
    ends = starts + lengths

    flat = np.zeros(shape[0] * shape[1], dtype=np.uint8)
    for begin, stop in zip(starts, ends):
        flat[begin:stop] = 1
    # The encoding is column-major, so restore the 2-D layout in Fortran order.
    return flat.reshape(shape, order='F')

# accelerate
@numba.njit()
def rle_numba(pixels):
    """Run-length encode a flattened binary mask (numba-compiled).

    pixels: 1-D array where 1 marks the mask and 0 the background.
    Returns a flat list [start, length, start, length, ...] with 1-indexed
    starts, i.e. the same numbers `rle_encode` writes into its string.
    """
    size = len(pixels)
    points = []
    if size == 0:
        return points

    # `in_run` is True while we are inside a foreground run whose start has
    # been recorded but whose length has not been emitted yet.
    in_run = pixels[0] == 1
    if in_run:
        # A run that begins on the very first pixel starts at position 1
        # (positions are 1-indexed).
        points.append(1)

    for i in range(1, size):
        if pixels[i] != pixels[i-1]:
            if in_run:
                # Run ended just before i (0-indexed): length = (i+1) - start.
                points.append(i+1 - points[-1])
                in_run = False
            else:
                points.append(i+1)
                in_run = True

    # Close a run that reaches the last pixel.
    if in_run:
        points.append(size - points[-1] + 1)
    return points

def rle_numba_encode(image):
    """Run-length encode a 2-D mask via the numba-compiled `rle_numba`.

    image: numpy array, flattened in column-major (Fortran) order before encoding.
    Returns the run values joined by single spaces.
    """
    column_major = image.flatten(order='F')
    return ' '.join(map(str, rle_numba(column_major)))

# crop the raw image.tiff
def _axis_tiles(length, window, min_overlap):
    """Return (starts, stops) of the tiles covering one axis of size `length`."""
    count = length // (window - min_overlap) + 1
    starts = np.linspace(0, length, num=count, endpoint=False, dtype=np.int64)
    # Pin the last tile to the far edge; clamp at 0 so an axis shorter than
    # `window` does not produce a negative start offset.
    starts[-1] = max(length - window, 0)
    stops = (starts + window).clip(0, length)
    return starts, stops


def make_grid(shape, window=256, min_overlap=32):
    """
        Split an image of the given shape into overlapping tiles.

        shape: (size along first axis, size along second axis) of the image.
        window: tile edge length.
        min_overlap: minimum overlap between neighbouring tiles.

        Return Array of size (N,4), where N - number of tiles,
        2nd axis represents slices: x1,x2,y1,y2 (x indexes the first axis of
        `shape`, y the second). Rows are ordered with x as the outer loop and
        y as the inner loop. When an axis is shorter than `window`, its tiles
        are clipped to the axis bounds instead of starting at a negative offset.
    """
    x, y = shape
    x1, x2 = _axis_tiles(x, window, min_overlap)
    y1, y2 = _axis_tiles(y, window, min_overlap)
    nx, ny = len(x1), len(y1)

    slices = np.zeros((nx, ny, 4), dtype=np.int64)
    # Broadcast the per-axis bounds over the (nx, ny) grid of tiles.
    slices[..., 0] = x1[:, None]
    slices[..., 1] = x2[:, None]
    slices[..., 2] = y1[None, :]
    slices[..., 3] = y2[None, :]
    return slices.reshape(nx*ny, 4)

In [None]:
# Build the dataset from the run-length encoded masks and the TIFFs -> label y and image x.
# Identity affine transform (a=1, b=0, c=0, d=0, e=1, f=0), passed as `transform`
# to every rasterio.open call in this notebook.
identity = rasterio.Affine(1, 0, 0, 0, 1, 0)

class HubDataset(D.Dataset):
    """In-memory dataset of fixed-size tiles cut from the training TIFFs.

    Every selected TIFF is tiled with `make_grid`; each tile and the matching
    crop of its decoded RLE mask are resized to 256x256 and kept in memory.

    Args:
        path: dataset root containing 'train.csv' and the 'train/' folder.
        tiff_ids: ids (index values of train.csv) of the images to include.
        transform: callable invoked as transform(image=..., mask=...) that
            returns a mapping with 'image' and 'mask' entries.
        window: tile edge length passed to `make_grid`.
        overlap: minimum tile overlap passed to `make_grid`.
        threshold: in training mode a tile is kept when its mask crop sums to
            at least this value, or when more than 25% of the resized image
            values exceed 32.
        isvalid: when True every tile is kept, without filtering.
    """

    def __init__(self, path, tiff_ids, transform,
                 window=256, overlap=32, threshold = 100, isvalid=False):
        self.path = pathlib.Path(path)
        self.tiff_ids = tiff_ids
        self.transform = transform
        self.window = window
        self.overlap = overlap
        self.threshold = threshold
        self.isvalid = isvalid
        self.csv = pd.read_csv((self.path / 'train.csv').as_posix(),
                               index_col=[0])

        # Parallel lists: tile image, tile mask, id of the source TIFF.
        self.x, self.y, self.id = [], [], []
        self.build_slices()
        self.len = len(self.x)

        # HWC uint8 image -> CHW float tensor, normalised with per-channel mean/std.
        to_tensor = T.ToTensor()
        normalize = T.Normalize([0.625, 0.448, 0.688],
                                [0.131, 0.177, 0.101])
        self.as_tensor = T.Compose([to_tensor, normalize])

    def build_slices(self):
        """Read every selected TIFF, tile it and fill self.x / self.y / self.id."""
        self.masks, self.files, self.slices = [], [], []

        for i, filename in enumerate(self.csv.index.values):
            if filename not in self.tiff_ids:
                continue

            filepath = (self.path / 'train' / (filename + '.tiff')).as_posix()
            self.files.append(filepath)

            print('Transform', filename)
            with rasterio.open(filepath, transform = identity) as dataset:
                full_mask = rle_decode(self.csv.loc[filename, 'encoding'], dataset.shape)
                self.masks.append(full_mask)
                grid = make_grid(dataset.shape, window=self.window, min_overlap=self.overlap)

                for x1, x2, y1, y2 in grid:
                    tile = dataset.read([1,2,3],
                                        window=Window.from_slices((x1,x2),(y1,y2)))
                    # rasterio returns (bands, rows, cols); move bands last.
                    image = cv2.resize(np.moveaxis(tile, 0, -1), (256, 256))

                    mask_crop = full_mask[x1:x2,y1:y2]
                    masks = cv2.resize(mask_crop, (256, 256))

                    # Validation keeps everything; training keeps tiles with
                    # enough mask pixels or enough bright image content.
                    keep = (self.isvalid
                            or mask_crop.sum() >= self.threshold
                            or (image>32).mean() > 0.25)
                    if keep:
                        self.slices.append([i,x1,x2,y1,y2])
                        self.x.append(image)
                        self.y.append(masks)
                        self.id.append(filename)

    # get data operation
    def __getitem__(self, index):
        """Return (image tensor, mask with a leading channel axis) for one tile."""
        augments = self.transform(image=self.x[index], mask=self.y[index])
        image_tensor = self.as_tensor(augments['image'])
        return image_tensor, augments['mask'][None]

    def __len__(self):
        """
        Total number of samples in the dataset
        """
        return self.len


In [None]:
class SoftDiceLoss(nn.Module):
    """Soft Dice loss: 1 - mean Dice coefficient.

    Args:
        smooth: constant added to numerator and denominator so that empty
            predictions/targets do not divide by zero.
        dims: axes summed over when counting true/false positives and false
            negatives; the Dice coefficient is averaged over the remaining axes.
    """

    def __init__(self, smooth=1., dims=(-2,-1)):
        super(SoftDiceLoss, self).__init__()
        self.smooth = smooth
        self.dims = dims

    def forward(self, x, y):
        """x: predicted probabilities (float), y: target mask of the same shape."""
        tp = (x * y).sum(self.dims)
        fp = (x * (1 - y)).sum(self.dims)
        fn = ((1 - x) * y).sum(self.dims)
        dc = (2 * tp + self.smooth) / (2 * tp + fp + fn + self.smooth)
        dc = dc.mean()

        return 1 - dc

bce_fn = nn.BCEWithLogitsLoss()
# bce_fn = nn.BCELoss()
dice_fn = SoftDiceLoss()

def loss_fn(y_pred, y_true, ratio=1.0, hard=False):
    """Weighted sum of BCE-with-logits and Dice loss.

    Args:
        y_pred: raw model logits.
        y_true: float target mask with the same shape as `y_pred`.
        ratio: weight of the BCE term; the Dice term is weighted by 1 - ratio.
        hard: when True the sigmoid probabilities are binarised at 0.5 before
            the Dice term is computed.

    Returns:
        Scalar loss tensor.
    """
    bce = bce_fn(y_pred, y_true)
    probs = y_pred.sigmoid()
    if hard:
        # Threshold first, then cast back to float: the Dice loss computes
        # `1 - x`, which is not defined for bool tensors.
        probs = (probs > 0.5).float()
    dice = dice_fn(probs, y_true)
    return ratio*bce + (1-ratio)*dice


In [None]:
# model train function 
def train(model, train_loader, criterion, optimizer):
    """Run one training epoch and return the mean batch loss.

    Args:
        model: network to optimise; it is switched to training mode here.
        train_loader: iterable yielding (image, target) batches.
        criterion: callable invoked as criterion(output, target, ratio, hard);
            it is called with ratio=1 and hard=False.
        optimizer: optimizer stepping the model parameters.

    Returns:
        Mean of the per-batch loss values over the epoch.
    """
    # validation() leaves the model in eval mode; switch back so that
    # BatchNorm statistics and dropout behave correctly during training.
    model.train()
    losses = []
    for image, target in train_loader:
        image, target = image.to(DEVICE), target.float().to(DEVICE)
        optimizer.zero_grad()

        output = model(image)
        loss = criterion(output, target, 1, False)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
    return np.array(losses).mean()

# # make up all the patch to eval dice 
def np_dice_score(probability, mask, threshold=0.4, eps=0.001):
    """Dice score between thresholded probabilities and a ground-truth mask.

    All elements are pooled into a single score (arrays are flattened first).

    Args:
        probability: numpy array of predicted probabilities.
        mask: numpy array of ground-truth values; values > 0.5 count as foreground.
        threshold: probabilities strictly above this value count as foreground.
        eps: small constant added to the denominator to avoid division by zero.

    Returns:
        2 * |P ∩ T| / (|P| + |T| + eps).
    """
    p = probability.reshape(-1) > threshold
    t = mask.reshape(-1) > 0.5

    # Sum of both foreground counts (the Dice denominator).
    total = p.sum() + t.sum()
    overlap = (p*t).sum()
    return 2*overlap/(total+eps)

# make up all the patch to eval dice with searching for the best threshold
# def np_dice_score(probability, mask):
#     threshold = 0
#     dice_best = 0
#     for nt in np.linspace(0.2, 0,8, 17):
#         p = probability.reshape(-1)
#         t = mask.reshape(-1)
#         p = p > nt
#         t = t > 0.5
#         uion = p.sum() + t.sum()
    
#         overlap = (p*t).sum()
#         dice = 2*overlap/(uion+0.001)
#         if(dice > dice_best):
#             threshold = nt
#             dice_best = dice
#         #print(nt, dice)

#     return threshold, dice_best
        
# model eval function 
def validation(model, val_loader, criterion):
    """Compute the Dice score of `model` over every batch of `val_loader`.

    The model is put in eval mode (and left there). Sigmoid probabilities and
    targets of all batches are gathered and scored together by `np_dice_score`.
    `criterion` is accepted but not used.
    """
    model.eval()
    gathered_probs, gathered_masks = [], []

    with torch.no_grad():
        for image, target in val_loader:
            image = image.to(DEVICE)
            target = target.float().to(DEVICE)

            logits = model(image)
            gathered_probs.append(logits.sigmoid().data.cpu().numpy())
            gathered_masks.append(target.data.cpu().numpy())

    return np_dice_score(np.concatenate(gathered_probs),
                         np.concatenate(gathered_masks))
    

In [None]:
# Training hyper-parameters (not referenced by the inference cells visible in this notebook).
EPOCHES = 8
BATCH_SIZE = 4

# Tiling parameters handed to make_grid: tile edge length and minimum overlap.
WINDOW=1024
MIN_OVERLAP=40
# Edge length the tiles are resized to before being fed to the model.
NEW_SIZE=320

# Training-time augmentation: resize to NEW_SIZE, then random flips/rotations,
# one optional colour transform, one optional geometric distortion and a final
# shift/scale/rotate with reflected borders.
train_trfm = A.Compose([
    # A.RandomCrop(NEW_SIZE*3, NEW_SIZE*3),
    A.Resize(NEW_SIZE, NEW_SIZE),
    A.HorizontalFlip(p=0.5),
    A.VerticalFlip(p=0.5),
    A.RandomRotate90(),
    # With probability 0.3 apply exactly one of the colour transforms below.
    A.OneOf([
        A.HueSaturationValue(10,15,10),
        A.CLAHE(clip_limit=2),
        A.RandomBrightnessContrast(),            
    ], p=0.3),
    # With probability 0.3 apply exactly one of the geometric distortions below.
    # NOTE(review): IAAPiecewiseAffine is an imgaug-backed transform that newer
    # albumentations releases deprecate/remove — confirm the installed version.
    A.OneOf([
        A.OpticalDistortion(p=0.3),
        A.GridDistortion(p=0.1),
        A.IAAPiecewiseAffine(p=0.3),
    ], p=0.3),
     A.ShiftScaleRotate(shift_limit=0.0625, scale_limit=0.2, rotate_limit=15, p=0.9, 
                                 border_mode=cv2.BORDER_REFLECT),
])

# Validation-time transform: resize to NEW_SIZE plus random flips and 90-degree
# rotations. Note that the flips/rotations are random here too, so validation
# inputs differ between runs unless the RNG is seeded.
val_trfm = A.Compose([
    # A.CenterCrop(NEW_SIZE, NEW_SIZE),
    A.Resize(NEW_SIZE,NEW_SIZE),
    A.HorizontalFlip(p=0.5),
    A.VerticalFlip(p=0.5),
    A.RandomRotate90(),
#     A.OneOf([
#         A.RandomContrast(),
#         A.RandomGamma(),
#         A.RandomBrightness(),
#         A.ColorJitter(brightness=0.07, contrast=0.07,
#                    saturation=0.1, hue=0.1, always_apply=False, p=0.3),
#         ], p=0.3),
#     A.OneOf([
#         A.OpticalDistortion(p=0.5),
#         A.GridDistortion(p=0.5),
#         A.IAAPiecewiseAffine(p=0.5),
#     ], p=0.3),
#     A.ShiftScaleRotate(),
])



In [None]:
# Define the model: a U-Net from segmentation_models_pytorch with an
# EfficientNet-B5 encoder and a single output channel.
# model = get_model()
# model = Unet(encoder_name="resnet34",classes=1,activation=None)
import segmentation_models_pytorch as smp

# NOTE(review): encoder_weights="imagenet" presumably triggers a download of the
# pretrained encoder weights — confirm this works when the kernel has no internet.
model = smp.Unet(
    encoder_name="efficientnet-b5",        # choose encoder, e.g. mobilenet_v2 or efficientnet-b7
    encoder_weights="imagenet",     # use `imagenet` pretrained weights for encoder initialization
    in_channels=3,                  # model input channels (1 for grayscale images, 3 for RGB, etc.)
    classes=1,                      # model output channels (number of classes in your dataset)
)

In [None]:
# # csv write fast for public/A leaderboard 8 fold 
# # to be confirmed 
# trfm = T.Compose([
#     T.ToPILImage(),
#     T.Resize(NEW_SIZE),
#     T.ToTensor(),
#     T.Normalize([0.625, 0.448, 0.688],
#                 [0.131, 0.177, 0.101]),
# ])


# #mold_path = '../input/trained_model/'
# mold_path = '../input/dataweights/unet_8_fold/'
# fold_models = []
# for fold_model_path in glob.glob(mold_path + '*.pth'):
#     fold_models.append(torch.load(fold_model_path))
# print(len(fold_models))


# p = pathlib.Path(DATA_PATH)

# subm = {}


# for i, filename in enumerate(p.glob('test/*.tiff')):
#     print(f'{i+1} Predicting {filename.stem}')
#     start_time = time.time()
#     dataset = rasterio.open(filename.as_posix(), transform = identity)
#     slices = make_grid(dataset.shape, window=WINDOW, min_overlap=MIN_OVERLAP)
#     preds = np.zeros(dataset.shape, dtype=np.uint8)
#     print(slices.shape)
    
#     for (x1,x2,y1,y2) in slices:
        
#         image = dataset.read([1,2,3],
#                     window=Window.from_slices((x1,x2),(y1,y2)))
#         image = np.moveaxis(image, 0, -1)
#         image = trfm(image)
        
        
#         pred = None
        
        
#         for fold_model in fold_models:
            
#             model.load_state_dict(fold_model)
#             model.eval()
#             model.to(DEVICE)
            
#             # with 3 times testifid
#             with torch.no_grad():
#                 image = image.to(DEVICE)
#                 image = image.reshape(1, 3, 320, 320)
                
#                 score = model(image)[0][0]
                            
#                 score2 = model(torch.flip(image, [0, 3]))
#                 score2 = torch.flip(score2, [3, 0])[0][0]

#                 score3 = model(torch.flip(image, [1, 2]))
#                 score3 = torch.flip(score3, [2, 1])[0][0]
                
#                 if pred is None:
#                     pred = (score + score2 + score3) / 3.0
#                 else:
#                     pred += (score + score2 + score3) / 3.0
# #                 if pred is None:
# #                     pred = score
# #                 else:
# #                     pred += score
        
#         pred = pred / len(fold_models)
#         score_sigmoid = pred.sigmoid().cpu().numpy()
#         score_sigmoid = cv2.resize(score_sigmoid, (WINDOW, WINDOW))
        
        
#         preds[x1:x2,y1:y2] = (score_sigmoid > 0.4).astype(np.uint8)
        
        
#     subm[i] = {'id':filename.stem, 'predicted': rle_numba_encode(preds)}
       
#     print((time.time()-start_time)/60**1) 
#     del preds
#     gc.collect();



In [None]:
# csv write fast for public/A leaderboard single fold 
# to be confirmed 
# Single-fold inference for the public leaderboard: tile every test TIFF,
# predict each tile with flip test-time augmentation and RLE-encode the result.

# Per-tile preprocessing: HWC uint8 array -> NEW_SIZE x NEW_SIZE normalised tensor.
trfm = T.Compose([
    T.ToPILImage(),
    T.Resize((NEW_SIZE, NEW_SIZE)),
    T.ToTensor(),
    T.Normalize([0.625, 0.448, 0.688],
                [0.131, 0.177, 0.101]),
])

# define your model path: either a checkpoint file or a directory holding *.pth files
mold_path = '../input/trained_model/'

if os.path.isdir(mold_path):
    weight_files = sorted(glob.glob(os.path.join(mold_path, '*.pth')))
    if not weight_files:
        raise FileNotFoundError(f'no *.pth checkpoint found in {mold_path}')
    # Single-fold inference: use the first checkpoint in sorted order.
    weight_file = weight_files[0]
else:
    weight_file = mold_path

# load_state_dict expects the loaded state dict, not a path.
# NOTE: torch.load unpickles the file — only load checkpoints you trust.
model.load_state_dict(torch.load(weight_file, map_location=DEVICE))
model.to(DEVICE)
model.eval()


p = pathlib.Path(DATA_PATH)

subm = {}


for i, filename in enumerate(p.glob('test/*.tiff')):
    print(f'{i+1} Predicting {filename.stem}')
    start_time = time.time()

    # Context manager guarantees the TIFF handle is closed after prediction.
    with rasterio.open(filename.as_posix(), transform = identity) as dataset:
        slices = make_grid(dataset.shape, window=WINDOW, min_overlap=MIN_OVERLAP)
        preds = np.zeros(dataset.shape, dtype=np.uint8)
        print(slices.shape)

        for (x1,x2,y1,y2) in slices:

            image = dataset.read([1,2,3],
                        window=Window.from_slices((x1,x2),(y1,y2)))
            # rasterio returns (bands, rows, cols); the transform expects HWC.
            image = np.moveaxis(image, 0, -1)
            # Add the batch axis: (3, H, W) -> (1, 3, H, W).
            image = trfm(image).unsqueeze(0).to(DEVICE)

            # Test-time augmentation: original + horizontal flip + vertical flip.
            # Only spatial axes are flipped (3 = width, 2 = height); flipping
            # axis 1 would reverse the RGB channel order fed to the model.
            with torch.no_grad():
                score = model(image)[0][0]

                score2 = model(torch.flip(image, [3]))
                score2 = torch.flip(score2, [3])[0][0]

                score3 = model(torch.flip(image, [2]))
                score3 = torch.flip(score3, [2])[0][0]

                pred = (score + score2 + score3) / 3.0

            score_sigmoid = pred.sigmoid().cpu().numpy()
            # Resize back to the tile's actual size; cv2 takes (width, height).
            score_sigmoid = cv2.resize(score_sigmoid, (int(y2 - y1), int(x2 - x1)))

            # Overlapping regions are overwritten by the tile processed last.
            preds[x1:x2,y1:y2] = (score_sigmoid > 0.4).astype(np.uint8)

    subm[i] = {'id':filename.stem, 'predicted': rle_numba_encode(preds)}

    print((time.time()-start_time)/60**1)
    del preds
    gc.collect();



In [None]:
# Turn the per-image predictions collected in `subm` into a table with the
# columns 'id' and 'predicted' and write it out without the index column.
# NOTE(review): the file name says "8_fold" although the active inference cell
# is the single-fold one — confirm the expected submission file name.
submission = pd.DataFrame.from_dict(subm, orient='index')
submission.to_csv('./submission_unet_8_fold.csv', index=False)
