# 分類モデルの作成  
入ってきた画像が、そもそも糸球体(セグメンテーション対象)を含むのかどうかを分類するモデルを作成する。  
過去コンペの解法の多くは、一度この分類器を通すことで大幅に精度が向上していた。  
恐らくこちらのモデルにはそこまで精度が要求されないので、軽めのモデル(enetb0-b1)で良いのだと思う。  
何を正解ラベルとするかだが、mask画像のsumをとったときに、閾値を設けて 0 or 1 で分ければ良いと思う。(閾値は色々試す)  

* v1_2 :  
    confusion matrixの分析コードを追加  
    enet-b1に変更  
    4fold学習させる  

In [None]:
# Debug switch: when True the notebook runs a reduced configuration
# (CFG.n_epochs becomes 5, each dataset keeps at most 200 tiles, and only
# the first fold is processed).
DEBUG = False

## import

In [None]:
import sys
package_path = '../input/efficientnet-pytorch/EfficientNet-PyTorch/EfficientNet-PyTorch-master'
sys.path.append(package_path)

import os
import pandas as pd
import numpy as np
import random
import time
from tqdm import tqdm
import json
import cv2
from PIL import Image
import tifffile

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, roc_auc_score
import torch
from torch import nn, optim
import torchvision
from torchvision import transforms
from torch.utils.data import Dataset,DataLoader
from torch.utils.data.sampler import SequentialSampler, RandomSampler
from torch.optim import Adam
from torch.optim.lr_scheduler import CosineAnnealingLR
from scipy.ndimage.interpolation import zoom
import albumentations as A
from torch.nn import functional as F
from albumentations.pytorch import ToTensorV2
from efficientnet_pytorch import model as enet

In [None]:
class CFG:
    """Hyper-parameters and run settings shared by the notebook cells."""
    model_name = "HuBMAP_tachyon_classifier_v1_2"  # prefix of the saved weight file names
    backbone = 'efficientnet-b1'  # key into the `pretrained_model` path table
    Progress_Bar = False  # wrap data loaders in tqdm when True
    max_grad_norm = 1000  # NOTE(review): not referenced in the visible code
    gradient_accumulation_steps = 1  # NOTE(review): not referenced in the visible code
    init_lr = 1e-3  # Adam learning rate for the full-training phase
    weight_decay = 1e-5  # NOTE(review): not referenced in the visible code
    image_size = 256  # NOTE(review): not referenced in the visible code
    batch_size = 64  # batch size of the training DataLoader
    n_epochs = 5 if DEBUG else 25  # epoch budget; the training cell uses n_epochs-5 for full training
    n_fold = 4  # number of KFold splits
    train_fold = 4 # train only this many of the n_fold folds
    n_seed = 1  # NOTE(review): not referenced in the visible code
    seed = 42  # seed for seed_everything and for the KFold shuffle
    num_workers = 4  # DataLoader worker processes
    model_save_path = False  # only mentioned inside disabled code in fit_model
    mask_th = 100 # a tile is labelled 1 when its mask sum exceeds this value

In [None]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

def seed_everything(seed=CFG.seed):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) random generators.

    Also exports PYTHONHASHSEED and asks cuDNN for deterministic kernels.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for seed_fn in seeders:
        seed_fn(seed)
    torch.backends.cudnn.deterministic = True

seed_everything()

## Data

In [None]:
# Root of the competition data and its train sub-directory.
BASE_PATH = "../input/hubmap-kidney-segmentation/"
TRAIN_PATH = os.path.join(BASE_PATH, "train")

# Competition metadata tables.
train_df = pd.read_csv(os.path.join(BASE_PATH, "train.csv"))
hubmap_df = pd.read_csv(os.path.join(BASE_PATH,"HuBMAP-20-dataset_information.csv"))

# Pre-tiled 256x256 images and masks (iafoss dataset); HuBMAPDataset reads
# image tiles from TRAIN, same-named mask tiles from MASKS and the image ids
# from LABELS.
TRAIN = '../input/hubmap-256x256/train/'
MASKS = '../input/hubmap-256x256/masks/'
LABELS = '../input/hubmap-kidney-segmentation/train.csv'

## Dataset

In [None]:
# Normalisation constants applied per channel to RGB tiles scaled to [0, 1].
# Reference: https://www.kaggle.com/iafoss/256x256-images
mean = np.array([0.65459856,0.48386562,0.69428385])
std = np.array([0.15167958,0.23584107,0.13146145])

def img2tensor(img,dtype:np.dtype=np.float32):
    """Convert an H x W or H x W x C array into a C x H x W torch tensor.

    A 2-D input gets a trailing channel axis first; the data is cast to
    ``dtype`` before being wrapped by ``torch.from_numpy``.
    """
    if img.ndim == 2:
        img = img[:, :, np.newaxis]
    channels_first = img.transpose(2, 0, 1)
    return torch.from_numpy(channels_first.astype(dtype, copy=False))

class HuBMAPDataset(Dataset):
    """Tile-level binary classification dataset built from the 256x256 tiles.

    The image ids listed in ``LABELS`` are split with a seeded, shuffled KFold
    (``CFG.n_fold`` splits); a tile belongs to the selected split when the part
    of its file name before the first underscore is one of the selected ids.

    Args:
        fold: index of the KFold split to use.
        train: take the training part of the split when True, else the
            validation part.
        tfms: optional albumentations-style callable invoked as
            ``tfms(image=..., mask=...)``.
    """

    def __init__(self, fold, train=True, tfms=None):
        ids = pd.read_csv(LABELS).id.values
        kf = KFold(n_splits=CFG.n_fold,random_state=CFG.seed,shuffle=True)
        selected = list(kf.split(ids))[fold][0 if train else 1]
        ids = set(ids[selected])
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # sample order (and the DEBUG subset) reproducible across runs.
        fnames = sorted(
            fname for fname in os.listdir(TRAIN) if fname.split('_')[0] in ids
        )
        self.fnames = fnames[:200] if DEBUG else fnames
        self.train = train
        self.tfms = tfms

    def __len__(self):
        return len(self.fnames)

    def __getitem__(self, idx):
        """Return ``(normalised image tensor, 0/1 label)`` for tile ``idx``."""
        fname = self.fnames[idx]
        img_path = os.path.join(TRAIN,fname)
        mask_path = os.path.join(MASKS,fname)
        bgr = cv2.imread(img_path)
        mask = cv2.imread(mask_path,cv2.IMREAD_GRAYSCALE)
        # cv2.imread signals failure by returning None instead of raising.
        if bgr is None:
            raise FileNotFoundError(f"could not read image tile: {img_path}")
        if mask is None:
            raise FileNotFoundError(f"could not read mask tile: {mask_path}")
        img = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
        if self.tfms is not None:
            augmented = self.tfms(image=img,mask=mask)
            img,mask = augmented['image'],augmented['mask']
        # Positive when the (possibly augmented) mask sum exceeds the threshold.
        label = 1 if mask.sum() > CFG.mask_th else 0
        return img2tensor((img/255.0 - mean)/std),label

## Augmentation

In [None]:
def get_aug(p=1.0):
    """Build the full training augmentation pipeline.

    The pipeline is flips and 90-degree rotations, a shift/scale/rotate step,
    one optional geometric distortion and one optional colour change; the
    whole composition is applied with probability ``p``.
    """
    flips_and_turns = [A.HorizontalFlip(), A.VerticalFlip(), A.RandomRotate90()]
    shift_scale_rotate = A.ShiftScaleRotate(
        shift_limit=0.0625,
        scale_limit=0.2,
        rotate_limit=15,
        p=0.9,
        border_mode=cv2.BORDER_REFLECT,
    )
    distortion = A.OneOf(
        [
            A.OpticalDistortion(p=0.3),
            A.GridDistortion(p=.1),
            A.IAAPiecewiseAffine(p=0.3),
        ],
        p=0.3,
    )
    colour = A.OneOf(
        [
            A.HueSaturationValue(10,15,10),
            A.CLAHE(clip_limit=2),
            A.RandomBrightnessContrast(),
        ],
        p=0.3,
    )
    return A.Compose([*flips_and_turns, shift_scale_rotate, distortion, colour], p=p)

def get_aug_lastnepo(p=1.0):
    """Build the light pipeline (flips and 90-degree rotations only).

    fit_model switches to this pipeline for the final full-training epochs.
    """
    light_transforms = (A.HorizontalFlip, A.VerticalFlip, A.RandomRotate90)
    return A.Compose([make() for make in light_transforms], p=p)

## model

In [None]:
# Local checkpoint file for each supported EfficientNet backbone name;
# enetv2 loads the entry matching its `backbone` argument with torch.load.
pretrained_model = {
        'efficientnet-b0': '../input/efficientnet-pytorch/efficientnet-b0-08094119.pth',
        'efficientnet-b1': '../input/efficientnet-pytorch/efficientnet-b1-dbc7070a.pth',
        'efficientnet-b2': '../input/efficientnet-pytorch/efficientnet-b2-27687264.pth',
        'efficientnet-b3': '../input/efficientnet-pytorch/efficientnet-b3-c8376fa2.pth',
        'efficientnet-b4': '../input/efficientnet-pytorch/efficientnet-b4-e116e8b3.pth',
        'efficientnet-b5': '../input/efficientnet-pytorch/efficientnet-b5-586e6cc6.pth',
        
    }


class enetv2(nn.Module):
    """EfficientNet backbone with a fresh linear head that outputs raw logits.

    Args:
        backbone: EfficientNet variant name; must be a key of
            ``pretrained_model``.
        out_dim: number of output units of the linear head.
    """

    def __init__(self, backbone, out_dim=1):
        super().__init__()
        self.enet = enet.EfficientNet.from_name(backbone)
        # Load onto the CPU first so the checkpoint also opens on machines
        # without CUDA; the caller moves the whole model with .to(device).
        state_dict = torch.load(pretrained_model[backbone], map_location="cpu")
        self.enet.load_state_dict(state_dict)

        # Replace the original classifier with an identity so the backbone
        # returns features, and attach our own head on top of them.
        self.myfc = nn.Linear(self.enet._fc.in_features, out_dim)
        self.enet._fc = nn.Identity()
        # Not applied in forward(); the training code uses BCEWithLogitsLoss
        # and applies torch.sigmoid itself at evaluation time.
        self.sigmoid = nn.Sigmoid()

    def extract(self, x):
        """Return the backbone features for ``x``."""
        return self.enet(x)

    def forward(self, x):
        """Return the logits (no sigmoid) for ``x``."""
        features = self.extract(x)
        return self.myfc(features)

## train valid func

In [None]:
def train(model, iterator, optimizer, criterion, device, freeze):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: network returning one logit per sample.
        iterator: iterable of ``(x, y)`` batches that supports ``len()``.
        optimizer: optimizer stepped once per batch.
        criterion: loss called as ``criterion(logits, targets)`` with targets
            reshaped to a column.
        device: device the batches are moved to.
        freeze: accepted for signature compatibility; not used here.

    Returns:
        Sum of the batch losses divided by ``len(iterator)``.
    """
    epoch_loss = 0.0
    model.train()

    # Optionally show a progress bar.
    bar = tqdm(iterator) if CFG.Progress_Bar else iterator

    for x, y in bar:
        # as_tensor moves/casts without the copy-construct of torch.tensor(tensor).
        x = torch.as_tensor(x, device=device, dtype=torch.float32)
        y = torch.as_tensor(y, device=device, dtype=torch.float32)
        optimizer.zero_grad()
        y_pred = model(x)
        loss = criterion(y_pred, y.unsqueeze(1))
        loss.backward()
        optimizer.step()
        batch_loss = loss.item()
        epoch_loss += batch_loss

        if CFG.Progress_Bar:
            bar.set_description('Training loss: %.5f' % (batch_loss))

    return epoch_loss/len(iterator)

def evaluate(model, iterator, criterion, device):
    """Evaluate ``model`` on ``iterator`` without gradient tracking.

    Args:
        model: network returning one logit per sample.
        iterator: iterable of ``(x, y)`` batches that supports ``len()``.
        criterion: loss called as ``criterion(logits, targets)``.
        device: device the batches are moved to.

    Returns:
        ``(mean per-batch loss, ROC AUC)``. The AUC is ``-1`` when
        ``roc_auc_score`` raises ``ValueError``.
    """
    epoch_loss = 0.0
    pred_chunks = []
    target_chunks = []
    model.eval()

    bar = tqdm(iterator) if CFG.Progress_Bar else iterator

    with torch.no_grad():
        for x, y in bar:
            x = torch.as_tensor(x, device=device, dtype=torch.float32)
            y = torch.as_tensor(y, device=device, dtype=torch.float32)

            logits = model(x)
            loss = criterion(logits, y.type_as(logits).unsqueeze(1))
            batch_loss = loss.item()
            epoch_loss += batch_loss

            # The model returns logits; convert to probabilities for the AUC.
            pred_chunks.append(torch.sigmoid(logits).cpu())
            target_chunks.append(y.cpu())

            if CFG.Progress_Bar:
                bar.set_description('Validation loss: %.5f' % (batch_loss))

    # Concatenate once instead of growing a tensor on every batch.
    preds = torch.cat(pred_chunks, dim=0) if pred_chunks else torch.tensor([])
    targets = torch.cat(target_chunks, dim=0) if target_chunks else torch.tensor([])

    try:
        val_roc = roc_auc_score(targets, preds)
    except ValueError:
        val_roc = -1

    return epoch_loss/len(iterator), val_roc

def fit_model(model, name, train_iterator, valid_iterator, optimizer, loss_criterion, device, freeze, epochs):
    """Train ``model`` for ``epochs`` epochs, validating after each one.

    Args:
        model: network to train.
        name: prefix of the weight file written as ``{name}_final.pt``.
        train_iterator: training batches for ``train``.
        valid_iterator: validation batches for ``evaluate``.
        optimizer: optimizer passed to ``train``.
        loss_criterion: loss passed to ``train`` and ``evaluate``.
        device: device passed to ``train`` and ``evaluate``.
        freeze: True for the head-only phase; disables both the augmentation
            switch and the weight saving below.
        epochs: number of epochs to run.

    Returns:
        ``(train_losses, valid_losses, valid_rocs)``, one entry per epoch.

    NOTE(review): ``fold`` and ``scheduler`` are not parameters; they are read
    from module scope, so they must be set by the caller before this runs.
    """
    best_valid_score = float('inf')
    
    train_losses = []
    valid_losses = []
    valid_rocs = []
    
    for epoch in range(epochs):
        # Full-training phase only: switch to the light augmentation
        # (get_aug_lastnepo) for the remaining epochs.
        if (freeze==False) and (epoch == CFG.n_epochs-11): # epoch index at which the switch happens; with n_epochs=25 and epochs=20 that is index 14, i.e. the last 6 epochs
            print("-_-_-_-_-_-_-_-_-")
            print("No augment mode")
            print("-_-_-_-_-_-_-_-_-")
            # Datasets (uses the module-level `fold`)
            train_data = HuBMAPDataset(fold=fold,train=True,tfms=get_aug_lastnepo())
            valid_data = HuBMAPDataset(fold=fold,train=False)

            # Data loaders
            # These overwrite the function's own parameters (not a very clean way to do it)
            train_iterator = DataLoader(train_data,shuffle=True,batch_size=CFG.batch_size,num_workers=CFG.num_workers)
            valid_iterator = DataLoader(valid_data,shuffle=False,batch_size=8,num_workers=CFG.num_workers)
        
        # Module-level `scheduler` (None during the head-only phase) is set
        # to the current epoch index before the epoch is trained.
        if scheduler:
            scheduler.step(epoch)
        start_time = time.time()
    
        train_loss = train(model,train_iterator,optimizer,loss_criterion,device,freeze)
        valid_loss, valid_roc = evaluate(model,valid_iterator,loss_criterion,device)
        
        train_losses.append(train_loss)
        valid_losses.append(valid_loss)
        valid_rocs.append(valid_roc)

        # Disabled best-checkpoint logic, kept as a bare string literal; it is
        # never executed, which also leaves best_valid_score unused.
        """if valid_dice < best_valid_score:
            best_valid_score = valid_dice
            if CFG.model_save_path:
                torch.save(model.state_dict(), os.path.join(model_save_path,f'{name}.pt'))
            else:
                torch.save(model.state_dict(), f'{name}_best.pt')"""
        
        end_time = time.time()

        # Whole minutes and rounded remaining seconds of this epoch.
        epoch_mins, epoch_secs = (end_time-start_time)//60,round((end_time-start_time)%60)
    
        print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins:.0f}m {epoch_secs}s')
        print(f'lr:{optimizer.param_groups[0]["lr"]:.7f}')
        print(f'Train Loss: {train_loss:.3f}')
        print(f'Val. Loss: {valid_loss:.3f} | Val. ROC Score: {valid_roc:.3f}')
        
        # Written after every full-training epoch under the same file name, so
        # only the weights of the last completed epoch remain on disk.
        if not freeze:
            torch.save(model.state_dict(), f'{name}_final.pt')
        
    return train_losses, valid_losses, valid_rocs

## run training

In [None]:
# Per-fold histories (one list of per-epoch values per fold) and trained models.
tr_loss=[]
val_loss=[]
val_roc=[]
models = []


# NOTE: `fold` and `scheduler` assigned in this loop are also read by
# fit_model through module scope, so their names must not change.
for fold in range((1 if DEBUG else CFG.train_fold)):
    print(f"Fitting on Fold {fold+1}")
    
    # Datasets
    train_data = HuBMAPDataset(fold=fold,train=True,tfms=get_aug())
    valid_data = HuBMAPDataset(fold=fold,train=False)
    
    # Data loaders
    train_iterator = DataLoader(train_data,shuffle=True,batch_size=CFG.batch_size,num_workers=CFG.num_workers)
    valid_iterator = DataLoader(valid_data,shuffle=False,batch_size=8,num_workers=CFG.num_workers)
    
    # Instantiate the model
    model = enetv2(CFG.backbone).to(device)
    name = CFG.model_name + "_f" + str(fold)
    
    # Phase 1: train only the head (myfc) for 5 epochs with everything else frozen
    print("+-+-+-+-+-+-+-+-+")
    print("pretrain mode")
    print("+-+-+-+-+-+-+-+-+")
    loss_criterion = nn.BCEWithLogitsLoss()
    opt= Adam(model.parameters(), lr=1e-3)
    scheduler=None
    
    head_name = ["myfc.weight","myfc.bias"]
    for hname,param in model.named_parameters():
        if hname in head_name:
            param.requires_grad = True
        else:
            param.requires_grad = False
    
    # Histories of the head-only phase are discarded.
    nouse0,nouse1,nouse2 = fit_model(model, name, train_iterator, valid_iterator, opt, loss_criterion, device,freeze=True,epochs=5)
    
    print("@*@*@*@*@*@*@*@*@")
    print("fulltrain mode")
    print("@*@*@*@*@*@*@*@*@")
    # Loss function
    loss_criterion = nn.BCEWithLogitsLoss()
    
    # Optimizer (fresh Adam state for the full-training phase)
    opt= Adam(model.parameters(), lr=CFG.init_lr)
    
    # Learning-rate scheduler
    # NOTE(review): when DEBUG is True, CFG.n_epochs is 5, so CFG.n_epochs-5
    # is 0 and the full-training phase below runs no epochs.
    scheduler = CosineAnnealingLR(opt,CFG.n_epochs-5)
    
    # Phase 2: unfreeze every parameter
    for hname,param in model.named_parameters():
        param.requires_grad = True
    # Hand everything to fit_model and start the full training
    temp_tr_loss, temp_val_loss, temp_val_rocs = fit_model(model, name, train_iterator, valid_iterator, opt, loss_criterion, device,freeze=False, epochs=CFG.n_epochs-5)
    
    
    # Record the losses and the metric for this fold
    tr_loss.append(temp_tr_loss)
    val_loss.append(temp_val_loss)
    val_roc.append(temp_val_rocs)
    
    # A separate model is trained per fold; keep each trained model in the list
    models.append(model)

In [None]:
# One figure per fold: loss curves on the left, validation ROC AUC on the right.
for fold_tr_loss, fold_val_loss, fold_val_roc in zip(tr_loss, val_loss, val_roc):
    fig,ax = plt.subplots(nrows=1, ncols=2, figsize=(20,5))
    # Labels are required for legend() to have anything to show.
    ax[0].plot(fold_tr_loss, label='Training loss')
    ax[0].plot(fold_val_loss, label='Validation loss')
    ax[0].set_title('Training and Validation Loss')
    ax[0].set_xlabel('Epoch')

    ax[1].plot(fold_val_roc, label='Validation ROC AUC')
    ax[1].set_title('Val roc Score')
    ax[1].set_xlabel('Epoch')

    ax[0].legend();
    ax[1].legend();

# confusion matrixの作成

In [None]:
from sklearn.metrics import classification_report

In [None]:
#(model数*TTA数)回すので注意
def get_predictions(model, iterator):
    """Return flattened sigmoid outputs and targets for every batch.

    Args:
        model: network returning logits.
        iterator: iterable of ``(x, y)`` batches.

    Returns:
        ``(predictions, targets)`` as 1-D float64 NumPy arrays, in iteration
        order (both empty when ``iterator`` yields nothing).
    """
    model.eval()
    bar = tqdm(iterator) if CFG.Progress_Bar else iterator

    # Start from an empty float64 array so the result dtype and the
    # empty-iterator result match repeated np.append on np.array([]).
    pred_chunks = [np.array([])]
    target_chunks = [np.array([])]
    with torch.no_grad():
        for x, y in bar:
            x = torch.as_tensor(x, device=device, dtype=torch.float32)
            y = torch.as_tensor(y, device=device, dtype=torch.float32)
            probs = torch.sigmoid(model(x))
            pred_chunks.append(probs.cpu().numpy().ravel())
            target_chunks.append(y.cpu().numpy().ravel())
    # Single concatenation instead of re-allocating on every batch.
    return np.concatenate(pred_chunks), np.concatenate(target_chunks)

In [None]:
# Gather the validation predictions and labels of every trained fold.
fold_preds = []
fold_targets = []
for fold in range((1 if DEBUG else CFG.train_fold)):
    print(f"Validating on Fold {fold+1}")

    # Validation split of this fold, in a fixed order
    valid_data = HuBMAPDataset(fold=fold,train=False)
    valid_iterator = DataLoader(valid_data,shuffle=False,batch_size=8,num_workers=CFG.num_workers)

    # Score it with the model trained on the same fold
    model = models[fold]
    pred_res,y_res = get_predictions(model,valid_iterator)

    fold_preds.append(pred_res)
    fold_targets.append(y_res)

# Flat float64 arrays holding all folds back to back
pred = np.concatenate([np.array([]), *map(np.ravel, fold_preds)])
y = np.concatenate([np.array([]), *map(np.ravel, fold_targets)])

In [None]:
from sklearn.metrics import confusion_matrix, roc_curve, auc

def youden_index(y_pred, y, return_thresholds=False):
    """Compute Youden's J statistic at every point of the ROC curve.

    Args:
        y_pred: predicted scores.
        y: true binary labels.
        return_thresholds: when True, also return the score thresholds that
            ``roc_curve`` associates with each ROC point.

    Returns:
        ``(J, tpr, fpr)`` where ``J = tpr + specificity - 1``, or
        ``(J, tpr, fpr, thresholds)`` when ``return_thresholds`` is True.
        All arrays are aligned index by index.
    """
    fpr, tpr, thresholds = roc_curve(y, y_pred)
    # Specificity
    specificity = 1 - fpr
    # Cut-off criterion based on the Youden index
    cutoff_criterion = tpr + specificity - 1
    if return_thresholds:
        return cutoff_criterion, tpr, fpr, thresholds
    return cutoff_criterion,tpr,fpr

In [None]:
# Locate the ROC operating point that maximises the Youden index.
cutoff,tpr,fpr = youden_index(pred,y) # cutoff: Youden index per ROC point
best_idx = cutoff.argmax() # index of the largest Youden index
# [largest Youden index, FPR at that point (x axis), TPR at that point (y axis)]
# NOTE: none of these three values is a probability threshold.
cutoff_opt = [max(cutoff), fpr[best_idx], tpr[best_idx]]
print("[Youden_index,x軸,y軸]")
print(cutoff_opt)

In [None]:
# cutoff_opt[1] is a false-positive rate, not a score threshold, so look up
# the threshold that roc_curve associates with the Youden-optimal point.
roc_fpr, roc_tpr, roc_thresholds = roc_curve(y, pred)
best_threshold = roc_thresholds[np.argmax(roc_tpr + (1 - roc_fpr) - 1)]
# roc_curve treats a sample as positive when its score is >= the threshold.
pred_binary = np.array(pred >= best_threshold, dtype="int8")
# classification_report expects the true labels first, then the predictions.
print(classification_report(y, pred_binary))

In [None]:
from sklearn import metrics
import matplotlib.pyplot as plt
import numpy as np

# ROC curve over the pooled validation predictions of all folds.
fpr, tpr, thres = roc_curve(y,pred)
auc_ = auc(fpr, tpr)
# Index of the Youden-optimal point, used to read off its score threshold.
best_idx = np.argmax(tpr + (1 - fpr) - 1)

# Plot the ROC curve and mark the Youden-optimal operating point.
plt.plot(fpr, tpr, label='ROC curve (area = %.2f)'%auc_)
plt.scatter(cutoff_opt[1],cutoff_opt[2])
plt.legend()
plt.title('ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# cutoff_opt[0] is the Youden index itself, so label it as such and show the
# actual score threshold next to it.
plt.text(cutoff_opt[1]+0.1, cutoff_opt[2]-0.1,
         f'Youden index = {cutoff_opt[0]:.3f}\nthreshold = {thres[best_idx]:.3f}', fontsize=12);
plt.grid(True)