In [None]:
"""
timmを使ってCNNモデルを学習させる。
optimizer: Adam/AdamW
scheduler: OneCycleLR/ReduceLROnPlateau

学習:
  画像+テーブルデータで学習させる。
  テーブルデータを使用しない場合はOTHER_FEATURE_COLUMNSを空配列にする。
保存項目:
  CSV: フォールド,エポック,loss,ACC,F1,AUC,probabilistic F1 (pF1),pF1の閾値
  オブジェクト: モデル(model),閾値(threshold),モデル種類(model_type)
※ 現在は閾値調整をprobabilistic F1 (pF1) に対して行っている。
"""

In [None]:
# Data locations
TRAIN_IMAGE_PATH = "images"
TRAIN_DF_PATH = "train.csv"
# Cross-validation settings
N_SPLITS = 5
EPOCHS_PER_FOLD = 1
# Model settings
MODEL_TYPE = "efficientnet_b2"
BATCH_SIZE = 60
HIDDEN_SIZE = 64
IMG_SIZE_H = 512
IMG_SIZE_W = 256
# Name of the directory that receives the training log
SAVE_VERSION = "rsna_first"

In [None]:
# Categorical feature columns (these are label-encoded further down)
CATEGORY_COLUMNS = ["category1"]
# Non-image (tabular) features taken from the data frame.
# Leave empty to train on images only; to enable tabular features use e.g.
#   OTHER_FEATURE_COLUMNS = ['age'] + CATEGORY_COLUMNS
OTHER_FEATURE_COLUMNS = []
# Numeric columns whose missing values should be filled after loading
FILLNA_NUMERIC_COLUMNS = []

In [None]:
import numpy as np
import pandas as pd
import os

In [None]:
# 画像のtransform
from PIL import Image
from torch.utils import data as data
from torchvision import transforms as transforms

def get_transforms(transform_choises=(0,1,2,3),is_augmentation=False,resize_h=512,resize_w=512):
    """
    Build the torchvision preprocessing pipeline for an image.

    Args:
        transform_choises: indices of the augmentations to apply, in the given
            order, when ``is_augmentation`` is True:
              0: random horizontal flip
              1: random vertical flip
              2: random rotation (-5 to 5 degrees)
              3: random resized crop to ``(resize_h, resize_w)``
        is_augmentation: True for the (random) training pipeline, False for a
            deterministic resize-only pipeline suitable for validation/inference.
        resize_h: output image height.
        resize_w: output image width.

    Returns:
        A ``transforms.Compose`` ending with ``ToTensor`` and ``Normalize``.
    """
    augmentations = [
        # random horizontal flip
        transforms.RandomHorizontalFlip(0.5),
        # random vertical flip
        transforms.RandomVerticalFlip(0.5),
        # random rotation
        transforms.RandomRotation(degrees=(-5, 5)),
        # random resized crop: scale is drawn from U(0.8, 1), ratio is the aspect-ratio range
        transforms.RandomResizedCrop((resize_h, resize_w), scale=(0.8, 1), ratio=(0.45, 0.55)),
    ]
    # Shared tail of both pipelines.
    to_normalized_tensor = [
        transforms.ToTensor(),
        transforms.Normalize(mean=0.2179, std=0.0529),
    ]
    if is_augmentation:
        choices = list(transform_choises)
        steps = [augmentations[i] for i in choices]
        # Only the resized crop (choice 3) fixes the output size. Without it the
        # images keep their original sizes and cannot be stacked into a batch,
        # so fall back to a plain resize.
        if 3 not in choices:
            steps.append(transforms.Resize((resize_h, resize_w)))
    else:
        # No random operations here: the non-augmented pipeline must be
        # deterministic so that evaluation results are reproducible.
        steps = [transforms.Resize((resize_h, resize_w))]
    return transforms.Compose(steps + to_normalized_tensor)

In [None]:
# Load the training table.
df = pd.read_csv(TRAIN_DF_PATH)
# Fill missing values of the configured numeric columns with each column's own mean.
# The result is assigned back: fillna(inplace=True) on a column selection only
# modifies a temporary copy. The guard keeps the cell valid when no column is configured.
if FILLNA_NUMERIC_COLUMNS:
    df[FILLNA_NUMERIC_COLUMNS] = df[FILLNA_NUMERIC_COLUMNS].fillna(df[FILLNA_NUMERIC_COLUMNS].mean())
df.head()

In [None]:
# Add the image file path of every row: <TRAIN_IMAGE_PATH>/<value of 'tmp'>.png
# NOTE(review): the column name 'tmp' looks like a placeholder — confirm which
# column of train.csv actually identifies the image file.
df['my_image_path'] = TRAIN_IMAGE_PATH + '/' + df['tmp'].astype(str) + '.png'
df['my_image_path'].head(3)

In [None]:
from sklearn.preprocessing import LabelEncoder
def label_encoding(df,columns):
    """
    Return a copy of ``df`` in which every column listed in ``columns`` is
    replaced by its integer label encoding (a fresh ``LabelEncoder`` per column).

    The input frame is not modified.
    """
    df_tmp = df.copy()
    # Iterate over the argument, not the notebook-level CATEGORY_COLUMNS,
    # so the function encodes exactly what the caller asked for.
    for column in columns:
        df_tmp[column] = LabelEncoder().fit_transform(df[column])
    return df_tmp
# Label-encode the categorical columns and preview the result
df = label_encoding(df,CATEGORY_COLUMNS)
df.head(5)

In [None]:
from sklearn.model_selection import StratifiedGroupKFold
def do_stratified_group_kfold(df,target_column,group_column,n_splits=None):
    """
    Add an integer ``split`` column holding the validation-fold id of every row.

    Folds come from ``StratifiedGroupKFold``: stratified on ``target_column``,
    with all rows sharing a ``group_column`` value kept in the same fold.

    Args:
        df: frame to annotate; it is modified in place.
        target_column: column used for stratification.
        group_column: column defining the groups.
        n_splits: number of folds; defaults to the notebook-level ``N_SPLITS``.

    Returns:
        The same ``df`` object, with the ``split`` column set.
    """
    if n_splits is None:
        n_splits = N_SPLITS
    splitter = StratifiedGroupKFold(n_splits)
    # split() yields positional indices, so collect the fold ids in a positional
    # array rather than assigning through label-based .loc (which is only
    # correct when the index happens to be 0..n-1).
    fold_ids = np.full(len(df), -1, dtype=int)
    for k, (_, test_idx) in enumerate(splitter.split(df, df[target_column], groups=df[group_column])):
        fold_ids[test_idx] = k
    df['split'] = fold_ids
    return df
# Assign the fold ids; the function writes the 'split' column into df itself,
# so the return value is not needed here.
do_stratified_group_kfold(df,'cancer','patient_id')
# Mean of the target per fold, to check how well the folds are stratified
display(df.groupby('split')['cancer'].mean())
df.head(5)

In [None]:
# データセット作成
import torch
from torch.utils.data import Dataset
class ImgDataSet(Dataset):
    """
    General-purpose image dataset backed by a data frame.

    Every row of ``df`` must provide the image path in ``my_image_path``.

    Args:
        df: source data frame.
        target_column: name of the label column, or a falsy value for unlabeled data.
        other_columns: names of additional (tabular) feature columns; may be empty.
        transform: callable applied to the loaded PIL image; when None the image
            is only converted with ``transforms.ToTensor()``.

    Items:
        * with a target:  ``(img, other_features, target)`` — always three
          elements, so training/validation loops can unpack batches the same way
          whether or not tabular features are used. Without ``other_columns``,
          ``other_features`` is an empty float tensor of shape ``(0,)``.
        * without a target: ``(img, other_features)`` if ``other_columns`` is
          non-empty, otherwise just ``img``.
    """
    def __init__(self,df,target_column,other_columns,transform=None):
        self.transform = transform
        self.df = df
        self.target_column = target_column
        self.other_columns = other_columns

    def __len__(self):
        return len(self.df)

    def __getitem__(self,index):
        row = self.df.iloc[index]
        image_path = row.my_image_path
        try:
            img = Image.open(image_path).convert('RGB')
        except Exception as ex:
            # Returning None here would only fail later, inside the DataLoader's
            # collate step, with an error that no longer names the file.
            raise RuntimeError(f'failed to load image {image_path}: {ex}') from ex

        if self.transform is not None:
            img = self.transform(img)
        else:
            img = transforms.ToTensor()(img)

        other_features = None
        if self.other_columns:
            # A row of a mixed-dtype frame is an object Series; convert explicitly
            # to a float array before handing it to torch.
            other_features = torch.as_tensor(np.asarray(row[self.other_columns], dtype=np.float32))

        target_value = None
        if self.target_column:
            target_value = torch.as_tensor(row[self.target_column])

        if target_value is None:
            if other_features is None:
                return img
            return img,other_features
        if other_features is None:
            # Placeholder so that labeled items always have three elements.
            other_features = torch.empty(0, dtype=torch.float32)
        return img,other_features,target_value

In [None]:
from sklearn.utils import compute_class_weight
def get_class_weight(df,target_column):
    """
    Compute "balanced" class weights for ``df[target_column]``.

    Returns:
        dict mapping each distinct class label to its weight.
    """
    targets = df[target_column]
    labels = np.unique(targets)
    weights = compute_class_weight(class_weight="balanced", classes=labels, y=targets)
    return {label: weight for label, weight in zip(labels, weights)}

In [None]:
# データローダー作成
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
def make_data_loader(df,target_column,other_columns,transform=None,use_class_weight=False,batch_size = 32):
    """
    Build a ``DataLoader`` over an ``ImgDataSet`` created from ``df``.

    Args:
        df: data frame describing the samples.
        target_column: label column passed to the dataset.
        other_columns: tabular feature columns passed to the dataset.
        transform: image transform passed to the dataset.
        use_class_weight: if True, samples are drawn (with replacement) by a
            ``WeightedRandomSampler`` using balanced class weights; otherwise
            the data is simply shuffled.
        batch_size: number of samples per batch.

    Returns:
        The configured ``DataLoader``.
    """
    dataset = ImgDataSet(df,target_column=target_column,other_columns=other_columns,transform=transform)
    # Both branches share the same loading options. os.cpu_count() may return
    # None, and pinned memory is only useful when a GPU is present.
    loader_options = dict(
        batch_size=batch_size,
        num_workers=os.cpu_count() or 0,
        pin_memory=torch.cuda.is_available(),
    )
    if use_class_weight:
        class_weight_dict = get_class_weight(df,target_column)
        print(f'class weight {class_weight_dict}')
        # one weight per sample, looked up from the weight of its class
        sample_weights = [class_weight_dict[label] for label in df[target_column].values]
        sampler = WeightedRandomSampler(sample_weights, num_samples=len(sample_weights), replacement=True)
        return DataLoader(dataset, sampler=sampler, **loader_options)
    return DataLoader(dataset, shuffle=True, **loader_options)

In [None]:
# モデル作成
from torch import nn
from timm import create_model, list_models
class ImgBaseModel(torch.nn.Module):
    """
    Image classifier: a timm backbone, optionally combined with a small network
    for non-image (tabular) features, followed by a linear head.

    The model outputs raw logits; use ``predict`` for sigmoid probabilities.
    """
    def __init__(self,num_other_features=0,output_dim=1,hidden_state_features=128,model_name='tf_efficientnet_b4',drop_rate=0.):
        """
        Args:
            num_other_features: number of non-image features (0 = image only).
            output_dim: number of outputs of the head.
            hidden_state_features: width of the tabular-feature network.
            model_name: timm model name used for the backbone.
            drop_rate: drop rate passed to the timm backbone.
        """
        super().__init__()
        # num_classes=0 makes timm return pooled features (backbone usage)
        self.backbone_model = create_model(model_name, pretrained=True,drop_rate=drop_rate,num_classes=0)
        self.backbone_features_num = self.backbone_model.num_features
        if num_other_features:
            self.other_features_network = nn.Sequential(
                # linear -> batch norm -> activation -> dropout
                nn.Linear(num_other_features, hidden_state_features),
                nn.BatchNorm1d(hidden_state_features),
                nn.ReLU(),
                nn.Dropout(.2),
            )
            # final layer on the concatenated image + tabular representation
            self.head = nn.Sequential(
                nn.Linear(self.backbone_features_num + hidden_state_features, output_dim),
            )
        else:
            self.other_features_network = None
            # final layer on the image representation only
            self.head = nn.Sequential(
                nn.Linear(self.backbone_features_num, output_dim),
            )

    def forward(self,X,other_features=None):
        """
        Compute logits.

        Args:
            X: image batch, or a single image with 3 dimensions (a batch
                dimension is then added).
            other_features: tabular features; required only when the model was
                built with ``num_other_features > 0``, ignored otherwise.

        Raises:
            ValueError: if the model needs tabular features and none are given.
        """
        # single image: add the batch dimension
        if X.dim() == 3:
            X = X.unsqueeze(0)
        image_network_output = self.backbone_model(X)
        if self.other_features_network is not None:
            if other_features is None:
                raise ValueError('other_features is required because the model was built with num_other_features > 0')
            other_features = other_features.float()
            # feature vector of a single sample: add the batch dimension as well
            if other_features.dim() == 1:
                other_features = other_features.unsqueeze(0)
            other_features_network_output = self.other_features_network(other_features)
            head_input = torch.cat((image_network_output, other_features_network_output), dim=1)
        else:
            head_input = image_network_output
        predict_y = self.head(head_input)
        return predict_y

    def predict(self, X,other_features=None):
        """Return the sigmoid of the logits computed by ``forward``."""
        predict_y = self.forward(X,other_features)
        return torch.sigmoid(predict_y)

In [None]:
# Select the device (GPU when available) and build the model on it
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
# The number of tabular inputs follows OTHER_FEATURE_COLUMNS (0 = image-only model)
model = ImgBaseModel(model_name=MODEL_TYPE,hidden_state_features=HIDDEN_SIZE,num_other_features=len(OTHER_FEATURE_COLUMNS))
model.to(device)
print(device)

In [None]:
# 荷重減衰
def add_weight_decay(model, weight_decay=1e-5, skip_list=['bias']):
    """
    Split the trainable parameters of ``model`` into two optimizer groups.

    A parameter gets no weight decay when it is one-dimensional or when its
    (lower-cased) name contains one of the strings in ``skip_list``; every
    other trainable parameter is decayed with ``weight_decay``.

    Returns:
        ``[no-decay group, decay group]`` as a list of parameter-group dicts.
    """
    with_decay, without_decay = [], []
    # parameters that do not require gradients are left out entirely
    trainable = ((name, param) for name, param in model.named_parameters() if param.requires_grad)
    for param_name, tensor in trainable:
        lowered = param_name.lower()
        is_excluded = tensor.dim() == 1 or any(token in lowered for token in skip_list)
        target_group = without_decay if is_excluded else with_decay
        target_group.append(tensor)
    return [
        {'params': without_decay, 'weight_decay': 0.},
        {'params': with_decay, 'weight_decay': weight_decay}]

In [None]:
def get_objects_for_train(model,use_weight_decay):
    """
    Create the objects needed to train ``model``.

    Args:
        model: the model whose parameters are optimized.
        use_weight_decay: if True use AdamW with the parameter groups from
            ``add_weight_decay``; otherwise use Adam with default settings.

    Returns:
        ``(loss_fn, scaler, optimizer)`` — note the order.
    """
    # loss on raw logits
    loss_fn = torch.nn.BCEWithLogitsLoss()

    # optimizer
    if not use_weight_decay:
        optimizer = torch.optim.Adam(model.parameters())
    else:
        param_groups = add_weight_decay(model,weight_decay=0.025,skip_list=['bias'])
        optimizer = torch.optim.AdamW(
            param_groups,
            lr=0.001,
            # decay rates of the 1st/2nd moment estimates (library defaults)
            betas=(0.9, 0.999),
            weight_decay=0.025)

    # gradient scaler for mixed-precision training
    scaler = torch.cuda.amp.GradScaler()
    return loss_fn,scaler,optimizer

In [None]:
def get_scheduler(is_one_cycle_schedule,optimizer,dataloader,epochs,max_lr=0.0008,patience=2):
    """
    Create the learning-rate scheduler.

    Args:
        is_one_cycle_schedule: True -> ``OneCycleLR`` (to be stepped once per
            batch); False -> ``ReduceLROnPlateau`` (to be stepped with a
            monitored metric).
        optimizer: optimizer to schedule.
        dataloader: training loader; ``len(dataloader)`` is used as the number
            of steps per epoch of the one-cycle schedule.
        epochs: number of epochs of the one-cycle schedule.
        max_lr: peak learning rate of the one-cycle schedule.
        patience: ``patience`` of ``ReduceLROnPlateau``.

    Returns:
        The scheduler instance.
    """
    if is_one_cycle_schedule:
        # one-cycle policy, see https://arxiv.org/pdf/1803.09820.pdf
        return torch.optim.lr_scheduler.OneCycleLR(optimizer,
                                                   max_lr=max_lr,
                                                   epochs=epochs,
                                                   steps_per_epoch=len(dataloader))  # number of batches
    # `verbose` is deliberately not passed: it is deprecated in recent PyTorch
    # releases. Read the current rate from optimizer.param_groups if needed.
    return torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=patience)

In [None]:
def pfbeta(labels, predictions, beta=1.):
    """
    Probabilistic F-beta score.

    Predictions are clipped to [0, 1]. The clipped predictions of the positive
    samples are summed as (soft) true positives and those of the negative
    samples as (soft) false positives; precision, recall and F-beta follow
    from these sums.

    Args:
        labels: sequence of ground-truth labels (anything truthy is positive).
        predictions: sequence of predicted scores or booleans; one per label.
        beta: weight of recall relative to precision.

    Returns:
        The score, or 0 when precision or recall is not positive.
    """
    # Vectorized with NumPy: the score is evaluated once per candidate
    # threshold, so a per-element Python loop is needlessly slow.
    labels = np.asarray(labels).astype(bool).ravel()
    predictions = np.clip(np.asarray(predictions, dtype=float).ravel()[:labels.size], 0., 1.)

    y_true_count = int(labels.sum())
    ctp = predictions[labels].sum()
    cfp = predictions[~labels].sum()

    beta_squared = beta * beta
    c_precision = ctp / (ctp + cfp + 1e-7)  # epsilon avoids 0 / 0
    c_recall = ctp / max(y_true_count, 1)   # avoid / 0
    if c_precision > 0 and c_recall > 0:
        return float((1 + beta_squared) * (c_precision * c_recall) / (beta_squared * c_precision + c_recall))
    return 0

In [None]:
def optimal_f1(labels, predictions):
    """
    Search the decision threshold that maximizes ``pfbeta``.

    Args:
        labels: ground-truth labels.
        predictions: sequence of arrays; they are concatenated into one array.

    Returns:
        ``(best score, best threshold)`` over the 101 thresholds 0.00 .. 1.00.
    """
    flat_predictions = np.concatenate(predictions)
    candidate_thresholds = np.linspace(0, 1, 101)
    scores = []
    for threshold in candidate_thresholds:
        # score of the hard decisions obtained with this threshold
        scores.append(pfbeta(labels, flat_predictions > threshold))
    best = np.argmax(scores)
    return scores[best], candidate_thresholds[best]

In [None]:
from torchmetrics import Accuracy, F1Score
from sklearn.metrics import roc_auc_score
# NOTE: this re-binds the notebook-level `device`, now as a torch.device object
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Binary-classification metric objects (torchmetrics), placed on the selected device;
# calc_pred_result below uses these module-level objects.
accuracy = Accuracy(task="binary").to(device)
f1_score = F1Score(task="binary").to(device)

def calc_pred_result(y,y_proba):
    """
    Evaluate predictions with the module-level ``accuracy`` and ``f1_score`` metrics.

    Args:
        y: ground-truth labels.
        y_proba: predicted probabilities; a 0-dim tensor is turned into a
            1-element tensor first.

    Returns:
        ``(accuracy, f1)`` as Python floats.
    """
    probabilities = y_proba.unsqueeze(0) if y_proba.dim() == 0 else y_proba
    accuracy_value = accuracy(probabilities, y).item()
    f1_score_value = f1_score(probabilities, y).item()
    return accuracy_value,f1_score_value

In [None]:
import gc
def gc_collect():
    """Run Python's garbage collector, then ask PyTorch to empty its CUDA cache."""
    gc.collect()
    torch.cuda.empty_cache()
# Free whatever is left over from the cells above before training starts
gc_collect()

In [None]:
# Create the loss function, gradient scaler and optimizer (AdamW with weight decay groups)
loss_fn,scaler,optimizer =\
    get_objects_for_train(model,use_weight_decay=True)

In [None]:
from tqdm import tqdm
from sklearn.metrics import roc_auc_score
# モデルの学習
def train_one_epoch(fold,total_fold,cur_epoch,total_epoch,model,dataloader,loss_fn,scheduler,scaler,optimizer):
    """
    Train ``model`` for one epoch.

    For every batch: forward pass under autocast, loss, scaled backward pass,
    optimizer step, scaler update and scheduler step. Per-batch accuracy/F1 are
    averaged over the epoch; AUC and the best probabilistic-F1 threshold are
    computed once, over all predictions of the epoch.

    Args:
        fold, total_fold: 0-based fold index and number of folds (progress bar, seed).
        cur_epoch, total_epoch: 1-based epoch index and epochs per fold.
        model: model called as ``model(X, other_features)``.
        dataloader: yields ``(X, other_features, y)`` or ``(X, y)`` batches.
        loss_fn, scheduler, scaler, optimizer: training objects.

    Returns:
        ``[mean_loss, mean_accuracy, mean_f1, auc, pfscore, thresh]``;
        ``auc`` is NaN when fewer than two classes were seen.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.train()
    # Reproducible, but different for every (fold, epoch): re-seeding with one
    # fixed value each epoch would replay the same sampling/augmentation draws.
    torch.manual_seed(42 + fold * total_epoch + (cur_epoch - 1))
    losses = []
    all_labels = []
    all_outputs = []

    mean_accuracy = 0.
    mean_f1_score = 0.

    # one iteration = one batch
    loop = tqdm(dataloader, total=len(dataloader))
    for batch in loop:
        gc_collect()
        # Image-only datasets may yield (X, y) without tabular features.
        X, y = batch[0], batch[-1]
        other_features = batch[1] if len(batch) == 3 else None

        # clear the gradients of the previous step
        optimizer.zero_grad(set_to_none=True)
        loop.set_description(f"Fold [{fold+1}/{total_fold}] Epoch [{cur_epoch}/{total_epoch}]")
        X = X.to(device,non_blocking=True)
        if other_features is not None:
            other_features = other_features.to(device,non_blocking=True)
        y = y.to(device,non_blocking=True)
        # mixed-precision region
        with torch.cuda.amp.autocast():
            logits = model(X,other_features)
            loss = loss_fn(logits, y.unsqueeze(1).float())
        losses.append(loss.item())
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        # ReduceLROnPlateau needs a monitored metric and must be stepped by the
        # caller; every other scheduler is stepped once per batch.
        if not isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
            scheduler.step()

        # per-batch metrics
        probabilities = torch.sigmoid(logits.detach().float())
        accuracy_value,f1_score_value = calc_pred_result(y,probabilities.squeeze(1))
        mean_accuracy += accuracy_value
        mean_f1_score += f1_score_value

        all_outputs.extend(probabilities.cpu().numpy())
        all_labels.extend(y.detach().cpu().numpy())
        loop.set_postfix(loss=losses[-1], acc=accuracy_value, f1=f1_score_value)

        del loss, logits, X, other_features, y

    n_batches = max(len(losses), 1)
    mean_loss = sum(losses) / n_batches
    mean_accuracy /= n_batches
    mean_f1_score /= n_batches

    # Epoch-level metrics are computed once here instead of after every batch:
    # the threshold search is expensive, and roc_auc_score raises while only
    # one class has been seen.
    if len(np.unique(all_labels)) > 1:
        auc = roc_auc_score(all_labels,all_outputs)
    else:
        auc = float('nan')
    if all_outputs:
        pfscore,thresh = optimal_f1(all_labels,all_outputs)
    else:
        pfscore,thresh = 0, 0.
    return [mean_loss, mean_accuracy, mean_f1_score,auc,pfscore,thresh]

In [None]:
def valid_one_epoch(fold,total_fold,cur_epoch,total_epoch,model, dataloader, loss_fn):
    """
    Evaluate ``model`` on ``dataloader`` without updating it.

    Per-batch loss/accuracy/F1 are averaged; AUC and the best probabilistic-F1
    threshold are computed once, over all predictions.

    Args:
        fold, total_fold: 0-based fold index and number of folds (progress bar).
        cur_epoch, total_epoch: 1-based epoch index and epochs per fold.
        model: model called as ``model(X, other_features)``.
        dataloader: yields ``(X, other_features, y)`` or ``(X, y)`` batches.
        loss_fn: loss applied to the logits.

    Returns:
        ``[mean_loss, mean_accuracy, mean_f1, auc, pfscore, thresh]``;
        ``auc`` is NaN when fewer than two classes were seen.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.eval()
    losses = []
    mean_accuracy = 0.
    mean_f1_score = 0.
    all_labels = []
    all_outputs = []

    with torch.inference_mode():
        loop = tqdm(dataloader, total=len(dataloader))
        for batch in loop:
            gc_collect()
            loop.set_description(f"Fold {fold+1}/{total_fold} Epoch [{cur_epoch}/{total_epoch}]")
            # Image-only datasets may yield (X, y) without tabular features.
            X, y = batch[0], batch[-1]
            other_features = batch[1] if len(batch) == 3 else None

            X = X.to(device,non_blocking=True)
            if other_features is not None:
                other_features = other_features.to(device,non_blocking=True)
            y = y.to(device,non_blocking=True)

            logits = model(X,other_features)
            loss = loss_fn(logits, y.unsqueeze(1).float())
            losses.append(loss.item())

            probabilities = torch.sigmoid(logits.float())
            accuracy_value,f1_score_value = calc_pred_result(y,probabilities.squeeze(1))
            mean_accuracy += accuracy_value
            mean_f1_score += f1_score_value

            all_outputs.extend(probabilities.cpu().numpy())
            all_labels.extend(y.cpu().numpy())

            loop.set_postfix(loss=losses[-1], acc=accuracy_value, f1=f1_score_value)
            del loss, logits, X, other_features, y

    n_batches = max(len(losses), 1)
    mean_loss = sum(losses) / n_batches
    mean_accuracy /= n_batches
    mean_f1_score /= n_batches

    # Computed once per epoch: the threshold search is expensive, and
    # roc_auc_score raises while only one class has been seen.
    if len(np.unique(all_labels)) > 1:
        auc = roc_auc_score(all_labels,all_outputs)
    else:
        auc = float('nan')
    if all_outputs:
        pfscore,thresh = optimal_f1(all_labels,all_outputs)
    else:
        pfscore,thresh = 0, 0.

    return [mean_loss, mean_accuracy, mean_f1_score,auc,pfscore,thresh]

In [None]:
import csv
# Metrics of every epoch are recorded in a CSV log; these are its columns
cols = ["fold","epoch", "training_loss", "training_acc", "training_f1",'training_auc','training_pfscore','training_thresh',
        "validation_loss","validation_acc", "validation_f1",'validation_auc','validation_pfscore','validation_thresh']

os.makedirs(SAVE_VERSION, exist_ok=True)

# Start a fresh log with the header row. newline="" is what the csv module
# expects of files it writes to (otherwise blank rows can appear on Windows).
with open(f"{SAVE_VERSION}/log.csv", "w", newline="") as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(cols)
    
def writeCSVLog(vals):
    """Append one row of values to ``<SAVE_VERSION>/log.csv``."""
    # newline="" is what the csv module expects of files it writes to
    # (otherwise blank rows can appear on Windows).
    with open(f"{SAVE_VERSION}/log.csv", "a", newline="") as f:
        csv_writer = csv.writer(f)
        csv_writer.writerow(vals)

In [None]:
def train_valid(model,df,loss_fn,optimizer,scaler,batch_size=16,total_epoch_per_fold=3):
    """
    Run cross-validated training over the folds stored in ``df['split']``.

    For every fold the model and optimizer are restored to the state they had
    when this function was called, then trained for ``total_epoch_per_fold``
    epochs on the other folds and validated on the held-out fold. After each
    epoch a checkpoint (model weights, validation threshold, model type) is
    saved and one row is appended to the CSV log.

    Args:
        model: model to train (its initial weights are the starting point of every fold).
        df: data frame with a ``split`` column and the ``cancer`` target.
        loss_fn, optimizer, scaler: training objects.
        batch_size: batch size of both data loaders.
        total_epoch_per_fold: number of epochs per fold.
    """
    import copy

    # Snapshot the starting point. Without restoring it, the model of a later
    # fold would already have been trained on that fold's validation rows.
    initial_model_state = copy.deepcopy(model.state_dict())
    initial_optimizer_state = copy.deepcopy(optimizer.state_dict())

    train_transform = get_transforms(transform_choises=[0,1,2,3],is_augmentation=True,resize_h=IMG_SIZE_H,resize_w=IMG_SIZE_W)
    # Validation uses the non-augmented pipeline and no class re-sampling, so
    # that the metrics and the tuned threshold reflect the real data.
    valid_transform = get_transforms(is_augmentation=False,resize_h=IMG_SIZE_H,resize_w=IMG_SIZE_W)

    for fold in range(N_SPLITS):
        gc_collect()
        model.load_state_dict(initial_model_state)
        optimizer.load_state_dict(copy.deepcopy(initial_optimizer_state))

        train_dataloader = make_data_loader(
            df.query('split != @fold'),
            target_column='cancer',
            other_columns=OTHER_FEATURE_COLUMNS,
            transform=train_transform,
            use_class_weight=True,
            batch_size=batch_size)
        valid_dataloader = make_data_loader(
            df.query('split == @fold'),
            target_column='cancer',
            other_columns=OTHER_FEATURE_COLUMNS,
            transform=valid_transform,
            use_class_weight=False,
            batch_size=batch_size)
        # the schedule length depends on the size of the training loader
        scheduler =\
        get_scheduler(is_one_cycle_schedule=True,optimizer=optimizer,dataloader=train_dataloader,epochs=total_epoch_per_fold)
        # training
        for epoch in range(1, total_epoch_per_fold+1):
            train_vals = train_one_epoch(fold,N_SPLITS,epoch,total_epoch_per_fold,model,train_dataloader,loss_fn,scheduler,scaler,optimizer)
            valid_vals = valid_one_epoch(fold,N_SPLITS,epoch,total_epoch_per_fold,model,valid_dataloader, loss_fn)
            torch.save({'model': model.state_dict(), 'threshold': valid_vals[-1], 'model_type': MODEL_TYPE}, f"fold{fold}_epoch{epoch}_{MODEL_TYPE}_{HIDDEN_SIZE}")
            vals = [fold,epoch, ] + train_vals + valid_vals
            writeCSVLog(vals)

# Run the cross-validated training with the settings from the configuration cell
train_valid(
    model,
    df,
    loss_fn,
    optimizer,
    scaler,
    batch_size=BATCH_SIZE,
    total_epoch_per_fold=EPOCHS_PER_FOLD)