In [1]:
!git clone https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer.git
import sys
sys.path.append('../input/pytorch-image-models/pytorch-image-models-master')
sys.path.append('./Ranger-Deep-Learning-Optimizer/ranger')

Cloning into 'Ranger-Deep-Learning-Optimizer'...
remote: Enumerating objects: 66, done.[K
remote: Counting objects: 100% (66/66), done.[K
remote: Compressing objects: 100% (61/61), done.[K
remote: Total 135 (delta 34), reused 17 (delta 5), pack-reused 69[K
Receiving objects: 100% (135/135), 181.14 KiB | 2.70 MiB/s, done.
Resolving deltas: 100% (59/59), done.


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from tqdm.autonotebook import tqdm
from pprint import pprint
from datetime import datetime
import os, sys, cv2, glob, random ,ast, warnings, time
warnings.filterwarnings('ignore')
# sys.path.append('../input/pytorch-image-models/pytorch-image-models-master')

import timm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as T
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler
from torch.optim.lr_scheduler import CosineAnnealingLR, CosineAnnealingWarmRestarts
from torch.optim import Adam, SGD, AdamW

import albumentations as A
from albumentations.pytorch import ToTensorV2

from ranger import Ranger  # this is from ranger.py
from ranger913A import RangerVA  # this is from ranger913A.py
from rangerqh import RangerQH  # this is from rangerqh.py

from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import StratifiedKFold

  """


In [3]:
model_names = timm.list_models()
pprint(model_names)

['adv_inception_v3',
 'cspdarknet53',
 'cspdarknet53_iabn',
 'cspresnet50',
 'cspresnet50d',
 'cspresnet50w',
 'cspresnext50',
 'cspresnext50_iabn',
 'darknet53',
 'densenet121',
 'densenet121d',
 'densenet161',
 'densenet169',
 'densenet201',
 'densenet264',
 'densenet264d_iabn',
 'densenetblur121d',
 'dla34',
 'dla46_c',
 'dla46x_c',
 'dla60',
 'dla60_res2net',
 'dla60_res2next',
 'dla60x',
 'dla60x_c',
 'dla102',
 'dla102x',
 'dla102x2',
 'dla169',
 'dpn68',
 'dpn68b',
 'dpn92',
 'dpn98',
 'dpn107',
 'dpn131',
 'eca_vovnet39b',
 'ecaresnet18',
 'ecaresnet50',
 'ecaresnet50d',
 'ecaresnet50d_pruned',
 'ecaresnet101d',
 'ecaresnet101d_pruned',
 'ecaresnetlight',
 'ecaresnext26tn_32x4d',
 'efficientnet_b0',
 'efficientnet_b1',
 'efficientnet_b1_pruned',
 'efficientnet_b2',
 'efficientnet_b2_pruned',
 'efficientnet_b2a',
 'efficientnet_b3',
 'efficientnet_b3_pruned',
 'efficientnet_b3a',
 'efficientnet_b4',
 'efficientnet_b5',
 'efficientnet_b6',
 'efficientnet_b7',
 'efficientnet_b8',


# CFG

In [4]:
BATCH_SIZE = 8 # 8 for bigger architectures
VAL_BATCH_SIZE = 32
EPOCHS = 12 # train upto 10 epochs
IMG_SIZE = 640 # 384 for bigger architectures
if BATCH_SIZE == 8:
    ITER_FREQ = 1500
else:
    ITER_FREQ = 500
NUM_WORKERS = 8
MEAN = [0.485, 0.456, 0.406]
STD = [0.229, 0.224, 0.225]
SEED = 416
N_FOLDS = 5
TR_FOLDS = [0,1,2,3,4]
START_FOLD = 1

target_cols=['ETT - Abnormal', 'ETT - Borderline', 'ETT - Normal',
             'NGT - Abnormal', 'NGT - Borderline', 'NGT - Incompletely Imaged', 'NGT - Normal', 
             'CVC - Abnormal', 'CVC - Borderline', 'CVC - Normal',
             'Swan Ganz Catheter Present']

MODEL_PATH = '../input/stage3-res200-320-640-f1-focal-calr/Stage3_resnet200d_320_fold_1_epoch_9_95.71.pth'
MODEL_ARCH = 'resnet200d_320' # tf_efficientnet_b4_ns, tf_efficientnet_b5_ns, resnext50_32x4d, seresnet152d
TEACHER_MODEL_PATH = '../input/stage1-resnet200d-320/stage1_resnet200d_320_fold_4_epoch_4.pth'
STARTING_POINT_PATH = '../input/startingpointschestx/resnet200d_320_chestx.pth'
WEIGHTS = [0.5, 1]
STUDENT_MODEL_PATH = '../input/stage2-resnet200d-320-f1e9-9524/Stage2_resnet200d_320_fold_1_epoch_9_95.24000000000001.pth'

LR = 5e-4
MIN_LR = 1e-6 # SAM, CosineAnnealingWarmRestarts
WEIGHT_DECAY = 1e-6
MOMENTUM = 0.9
T_0 = EPOCHS # SAM, CosineAnnealingWarmRestarts
MAX_NORM = 1000
T_MAX = 5
ITERS_TO_ACCUMULATE = 1

BASE_OPTIMIZER = SGD #for SAM, Ranger
OPTIMIZER = 'Ranger' # Ranger, AdamW, AdamP, SGD, SAM

SCHEDULER = 'CosineAnnealingWarmRestarts' # ReduceLROnPlateau, CosineAnnealingLR, CosineAnnealingWarmRestarts, OneCycleLR
SCHEDULER_UPDATE = 'epoch' # batch

TR_CRITERION = 'FocalLoss'
VL_CRITERION = 'BCE' # CrossEntropyLoss, TaylorSmoothedLoss, LabelSmoothedLoss
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
class AverageMeter(object):
    def __init__(self):
        self.reset()
        
    def reset(self):
        self.val = 0
        self.sum = 0
        self.avg = 0
        self.count = 0
        
    def update(self, val, n=1):
        self.val = val
        self.sum += val*n
        self.count += n
        self.avg = self.sum / self.count
        
def seed_torch(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_torch(SEED)

def macro_multilabel_auc(label, pred):
    aucs = []
    for i in range(len(target_cols)):
        aucs.append(roc_auc_score(label[:, i], pred[:, i]))
    print(np.round(aucs, 4))
    return np.mean(aucs)

# Dataset

In [6]:
TRAIN_DIR = '../input/ranzcr-clip-catheter-line-classification/train/'
train_df = pd.read_csv('../input/ranzcr-clip-catheter-line-classification/train.csv')
folds = pd.read_csv('../input/ranzcr-folds/train_folds.csv')
train_annotations = pd.read_csv('../input/ranzcr-clip-catheter-line-classification/train_annotations.csv')

In [7]:
class RanzcrDataset(Dataset):
    def __init__(self, df, transform=None):
        self.df = df
        self.image_id = self.df['StudyInstanceUID'].values
        self.label = self.df[target_cols].values
        self.transform = transform
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        image_id = self.image_id[idx]
        img = cv2.imread(TRAIN_DIR+image_id+'.jpg', cv2.IMREAD_COLOR)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB).astype(np.float32)
        if self.transform is not None:
            augmented = self.transform(image=img)
            img = augmented['image']
#         img = img.astype(np.float32)
        label = torch.tensor(self.label[idx]).float()
#         img = torch.tensor(img).float() # convert to float
#         img = img[:3] # remove the alpha channel
        return img, label

In [8]:
def get_transform(*, train=True):
    
    if train:
        return A.Compose([
            A.RandomResizedCrop(IMG_SIZE, IMG_SIZE, scale=(0.85, 1.0)),
            A.HorizontalFlip(p=0.5),
            A.RandomBrightnessContrast(p=0.2, brightness_limit=(-0.2, 0.2), contrast_limit=(-0.2, 0.2)),
            A.HueSaturationValue(p=0.2, hue_shift_limit=0.2, sat_shift_limit=0.2, val_shift_limit=0.2),
            A.ShiftScaleRotate(p=0.2, shift_limit=0.0625, scale_limit=0.2, rotate_limit=20),
            #A.CLAHE(clip_limit=(1,4), p=0.5),
            A.OneOf([A.OpticalDistortion(distort_limit=1.0), A.ElasticTransform(alpha=3),], p=0.2),
            A.OneOf([A.GaussianBlur(), A.MotionBlur(),], p=0.2),
            A.CoarseDropout(p=0.2),
            A.OneOf([A.JpegCompression(), A.Downscale(scale_min=0.1, scale_max=0.15),], p=0.2),
            A.IAAPiecewiseAffine(p=0.2),
            A.IAASharpen(p=0.2),
            A.Cutout(p=0.2, max_h_size=16, max_w_size=16, fill_value=(0., 0., 0.), num_holes=16),
            A.Normalize(mean=MEAN, std=STD),
            ToTensorV2(), # commented now will convert to torch tensors later.
        ])
    else:
        return A.Compose([
#             A.CenterCrop(IMG_SIZE, IMG_SIZE),
            A.Resize(IMG_SIZE, IMG_SIZE),
            A.Normalize(mean=MEAN, std=STD, max_pixel_value=255.0, p=1.0),
            ToTensorV2(),
        ])

# Model

In [9]:
class SeResnet152D(nn.Module): 
    def __init__(self, model_arch, n_classes, pretrained=False):
        super().__init__()
        self.model = timm.create_model(model_arch, pretrained=pretrained)
        n_features = self.model.fc.in_features
        self.model.global_pool = nn.Identity()
        self.model.fc = nn.Identity()
        self.pooling = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Linear(n_features, n_classes)

    def forward(self, x):
        bs = x.size(0)
        features = self.model(x)
        pooled_features = self.pooling(features).view(bs, -1)
        output = self.fc(pooled_features)
        return output
            
        
class CustomResNet200D(nn.Module):
    def __init__(self, model_arch, n_classes, pretrained=False):
        super().__init__()
        self.model = timm.create_model(model_arch, pretrained=False)
        n_features = self.model.fc.in_features
        self.model.fc = nn.Linear(n_features, n_classes)
        if pretrained:
            pretrained_path = STARTING_POINT_PATH
            checkpoint = torch.load(pretrained_path)['model']
            for k in list(checkpoint.keys()):
                checkpoint[k.replace('model.','')] = checkpoint[k]
                del checkpoint[k]
            self.model.load_state_dict(checkpoint)
            print(f'load {model_name} pretrained model')
        n_features = self.model.fc.in_features
        self.model.global_pool = nn.Identity()
        self.model.fc = nn.Identity()
        self.pooling = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Linear(n_features, n_classes)

    def forward(self, x):
        bs = x.size(0)
        features = self.model(x)
        pooled_features = self.pooling(features).view(bs, -1)
        output = self.fc(pooled_features)
        return features, pooled_features, output

In [10]:
class FocalLoss(nn.Module):
    def __init__(self, weight=None, size_average=True):
        super(FocalLoss, self).__init__()

    def forward(self, inputs, targets, alpha=0.8, gamma=2, smooth=1):

#         inputs = F.sigmoid(inputs)       

        inputs = inputs.view(-1)
        targets = targets.view(-1)

        BCE = F.binary_cross_entropy_with_logits(inputs, targets, reduction='mean')
        BCE_EXP = torch.exp(-BCE)
        focal_loss = alpha * (1-BCE_EXP)**gamma * BCE

        return focal_loss

In [11]:
def GetCriterion(criterion_name, criterion=None):

    if criterion_name == 'FocalLoss':
        criterion = FocalLoss()
    elif criterion_name == 'CustomLoss':
        criterion = CustomLoss(WEIGHTS)
    elif criterion_name == 'BCE':
        criterion = nn.BCEWithLogitsLoss()
    return criterion
    
    
def GetScheduler(scheduler_name, optimizer, batches=None):
    #['ReduceLROnPlateau', 'CosineAnnealingLR', 'CosineAnnealingWarmRestarts', 'OneCycleLR', 'GradualWarmupSchedulerV2']
    if scheduler_name == 'OneCycleLR':
        return torch.optim.lr_scheduler.OneCycleLR(optimizer,max_lr = 1e-2,epochs=EPOCHS,
                                                   steps_per_epoch = batches+1,pct_start = 0.1)
    if scheduler_name == 'CosineAnnealingWarmRestarts':
        return torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0 = T_0, T_mult=1,
                                                                    eta_min=MIN_LR, last_epoch=-1)
    elif scheduler_name == 'CosineAnnealingLR':
        return torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=T_MAX, eta_min=0, last_epoch=-1)
    elif scheduler_name == 'ReduceLROnPlateau':
        return torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.1, patience=1, threshold=0.0001,
                                                          cooldown=0, min_lr=MIN_LR)
#     elif scheduler_name == 'GradualWarmupSchedulerV2':
#         return GradualWarmupSchedulerV2(optimizer=optimizer)
    
def GetOptimizer(optimizer_name,parameters):
    #['Adam','Ranger']
    if optimizer_name == 'Adam':
#         if CFG.scheduler_name == 'GradualWarmupSchedulerV2':
#             return torch.optim.Adam(parameters, lr=CFG.LR_START, weight_decay=CFG.weight_decay, amsgrad=False)
#         else:
        return torch.optim.Adam(parameters, lr=LR, weight_decay=WEIGHT_DECAY, amsgrad=False)
    elif optimizer_name == 'AdamW':
#         if CFG.scheduler_name == 'GradualWarmupSchedulerV2':
#             return torch.optim.AdamW(parameters, lr=CFG.LR_START, weight_decay=CFG.weight_decay, amsgrad=False)
#         else:
        return torch.optim.Adam(parameters, lr=LR, weight_decay=WEIGHT_DECAY, amsgrad=False)
    elif optimizer_name == 'AdamP':
#         if CFG.scheduler_name == 'GradualWarmupSchedulerV2':
#             return AdamP(parameters, lr=CFG.LR_START, weight_decay=CFG.weight_decay)
#         else:
        return AdamP(parameters, lr=LR, weight_decay=WEIGHT_DECAY)
    elif optimizer_name == 'Ranger':
        return Ranger(parameters, lr = LR, alpha = 0.5, k = 6, N_sma_threshhold = 5, 
                      betas = (0.95,0.999), weight_decay=WEIGHT_DECAY)

# Train and Validation Functions

In [12]:
def train_fn(model, dataloader, device, epoch, optimizer, criterion, scheduler):
    
#     data_time = AverageMeter()
#     batch_time = AverageMeter()
    losses = AverageMeter()
    accuracies = AverageMeter()
    
    model.train()
    scaler = GradScaler()
    start_time = time.time()
    loader = tqdm(dataloader, total=len(dataloader))
    for step, (images, labels) in enumerate(loader):
        
#         with torch.no_grad():
#             teacher_features, _, _ = teacher_model(images_annot.to(device))

        images = images.to(device).float()
        labels = labels.to(device)
#         data_time.update(time.time() - start_time)

        with autocast():
#             features, _, output = model(images)
#             loss = criterion(teacher_features, features, output, labels)
            _, _, output = model(images)
            loss = criterion(output, labels)
            losses.update(loss.item(), BATCH_SIZE)
        
            scaler.scale(loss).backward()
            grad_norm = nn.utils.clip_grad_norm_(model.parameters(), max_norm = MAX_NORM)

            if (step+1) % ITERS_TO_ACCUMULATE == 0:
                scaler.step(optimizer)
                # Update the scale for next iteration.
                scaler.update()
                optimizer.zero_grad()
        
        if scheduler is not None and SCHEDULER_UPDATE == 'batch':
            scheduler.step()

#         batch_time.update(time.time() - start_time)
        start_time = time.time()
        
        if step % ITER_FREQ == 0:
            
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Loss: {loss.val:.4f} ({loss.avg:.4f})'.format((epoch+1),
                                                                  step, len(dataloader),
                                                                  loss=losses))

        loader.set_description(f'Training Epoch {epoch+1}/{EPOCHS}')
        loader.set_postfix(loss=losses.avg) #accuracy=accuracies.avg)
#         del images, labels
    if scheduler is not None and SCHEDULER_UPDATE == 'epoch':
        scheduler.step()
        
    return losses.avg#, accuracies.avg

In [13]:
def valid_fn(epoch, model, criterion, val_loader, device, scheduler):
    
    model.eval()
    losses = AverageMeter()
    accuracies = AverageMeter()
    PREDS = []
    TARGETS = []
    loader = tqdm(val_loader, total=len(val_loader))
    with torch.no_grad():  # without torch.no_grad() will make the CUDA run OOM.
        for step, (images, labels) in enumerate(loader):

            images = images.to(device)
            labels = labels.to(device)

#             output = model(images)
            _, _, output = model(images)
            loss = criterion(output, labels)
            losses.update(loss.item(), BATCH_SIZE)
            PREDS += [output.sigmoid()]
            TARGETS += [labels.detach().cpu()]
#             accuracy = (F.softmax(output).argmax(dim=1) == labels).float().mean()
#             accuracies.update(accuracy.item(), VAL_BATCH_SIZE)
            loader.set_description(f'Validating Epoch {epoch+1}/{EPOCHS}')
            loader.set_postfix(loss=losses.avg)#, accuracy=accuracies.avg)
#             del images, labels
    PREDS = torch.cat(PREDS).cpu().numpy()
    TARGETS = torch.cat(TARGETS).cpu().numpy()
    roc_auc = macro_multilabel_auc(TARGETS, PREDS)
    if scheduler is not None:
        scheduler.step()
        
    return losses.avg, roc_auc# accuracies.avg

# Engine and Main

In [14]:
def engine(device, folds, fold, model_path=None):
    
    trn_idx = folds[folds['kfold'] != fold].index
    val_idx = folds[folds['kfold'] == fold].index

    train_folds = folds.loc[trn_idx].reset_index(drop=True)
    valid_folds = folds.loc[val_idx].reset_index(drop=True)

    train_data = RanzcrDataset(train_folds, transform=get_transform())
    val_data = RanzcrDataset(valid_folds, transform=get_transform(train=False))            
    
    train_loader = DataLoader(train_data,
                              batch_size=BATCH_SIZE, 
                              shuffle=True, 
                              num_workers=NUM_WORKERS,
                              pin_memory=True, # enables faster data transfer to CUDA-enabled GPUs.
                              drop_last=True)
    val_loader = DataLoader(val_data, 
                            batch_size=VAL_BATCH_SIZE,
                            num_workers=NUM_WORKERS,
                            shuffle=False, 
                            pin_memory=True,
                            drop_last=False)
    
    if model_path is not None:
        model = CustomResNet200D(MODEL_ARCH, 11, pretrained=False)
        model.load_state_dict(torch.load(model_path))
        START_EPOCH = int(model_path.split('_')[-2])
    else:
        model = CustomResNet200D(MODEL_ARCH, 11, pretrained=False)
        model.load_state_dict(torch.load(STUDENT_MODEL_PATH))
        START_EPOCH = 0
    model.to(device)
    
    params = filter(lambda p: p.requires_grad, model.parameters())    
    optimizer = GetOptimizer(OPTIMIZER, params)
    
    train_criterion = GetCriterion(TR_CRITERION).to(device) #CustomLoss(weights=WEIGHTS).to(device)        
    val_criterion = GetCriterion(VL_CRITERION).to(device)
    
    scheduler = GetScheduler(SCHEDULER, optimizer)
    
    loss = []
    for epoch in range(START_EPOCH, EPOCHS):
        
        epoch_start = time.time()        
        avg_loss = train_fn(model, train_loader, device, epoch, optimizer, train_criterion, scheduler)

        torch.cuda.empty_cache()
        avg_val_loss, roc_auc_score = valid_fn(epoch, model, val_criterion, val_loader, device, scheduler)
        epoch_end = time.time() - epoch_start
        
        print(f'Training Loss after epoch {epoch+1}: {avg_loss:.4f}')
        print(f'Validation Loss after epoch {epoch+1}: {avg_val_loss:.4f}')
        print(f'Validation ROC AUC Score after epoch {epoch+1}: {roc_auc_score:.4f}')
        loss.append(avg_loss)
#         accuracy.append(avg_accuracy)
        
        content = f'Fold {fold} Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f} roc_auc_score: {roc_auc_score:.4f} time: {epoch_end:.0f}s'
        with open(f'Stage3_{MODEL_ARCH}_{OPTIMIZER}_{IMG_SIZE}.txt', 'a') as appender:
            appender.write(content + '\n')                                         # avg_train_accuracy: {avg_accuracy:.4f}
        
        torch.save(model.state_dict(), f'Stage3_{MODEL_ARCH}_fold_{fold}_epoch_{(epoch+1)}_{round(roc_auc_score,4)*100}.pth')
#         torch.save(model, f'Stage3_{MODEL_ARCH}_fold_{fold}_epoch_{(epoch+1)}')
        torch.cuda.empty_cache()
    
    return loss#{'loss':loss, 'accuracy':accuracy}

In [15]:
if __name__ == '__main__':
    
#     if MODEL_PATH is not None:
#         START_FOLD = int(MODEL_PATH.split('_')[-3])
    
    for fold in range(START_FOLD, N_FOLDS):
        if fold == 2:
            break
        print(f'===== Fold {fold} Starting =====')
        fold_start = time.time()
        logs = engine(DEVICE, folds, fold, MODEL_PATH)
        print(f'Time taken in fold {fold}: {time.time()-fold_start}')

===== Fold 1 Starting =====
Ranger optimizer loaded. 
Gradient Centralization usage = True
GC applied to both conv and fc layers


  0%|          | 0/3008 [00:00<?, ?it/s]

Epoch: [10][0/3008]	Loss: 0.0007 (0.0007)
Epoch: [10][1500/3008]	Loss: 0.0060 (0.0065)
Epoch: [10][3000/3008]	Loss: 0.0024 (0.0061)


  0%|          | 0/189 [00:00<?, ?it/s]

[0.9588 0.9542 0.9887 0.9337 0.9287 0.9777 0.9845 0.9009 0.8504 0.899
 0.9977]
Training Loss after epoch 10: 0.0061
Validation Loss after epoch 10: 0.1718
Validation ROC AUC Score after epoch 10: 0.9431


  0%|          | 0/3008 [00:00<?, ?it/s]

Epoch: [11][0/3008]	Loss: 0.0048 (0.0048)
Epoch: [11][1500/3008]	Loss: 0.0039 (0.0069)
Epoch: [11][3000/3008]	Loss: 0.0181 (0.0069)


  0%|          | 0/189 [00:00<?, ?it/s]

[0.9671 0.9615 0.9906 0.9405 0.9538 0.9831 0.984  0.9186 0.8689 0.9074
 0.9987]
Training Loss after epoch 11: 0.0069
Validation Loss after epoch 11: 0.1455
Validation ROC AUC Score after epoch 11: 0.9522


  0%|          | 0/3008 [00:00<?, ?it/s]

Epoch: [12][0/3008]	Loss: 0.0006 (0.0006)
Epoch: [12][1500/3008]	Loss: 0.0047 (0.0069)
Epoch: [12][3000/3008]	Loss: 0.0003 (0.0071)


  0%|          | 0/189 [00:00<?, ?it/s]

[0.9695 0.9628 0.9915 0.9489 0.9546 0.9826 0.984  0.9238 0.8772 0.9101
 0.9972]
Training Loss after epoch 12: 0.0071
Validation Loss after epoch 12: 0.1404
Validation ROC AUC Score after epoch 12: 0.9547
Time taken in fold 1: 17268.723111629486
