In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# os.walk yields (directory, sub-directories, files) for every directory under the
# root; the sub-directory list is ignored and each file is printed with its full path.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip -q install timm


In [None]:
import os
import random
import numpy as np
import pandas as pd
from PIL import Image
from matplotlib import pyplot as plt
import seaborn as sns
import plotly.express as px
from tqdm import tqdm
import time

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.cuda.amp import autocast, GradScaler
from torch.optim import Adam, SGD
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, CosineAnnealingLR, ReduceLROnPlateau, OneCycleLR
from torch.optim.optimizer import Optimizer
import torchvision.utils as vutils

# import pytorch_lightning as pl
# from pytorch_lightning import seed_everything
# from pytorch_lightning.metrics.functional import accuracy, f1, auroc
# from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping

import timm
import albumentations as A
from albumentations.core.transforms_interface import ImageOnlyTransform
from albumentations.pytorch import ToTensorV2

In [None]:
# --- Experiment configuration -------------------------------------------------
SEED = 42                    # passed to set_seed() and to StratifiedKFold(random_state=...)
N_FOLDS = 5                  # number of stratified folds
TRAIN_FOLD = 4               # fold id held out for validation; the remaining folds are trained on
TARGET_COL = 'target'        # label column read by TrainDataset
N_EPOCHS = 17                # maximum number of epochs (early stopping may end training sooner)
BATCH_SIZE = 32              # training batch size; the validation loader uses BATCH_SIZE * 2
DIM1 = 256                   # first argument of A.Resize in get_transforms
DIM2 = 256                   # second argument of A.Resize in get_transforms
LR = 1e-4                    # learning rate handed to the MADGRAD constructor
MAX_LR = 5e-4                # max_lr handed to OneCycleLR
PRECISION = 16               # NOTE(review): not referenced by any code visible in this notebook
GRADIENT_ACCUMULATION = 1    # number of batches accumulated before each optimizer step
EARLY_STOP = 3               # stop after this many consecutive epochs without a validation-AUC improvement
MODEL = 'efficientnet_b1'    # timm model name used as the backbone
# backbone = 'vit_deit_base_distilled_patch16_384'
NEW_HEAD = False             # NOTE(review): not referenced by any code visible in this notebook
CHANNELS = [0, 2, 4]         # indices selected along the first axis of each loaded .npy array


# Scale both learning rates linearly with the batch size, relative to a reference batch size of 32.
LR *= BATCH_SIZE / 32
MAX_LR *= BATCH_SIZE / 32

In [None]:
def set_seed(seed: int = 42):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random``, NumPy's global generator and PyTorch (CPU and all
    CUDA devices), switches cuDNN to deterministic mode, and exports
    ``PYTHONHASHSEED``.

    Args:
        seed: Integer seed. Defaults to 42 so that ``set_seed()`` is callable
            without arguments (the previous signature used the type ``int``
            itself as the default value, which crashed inside NumPy).

    Returns:
        A fresh ``np.random.RandomState`` seeded with ``seed`` for callers that
        want their own generator independent of NumPy's global state.
    """
    random.seed(seed)
    np.random.seed(seed)
    random_state = np.random.RandomState(seed)
    torch.manual_seed(seed)
    # manual_seed_all covers every visible GPU and is a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    # Deterministic kernels, and no auto-tuner picking algorithms at run time.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Only affects hash randomisation of Python processes started after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    return random_state

# Seed all generators once, right after configuration, and keep the returned RandomState.
random_state = set_seed(SEED)

In [None]:
# Select the compute device once; `device` is read as a module-level global by
# later cells (model construction, batch transfer, checkpoint loading).
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available.")
else:
    device = torch.device("cpu")
    print("GPU not available, going to use CPU instead.")

In [None]:
# Label table for the training set and the sample-submission table used as the test index.
# Both frames are expected to have an 'id' column (it is read a few lines below).
train = pd.read_csv('../input/seti-breakthrough-listen/train_labels.csv')
test = pd.read_csv('../input/seti-breakthrough-listen/sample_submission.csv')

def get_train_file_path(image_id):
    """Return the path of the training ``.npy`` file for ``image_id``.

    Files are grouped in sub-directories named after the first character of the id.
    """
    shard = image_id[0]
    return "../input/seti-breakthrough-listen/train/{}/{}.npy".format(shard, image_id)

def get_test_file_path(image_id):
    """Return the path of the test ``.npy`` file for ``image_id``.

    Files are grouped in sub-directories named after the first character of the id.
    """
    shard = image_id[0]
    return "../input/seti-breakthrough-listen/test/{}/{}.npy".format(shard, image_id)

# Resolve every id to its .npy path; TrainDataset reads the 'file_path' column.
train['file_path'] = train['id'].apply(get_train_file_path)
test['file_path'] = test['id'].apply(get_test_file_path)

# Show five random rows as a quick sanity check (display is provided by the notebook environment).
display(train.sample(5))

In [None]:
class TrainDataset(Dataset):
    """Dataset that loads one ``.npy`` file per row of ``df``.

    Args:
        df: Frame with a ``file_path`` column and, unless ``test`` is true,
            a ``TARGET_COL`` label column.
        test: When true, no labels are read and samples carry no ``"target"``.
        transform: Optional callable invoked as ``transform(image=...)`` whose
            result is indexed with ``['image']``.
        use_vit: When true, ``inv_stem`` re-tiles the tensor into a 384x384 image.
    """

    def __init__(self, df, test=False, transform=None, use_vit=False):
        self.df = df
        self.test = test
        self.transform = transform
        self.use_vit = use_vit
        self.file_names = df['file_path'].values
        if not test:
            self.labels = df[TARGET_COL].values

    def __len__(self):
        """Number of rows in the underlying frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``{"spect": tensor}`` plus ``"target"`` unless in test mode."""
        # Keep only the configured channels, as float32.
        # Assumes the file holds an array indexable by CHANNELS on its first axis — TODO confirm.
        selected = np.load(self.file_names[idx])[CHANNELS].astype(np.float32)
        # Stack the selected slices along the first axis, then transpose.
        image = np.vstack(selected).T

        if self.transform:
            image = self.transform(image=image)['image']
        else:
            # No transform: add a leading channel axis and convert to a tensor by hand.
            image = torch.from_numpy(image[np.newaxis, :, :]).float()

        sample = {"spect": self.inv_stem(image)}
        if not self.test:
            # Labels are returned with shape (1,).
            sample["target"] = torch.unsqueeze(torch.tensor(self.labels[idx]).float(), -1)
        return sample

    def inv_stem(self, x):
        """Return ``x`` unchanged, or re-tiled into a 384x384 image when ``use_vit`` is set.

        In ViT mode ``x`` is transposed, viewed as a 24x24 grid of 16x16 patches
        and each patch is written to its grid position in a zero-initialised canvas.
        Assumes ``x`` is 2-D with 24*24*16*16 elements — TODO confirm against callers.
        """
        if not self.use_vit:
            return x

        patches = x.transpose(0, 1).view(24, 24, 16, 16)
        canvas = torch.zeros(384, 384, dtype=x.dtype)
        for row in range(24):
            top = row * 16
            for col in range(24):
                left = col * 16
                canvas[top:top + 16, left:left + 16] = patches[row, col]
        return canvas

In [None]:
def _random_mask_span(size, alpha):
    """Draw a random ``[start, stop)`` span covering less than ``alpha * size`` positions."""
    if size <= 0:
        # Nothing to mask on an empty axis (randint(0, 0) would raise).
        return 0, 0
    start = np.random.randint(0, size)
    max_width = int(size * alpha)
    # randint requires high > low; for short axes the widest allowed span is empty.
    width = np.random.randint(0, max_width) if max_width > 0 else 0
    return start, min(start + width, size)


def spec_augment(x, alpha=0.1):
    """Zero out one random band of rows and one random band of columns of ``x``.

    The array is modified in place and also returned.

    Args:
        x: Array with at least two dimensions.
        alpha: Upper bound on the masked fraction of each axis. A band spans
            fewer than ``int(size * alpha)`` positions and may be empty.

    Returns:
        ``x`` itself, with the two bands set to 0.

    Note:
        Axes so short that ``int(size * alpha) == 0`` are left untouched; the
        earlier version raised ``ValueError`` from ``np.random.randint`` there.
        For larger inputs the sequence of random draws is unchanged.
    """
    row_start, row_stop = _random_mask_span(x.shape[0], alpha)
    x[row_start:row_stop] = 0
    col_start, col_stop = _random_mask_span(x.shape[1], alpha)
    x[:, col_start:col_stop] = 0
    return x

class SpecAugment(ImageOnlyTransform):
    """Albumentations image-only transform that delegates to ``spec_augment``."""

    def apply(self, img, **params):
        """Mask a random row band and column band of ``img``; ``params`` are ignored."""
        masked = spec_augment(img)
        return masked
    
# https://www.kaggle.com/shionhonda/search-for-effective-data-augmentation
def get_transforms(*, data):
    """Build the albumentations pipeline for the given split.

    Args:
        data: ``'train'`` for the augmented pipeline, ``'valid'`` for
            resize-and-tensor only.

    Returns:
        An ``A.Compose`` pipeline, or ``None`` for any other value of ``data``.
    """
    if data == 'train':
        steps = [
            A.Resize(DIM1, DIM2),
            A.VerticalFlip(p=0.5),
            # Horizontal flip is intentionally left disabled: A.HorizontalFlip(p=0.5)
            A.ShiftScaleRotate(rotate_limit=0, p=0.3),  # shift/scale only, no rotation
            A.MotionBlur(p=0.3),
            SpecAugment(p=0.3),
            ToTensorV2(),
        ]
    elif data == 'valid':
        steps = [
            A.Resize(DIM1, DIM2),
            ToTensorV2(),
        ]
    else:
        return None
    return A.Compose(steps)

In [None]:
def train_dataloader():
    """Return a small, unshuffled preview loader over the module-level ``train_dataset``.

    The dataset is looked up when the function is called, so re-assigning
    ``train_dataset`` changes what the next call iterates over.
    """
    preview_options = dict(
        batch_size=4,
        num_workers=2,
        drop_last=False,
        shuffle=False,
        pin_memory=True,
    )
    return DataLoader(train_dataset, **preview_options)
# Dataset of negative examples (target == 0) with the training augmentations applied;
# train_dataloader() reads this module-level `train_dataset`.
train_dataset = TrainDataset(train[train['target']==0], transform=get_transforms(data='train'), use_vit=False)
# Take the first batch (4 samples, unshuffled) for a visual sanity check.
train_batch = next(iter(train_dataloader()))
batch, targets = train_batch["spect"], train_batch["target"]
print(batch.shape)
plt.figure(figsize=(16, 16))
plt.axis("off")
plt.title("Target = 0")
# make_grid lays the samples out one per row (nrow=1) with normalisation enabled;
# permute(1, 2, 0) moves the channel axis last, as imshow expects.
plt.imshow(vutils.make_grid(
    batch, nrow=1, padding=10, normalize=True).permute(1,2,0).cpu().numpy())
plt.show()

In [None]:
# Same preview for positive examples (target == 1), this time without any transform,
# so the images are shown at their stored size.
train_dataset = TrainDataset(train[train['target']==1], transform=None)
train_batch = next(iter(train_dataloader()))
batch, targets = train_batch["spect"], train_batch["target"]

plt.figure(figsize=(16, 16))
plt.axis("off")
plt.title("Target = 1")
# One sample per grid row; channel axis moved last for imshow.
plt.imshow(vutils.make_grid(
    batch, nrow=1, padding=10, normalize=True).permute(1,2,0).cpu().numpy())
plt.show()

In [None]:
def mixup_data(x, y, alpha=1.0):
    """Mix each sample of a batch with another, randomly chosen sample of the same batch.

    Args:
        x: Input batch with at least two dimensions; the first is the batch axis.
        y: Targets, indexable along their first axis by a permutation tensor.
        alpha: Parameter of the symmetric Beta(alpha, alpha) distribution the
            mixing coefficient is drawn from. ``alpha <= 0`` disables mixing
            (the coefficient is fixed to 1).

    Returns:
        ``(mixed_x, y_a, y_b, lam)`` where ``mixed_x = lam * x + (1 - lam) * x[perm]``,
        ``y_a`` is ``y``, ``y_b`` is ``y[perm]`` and ``lam`` is a Python number.
    """
    if alpha > 0:
        # Draw an actual sample; the distribution object itself cannot be multiplied
        # with a tensor, which is what the previous version attempted.
        lam = float(torch.distributions.Beta(alpha, alpha).sample())
    else:
        lam = 1

    batch_size = x.size()[0]

    # Build the permutation on the same device as the data being indexed.
    index = torch.randperm(batch_size, device=x.device)

    mixed_x = lam * x + (1 - lam) * x[index, :]
    y_a, y_b = y, y[index]
    return mixed_x, y_a, y_b, lam


def mixup_criterion(criterion, pred, y_a, y_b, lam):
    """Blend the loss against both target sets of a mixed batch.

    Returns ``lam * criterion(pred, y_a) + (1 - lam) * criterion(pred, y_b)``.
    """
    loss_a = criterion(pred, y_a)
    loss_b = criterion(pred, y_b)
    return lam * loss_a + (1 - lam) * loss_b

In [None]:
from torch.distributions import Beta

class Mixup(nn.Module):
    """Mixup as a module: blends each sample with another sample of the same batch.

    Args:
        mix_beta: Parameter of the symmetric Beta(mix_beta, mix_beta)
            distribution the mixing coefficient is drawn from.
    """

    def __init__(self, mix_beta=1.0):
        super(Mixup, self).__init__()
        self.beta_distribution = Beta(mix_beta, mix_beta)

    def forward(self, x, y):
        """Return ``(mixed_x, y_a, y_b, lam)`` for one batch.

        ``mixed_x = lam * x + (1 - lam) * x[perm]``, ``y_a`` is ``y`` and ``y_b``
        is ``y[perm]``. ``lam`` is a scalar tensor placed on ``x.device``, so the
        module no longer depends on a notebook-global ``device`` variable.
        """
        lam = self.beta_distribution.sample().to(x.device)
        batch_size = x.shape[0]
        # Permutation created directly on the data's device to avoid a cross-device index.
        index = torch.randperm(batch_size, device=x.device)
        mixed_x = lam * x + (1 - lam) * x[index, :]
        y_a, y_b = y, y[index]
        return mixed_x, y_a, y_b, lam

In [None]:
import math
from typing import TYPE_CHECKING, Any, Callable, Optional

if TYPE_CHECKING:
    from torch.optim.optimizer import _params_t
else:
    _params_t = Any

class MADGRAD(Optimizer):
    """Optimizer keeping, per parameter, a weighted gradient sum ``s`` and a weighted
    sum of squared gradients ``grad_sum_sq``; updates are scaled by the cube root
    of the latter.

    NOTE(review): the structure looks like the published MADGRAD reference
    implementation — confirm against upstream before touching the math.

    Per-parameter state: ``grad_sum_sq``, ``s`` and, when momentum is non-zero,
    ``x0`` (a copy of the initial parameter value). A global step counter is kept
    in ``self.state['k']``.
    """

    def __init__(
        self, params: _params_t, lr: float = 1e-2, momentum: float = 0.9, weight_decay: float = 0, eps: float = 1e-6,
    ):
        """Validate the hyper-parameters and register them as group defaults.

        Raises:
            ValueError: if ``momentum`` is outside [0, 1), ``lr`` is not positive,
                or ``weight_decay`` / ``eps`` is negative.
        """
        # NOTE(review): momentum == 1 is rejected, i.e. the accepted range is [0, 1),
        # although the message below prints "[0,1]".
        if momentum < 0 or momentum >= 1:
            raise ValueError(f"Momentum {momentum} must be in the range [0,1]")
        if lr <= 0:
            raise ValueError(f"Learning rate {lr} must be positive")
        if weight_decay < 0:
            raise ValueError(f"Weight decay {weight_decay} must be non-negative")
        if eps < 0:
            raise ValueError(f"Eps must be non-negative")

        defaults = dict(lr=lr, eps=eps, momentum=momentum, weight_decay=weight_decay)
        super().__init__(params, defaults)

    @property
    def supports_memory_efficient_fp16(self) -> bool:
        """Capability flag; always False."""
        return False

    @property
    def supports_flat_params(self) -> bool:
        """Capability flag; always True."""
        return True

    def step(self, closure: Optional[Callable[[], float]] = None) -> Optional[float]:
        """Perform one optimization step over all parameter groups.

        Args:
            closure: Optional callable that is invoked once at the start; its
                return value is passed through as the result of ``step``.

        Returns:
            The value returned by ``closure``, or ``None`` when no closure is given.

        Raises:
            RuntimeError: for sparse gradients combined with non-zero momentum
                or non-zero weight decay.
        """

        loss = None
        if closure is not None:
            loss = closure()

        # Global step counter, stored in the optimizer state under the plain key 'k'
        # (one counter for all parameters and groups).
        if 'k' not in self.state:
            self.state['k'] = torch.tensor([0], dtype=torch.long)
        k = self.state['k'].item()

        for group in self.param_groups:
            eps = group["eps"]
            # eps is added to the learning rate here as well as to the denominators below.
            lr = group["lr"] + eps
            decay = group["weight_decay"]
            momentum = group["momentum"]

            # ck: weight given to the new iterate z when averaging it into p.
            ck = 1 - momentum
            # Per-step weight: lr * sqrt(k + 1).
            lamb = lr * math.pow(k + 1, 0.5)

            for p in group["params"]:
                if p.grad is None:
                    continue
                grad = p.grad.data
                state = self.state[p]

                # Lazily create the per-parameter buffers on first use.
                if "grad_sum_sq" not in state:
                    state["grad_sum_sq"] = torch.zeros_like(p.data).detach()
                    state["s"] = torch.zeros_like(p.data).detach()
                    if momentum != 0:
                        state["x0"] = torch.clone(p.data).detach()

                if momentum != 0.0 and grad.is_sparse:
                    raise RuntimeError("momentum != 0 is not compatible with sparse gradients")

                grad_sum_sq = state["grad_sum_sq"]
                s = state["s"]

                # Apply weight decay
                if decay != 0:
                    if grad.is_sparse:
                        raise RuntimeError("weight_decay option is not compatible with sparse gradients")

                    # In-place: adds decay * p to the gradient tensor itself.
                    grad.add_(p.data, alpha=decay)

                if grad.is_sparse:
                    # Sparse path (only reachable with momentum == 0 and decay == 0):
                    # work on the entries selected by the gradient's sparsity pattern.
                    grad = grad.coalesce()
                    grad_val = grad._values()

                    p_masked = p.sparse_mask(grad)
                    grad_sum_sq_masked = grad_sum_sq.sparse_mask(grad)
                    s_masked = s.sparse_mask(grad)

                    # Compute x_0 from other known quantities
                    rms_masked_vals = grad_sum_sq_masked._values().pow(1 / 3).add_(eps)
                    x0_masked_vals = p_masked._values().addcdiv(s_masked._values(), rms_masked_vals, value=1)

                    # Dense + sparse op
                    grad_sq = grad * grad
                    grad_sum_sq.add_(grad_sq, alpha=lamb)
                    grad_sum_sq_masked.add_(grad_sq, alpha=lamb)

                    rms_masked_vals = grad_sum_sq_masked._values().pow_(1 / 3).add_(eps)

                    s.add_(grad, alpha=lamb)
                    s_masked._values().add_(grad_val, alpha=lamb)

                    # update masked copy of p
                    p_kp1_masked_vals = x0_masked_vals.addcdiv(s_masked._values(), rms_masked_vals, value=-1)
                    # Copy updated masked p to dense p using an add operation
                    p_masked._values().add_(p_kp1_masked_vals, alpha=-1)
                    p.data.add_(p_masked, alpha=-1)
                else:
                    if momentum == 0:
                        # Compute x_0 from other known quantities
                        rms = grad_sum_sq.pow(1 / 3).add_(eps)
                        x0 = p.data.addcdiv(s, rms, value=1)
                    else:
                        x0 = state["x0"]

                    # Accumulate second moments
                    grad_sum_sq.addcmul_(grad, grad, value=lamb)
                    # Denominator: cube root of the accumulated squared gradients, plus eps.
                    rms = grad_sum_sq.pow(1 / 3).add_(eps)

                    # Update s
                    s.data.add_(grad, alpha=lamb)

                    # Step
                    if momentum == 0:
                        # z = x0 - s / rms is written straight into p.
                        p.data.copy_(x0.addcdiv(s, rms, value=-1))
                    else:
                        z = x0.addcdiv(s, rms, value=-1)

                        # p is a moving average of z
                        p.data.mul_(1 - ck).add_(z, alpha=ck)


        # Advance the global step counter once per call, after all groups were updated.
        self.state['k'] += 1
        return loss

In [None]:
# Binary cross-entropy on raw logits; read as a module-level global inside SETINet.forward.
criterion = nn.BCEWithLogitsLoss()

In [None]:
def get_activation(activ_name: str="relu"):
    """Return a new activation module for ``activ_name``.

    Supported names: ``"relu"`` (in-place), ``"tanh"``, ``"sigmoid"``, ``"identity"``.

    Raises:
        NotImplementedError: for any other name.
    """
    builders = {
        "relu": lambda: nn.ReLU(inplace=True),
        "tanh": nn.Tanh,
        "sigmoid": nn.Sigmoid,
        "identity": nn.Identity,
    }
    build = builders.get(activ_name)
    if build is None:
        raise NotImplementedError
    return build()

class Conv2dBNActiv(nn.Module):
    """Conv2d, optionally followed by BatchNorm2d, followed by an activation.

    Args:
        in_channels, out_channels, kernel_size, stride, padding, bias:
            forwarded to ``nn.Conv2d``.
        use_bn: Insert ``nn.BatchNorm2d(out_channels)`` after the convolution.
        activ: Activation name resolved through ``get_activation``.
    """

    def __init__(
        self, in_channels: int, out_channels: int,
        kernel_size: int, stride: int=1, padding: int=0,
        bias: bool=False, use_bn: bool=True, activ: str="relu"
    ):
        super().__init__()
        conv = nn.Conv2d(
            in_channels, out_channels,
            kernel_size, stride, padding, bias=bias)
        # Empty when batch norm is disabled, so the activation follows the conv directly.
        norm = [nn.BatchNorm2d(out_channels)] if use_bn else []
        self.layers = nn.Sequential(conv, *norm, get_activation(activ))

    def forward(self, x):
        """Apply the stacked layers to ``x``."""
        return self.layers(x)


class SSEBlock(nn.Module):
    """Channel-squeeze / spatial-excitation block.

    A bias-free 1x1 convolution reduces the input to a single channel, a sigmoid
    turns it into a per-pixel gate, and the gate rescales every input channel.
    """

    def __init__(self, in_channels: int):
        super().__init__()
        # 1x1 convolution collapsing all channels into one; stride and padding keep their defaults (1, 0).
        self.channel_squeeze = nn.Conv2d(in_channels, 1, 1, bias=False)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Gate ``x`` (bs, ch, h, w) with a (bs, 1, h, w) map; output has the shape of ``x``."""
        gate = self.channel_squeeze(x)
        gate = self.sigmoid(gate)
        return x * gate


class SpatialAttentionBlock(nn.Module):
    """Spatial attention for (C, H, W) feature maps.

    A chain of 3x3 ``Conv2dBNActiv`` layers (ReLU, with a sigmoid on the last one)
    reduces the input to a single-channel map that multiplies the input.

    Args:
        in_channels: Channels of the incoming feature map.
        out_channels_list: Output channels of each layer; must be a non-empty
            list whose last entry is 1.
    """

    def __init__(
        self, in_channels,
        out_channels_list,
    ):
        super().__init__()
        self.n_layers = len(out_channels_list)
        channels_list = [in_channels] + out_channels_list
        assert self.n_layers > 0
        assert channels_list[-1] == 1

        # Layers are registered as conv1 ... conv{n_layers}; only the last uses a sigmoid.
        channel_pairs = zip(channels_list[:-1], channels_list[1:])
        for idx, (in_chs, out_chs) in enumerate(channel_pairs, start=1):
            activ = "sigmoid" if idx == self.n_layers else "relu"
            setattr(self, f"conv{idx}", Conv2dBNActiv(in_chs, out_chs, 3, 1, 1, activ=activ))

    def forward(self, x):
        """Run the conv chain on ``x`` and multiply the resulting map with ``x``."""
        attention = x
        for idx in range(1, self.n_layers + 1):
            attention = getattr(self, f"conv{idx}")(attention)
        return attention * x

In [None]:
class Backbone(nn.Module):
    """timm model used as a feature extractor.

    ``out_features`` records the input width of the model's original classifier,
    looked up by model-name family; ``forward`` returns ``forward_features``.
    """

    def __init__(self, name='resnet18', pretrained=True):
        super().__init__()
        self.net = timm.create_model(name, pretrained=pretrained)
        self.out_features = self._classifier_in_features(name)

    def _classifier_in_features(self, name):
        """Return the classifier input width for ``name``.

        The checks are order-sensitive: the first matching substring wins.
        """
        net = self.net
        if 'regnet' in name:
            return net.head.fc.in_features
        if 'vit' in name:
            return net.head.in_features
        if name == 'deit_base_distilled_patch16_384':
            return 768
        if 'csp' in name:
            return net.head.fc.in_features
        if 'res' in name:  # works also for resnest
            return net.fc.in_features
        if 'efficientnet' in name or 'densenet' in name:
            return net.classifier.in_features
        if 'senet' in name:
            return net.fc.in_features
        if 'inception' in name:
            return net.last_linear.in_features
        # Fallback for every other family.
        return net.classifier.in_features

    def forward(self, x):
        """Return the un-pooled feature output of the wrapped model."""
        return self.net.forward_features(x)

In [None]:
class SETINet(nn.Module):
    """Backbone + spatial-attention neck + linear head, with optional in-model mixup and loss.

    Args:
        backbone: timm model name passed to ``Backbone``.
        out_dim: Number of output logits.
        embedding_size: Kept only if it equals the backbone's ``out_features``;
            otherwise the embedding size becomes ``out_features // 2``.
        loss: When true, ``forward`` also returns a ``'loss'`` entry computed
            with the module-level ``criterion``.
        pretrained: Forwarded to ``Backbone``.
        use_mixup: Apply ``Mixup`` to the inputs when ``forward`` is called with
            ``training=True``.
    """

    def __init__(self, backbone, out_dim, embedding_size=512,
                 loss=False, pretrained=True, use_mixup=True):
        super().__init__()
        self.backbone_name = backbone
        self.loss = loss
        self.out_dim = out_dim
        self.use_mixup = use_mixup

        self.mixup = Mixup()
        self.backbone = Backbone(backbone, pretrained=pretrained)

        feature_dim = self.backbone.out_features
        same_width = int(embedding_size) == int(feature_dim)
        self.embedding_size = embedding_size if same_width else feature_dim // 2

        self.neck = nn.Sequential(
            SpatialAttentionBlock(feature_dim, [64, 32, 16, 1]),
            nn.AdaptiveAvgPool2d(output_size=1),
            nn.Flatten(start_dim=1),
            nn.Linear(feature_dim, self.embedding_size),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
        )

        self.head = nn.Linear(self.embedding_size, out_dim)

    def forward(self, input_dict, training=True, get_embeddings=False, get_attentions=False):
        """Compute logits (and optionally the loss) for one batch.

        Args:
            input_dict: Mapping with ``'spect'`` and, when mixup or the loss is
                active, ``'target'``.
            training: Mixup is applied only when this equals ``True`` and
                ``use_mixup`` is set. It is independent of ``model.train()``.
            get_embeddings, get_attentions: Accepted but unused.

        Returns:
            Dict with ``'logits'`` and, if ``self.loss`` is set, ``'loss'``.
        """
        spect = input_dict['spect']

        mix_active = self.use_mixup and training == True
        if mix_active:
            spect, y_a, y_b, lam = self.mixup(spect, input_dict['target'])

        if self.backbone_name == 'deit_base_distilled_patch16_384':
            spect = spect.unsqueeze(1)
        # Repeat the singleton channel axis to the three channels the backbone expects.
        spect = spect.expand(-1, 3, -1, -1)

        logits = self.head(self.neck(self.backbone(spect)))
        output_dict = {'logits': logits}

        if self.loss:
            if mix_active:
                # Mixed inputs: blend the loss against both target sets with weight lam.
                output_dict['loss'] = mixup_criterion(criterion, logits, y_a, y_b, lam)
            else:
                output_dict['loss'] = criterion(logits, input_dict['target'])

        return output_dict

In [None]:
def train_epoch(loader, model, optimizer, scheduler, scaler, device):
    """Train ``model`` for one pass over ``loader`` with mixed precision.

    The next batch is moved to ``device`` before the current one is processed, so
    the host-to-device copy can overlap with compute. Gradients are accumulated
    over ``GRADIENT_ACCUMULATION`` batches; the scheduler is stepped once per
    optimizer step.

    Args:
        loader: Sized iterable yielding dicts of tensors; the model must return
            a dict containing ``'loss'`` for each of them.
        model: Module called as ``model(input_dict)``.
        optimizer: Optimizer stepped through ``scaler``.
        scheduler: Learning-rate scheduler stepped after each optimizer step.
        scaler: ``GradScaler`` used for loss scaling.
        device: Target device for the batches.

    Returns:
        List with the loss of every batch as a NumPy scalar (empty if the loader is empty).
    """
    model.train()
    model.zero_grad()
    train_loss = []

    n_batches = len(loader)
    if n_batches == 0:
        # Nothing to iterate over; avoids a StopIteration from the prefetch below.
        return train_loss

    bar = tqdm(range(n_batches))
    load_iter = iter(loader)
    # Use the built-in next(): iterator objects have no .next() method in Python 3.
    batch = next(load_iter)
    batch = {k: v.to(device, non_blocking=True) for k, v in batch.items()}

    for i in bar:
        input_dict = batch.copy()
        if i + 1 < n_batches:
            # Prefetch the following batch before running the current one.
            batch = next(load_iter)
            batch = {k: v.to(device, non_blocking=True) for k, v in batch.items()}

        with autocast():
            out_dict = model(input_dict)
        loss = out_dict['loss']
        loss_np = loss.detach().cpu().numpy()
        scaler.scale(loss).backward()

        # Step on every GRADIENT_ACCUMULATION-th batch and on the final batch.
        if (i + 1) % GRADIENT_ACCUMULATION == 0 or i == n_batches - 1:
            scaler.step(optimizer)
            scaler.update()
            model.zero_grad()
            scheduler.step()

        train_loss.append(loss_np)
        # Running mean over (at most) the last 100 batches, shown in the progress bar.
        smooth_loss = sum(train_loss[-100:]) / min(len(train_loss), 100)
        bar.set_description('loss: %.4f, smth: %.4f' % (loss_np, smooth_loss))
    return train_loss

In [None]:
def val_epoch(loader, model, device):
    """Evaluate ``model`` on ``loader`` without gradients.

    Args:
        loader: Sized iterable yielding dicts of tensors that include ``'target'``.
        model: Module called as ``model(input_dict, training=False)``; it must
            return a dict with ``'logits'`` and ``'loss'``.
        device: Target device for the batches.

    Returns:
        ``(val_loss, LOGITS, 0, sklearn_auc)``: the mean batch loss, the
        concatenated sigmoid outputs of all batches, a constant 0 placeholder,
        and the ROC AUC computed by scikit-learn.

    Raises:
        ValueError: if ``loader`` is empty.
    """
    model.eval()

    n_batches = len(loader)
    if n_batches == 0:
        raise ValueError("val_epoch received an empty loader; cannot compute loss or AUC")

    val_loss = []
    LOGITS = []
    TARGETS = []

    with torch.no_grad():
        bar = tqdm(range(n_batches))
        load_iter = iter(loader)
        # Use the built-in next(): iterator objects have no .next() method in Python 3.
        batch = next(load_iter)
        batch = {k: v.to(device, non_blocking=True) for k, v in batch.items()}

        for i in bar:
            input_dict = batch.copy()
            if i + 1 < n_batches:
                # Prefetch the following batch before running the current one.
                batch = next(load_iter)
                batch = {k: v.to(device, non_blocking=True) for k, v in batch.items()}

            out_dict = model(input_dict, training=False)
            # Despite the list name, probabilities (sigmoid of the logits) are collected.
            logits = out_dict['logits'].sigmoid()
            loss = out_dict['loss']
            target = input_dict['target']
            loss_np = loss.detach().cpu().numpy()
            LOGITS.append(logits.detach())
            TARGETS.append(target.detach())
            val_loss.append(loss_np)

            smooth_loss = sum(val_loss[-100:]) / min(len(val_loss), 100)
            bar.set_description('loss: %.4f, smth: %.4f' %
                                (loss_np, smooth_loss))

        val_loss = np.mean(val_loss)

    LOGITS = torch.cat(LOGITS)
    TARGETS = torch.cat(TARGETS)
    sklearn_auc = roc_auc_score(TARGETS.cpu().numpy(), LOGITS.cpu().numpy())
    return val_loss, LOGITS, 0, sklearn_auc

In [None]:
def save_checkpoint(model, optimizer, scheduler, scaler, epoch, fold, seed, fname="stft_tranformer"):
    """Write the training state to ``{fname}_{fold}_{seed}_{epoch}.pt`` in the working directory.

    The file holds the ``state_dict()`` of the model, optimizer, scheduler and
    scaler under the keys ``'model'``, ``'optimizer'``, ``'scheduler'`` and
    ``'scaler'``, plus the epoch number under ``'epoch'``.
    """
    stateful_parts = (
        ('model', model),
        ('optimizer', optimizer),
        ('scheduler', scheduler),
        ('scaler', scaler),
    )
    checkpoint = {key: part.state_dict() for key, part in stateful_parts}
    checkpoint['epoch'] = epoch

    target_path = '%s_%d_%d_%d.pt' % (fname, fold, seed, epoch)
    torch.save(checkpoint, target_path)

In [None]:
def load_checkpoint(backbone, epoch, fold, seed, fname, out_dim=1):
    """Rebuild model, optimizer, scheduler and scaler and restore them from a checkpoint.

    Reads ``{fname}_{fold}_{seed}_{epoch}.pt``, the naming scheme used by
    ``save_checkpoint``.

    Args:
        backbone: timm model name the checkpoint was trained with.
        epoch, fold, seed, fname: Components of the checkpoint file name.
        out_dim: Number of model outputs. Defaults to 1, the value the training
            loop in this notebook uses (the previous code referenced an
            undefined ``BIRD_CODE`` here).

    Returns:
        ``(model, optimizer, scheduler, scaler)`` with their saved state loaded.
    """
    model = SETINet(backbone,
                    out_dim=out_dim,
                    loss=True,
                    pretrained=False,
                    ).to(device)
    optimizer = MADGRAD(model.parameters(), lr=LR)
    # The schedule is built from the module-level step counts only so that a
    # scheduler object exists; its state is replaced by the saved one below.
    scheduler = OneCycleLR(
        optimizer,
        epochs=N_EPOCHS,
        max_lr=MAX_LR,
        total_steps=n_training_steps,
        steps_per_epoch=steps_per_epoch,
    )
    scaler = GradScaler()

    checkpoint_path = '%s_%d_%d_%d.pt' % (fname, fold, seed, epoch)
    # map_location lets a checkpoint saved on GPU be restored on a CPU-only machine.
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    scheduler.load_state_dict(checkpoint['scheduler'])
    scaler.load_state_dict(checkpoint['scaler'])
    return model, optimizer, scheduler, scaler

In [None]:
# Step counts read as module-level globals by load_checkpoint() when it rebuilds the scheduler.
# NOTE(review): dividing by N_EPOCHS as well as BATCH_SIZE looks unintended for a
# "steps per epoch" value; the training loop below derives its own count from the
# DataLoader length instead — confirm which one is meant.
steps_per_epoch=(len(train)// N_EPOCHS) // BATCH_SIZE
n_training_steps = steps_per_epoch * N_EPOCHS

# Assign each row a fold id in [0, N_FOLDS), stratified on the target.
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)
train["fold"] = -1
for fold_id, (_, val_idx) in enumerate(skf.split(train["id"], train["target"])):
    # val_idx holds positional indices; using them with .loc assumes `train`
    # still has its default 0..n-1 index.
    train.loc[val_idx, "fold"] = fold_id

In [None]:
# Train a single model on every fold except TRAIN_FOLD and validate on TRAIN_FOLD.
# `seed` is only used in the checkpoint file name; no generator is re-seeded here.
for seed in [0]:
    for fold in range(N_FOLDS):
        # Only the configured fold is run; all others are skipped.
        if fold != TRAIN_FOLD:
            continue

        # Training split: all rows outside the held-out fold, with augmentations.
        train_fold = train[train['fold']!=TRAIN_FOLD]
        train_dataset = TrainDataset(
            train_fold,
            transform=get_transforms(data='train'),
            use_vit=False
        )
        train_data_loader = DataLoader(
            train_dataset,
            batch_size=BATCH_SIZE,
            num_workers=4,
            shuffle=True,
            pin_memory=True,
        )

        # Validation split: the held-out fold, resized only.
        val_df = train[train['fold']==TRAIN_FOLD]
        val_dataset = TrainDataset(
            val_df,
            transform=get_transforms(data='valid'),
            use_vit=False
        )

        valid_data_loader_orig = DataLoader(
            val_dataset,
            batch_size=BATCH_SIZE*2,
            num_workers=4,
            shuffle=False,
            pin_memory=True,
        )

        model = SETINet(backbone=MODEL,
                      out_dim=1,
                      loss=True,
                      pretrained=True,
                      ).to(device)
        optimizer = MADGRAD(model.parameters(), lr=LR)
        # The scheduler is stepped once per optimizer step (see train_epoch), so the
        # per-epoch step count is the loader length divided by the accumulation factor.
        scheduler = OneCycleLR(
              optimizer,
              epochs = N_EPOCHS,
              max_lr = MAX_LR,
              steps_per_epoch=int(np.ceil(len(train_data_loader)/GRADIENT_ACCUMULATION)))
        scaler = GradScaler()

        roc_auc_max = 0.      # best validation AUC seen so far
        loss_min = 99999      # NOTE(review): assigned but never read in this cell
        not_improving = 0     # consecutive epochs without an AUC improvement

        for epoch in range(N_EPOCHS):
            print(time.ctime(), 'Epoch:', epoch, flush=True)
            # train_loss = 0.0
            train_loss = train_epoch(train_data_loader, model, optimizer, scheduler, scaler, device)

            # auc_score is the constant 0 placeholder returned by val_epoch; only sklearn_auc is used.
            (val_loss, _ , auc_score, sklearn_auc) = val_epoch(valid_data_loader_orig, model, device)
            content = 'Orig %d Ep %d, lr: %.7f, train loss: %.5f, val loss: %.5f, sklearnAUC: %.4f'
            values = (fold,
                      epoch,
                      optimizer.param_groups[0]["lr"],
                      np.mean(train_loss),
                      np.mean(val_loss),

                      sklearn_auc
                      )
            print(content % values, flush=True)

            # Count the epoch as non-improving first; the counter is reset below on a new best AUC.
            not_improving += 1
            if sklearn_auc > roc_auc_max:
                # A separate checkpoint file is written for every improving epoch
                # (the epoch number is part of the file name).
                save_checkpoint(model, optimizer, scheduler,
                                scaler, epoch, fold, seed)
                roc_auc_max = sklearn_auc
                not_improving = 0

            if not_improving == EARLY_STOP:
                print('Early Stopping...')
                break