# GPU

In [1]:
# Run `nvidia-smi` through the IPython shell escape; the captured output is a
# list of lines.
gpu_info = !nvidia-smi
# Join the captured lines back into one printable string.
gpu_info = '\n'.join(gpu_info)
print(gpu_info)

Fri Jan  8 09:04:20 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.27.04    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0    25W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# CFG

In [2]:
# File name of the YAML config to load from ./cassava/config/.
CONFIG_NAME = 'config08.yml'
# Experiment title; it is asserted to equal the config's INFO['TITLE'] after loading.
TITLE = '08t-resnext50-512'

In [3]:
! git clone https://github.com/raijin0704/cassava.git
# ====================================================
# CFG
# ====================================================
import yaml

# Load the experiment configuration from the cloned repository.
CONFIG_PATH = f'./cassava/config/{CONFIG_NAME}'
with open(CONFIG_PATH) as f:
    # safe_load instead of yaml.load(f): calling yaml.load without a Loader is
    # deprecated (it warns on PyYAML 5.x and is a TypeError on PyYAML 6) and
    # the full loader can construct arbitrary Python objects from the file.
    config = yaml.safe_load(f)

# The config is split into three top-level sections.
INFO = config['info']
TAG = config['tag']
CFG = config['cfg']

# This notebook is the training notebook: force train mode on, inference off.
CFG['train'] = True
CFG['inference'] = False

# Uncomment to force a quick debug run.
# CFG['debug'] = True

# A debug run trains for a single epoch only.
if CFG['debug']:
    CFG['epochs'] = 1


# Guard against loading a config that belongs to a different experiment.
assert INFO['TITLE'] == TITLE

Cloning into 'cassava'...
remote: Enumerating objects: 55, done.[K
remote: Counting objects: 100% (55/55), done.[K
remote: Compressing objects: 100% (48/48), done.[K
remote: Total 55 (delta 33), reused 10 (delta 5), pack-reused 0[K
Unpacking objects: 100% (55/55), done.


# colab & kaggle notebookでの環境面の処理

## colab

In [5]:
def _colab_kaggle_authority():
    """Download ``kaggle.json`` from Google Drive to ``/root/.kaggle/kaggle.json``.

    Searches Drive for a file named ``kaggle.json``, downloads the first match
    chunk by chunk (printing progress), and restricts the file to owner
    read/write.

    Raises:
        FileNotFoundError: if Drive returns no file named ``kaggle.json``.
    """
    from googleapiclient.discovery import build
    import io, os
    from googleapiclient.http import MediaIoBaseDownload

    drive_service = build('drive', 'v3')
    results = drive_service.files().list(
            q="name = 'kaggle.json'", fields="files(id)").execute()
    kaggle_api_key = results.get('files', [])
    if not kaggle_api_key:
        # Fail with a clear message instead of an IndexError further down.
        raise FileNotFoundError("kaggle.json was not found on Google Drive")

    filename = "/root/.kaggle/kaggle.json"
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    request = drive_service.files().get_media(fileId=kaggle_api_key[0]['id'])
    # Context manager so the file handle is flushed and closed after download.
    with io.FileIO(filename, 'wb') as fh:
        downloader = MediaIoBaseDownload(fh, request)
        done = False
        while done is False:
            status, done = downloader.next_chunk()
            print("Download %d%%." % int(status.progress() * 100))
    # Octal literal: the previous decimal 600 is 0o1130, not rw-------.
    os.chmod(filename, 0o600)


def _install_apex():
    """Clone NVIDIA apex into ``./apex`` and pip-install it with C++/CUDA extensions.

    The clone is skipped when ``./apex`` already exists, so the function can be
    re-run. The current working directory is left unchanged.

    Raises:
        subprocess.CalledProcessError: if ``git clone`` or ``pip install`` fails.
    """
    import os
    import subprocess
    import sys

    if not os.path.isdir('apex'):
        subprocess.run(['git', 'clone', 'https://github.com/NVIDIA/apex'], check=True)
    # Arguments are given as an explicit list: splitting the shell-style command
    # string on spaces left literal double quotes inside the --global-option
    # values. `cwd=` replaces the chdir/chdir-back pair, so a failure cannot
    # strand the process inside ./apex.
    subprocess.run(
        [sys.executable, '-m', 'pip', 'install', '-v', '--no-cache-dir',
         '--global-option=--cpp_ext', '--global-option=--cuda_ext', '.'],
        cwd='apex', check=True)


def process_colab():
    """Prepare a Google Colab runtime and return the paths used by the notebook.

    Mounts Google Drive, authenticates the user for Google Cloud, and installs
    or upgrades the image libraries through pip.

    Returns:
        tuple: ``(DATA_PATH, OUTPUT_DIR, NOTEBOOK_PATH)``.
    """
    import subprocess

    # Mount Google Drive.
    from google.colab import drive
    drive.mount('/content/drive')

    # Authenticate for Google Cloud access.
    from google.colab import auth
    auth.authenticate_user()

    # Kaggle API setup (currently disabled).
    # _colab_kaggle_authority()
    # subprocess.run('pip install --upgrade --force-reinstall --no-deps kaggle'.split(' '))

    # Library installation, one pip invocation per package.
    pip_commands = (
        'pip install --upgrade opencv-python',
        'pip install --upgrade albumentations',
        'pip install timm',
    )
    for pip_command in pip_commands:
        subprocess.run(pip_command.split(' '))
    # apex installation (currently disabled here).
    # if CFG['apex']:
    #     print('installing apex')
    #     _install_apex()
    #     print('done')

    # Paths for input data, run outputs and this notebook's source file.
    data_path = '/content/drive/Shareddrives/便利用/kaggle/cassava/input/'
    output_dir = './output/'
    notebook_path = f'/content/drive/Shareddrives/便利用/kaggle/cassava/notebook/{TITLE}.ipynb'

    return data_path, output_dir, notebook_path

## kaggle notebook

In [6]:
def _kaggle_gcp_authority():
    """Fetch the gcloud credential from Kaggle secrets and register it as the
    TensorFlow credential."""
    from kaggle_secrets import UserSecretsClient
    secrets_client = UserSecretsClient()
    gcloud_credential = secrets_client.get_gcloud_credential()
    secrets_client.set_tensorflow_credential(gcloud_credential)

def process_kaggle():
    """Prepare a Kaggle notebook runtime and return the paths used by the notebook.

    Sets up GCP credentials and adds the pytorch-image-models dataset
    directory to ``sys.path``.

    Returns:
        tuple: ``(DATA_PATH, OUTPUT_DIR, NOTEBOOK_PATH)``.
    """
    import sys

    # GCP credentials
    _kaggle_gcp_authority()

    # Add the pytorch-image-models dataset directory to the import path.
    sys.path.append('../input/pytorch-image-models/pytorch-image-models-master')

    # data path, output directory, notebook path
    return (
        '../input/cassava-leaf-disease-classification/',
        './',
        './__notebook__.ipynb',
    )

## 共通

In [7]:
def process_common():
    """Setup shared by both environments: install mlflow and export the
    project id from the config as ``GCLOUD_PROJECT``."""
    import os
    import subprocess

    # Libraries
    subprocess.run(['pip', 'install', 'mlflow'])

    # Environment variables
    os.environ["GCLOUD_PROJECT"] = INFO['PROJECT_ID']

In [8]:
# Pick the environment-specific setup based on whether google.colab can be imported.
try:
    from google.colab import auth
except ImportError:
    # google.colab is unavailable: use the Kaggle notebook setup.
    DATA_PATH, OUTPUT_DIR, NOTEBOOK_PATH = process_kaggle()
else:
    # google.colab imported successfully: use the Colab setup.
    DATA_PATH, OUTPUT_DIR, NOTEBOOK_PATH = process_colab()
finally:
    # Runs after either branch, including when that branch raised.
    process_common()

Mounted at /content/drive


# install apex

In [10]:
# Install NVIDIA apex from source only when the config enables it and it
# cannot already be imported.
if CFG['apex']:
    try:
        import apex
    except Exception:
        # Clone the repository, install from inside it with the C++ and CUDA
        # extensions requested, then return to the parent directory.
        ! git clone https://github.com/NVIDIA/apex.git
        % cd apex
        !pip install --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
        %cd ..

Cloning into 'apex'...
remote: Enumerating objects: 7872, done.[K
remote: Total 7872 (delta 0), reused 0 (delta 0), pack-reused 7872[K
Receiving objects: 100% (7872/7872), 13.98 MiB | 29.04 MiB/s, done.
Resolving deltas: 100% (5374/5374), done.
/content/apex
  cmdoptions.check_install_build_global(options)
Processing /content/apex
Skipping wheel build for apex, due to binaries being disabled for it.
Installing collected packages: apex
    Running setup.py install for apex ... [?25l[?25hdone
Successfully installed apex-0.1
/content


# Library

In [11]:
# ====================================================
# Library
# ====================================================
import os
import datetime
import math
import time
import random
import glob
import shutil
from pathlib import Path
from contextlib import contextmanager
from collections import defaultdict, Counter

import scipy as sp
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

from tqdm.auto import tqdm
from functools import partial

import cv2
from PIL import Image

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam, SGD
import torchvision.models as models
from torch.nn.parameter import Parameter
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, CosineAnnealingLR, ReduceLROnPlateau

from albumentations import (
    Compose, OneOf, Normalize, Resize, RandomResizedCrop, RandomCrop, HorizontalFlip, VerticalFlip, 
    RandomBrightness, RandomContrast, RandomBrightnessContrast, Rotate, ShiftScaleRotate, Cutout, 
    IAAAdditiveGaussianNoise, Transpose
    )
from albumentations.pytorch import ToTensorV2
from albumentations import ImageOnlyTransform

import timm
import mlflow

import warnings 
warnings.filterwarnings('ignore')

# amp is imported only when the config enables apex.
if CFG['apex']:
    from apex import amp

# In debug mode fall back to the CPU when CUDA is unavailable; otherwise the
# CUDA device is always selected.
if CFG['debug']:
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
else:
    device = torch.device('cuda')
# Run start time; the MMDDHHMM string is used as a prefix for this run's
# output file names.
start_time = datetime.datetime.now()
start_time_str = start_time.strftime('%m%d%H%M')

# Directory settings

In [12]:
# ====================================================
# Directory settings
# ====================================================
# Start each run from an empty output directory.
# Never wipe the current working directory itself: the Kaggle setup uses
# OUTPUT_DIR = './', and rmtree on it would delete everything in the working
# directory (including the cloned repo) before failing to remove '.'.
if (os.path.exists(OUTPUT_DIR)
        and os.path.realpath(OUTPUT_DIR) != os.path.realpath(os.getcwd())):
    shutil.rmtree(OUTPUT_DIR)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# save basic files

In [13]:
# with open(f'{OUTPUT_DIR}/{start_time_str}_TAG.json', 'w') as f:
#     json.dump(TAG, f, indent=4)
    
# with open(f'{OUTPUT_DIR}/{start_time_str}_CFG.json', 'w') as f:
#     json.dump(CFG, f, indent=4)

import shutil
# Copy this notebook's source file into the output directory, prefixed with
# the run timestamp and the experiment title.
notebook_path = f'{OUTPUT_DIR}/{start_time_str}_{TITLE}.ipynb'
shutil.copy2(NOTEBOOK_PATH, notebook_path)

'./output//01080913_08t-resnext50-512.ipynb'

# Data Loading

In [14]:
# Training table, sample submission (used as the test table) and the
# label-number -> disease-name mapping.
train = pd.read_csv(f'{DATA_PATH}/train.csv')
test = pd.read_csv(f'{DATA_PATH}/sample_submission.csv')
label_map = pd.read_json(f'{DATA_PATH}/label_num_to_disease_map.json', 
                         orient='index')

# Debug runs use a fixed-seed sample of 1000 training rows.
if CFG['debug']:
    train = train.sample(n=1000, random_state=CFG['seed']).reset_index(drop=True)

# Utils

In [15]:
# ====================================================
# Utils
# ====================================================
def get_score(y_true, y_pred):
    """Return the accuracy of ``y_pred`` measured against ``y_true``."""
    score = accuracy_score(y_true, y_pred)
    return score


@contextmanager
def timer(name):
    """Context manager that logs a start line and, on normal exit, the elapsed
    wall-clock seconds for the block labelled ``name``."""
    started_at = time.time()
    LOGGER.info(f'[{name}] start')
    yield
    elapsed = time.time() - started_at
    LOGGER.info(f'[{name}] done in {elapsed:.0f} s.')


def init_logger(log_file=OUTPUT_DIR+'train.log'):
    """Return the module logger writing bare messages to the console and to ``log_file``.

    Args:
        log_file: Path of the log file. The default is computed once, when the
            function is defined, from the value ``OUTPUT_DIR`` has at that time.

    Returns:
        logging.Logger: logger at INFO level with exactly one stream handler
        and one file handler attached.
    """
    from logging import getLogger, FileHandler,  Formatter,  StreamHandler
    from logging import INFO as INFO_
    logger = getLogger(__name__)
    logger.setLevel(INFO_)
    # getLogger returns the same object on every call, so re-running this cell
    # used to stack handlers and print each message several times. Drop and
    # close any handlers left from a previous call first.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=log_file)
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

# Log file for this run, prefixed with the run timestamp.
logger_path = OUTPUT_DIR+f'{start_time_str}_train.log'
LOGGER = init_logger(logger_path)


def seed_torch(seed=42):
    """Seed Python, NumPy and PyTorch (CPU and CUDA) RNGs, export
    ``PYTHONHASHSEED``, and enable deterministic cuDNN."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True

seed_torch(seed=CFG['seed'])


class EarlyStopping:
    """Stop training when the validation loss has not improved for ``patience`` calls.

    Each call compares ``-val_loss`` with the best score so far. On improvement
    (or a tie) the best checkpoint is written to ``save_path``; otherwise the
    patience counter is incremented and, if ``save_latest_path`` is set, the
    current state is written there. A NaN loss stops training immediately.
    """
    def __init__(self, patience=7, verbose=False, save_path='checkpoint.pt',
                 counter=0, best_score=None, save_latest_path=None):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement. 
                            Default: False
            save_path (str): File path the best checkpoint is saved to.
                             Default: "'checkpoint.pt'"
            counter (int): Initial value of the patience counter (e.g. when resuming).
                           Default: 0
            best_score (float or None): Initial best score, i.e. the negated
                            validation loss. Default: None
            save_latest_path (str or None): File path for the latest
                            non-improving state; nothing is saved when None.
                            Default: None
        """
        self.patience = patience
        self.verbose = verbose
        self.save_path = save_path
        self.counter = counter
        self.best_score = best_score
        self.save_latest_path = save_latest_path
        self.early_stop = False
        # np.inf: the np.Inf alias was removed in NumPy 2.0.
        self.val_loss_min = np.inf

    def __call__(self, val_loss, model, preds, epoch):
        """Update the stopping state with this epoch's validation loss."""
        score = -val_loss

        # Stop training as soon as the loss becomes NaN. This is checked first
        # so that a NaN on the very first call is not stored as the best score.
        if math.isnan(score):
            self.early_stop = True
        elif self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model, preds, epoch)
        elif score >= self.best_score:
            self.best_score = score
            # Reset before saving so the best checkpoint records counter 0
            # rather than the stale pre-improvement count.
            self.counter = 0
            self.save_checkpoint(val_loss, model, preds, epoch)
        else:
            self.counter += 1
            if self.save_latest_path is not None:
                self.save_latest(val_loss, model, preds, epoch, score)
            if self.verbose:
                print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True

    def save_checkpoint(self, val_loss, model, preds, epoch):
        '''Saves model when validation loss decrease.'''
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.10f} --> {val_loss:.10f}).  Saving model ...')
        torch.save({'model': model.state_dict(), 'preds': preds, 
                    'epoch' : epoch, 'best_score' : self.best_score, 'counter' : self.counter},
                   self.save_path)
        self.val_loss_min = val_loss

    def save_latest(self, val_loss, model, preds, epoch, score):
        '''Saves latest model.'''
        # val_loss_min is deliberately left untouched: this is only called for
        # a non-improving epoch, and overwriting the minimum here made the
        # next "Validation loss decreased" message report the wrong value.
        torch.save({'model': model.state_dict(), 'preds': preds, 
                    'epoch' : epoch, 'score' : score, 'counter' : self.counter},
                   self.save_latest_path)

# CV split

In [16]:
# Work on a copy so the original training table is left untouched.
folds = train.copy()
# Stratified, shuffled K-fold split on the target column with a fixed seed.
Fold = StratifiedKFold(n_splits=CFG['n_fold'], shuffle=True, random_state=CFG['seed'])
for n, (train_index, val_index) in enumerate(Fold.split(folds, folds[CFG['target_col']])):
    # Tag each row with the index of the fold in which it is validation data.
    folds.loc[val_index, 'fold'] = int(n)
# Cast the fold column to int after filling it row-wise.
folds['fold'] = folds['fold'].astype(int)
# Show the class counts per fold.
print(folds.groupby(['fold', CFG['target_col']]).size())

fold  label
0     0         218
      1         438
      2         477
      3        2631
      4         516
1     0         218
      1         438
      2         477
      3        2631
      4         516
2     0         217
      1         438
      2         477
      3        2632
      4         515
3     0         217
      1         438
      2         477
      3        2632
      4         515
4     0         217
      1         437
      2         478
      3        2632
      4         515
dtype: int64


# Dataset

In [17]:
# ====================================================
# Dataset
# ====================================================
class TrainDataset(Dataset):
    """Dataset yielding ``(image, label)`` pairs read from ``{DATA_PATH}/train_images``.

    Args:
        df: DataFrame with an ``image_id`` column (file names) and a ``label`` column.
        transform: Optional callable invoked as ``transform(image=image)`` that
            returns a mapping with the transformed image under ``'image'``.
    """
    def __init__(self, df, transform=None):
        self.df = df
        self.file_names = df['image_id'].values
        self.labels = df['label'].values
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Load, colour-convert and transform the image at ``idx``.

        Raises:
            FileNotFoundError: if the image file cannot be read.
        """
        file_name = self.file_names[idx]
        file_path = f'{DATA_PATH}/train_images/{file_name}'
        image = cv2.imread(file_path)
        if image is None:
            # cv2.imread returns None instead of raising; fail here with the
            # offending path rather than with an opaque cvtColor error.
            raise FileNotFoundError(f'Could not read image: {file_path}')
        # OpenCV loads images as BGR; convert to RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        if self.transform is not None:
            augmented = self.transform(image=image)
            image = augmented['image']
        label = torch.tensor(self.labels[idx]).long()
        return image, label
    

class TestDataset(Dataset):
    """Dataset yielding images (no labels) read from ``{DATA_PATH}/test_images``.

    Args:
        df: DataFrame with an ``image_id`` column (file names).
        transform: Optional callable invoked as ``transform(image=image)`` that
            returns a mapping with the transformed image under ``'image'``.
    """
    def __init__(self, df, transform=None):
        self.df = df
        self.file_names = df['image_id'].values
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Load, colour-convert and transform the image at ``idx``.

        Raises:
            FileNotFoundError: if the image file cannot be read.
        """
        file_name = self.file_names[idx]
        file_path = f'{DATA_PATH}/test_images/{file_name}'
        image = cv2.imread(file_path)
        if image is None:
            # cv2.imread returns None instead of raising; fail here with the
            # offending path rather than with an opaque cvtColor error.
            raise FileNotFoundError(f'Could not read image: {file_path}')
        # OpenCV loads images as BGR; convert to RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        if self.transform is not None:
            augmented = self.transform(image=image)
            image = augmented['image']
        return image

In [18]:
# train_dataset = TrainDataset(train, transform=None)

# for i in range(1):
#     image, label = train_dataset[i]
#     plt.imshow(image)
#     plt.title(f'label: {label}')
#     plt.show() 

# Transforms

In [19]:
def _get_augmentations(aug_list):
    """Build the list of albumentations transforms named in ``aug_list``.

    Transforms are created in the order given and a ``ToTensorV2`` is always
    appended last.

    Raises:
        ValueError: if a name in ``aug_list`` is not one of the supported transforms.
    """
    # Each entry is a zero-argument factory, so CFG is only read for the
    # transforms that are actually requested.
    builders = {
        'Resize': lambda: Resize(CFG['size'], CFG['size']),
        'RandomResizedCrop': lambda: RandomResizedCrop(CFG['size'], CFG['size']),
        'Transpose': lambda: Transpose(p=0.5),
        'HorizontalFlip': lambda: HorizontalFlip(p=0.5),
        'VerticalFlip': lambda: VerticalFlip(p=0.5),
        'ShiftScaleRotate': lambda: ShiftScaleRotate(p=0.5),
        'Normalize': lambda: Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    }

    process = []
    for aug in aug_list:
        # Non-string entries can never match a name; treat them as unknown.
        build = builders.get(aug) if isinstance(aug, str) else None
        if build is None:
            raise ValueError(f'{aug} is not suitable')
        process.append(build())

    process.append(ToTensorV2())
    return process

In [20]:
# ====================================================
# Transforms
# ====================================================
def get_transforms(*, data):
    """Return the composed transform pipeline for ``data``.

    ``'train'`` uses the augmentations listed in ``TAG['augmentation']``;
    ``'valid'`` uses Resize followed by Normalize. Any other value returns None.
    """
    if data == 'train':
        aug_names = TAG['augmentation']
    elif data == 'valid':
        aug_names = ['Resize', 'Normalize']
    else:
        return None
    return Compose(_get_augmentations(aug_names))

In [21]:
# train_dataset = TrainDataset(train, transform=get_transforms(data='train'))

# for i in range(1):
#     image, label = train_dataset[i]
#     plt.imshow(image[0])
#     plt.title(f'label: {label}')
#     plt.show() 

# Bi-tempered logistic loss

In [22]:
def log_t(u, t):
    """Tempered logarithm of `u` at temperature `t` (the plain log when t == 1)."""
    if t != 1.0:
        one_minus_t = 1.0 - t
        return (u.pow(one_minus_t) - 1.0) / one_minus_t
    return u.log()

def exp_t(u, t):
    """Tempered exponential of `u` at temperature `t` (the plain exp when t == 1)."""
    if t != 1:
        one_minus_t = 1.0 - t
        # relu clamps the base at zero before the power is taken.
        return (1.0 + one_minus_t * u).relu().pow(1.0 / one_minus_t)
    return u.exp()

def compute_normalization_fixed_point(activations, t, num_iters):
    """Normalization value per example via fixed-point iteration (t > 1.0).

    Args:
      activations: A multi-dimensional tensor with last dimension `num_classes`.
      t: Temperature 2 (> 1.0 for tail heaviness).
      num_iters: Number of fixed-point iterations to run.
    Return: A tensor shaped like `activations` except that the last dimension is 1.
    """
    # Shift by the per-example maximum over the last dimension.
    mu = torch.max(activations, -1, keepdim=True)[0]
    shifted = activations - mu

    current = shifted
    for _ in range(num_iters):
        partition = exp_t(current, t).sum(-1, keepdim=True)
        current = shifted * partition.pow(1.0-t)

    partition = exp_t(current, t).sum(-1, keepdim=True)
    return - log_t(1.0 / partition, t) + mu

def compute_normalization_binary_search(activations, t, num_iters):
    """Normalization value per example via binary search (t < 1.0).

    Args:
      activations: A multi-dimensional tensor with last dimension `num_classes`.
      t: Temperature 2 (< 1.0 for finite support).
      num_iters: Number of bisection steps to run.
    Return: A tensor of same rank as activation with the last dimension being 1.
    """
    mu = torch.max(activations, -1, keepdim=True)[0]
    shifted = activations - mu

    # Count, per example, the entries above the threshold -1 / (1 - t).
    above_threshold = (shifted > -1.0 / (1.0-t)).to(torch.int32)
    effective_dim = above_threshold.sum(dim=-1, keepdim=True).to(activations.dtype)

    partition_shape = activations.shape[:-1] + (1,)
    lower = torch.zeros(partition_shape, dtype=activations.dtype, device=activations.device)
    upper = -log_t(1.0/effective_dim, t) * torch.ones_like(lower)

    for _ in range(num_iters):
        midpoint = (upper + lower)/2.0
        sum_probs = exp_t(shifted - midpoint, t).sum(dim=-1, keepdim=True)
        # update is 1 where the probabilities sum to less than 1, else 0;
        # it selects which bound moves to the midpoint.
        update = (sum_probs < 1.0).to(activations.dtype)
        lower = (lower * update + (1.0-update) * midpoint).reshape(partition_shape)
        upper = (upper * (1.0 - update) + update * midpoint).reshape(partition_shape)

    return (upper + lower)/2.0 + mu

class ComputeNormalization(torch.autograd.Function):
    """
    Autograd function wrapping the normalization computation with a
    hand-written backward pass. See compute_normalization.
    """
    @staticmethod
    def forward(ctx, activations, t, num_iters):
        # Binary search for t < 1.0, fixed-point iteration otherwise.
        solver = (compute_normalization_binary_search if t < 1.0
                  else compute_normalization_fixed_point)
        normalization_constants = solver(activations, t, num_iters)

        ctx.save_for_backward(activations, normalization_constants)
        ctx.t=t
        return normalization_constants

    @staticmethod
    def backward(ctx, grad_output):
        activations, normalization_constants = ctx.saved_tensors
        temperature = ctx.t
        probabilities = exp_t(activations - normalization_constants, temperature)
        # probabilities**t, renormalised over the last dimension.
        escorts = probabilities.pow(temperature)
        escorts = escorts / escorts.sum(dim=-1, keepdim=True)

        # No gradient flows to `t` or `num_iters`.
        return escorts * grad_output, None, None

def compute_normalization(activations, t, num_iters=5):
    """Normalization value for each example, with a custom backward pass.

    Args:
      activations: A multi-dimensional tensor with last dimension `num_classes`.
      t: Temperature 2 (> 1.0 for tail heaviness, < 1.0 for finite support).
      num_iters: Number of iterations to run the method.
    Return: A tensor of same rank as activation with the last dimension being 1.
    """
    normalization_constants = ComputeNormalization.apply(activations, t, num_iters)
    return normalization_constants

def tempered_sigmoid(activations, t, num_iters = 5):
    """Tempered sigmoid, computed as a two-class tempered softmax.

    Args:
      activations: Activations for the positive class for binary classification.
      t: Temperature tensor > 0.0.
      num_iters: Number of iterations to run the method.
    Returns:
      A probabilities tensor.
    """
    # Pair every activation with a zero activation along a new last axis.
    zero_activations = torch.zeros_like(activations)
    two_class_activations = torch.stack((activations, zero_activations), dim=-1)
    two_class_probabilities = tempered_softmax(two_class_activations, t, num_iters)
    # Keep the entry that corresponds to the supplied activations.
    return two_class_probabilities[..., 0]


def tempered_softmax(activations, t, num_iters=5):
    """Tempered softmax over the last dimension.

    Args:
      activations: A multi-dimensional tensor with last dimension `num_classes`.
      t: Temperature > 1.0.
      num_iters: Number of iterations to run the method.
    Returns:
      A probabilities tensor.
    """
    if t != 1.0:
        normalization_constants = compute_normalization(activations, t, num_iters)
        return exp_t(activations - normalization_constants, t)
    # t == 1.0 is the ordinary softmax.
    return activations.softmax(dim=-1)

def bi_tempered_binary_logistic_loss(activations,
        labels,
        t1,
        t2,
        label_smoothing = 0.0,
        num_iters=5,
        reduction='mean'):

    """Bi-Tempered binary logistic loss, computed through the two-class
    multi-class loss.

    Args:
      activations: A tensor containing activations for class 1.
      labels: A tensor with shape as activations, containing probabilities for class 1
      t1: Temperature 1 (< 1.0 for boundedness).
      t2: Temperature 2 (> 1.0 for tail heaviness, < 1.0 for finite support).
      label_smoothing: Label smoothing
      num_iters: Number of iterations to run the method.
      reduction: Passed through to bi_tempered_logistic_loss.
    Returns:
      A loss tensor.
    """
    # Expand to two classes: (class-1 activation, 0) and (p, 1 - p).
    two_class_activations = torch.stack(
        (activations, torch.zeros_like(activations)), dim=-1)
    positive_labels = labels.to(activations.dtype)
    negative_labels = 1.0 - labels.to(activations.dtype)
    two_class_labels = torch.stack((positive_labels, negative_labels), dim=-1)

    return bi_tempered_logistic_loss(
        two_class_activations,
        two_class_labels,
        t1,
        t2,
        label_smoothing=label_smoothing,
        num_iters=num_iters,
        reduction=reduction)

def bi_tempered_logistic_loss(activations,
        labels,
        t1,
        t2,
        label_smoothing=0.0,
        num_iters=5,
        reduction = 'mean'):

    """Bi-Tempered Logistic Loss.
    Args:
      activations: A multi-dimensional tensor with last dimension `num_classes`.
      labels: A tensor with shape and dtype as activations (onehot), 
        or a long tensor of one dimension less than activations (pytorch standard)
      t1: Temperature 1 (< 1.0 for boundedness).
      t2: Temperature 2 (> 1.0 for tail heaviness, < 1.0 for finite support).
      label_smoothing: Label smoothing parameter between [0, 1). Default 0.0.
      num_iters: Number of iterations to run the method. Default 5.
      reduction: ``'none'`` | ``'mean'`` | ``'sum'``. Default ``'mean'``.
        ``'none'``: No reduction is applied, return shape is shape of
        activations without the last dimension.
        ``'mean'``: Loss is averaged over all examples.
        ``'sum'``: Loss is summed over all examples.
    Returns:
      A loss tensor.
    Raises:
      ValueError: if `reduction` is not one of the three supported values.
    """
    # Validate up front: an unknown reduction used to fall off the end of the
    # function and silently return None.
    if reduction not in ('none', 'sum', 'mean'):
        raise ValueError(
            f"reduction must be 'none', 'mean' or 'sum', got {reduction!r}")

    if len(labels.shape)<len(activations.shape): #not one-hot
        labels_onehot = torch.zeros_like(activations)
        # Scatter along the last (class) axis. This was hard-coded to dim 1,
        # which is only the class axis for 2-D activations.
        labels_onehot.scatter_(-1, labels[..., None], 1)
    else:
        labels_onehot = labels

    if label_smoothing > 0:
        num_classes = labels_onehot.shape[-1]
        labels_onehot = ( 1 - label_smoothing * num_classes / (num_classes - 1) ) \
                * labels_onehot + \
                label_smoothing / (num_classes - 1)

    probabilities = tempered_softmax(activations, t2, num_iters)

    # The 1e-10 keeps log_t finite for labels that are exactly zero.
    loss_values = labels_onehot * log_t(labels_onehot + 1e-10, t1) \
            - labels_onehot * log_t(probabilities, t1) \
            - labels_onehot.pow(2.0 - t1) / (2.0 - t1) \
            + probabilities.pow(2.0 - t1) / (2.0 - t1)
    loss_values = loss_values.sum(dim = -1) #sum over classes

    if reduction == 'none':
        return loss_values
    if reduction == 'sum':
        return loss_values.sum()
    return loss_values.mean()

# MODEL

In [23]:
# ====================================================
# MODEL
# ====================================================
class CustomModel(nn.Module):
    """timm backbone whose classification head outputs ``CFG['target_size']`` values.

    Args:
        model_name: Architecture name passed to ``timm.create_model``.
        pretrained: Whether to load pretrained weights. Default False.

    Raises:
        ValueError: if the created model exposes no known way to replace its head.
    """
    def __init__(self, model_name, pretrained=False):
        super().__init__()
        self.model = timm.create_model(model_name, pretrained=pretrained)
        if hasattr(self.model, 'classifier'):
            n_features = self.model.classifier.in_features
            self.model.classifier = nn.Linear(n_features, CFG['target_size'])
        elif hasattr(self.model, 'fc'):
            n_features = self.model.fc.in_features
            self.model.fc = nn.Linear(n_features, CFG['target_size'])
        elif hasattr(self.model, 'reset_classifier'):
            # Architectures whose head is neither `classifier` nor `fc` used
            # to keep their original head, silently giving the wrong number
            # of outputs. Let timm rebuild the head instead.
            self.model.reset_classifier(CFG['target_size'])
        else:
            raise ValueError(
                f'{model_name} has no classifier/fc head and no reset_classifier()')

    def forward(self, x):
        x = self.model(x)
        return x

In [24]:
# Smoke test: push one training batch through a model created without
# pretrained weights and print the raw outputs.
model = CustomModel(model_name=TAG['model_name'], pretrained=False)
train_dataset = TrainDataset(train, transform=get_transforms(data='train'))
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True,
                          num_workers=4, pin_memory=True, drop_last=True)

for image, label in train_loader:
    output = model(image)
    print(output)
    # Only the first batch is needed.
    break

tensor([[ 0.1942, -0.0228,  0.0547,  0.3079, -0.0856],
        [ 0.1010,  0.0974,  0.0921,  0.5386,  0.1265],
        [ 0.1124,  0.0810,  0.0548,  0.4364, -0.0349],
        [ 0.1089,  0.0594,  0.0561,  0.5599,  0.1196]],
       grad_fn=<AddmmBackward>)


# Helper functions

In [25]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Tracks the most recent value together with a weighted running average."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero the last value, the average, the weighted sum and the count."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record `val` with weight `n` and refresh the running average."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


def asMinutes(s):
    """Format a duration in seconds as ``'<minutes>m <seconds>s'``."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Return ``'<elapsed> (remain <remaining>)'`` for a task started at
    ``since`` (a ``time.time()`` value) that is a fraction ``percent`` done."""
    elapsed = time.time() - since
    # Extrapolate the total duration from the completed fraction.
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (remain %s)' % (asMinutes(elapsed), asMinutes(remaining))




In [26]:
# ====================================================
# loss
# ====================================================
def get_loss(criterion, y_preds, labels):
    """Evaluate ``criterion`` using the call convention for ``TAG['criterion']``.

    Args:
        criterion: Loss callable, as returned by ``get_criterion``.
        y_preds: Model outputs.
        labels: Target labels.

    Returns:
        The loss value returned by ``criterion``.

    Raises:
        ValueError: if ``TAG['criterion']`` is not a supported name (this used
            to surface as an UnboundLocalError on ``loss``).
    """
    criterion_name = TAG['criterion']
    if criterion_name == 'CrossEntropyLoss':
        return criterion(y_preds, labels)
    if criterion_name == 'bi_tempered_logistic_loss':
        # The bi-tempered loss additionally needs its two temperatures.
        return criterion(y_preds, labels, t1=CFG['bi_tempered_loss_t1'], t2=CFG['bi_tempered_loss_t2'])
    raise ValueError(f'{criterion_name} is not a supported criterion')

In [27]:
# ====================================================
# Helper functions
# ====================================================
def train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        train_loader: Iterable yielding ``(images, labels)`` batches.
        model: Model to train; switched to train mode here.
        criterion: Loss callable, evaluated through ``get_loss``.
        optimizer: Optimizer stepped every ``CFG['gradient_accumulation_steps']`` batches.
        epoch: Zero-based epoch index, used only for the progress output.
        scheduler: Not used inside this function.
        device: Device the batches are moved to.

    Returns:
        The batch-size-weighted average of the (unscaled) batch losses.
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    # switch to train mode
    model.train()
    start = end = time.time()
    for step, (images, labels) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        images = images.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        y_preds = model(images)
        loss = get_loss(criterion, y_preds, labels)
        # record loss (before it is divided for gradient accumulation)
        losses.update(loss.item(), batch_size)
        if CFG['gradient_accumulation_steps'] > 1:
            loss = loss / CFG['gradient_accumulation_steps']
        if CFG['apex']:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        # NOTE(review): clipping runs on every micro-batch, i.e. on partially
        # accumulated gradients when gradient_accumulation_steps > 1 - confirm
        # this is intended rather than clipping once per optimizer step.
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG['max_grad_norm'])
        if (step + 1) % CFG['gradient_accumulation_steps'] == 0:
            optimizer.step()
            optimizer.zero_grad()
        # NOTE(review): gradients of trailing batches that do not complete an
        # accumulation window are never applied or cleared in this function.
        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if step % CFG['print_freq'] == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Data {data_time.val:.3f} ({data_time.avg:.3f}) '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  #'LR: {lr:.6f}  '
                  .format(
                   epoch+1, step, len(train_loader), batch_time=batch_time,
                   data_time=data_time, loss=losses,
                   remain=timeSince(start, float(step+1)/len(train_loader)),
                   grad_norm=grad_norm,
                   #lr=scheduler.get_lr()[0],
                   ))
    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    """Evaluate ``model`` on ``valid_loader``.

    Args:
        valid_loader: Iterable yielding ``(images, labels)`` batches.
        model: Model to evaluate; switched to eval mode here.
        criterion: Loss callable, evaluated through ``get_loss``.
        device: Device the batches are moved to.

    Returns:
        tuple: ``(average_loss, predictions)``. ``average_loss`` is the
        batch-size-weighted mean loss; ``predictions`` is the concatenation of
        each batch's softmax (over dim 1) of the model outputs, as a NumPy array.
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    # switch to evaluation mode
    model.eval()
    preds = []
    start = end = time.time()
    for step, (images, labels) in enumerate(valid_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        images = images.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        # compute loss
        with torch.no_grad():
            y_preds = model(images)
        loss = get_loss(criterion, y_preds, labels)
        losses.update(loss.item(), batch_size)
        # record the class probabilities for this batch
        preds.append(y_preds.softmax(1).to('cpu').numpy())
        # (The loss used to be divided by gradient_accumulation_steps here,
        # after it had already been recorded; the result was never read, so
        # that dead code has been removed.)
        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if step % CFG['print_freq'] == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Data {data_time.val:.3f} ({data_time.avg:.3f}) '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(
                   step, len(valid_loader), batch_time=batch_time,
                   data_time=data_time, loss=losses,
                   remain=timeSince(start, float(step+1)/len(valid_loader)),
                   ))
    predictions = np.concatenate(preds)
    return losses.avg, predictions


def inference(model, states, test_loader, device):
    """Predict class probabilities for ``test_loader``, averaged over ``states``.

    For every batch, each entry of ``states`` is loaded into ``model`` in
    turn, the batch is scored in eval mode without gradients, and the
    softmax outputs are averaged across the entries.

    Args:
        model: Network whose weights are overwritten by each state.
        states: Sequence of weights to ensemble. Each entry may be either a
            bare state dict or a checkpoint dict that stores the weights
            under the ``'model'`` key.
        test_loader: Loader yielding image batches (no labels).
        device: Device the model and batches are moved to.

    Returns:
        Numpy array with the averaged probabilities of all batches
        concatenated along axis 0.
    """
    model.to(device)
    tk0 = tqdm(enumerate(test_loader), total=len(test_loader))
    probs = []
    for i, (images) in tk0:
        images = images.to(device)
        avg_preds = []
        for state in states:
            # Checkpoint files may wrap the weights together with metadata;
            # unwrap them so both layouts can be passed in.
            if isinstance(state, dict) and 'model' in state:
                weights = state['model']
            else:
                weights = state
            model.load_state_dict(weights)
            model.eval()
            with torch.no_grad():
                y_preds = model(images)
            avg_preds.append(y_preds.softmax(1).to('cpu').numpy())
        avg_preds = np.mean(avg_preds, axis=0)
        probs.append(avg_preds)
    probs = np.concatenate(probs)
    return probs

# Train loop

In [28]:
# ====================================================
# scheduler 
# ====================================================
def get_scheduler(optimizer):
    """Build the learning-rate scheduler selected by ``TAG['scheduler']``.

    Args:
        optimizer: Optimizer the scheduler is attached to.

    Returns:
        A ``ReduceLROnPlateau``, ``CosineAnnealingLR`` or
        ``CosineAnnealingWarmRestarts`` instance configured from ``CFG``.

    Raises:
        ValueError: If ``TAG['scheduler']`` names none of the supported
            schedulers.
    """
    name = TAG['scheduler']
    if name == 'ReduceLROnPlateau':
        return ReduceLROnPlateau(optimizer, mode='min', factor=CFG['factor'], patience=CFG['patience'], verbose=True, eps=CFG['eps'])
    if name == 'CosineAnnealingLR':
        return CosineAnnealingLR(optimizer, T_max=CFG['T_max'], eta_min=CFG['min_lr'], last_epoch=-1)
    if name == 'CosineAnnealingWarmRestarts':
        return CosineAnnealingWarmRestarts(optimizer, T_0=CFG['T_0'], T_mult=1, eta_min=CFG['min_lr'], last_epoch=-1)
    # Fail with a clear message instead of an UnboundLocalError on return.
    raise ValueError(f'Unsupported scheduler: {name!r}')

# ====================================================
# criterion
# ====================================================
def get_criterion():
    """Return the loss selected by ``TAG['criterion']``.

    Returns:
        An ``nn.CrossEntropyLoss`` instance, or the
        ``bi_tempered_logistic_loss`` function itself.

    Raises:
        ValueError: If ``TAG['criterion']`` names none of the supported
            losses.
    """
    name = TAG['criterion']
    if name == 'CrossEntropyLoss':
        return nn.CrossEntropyLoss()
    if name == 'bi_tempered_logistic_loss':
        return bi_tempered_logistic_loss
    # Fail with a clear message instead of an UnboundLocalError on return.
    raise ValueError(f'Unsupported criterion: {name!r}')

In [38]:
# ====================================================
# Train loop
# ====================================================
def train_loop(folds, fold):
    """Train and validate the model for a single cross-validation fold.

    Rows of ``folds`` whose ``fold`` column equals ``fold`` form the
    validation split; all other rows form the training split. If a "latest"
    checkpoint for this fold exists in ``OUTPUT_DIR``, the model weights,
    starting epoch and early-stopping state are restored from the saved
    checkpoints. The optimizer and scheduler are always created fresh.

    Args:
        folds: DataFrame with a ``fold`` column and the
            ``CFG['target_col']`` label column.
        fold: Fold number held out for validation.

    Returns:
        The validation rows with one probability column per class (named
        ``'0'``, ``'1'``, ...) and a ``preds`` column holding the argmax,
        both taken from the ``'preds'`` entry of the best checkpoint.
    """

    LOGGER.info(f"========== fold: {fold} training ==========")
    if not CFG['debug']:
        mlflow.set_tag('running.fold', str(fold))
    # ====================================================
    # loader
    # ====================================================
    trn_idx = folds[folds['fold'] != fold].index
    val_idx = folds[folds['fold'] == fold].index

    train_folds = folds.loc[trn_idx].reset_index(drop=True)
    valid_folds = folds.loc[val_idx].reset_index(drop=True)
    # Labels do not change between epochs, so read them once.
    valid_labels = valid_folds[CFG['target_col']].values

    train_dataset = TrainDataset(train_folds, 
                                 transform=get_transforms(data='train'))
    valid_dataset = TrainDataset(valid_folds, 
                                 transform=get_transforms(data='valid'))

    train_loader = DataLoader(train_dataset, 
                              batch_size=CFG['batch_size'], 
                              shuffle=True, 
                              num_workers=CFG['num_workers'], pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset, 
                              batch_size=CFG['batch_size'], 
                              shuffle=False, 
                              num_workers=CFG['num_workers'], pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer & criterion
    # ====================================================
    best_model_path = OUTPUT_DIR+f'{TAG["model_name"]}_fold{fold}_best.pth'
    latest_model_path = OUTPUT_DIR+f'{TAG["model_name"]}_fold{fold}_latest.pth'

    model = CustomModel(TAG['model_name'], pretrained=True)
    model.to(device)
    # Resume from an in-progress checkpoint if one exists
    if os.path.isfile(latest_model_path):
        state_latest = torch.load(latest_model_path)
        state_best = torch.load(best_model_path)
        model.load_state_dict(state_latest['model'])
        epoch_start = state_latest['epoch']+1
        er_counter = state_latest['counter']
        er_best_score = state_best['best_score']

        LOGGER.info(f'Retrain model in epoch:{epoch_start}, best_score:{er_best_score:.3f}, counter:{er_counter}')
    else:
        epoch_start = 0
        er_best_score = None
        er_counter = 0

    optimizer = Adam(model.parameters(), lr=CFG['lr'], weight_decay=CFG['weight_decay'], amsgrad=False)
    scheduler = get_scheduler(optimizer)
    criterion = get_criterion()

    # ====================================================
    # apex
    # ====================================================
    if CFG['apex']:
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0)

    # ====================================================
    # loop
    # ====================================================
    early_stopping = EarlyStopping(
                            patience=CFG['early_stopping_round'], 
                            verbose=True,
                            save_path=best_model_path,
                            counter=er_counter, best_score=er_best_score, 
                            save_latest_path=latest_model_path)
    
    for epoch in range(epoch_start, CFG['epochs']):
        
        start_time = time.time()
        
        # train
        avg_loss = train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, preds = valid_fn(valid_loader, model, criterion, device)
        
        # early stopping (the epoch that triggers the stop is not logged below)
        early_stopping(avg_val_loss, model, preds, epoch)
        if early_stopping.early_stop:
            print(f'Epoch {epoch+1} - early stopping')
            break
        
        # ReduceLROnPlateau needs the monitored value; the cosine schedulers do not
        if isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(avg_val_loss)
        elif isinstance(scheduler, (CosineAnnealingLR, CosineAnnealingWarmRestarts)):
            scheduler.step()

        # scoring
        score = get_score(valid_labels, preds.argmax(1))

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Accuracy: {score}')
        
        # log mlflow
        if not CFG['debug']:
            mlflow.log_metric(f"fold{fold} avg_train_loss", avg_loss, step=epoch)
            mlflow.log_metric(f"fold{fold} avg_valid_loss", avg_val_loss, step=epoch)
            mlflow.log_metric(f"fold{fold} score", score, step=epoch)
            # Read the learning rate from the optimizer: this works for every
            # scheduler type, whereas ReduceLROnPlateau does not provide
            # get_last_lr() in older PyTorch releases.
            mlflow.log_metric(f"fold{fold} lr", optimizer.param_groups[0]['lr'], step=epoch)
            mlflow.log_artifact(best_model_path)
            if os.path.isfile(latest_model_path):
                mlflow.log_artifact(latest_model_path)
    
    check_point = torch.load(best_model_path)
    best_preds = check_point['preds']
    # One probability column per class, sized from the predictions themselves
    valid_folds[[str(c) for c in range(best_preds.shape[1])]] = best_preds
    valid_folds['preds'] = best_preds.argmax(1)

    return valid_folds

In [40]:
# ====================================================
# main
# ====================================================
def get_result(result_df):
    """Score the predictions stored in ``result_df`` and log the result.

    Args:
        result_df: DataFrame with a ``preds`` column and the
            ``CFG['target_col']`` label column.

    Returns:
        The value returned by ``get_score(labels, preds)``.
    """
    predicted = result_df['preds'].values
    score = get_score(result_df[CFG['target_col']].values, predicted)
    LOGGER.info(f'Score: {score:<.5f}')
    return score

    

def main():
    """Run training and/or inference as selected by ``CFG``.

    Relies on notebook-level globals prepared by earlier cells (``folds``
    for training, ``test`` for inference).

    When ``CFG['train']`` is set, every fold listed in ``CFG['trn_fold']``
    is trained, the out-of-fold predictions are scored and written to
    ``oof_df.csv`` in ``OUTPUT_DIR``, and (outside debug mode) the score and
    file are logged to MLflow.

    When ``CFG['inference']`` is set, the best checkpoint of each trained
    fold is loaded, ``test`` is scored, and ``submission.csv`` is written to
    ``OUTPUT_DIR``.
    """
    
    if CFG['train']:
        # train 
        fold_frames = []
        for fold in range(CFG['n_fold']):
            if fold not in CFG['trn_fold']:
                continue
            _oof_df = train_loop(folds, fold)
            fold_frames.append(_oof_df)
            LOGGER.info(f"========== fold: {fold} result ==========")
            _ = get_result(_oof_df)
        # Concatenate once instead of growing a DataFrame inside the loop;
        # fall back to an empty frame when no fold was selected.
        oof_df = pd.concat(fold_frames) if fold_frames else pd.DataFrame()
        # CV result
        LOGGER.info(f"========== CV ==========")
        score = get_result(oof_df)
        # save result
        oof_df.to_csv(OUTPUT_DIR+'oof_df.csv', index=False)
        # log mlflow
        if not CFG['debug']:
            mlflow.log_metric('oof score', score)
            mlflow.delete_tag('running.fold')
            mlflow.log_artifact(OUTPUT_DIR+'oof_df.csv')
    
    if CFG['inference']:
        # inference
        model = CustomModel(TAG['model_name'], pretrained=False)
        states = [torch.load(OUTPUT_DIR+f'{TAG["model_name"]}_fold{fold}_best.pth') for fold in CFG['trn_fold']]
        test_dataset = TestDataset(test, transform=get_transforms(data='valid'))
        test_loader = DataLoader(test_dataset, batch_size=CFG['batch_size'], shuffle=False, 
                                 num_workers=CFG['num_workers'], pin_memory=True)
        predictions = inference(model, states, test_loader, device)
        # submission
        test['label'] = predictions.argmax(1)
        test[['image_id', 'label']].to_csv(OUTPUT_DIR+'submission.csv', index=False)

# rerun

In [41]:
def _load_save_point(run_id):
    """Restore the resume state of an earlier MLflow run.

    Reads the ``running.fold`` tag of ``run_id``, drops every fold below it
    from ``CFG['trn_fold']`` (mutating the global config), and downloads all
    ``.pth`` artifacts of the run into ``OUTPUT_DIR``.

    Args:
        run_id: ID of the MLflow run to resume.
    """
    # Find out at which fold the previous execution stopped
    run_tags = mlflow.get_run(run_id=run_id).to_dictionary()['data']['tags']
    stop_fold = int(run_tags['running.fold'])
    # Only the interrupted fold and the ones after it still need training
    remaining_folds = []
    for fold in CFG['trn_fold']:
        if fold >= stop_fold:
            remaining_folds.append(fold)
    CFG['trn_fold'] = remaining_folds
    # Fetch every .pth artifact of the run (finished and in-progress models)
    client = mlflow.tracking.MlflowClient()
    pth_artifacts = [item for item in client.list_artifacts(run_id) if ".pth" in item.path]
    for item in pth_artifacts:
        client.download_artifacts(run_id, item.path, OUTPUT_DIR)


def check_have_run():
    """Look up an existing MLflow run named ``TITLE`` and prepare to resume it.

    Searches the runs of ``INFO['EXPERIMENT_ID']``. If exactly one run with
    the run name ``TITLE`` exists, its save point is restored through
    ``_load_save_point``.

    Returns:
        The ID of the run to resume, or ``None`` when no run with this
        title exists yet.

    Raises:
        AssertionError: If more than one run carries the name ``TITLE``.
    """
    results = mlflow.search_runs(INFO['EXPERIMENT_ID'])
    run_name_col = 'tags.mlflow.runName'
    # An experiment without any run yields a frame without tag columns, so
    # a missing column is treated as "never executed" rather than an error.
    if run_name_col not in results.columns:
        return None
    run_id_list = results.loc[results[run_name_col] == TITLE, 'run_id'].tolist()
    # First execution of this title
    if len(run_id_list) == 0:
        return None
    # A run with this title already exists: resume it
    assert len(run_id_list) == 1, f'expected a single run named {TITLE!r}, found {len(run_id_list)}'
    run_id = run_id_list[0]
    _load_save_point(run_id)
    return run_id

In [42]:
if __name__ == '__main__':
    if not CFG['debug']:
        # Tracked execution: everything is recorded in MLflow
        mlflow.set_tracking_uri(INFO['TRACKING_URI'])
        mlflow.set_experiment('single model')
        # If this title has already been started, continue from where it stopped
        run_id = check_have_run()
        with mlflow.start_run(run_id=run_id, run_name=TITLE):
            # Config, device, tags and params are logged only for a new run
            if run_id is None:
                mlflow.log_artifact(CONFIG_PATH)
                mlflow.log_param('device', device)
                mlflow.set_tags(TAG)
                mlflow.log_params(CFG)
            mlflow.log_artifact(notebook_path)
            main()
            mlflow.log_artifacts(OUTPUT_DIR)
            # NOTE(review): shutil.copytree raises if the destination
            # directory already exists - confirm this cannot happen on rerun.
            shutil.copytree(OUTPUT_DIR, f'{INFO["SHARE_DRIVE_PATH"]}/{TITLE}')
            shutil.copy2(CONFIG_PATH, f'{INFO["SHARE_DRIVE_PATH"]}/{TITLE}/{CONFIG_NAME}')
    else:
        # Debug execution: run without any MLflow tracking
        main()



Epoch: [1][0/534] Data 3.137 (3.137) Elapsed 0m 4s (remain 39m 36s) Loss: 0.5014(0.5014) Grad: 1.5255  
Epoch: [1][100/534] Data 0.000 (0.031) Elapsed 1m 53s (remain 8m 7s) Loss: 0.1617(0.2486) Grad: 2.7599  
Epoch: [1][200/534] Data 0.000 (0.016) Elapsed 3m 42s (remain 6m 8s) Loss: 0.1508(0.2125) Grad: 2.0002  
Epoch: [1][300/534] Data 0.000 (0.011) Elapsed 5m 31s (remain 4m 16s) Loss: 0.1389(0.1939) Grad: 1.1239  
Epoch: [1][400/534] Data 0.000 (0.008) Elapsed 7m 19s (remain 2m 25s) Loss: 0.1250(0.1858) Grad: 1.3923  
Epoch: [1][500/534] Data 0.000 (0.006) Elapsed 9m 9s (remain 0m 36s) Loss: 0.1180(0.1793) Grad: 0.3481  
Epoch: [1][533/534] Data 0.000 (0.006) Elapsed 9m 45s (remain 0m 0s) Loss: 0.1705(0.1782) Grad: 0.0000  
EVAL: [0/134] Data 2.253 (2.253) Elapsed 0m 2s (remain 5m 39s) Loss: 0.1940(0.1940) 
EVAL: [100/134] Data 0.235 (0.253) Elapsed 0m 56s (remain 0m 18s) Loss: 0.1573(0.1438) 
EVAL: [133/134] Data 0.000 (0.244) Elapsed 1m 13s (remain 0m 0s) Loss: 0.1921(0.1433) 
Vali

Epoch 1 - avg_train_loss: 0.1782  avg_val_loss: 0.1433  time: 660s
Epoch 1 - Accuracy: 0.8462616822429907


Epoch: [2][0/534] Data 2.525 (2.525) Elapsed 0m 3s (remain 33m 54s) Loss: 0.2735(0.2735) Grad: 0.0000  
Epoch: [2][100/534] Data 0.000 (0.025) Elapsed 1m 52s (remain 8m 2s) Loss: 0.1569(0.1516) Grad: 0.0000  
Epoch: [2][200/534] Data 0.000 (0.013) Elapsed 3m 41s (remain 6m 6s) Loss: 0.1104(0.1519) Grad: 0.0000  
Epoch: [2][300/534] Data 0.000 (0.009) Elapsed 5m 29s (remain 4m 15s) Loss: 0.1595(0.1504) Grad: 0.0000  
Epoch: [2][400/534] Data 0.000 (0.006) Elapsed 7m 18s (remain 2m 25s) Loss: 0.2147(0.1485) Grad: 0.0000  
Epoch: [2][500/534] Data 0.000 (0.005) Elapsed 9m 6s (remain 0m 36s) Loss: 0.1096(0.1495) Grad: 0.0000  
Epoch: [2][533/534] Data 0.000 (0.005) Elapsed 9m 42s (remain 0m 0s) Loss: 0.2729(0.1493) Grad: 0.0000  
EVAL: [0/134] Data 2.546 (2.546) Elapsed 0m 2s (remain 6m 20s) Loss: 0.1903(0.1903) 
EVAL: [100/134] Data 0.632 (0.240) Elapsed 0m 55s (remain 0m 17s) Loss: 0.1552(0.1418) 
EVAL: [133/134] Data 0.000 (0.231) Elapsed 1m 11s (remain 0m 0s) Loss: 0.1949(0.1413) 
Vali

Epoch 2 - avg_train_loss: 0.1493  avg_val_loss: 0.1413  time: 655s
Epoch 2 - Accuracy: 0.8432242990654205


Epoch: [3][0/534] Data 2.420 (2.420) Elapsed 0m 3s (remain 32m 25s) Loss: 0.1233(0.1233) Grad: 0.0000  
Epoch: [3][100/534] Data 0.000 (0.024) Elapsed 1m 51s (remain 7m 57s) Loss: nan(nan) Grad: nan  
Epoch: [3][200/534] Data 0.000 (0.012) Elapsed 3m 38s (remain 6m 1s) Loss: nan(nan) Grad: nan  
Epoch: [3][300/534] Data 0.000 (0.008) Elapsed 5m 24s (remain 4m 11s) Loss: nan(nan) Grad: nan  
Epoch: [3][400/534] Data 0.000 (0.006) Elapsed 7m 11s (remain 2m 23s) Loss: nan(nan) Grad: nan  
Epoch: [3][500/534] Data 0.000 (0.005) Elapsed 8m 58s (remain 0m 35s) Loss: nan(nan) Grad: nan  
Epoch: [3][533/534] Data 0.000 (0.005) Elapsed 9m 33s (remain 0m 0s) Loss: nan(nan) Grad: nan  
EVAL: [0/134] Data 2.759 (2.759) Elapsed 0m 3s (remain 6m 50s) Loss: nan(nan) 
EVAL: [100/134] Data 0.677 (0.249) Elapsed 0m 55s (remain 0m 18s) Loss: nan(nan) 
EVAL: [133/134] Data 0.000 (0.235) Elapsed 1m 12s (remain 0m 0s) Loss: nan(nan) 
Epoch 3 - early stopping


Score: 0.84322


Epoch: [1][0/534] Data 2.648 (2.648) Elapsed 0m 3s (remain 35m 10s) Loss: 0.4979(0.4979) Grad: 1.6756  
Epoch: [1][100/534] Data 0.000 (0.026) Elapsed 1m 54s (remain 8m 9s) Loss: 0.1361(0.2372) Grad: 1.8338  
Epoch: [1][200/534] Data 0.000 (0.013) Elapsed 3m 43s (remain 6m 10s) Loss: 0.2044(0.2084) Grad: 1.7975  
Epoch: [1][300/534] Data 0.000 (0.009) Elapsed 5m 32s (remain 4m 17s) Loss: 0.1044(0.1935) Grad: 1.0544  
Epoch: [1][400/534] Data 0.000 (0.007) Elapsed 7m 21s (remain 2m 26s) Loss: 0.1259(0.1841) Grad: 0.4988  
Epoch: [1][500/534] Data 0.000 (0.005) Elapsed 9m 10s (remain 0m 36s) Loss: 0.0893(0.1787) Grad: 0.0000  
Epoch: [1][533/534] Data 0.000 (0.005) Elapsed 9m 46s (remain 0m 0s) Loss: 0.1220(0.1773) Grad: nan  
EVAL: [0/134] Data 2.021 (2.021) Elapsed 0m 2s (remain 5m 8s) Loss: 0.0877(0.0877) 
EVAL: [100/134] Data 0.000 (0.227) Elapsed 0m 53s (remain 0m 17s) Loss: 0.1020(0.1345) 
EVAL: [133/134] Data 0.000 (0.219) Elapsed 1m 10s (remain 0m 0s) Loss: 0.0619(0.1347) 
Valida

Epoch 1 - avg_train_loss: 0.1773  avg_val_loss: 0.1347  time: 657s
Epoch 1 - Accuracy: 0.8518691588785047


Epoch: [2][0/534] Data 2.796 (2.796) Elapsed 0m 4s (remain 36m 24s) Loss: 0.1437(0.1437) Grad: 0.0000  
Epoch: [2][100/534] Data 0.000 (0.028) Elapsed 1m 53s (remain 8m 6s) Loss: 0.0696(0.1530) Grad: 0.0000  
Epoch: [2][200/534] Data 0.000 (0.014) Elapsed 3m 42s (remain 6m 8s) Loss: 0.0877(0.1533) Grad: 0.0000  
Epoch: [2][300/534] Data 0.000 (0.009) Elapsed 5m 30s (remain 4m 16s) Loss: 0.2038(0.1527) Grad: 0.0000  
Epoch: [2][400/534] Data 0.000 (0.007) Elapsed 7m 20s (remain 2m 25s) Loss: 0.0941(0.1535) Grad: 0.0000  
Epoch: [2][500/534] Data 0.000 (0.006) Elapsed 9m 9s (remain 0m 36s) Loss: 0.0799(0.1517) Grad: 0.0000  
Epoch: [2][533/534] Data 0.000 (0.005) Elapsed 9m 45s (remain 0m 0s) Loss: 0.1477(0.1507) Grad: 0.0000  
EVAL: [0/134] Data 2.688 (2.688) Elapsed 0m 3s (remain 6m 41s) Loss: 0.0918(0.0918) 
EVAL: [100/134] Data 0.999 (0.240) Elapsed 0m 54s (remain 0m 17s) Loss: 0.0964(0.1302) 
EVAL: [133/134] Data 0.000 (0.223) Elapsed 1m 10s (remain 0m 0s) Loss: 0.0511(0.1306) 
Vali

Epoch 2 - avg_train_loss: 0.1507  avg_val_loss: 0.1306  time: 656s
Epoch 2 - Accuracy: 0.8567757009345794


Epoch: [3][0/534] Data 2.431 (2.431) Elapsed 0m 3s (remain 34m 25s) Loss: 0.1195(0.1195) Grad: 0.0000  
Epoch: [3][100/534] Data 0.000 (0.024) Elapsed 1m 52s (remain 8m 4s) Loss: 0.1329(0.1538) Grad: 0.0000  
Epoch: [3][200/534] Data 0.000 (0.012) Elapsed 3m 41s (remain 6m 7s) Loss: 0.1962(0.1521) Grad: 0.0000  
Epoch: [3][300/534] Data 0.000 (0.008) Elapsed 5m 30s (remain 4m 15s) Loss: 0.1217(0.1502) Grad: 0.0000  
Epoch: [3][400/534] Data 0.000 (0.006) Elapsed 7m 18s (remain 2m 25s) Loss: 0.0861(0.1505) Grad: 0.0000  
Epoch: [3][500/534] Data 0.000 (0.005) Elapsed 9m 7s (remain 0m 36s) Loss: 0.1469(0.1520) Grad: 0.0000  
Epoch: [3][533/534] Data 0.000 (0.005) Elapsed 9m 43s (remain 0m 0s) Loss: 0.1520(0.1526) Grad: 0.0000  
EVAL: [0/134] Data 1.967 (1.967) Elapsed 0m 2s (remain 5m 2s) Loss: 0.0927(0.0927) 
EVAL: [100/134] Data 0.458 (0.217) Elapsed 0m 52s (remain 0m 17s) Loss: 0.1020(0.1332) 
EVAL: [133/134] Data 0.000 (0.209) Elapsed 1m 8s (remain 0m 0s) Loss: 0.0456(0.1331) 


Epoch 3 - avg_train_loss: 0.1526  avg_val_loss: 0.1331  time: 652s
Epoch 3 - Accuracy: 0.8495327102803738


EarlyStopping counter: 1 out of 25
Epoch: [4][0/534] Data 2.712 (2.712) Elapsed 0m 4s (remain 35m 42s) Loss: 0.2770(0.2770) Grad: 0.0000  
Epoch: [4][100/534] Data 0.000 (0.027) Elapsed 1m 52s (remain 8m 4s) Loss: 0.2566(0.1492) Grad: nan  
Epoch: [4][200/534] Data 0.000 (0.014) Elapsed 3m 41s (remain 6m 7s) Loss: 0.2021(0.1517) Grad: 0.0000  
Epoch: [4][300/534] Data 0.000 (0.009) Elapsed 5m 30s (remain 4m 15s) Loss: 0.1704(0.1530) Grad: 0.0000  
Epoch: [4][400/534] Data 0.000 (0.007) Elapsed 7m 18s (remain 2m 25s) Loss: 0.1698(0.1524) Grad: 0.0000  
Epoch: [4][500/534] Data 0.000 (0.006) Elapsed 9m 7s (remain 0m 36s) Loss: 0.1900(0.1508) Grad: 0.0000  
Epoch: [4][533/534] Data 0.000 (0.005) Elapsed 9m 43s (remain 0m 0s) Loss: 0.1504(0.1509) Grad: 0.0000  
EVAL: [0/134] Data 2.512 (2.512) Elapsed 0m 2s (remain 6m 17s) Loss: 0.0887(0.0887) 
EVAL: [100/134] Data 0.894 (0.230) Elapsed 0m 54s (remain 0m 17s) Loss: 0.0965(0.1311) 
EVAL: [133/134] Data 0.000 (0.214) Elapsed 1m 9s (remain 0m

Epoch 4 - avg_train_loss: 0.1509  avg_val_loss: 0.1314  time: 653s
Epoch 4 - Accuracy: 0.8551401869158879


EarlyStopping counter: 2 out of 25
Epoch: [5][0/534] Data 2.548 (2.548) Elapsed 0m 3s (remain 34m 17s) Loss: 0.2118(0.2118) Grad: 0.0000  
Epoch: [5][100/534] Data 0.000 (0.025) Elapsed 1m 53s (remain 8m 5s) Loss: 0.2332(0.1526) Grad: nan  
Epoch: [5][200/534] Data 0.000 (0.013) Elapsed 3m 42s (remain 6m 7s) Loss: 0.0621(0.1504) Grad: 0.0000  
Epoch: [5][300/534] Data 0.000 (0.009) Elapsed 5m 30s (remain 4m 16s) Loss: 0.2436(0.1507) Grad: 0.0000  
Epoch: [5][400/534] Data 0.000 (0.007) Elapsed 7m 19s (remain 2m 25s) Loss: nan(nan) Grad: nan  
Epoch: [5][500/534] Data 0.000 (0.005) Elapsed 9m 5s (remain 0m 35s) Loss: nan(nan) Grad: nan  
Epoch: [5][533/534] Data 0.000 (0.005) Elapsed 9m 40s (remain 0m 0s) Loss: nan(nan) Grad: nan  
EVAL: [0/134] Data 2.425 (2.425) Elapsed 0m 2s (remain 6m 4s) Loss: nan(nan) 
EVAL: [100/134] Data 1.009 (0.215) Elapsed 0m 52s (remain 0m 17s) Loss: nan(nan) 
EVAL: [133/134] Data 0.000 (0.201) Elapsed 1m 8s (remain 0m 0s) Loss: nan(nan) 
Epoch 5 - early sto

Score: 0.85678


Epoch: [1][0/534] Data 2.899 (2.899) Elapsed 0m 4s (remain 37m 32s) Loss: 0.5096(0.5096) Grad: 1.7074  
Epoch: [1][100/534] Data 0.000 (0.029) Elapsed 1m 53s (remain 8m 6s) Loss: 0.1953(0.2280) Grad: 2.4237  
Epoch: [1][200/534] Data 0.000 (0.015) Elapsed 3m 42s (remain 6m 8s) Loss: 0.1593(0.2048) Grad: 2.1657  
Epoch: [1][300/534] Data 0.000 (0.010) Elapsed 5m 30s (remain 4m 16s) Loss: 0.0599(0.1913) Grad: 1.3054  
Epoch: [1][400/534] Data 0.000 (0.007) Elapsed 7m 19s (remain 2m 25s) Loss: 0.0856(0.1832) Grad: 2.3302  
Epoch: [1][500/534] Data 0.000 (0.006) Elapsed 9m 7s (remain 0m 36s) Loss: 0.1434(0.1779) Grad: nan  
Epoch: [1][533/534] Data 0.000 (0.006) Elapsed 9m 43s (remain 0m 0s) Loss: 0.0699(0.1766) Grad: 0.0000  
EVAL: [0/134] Data 2.374 (2.374) Elapsed 0m 2s (remain 5m 55s) Loss: 0.1341(0.1341) 
EVAL: [100/134] Data 0.920 (0.222) Elapsed 0m 53s (remain 0m 17s) Loss: 0.1015(0.1535) 
EVAL: [133/134] Data 0.000 (0.204) Elapsed 1m 8s (remain 0m 0s) Loss: 0.1684(0.1541) 
Validati

Epoch 1 - avg_train_loss: 0.1766  avg_val_loss: 0.1541  time: 652s
Epoch 1 - Accuracy: 0.8317363870063099


Epoch: [2][0/534] Data 2.838 (2.838) Elapsed 0m 4s (remain 37m 6s) Loss: 0.1440(0.1440) Grad: 0.0000  
Epoch: [2][100/534] Data 0.000 (0.028) Elapsed 1m 52s (remain 8m 3s) Loss: 0.1434(0.1608) Grad: 0.0000  
Epoch: [2][200/534] Data 0.000 (0.014) Elapsed 3m 41s (remain 6m 6s) Loss: 0.1453(0.1597) Grad: 0.0000  
Epoch: [2][300/534] Data 0.000 (0.010) Elapsed 5m 29s (remain 4m 15s) Loss: 0.1486(0.1564) Grad: 0.0000  
Epoch: [2][400/534] Data 0.000 (0.007) Elapsed 7m 17s (remain 2m 25s) Loss: nan(nan) Grad: nan  
Epoch: [2][500/534] Data 0.000 (0.006) Elapsed 9m 4s (remain 0m 35s) Loss: nan(nan) Grad: nan  
Epoch: [2][533/534] Data 0.000 (0.005) Elapsed 9m 39s (remain 0m 0s) Loss: nan(nan) Grad: nan  
EVAL: [0/134] Data 2.062 (2.062) Elapsed 0m 2s (remain 5m 15s) Loss: nan(nan) 
EVAL: [100/134] Data 0.053 (0.211) Elapsed 0m 52s (remain 0m 17s) Loss: nan(nan) 
EVAL: [133/134] Data 0.000 (0.205) Elapsed 1m 8s (remain 0m 0s) Loss: nan(nan) 
Epoch 2 - early stopping


Score: 0.83174


Epoch: [1][0/534] Data 2.743 (2.743) Elapsed 0m 4s (remain 36m 33s) Loss: 0.5140(0.5140) Grad: 1.6405  
Epoch: [1][100/534] Data 0.000 (0.027) Elapsed 1m 53s (remain 8m 7s) Loss: 0.1560(0.2377) Grad: 1.9371  
Epoch: [1][200/534] Data 0.000 (0.014) Elapsed 3m 42s (remain 6m 9s) Loss: 0.0899(0.2048) Grad: 1.6228  
Epoch: [1][300/534] Data 0.000 (0.009) Elapsed 5m 32s (remain 4m 17s) Loss: 0.2076(0.1911) Grad: 1.3885  
Epoch: [1][400/534] Data 0.000 (0.007) Elapsed 7m 20s (remain 2m 26s) Loss: 0.1416(0.1836) Grad: 1.5408  
Epoch: [1][500/534] Data 0.000 (0.006) Elapsed 9m 9s (remain 0m 36s) Loss: 0.1370(0.1763) Grad: nan  
Epoch: [1][533/534] Data 0.000 (0.005) Elapsed 9m 45s (remain 0m 0s) Loss: 0.0906(0.1755) Grad: 0.0000  
EVAL: [0/134] Data 2.141 (2.141) Elapsed 0m 2s (remain 5m 25s) Loss: 0.2209(0.2209) 
EVAL: [100/134] Data 0.882 (0.225) Elapsed 0m 53s (remain 0m 17s) Loss: 0.1184(0.1518) 
EVAL: [133/134] Data 0.000 (0.213) Elapsed 1m 9s (remain 0m 0s) Loss: 0.1718(0.1541) 
Validati

Epoch 1 - avg_train_loss: 0.1755  avg_val_loss: 0.1541  time: 655s
Epoch 1 - Accuracy: 0.825893900444029


Epoch: [2][0/534] Data 2.905 (2.905) Elapsed 0m 4s (remain 37m 34s) Loss: 0.1414(0.1414) Grad: 0.0000  
Epoch: [2][100/534] Data 0.000 (0.029) Elapsed 1m 52s (remain 8m 2s) Loss: 0.2097(0.1471) Grad: nan  
Epoch: [2][200/534] Data 0.000 (0.015) Elapsed 3m 40s (remain 6m 5s) Loss: 0.0556(0.1483) Grad: 0.0000  
Epoch: [2][300/534] Data 0.000 (0.010) Elapsed 5m 27s (remain 4m 13s) Loss: nan(nan) Grad: nan  
Epoch: [2][400/534] Data 0.000 (0.007) Elapsed 7m 14s (remain 2m 24s) Loss: nan(nan) Grad: nan  
Epoch: [2][500/534] Data 0.000 (0.006) Elapsed 9m 0s (remain 0m 35s) Loss: nan(nan) Grad: nan  
Epoch: [2][533/534] Data 0.000 (0.006) Elapsed 9m 36s (remain 0m 0s) Loss: nan(nan) Grad: nan  
EVAL: [0/134] Data 1.991 (1.991) Elapsed 0m 2s (remain 5m 4s) Loss: nan(nan) 
EVAL: [100/134] Data 0.804 (0.210) Elapsed 0m 52s (remain 0m 17s) Loss: nan(nan) 
EVAL: [133/134] Data 0.000 (0.199) Elapsed 1m 7s (remain 0m 0s) Loss: nan(nan) 
Epoch 2 - early stopping


Score: 0.82589


Epoch: [1][0/534] Data 2.726 (2.726) Elapsed 0m 4s (remain 36m 30s) Loss: 0.5263(0.5263) Grad: 1.6825  
Epoch: [1][100/534] Data 0.000 (0.027) Elapsed 1m 53s (remain 8m 7s) Loss: 0.1665(0.2366) Grad: 1.6544  
Epoch: [1][200/534] Data 0.000 (0.014) Elapsed 3m 42s (remain 6m 9s) Loss: 0.2210(0.2081) Grad: 3.0252  
Epoch: [1][300/534] Data 0.000 (0.009) Elapsed 5m 32s (remain 4m 17s) Loss: 0.2537(0.1971) Grad: 2.9451  
Epoch: [1][400/534] Data 0.000 (0.007) Elapsed 7m 21s (remain 2m 26s) Loss: 0.1472(0.1872) Grad: 1.6982  
Epoch: [1][500/534] Data 0.000 (0.006) Elapsed 9m 10s (remain 0m 36s) Loss: 0.1301(0.1794) Grad: 0.2766  
Epoch: [1][533/534] Data 0.000 (0.005) Elapsed 9m 46s (remain 0m 0s) Loss: 0.1118(0.1764) Grad: 0.5947  
EVAL: [0/134] Data 1.857 (1.857) Elapsed 0m 2s (remain 4m 47s) Loss: 0.1742(0.1742) 
EVAL: [100/134] Data 0.190 (0.205) Elapsed 0m 51s (remain 0m 16s) Loss: 0.1791(0.1490) 
EVAL: [133/134] Data 0.000 (0.202) Elapsed 1m 7s (remain 0m 0s) Loss: 0.1084(0.1482) 
Vali

Epoch 1 - avg_train_loss: 0.1764  avg_val_loss: 0.1482  time: 655s
Epoch 1 - Accuracy: 0.8392147698060295


Epoch: [2][0/534] Data 2.571 (2.571) Elapsed 0m 3s (remain 34m 20s) Loss: 0.0733(0.0733) Grad: 0.4804  
Epoch: [2][100/534] Data 0.000 (0.026) Elapsed 1m 52s (remain 8m 3s) Loss: 0.1626(0.1497) Grad: 0.0000  
Epoch: [2][200/534] Data 0.000 (0.013) Elapsed 3m 41s (remain 6m 6s) Loss: 0.1031(0.1495) Grad: nan  
Epoch: [2][300/534] Data 0.000 (0.009) Elapsed 5m 30s (remain 4m 15s) Loss: 0.1216(0.1494) Grad: 0.0000  
Epoch: [2][400/534] Data 0.000 (0.007) Elapsed 7m 18s (remain 2m 25s) Loss: 0.1378(0.1461) Grad: 0.0000  
Epoch: [2][500/534] Data 0.000 (0.005) Elapsed 9m 6s (remain 0m 35s) Loss: nan(nan) Grad: nan  
Epoch: [2][533/534] Data 0.000 (0.005) Elapsed 9m 41s (remain 0m 0s) Loss: nan(nan) Grad: nan  
EVAL: [0/134] Data 1.930 (1.930) Elapsed 0m 2s (remain 5m 0s) Loss: nan(nan) 
EVAL: [100/134] Data 0.811 (0.215) Elapsed 0m 52s (remain 0m 17s) Loss: nan(nan) 
EVAL: [133/134] Data 0.000 (0.202) Elapsed 1m 7s (remain 0m 0s) Loss: nan(nan) 
Epoch 2 - early stopping


Score: 0.83921
Score: 0.83937
