In [1]:
!nvidia-smi

Tue Jun 29 15:50:01 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   47C    P0    30W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# ドライブをマウント
import sys
if 'google.colab' in sys.modules:
    from google.colab import drive
    drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
NAME = "ex11"

In [4]:
import os, sys

if "google.colab" in sys.modules:
    CP_DIR = f"/content/drive/MyDrive/Work/probspace_religious_art/notebook/{NAME}_colab/output"
    INPUT_DIR = "./eda_output/output"
    sys.path.append("/content/drive/MyDrive/Work/probspace_religious_art/code")
elif "kaggle_web_client" in sys.modules:
    pass
elif "/kqi/output" in os.getcwd():
    pass
else:
    # local
    CP_DIR = "output"
    INPUT_DIR = "../../eda/output"
    sys.path.append("../../../code")
    sys.path.append('../../../Git/Ranger-Deep-Learning-Optimizer')
    sys.path.append('../../../Git/pytorch-optimizer')

from mix_aug import cutmix, fmix, snapmix, SnapMixLoss, resizemix

In [5]:
# driveからzipコピーしてくる
if os.getcwd() == "/content" and os.path.exists(INPUT_DIR) == False:
    !mkdir -p "./eda_output"
    !cp -r "/content/drive/MyDrive/Work/probspace_religious_art/notebook/eda/output.zip" "./eda_output"
    !unzip -qq "./eda_output/output.zip" -d "./eda_output"
    pass

In [6]:
# colabで足りないライブラリinstall
import os, sys
if ("google.colab" in sys.modules) or ("kaggle_web_client" in sys.modules) or ("/kqi/output" in os.getcwd()):
    !pip install --upgrade albumentations
    !pip install --upgrade timm
    !pip install torch-optimizer
    pass

Collecting albumentations
[?25l  Downloading https://files.pythonhosted.org/packages/b0/be/3db3cd8af771988748f69eace42047d5edebf01eaa7e1293f3b3f75f989e/albumentations-1.0.0-py3-none-any.whl (98kB)
[K     |████████████████████████████████| 102kB 4.1MB/s ta 0:00:011
Collecting opencv-python-headless>=4.1.1
[?25l  Downloading https://files.pythonhosted.org/packages/c3/35/bfc76533f2274cd3da4e2cf255cd13ab9d7f6fc8990c06911e7f8fcc2130/opencv_python_headless-4.5.2.54-cp37-cp37m-manylinux2014_x86_64.whl (38.2MB)
[K     |████████████████████████████████| 38.2MB 1.3MB/s 
Installing collected packages: opencv-python-headless, albumentations
  Found existing installation: albumentations 0.1.12
    Uninstalling albumentations-0.1.12:
      Successfully uninstalled albumentations-0.1.12
Successfully installed albumentations-1.0.0 opencv-python-headless-4.5.2.54
Collecting timm
[?25l  Downloading https://files.pythonhosted.org/packages/ee/08/1ccaf8d516935666b7fa5f6aaddf157c66208ea0c93bb847ae09f16

In [7]:
import pandas as pd

# ====================================================
# Data Load
# ====================================================
def get_train_file_path(image_id):
    return f"{INPUT_DIR}/train/{str(image_id)}.jpg"

train = pd.read_csv(INPUT_DIR + "/train.csv")
train["file_path"] = train["image_id"].apply(get_train_file_path)
n_classes = 13

## train

In [8]:
import os, yaml, shutil

# ====================================================
# Param
# ====================================================

OUTPUT_DIR = "output"
os.makedirs(CP_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

epochs = 50
class Config:
    def __init__(self):
        self.name = NAME
        self.debug = False
        self.size = 384
        self.batch_size = 16
        self.num_workers = 0
        self.seeds = [0,1,2]
        self.n_fold = 5
        self.trn_fold = [0,1,2,3,4]
        self.n_classes = n_classes
        self.lr = 1e-3
        self.min_lr = 1e-6
        self.weight_decay = 0  # 1e-6
        self.optimizer = "radam"
        self.scheduler = "CosineAnnealingLR"
        self.T_max = epochs
        self.gradient_accumulation_steps = 1
        self.max_grad_norm = 5
        self.model_name = "swin_base_patch4_window12_384_in22k"
        self.load_model_path = "none"
        self.is_load_opt = True
        self.epochs = epochs
        self.print_freq = 10000  # 学習結果をprintするstep数
        self.label_smoothing = 0.0
        self.mix_decision_th = 0.5  # cutmixなどの発生確率
        self.mixmethod = "cutmix"
        self.mix_alpha = 1.0
CFG = Config()

with open(OUTPUT_DIR + "/cfg.yaml", "w") as wf:
    yaml.dump(CFG.__dict__, wf)

In [9]:
import os
import sys
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
from torch.utils.data import DataLoader, Dataset

import albumentations as A
from albumentations import Compose
from albumentations.pytorch import ToTensorV2

# ====================================================
# Dataset
# ====================================================
class TrainDataset(Dataset):
    def __init__(self, df, transform=None):
        super().__init__()
        self.df = df
        self.file_paths = df["file_path"].values
        self.labels = df["label"].values
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        file_path = self.file_paths[idx]
        image = cv2.imread(file_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32)
        if self.transform:
            augmented = self.transform(image=image)
            image = augmented["image"]
        label = self.labels[idx]
        return image, torch.from_numpy(np.array(label)).long()


class TestDataset(Dataset):
    def __init__(self, df, transform=None):
        super().__init__()
        self.df = df
        self.file_paths = df["file_path"].values
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        file_path = self.file_paths[idx]
        image = cv2.imread(file_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32)
        if self.transform:
            augmented = self.transform(image=image)
            image = augmented["image"]
        return image


def get_transforms(*, data):

    if data == "train":
        return A.Compose(
            [
                A.Resize(CFG.size, CFG.size),
                A.HorizontalFlip(p=0.5),
                A.ShiftScaleRotate(p=0.5),
                A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], max_pixel_value=255.0, p=1.0,),
                A.OneOf([
                    A.ToSepia(p=0.5),
                    A.ToGray(p=0.5),
                ], p=0.5),
                A.CoarseDropout(p=0.5),
                A.Cutout(p=0.5),
                ToTensorV2(),
            ]
        )

    elif data == "valid":
        return Compose(
            [
                A.Resize(CFG.size, CFG.size),
                A.Normalize(
                    mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], max_pixel_value=255.0,
                    p=1.0,
                ),
                ToTensorV2(),
            ]
        )


def collate(batch):
    """DataLoaderに追加可能なbatchを加工する関数"""
    images, labels = list(zip(*batch))
    images = torch.stack(images)
    labels = torch.stack(labels)
    return images, labels.long()


# ====================================================
# Library
# ====================================================
import sys
import os
import gc
import re
import math
import time
import random
import yaml
import shutil
import glob
import pickle
import pathlib
from pathlib import Path
from contextlib import contextmanager
from collections import defaultdict, Counter
from distutils.dir_util import copy_tree

import scipy as sp
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from sklearn.metrics import accuracy_score, log_loss
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

from functools import partial

import cv2
from PIL import Image

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam, SGD
import torchvision.models as models
from torch.nn.parameter import Parameter
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence  # 文字列の長さを揃えてくれる関数
from torch.optim.lr_scheduler import (
    CosineAnnealingWarmRestarts,
    CosineAnnealingLR,
    ReduceLROnPlateau,
)
from torch.cuda.amp import autocast, GradScaler
from torch_optimizer import RAdam, Lookahead

import timm

import warnings
warnings.filterwarnings("ignore")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return "%dm %ds" % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return "%s (remain %s)" % (asMinutes(s), asMinutes(rs))


class LabelSmoothingCrossEntropy(nn.Module):
    # https://build-medical-ai.com/2021/02/21/label-smoothing%EF%BC%88%E3%83%A9%E3%83%99%E3%83%AB%E3%82%B9%E3%83%A0%E3%83%BC%E3%82%B8%E3%83%B3%E3%82%B0%EF%BC%89%E3%82%92pytorch%E3%81%A7%E5%AE%9F%E8%A3%85%E3%81%99%E3%82%8B/
    def __init__(self, epsilon=0.1, reduction='mean'):
        super().__init__()
        self.epsilon = epsilon
        self.reduction = reduction

    def forward(self, preds, target):
        n = preds.size()[-1]
        log_preds = F.log_softmax(preds, dim=-1)
        loss = LabelSmoothingCrossEntropy.reduce_loss(-log_preds.sum(dim=-1), self.reduction)
        nll = F.nll_loss(log_preds, target, reduction=self.reduction)
        return LabelSmoothingCrossEntropy.linear_combination(nll, loss/n, self.epsilon)
    
    @staticmethod
    def linear_combination(x, y, epsilon):
        return (1 - epsilon) * x + epsilon * y

    @staticmethod
    def reduce_loss(loss, reduction='mean'):
        return loss.mean() if reduction == 'mean' else loss.sum() if reduction == 'sum' else loss
    

def train_fn(
    train_loader, model, criterion, optimizer, epoch, scheduler, device, scaler
):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    # switch to train mode
    model.train()
    start = end = time.time()
    global_step = 0
    for step, (images, labels) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        images = images.to(device)
        labels = labels.to(device)
        batch_size = images.size(0)
        with autocast():
            # ====================================================
            # cutmix/fmix/snapmix
            # ====================================================
            mix_decision = np.random.rand() if epoch < CFG.epochs - 5 else 1.0  # 最後の5epochはmix系なしにする
            if mix_decision < CFG.mix_decision_th:
                if CFG.mixmethod == "cutmix":
                    x, y_mixs = cutmix(images, labels.long(), CFG.mix_alpha)
                    y_hat = model(x.float())
                    loss = criterion(y_hat, y_mixs[0]) * y_mixs[2] + criterion(y_hat, y_mixs[1]) * (1.0 - y_mixs[2])

                elif CFG.mixmethod == "fmix":
                    x, y_mixs = fmix(images, labels.long(), alpha=CFG.mix_alpha, decay_power=5.0, shape=(CFG.size, CFG.size))
                    y_hat = model(images.float())
                    loss = criterion(y_hat, y_mixs[0]) * y_mixs[2] + criterion(y_hat, y_mixs[1]) * (1.0 - y_mixs[2])

                elif CFG.mixmethod == "resizemix":
                    x, y_mixs = resizemix(images, labels.long(), alpha=CFG.mix_alpha)
                    y_hat = model(images.float())
                    loss = criterion(y_hat, y_mixs[0]) * y_mixs[2] + criterion(y_hat, y_mixs[1]) * (1.0 - y_mixs[2])
                    
                else:
                    x = images
                    y_hat = model(images)

                # --- 画像表示（mix画像確認用）---
                if CFG.debug:
                    try:
                        print("mix_decision:", mix_decision)
                        fig = plt.figure(figsize=(16, 16))
                        for i in range(5):
                            print("y_hat:", y_hat[i])
                            ax = fig.add_subplot(1, 5, i + 1, xticks=[], yticks=[])
                            im = x[i].to("cpu").numpy().transpose(1, 2, 0)
                            plt.imshow(im)
                        plt.show(); plt.clf(); plt.close()
                    except:
                        pass
                # -----------------------------------------
            else:
                logits = model(images)
                loss = criterion(logits, labels)
            
            # record loss
            losses.update(loss.item(), batch_size)
            if CFG.gradient_accumulation_steps > 1:
                loss = loss / CFG.gradient_accumulation_steps
            scaler.scale(loss).backward()
            if (step + 1) % CFG.gradient_accumulation_steps == 0:
                scaler.unscale_(optimizer)
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), CFG.max_grad_norm, norm_type=2.0
                )
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                global_step += 1

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader) - 1):
            print(
                "Epoch: [{0}][{1}/{2}] "
                "Data {data_time.val:.3f} ({data_time.avg:.3f}) "
                "Elapsed {remain:s} "
                "Loss: {loss.val:.4f}({loss.avg:.4f}) "
                "Grad Norm: {grad_norm:.4f}  "
                "LR: {lr:.4e}  ".format(
                    epoch + 1,
                    step,
                    len(train_loader),
                    batch_time=batch_time,
                    data_time=data_time,
                    loss=losses,
                    remain=timeSince(start, float(step + 1) / len(train_loader)),
                    grad_norm=grad_norm,
                    lr=scheduler.get_lr()[0],
                )
            )

    return losses.avg


def valid_fn(valid_loader, model, device):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    # switch to evaluation mode
    model.eval()
    preds = []
    start = end = time.time()
    for step, (images) in enumerate(valid_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        images = images.to(device)
        batch_size = images.size(0)
        with torch.no_grad():
            with autocast():
                predictions = model.forward_argmax(images)
        pred = predictions.detach().cpu().numpy()
        preds.append(pred)
        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader) - 1):
            print(
                "EVAL: [{0}/{1}] "
                "Data {data_time.val:.3f} ({data_time.avg:.3f}) "
                "Elapsed {remain:s} ".format(
                    step,
                    len(valid_loader),
                    batch_time=batch_time,
                    data_time=data_time,
                    remain=timeSince(start, float(step + 1) / len(valid_loader)),
                )
            )
    preds = np.concatenate(preds)
    return preds


# ====================================================
# Train loop
# ====================================================
def train_loop(folds, fold, seed):

    LOGGER.info(f"========== fold: {fold}, seed: {seed} training ==========")

    # ====================================================
    # loader
    # ====================================================
    trn_idx = folds[folds["fold"] != fold].index
    val_idx = folds[folds["fold"] == fold].index

    train_folds = folds.loc[trn_idx].reset_index(drop=True)
    valid_folds = folds.loc[val_idx].reset_index(drop=True)
    valid_labels = valid_folds["label"].values

    train_dataset = TrainDataset(train_folds, transform=get_transforms(data="train"))
    valid_dataset = TestDataset(valid_folds, transform=get_transforms(data="valid"))

    train_loader = DataLoader(
        train_dataset,
        batch_size=CFG.batch_size,
        shuffle=True,
        num_workers=CFG.num_workers,
        pin_memory=True,
        drop_last=True,
        collate_fn=collate,
    )
    valid_loader = DataLoader(
        valid_dataset,
        batch_size=CFG.batch_size,
        shuffle=False,
        num_workers=CFG.num_workers,
        pin_memory=True,
        drop_last=False,
    )

    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(optimizer):
        if CFG.scheduler == "ReduceLROnPlateau":
            scheduler = ReduceLROnPlateau(
                optimizer,
                mode="min",
                factor=CFG.factor,
                patience=CFG.patience,
                verbose=True,
                eps=CFG.eps,
            )
        elif CFG.scheduler == "CosineAnnealingLR":
            scheduler = CosineAnnealingLR(
                optimizer, T_max=CFG.T_max, eta_min=CFG.min_lr, last_epoch=-1
            )
        elif CFG.scheduler == "CosineAnnealingWarmRestarts":
            scheduler = CosineAnnealingWarmRestarts(
                optimizer, T_0=CFG.T_0, T_mult=1, eta_min=CFG.min_lr, last_epoch=-1
            )
        return scheduler

    # ====================================================
    # model & optimizer
    # ====================================================
    model = TimmModel(CFG.n_classes, model_name=CFG.model_name, pretrained=True)
    model.to(device)

    if CFG.optimizer == "adam":
        optimizer = Adam(
            model.parameters(), lr=CFG.lr, amsgrad=False, weight_decay=CFG.weight_decay
        )
    elif CFG.optimizer == "radam":
        optimizer = RAdam(model.parameters(), lr=CFG.lr, weight_decay=CFG.weight_decay)
        optimizer = Lookahead(optimizer, alpha=0.5, k=5)

    scheduler = get_scheduler(optimizer)
    scaler = GradScaler()

    if os.path.exists(CFG.load_model_path):
        # モデルロード
        LOGGER.info("=> loading checkpoint '{}'".format(CFG.load_model_path))
        states = torch.load(CFG.load_model_path, map_location=torch.device("cpu"))
        model.load_state_dict(states["model"])
        model.to(device)
        if CFG.is_load_opt:
            LOGGER.info("=> loading optimizer and scheduler")
            optimizer.load_state_dict(states["optimizer"])
            scheduler.load_state_dict(states["scheduler"])

    # ====================================================
    # loop
    # ====================================================
    if CFG.label_smoothing > 0.0:
        criterion = LabelSmoothingCrossEntropy(epsilon=CFG.label_smoothing)
    else:
        criterion = nn.CrossEntropyLoss()  # loss計算したくないクラスは, ignore_index=1 で指定できる

    best_score = -1  # np.inf

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(
            train_loader, model, criterion, optimizer, epoch, scheduler, device, scaler
        )

        # eval
        preds = valid_fn(valid_loader, model, device)
        LOGGER.info(f"labels: {valid_labels[:5]}")
        LOGGER.info(f"preds: {preds[:5]}")

        # scoring
        score = get_score(valid_labels, preds)

        elapsed = time.time() - start_time

        LOGGER.info(
            f"Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  lr: {scheduler.get_lr()[0]:.4e}  time: {elapsed:.0f}s"
        )
        LOGGER.info(f"Epoch {epoch+1} - Score: {score:.4f}")

        if isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(score)
        elif isinstance(scheduler, CosineAnnealingLR):
            scheduler.step()
        elif isinstance(scheduler, CosineAnnealingWarmRestarts):
            scheduler.step()

        if score > best_score:
            best_score = score
            LOGGER.info(f"Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model")
            best_pth = OUTPUT_DIR + f"/fold{fold}_seed{seed}_best.pth"
            torch.save(
                {
                    "model": model.state_dict(),
                    "optimizer": optimizer.state_dict(),
                    "scheduler": scheduler.state_dict(),
                    "preds": preds,
                },
                best_pth,
            )
            val_pred_df = pd.DataFrame(
                {"id": val_idx, "label": valid_labels, "pred": preds}
            )
            
    return val_pred_df


# ====================================================
# Utils
# ====================================================
def get_score(y_true, y_pred):
    return accuracy_score(y_true, y_pred)


def init_logger(log_file='train.log'):
    """学習ログファイル出す"""
    from logging import getLogger, INFO, FileHandler,  Formatter,  StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=log_file)
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger


def seed_torch(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True


# ====================================================
# CV split
# ====================================================
def cv_split(df, seed):
    folds = df.copy()
    cv = StratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=seed)
    for j, (train_idx, valid_idx) in enumerate(cv.split(df, df["label"])):
        folds.loc[valid_idx, "fold"] = int(j)
    folds["fold"] = folds["fold"].astype(int)
    print(folds.groupby(["fold"]).size())
    return folds


# ====================================================
# Model
# ====================================================
class TimmModel(nn.Module):
    def __init__(self, n_classes, model_name="resnet18", pretrained=True):
        super().__init__()
        self.cnn = timm.create_model(model_name, pretrained=pretrained)
        if "efficient" in model_name:
            self.cnn.classifier = nn.Linear(self.cnn.classifier.in_features, n_classes)
        elif "vit" in model_name:
            self.cnn.head = nn.Linear(self.cnn.head.in_features, n_classes)
        elif "nfnet" in model_name:
            self.cnn.head.fc = nn.Linear(self.cnn.head.fc.in_features, n_classes)
        elif "tnt" in model_name:
            self.cnn.head = nn.Linear(self.cnn.head.in_features, n_classes)
        elif "swin" in model_name:
            self.cnn.head = nn.Linear(self.cnn.head.in_features, n_classes)
        elif "cait" in model_name:
            self.cnn.head = nn.Linear(self.cnn.head.in_features, n_classes)
        elif "mixer" in model_name:
            self.cnn.head = nn.Linear(self.cnn.head.in_features, n_classes)
        else:
            self.cnn.head = nn.Linear(self.cnn.fc.in_features, n_classes)

    def forward(self, x):
        return self.cnn(x)
    
    def forward_argmax(self, x):
        return self.cnn(x).argmax(1)


# ====================================================
# LOGGER
# ====================================================
LOGGER = init_logger(OUTPUT_DIR + "/train.log")


# ====================================================
# main
# ====================================================
def main(train):
    for seed in CFG.seeds:
        seed_torch(seed=seed)
        
        if CFG.debug:
            CFG.epochs = 2
            train = train.sample(n=300, random_state=seed).reset_index(drop=True)
        
        folds = cv_split(train, seed)
        oof_df = None
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                val_pred_df = train_loop(folds, fold, seed)
                val_pred_df["fold"] = fold

                if oof_df is None:
                    oof_df = val_pred_df
                else:
                    oof_df = oof_df.append(val_pred_df)

        oof_df.to_csv(OUTPUT_DIR + f"/oof_seed{seed}.csv", index=False)
        #display(oof_df)
        
        LOGGER.info(f"\noof score: {get_score(oof_df['label'].values, oof_df['pred'].values)}\n")
    
    # colabは短時間でフdriveにファイル出力多いとエラーになるので最後に保存
    # 出力ディレクトリをdriveに保存する
    if "google.colab" in sys.modules:
        copy_tree(OUTPUT_DIR, CP_DIR)

if __name__ == '__main__':
    print("timm version:", timm.__version__)
    print(device)
    
    main(train)
    
    LOGGER.info("\ntrain finish!!!")



timm version: 0.4.9
cuda
fold
0    131
1    131
2    131
3    131
4    130
dtype: int64


Downloading: "https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384_22k.pth" to /root/.cache/torch/hub/checkpoints/swin_base_patch4_window12_384_22k.pth


Epoch: [1][0/32] Data 0.143 (0.143) Elapsed 0m 1s (remain 0m 55s) Loss: 2.6802(2.6802) Grad Norm: inf  LR: 1.0000e-03  
Epoch: [1][31/32] Data 0.114 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 2.7077(2.5376) Grad Norm: 41.5340  LR: 1.0000e-03  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  6  2  2  4]
preds: [0 2 2 2 2]
Epoch 1 - avg_train_loss: 2.5376  lr: 1.0000e-03  time: 50s
Epoch 1 - Score: 0.1832
Epoch 1 - Save Best Score: 0.1832 Model


EVAL: [8/9] Data 0.014 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [2][0/32] Data 0.141 (0.141) Elapsed 0m 1s (remain 0m 44s) Loss: 2.3136(2.3136) Grad Norm: 11.7700  LR: 9.9803e-04  
Epoch: [2][31/32] Data 0.110 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 2.2911(2.2851) Grad Norm: 14.7757  LR: 9.9803e-04  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  6  2  2  4]
preds: [2 2 2 2 2]
Epoch 2 - avg_train_loss: 2.2851  lr: 9.9803e-04  time: 49s
Epoch 2 - Score: 0.3282
Epoch 2 - Save Best Score: 0.3282 Model


EVAL: [8/9] Data 0.013 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [3][0/32] Data 0.155 (0.155) Elapsed 0m 1s (remain 0m 45s) Loss: 1.8878(1.8878) Grad Norm: 14.7811  LR: 9.9312e-04  
Epoch: [3][31/32] Data 0.120 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 1.8133(2.0261) Grad Norm: 15.4150  LR: 9.9312e-04  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  6  2  2  4]
preds: [11  2  5 11  2]
Epoch 3 - avg_train_loss: 2.0261  lr: 9.9312e-04  time: 49s
Epoch 3 - Score: 0.4962
Epoch 3 - Save Best Score: 0.4962 Model


EVAL: [8/9] Data 0.013 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [4][0/32] Data 0.130 (0.130) Elapsed 0m 1s (remain 0m 44s) Loss: 2.0700(2.0700) Grad Norm: 8.9163  LR: 9.8627e-04  
Epoch: [4][31/32] Data 0.123 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 2.3015(1.7794) Grad Norm: 10.5339  LR: 9.8627e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  6  2  2  4]
preds: [0 2 5 0 2]
Epoch 4 - avg_train_loss: 1.7794  lr: 9.8627e-04  time: 49s
Epoch 4 - Score: 0.4427


EVAL: [8/9] Data 0.013 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [5][0/32] Data 0.120 (0.120) Elapsed 0m 1s (remain 0m 43s) Loss: 1.6892(1.6892) Grad Norm: 6.6454  LR: 9.7751e-04  
Epoch: [5][31/32] Data 0.115 (0.118) Elapsed 0m 45s (remain 0m 0s) Loss: 0.7772(1.4141) Grad Norm: 12.1122  LR: 9.7751e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  6  2  2  4]
preds: [11  2  5  2  2]
Epoch 5 - avg_train_loss: 1.4141  lr: 9.7751e-04  time: 49s
Epoch 5 - Score: 0.4885


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [6][0/32] Data 0.123 (0.123) Elapsed 0m 1s (remain 0m 43s) Loss: 0.8356(0.8356) Grad Norm: 15.8134  LR: 9.6688e-04  
Epoch: [6][31/32] Data 0.131 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 1.2686(1.3198) Grad Norm: 11.6647  LR: 9.6688e-04  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  6  2  2  4]
preds: [11  2  5  0  3]
Epoch 6 - avg_train_loss: 1.3198  lr: 9.6688e-04  time: 49s
Epoch 6 - Score: 0.4733


EVAL: [8/9] Data 0.013 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [7][0/32] Data 0.126 (0.126) Elapsed 0m 1s (remain 0m 43s) Loss: 1.6857(1.6857) Grad Norm: 10.1803  LR: 9.5441e-04  
Epoch: [7][31/32] Data 0.111 (0.117) Elapsed 0m 44s (remain 0m 0s) Loss: 1.4883(1.2873) Grad Norm: 10.0393  LR: 9.5441e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  6  2  2  4]
preds: [11 11  5  6  3]
Epoch 7 - avg_train_loss: 1.2873  lr: 9.5441e-04  time: 49s
Epoch 7 - Score: 0.5954
Epoch 7 - Save Best Score: 0.5954 Model


EVAL: [8/9] Data 0.013 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [8][0/32] Data 0.126 (0.126) Elapsed 0m 1s (remain 0m 44s) Loss: 0.6254(0.6254) Grad Norm: 11.0347  LR: 9.4016e-04  
Epoch: [8][31/32] Data 0.134 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 1.8406(1.0972) Grad Norm: 9.3278  LR: 9.4016e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  6  2  2  4]
preds: [1 2 5 1 3]
Epoch 8 - avg_train_loss: 1.0972  lr: 9.4016e-04  time: 49s
Epoch 8 - Score: 0.5420


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [9][0/32] Data 0.129 (0.129) Elapsed 0m 1s (remain 0m 43s) Loss: 0.1733(0.1733) Grad Norm: 5.6452  LR: 9.2418e-04  
Epoch: [9][31/32] Data 0.121 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 1.7318(1.2750) Grad Norm: 11.6077  LR: 9.2418e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  6  2  2  4]
preds: [0 0 5 6 4]
Epoch 9 - avg_train_loss: 1.2750  lr: 9.2418e-04  time: 50s
Epoch 9 - Score: 0.4962


EVAL: [8/9] Data 0.013 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [10][0/32] Data 0.115 (0.115) Elapsed 0m 1s (remain 0m 43s) Loss: 0.6307(0.6307) Grad Norm: 11.9611  LR: 9.0654e-04  
Epoch: [10][31/32] Data 0.109 (0.120) Elapsed 0m 45s (remain 0m 0s) Loss: 0.9367(1.0073) Grad Norm: 16.5659  LR: 9.0654e-04  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  6  2  2  4]
preds: [3 2 2 5 3]
Epoch 10 - avg_train_loss: 1.0073  lr: 9.0654e-04  time: 49s
Epoch 10 - Score: 0.5344


EVAL: [8/9] Data 0.013 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [11][0/32] Data 0.115 (0.115) Elapsed 0m 1s (remain 0m 43s) Loss: 0.5305(0.5305) Grad Norm: 9.1714  LR: 8.8730e-04  
Epoch: [11][31/32] Data 0.119 (0.118) Elapsed 0m 44s (remain 0m 0s) Loss: 2.0002(1.0380) Grad Norm: 14.5302  LR: 8.8730e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  6  2  2  4]
preds: [5 0 5 5 4]
Epoch 11 - avg_train_loss: 1.0380  lr: 8.8730e-04  time: 49s
Epoch 11 - Score: 0.5115


EVAL: [8/9] Data 0.014 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [12][0/32] Data 0.129 (0.129) Elapsed 0m 1s (remain 0m 43s) Loss: 0.6041(0.6041) Grad Norm: 9.5154  LR: 8.6655e-04  
Epoch: [12][31/32] Data 0.119 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 0.3295(0.8568) Grad Norm: 8.7682  LR: 8.6655e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  6  2  2  4]
preds: [ 3 11  5  5  4]
Epoch 12 - avg_train_loss: 0.8568  lr: 8.6655e-04  time: 49s
Epoch 12 - Score: 0.5115


EVAL: [8/9] Data 0.014 (0.062) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [13][0/32] Data 0.122 (0.122) Elapsed 0m 1s (remain 0m 43s) Loss: 0.6296(0.6296) Grad Norm: 26.3819  LR: 8.4436e-04  
Epoch: [13][31/32] Data 0.124 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 1.1104(0.9782) Grad Norm: 8.8317  LR: 8.4436e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  6  2  2  4]
preds: [3 0 5 5 4]
Epoch 13 - avg_train_loss: 0.9782  lr: 8.4436e-04  time: 49s
Epoch 13 - Score: 0.5725


EVAL: [8/9] Data 0.013 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [14][0/32] Data 0.123 (0.123) Elapsed 0m 1s (remain 0m 43s) Loss: 1.3885(1.3885) Grad Norm: 9.9128  LR: 8.2081e-04  
Epoch: [14][31/32] Data 0.121 (0.119) Elapsed 0m 45s (remain 0m 0s) Loss: 0.1110(0.9023) Grad Norm: 4.5861  LR: 8.2081e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  6  2  2  4]
preds: [5 2 5 2 4]
Epoch 14 - avg_train_loss: 0.9023  lr: 8.2081e-04  time: 49s
Epoch 14 - Score: 0.4885


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [15][0/32] Data 0.123 (0.123) Elapsed 0m 1s (remain 0m 43s) Loss: 1.0070(1.0070) Grad Norm: 5.0728  LR: 7.9601e-04  
Epoch: [15][31/32] Data 0.124 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 0.2060(1.0347) Grad Norm: 9.9944  LR: 7.9601e-04  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  6  2  2  4]
preds: [ 1 11  5  6  4]
Epoch 15 - avg_train_loss: 1.0347  lr: 7.9601e-04  time: 49s
Epoch 15 - Score: 0.5725


EVAL: [8/9] Data 0.014 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [16][0/32] Data 0.116 (0.116) Elapsed 0m 1s (remain 0m 43s) Loss: 0.1519(0.1519) Grad Norm: 8.9445  LR: 7.7006e-04  
Epoch: [16][31/32] Data 0.115 (0.119) Elapsed 0m 45s (remain 0m 0s) Loss: 1.2791(0.8069) Grad Norm: 6.5205  LR: 7.7006e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  6  2  2  4]
preds: [11 11  5  6  4]
Epoch 16 - avg_train_loss: 0.8069  lr: 7.7006e-04  time: 49s
Epoch 16 - Score: 0.5267


EVAL: [8/9] Data 0.013 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [17][0/32] Data 0.121 (0.121) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0826(0.0826) Grad Norm: 4.8662  LR: 7.4304e-04  
Epoch: [17][31/32] Data 0.112 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 1.4593(0.7321) Grad Norm: 6.6244  LR: 7.4304e-04  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  6  2  2  4]
preds: [2 2 5 2 2]
Epoch 17 - avg_train_loss: 0.7321  lr: 7.4304e-04  time: 49s
Epoch 17 - Score: 0.5115


EVAL: [8/9] Data 0.014 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [18][0/32] Data 0.127 (0.127) Elapsed 0m 1s (remain 0m 44s) Loss: 1.8637(1.8637) Grad Norm: 8.9072  LR: 7.1508e-04  
Epoch: [18][31/32] Data 0.120 (0.120) Elapsed 0m 45s (remain 0m 0s) Loss: 1.1454(0.8663) Grad Norm: 6.4175  LR: 7.1508e-04  
EVAL: [0/9] Data 0.068 (0.068) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  6  2  2  4]
preds: [11  2  5  6  4]
Epoch 18 - avg_train_loss: 0.8663  lr: 7.1508e-04  time: 49s
Epoch 18 - Score: 0.5191


EVAL: [8/9] Data 0.013 (0.063) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [19][0/32] Data 0.125 (0.125) Elapsed 0m 1s (remain 0m 43s) Loss: 0.2395(0.2395) Grad Norm: 5.3344  LR: 6.8627e-04  
Epoch: [19][31/32] Data 0.127 (0.118) Elapsed 0m 44s (remain 0m 0s) Loss: 0.1682(0.5543) Grad Norm: 7.1867  LR: 6.8627e-04  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  6  2  2  4]
preds: [1 2 5 6 2]
Epoch 19 - avg_train_loss: 0.5543  lr: 6.8627e-04  time: 49s
Epoch 19 - Score: 0.4733


EVAL: [8/9] Data 0.013 (0.063) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [20][0/32] Data 0.112 (0.112) Elapsed 0m 1s (remain 0m 43s) Loss: 1.5393(1.5393) Grad Norm: 9.8713  LR: 6.5674e-04  
Epoch: [20][31/32] Data 0.120 (0.116) Elapsed 0m 44s (remain 0m 0s) Loss: 1.6398(0.6189) Grad Norm: 6.9553  LR: 6.5674e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  6  2  2  4]
preds: [ 3  2  5 11  2]
Epoch 20 - avg_train_loss: 0.6189  lr: 6.5674e-04  time: 49s
Epoch 20 - Score: 0.5496


EVAL: [8/9] Data 0.013 (0.062) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [21][0/32] Data 0.121 (0.121) Elapsed 0m 1s (remain 0m 43s) Loss: 1.1850(1.1850) Grad Norm: 6.5881  LR: 6.2661e-04  
Epoch: [21][31/32] Data 0.114 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 0.5085(0.5855) Grad Norm: 16.9057  LR: 6.2661e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  6  2  2  4]
preds: [9 0 5 4 2]
Epoch 21 - avg_train_loss: 0.5855  lr: 6.2661e-04  time: 49s
Epoch 21 - Score: 0.5115


EVAL: [8/9] Data 0.013 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [22][0/32] Data 0.122 (0.122) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0101(0.0101) Grad Norm: 1.6190  LR: 5.9598e-04  
Epoch: [22][31/32] Data 0.131 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 1.0972(0.6907) Grad Norm: 6.6068  LR: 5.9598e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  6  2  2  4]
preds: [11 12  5  0 10]
Epoch 22 - avg_train_loss: 0.6907  lr: 5.9598e-04  time: 49s
Epoch 22 - Score: 0.5649


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [23][0/32] Data 0.131 (0.131) Elapsed 0m 1s (remain 0m 44s) Loss: 1.4283(1.4283) Grad Norm: 6.1497  LR: 5.6498e-04  
Epoch: [23][31/32] Data 0.125 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0765(0.8163) Grad Norm: 4.5665  LR: 5.6498e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  6  2  2  4]
preds: [11  4  5  4  2]
Epoch 23 - avg_train_loss: 0.8163  lr: 5.6498e-04  time: 49s
Epoch 23 - Score: 0.5573


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [24][0/32] Data 0.131 (0.131) Elapsed 0m 1s (remain 0m 43s) Loss: 1.0344(1.0344) Grad Norm: 6.4863  LR: 5.3373e-04  
Epoch: [24][31/32] Data 0.123 (0.120) Elapsed 0m 44s (remain 0m 0s) Loss: 0.4211(0.5213) Grad Norm: 12.5477  LR: 5.3373e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  6  2  2  4]
preds: [9 4 5 4 2]
Epoch 24 - avg_train_loss: 0.5213  lr: 5.3373e-04  time: 49s
Epoch 24 - Score: 0.5191


EVAL: [8/9] Data 0.013 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [25][0/32] Data 0.125 (0.125) Elapsed 0m 1s (remain 0m 43s) Loss: 0.2909(0.2909) Grad Norm: 13.2085  LR: 5.0236e-04  
Epoch: [25][31/32] Data 0.117 (0.120) Elapsed 0m 45s (remain 0m 0s) Loss: 0.7358(0.6317) Grad Norm: 3.6565  LR: 5.0236e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  6  2  2  4]
preds: [1 2 5 0 2]
Epoch 25 - avg_train_loss: 0.6317  lr: 5.0236e-04  time: 49s
Epoch 25 - Score: 0.5725


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [26][0/32] Data 0.120 (0.120) Elapsed 0m 1s (remain 0m 43s) Loss: 1.1565(1.1565) Grad Norm: 5.3412  LR: 4.7099e-04  
Epoch: [26][31/32] Data 0.114 (0.119) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0277(0.5689) Grad Norm: 2.3078  LR: 4.7099e-04  
EVAL: [0/9] Data 0.068 (0.068) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  6  2  2  4]
preds: [5 0 5 0 2]
Epoch 26 - avg_train_loss: 0.5689  lr: 4.7099e-04  time: 49s
Epoch 26 - Score: 0.5267


EVAL: [8/9] Data 0.012 (0.063) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [27][0/32] Data 0.126 (0.126) Elapsed 0m 1s (remain 0m 43s) Loss: 1.1725(1.1725) Grad Norm: 5.3065  LR: 4.3974e-04  
Epoch: [27][31/32] Data 0.117 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 1.0849(0.7192) Grad Norm: 5.9618  LR: 4.3974e-04  
EVAL: [0/9] Data 0.067 (0.067) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  6  2  2  4]
preds: [10 11  5  1  2]
Epoch 27 - avg_train_loss: 0.7192  lr: 4.3974e-04  time: 49s
Epoch 27 - Score: 0.5267


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [28][0/32] Data 0.115 (0.115) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0077(0.0077) Grad Norm: 0.4676  LR: 4.0874e-04  
Epoch: [28][31/32] Data 0.139 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.8127(0.4986) Grad Norm: 6.1238  LR: 4.0874e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  6  2  2  4]
preds: [3 0 5 0 2]
Epoch 28 - avg_train_loss: 0.4986  lr: 4.0874e-04  time: 49s
Epoch 28 - Score: 0.5267


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [29][0/32] Data 0.135 (0.135) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0686(0.0686) Grad Norm: 14.4726  LR: 3.7811e-04  
Epoch: [29][31/32] Data 0.114 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0107(0.6088) Grad Norm: 0.6659  LR: 3.7811e-04  
EVAL: [0/9] Data 0.075 (0.075) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  6  2  2  4]
preds: [9 0 5 0 2]
Epoch 29 - avg_train_loss: 0.6088  lr: 3.7811e-04  time: 49s
Epoch 29 - Score: 0.5573


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [30][0/32] Data 0.130 (0.130) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0043(0.0043) Grad Norm: 0.2565  LR: 3.4797e-04  
Epoch: [30][31/32] Data 0.132 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0122(0.6206) Grad Norm: 2.5866  LR: 3.4797e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  6  2  2  4]
preds: [11  2  5  0  2]
Epoch 30 - avg_train_loss: 0.6206  lr: 3.4797e-04  time: 49s
Epoch 30 - Score: 0.5725


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [31][0/32] Data 0.127 (0.127) Elapsed 0m 1s (remain 0m 44s) Loss: 0.9901(0.9901) Grad Norm: 5.3782  LR: 3.1843e-04  
Epoch: [31][31/32] Data 0.119 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.8533(0.5334) Grad Norm: 5.5345  LR: 3.1843e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  6  2  2  4]
preds: [10  0  5  0  2]
Epoch 31 - avg_train_loss: 0.5334  lr: 3.1843e-04  time: 49s
Epoch 31 - Score: 0.5954


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [32][0/32] Data 0.123 (0.123) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0009(0.0009) Grad Norm: 0.0794  LR: 2.8962e-04  
Epoch: [32][31/32] Data 0.125 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0039(0.4387) Grad Norm: 0.4082  LR: 2.8962e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  6  2  2  4]
preds: [11  0  5  0  2]
Epoch 32 - avg_train_loss: 0.4387  lr: 2.8962e-04  time: 49s
Epoch 32 - Score: 0.5802


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [33][0/32] Data 0.118 (0.118) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0028(0.0028) Grad Norm: 0.2664  LR: 2.6165e-04  
Epoch: [33][31/32] Data 0.133 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0046(0.4559) Grad Norm: 0.3241  LR: 2.6165e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  6  2  2  4]
preds: [2 0 5 0 2]
Epoch 33 - avg_train_loss: 0.4559  lr: 2.6165e-04  time: 49s
Epoch 33 - Score: 0.5573


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [34][0/32] Data 0.130 (0.130) Elapsed 0m 1s (remain 0m 44s) Loss: 0.8829(0.8829) Grad Norm: 6.5386  LR: 2.3463e-04  
Epoch: [34][31/32] Data 0.113 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.9453(0.4067) Grad Norm: 5.2769  LR: 2.3463e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  6  2  2  4]
preds: [6 0 5 6 4]
Epoch 34 - avg_train_loss: 0.4067  lr: 2.3463e-04  time: 49s
Epoch 34 - Score: 0.5802


EVAL: [8/9] Data 0.013 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [35][0/32] Data 0.123 (0.123) Elapsed 0m 1s (remain 0m 43s) Loss: 1.0454(1.0454) Grad Norm: 3.2337  LR: 2.0866e-04  
Epoch: [35][31/32] Data 0.118 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0018(0.4200) Grad Norm: 0.1645  LR: 2.0866e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  6  2  2  4]
preds: [1 2 5 0 2]
Epoch 35 - avg_train_loss: 0.4200  lr: 2.0866e-04  time: 49s
Epoch 35 - Score: 0.5878


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [36][0/32] Data 0.123 (0.123) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0049(0.0049) Grad Norm: 0.3839  LR: 1.8385e-04  
Epoch: [36][31/32] Data 0.127 (0.120) Elapsed 0m 44s (remain 0m 0s) Loss: 0.7464(0.4404) Grad Norm: 3.9142  LR: 1.8385e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  6  2  2  4]
preds: [5 0 5 0 2]
Epoch 36 - avg_train_loss: 0.4404  lr: 1.8385e-04  time: 49s
Epoch 36 - Score: 0.5802


EVAL: [8/9] Data 0.014 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [37][0/32] Data 0.136 (0.136) Elapsed 0m 1s (remain 0m 44s) Loss: 0.9338(0.9338) Grad Norm: 5.1655  LR: 1.6030e-04  
Epoch: [37][31/32] Data 0.128 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0103(0.4499) Grad Norm: 1.0766  LR: 1.6030e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  6  2  2  4]
preds: [1 2 5 0 4]
Epoch 37 - avg_train_loss: 0.4499  lr: 1.6030e-04  time: 49s
Epoch 37 - Score: 0.5878


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [38][0/32] Data 0.100 (0.100) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0006(0.0006) Grad Norm: 0.0174  LR: 1.3809e-04  
Epoch: [38][31/32] Data 0.119 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 0.6963(0.5327) Grad Norm: 2.5132  LR: 1.3809e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  6  2  2  4]
preds: [1 2 5 0 4]
Epoch 38 - avg_train_loss: 0.5327  lr: 1.3809e-04  time: 49s
Epoch 38 - Score: 0.6031
Epoch 38 - Save Best Score: 0.6031 Model


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [39][0/32] Data 0.125 (0.125) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0020(0.0020) Grad Norm: 0.2815  LR: 1.1732e-04  
Epoch: [39][31/32] Data 0.123 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0039(0.4832) Grad Norm: 0.3312  LR: 1.1732e-04  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  6  2  2  4]
preds: [11  9  5  0  4]
Epoch 39 - avg_train_loss: 0.4832  lr: 1.1732e-04  time: 49s
Epoch 39 - Score: 0.5954


EVAL: [8/9] Data 0.013 (0.063) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [40][0/32] Data 0.133 (0.133) Elapsed 0m 1s (remain 0m 43s) Loss: 0.7837(0.7837) Grad Norm: 3.6230  LR: 9.8058e-05  
Epoch: [40][31/32] Data 0.114 (0.119) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0006(0.4305) Grad Norm: 0.0584  LR: 9.8058e-05  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  6  2  2  4]
preds: [1 2 5 0 4]
Epoch 40 - avg_train_loss: 0.4305  lr: 9.8058e-05  time: 49s
Epoch 40 - Score: 0.5878


EVAL: [8/9] Data 0.013 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [41][0/32] Data 0.112 (0.112) Elapsed 0m 1s (remain 0m 43s) Loss: 0.6126(0.6126) Grad Norm: 4.8701  LR: 8.0390e-05  
Epoch: [41][31/32] Data 0.122 (0.120) Elapsed 0m 44s (remain 0m 0s) Loss: 0.7014(0.4332) Grad Norm: 3.0608  LR: 8.0390e-05  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  6  2  2  4]
preds: [5 2 5 0 4]
Epoch 41 - avg_train_loss: 0.4332  lr: 8.0390e-05  time: 49s
Epoch 41 - Score: 0.5954


EVAL: [8/9] Data 0.013 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [42][0/32] Data 0.128 (0.128) Elapsed 0m 1s (remain 0m 43s) Loss: 0.9979(0.9979) Grad Norm: 8.2163  LR: 6.4381e-05  
Epoch: [42][31/32] Data 0.118 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.5938(0.5149) Grad Norm: 2.3155  LR: 6.4381e-05  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  6  2  2  4]
preds: [5 2 5 0 4]
Epoch 42 - avg_train_loss: 0.5149  lr: 6.4381e-05  time: 49s
Epoch 42 - Score: 0.5878


EVAL: [8/9] Data 0.013 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [43][0/32] Data 0.112 (0.112) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0002(0.0002) Grad Norm: 0.0141  LR: 5.0093e-05  
Epoch: [43][31/32] Data 0.129 (0.119) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0010(0.5013) Grad Norm: 0.0685  LR: 5.0093e-05  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  6  2  2  4]
preds: [5 2 5 0 4]
Epoch 43 - avg_train_loss: 0.5013  lr: 5.0093e-05  time: 49s
Epoch 43 - Score: 0.5954


EVAL: [8/9] Data 0.014 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [44][0/32] Data 0.113 (0.113) Elapsed 0m 1s (remain 0m 43s) Loss: 0.7731(0.7731) Grad Norm: 2.8430  LR: 3.7578e-05  
Epoch: [44][31/32] Data 0.109 (0.120) Elapsed 0m 44s (remain 0m 0s) Loss: 0.4855(0.3370) Grad Norm: 1.3923  LR: 3.7578e-05  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  6  2  2  4]
preds: [5 2 5 0 4]
Epoch 44 - avg_train_loss: 0.3370  lr: 3.7578e-05  time: 49s
Epoch 44 - Score: 0.6031


EVAL: [8/9] Data 0.013 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [45][0/32] Data 0.128 (0.128) Elapsed 0m 1s (remain 0m 43s) Loss: 0.8532(0.8532) Grad Norm: 6.2802  LR: 2.6881e-05  
Epoch: [45][31/32] Data 0.133 (0.121) Elapsed 0m 44s (remain 0m 0s) Loss: 0.6223(0.3900) Grad Norm: 3.0764  LR: 2.6881e-05  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  6  2  2  4]
preds: [5 2 5 0 4]
Epoch 45 - avg_train_loss: 0.3900  lr: 2.6881e-05  time: 49s
Epoch 45 - Score: 0.5878


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [46][0/32] Data 0.124 (0.124) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0010(0.0010) Grad Norm: 0.0673  LR: 1.8039e-05  
Epoch: [46][31/32] Data 0.120 (0.121) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0003(0.0016) Grad Norm: 0.0189  LR: 1.8039e-05  
EVAL: [0/9] Data 0.078 (0.078) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  6  2  2  4]
preds: [5 2 5 0 4]
Epoch 46 - avg_train_loss: 0.0016  lr: 1.8039e-05  time: 49s
Epoch 46 - Score: 0.5878


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [47][0/32] Data 0.130 (0.130) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0015(0.0015) Grad Norm: 0.0751  LR: 1.1073e-05  
Epoch: [47][31/32] Data 0.114 (0.119) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0008(0.0023) Grad Norm: 0.0331  LR: 1.1073e-05  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  6  2  2  4]
preds: [5 2 5 0 4]
Epoch 47 - avg_train_loss: 0.0023  lr: 1.1073e-05  time: 49s
Epoch 47 - Score: 0.5878


EVAL: [8/9] Data 0.014 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [48][0/32] Data 0.117 (0.117) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0017(0.0017) Grad Norm: 0.1444  LR: 5.9882e-06  
Epoch: [48][31/32] Data 0.118 (0.121) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0054(0.0032) Grad Norm: 0.3270  LR: 5.9882e-06  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  6  2  2  4]
preds: [5 2 5 0 4]
Epoch 48 - avg_train_loss: 0.0032  lr: 5.9882e-06  time: 49s
Epoch 48 - Score: 0.5878


EVAL: [8/9] Data 0.014 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [49][0/32] Data 0.119 (0.119) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0015(0.0015) Grad Norm: 0.0956  LR: 2.7534e-06  
Epoch: [49][31/32] Data 0.109 (0.119) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0010(0.0018) Grad Norm: 0.0632  LR: 2.7534e-06  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  6  2  2  4]
preds: [5 2 5 0 4]
Epoch 49 - avg_train_loss: 0.0018  lr: 2.7534e-06  time: 49s
Epoch 49 - Score: 0.5878


EVAL: [8/9] Data 0.014 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [50][0/32] Data 0.133 (0.133) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0008(0.0008) Grad Norm: 0.0345  LR: 1.2467e-06  
Epoch: [50][31/32] Data 0.121 (0.119) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0007(0.0011) Grad Norm: 0.0264  LR: 1.2467e-06  
EVAL: [0/9] Data 0.076 (0.076) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  6  2  2  4]
preds: [5 2 5 0 4]
Epoch 50 - avg_train_loss: 0.0011  lr: 1.2467e-06  time: 49s
Epoch 50 - Score: 0.5878


EVAL: [8/9] Data 0.013 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [1][0/32] Data 0.127 (0.127) Elapsed 0m 1s (remain 0m 43s) Loss: 2.5294(2.5294) Grad Norm: inf  LR: 1.0000e-03  
Epoch: [1][31/32] Data 0.119 (0.120) Elapsed 0m 44s (remain 0m 0s) Loss: 2.5541(2.5490) Grad Norm: 13.7579  LR: 1.0000e-03  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [4 0 3 6 0]
preds: [2 2 2 2 2]
Epoch 1 - avg_train_loss: 2.5490  lr: 1.0000e-03  time: 49s
Epoch 1 - Score: 0.2366
Epoch 1 - Save Best Score: 0.2366 Model


EVAL: [8/9] Data 0.013 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [2][0/32] Data 0.134 (0.134) Elapsed 0m 1s (remain 0m 44s) Loss: 2.3630(2.3630) Grad Norm: 8.5038  LR: 9.9803e-04  
Epoch: [2][31/32] Data 0.117 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 2.3647(2.3336) Grad Norm: 18.4263  LR: 9.9803e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [4 0 3 6 0]
preds: [6 2 5 6 8]
Epoch 2 - avg_train_loss: 2.3336  lr: 9.9803e-04  time: 49s
Epoch 2 - Score: 0.4427
Epoch 2 - Save Best Score: 0.4427 Model


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [3][0/32] Data 0.144 (0.144) Elapsed 0m 1s (remain 0m 45s) Loss: 2.0218(2.0218) Grad Norm: 13.7985  LR: 9.9312e-04  
Epoch: [3][31/32] Data 0.124 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 2.2377(2.0185) Grad Norm: 8.1192  LR: 9.9312e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [4 0 3 6 0]
preds: [0 0 1 0 0]
Epoch 3 - avg_train_loss: 2.0185  lr: 9.9312e-04  time: 50s
Epoch 3 - Score: 0.5267
Epoch 3 - Save Best Score: 0.5267 Model


EVAL: [8/9] Data 0.013 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [4][0/32] Data 0.149 (0.149) Elapsed 0m 1s (remain 0m 45s) Loss: 1.4347(1.4347) Grad Norm: 10.1929  LR: 9.8627e-04  
Epoch: [4][31/32] Data 0.124 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 1.1599(1.7312) Grad Norm: 15.5267  LR: 9.8627e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [4 0 3 6 0]
preds: [7 0 3 0 1]
Epoch 4 - avg_train_loss: 1.7312  lr: 9.8627e-04  time: 49s
Epoch 4 - Score: 0.5420
Epoch 4 - Save Best Score: 0.5420 Model


EVAL: [8/9] Data 0.013 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [5][0/32] Data 0.128 (0.128) Elapsed 0m 1s (remain 0m 44s) Loss: 1.0440(1.0440) Grad Norm: 10.4745  LR: 9.7751e-04  
Epoch: [5][31/32] Data 0.112 (0.118) Elapsed 0m 45s (remain 0m 0s) Loss: 2.5285(1.6686) Grad Norm: 10.7662  LR: 9.7751e-04  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [4 0 3 6 0]
preds: [5 7 3 6 9]
Epoch 5 - avg_train_loss: 1.6686  lr: 9.7751e-04  time: 49s
Epoch 5 - Score: 0.5573
Epoch 5 - Save Best Score: 0.5573 Model


EVAL: [8/9] Data 0.013 (0.063) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [6][0/32] Data 0.132 (0.132) Elapsed 0m 1s (remain 0m 44s) Loss: 1.6188(1.6188) Grad Norm: 8.0866  LR: 9.6688e-04  
Epoch: [6][31/32] Data 0.110 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 1.8708(1.3637) Grad Norm: 12.2446  LR: 9.6688e-04  
EVAL: [0/9] Data 0.068 (0.068) Elapsed 0m 0s (remain 0m 4s) 


labels: [4 0 3 6 0]
preds: [5 0 3 6 8]
Epoch 6 - avg_train_loss: 1.3637  lr: 9.6688e-04  time: 49s
Epoch 6 - Score: 0.5344


EVAL: [8/9] Data 0.014 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [7][0/32] Data 0.115 (0.115) Elapsed 0m 1s (remain 0m 43s) Loss: 1.8828(1.8828) Grad Norm: 12.0737  LR: 9.5441e-04  
Epoch: [7][31/32] Data 0.120 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 1.3347(1.2526) Grad Norm: 17.0654  LR: 9.5441e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [4 0 3 6 0]
preds: [4 0 3 0 4]
Epoch 7 - avg_train_loss: 1.2526  lr: 9.5441e-04  time: 49s
Epoch 7 - Score: 0.5267


EVAL: [8/9] Data 0.014 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [8][0/32] Data 0.123 (0.123) Elapsed 0m 1s (remain 0m 44s) Loss: 0.7611(0.7611) Grad Norm: 18.6718  LR: 9.4016e-04  
Epoch: [8][31/32] Data 0.120 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 1.7902(1.0331) Grad Norm: 11.6280  LR: 9.4016e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [4 0 3 6 0]
preds: [2 0 3 0 9]
Epoch 8 - avg_train_loss: 1.0331  lr: 9.4016e-04  time: 49s
Epoch 8 - Score: 0.5802
Epoch 8 - Save Best Score: 0.5802 Model


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [9][0/32] Data 0.141 (0.141) Elapsed 0m 1s (remain 0m 44s) Loss: 1.5050(1.5050) Grad Norm: 9.1815  LR: 9.2418e-04  
Epoch: [9][31/32] Data 0.128 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 1.6533(1.4169) Grad Norm: 9.9486  LR: 9.2418e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [4 0 3 6 0]
preds: [10  0  6  0  2]
Epoch 9 - avg_train_loss: 1.4169  lr: 9.2418e-04  time: 49s
Epoch 9 - Score: 0.5420


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [10][0/32] Data 0.129 (0.129) Elapsed 0m 1s (remain 0m 43s) Loss: 1.5092(1.5092) Grad Norm: 7.8089  LR: 9.0654e-04  
Epoch: [10][31/32] Data 0.120 (0.118) Elapsed 0m 45s (remain 0m 0s) Loss: 2.0971(1.0202) Grad Norm: 11.9410  LR: 9.0654e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [4 0 3 6 0]
preds: [4 0 3 6 9]
Epoch 10 - avg_train_loss: 1.0202  lr: 9.0654e-04  time: 49s
Epoch 10 - Score: 0.5954
Epoch 10 - Save Best Score: 0.5954 Model


EVAL: [8/9] Data 0.013 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [11][0/32] Data 0.123 (0.123) Elapsed 0m 1s (remain 0m 44s) Loss: 0.3076(0.3076) Grad Norm: 5.3878  LR: 8.8730e-04  
Epoch: [11][31/32] Data 0.124 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 1.3799(1.1111) Grad Norm: 10.0229  LR: 8.8730e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [4 0 3 6 0]
preds: [4 0 3 1 9]
Epoch 11 - avg_train_loss: 1.1111  lr: 8.8730e-04  time: 49s
Epoch 11 - Score: 0.6183
Epoch 11 - Save Best Score: 0.6183 Model


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [12][0/32] Data 0.131 (0.131) Elapsed 0m 1s (remain 0m 44s) Loss: 0.3000(0.3000) Grad Norm: 7.9030  LR: 8.6655e-04  
Epoch: [12][31/32] Data 0.126 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 0.7120(0.9365) Grad Norm: 15.7588  LR: 8.6655e-04  
EVAL: [0/9] Data 0.076 (0.076) Elapsed 0m 0s (remain 0m 4s) 


labels: [4 0 3 6 0]
preds: [4 0 3 6 9]
Epoch 12 - avg_train_loss: 0.9365  lr: 8.6655e-04  time: 49s
Epoch 12 - Score: 0.5649


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [13][0/32] Data 0.119 (0.119) Elapsed 0m 1s (remain 0m 44s) Loss: 1.4077(1.4077) Grad Norm: 6.4927  LR: 8.4436e-04  
Epoch: [13][31/32] Data 0.123 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 1.4904(0.8288) Grad Norm: 10.8530  LR: 8.4436e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [4 0 3 6 0]
preds: [11  0  3  6  9]
Epoch 13 - avg_train_loss: 0.8288  lr: 8.4436e-04  time: 50s
Epoch 13 - Score: 0.5115


EVAL: [8/9] Data 0.015 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [14][0/32] Data 0.123 (0.123) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0739(0.0739) Grad Norm: 2.8246  LR: 8.2081e-04  
Epoch: [14][31/32] Data 0.111 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 1.8672(0.9905) Grad Norm: 11.5690  LR: 8.2081e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [4 0 3 6 0]
preds: [4 0 3 0 4]
Epoch 14 - avg_train_loss: 0.9905  lr: 8.2081e-04  time: 49s
Epoch 14 - Score: 0.5878


EVAL: [8/9] Data 0.016 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [15][0/32] Data 0.124 (0.124) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0839(0.0839) Grad Norm: 2.9541  LR: 7.9601e-04  
Epoch: [15][31/32] Data 0.116 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 1.8622(1.0400) Grad Norm: 8.8009  LR: 7.9601e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [4 0 3 6 0]
preds: [ 4  0  3 11  9]
Epoch 15 - avg_train_loss: 1.0400  lr: 7.9601e-04  time: 49s
Epoch 15 - Score: 0.6183


EVAL: [8/9] Data 0.013 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [16][0/32] Data 0.126 (0.126) Elapsed 0m 1s (remain 0m 43s) Loss: 1.2351(1.2351) Grad Norm: 4.9718  LR: 7.7006e-04  
Epoch: [16][31/32] Data 0.119 (0.119) Elapsed 0m 45s (remain 0m 0s) Loss: 0.1623(0.8244) Grad Norm: 7.1109  LR: 7.7006e-04  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [4 0 3 6 0]
preds: [ 5  0  3 11  9]
Epoch 16 - avg_train_loss: 0.8244  lr: 7.7006e-04  time: 49s
Epoch 16 - Score: 0.5267


EVAL: [8/9] Data 0.014 (0.063) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [17][0/32] Data 0.127 (0.127) Elapsed 0m 1s (remain 0m 43s) Loss: 1.2482(1.2482) Grad Norm: 19.1829  LR: 7.4304e-04  
Epoch: [17][31/32] Data 0.116 (0.119) Elapsed 0m 45s (remain 0m 0s) Loss: 1.2140(0.7611) Grad Norm: 7.3728  LR: 7.4304e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [4 0 3 6 0]
preds: [ 1  0  3 11  9]
Epoch 17 - avg_train_loss: 0.7611  lr: 7.4304e-04  time: 49s
Epoch 17 - Score: 0.5802


EVAL: [8/9] Data 0.014 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [18][0/32] Data 0.125 (0.125) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0552(0.0552) Grad Norm: 3.6839  LR: 7.1508e-04  
Epoch: [18][31/32] Data 0.123 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 1.1069(0.7674) Grad Norm: 4.5830  LR: 7.1508e-04  
EVAL: [0/9] Data 0.068 (0.068) Elapsed 0m 0s (remain 0m 4s) 


labels: [4 0 3 6 0]
preds: [ 0  0  3 11  9]
Epoch 18 - avg_train_loss: 0.7674  lr: 7.1508e-04  time: 49s
Epoch 18 - Score: 0.5573


EVAL: [8/9] Data 0.014 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [19][0/32] Data 0.114 (0.114) Elapsed 0m 1s (remain 0m 43s) Loss: 0.8009(0.8009) Grad Norm: 4.6304  LR: 6.8627e-04  
Epoch: [19][31/32] Data 0.109 (0.120) Elapsed 0m 45s (remain 0m 0s) Loss: 1.1186(0.6999) Grad Norm: 8.5939  LR: 6.8627e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [4 0 3 6 0]
preds: [ 4  0  3 11  9]
Epoch 19 - avg_train_loss: 0.6999  lr: 6.8627e-04  time: 49s
Epoch 19 - Score: 0.5191


EVAL: [8/9] Data 0.014 (0.063) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [20][0/32] Data 0.121 (0.121) Elapsed 0m 1s (remain 0m 43s) Loss: 1.3814(1.3814) Grad Norm: 6.2939  LR: 6.5674e-04  
Epoch: [20][31/32] Data 0.129 (0.119) Elapsed 0m 45s (remain 0m 0s) Loss: 0.5356(0.6491) Grad Norm: 25.8581  LR: 6.5674e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [4 0 3 6 0]
preds: [4 0 3 9 9]
Epoch 20 - avg_train_loss: 0.6491  lr: 6.5674e-04  time: 49s
Epoch 20 - Score: 0.6031


EVAL: [8/9] Data 0.013 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [21][0/32] Data 0.123 (0.123) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0728(0.0728) Grad Norm: 4.3884  LR: 6.2661e-04  
Epoch: [21][31/32] Data 0.113 (0.119) Elapsed 0m 45s (remain 0m 0s) Loss: 1.3676(0.8377) Grad Norm: 5.6618  LR: 6.2661e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [4 0 3 6 0]
preds: [4 0 3 0 9]
Epoch 21 - avg_train_loss: 0.8377  lr: 6.2661e-04  time: 49s
Epoch 21 - Score: 0.5496


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [22][0/32] Data 0.119 (0.119) Elapsed 0m 1s (remain 0m 43s) Loss: 1.5150(1.5150) Grad Norm: 8.4966  LR: 5.9598e-04  
Epoch: [22][31/32] Data 0.124 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 0.1200(0.6281) Grad Norm: 6.6218  LR: 5.9598e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [4 0 3 6 0]
preds: [0 0 3 0 9]
Epoch 22 - avg_train_loss: 0.6281  lr: 5.9598e-04  time: 49s
Epoch 22 - Score: 0.5115


EVAL: [8/9] Data 0.014 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [23][0/32] Data 0.134 (0.134) Elapsed 0m 1s (remain 0m 44s) Loss: 0.2418(0.2418) Grad Norm: 9.9049  LR: 5.6498e-04  
Epoch: [23][31/32] Data 0.117 (0.119) Elapsed 0m 44s (remain 0m 0s) Loss: 1.2572(0.5880) Grad Norm: 6.6688  LR: 5.6498e-04  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [4 0 3 6 0]
preds: [4 0 3 0 4]
Epoch 23 - avg_train_loss: 0.5880  lr: 5.6498e-04  time: 49s
Epoch 23 - Score: 0.6183


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [24][0/32] Data 0.112 (0.112) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0137(0.0137) Grad Norm: 0.5303  LR: 5.3373e-04  
Epoch: [24][31/32] Data 0.123 (0.117) Elapsed 0m 44s (remain 0m 0s) Loss: 0.1114(0.5735) Grad Norm: 12.6722  LR: 5.3373e-04  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [4 0 3 6 0]
preds: [2 0 3 6 9]
Epoch 24 - avg_train_loss: 0.5735  lr: 5.3373e-04  time: 49s
Epoch 24 - Score: 0.5725


EVAL: [8/9] Data 0.013 (0.063) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [25][0/32] Data 0.121 (0.121) Elapsed 0m 1s (remain 0m 43s) Loss: 0.3837(0.3837) Grad Norm: 11.5996  LR: 5.0236e-04  
Epoch: [25][31/32] Data 0.120 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 1.6244(0.5941) Grad Norm: 13.8148  LR: 5.0236e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [4 0 3 6 0]
preds: [4 0 3 8 9]
Epoch 25 - avg_train_loss: 0.5941  lr: 5.0236e-04  time: 49s
Epoch 25 - Score: 0.6107


EVAL: [8/9] Data 0.013 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [26][0/32] Data 0.123 (0.123) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0076(0.0076) Grad Norm: 0.6606  LR: 4.7099e-04  
Epoch: [26][31/32] Data 0.122 (0.120) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0351(0.7317) Grad Norm: 1.9071  LR: 4.7099e-04  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [4 0 3 6 0]
preds: [4 0 3 0 4]
Epoch 26 - avg_train_loss: 0.7317  lr: 4.7099e-04  time: 49s
Epoch 26 - Score: 0.6489
Epoch 26 - Save Best Score: 0.6489 Model


EVAL: [8/9] Data 0.013 (0.063) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [27][0/32] Data 0.146 (0.146) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0090(0.0090) Grad Norm: 0.3012  LR: 4.3974e-04  
Epoch: [27][31/32] Data 0.125 (0.118) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0321(0.5434) Grad Norm: 3.6342  LR: 4.3974e-04  
EVAL: [0/9] Data 0.068 (0.068) Elapsed 0m 0s (remain 0m 4s) 


labels: [4 0 3 6 0]
preds: [4 0 3 0 9]
Epoch 27 - avg_train_loss: 0.5434  lr: 4.3974e-04  time: 49s
Epoch 27 - Score: 0.6107


EVAL: [8/9] Data 0.013 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [28][0/32] Data 0.106 (0.106) Elapsed 0m 1s (remain 0m 43s) Loss: 1.4869(1.4869) Grad Norm: 8.0663  LR: 4.0874e-04  
Epoch: [28][31/32] Data 0.110 (0.116) Elapsed 0m 44s (remain 0m 0s) Loss: 1.0855(0.6088) Grad Norm: 5.9904  LR: 4.0874e-04  
EVAL: [0/9] Data 0.068 (0.068) Elapsed 0m 0s (remain 0m 4s) 


labels: [4 0 3 6 0]
preds: [4 0 3 6 9]
Epoch 28 - avg_train_loss: 0.6088  lr: 4.0874e-04  time: 49s
Epoch 28 - Score: 0.6412


EVAL: [8/9] Data 0.013 (0.063) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [29][0/32] Data 0.128 (0.128) Elapsed 0m 1s (remain 0m 43s) Loss: 0.7634(0.7634) Grad Norm: 2.6787  LR: 3.7811e-04  
Epoch: [29][31/32] Data 0.118 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0059(0.5827) Grad Norm: 0.3541  LR: 3.7811e-04  
EVAL: [0/9] Data 0.067 (0.067) Elapsed 0m 0s (remain 0m 4s) 


labels: [4 0 3 6 0]
preds: [4 0 3 6 2]
Epoch 29 - avg_train_loss: 0.5827  lr: 3.7811e-04  time: 49s
Epoch 29 - Score: 0.6107


EVAL: [8/9] Data 0.013 (0.062) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [30][0/32] Data 0.133 (0.133) Elapsed 0m 1s (remain 0m 43s) Loss: 0.9384(0.9384) Grad Norm: 3.9522  LR: 3.4797e-04  
Epoch: [30][31/32] Data 0.108 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 1.1594(0.6003) Grad Norm: 6.2672  LR: 3.4797e-04  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [4 0 3 6 0]
preds: [4 0 3 9 9]
Epoch 30 - avg_train_loss: 0.6003  lr: 3.4797e-04  time: 49s
Epoch 30 - Score: 0.6336


EVAL: [8/9] Data 0.013 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [31][0/32] Data 0.122 (0.122) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0025(0.0025) Grad Norm: 0.2575  LR: 3.1843e-04  
Epoch: [31][31/32] Data 0.113 (0.117) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0015(0.5715) Grad Norm: 0.0635  LR: 3.1843e-04  
EVAL: [0/9] Data 0.068 (0.068) Elapsed 0m 0s (remain 0m 4s) 


labels: [4 0 3 6 0]
preds: [4 0 3 6 4]
Epoch 31 - avg_train_loss: 0.5715  lr: 3.1843e-04  time: 49s
Epoch 31 - Score: 0.6336


EVAL: [8/9] Data 0.013 (0.063) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [32][0/32] Data 0.119 (0.119) Elapsed 0m 1s (remain 0m 43s) Loss: 0.7681(0.7681) Grad Norm: 4.3839  LR: 2.8962e-04  
Epoch: [32][31/32] Data 0.120 (0.118) Elapsed 0m 44s (remain 0m 0s) Loss: 0.7038(0.5294) Grad Norm: 2.9282  LR: 2.8962e-04  
EVAL: [0/9] Data 0.068 (0.068) Elapsed 0m 0s (remain 0m 4s) 


labels: [4 0 3 6 0]
preds: [4 0 3 6 9]
Epoch 32 - avg_train_loss: 0.5294  lr: 2.8962e-04  time: 49s
Epoch 32 - Score: 0.6260


EVAL: [8/9] Data 0.013 (0.063) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [33][0/32] Data 0.119 (0.119) Elapsed 0m 1s (remain 0m 44s) Loss: 0.9666(0.9666) Grad Norm: 8.1026  LR: 2.6165e-04  
Epoch: [33][31/32] Data 0.117 (0.118) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0020(0.5920) Grad Norm: 0.0562  LR: 2.6165e-04  
EVAL: [0/9] Data 0.067 (0.067) Elapsed 0m 0s (remain 0m 4s) 


labels: [4 0 3 6 0]
preds: [4 0 3 6 9]
Epoch 33 - avg_train_loss: 0.5920  lr: 2.6165e-04  time: 49s
Epoch 33 - Score: 0.6336


EVAL: [8/9] Data 0.013 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [34][0/32] Data 0.112 (0.112) Elapsed 0m 1s (remain 0m 43s) Loss: 0.8890(0.8890) Grad Norm: 5.2064  LR: 2.3463e-04  
Epoch: [34][31/32] Data 0.123 (0.118) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0035(0.4582) Grad Norm: 0.3318  LR: 2.3463e-04  
EVAL: [0/9] Data 0.068 (0.068) Elapsed 0m 0s (remain 0m 4s) 


labels: [4 0 3 6 0]
preds: [4 0 3 9 9]
Epoch 34 - avg_train_loss: 0.4582  lr: 2.3463e-04  time: 49s
Epoch 34 - Score: 0.6489


EVAL: [8/9] Data 0.015 (0.063) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [35][0/32] Data 0.122 (0.122) Elapsed 0m 1s (remain 0m 43s) Loss: 0.8357(0.8357) Grad Norm: 3.3092  LR: 2.0866e-04  
Epoch: [35][31/32] Data 0.122 (0.118) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0003(0.3968) Grad Norm: 0.0125  LR: 2.0866e-04  
EVAL: [0/9] Data 0.068 (0.068) Elapsed 0m 0s (remain 0m 4s) 


labels: [4 0 3 6 0]
preds: [4 0 3 6 9]
Epoch 35 - avg_train_loss: 0.3968  lr: 2.0866e-04  time: 49s
Epoch 35 - Score: 0.6489


EVAL: [8/9] Data 0.017 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [36][0/32] Data 0.123 (0.123) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0043(0.0043) Grad Norm: 0.6833  LR: 1.8385e-04  
Epoch: [36][31/32] Data 0.130 (0.120) Elapsed 0m 44s (remain 0m 0s) Loss: 0.3559(0.4746) Grad Norm: 6.6753  LR: 1.8385e-04  
EVAL: [0/9] Data 0.066 (0.066) Elapsed 0m 0s (remain 0m 4s) 


labels: [4 0 3 6 0]
preds: [4 0 3 9 9]
Epoch 36 - avg_train_loss: 0.4746  lr: 1.8385e-04  time: 49s
Epoch 36 - Score: 0.6336


EVAL: [8/9] Data 0.013 (0.063) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [37][0/32] Data 0.119 (0.119) Elapsed 0m 1s (remain 0m 43s) Loss: 0.8778(0.8778) Grad Norm: 4.2554  LR: 1.6030e-04  
Epoch: [37][31/32] Data 0.099 (0.120) Elapsed 0m 44s (remain 0m 0s) Loss: 0.7792(0.3886) Grad Norm: 3.6688  LR: 1.6030e-04  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [4 0 3 6 0]
preds: [4 0 3 9 9]
Epoch 37 - avg_train_loss: 0.3886  lr: 1.6030e-04  time: 49s
Epoch 37 - Score: 0.6565
Epoch 37 - Save Best Score: 0.6565 Model


EVAL: [8/9] Data 0.014 (0.063) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [38][0/32] Data 0.125 (0.125) Elapsed 0m 1s (remain 0m 44s) Loss: 0.9574(0.9574) Grad Norm: 4.0937  LR: 1.3809e-04  
Epoch: [38][31/32] Data 0.122 (0.119) Elapsed 0m 44s (remain 0m 0s) Loss: 0.5980(0.4011) Grad Norm: 3.9588  LR: 1.3809e-04  
EVAL: [0/9] Data 0.068 (0.068) Elapsed 0m 0s (remain 0m 4s) 


labels: [4 0 3 6 0]
preds: [4 0 3 6 9]
Epoch 38 - avg_train_loss: 0.4011  lr: 1.3809e-04  time: 49s
Epoch 38 - Score: 0.6489


EVAL: [8/9] Data 0.013 (0.062) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [39][0/32] Data 0.118 (0.118) Elapsed 0m 1s (remain 0m 43s) Loss: 0.9112(0.9112) Grad Norm: 4.6517  LR: 1.1732e-04  
Epoch: [39][31/32] Data 0.106 (0.119) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0003(0.3754) Grad Norm: 0.0102  LR: 1.1732e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [4 0 3 6 0]
preds: [4 0 3 6 9]
Epoch 39 - avg_train_loss: 0.3754  lr: 1.1732e-04  time: 49s
Epoch 39 - Score: 0.6718
Epoch 39 - Save Best Score: 0.6718 Model


EVAL: [8/9] Data 0.013 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [40][0/32] Data 0.140 (0.140) Elapsed 0m 1s (remain 0m 44s) Loss: 0.6524(0.6524) Grad Norm: 6.7453  LR: 9.8058e-05  
Epoch: [40][31/32] Data 0.117 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 0.5921(0.4889) Grad Norm: 4.7698  LR: 9.8058e-05  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [4 0 3 6 0]
preds: [4 0 3 9 9]
Epoch 40 - avg_train_loss: 0.4889  lr: 9.8058e-05  time: 49s
Epoch 40 - Score: 0.6565


EVAL: [8/9] Data 0.014 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [41][0/32] Data 0.124 (0.124) Elapsed 0m 1s (remain 0m 43s) Loss: 0.7540(0.7540) Grad Norm: 2.8617  LR: 8.0390e-05  
Epoch: [41][31/32] Data 0.130 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0007(0.4165) Grad Norm: 0.0502  LR: 8.0390e-05  
EVAL: [0/9] Data 0.079 (0.079) Elapsed 0m 0s (remain 0m 4s) 


labels: [4 0 3 6 0]
preds: [4 0 3 6 9]
Epoch 41 - avg_train_loss: 0.4165  lr: 8.0390e-05  time: 49s
Epoch 41 - Score: 0.6641


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [42][0/32] Data 0.115 (0.115) Elapsed 0m 1s (remain 0m 43s) Loss: 0.6941(0.6941) Grad Norm: 4.5350  LR: 6.4381e-05  
Epoch: [42][31/32] Data 0.114 (0.116) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0034(0.3040) Grad Norm: 0.1897  LR: 6.4381e-05  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [4 0 3 6 0]
preds: [4 0 3 6 9]
Epoch 42 - avg_train_loss: 0.3040  lr: 6.4381e-05  time: 49s
Epoch 42 - Score: 0.6641


EVAL: [8/9] Data 0.013 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [43][0/32] Data 0.109 (0.109) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0004(0.0004) Grad Norm: 0.0170  LR: 5.0093e-05  
Epoch: [43][31/32] Data 0.124 (0.119) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0003(0.3761) Grad Norm: 0.0127  LR: 5.0093e-05  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [4 0 3 6 0]
preds: [4 0 3 6 9]
Epoch 43 - avg_train_loss: 0.3761  lr: 5.0093e-05  time: 49s
Epoch 43 - Score: 0.6641


EVAL: [8/9] Data 0.015 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [44][0/32] Data 0.119 (0.119) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0007(0.0007) Grad Norm: 0.0211  LR: 3.7578e-05  
Epoch: [44][31/32] Data 0.115 (0.120) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0016(0.2404) Grad Norm: 0.2658  LR: 3.7578e-05  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [4 0 3 6 0]
preds: [4 0 3 6 9]
Epoch 44 - avg_train_loss: 0.2404  lr: 3.7578e-05  time: 49s
Epoch 44 - Score: 0.6794
Epoch 44 - Save Best Score: 0.6794 Model


EVAL: [8/9] Data 0.014 (0.063) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [45][0/32] Data 0.128 (0.128) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0008(0.0008) Grad Norm: 0.1062  LR: 2.6881e-05  
Epoch: [45][31/32] Data 0.110 (0.120) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0002(0.3828) Grad Norm: 0.0080  LR: 2.6881e-05  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [4 0 3 6 0]
preds: [4 0 3 6 9]
Epoch 45 - avg_train_loss: 0.3828  lr: 2.6881e-05  time: 49s
Epoch 45 - Score: 0.6641


EVAL: [8/9] Data 0.013 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [46][0/32] Data 0.111 (0.111) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0006(0.0006) Grad Norm: 0.0387  LR: 1.8039e-05  
Epoch: [46][31/32] Data 0.122 (0.121) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0002(0.0017) Grad Norm: 0.0101  LR: 1.8039e-05  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [4 0 3 6 0]
preds: [4 0 3 6 9]
Epoch 46 - avg_train_loss: 0.0017  lr: 1.8039e-05  time: 49s
Epoch 46 - Score: 0.6565


EVAL: [8/9] Data 0.014 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [47][0/32] Data 0.139 (0.139) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0057(0.0057) Grad Norm: 0.2738  LR: 1.1073e-05  
Epoch: [47][31/32] Data 0.137 (0.121) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0004(0.0033) Grad Norm: 0.0224  LR: 1.1073e-05  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [4 0 3 6 0]
preds: [4 0 3 6 9]
Epoch 47 - avg_train_loss: 0.0033  lr: 1.1073e-05  time: 49s
Epoch 47 - Score: 0.6565


EVAL: [8/9] Data 0.013 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [48][0/32] Data 0.127 (0.127) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0005(0.0005) Grad Norm: 0.0162  LR: 5.9882e-06  
Epoch: [48][31/32] Data 0.113 (0.118) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0004(0.0014) Grad Norm: 0.0185  LR: 5.9882e-06  
EVAL: [0/9] Data 0.068 (0.068) Elapsed 0m 0s (remain 0m 4s) 


labels: [4 0 3 6 0]
preds: [4 0 3 6 9]
Epoch 48 - avg_train_loss: 0.0014  lr: 5.9882e-06  time: 49s
Epoch 48 - Score: 0.6565


EVAL: [8/9] Data 0.014 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [49][0/32] Data 0.136 (0.136) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0006(0.0006) Grad Norm: 0.0609  LR: 2.7534e-06  
Epoch: [49][31/32] Data 0.117 (0.121) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0003(0.0163) Grad Norm: 0.0081  LR: 2.7534e-06  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [4 0 3 6 0]
preds: [4 0 3 6 9]
Epoch 49 - avg_train_loss: 0.0163  lr: 2.7534e-06  time: 49s
Epoch 49 - Score: 0.6565


EVAL: [8/9] Data 0.014 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [50][0/32] Data 0.125 (0.125) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0003(0.0003) Grad Norm: 0.0118  LR: 1.2467e-06  
Epoch: [50][31/32] Data 0.112 (0.116) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0004(0.0009) Grad Norm: 0.0164  LR: 1.2467e-06  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [4 0 3 6 0]
preds: [4 0 3 6 9]
Epoch 50 - avg_train_loss: 0.0009  lr: 1.2467e-06  time: 49s
Epoch 50 - Score: 0.6565


EVAL: [8/9] Data 0.013 (0.063) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [1][0/32] Data 0.114 (0.114) Elapsed 0m 1s (remain 0m 42s) Loss: 2.9734(2.9734) Grad Norm: inf  LR: 1.0000e-03  
Epoch: [1][31/32] Data 0.134 (0.118) Elapsed 0m 44s (remain 0m 0s) Loss: 2.4460(2.5576) Grad Norm: 19.0562  LR: 1.0000e-03  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5 12  2 11  8]
preds: [6 9 6 6 1]
Epoch 1 - avg_train_loss: 2.5576  lr: 1.0000e-03  time: 49s
Epoch 1 - Score: 0.2519
Epoch 1 - Save Best Score: 0.2519 Model


EVAL: [8/9] Data 0.015 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [2][0/32] Data 0.137 (0.137) Elapsed 0m 1s (remain 0m 44s) Loss: 2.5663(2.5663) Grad Norm: 21.3169  LR: 9.9803e-04  
Epoch: [2][31/32] Data 0.107 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 2.0874(2.2200) Grad Norm: 17.2704  LR: 9.9803e-04  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5 12  2 11  8]
preds: [ 5  9  2 11  8]
Epoch 2 - avg_train_loss: 2.2200  lr: 9.9803e-04  time: 49s
Epoch 2 - Score: 0.4046
Epoch 2 - Save Best Score: 0.4046 Model


EVAL: [8/9] Data 0.013 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [3][0/32] Data 0.134 (0.134) Elapsed 0m 1s (remain 0m 45s) Loss: 2.1333(2.1333) Grad Norm: 13.0374  LR: 9.9312e-04  
Epoch: [3][31/32] Data 0.124 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 2.0212(1.9631) Grad Norm: 9.5826  LR: 9.9312e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5 12  2 11  8]
preds: [ 2  9  2 11  8]
Epoch 3 - avg_train_loss: 1.9631  lr: 9.9312e-04  time: 49s
Epoch 3 - Score: 0.4885
Epoch 3 - Save Best Score: 0.4885 Model


EVAL: [8/9] Data 0.014 (0.063) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [4][0/32] Data 0.124 (0.124) Elapsed 0m 1s (remain 0m 44s) Loss: 2.3387(2.3387) Grad Norm: 11.1595  LR: 9.8627e-04  
Epoch: [4][31/32] Data 0.127 (0.120) Elapsed 0m 45s (remain 0m 0s) Loss: 1.6036(1.7600) Grad Norm: 11.4551  LR: 9.8627e-04  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5 12  2 11  8]
preds: [ 2 12  2 11  8]
Epoch 4 - avg_train_loss: 1.7600  lr: 9.8627e-04  time: 49s
Epoch 4 - Score: 0.6183
Epoch 4 - Save Best Score: 0.6183 Model


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [5][0/32] Data 0.147 (0.147) Elapsed 0m 1s (remain 0m 44s) Loss: 1.4171(1.4171) Grad Norm: 15.9386  LR: 9.7751e-04  
Epoch: [5][31/32] Data 0.125 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 1.2786(1.5290) Grad Norm: 12.0804  LR: 9.7751e-04  
EVAL: [0/9] Data 0.075 (0.075) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5 12  2 11  8]
preds: [ 5  9  4 11  8]
Epoch 5 - avg_train_loss: 1.5290  lr: 9.7751e-04  time: 49s
Epoch 5 - Score: 0.4504


EVAL: [8/9] Data 0.013 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [6][0/32] Data 0.142 (0.142) Elapsed 0m 1s (remain 0m 44s) Loss: 1.4087(1.4087) Grad Norm: 12.7238  LR: 9.6688e-04  
Epoch: [6][31/32] Data 0.110 (0.118) Elapsed 0m 45s (remain 0m 0s) Loss: 1.0504(1.3376) Grad Norm: 11.2812  LR: 9.6688e-04  
EVAL: [0/9] Data 0.068 (0.068) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5 12  2 11  8]
preds: [ 5 12  2 11  8]
Epoch 6 - avg_train_loss: 1.3376  lr: 9.6688e-04  time: 49s
Epoch 6 - Score: 0.6107


EVAL: [8/9] Data 0.013 (0.063) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [7][0/32] Data 0.120 (0.120) Elapsed 0m 1s (remain 0m 43s) Loss: 0.2917(0.2917) Grad Norm: 6.3221  LR: 9.5441e-04  
Epoch: [7][31/32] Data 0.114 (0.120) Elapsed 0m 45s (remain 0m 0s) Loss: 0.9168(1.2069) Grad Norm: 10.9300  LR: 9.5441e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5 12  2 11  8]
preds: [ 5  9  2 10  8]
Epoch 7 - avg_train_loss: 1.2069  lr: 9.5441e-04  time: 49s
Epoch 7 - Score: 0.5802


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [8][0/32] Data 0.120 (0.120) Elapsed 0m 1s (remain 0m 43s) Loss: 1.8721(1.8721) Grad Norm: 10.6046  LR: 9.4016e-04  
Epoch: [8][31/32] Data 0.120 (0.119) Elapsed 0m 45s (remain 0m 0s) Loss: 1.1790(1.1693) Grad Norm: 9.8981  LR: 9.4016e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5 12  2 11  8]
preds: [ 5 12  2 11  8]
Epoch 8 - avg_train_loss: 1.1693  lr: 9.4016e-04  time: 49s
Epoch 8 - Score: 0.6260
Epoch 8 - Save Best Score: 0.6260 Model


EVAL: [8/9] Data 0.013 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [9][0/32] Data 0.128 (0.128) Elapsed 0m 1s (remain 0m 44s) Loss: 0.2677(0.2677) Grad Norm: 8.5608  LR: 9.2418e-04  
Epoch: [9][31/32] Data 0.125 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 1.9439(0.7543) Grad Norm: 9.2272  LR: 9.2418e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5 12  2 11  8]
preds: [ 5  9  2 11  8]
Epoch 9 - avg_train_loss: 0.7543  lr: 9.2418e-04  time: 49s
Epoch 9 - Score: 0.6489
Epoch 9 - Save Best Score: 0.6489 Model


EVAL: [8/9] Data 0.014 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [10][0/32] Data 0.124 (0.124) Elapsed 0m 1s (remain 0m 44s) Loss: 1.3213(1.3213) Grad Norm: 7.2442  LR: 9.0654e-04  
Epoch: [10][31/32] Data 0.115 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 1.6918(1.0748) Grad Norm: 9.1179  LR: 9.0654e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5 12  2 11  8]
preds: [ 5 12  2 11  8]
Epoch 10 - avg_train_loss: 1.0748  lr: 9.0654e-04  time: 49s
Epoch 10 - Score: 0.5878


EVAL: [8/9] Data 0.013 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [11][0/32] Data 0.121 (0.121) Elapsed 0m 1s (remain 0m 44s) Loss: 0.3125(0.3125) Grad Norm: 8.2497  LR: 8.8730e-04  
Epoch: [11][31/32] Data 0.105 (0.120) Elapsed 0m 45s (remain 0m 0s) Loss: 1.6231(1.0088) Grad Norm: 8.6980  LR: 8.8730e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5 12  2 11  8]
preds: [ 2  9  2 11  8]
Epoch 11 - avg_train_loss: 1.0088  lr: 8.8730e-04  time: 49s
Epoch 11 - Score: 0.6031


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [12][0/32] Data 0.145 (0.145) Elapsed 0m 1s (remain 0m 44s) Loss: 0.3994(0.3994) Grad Norm: 4.8117  LR: 8.6655e-04  
Epoch: [12][31/32] Data 0.118 (0.120) Elapsed 0m 45s (remain 0m 0s) Loss: 0.1923(1.0012) Grad Norm: 4.7163  LR: 8.6655e-04  
EVAL: [0/9] Data 0.077 (0.077) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5 12  2 11  8]
preds: [ 4 12  2 11  8]
Epoch 12 - avg_train_loss: 1.0012  lr: 8.6655e-04  time: 49s
Epoch 12 - Score: 0.5802


EVAL: [8/9] Data 0.013 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [13][0/32] Data 0.135 (0.135) Elapsed 0m 1s (remain 0m 44s) Loss: 1.8317(1.8317) Grad Norm: 11.4060  LR: 8.4436e-04  
Epoch: [13][31/32] Data 0.114 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 1.6139(1.0730) Grad Norm: 7.6054  LR: 8.4436e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5 12  2 11  8]
preds: [ 5 12  2 11  8]
Epoch 13 - avg_train_loss: 1.0730  lr: 8.4436e-04  time: 49s
Epoch 13 - Score: 0.5954


EVAL: [8/9] Data 0.015 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [14][0/32] Data 0.133 (0.133) Elapsed 0m 1s (remain 0m 44s) Loss: 0.2070(0.2070) Grad Norm: 6.4029  LR: 8.2081e-04  
Epoch: [14][31/32] Data 0.114 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.1538(0.8246) Grad Norm: 3.2805  LR: 8.2081e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5 12  2 11  8]
preds: [5 9 2 6 8]
Epoch 14 - avg_train_loss: 0.8246  lr: 8.2081e-04  time: 49s
Epoch 14 - Score: 0.5802


EVAL: [8/9] Data 0.013 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [15][0/32] Data 0.131 (0.131) Elapsed 0m 1s (remain 0m 43s) Loss: 1.3788(1.3788) Grad Norm: 7.9182  LR: 7.9601e-04  
Epoch: [15][31/32] Data 0.117 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 0.2257(0.7529) Grad Norm: 11.5272  LR: 7.9601e-04  
EVAL: [0/9] Data 0.068 (0.068) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5 12  2 11  8]
preds: [ 5 12  2  1  8]
Epoch 15 - avg_train_loss: 0.7529  lr: 7.9601e-04  time: 49s
Epoch 15 - Score: 0.5725


EVAL: [8/9] Data 0.014 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [16][0/32] Data 0.135 (0.135) Elapsed 0m 1s (remain 0m 44s) Loss: 1.4511(1.4511) Grad Norm: 6.2964  LR: 7.7006e-04  
Epoch: [16][31/32] Data 0.109 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 1.4008(0.9586) Grad Norm: 6.1066  LR: 7.7006e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5 12  2 11  8]
preds: [ 3  9  2 11  8]
Epoch 16 - avg_train_loss: 0.9586  lr: 7.7006e-04  time: 49s
Epoch 16 - Score: 0.5649


EVAL: [8/9] Data 0.012 (0.062) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [17][0/32] Data 0.128 (0.128) Elapsed 0m 1s (remain 0m 43s) Loss: 1.5669(1.5669) Grad Norm: 9.0234  LR: 7.4304e-04  
Epoch: [17][31/32] Data 0.125 (0.117) Elapsed 0m 44s (remain 0m 0s) Loss: 0.9831(0.7930) Grad Norm: 22.8327  LR: 7.4304e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5 12  2 11  8]
preds: [ 3 11  0 11  8]
Epoch 17 - avg_train_loss: 0.7930  lr: 7.4304e-04  time: 49s
Epoch 17 - Score: 0.4962


EVAL: [8/9] Data 0.014 (0.062) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [18][0/32] Data 0.128 (0.128) Elapsed 0m 1s (remain 0m 43s) Loss: 0.5445(0.5445) Grad Norm: 11.0842  LR: 7.1508e-04  
Epoch: [18][31/32] Data 0.112 (0.118) Elapsed 0m 44s (remain 0m 0s) Loss: 1.8998(0.5819) Grad Norm: 21.5002  LR: 7.1508e-04  
EVAL: [0/9] Data 0.064 (0.064) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5 12  2 11  8]
preds: [ 2 12  2 11  8]
Epoch 18 - avg_train_loss: 0.5819  lr: 7.1508e-04  time: 49s
Epoch 18 - Score: 0.6031


EVAL: [8/9] Data 0.012 (0.059) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [19][0/32] Data 0.114 (0.114) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0694(0.0694) Grad Norm: 2.6178  LR: 6.8627e-04  
Epoch: [19][31/32] Data 0.119 (0.115) Elapsed 0m 44s (remain 0m 0s) Loss: 1.3732(0.8176) Grad Norm: 6.4683  LR: 6.8627e-04  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5 12  2 11  8]
preds: [ 5  9  2 11  8]
Epoch 19 - avg_train_loss: 0.8176  lr: 6.8627e-04  time: 49s
Epoch 19 - Score: 0.5725


EVAL: [8/9] Data 0.013 (0.063) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [20][0/32] Data 0.131 (0.131) Elapsed 0m 1s (remain 0m 43s) Loss: 1.3401(1.3401) Grad Norm: 5.5659  LR: 6.5674e-04  
Epoch: [20][31/32] Data 0.119 (0.119) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0914(0.6332) Grad Norm: 5.5359  LR: 6.5674e-04  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5 12  2 11  8]
preds: [ 4 10  2 11  8]
Epoch 20 - avg_train_loss: 0.6332  lr: 6.5674e-04  time: 49s
Epoch 20 - Score: 0.5649


EVAL: [8/9] Data 0.014 (0.063) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [21][0/32] Data 0.116 (0.116) Elapsed 0m 1s (remain 0m 43s) Loss: 1.1746(1.1746) Grad Norm: 9.5013  LR: 6.2661e-04  
Epoch: [21][31/32] Data 0.120 (0.114) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0252(0.7186) Grad Norm: 2.4951  LR: 6.2661e-04  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5 12  2 11  8]
preds: [ 5 10  2 11  8]
Epoch 21 - avg_train_loss: 0.7186  lr: 6.2661e-04  time: 49s
Epoch 21 - Score: 0.6489


EVAL: [8/9] Data 0.014 (0.063) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [22][0/32] Data 0.123 (0.123) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0134(0.0134) Grad Norm: 1.5069  LR: 5.9598e-04  
Epoch: [22][31/32] Data 0.124 (0.119) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0461(0.8194) Grad Norm: 2.2878  LR: 5.9598e-04  
EVAL: [0/9] Data 0.068 (0.068) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5 12  2 11  8]
preds: [ 2 12  2 11  8]
Epoch 22 - avg_train_loss: 0.8194  lr: 5.9598e-04  time: 49s
Epoch 22 - Score: 0.5954


EVAL: [8/9] Data 0.014 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [23][0/32] Data 0.127 (0.127) Elapsed 0m 1s (remain 0m 43s) Loss: 0.3164(0.3164) Grad Norm: 10.4206  LR: 5.6498e-04  
Epoch: [23][31/32] Data 0.109 (0.120) Elapsed 0m 45s (remain 0m 0s) Loss: 1.5086(0.5515) Grad Norm: 6.3722  LR: 5.6498e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5 12  2 11  8]
preds: [ 5 12  2 11  8]
Epoch 23 - avg_train_loss: 0.5515  lr: 5.6498e-04  time: 49s
Epoch 23 - Score: 0.6107


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [24][0/32] Data 0.131 (0.131) Elapsed 0m 1s (remain 0m 43s) Loss: 1.1725(1.1725) Grad Norm: 7.1782  LR: 5.3373e-04  
Epoch: [24][31/32] Data 0.123 (0.119) Elapsed 0m 45s (remain 0m 0s) Loss: 0.8567(0.6401) Grad Norm: 3.7154  LR: 5.3373e-04  
EVAL: [0/9] Data 0.068 (0.068) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5 12  2 11  8]
preds: [ 5 12  2 11  8]
Epoch 24 - avg_train_loss: 0.6401  lr: 5.3373e-04  time: 49s
Epoch 24 - Score: 0.6183


EVAL: [8/9] Data 0.013 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [25][0/32] Data 0.123 (0.123) Elapsed 0m 1s (remain 0m 43s) Loss: 1.1156(1.1156) Grad Norm: 7.3320  LR: 5.0236e-04  
Epoch: [25][31/32] Data 0.121 (0.118) Elapsed 0m 45s (remain 0m 0s) Loss: 0.8477(0.6455) Grad Norm: 5.5126  LR: 5.0236e-04  
EVAL: [0/9] Data 0.068 (0.068) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5 12  2 11  8]
preds: [ 5 12  2  8  8]
Epoch 25 - avg_train_loss: 0.6455  lr: 5.0236e-04  time: 49s
Epoch 25 - Score: 0.5878


EVAL: [8/9] Data 0.013 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [26][0/32] Data 0.124 (0.124) Elapsed 0m 1s (remain 0m 44s) Loss: 1.2728(1.2728) Grad Norm: 7.6885  LR: 4.7099e-04  
Epoch: [26][31/32] Data 0.113 (0.119) Elapsed 0m 45s (remain 0m 0s) Loss: 0.3570(0.8404) Grad Norm: 11.2557  LR: 4.7099e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5 12  2 11  8]
preds: [ 5 12  2 11  8]
Epoch 26 - avg_train_loss: 0.8404  lr: 4.7099e-04  time: 49s
Epoch 26 - Score: 0.6336


EVAL: [8/9] Data 0.014 (0.062) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [27][0/32] Data 0.127 (0.127) Elapsed 0m 1s (remain 0m 43s) Loss: 1.3081(1.3081) Grad Norm: 6.7058  LR: 4.3974e-04  
Epoch: [27][31/32] Data 0.118 (0.118) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0211(0.6155) Grad Norm: 1.8064  LR: 4.3974e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5 12  2 11  8]
preds: [ 5 12  2 11  8]
Epoch 27 - avg_train_loss: 0.6155  lr: 4.3974e-04  time: 49s
Epoch 27 - Score: 0.6641
Epoch 27 - Save Best Score: 0.6641 Model


EVAL: [8/9] Data 0.013 (0.063) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [28][0/32] Data 0.132 (0.132) Elapsed 0m 1s (remain 0m 44s) Loss: 1.2641(1.2641) Grad Norm: 10.4446  LR: 4.0874e-04  
Epoch: [28][31/32] Data 0.126 (0.120) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0030(0.4898) Grad Norm: 0.1927  LR: 4.0874e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5 12  2 11  8]
preds: [ 5 12  2 11  8]
Epoch 28 - avg_train_loss: 0.4898  lr: 4.0874e-04  time: 49s
Epoch 28 - Score: 0.6489


EVAL: [8/9] Data 0.014 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [29][0/32] Data 0.124 (0.124) Elapsed 0m 1s (remain 0m 43s) Loss: 1.2112(1.2112) Grad Norm: 8.5909  LR: 3.7811e-04  
Epoch: [29][31/32] Data 0.106 (0.118) Elapsed 0m 44s (remain 0m 0s) Loss: 0.9530(0.4267) Grad Norm: 5.3209  LR: 3.7811e-04  
EVAL: [0/9] Data 0.084 (0.084) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5 12  2 11  8]
preds: [ 5 12  2 11  8]
Epoch 29 - avg_train_loss: 0.4267  lr: 3.7811e-04  time: 49s
Epoch 29 - Score: 0.6565


EVAL: [8/9] Data 0.013 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [30][0/32] Data 0.118 (0.118) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0082(0.0082) Grad Norm: 0.3190  LR: 3.4797e-04  
Epoch: [30][31/32] Data 0.117 (0.119) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0411(0.3994) Grad Norm: 3.1772  LR: 3.4797e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5 12  2 11  8]
preds: [ 5 12  2 11  8]
Epoch 30 - avg_train_loss: 0.3994  lr: 3.4797e-04  time: 49s
Epoch 30 - Score: 0.6412


EVAL: [8/9] Data 0.014 (0.063) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [31][0/32] Data 0.126 (0.126) Elapsed 0m 1s (remain 0m 44s) Loss: 1.0401(1.0401) Grad Norm: 8.4214  LR: 3.1843e-04  
Epoch: [31][31/32] Data 0.141 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 1.0837(0.5663) Grad Norm: 5.2188  LR: 3.1843e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5 12  2 11  8]
preds: [ 5 12  2 11  8]
Epoch 31 - avg_train_loss: 0.5663  lr: 3.1843e-04  time: 49s
Epoch 31 - Score: 0.6870
Epoch 31 - Save Best Score: 0.6870 Model


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [32][0/32] Data 0.135 (0.135) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0028(0.0028) Grad Norm: 0.1440  LR: 2.8962e-04  
Epoch: [32][31/32] Data 0.127 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0016(0.5925) Grad Norm: 0.0543  LR: 2.8962e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5 12  2 11  8]
preds: [ 5 12  2 11  8]
Epoch 32 - avg_train_loss: 0.5925  lr: 2.8962e-04  time: 50s
Epoch 32 - Score: 0.6412


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [33][0/32] Data 0.124 (0.124) Elapsed 0m 1s (remain 0m 43s) Loss: 1.2093(1.2093) Grad Norm: 8.4362  LR: 2.6165e-04  
Epoch: [33][31/32] Data 0.126 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 1.0486(0.4793) Grad Norm: 5.4616  LR: 2.6165e-04  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5 12  2 11  8]
preds: [ 5 12  2 11  8]
Epoch 33 - avg_train_loss: 0.4793  lr: 2.6165e-04  time: 49s
Epoch 33 - Score: 0.7023
Epoch 33 - Save Best Score: 0.7023 Model


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [34][0/32] Data 0.149 (0.149) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0015(0.0015) Grad Norm: 0.0555  LR: 2.3463e-04  
Epoch: [34][31/32] Data 0.123 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 1.0522(0.4669) Grad Norm: 5.9143  LR: 2.3463e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5 12  2 11  8]
preds: [ 5 12  2 11  8]
Epoch 34 - avg_train_loss: 0.4669  lr: 2.3463e-04  time: 49s
Epoch 34 - Score: 0.6794


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [35][0/32] Data 0.125 (0.125) Elapsed 0m 1s (remain 0m 43s) Loss: 1.0736(1.0736) Grad Norm: 6.1396  LR: 2.0866e-04  
Epoch: [35][31/32] Data 0.102 (0.119) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0058(0.6150) Grad Norm: 0.2806  LR: 2.0866e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5 12  2 11  8]
preds: [ 5 12  2 11  8]
Epoch 35 - avg_train_loss: 0.6150  lr: 2.0866e-04  time: 49s
Epoch 35 - Score: 0.6870


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [36][0/32] Data 0.127 (0.127) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0039(0.0039) Grad Norm: 0.1556  LR: 1.8385e-04  
Epoch: [36][31/32] Data 0.110 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0020(0.4734) Grad Norm: 0.0737  LR: 1.8385e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5 12  2 11  8]
preds: [ 5 12  2 11  8]
Epoch 36 - avg_train_loss: 0.4734  lr: 1.8385e-04  time: 49s
Epoch 36 - Score: 0.7176
Epoch 36 - Save Best Score: 0.7176 Model


EVAL: [8/9] Data 0.014 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [37][0/32] Data 0.125 (0.125) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0027(0.0027) Grad Norm: 0.1805  LR: 1.6030e-04  
Epoch: [37][31/32] Data 0.122 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0046(0.5173) Grad Norm: 0.2170  LR: 1.6030e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5 12  2 11  8]
preds: [ 5 12  2 11  8]
Epoch 37 - avg_train_loss: 0.5173  lr: 1.6030e-04  time: 49s
Epoch 37 - Score: 0.6718


EVAL: [8/9] Data 0.013 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [38][0/32] Data 0.124 (0.124) Elapsed 0m 1s (remain 0m 43s) Loss: 0.8930(0.8930) Grad Norm: 5.7098  LR: 1.3809e-04  
Epoch: [38][31/32] Data 0.131 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0004(0.3464) Grad Norm: 0.0224  LR: 1.3809e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5 12  2 11  8]
preds: [ 5 12  2 11  8]
Epoch 38 - avg_train_loss: 0.3464  lr: 1.3809e-04  time: 49s
Epoch 38 - Score: 0.6947


EVAL: [8/9] Data 0.013 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [39][0/32] Data 0.130 (0.130) Elapsed 0m 1s (remain 0m 43s) Loss: 0.8759(0.8759) Grad Norm: 6.2393  LR: 1.1732e-04  
Epoch: [39][31/32] Data 0.128 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.7539(0.4438) Grad Norm: 4.0493  LR: 1.1732e-04  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5 12  2 11  8]
preds: [ 5 12  2 11  8]
Epoch 39 - avg_train_loss: 0.4438  lr: 1.1732e-04  time: 49s
Epoch 39 - Score: 0.7023


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [40][0/32] Data 0.136 (0.136) Elapsed 0m 1s (remain 0m 44s) Loss: 0.8763(0.8763) Grad Norm: 4.4971  LR: 9.8058e-05  
Epoch: [40][31/32] Data 0.122 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.7958(0.5065) Grad Norm: 3.9123  LR: 9.8058e-05  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5 12  2 11  8]
preds: [ 5 12  2 11  8]
Epoch 40 - avg_train_loss: 0.5065  lr: 9.8058e-05  time: 49s
Epoch 40 - Score: 0.6947


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [41][0/32] Data 0.119 (0.119) Elapsed 0m 1s (remain 0m 43s) Loss: 0.6658(0.6658) Grad Norm: 3.1222  LR: 8.0390e-05  
Epoch: [41][31/32] Data 0.105 (0.120) Elapsed 0m 45s (remain 0m 0s) Loss: 0.7196(0.3556) Grad Norm: 3.5124  LR: 8.0390e-05  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5 12  2 11  8]
preds: [ 5 12  2 11  8]
Epoch 41 - avg_train_loss: 0.3556  lr: 8.0390e-05  time: 49s
Epoch 41 - Score: 0.6947


EVAL: [8/9] Data 0.015 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [42][0/32] Data 0.124 (0.124) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0053(0.0053) Grad Norm: 0.9682  LR: 6.4381e-05  
Epoch: [42][31/32] Data 0.115 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 0.7954(0.4450) Grad Norm: 3.4284  LR: 6.4381e-05  
EVAL: [0/9] Data 0.067 (0.067) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5 12  2 11  8]
preds: [ 5 12  2 11  8]
Epoch 42 - avg_train_loss: 0.4450  lr: 6.4381e-05  time: 49s
Epoch 42 - Score: 0.7176


EVAL: [8/9] Data 0.014 (0.063) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [43][0/32] Data 0.135 (0.135) Elapsed 0m 1s (remain 0m 44s) Loss: 1.1595(1.1595) Grad Norm: 7.2044  LR: 5.0093e-05  
Epoch: [43][31/32] Data 0.132 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.9531(0.4447) Grad Norm: 49.2551  LR: 5.0093e-05  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5 12  2 11  8]
preds: [ 5 12  2 11  8]
Epoch 43 - avg_train_loss: 0.4447  lr: 5.0093e-05  time: 50s
Epoch 43 - Score: 0.7176


EVAL: [8/9] Data 0.013 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [44][0/32] Data 0.133 (0.133) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0008(0.0008) Grad Norm: 0.0271  LR: 3.7578e-05  
Epoch: [44][31/32] Data 0.133 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 1.0025(0.4748) Grad Norm: 9.3201  LR: 3.7578e-05  
EVAL: [0/9] Data 0.077 (0.077) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5 12  2 11  8]
preds: [ 5 12  2 11  8]
Epoch 44 - avg_train_loss: 0.4748  lr: 3.7578e-05  time: 50s
Epoch 44 - Score: 0.6947


EVAL: [8/9] Data 0.014 (0.069) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [45][0/32] Data 0.129 (0.129) Elapsed 0m 1s (remain 0m 43s) Loss: 0.7958(0.7958) Grad Norm: 4.8537  LR: 2.6881e-05  
Epoch: [45][31/32] Data 0.117 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.7802(0.4857) Grad Norm: 4.3545  LR: 2.6881e-05  
EVAL: [0/9] Data 0.075 (0.075) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5 12  2 11  8]
preds: [ 5 12  2 11  8]
Epoch 45 - avg_train_loss: 0.4857  lr: 2.6881e-05  time: 49s
Epoch 45 - Score: 0.6947


EVAL: [8/9] Data 0.013 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [46][0/32] Data 0.130 (0.130) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0004(0.0004) Grad Norm: 0.0127  LR: 1.8039e-05  
Epoch: [46][31/32] Data 0.128 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0002(0.0037) Grad Norm: 0.0050  LR: 1.8039e-05  
EVAL: [0/9] Data 0.078 (0.078) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5 12  2 11  8]
preds: [ 5 12  2 11  8]
Epoch 46 - avg_train_loss: 0.0037  lr: 1.8039e-05  time: 50s
Epoch 46 - Score: 0.6947


EVAL: [8/9] Data 0.014 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [47][0/32] Data 0.125 (0.125) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0004(0.0004) Grad Norm: 0.0214  LR: 1.1073e-05  
Epoch: [47][31/32] Data 0.126 (0.128) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0012(0.0017) Grad Norm: 0.1324  LR: 1.1073e-05  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5 12  2 11  8]
preds: [ 5 12  2 11  8]
Epoch 47 - avg_train_loss: 0.0017  lr: 1.1073e-05  time: 50s
Epoch 47 - Score: 0.6947


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [48][0/32] Data 0.132 (0.132) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0007(0.0007) Grad Norm: 0.0219  LR: 5.9882e-06  
Epoch: [48][31/32] Data 0.122 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0006(0.0021) Grad Norm: 0.0451  LR: 5.9882e-06  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5 12  2 11  8]
preds: [ 5 12  2 11  8]
Epoch 48 - avg_train_loss: 0.0021  lr: 5.9882e-06  time: 50s
Epoch 48 - Score: 0.6870


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [49][0/32] Data 0.128 (0.128) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0018(0.0018) Grad Norm: 0.1457  LR: 2.7534e-06  
Epoch: [49][31/32] Data 0.128 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0015(0.0014) Grad Norm: 0.2016  LR: 2.7534e-06  
EVAL: [0/9] Data 0.078 (0.078) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5 12  2 11  8]
preds: [ 5 12  2 11  8]
Epoch 49 - avg_train_loss: 0.0014  lr: 2.7534e-06  time: 50s
Epoch 49 - Score: 0.6870


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [50][0/32] Data 0.131 (0.131) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0015(0.0015) Grad Norm: 0.5562  LR: 1.2467e-06  
Epoch: [50][31/32] Data 0.132 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0508(0.0031) Grad Norm: 8.6729  LR: 1.2467e-06  
EVAL: [0/9] Data 0.076 (0.076) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5 12  2 11  8]
preds: [ 5 12  2 11  8]
Epoch 50 - avg_train_loss: 0.0031  lr: 1.2467e-06  time: 49s
Epoch 50 - Score: 0.6870


EVAL: [8/9] Data 0.013 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [1][0/32] Data 0.119 (0.119) Elapsed 0m 1s (remain 0m 42s) Loss: 2.6906(2.6906) Grad Norm: inf  LR: 1.0000e-03  
Epoch: [1][31/32] Data 0.133 (0.127) Elapsed 0m 45s (remain 0m 0s) Loss: 2.6101(2.5425) Grad Norm: 10.3876  LR: 1.0000e-03  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [2 7 2 3 0]
preds: [4 2 2 4 0]
Epoch 1 - avg_train_loss: 2.5425  lr: 1.0000e-03  time: 50s
Epoch 1 - Score: 0.2748
Epoch 1 - Save Best Score: 0.2748 Model


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [2][0/32] Data 0.132 (0.132) Elapsed 0m 1s (remain 0m 44s) Loss: 2.4470(2.4470) Grad Norm: 11.3807  LR: 9.9803e-04  
Epoch: [2][31/32] Data 0.129 (0.130) Elapsed 0m 45s (remain 0m 0s) Loss: 1.9573(2.3474) Grad Norm: 11.7384  LR: 9.9803e-04  
EVAL: [0/9] Data 0.075 (0.075) Elapsed 0m 0s (remain 0m 4s) 


labels: [2 7 2 3 0]
preds: [ 5  2  2  2 11]
Epoch 2 - avg_train_loss: 2.3474  lr: 9.9803e-04  time: 50s
Epoch 2 - Score: 0.3664
Epoch 2 - Save Best Score: 0.3664 Model


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [3][0/32] Data 0.139 (0.139) Elapsed 0m 1s (remain 0m 44s) Loss: 2.2382(2.2382) Grad Norm: 12.4972  LR: 9.9312e-04  
Epoch: [3][31/32] Data 0.141 (0.127) Elapsed 0m 45s (remain 0m 0s) Loss: 1.7319(2.0399) Grad Norm: 11.8603  LR: 9.9312e-04  
EVAL: [0/9] Data 0.076 (0.076) Elapsed 0m 0s (remain 0m 4s) 


labels: [2 7 2 3 0]
preds: [ 5  7  2  2 11]
Epoch 3 - avg_train_loss: 2.0399  lr: 9.9312e-04  time: 50s
Epoch 3 - Score: 0.4504
Epoch 3 - Save Best Score: 0.4504 Model


EVAL: [8/9] Data 0.014 (0.071) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [4][0/32] Data 0.144 (0.144) Elapsed 0m 1s (remain 0m 44s) Loss: 1.6758(1.6758) Grad Norm: 9.2718  LR: 9.8627e-04  
Epoch: [4][31/32] Data 0.135 (0.130) Elapsed 0m 45s (remain 0m 0s) Loss: 2.3865(1.6043) Grad Norm: 12.4651  LR: 9.8627e-04  
EVAL: [0/9] Data 0.076 (0.076) Elapsed 0m 0s (remain 0m 4s) 


labels: [2 7 2 3 0]
preds: [ 5  7  2  4 12]
Epoch 4 - avg_train_loss: 1.6043  lr: 9.8627e-04  time: 50s
Epoch 4 - Score: 0.4885
Epoch 4 - Save Best Score: 0.4885 Model


EVAL: [8/9] Data 0.014 (0.069) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [5][0/32] Data 0.130 (0.130) Elapsed 0m 1s (remain 0m 45s) Loss: 1.6639(1.6639) Grad Norm: 10.5536  LR: 9.7751e-04  
Epoch: [5][31/32] Data 0.118 (0.128) Elapsed 0m 45s (remain 0m 0s) Loss: 1.6249(1.5134) Grad Norm: 7.5738  LR: 9.7751e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [2 7 2 3 0]
preds: [ 5  4  2  4 12]
Epoch 5 - avg_train_loss: 1.5134  lr: 9.7751e-04  time: 50s
Epoch 5 - Score: 0.5802
Epoch 5 - Save Best Score: 0.5802 Model


EVAL: [8/9] Data 0.014 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [6][0/32] Data 0.131 (0.131) Elapsed 0m 1s (remain 0m 44s) Loss: 1.3980(1.3980) Grad Norm: 9.3550  LR: 9.6688e-04  
Epoch: [6][31/32] Data 0.118 (0.128) Elapsed 0m 45s (remain 0m 0s) Loss: 0.4957(1.1610) Grad Norm: 8.8790  LR: 9.6688e-04  
EVAL: [0/9] Data 0.077 (0.077) Elapsed 0m 0s (remain 0m 4s) 


labels: [2 7 2 3 0]
preds: [5 0 1 4 0]
Epoch 6 - avg_train_loss: 1.1610  lr: 9.6688e-04  time: 50s
Epoch 6 - Score: 0.4275


EVAL: [8/9] Data 0.014 (0.070) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [7][0/32] Data 0.137 (0.137) Elapsed 0m 1s (remain 0m 44s) Loss: 2.0375(2.0375) Grad Norm: 10.6893  LR: 9.5441e-04  
Epoch: [7][31/32] Data 0.119 (0.128) Elapsed 0m 45s (remain 0m 0s) Loss: 2.2660(1.4081) Grad Norm: 11.4796  LR: 9.5441e-04  
EVAL: [0/9] Data 0.078 (0.078) Elapsed 0m 0s (remain 0m 4s) 


labels: [2 7 2 3 0]
preds: [5 2 2 2 0]
Epoch 7 - avg_train_loss: 1.4081  lr: 9.5441e-04  time: 50s
Epoch 7 - Score: 0.5573


EVAL: [8/9] Data 0.015 (0.069) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [8][0/32] Data 0.120 (0.120) Elapsed 0m 1s (remain 0m 43s) Loss: 1.5350(1.5350) Grad Norm: 9.5468  LR: 9.4016e-04  
Epoch: [8][31/32] Data 0.128 (0.128) Elapsed 0m 45s (remain 0m 0s) Loss: 1.5475(1.2585) Grad Norm: 10.4011  LR: 9.4016e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [2 7 2 3 0]
preds: [5 4 2 2 1]
Epoch 8 - avg_train_loss: 1.2585  lr: 9.4016e-04  time: 50s
Epoch 8 - Score: 0.5038


EVAL: [8/9] Data 0.015 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [9][0/32] Data 0.127 (0.127) Elapsed 0m 1s (remain 0m 43s) Loss: 1.2135(1.2135) Grad Norm: 11.7295  LR: 9.2418e-04  
Epoch: [9][31/32] Data 0.137 (0.128) Elapsed 0m 45s (remain 0m 0s) Loss: 0.6338(0.9879) Grad Norm: 11.4806  LR: 9.2418e-04  
EVAL: [0/9] Data 0.077 (0.077) Elapsed 0m 0s (remain 0m 4s) 


labels: [2 7 2 3 0]
preds: [5 2 1 4 3]
Epoch 9 - avg_train_loss: 0.9879  lr: 9.2418e-04  time: 50s
Epoch 9 - Score: 0.4656


EVAL: [8/9] Data 0.014 (0.069) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [10][0/32] Data 0.147 (0.147) Elapsed 0m 1s (remain 0m 45s) Loss: 1.2795(1.2795) Grad Norm: 12.8256  LR: 9.0654e-04  
Epoch: [10][31/32] Data 0.128 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 1.3584(1.0908) Grad Norm: 10.3836  LR: 9.0654e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [2 7 2 3 0]
preds: [5 2 2 4 3]
Epoch 10 - avg_train_loss: 1.0908  lr: 9.0654e-04  time: 50s
Epoch 10 - Score: 0.5954
Epoch 10 - Save Best Score: 0.5954 Model


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [11][0/32] Data 0.133 (0.133) Elapsed 0m 1s (remain 0m 44s) Loss: 0.4319(0.4319) Grad Norm: 9.3084  LR: 8.8730e-04  
Epoch: [11][31/32] Data 0.139 (0.127) Elapsed 0m 45s (remain 0m 0s) Loss: 1.5391(0.9525) Grad Norm: 8.7453  LR: 8.8730e-04  
EVAL: [0/9] Data 0.080 (0.080) Elapsed 0m 0s (remain 0m 4s) 


labels: [2 7 2 3 0]
preds: [9 4 2 4 6]
Epoch 11 - avg_train_loss: 0.9525  lr: 8.8730e-04  time: 50s
Epoch 11 - Score: 0.5038


EVAL: [8/9] Data 0.015 (0.069) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [12][0/32] Data 0.126 (0.126) Elapsed 0m 1s (remain 0m 43s) Loss: 1.4445(1.4445) Grad Norm: 10.0098  LR: 8.6655e-04  
Epoch: [12][31/32] Data 0.121 (0.129) Elapsed 0m 45s (remain 0m 0s) Loss: 1.2603(0.8096) Grad Norm: 8.5207  LR: 8.6655e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [2 7 2 3 0]
preds: [5 4 2 4 3]
Epoch 12 - avg_train_loss: 0.8096  lr: 8.6655e-04  time: 50s
Epoch 12 - Score: 0.6489
Epoch 12 - Save Best Score: 0.6489 Model


EVAL: [8/9] Data 0.014 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [13][0/32] Data 0.128 (0.128) Elapsed 0m 1s (remain 0m 44s) Loss: 1.8080(1.8080) Grad Norm: 9.2960  LR: 8.4436e-04  
Epoch: [13][31/32] Data 0.130 (0.129) Elapsed 0m 45s (remain 0m 0s) Loss: 1.0296(1.2409) Grad Norm: 4.7708  LR: 8.4436e-04  
EVAL: [0/9] Data 0.075 (0.075) Elapsed 0m 0s (remain 0m 4s) 


labels: [2 7 2 3 0]
preds: [ 5  4  2  4 12]
Epoch 13 - avg_train_loss: 1.2409  lr: 8.4436e-04  time: 50s
Epoch 13 - Score: 0.5191


EVAL: [8/9] Data 0.015 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [14][0/32] Data 0.129 (0.129) Elapsed 0m 1s (remain 0m 44s) Loss: 0.2907(0.2907) Grad Norm: 6.7841  LR: 8.2081e-04  
Epoch: [14][31/32] Data 0.120 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 1.6463(0.8650) Grad Norm: 7.0255  LR: 8.2081e-04  
EVAL: [0/9] Data 0.079 (0.079) Elapsed 0m 0s (remain 0m 4s) 


labels: [2 7 2 3 0]
preds: [5 4 2 0 5]
Epoch 14 - avg_train_loss: 0.8650  lr: 8.2081e-04  time: 50s
Epoch 14 - Score: 0.5878


EVAL: [8/9] Data 0.014 (0.069) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [15][0/32] Data 0.139 (0.139) Elapsed 0m 1s (remain 0m 44s) Loss: 1.3680(1.3680) Grad Norm: 6.5109  LR: 7.9601e-04  
Epoch: [15][31/32] Data 0.138 (0.128) Elapsed 0m 45s (remain 0m 0s) Loss: 1.6582(1.0130) Grad Norm: 8.7266  LR: 7.9601e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [2 7 2 3 0]
preds: [ 5  7  2  4 12]
Epoch 15 - avg_train_loss: 1.0130  lr: 7.9601e-04  time: 50s
Epoch 15 - Score: 0.5802


EVAL: [8/9] Data 0.014 (0.069) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [16][0/32] Data 0.138 (0.138) Elapsed 0m 1s (remain 0m 44s) Loss: 1.5564(1.5564) Grad Norm: 7.6294  LR: 7.7006e-04  
Epoch: [16][31/32] Data 0.132 (0.128) Elapsed 0m 45s (remain 0m 0s) Loss: 1.6097(1.0090) Grad Norm: 13.5053  LR: 7.7006e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [2 7 2 3 0]
preds: [5 2 2 5 5]
Epoch 16 - avg_train_loss: 1.0090  lr: 7.7006e-04  time: 50s
Epoch 16 - Score: 0.5496


EVAL: [8/9] Data 0.015 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [17][0/32] Data 0.133 (0.133) Elapsed 0m 1s (remain 0m 44s) Loss: 0.3002(0.3002) Grad Norm: 8.3688  LR: 7.4304e-04  
Epoch: [17][31/32] Data 0.122 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0509(0.7187) Grad Norm: 3.3389  LR: 7.4304e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [2 7 2 3 0]
preds: [5 3 2 5 2]
Epoch 17 - avg_train_loss: 0.7187  lr: 7.4304e-04  time: 50s
Epoch 17 - Score: 0.5420


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [18][0/32] Data 0.126 (0.126) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0547(0.0547) Grad Norm: 5.6321  LR: 7.1508e-04  
Epoch: [18][31/32] Data 0.132 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.3415(0.7615) Grad Norm: 34.8812  LR: 7.1508e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [2 7 2 3 0]
preds: [ 5  7  2  2 12]
Epoch 18 - avg_train_loss: 0.7615  lr: 7.1508e-04  time: 49s
Epoch 18 - Score: 0.5038


EVAL: [8/9] Data 0.015 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [19][0/32] Data 0.127 (0.127) Elapsed 0m 1s (remain 0m 43s) Loss: 1.2102(1.2102) Grad Norm: 8.8180  LR: 6.8627e-04  
Epoch: [19][31/32] Data 0.129 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.5037(0.8056) Grad Norm: 13.4124  LR: 6.8627e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [2 7 2 3 0]
preds: [5 4 2 4 3]
Epoch 19 - avg_train_loss: 0.8056  lr: 6.8627e-04  time: 50s
Epoch 19 - Score: 0.4962


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [20][0/32] Data 0.113 (0.113) Elapsed 0m 1s (remain 0m 44s) Loss: 0.1391(0.1391) Grad Norm: 5.8710  LR: 6.5674e-04  
Epoch: [20][31/32] Data 0.125 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0548(0.6706) Grad Norm: 2.3051  LR: 6.5674e-04  
EVAL: [0/9] Data 0.075 (0.075) Elapsed 0m 0s (remain 0m 4s) 


labels: [2 7 2 3 0]
preds: [5 7 2 4 9]
Epoch 20 - avg_train_loss: 0.6706  lr: 6.5674e-04  time: 50s
Epoch 20 - Score: 0.5573


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [21][0/32] Data 0.139 (0.139) Elapsed 0m 1s (remain 0m 44s) Loss: 0.6363(0.6363) Grad Norm: 15.4773  LR: 6.2661e-04  
Epoch: [21][31/32] Data 0.138 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0381(0.6902) Grad Norm: 5.0259  LR: 6.2661e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [2 7 2 3 0]
preds: [5 2 2 9 9]
Epoch 21 - avg_train_loss: 0.6902  lr: 6.2661e-04  time: 50s
Epoch 21 - Score: 0.5420


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [22][0/32] Data 0.139 (0.139) Elapsed 0m 1s (remain 0m 44s) Loss: 1.3129(1.3129) Grad Norm: 12.1007  LR: 5.9598e-04  
Epoch: [22][31/32] Data 0.120 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 1.3218(0.7237) Grad Norm: 6.2703  LR: 5.9598e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [2 7 2 3 0]
preds: [5 7 2 2 0]
Epoch 22 - avg_train_loss: 0.7237  lr: 5.9598e-04  time: 50s
Epoch 22 - Score: 0.5420


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [23][0/32] Data 0.137 (0.137) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0851(0.0851) Grad Norm: 4.6964  LR: 5.6498e-04  
Epoch: [23][31/32] Data 0.123 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0305(0.5138) Grad Norm: 1.9127  LR: 5.6498e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [2 7 2 3 0]
preds: [5 2 2 5 6]
Epoch 23 - avg_train_loss: 0.5138  lr: 5.6498e-04  time: 49s
Epoch 23 - Score: 0.5725


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [24][0/32] Data 0.126 (0.126) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0049(0.0049) Grad Norm: 0.1796  LR: 5.3373e-04  
Epoch: [24][31/32] Data 0.118 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0213(0.5423) Grad Norm: 1.6018  LR: 5.3373e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [2 7 2 3 0]
preds: [5 7 2 4 9]
Epoch 24 - avg_train_loss: 0.5423  lr: 5.3373e-04  time: 50s
Epoch 24 - Score: 0.5649


EVAL: [8/9] Data 0.015 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [25][0/32] Data 0.130 (0.130) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0479(0.0479) Grad Norm: 3.3853  LR: 5.0236e-04  
Epoch: [25][31/32] Data 0.122 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 1.1608(0.5199) Grad Norm: 7.1703  LR: 5.0236e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [2 7 2 3 0]
preds: [5 2 2 4 6]
Epoch 25 - avg_train_loss: 0.5199  lr: 5.0236e-04  time: 50s
Epoch 25 - Score: 0.5802


EVAL: [8/9] Data 0.014 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [26][0/32] Data 0.137 (0.137) Elapsed 0m 1s (remain 0m 44s) Loss: 0.2665(0.2665) Grad Norm: 6.3130  LR: 4.7099e-04  
Epoch: [26][31/32] Data 0.125 (0.127) Elapsed 0m 45s (remain 0m 0s) Loss: 1.2611(0.7440) Grad Norm: 7.7615  LR: 4.7099e-04  
EVAL: [0/9] Data 0.082 (0.082) Elapsed 0m 0s (remain 0m 4s) 


labels: [2 7 2 3 0]
preds: [5 2 2 2 6]
Epoch 26 - avg_train_loss: 0.7440  lr: 4.7099e-04  time: 50s
Epoch 26 - Score: 0.5878


EVAL: [8/9] Data 0.014 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [27][0/32] Data 0.134 (0.134) Elapsed 0m 1s (remain 0m 44s) Loss: 1.1312(1.1312) Grad Norm: 8.5469  LR: 4.3974e-04  
Epoch: [27][31/32] Data 0.130 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 1.6031(0.6080) Grad Norm: 8.5444  LR: 4.3974e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [2 7 2 3 0]
preds: [5 2 2 4 5]
Epoch 27 - avg_train_loss: 0.6080  lr: 4.3974e-04  time: 50s
Epoch 27 - Score: 0.5649


EVAL: [8/9] Data 0.014 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [28][0/32] Data 0.118 (0.118) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0115(0.0115) Grad Norm: 0.6288  LR: 4.0874e-04  
Epoch: [28][31/32] Data 0.122 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0430(0.5070) Grad Norm: 3.0937  LR: 4.0874e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [2 7 2 3 0]
preds: [2 4 2 2 6]
Epoch 28 - avg_train_loss: 0.5070  lr: 4.0874e-04  time: 49s
Epoch 28 - Score: 0.5496


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [29][0/32] Data 0.135 (0.135) Elapsed 0m 1s (remain 0m 44s) Loss: 1.2442(1.2442) Grad Norm: 5.5553  LR: 3.7811e-04  
Epoch: [29][31/32] Data 0.121 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.3350(0.5570) Grad Norm: 12.8932  LR: 3.7811e-04  
EVAL: [0/9] Data 0.076 (0.076) Elapsed 0m 0s (remain 0m 4s) 


labels: [2 7 2 3 0]
preds: [5 4 2 4 9]
Epoch 29 - avg_train_loss: 0.5570  lr: 3.7811e-04  time: 50s
Epoch 29 - Score: 0.5649


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [30][0/32] Data 0.139 (0.139) Elapsed 0m 1s (remain 0m 44s) Loss: 1.0956(1.0956) Grad Norm: 6.6436  LR: 3.4797e-04  
Epoch: [30][31/32] Data 0.125 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 1.0911(0.6190) Grad Norm: 4.7391  LR: 3.4797e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [2 7 2 3 0]
preds: [5 4 2 4 6]
Epoch 30 - avg_train_loss: 0.6190  lr: 3.4797e-04  time: 50s
Epoch 30 - Score: 0.5573


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [31][0/32] Data 0.140 (0.140) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0088(0.0088) Grad Norm: 0.3793  LR: 3.1843e-04  
Epoch: [31][31/32] Data 0.122 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.5928(0.4945) Grad Norm: 3.0284  LR: 3.1843e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [2 7 2 3 0]
preds: [5 7 2 2 6]
Epoch 31 - avg_train_loss: 0.4945  lr: 3.1843e-04  time: 50s
Epoch 31 - Score: 0.6183


EVAL: [8/9] Data 0.015 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [32][0/32] Data 0.120 (0.120) Elapsed 0m 1s (remain 0m 43s) Loss: 0.8919(0.8919) Grad Norm: 5.5900  LR: 2.8962e-04  
Epoch: [32][31/32] Data 0.120 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 0.2087(0.5658) Grad Norm: 16.8442  LR: 2.8962e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [2 7 2 3 0]
preds: [5 4 2 5 5]
Epoch 32 - avg_train_loss: 0.5658  lr: 2.8962e-04  time: 50s
Epoch 32 - Score: 0.5878


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [33][0/32] Data 0.116 (0.116) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0032(0.0032) Grad Norm: 0.2928  LR: 2.6165e-04  
Epoch: [33][31/32] Data 0.126 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.9387(0.3879) Grad Norm: 5.0451  LR: 2.6165e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [2 7 2 3 0]
preds: [5 7 2 4 6]
Epoch 33 - avg_train_loss: 0.3879  lr: 2.6165e-04  time: 49s
Epoch 33 - Score: 0.5802


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [34][0/32] Data 0.121 (0.121) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0020(0.0020) Grad Norm: 0.0585  LR: 2.3463e-04  
Epoch: [34][31/32] Data 0.122 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0198(0.4814) Grad Norm: 1.4962  LR: 2.3463e-04  
EVAL: [0/9] Data 0.075 (0.075) Elapsed 0m 0s (remain 0m 4s) 


labels: [2 7 2 3 0]
preds: [5 2 2 4 6]
Epoch 34 - avg_train_loss: 0.4814  lr: 2.3463e-04  time: 49s
Epoch 34 - Score: 0.5420


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [35][0/32] Data 0.127 (0.127) Elapsed 0m 1s (remain 0m 44s) Loss: 0.9395(0.9395) Grad Norm: 6.2672  LR: 2.0866e-04  
Epoch: [35][31/32] Data 0.122 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0041(0.7489) Grad Norm: 0.1421  LR: 2.0866e-04  
EVAL: [0/9] Data 0.076 (0.076) Elapsed 0m 0s (remain 0m 4s) 


labels: [2 7 2 3 0]
preds: [5 7 2 4 6]
Epoch 35 - avg_train_loss: 0.7489  lr: 2.0866e-04  time: 50s
Epoch 35 - Score: 0.5878


EVAL: [8/9] Data 0.015 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [36][0/32] Data 0.122 (0.122) Elapsed 0m 1s (remain 0m 43s) Loss: 1.2584(1.2584) Grad Norm: 7.5473  LR: 1.8385e-04  
Epoch: [36][31/32] Data 0.116 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.9657(0.4982) Grad Norm: 4.4686  LR: 1.8385e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [2 7 2 3 0]
preds: [5 7 2 2 6]
Epoch 36 - avg_train_loss: 0.4982  lr: 1.8385e-04  time: 49s
Epoch 36 - Score: 0.6260


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [37][0/32] Data 0.132 (0.132) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0308(0.0308) Grad Norm: 1.7628  LR: 1.6030e-04  
Epoch: [37][31/32] Data 0.123 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 0.6686(0.3195) Grad Norm: 3.8613  LR: 1.6030e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [2 7 2 3 0]
preds: [5 7 2 4 6]
Epoch 37 - avg_train_loss: 0.3195  lr: 1.6030e-04  time: 50s
Epoch 37 - Score: 0.6260


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [38][0/32] Data 0.135 (0.135) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0161(0.0161) Grad Norm: 1.1877  LR: 1.3809e-04  
Epoch: [38][31/32] Data 0.127 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 0.6750(0.3166) Grad Norm: 3.4745  LR: 1.3809e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [2 7 2 3 0]
preds: [5 7 2 4 6]
Epoch 38 - avg_train_loss: 0.3166  lr: 1.3809e-04  time: 50s
Epoch 38 - Score: 0.6260


EVAL: [8/9] Data 0.015 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [39][0/32] Data 0.140 (0.140) Elapsed 0m 1s (remain 0m 44s) Loss: 0.8635(0.8635) Grad Norm: 5.3249  LR: 1.1732e-04  
Epoch: [39][31/32] Data 0.132 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0730(0.4280) Grad Norm: 9.5798  LR: 1.1732e-04  
EVAL: [0/9] Data 0.075 (0.075) Elapsed 0m 0s (remain 0m 4s) 


labels: [2 7 2 3 0]
preds: [5 4 2 4 6]
Epoch 39 - avg_train_loss: 0.4280  lr: 1.1732e-04  time: 50s
Epoch 39 - Score: 0.6031


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [40][0/32] Data 0.128 (0.128) Elapsed 0m 1s (remain 0m 44s) Loss: 0.1098(0.1098) Grad Norm: 16.2399  LR: 9.8058e-05  
Epoch: [40][31/32] Data 0.130 (0.127) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0027(0.3443) Grad Norm: 0.1377  LR: 9.8058e-05  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [2 7 2 3 0]
preds: [5 7 2 4 6]
Epoch 40 - avg_train_loss: 0.3443  lr: 9.8058e-05  time: 50s
Epoch 40 - Score: 0.5954


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [41][0/32] Data 0.127 (0.127) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0007(0.0007) Grad Norm: 0.0207  LR: 8.0390e-05  
Epoch: [41][31/32] Data 0.127 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.7621(0.3831) Grad Norm: 7.0003  LR: 8.0390e-05  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [2 7 2 3 0]
preds: [5 7 2 2 6]
Epoch 41 - avg_train_loss: 0.3831  lr: 8.0390e-05  time: 49s
Epoch 41 - Score: 0.6183


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [42][0/32] Data 0.134 (0.134) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0021(0.0021) Grad Norm: 0.0980  LR: 6.4381e-05  
Epoch: [42][31/32] Data 0.115 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.9459(0.3784) Grad Norm: 4.9104  LR: 6.4381e-05  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [2 7 2 3 0]
preds: [5 4 2 2 6]
Epoch 42 - avg_train_loss: 0.3784  lr: 6.4381e-05  time: 49s
Epoch 42 - Score: 0.6183


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [43][0/32] Data 0.123 (0.123) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0009(0.0009) Grad Norm: 0.0690  LR: 5.0093e-05  
Epoch: [43][31/32] Data 0.124 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0003(0.2800) Grad Norm: 0.0107  LR: 5.0093e-05  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [2 7 2 3 0]
preds: [5 4 2 2 6]
Epoch 43 - avg_train_loss: 0.2800  lr: 5.0093e-05  time: 49s
Epoch 43 - Score: 0.6107


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [44][0/32] Data 0.123 (0.123) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0026(0.0026) Grad Norm: 0.1753  LR: 3.7578e-05  
Epoch: [44][31/32] Data 0.125 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0009(0.3090) Grad Norm: 0.0278  LR: 3.7578e-05  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [2 7 2 3 0]
preds: [5 4 2 2 6]
Epoch 44 - avg_train_loss: 0.3090  lr: 3.7578e-05  time: 49s
Epoch 44 - Score: 0.6107


EVAL: [8/9] Data 0.015 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [45][0/32] Data 0.130 (0.130) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0008(0.0008) Grad Norm: 0.0426  LR: 2.6881e-05  
Epoch: [45][31/32] Data 0.128 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.9232(0.4345) Grad Norm: 6.0328  LR: 2.6881e-05  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [2 7 2 3 0]
preds: [5 7 2 2 6]
Epoch 45 - avg_train_loss: 0.4345  lr: 2.6881e-05  time: 49s
Epoch 45 - Score: 0.6412


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [46][0/32] Data 0.134 (0.134) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0006(0.0006) Grad Norm: 0.0345  LR: 1.8039e-05  
Epoch: [46][31/32] Data 0.116 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0022(0.0053) Grad Norm: 0.3020  LR: 1.8039e-05  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [2 7 2 3 0]
preds: [5 7 2 4 6]
Epoch 46 - avg_train_loss: 0.0053  lr: 1.8039e-05  time: 49s
Epoch 46 - Score: 0.6412


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [47][0/32] Data 0.133 (0.133) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0026(0.0026) Grad Norm: 0.1170  LR: 1.1073e-05  
Epoch: [47][31/32] Data 0.116 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0003(0.0032) Grad Norm: 0.0078  LR: 1.1073e-05  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [2 7 2 3 0]
preds: [5 7 2 2 6]
Epoch 47 - avg_train_loss: 0.0032  lr: 1.1073e-05  time: 49s
Epoch 47 - Score: 0.6412


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [48][0/32] Data 0.132 (0.132) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0159(0.0159) Grad Norm: 0.8751  LR: 5.9882e-06  
Epoch: [48][31/32] Data 0.116 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0002(0.0014) Grad Norm: 0.0143  LR: 5.9882e-06  
EVAL: [0/9] Data 0.079 (0.079) Elapsed 0m 0s (remain 0m 4s) 


labels: [2 7 2 3 0]
preds: [5 7 2 2 6]
Epoch 48 - avg_train_loss: 0.0014  lr: 5.9882e-06  time: 49s
Epoch 48 - Score: 0.6412


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [49][0/32] Data 0.133 (0.133) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0011(0.0011) Grad Norm: 0.1466  LR: 2.7534e-06  
Epoch: [49][31/32] Data 0.115 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0005(0.0018) Grad Norm: 0.0309  LR: 2.7534e-06  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [2 7 2 3 0]
preds: [5 7 2 2 6]
Epoch 49 - avg_train_loss: 0.0018  lr: 2.7534e-06  time: 49s
Epoch 49 - Score: 0.6412


EVAL: [8/9] Data 0.013 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [50][0/32] Data 0.135 (0.135) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0008(0.0008) Grad Norm: 0.0308  LR: 1.2467e-06  
Epoch: [50][31/32] Data 0.123 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0005(0.0020) Grad Norm: 0.0507  LR: 1.2467e-06  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [2 7 2 3 0]
preds: [5 7 2 2 6]
Epoch 50 - avg_train_loss: 0.0020  lr: 1.2467e-06  time: 49s
Epoch 50 - Score: 0.6412


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [1][0/32] Data 0.128 (0.128) Elapsed 0m 1s (remain 0m 42s) Loss: 2.7038(2.7038) Grad Norm: inf  LR: 1.0000e-03  
Epoch: [1][31/32] Data 0.113 (0.120) Elapsed 0m 44s (remain 0m 0s) Loss: 2.3162(2.5735) Grad Norm: 10.0545  LR: 1.0000e-03  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 8  3  6  4 10]
preds: [2 0 5 0 2]
Epoch 1 - avg_train_loss: 2.5735  lr: 1.0000e-03  time: 49s
Epoch 1 - Score: 0.2769
Epoch 1 - Save Best Score: 0.2769 Model


EVAL: [8/9] Data 0.010 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [2][0/32] Data 0.144 (0.144) Elapsed 0m 1s (remain 0m 44s) Loss: 2.3054(2.3054) Grad Norm: 11.7565  LR: 9.9803e-04  
Epoch: [2][31/32] Data 0.130 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 2.5717(2.3109) Grad Norm: 10.0858  LR: 9.9803e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 8  3  6  4 10]
preds: [8 6 2 2 2]
Epoch 2 - avg_train_loss: 2.3109  lr: 9.9803e-04  time: 49s
Epoch 2 - Score: 0.3615
Epoch 2 - Save Best Score: 0.3615 Model


EVAL: [8/9] Data 0.010 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [3][0/32] Data 0.119 (0.119) Elapsed 0m 1s (remain 0m 44s) Loss: 2.0502(2.0502) Grad Norm: 9.9038  LR: 9.9312e-04  
Epoch: [3][31/32] Data 0.124 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 2.2669(2.0459) Grad Norm: 11.0125  LR: 9.9312e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 8  3  6  4 10]
preds: [ 8  3  5 10  2]
Epoch 3 - avg_train_loss: 2.0459  lr: 9.9312e-04  time: 49s
Epoch 3 - Score: 0.5000
Epoch 3 - Save Best Score: 0.5000 Model


EVAL: [8/9] Data 0.011 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [4][0/32] Data 0.131 (0.131) Elapsed 0m 1s (remain 0m 44s) Loss: 1.6748(1.6748) Grad Norm: 12.5332  LR: 9.8627e-04  
Epoch: [4][31/32] Data 0.106 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 1.1643(1.7354) Grad Norm: 10.1917  LR: 9.8627e-04  
EVAL: [0/9] Data 0.075 (0.075) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 8  3  6  4 10]
preds: [8 1 5 1 4]
Epoch 4 - avg_train_loss: 1.7354  lr: 9.8627e-04  time: 50s
Epoch 4 - Score: 0.5308
Epoch 4 - Save Best Score: 0.5308 Model


EVAL: [8/9] Data 0.010 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [5][0/32] Data 0.138 (0.138) Elapsed 0m 1s (remain 0m 45s) Loss: 0.9866(0.9866) Grad Norm: 17.0452  LR: 9.7751e-04  
Epoch: [5][31/32] Data 0.122 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 1.6859(1.5499) Grad Norm: 8.4455  LR: 9.7751e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 8  3  6  4 10]
preds: [8 3 5 2 2]
Epoch 5 - avg_train_loss: 1.5499  lr: 9.7751e-04  time: 50s
Epoch 5 - Score: 0.5615
Epoch 5 - Save Best Score: 0.5615 Model


EVAL: [8/9] Data 0.010 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [6][0/32] Data 0.123 (0.123) Elapsed 0m 1s (remain 0m 44s) Loss: 0.6604(0.6604) Grad Norm: 13.0538  LR: 9.6688e-04  
Epoch: [6][31/32] Data 0.109 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 2.1378(1.4156) Grad Norm: 17.4707  LR: 9.6688e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 8  3  6  4 10]
preds: [8 3 6 1 3]
Epoch 6 - avg_train_loss: 1.4156  lr: 9.6688e-04  time: 50s
Epoch 6 - Score: 0.6231
Epoch 6 - Save Best Score: 0.6231 Model


EVAL: [8/9] Data 0.010 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [7][0/32] Data 0.139 (0.139) Elapsed 0m 1s (remain 0m 44s) Loss: 2.0027(2.0027) Grad Norm: 20.6727  LR: 9.5441e-04  
Epoch: [7][31/32] Data 0.126 (0.127) Elapsed 0m 45s (remain 0m 0s) Loss: 2.0447(1.4770) Grad Norm: 7.9844  LR: 9.5441e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 8  3  6  4 10]
preds: [8 3 6 3 3]
Epoch 7 - avg_train_loss: 1.4770  lr: 9.5441e-04  time: 50s
Epoch 7 - Score: 0.5154


EVAL: [8/9] Data 0.010 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [8][0/32] Data 0.133 (0.133) Elapsed 0m 1s (remain 0m 43s) Loss: 0.6804(0.6804) Grad Norm: 11.5668  LR: 9.4016e-04  
Epoch: [8][31/32] Data 0.138 (0.127) Elapsed 0m 45s (remain 0m 0s) Loss: 2.0075(1.1467) Grad Norm: 11.6371  LR: 9.4016e-04  
EVAL: [0/9] Data 0.075 (0.075) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 8  3  6  4 10]
preds: [8 3 5 1 3]
Epoch 8 - avg_train_loss: 1.1467  lr: 9.4016e-04  time: 50s
Epoch 8 - Score: 0.5846


EVAL: [8/9] Data 0.010 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [9][0/32] Data 0.128 (0.128) Elapsed 0m 1s (remain 0m 43s) Loss: 0.1305(0.1305) Grad Norm: 4.6887  LR: 9.2418e-04  
Epoch: [9][31/32] Data 0.112 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 0.5352(0.9745) Grad Norm: 10.4831  LR: 9.2418e-04  
EVAL: [0/9] Data 0.075 (0.075) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 8  3  6  4 10]
preds: [8 3 6 1 2]
Epoch 9 - avg_train_loss: 0.9745  lr: 9.2418e-04  time: 50s
Epoch 9 - Score: 0.5846


EVAL: [8/9] Data 0.010 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [10][0/32] Data 0.140 (0.140) Elapsed 0m 1s (remain 0m 44s) Loss: 1.5757(1.5757) Grad Norm: 7.2326  LR: 9.0654e-04  
Epoch: [10][31/32] Data 0.133 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.6185(1.0809) Grad Norm: 11.1507  LR: 9.0654e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 8  3  6  4 10]
preds: [8 3 5 7 2]
Epoch 10 - avg_train_loss: 1.0809  lr: 9.0654e-04  time: 50s
Epoch 10 - Score: 0.6231


EVAL: [8/9] Data 0.010 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [11][0/32] Data 0.137 (0.137) Elapsed 0m 1s (remain 0m 44s) Loss: 0.4601(0.4601) Grad Norm: 7.2861  LR: 8.8730e-04  
Epoch: [11][31/32] Data 0.130 (0.127) Elapsed 0m 45s (remain 0m 0s) Loss: 0.5325(0.9044) Grad Norm: 7.9541  LR: 8.8730e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 8  3  6  4 10]
preds: [ 8  3  2 11  4]
Epoch 11 - avg_train_loss: 0.9044  lr: 8.8730e-04  time: 50s
Epoch 11 - Score: 0.5846


EVAL: [8/9] Data 0.012 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [12][0/32] Data 0.130 (0.130) Elapsed 0m 1s (remain 0m 44s) Loss: 0.2732(0.2732) Grad Norm: 10.0734  LR: 8.6655e-04  
Epoch: [12][31/32] Data 0.127 (0.127) Elapsed 0m 45s (remain 0m 0s) Loss: 0.5714(1.0189) Grad Norm: 9.1513  LR: 8.6655e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 8  3  6  4 10]
preds: [8 3 2 7 2]
Epoch 12 - avg_train_loss: 1.0189  lr: 8.6655e-04  time: 50s
Epoch 12 - Score: 0.6308
Epoch 12 - Save Best Score: 0.6308 Model


EVAL: [8/9] Data 0.010 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [13][0/32] Data 0.114 (0.114) Elapsed 0m 1s (remain 0m 43s) Loss: 0.9544(0.9544) Grad Norm: 4.5951  LR: 8.4436e-04  
Epoch: [13][31/32] Data 0.126 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 1.4358(0.9544) Grad Norm: 9.2673  LR: 8.4436e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 8  3  6  4 10]
preds: [8 1 6 0 2]
Epoch 13 - avg_train_loss: 0.9544  lr: 8.4436e-04  time: 50s
Epoch 13 - Score: 0.6231


EVAL: [8/9] Data 0.010 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [14][0/32] Data 0.142 (0.142) Elapsed 0m 1s (remain 0m 44s) Loss: 1.3147(1.3147) Grad Norm: 8.1365  LR: 8.2081e-04  
Epoch: [14][31/32] Data 0.125 (0.127) Elapsed 0m 45s (remain 0m 0s) Loss: 1.8700(0.8291) Grad Norm: 14.5380  LR: 8.2081e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 8  3  6  4 10]
preds: [8 1 0 1 1]
Epoch 14 - avg_train_loss: 0.8291  lr: 8.2081e-04  time: 50s
Epoch 14 - Score: 0.5462


EVAL: [8/9] Data 0.010 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [15][0/32] Data 0.136 (0.136) Elapsed 0m 1s (remain 0m 44s) Loss: 0.4845(0.4845) Grad Norm: 16.0136  LR: 7.9601e-04  
Epoch: [15][31/32] Data 0.114 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 1.3028(0.8254) Grad Norm: 8.3140  LR: 7.9601e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 8  3  6  4 10]
preds: [8 3 6 1 2]
Epoch 15 - avg_train_loss: 0.8254  lr: 7.9601e-04  time: 50s
Epoch 15 - Score: 0.6385
Epoch 15 - Save Best Score: 0.6385 Model


EVAL: [8/9] Data 0.010 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [16][0/32] Data 0.136 (0.136) Elapsed 0m 1s (remain 0m 44s) Loss: 0.1243(0.1243) Grad Norm: 5.7971  LR: 7.7006e-04  
Epoch: [16][31/32] Data 0.121 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 1.5846(0.9586) Grad Norm: 9.2859  LR: 7.7006e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 8  3  6  4 10]
preds: [8 1 0 0 2]
Epoch 16 - avg_train_loss: 0.9586  lr: 7.7006e-04  time: 50s
Epoch 16 - Score: 0.5538


EVAL: [8/9] Data 0.010 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [17][0/32] Data 0.145 (0.145) Elapsed 0m 1s (remain 0m 44s) Loss: 0.2234(0.2234) Grad Norm: 9.9767  LR: 7.4304e-04  
Epoch: [17][31/32] Data 0.124 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 1.2382(0.8282) Grad Norm: 5.9819  LR: 7.4304e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 8  3  6  4 10]
preds: [8 3 6 0 1]
Epoch 17 - avg_train_loss: 0.8282  lr: 7.4304e-04  time: 50s
Epoch 17 - Score: 0.5615


EVAL: [8/9] Data 0.010 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [18][0/32] Data 0.127 (0.127) Elapsed 0m 1s (remain 0m 43s) Loss: 1.2863(1.2863) Grad Norm: 5.9847  LR: 7.1508e-04  
Epoch: [18][31/32] Data 0.116 (0.127) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0353(0.8100) Grad Norm: 1.3901  LR: 7.1508e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 8  3  6  4 10]
preds: [8 3 2 2 2]
Epoch 18 - avg_train_loss: 0.8100  lr: 7.1508e-04  time: 50s
Epoch 18 - Score: 0.6077


EVAL: [8/9] Data 0.010 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [19][0/32] Data 0.129 (0.129) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0638(0.0638) Grad Norm: 3.9864  LR: 6.8627e-04  
Epoch: [19][31/32] Data 0.122 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 1.5217(0.8074) Grad Norm: 7.1821  LR: 6.8627e-04  
EVAL: [0/9] Data 0.075 (0.075) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 8  3  6  4 10]
preds: [8 3 6 0 2]
Epoch 19 - avg_train_loss: 0.8074  lr: 6.8627e-04  time: 50s
Epoch 19 - Score: 0.6154


EVAL: [8/9] Data 0.009 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [20][0/32] Data 0.138 (0.138) Elapsed 0m 1s (remain 0m 44s) Loss: 0.1517(0.1517) Grad Norm: 7.8579  LR: 6.5674e-04  
Epoch: [20][31/32] Data 0.118 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0075(0.5885) Grad Norm: 0.4818  LR: 6.5674e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 8  3  6  4 10]
preds: [8 3 6 2 2]
Epoch 20 - avg_train_loss: 0.5885  lr: 6.5674e-04  time: 50s
Epoch 20 - Score: 0.6462
Epoch 20 - Save Best Score: 0.6462 Model


EVAL: [8/9] Data 0.010 (0.069) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [21][0/32] Data 0.143 (0.143) Elapsed 0m 1s (remain 0m 45s) Loss: 1.3059(1.3059) Grad Norm: 9.0062  LR: 6.2661e-04  
Epoch: [21][31/32] Data 0.132 (0.127) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0721(0.6411) Grad Norm: 4.7456  LR: 6.2661e-04  
EVAL: [0/9] Data 0.078 (0.078) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 8  3  6  4 10]
preds: [8 3 6 4 2]
Epoch 21 - avg_train_loss: 0.6411  lr: 6.2661e-04  time: 50s
Epoch 21 - Score: 0.5846


EVAL: [8/9] Data 0.010 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [22][0/32] Data 0.127 (0.127) Elapsed 0m 1s (remain 0m 43s) Loss: 1.2561(1.2561) Grad Norm: 9.2012  LR: 5.9598e-04  
Epoch: [22][31/32] Data 0.124 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0396(0.7708) Grad Norm: 2.2190  LR: 5.9598e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 8  3  6  4 10]
preds: [8 3 6 0 2]
Epoch 22 - avg_train_loss: 0.7708  lr: 5.9598e-04  time: 49s
Epoch 22 - Score: 0.6077


EVAL: [8/9] Data 0.010 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [23][0/32] Data 0.119 (0.119) Elapsed 0m 1s (remain 0m 43s) Loss: 1.0113(1.0113) Grad Norm: 6.0918  LR: 5.6498e-04  
Epoch: [23][31/32] Data 0.123 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0337(0.9548) Grad Norm: 1.5653  LR: 5.6498e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 8  3  6  4 10]
preds: [ 8  3  6  0 10]
Epoch 23 - avg_train_loss: 0.9548  lr: 5.6498e-04  time: 49s
Epoch 23 - Score: 0.6462


EVAL: [8/9] Data 0.009 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [24][0/32] Data 0.126 (0.126) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0999(0.0999) Grad Norm: 5.3145  LR: 5.3373e-04  
Epoch: [24][31/32] Data 0.136 (0.128) Elapsed 0m 45s (remain 0m 0s) Loss: 0.4040(0.5830) Grad Norm: 9.0024  LR: 5.3373e-04  
EVAL: [0/9] Data 0.075 (0.075) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 8  3  6  4 10]
preds: [8 3 6 0 2]
Epoch 24 - avg_train_loss: 0.5830  lr: 5.3373e-04  time: 50s
Epoch 24 - Score: 0.5769


EVAL: [8/9] Data 0.010 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [25][0/32] Data 0.120 (0.120) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0037(0.0037) Grad Norm: 0.3090  LR: 5.0236e-04  
Epoch: [25][31/32] Data 0.116 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0249(0.5654) Grad Norm: 3.1061  LR: 5.0236e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 8  3  6  4 10]
preds: [8 3 2 0 3]
Epoch 25 - avg_train_loss: 0.5654  lr: 5.0236e-04  time: 50s
Epoch 25 - Score: 0.6000


EVAL: [8/9] Data 0.010 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [26][0/32] Data 0.131 (0.131) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0471(0.0471) Grad Norm: 2.4730  LR: 4.7099e-04  
Epoch: [26][31/32] Data 0.116 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0125(0.5235) Grad Norm: 0.5890  LR: 4.7099e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 8  3  6  4 10]
preds: [ 8  3  5  4 10]
Epoch 26 - avg_train_loss: 0.5235  lr: 4.7099e-04  time: 49s
Epoch 26 - Score: 0.6615
Epoch 26 - Save Best Score: 0.6615 Model


EVAL: [8/9] Data 0.009 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [27][0/32] Data 0.155 (0.155) Elapsed 0m 1s (remain 0m 45s) Loss: 0.0619(0.0619) Grad Norm: 9.9781  LR: 4.3974e-04  
Epoch: [27][31/32] Data 0.118 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 1.1631(0.4939) Grad Norm: 6.5888  LR: 4.3974e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 8  3  6  4 10]
preds: [8 3 5 9 2]
Epoch 27 - avg_train_loss: 0.4939  lr: 4.3974e-04  time: 50s
Epoch 27 - Score: 0.6154


EVAL: [8/9] Data 0.010 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [28][0/32] Data 0.135 (0.135) Elapsed 0m 1s (remain 0m 44s) Loss: 1.3149(1.3149) Grad Norm: 6.2510  LR: 4.0874e-04  
Epoch: [28][31/32] Data 0.133 (0.127) Elapsed 0m 45s (remain 0m 0s) Loss: 1.1203(0.5234) Grad Norm: 7.0402  LR: 4.0874e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 8  3  6  4 10]
preds: [8 3 0 0 4]
Epoch 28 - avg_train_loss: 0.5234  lr: 4.0874e-04  time: 50s
Epoch 28 - Score: 0.5923


EVAL: [8/9] Data 0.010 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [29][0/32] Data 0.136 (0.136) Elapsed 0m 1s (remain 0m 44s) Loss: 1.2503(1.2503) Grad Norm: 10.3929  LR: 3.7811e-04  
Epoch: [29][31/32] Data 0.124 (0.128) Elapsed 0m 45s (remain 0m 0s) Loss: 1.1858(0.4713) Grad Norm: 6.0663  LR: 3.7811e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 8  3  6  4 10]
preds: [8 3 2 0 2]
Epoch 29 - avg_train_loss: 0.4713  lr: 3.7811e-04  time: 50s
Epoch 29 - Score: 0.5923


EVAL: [8/9] Data 0.009 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [30][0/32] Data 0.125 (0.125) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0052(0.0052) Grad Norm: 0.3377  LR: 3.4797e-04  
Epoch: [30][31/32] Data 0.141 (0.128) Elapsed 0m 45s (remain 0m 0s) Loss: 1.0201(0.6243) Grad Norm: 4.2594  LR: 3.4797e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 8  3  6  4 10]
preds: [ 8  3  0  0 10]
Epoch 30 - avg_train_loss: 0.6243  lr: 3.4797e-04  time: 50s
Epoch 30 - Score: 0.6462


EVAL: [8/9] Data 0.010 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [31][0/32] Data 0.130 (0.130) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0113(0.0113) Grad Norm: 1.2194  LR: 3.1843e-04  
Epoch: [31][31/32] Data 0.114 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 0.8886(0.6322) Grad Norm: 5.7374  LR: 3.1843e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 8  3  6  4 10]
preds: [8 3 0 0 5]
Epoch 31 - avg_train_loss: 0.6322  lr: 3.1843e-04  time: 50s
Epoch 31 - Score: 0.6462


EVAL: [8/9] Data 0.010 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [32][0/32] Data 0.129 (0.129) Elapsed 0m 1s (remain 0m 43s) Loss: 0.9744(0.9744) Grad Norm: 12.1964  LR: 2.8962e-04  
Epoch: [32][31/32] Data 0.115 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0475(0.4022) Grad Norm: 3.5234  LR: 2.8962e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 8  3  6  4 10]
preds: [8 3 5 0 2]
Epoch 32 - avg_train_loss: 0.4022  lr: 2.8962e-04  time: 49s
Epoch 32 - Score: 0.6538


EVAL: [8/9] Data 0.010 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [33][0/32] Data 0.126 (0.126) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0030(0.0030) Grad Norm: 0.1378  LR: 2.6165e-04  
Epoch: [33][31/32] Data 0.122 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0446(0.4465) Grad Norm: 8.4232  LR: 2.6165e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 8  3  6  4 10]
preds: [8 3 0 0 2]
Epoch 33 - avg_train_loss: 0.4465  lr: 2.6165e-04  time: 49s
Epoch 33 - Score: 0.6538


EVAL: [8/9] Data 0.010 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [34][0/32] Data 0.133 (0.133) Elapsed 0m 1s (remain 0m 43s) Loss: 0.7434(0.7434) Grad Norm: 4.7065  LR: 2.3463e-04  
Epoch: [34][31/32] Data 0.120 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.7976(0.5176) Grad Norm: 4.1529  LR: 2.3463e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 8  3  6  4 10]
preds: [8 3 0 0 3]
Epoch 34 - avg_train_loss: 0.5176  lr: 2.3463e-04  time: 49s
Epoch 34 - Score: 0.6308


EVAL: [8/9] Data 0.010 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [35][0/32] Data 0.121 (0.121) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0014(0.0014) Grad Norm: 0.0700  LR: 2.0866e-04  
Epoch: [35][31/32] Data 0.131 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0061(0.3374) Grad Norm: 0.5646  LR: 2.0866e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 8  3  6  4 10]
preds: [8 3 0 0 2]
Epoch 35 - avg_train_loss: 0.3374  lr: 2.0866e-04  time: 49s
Epoch 35 - Score: 0.6538


EVAL: [8/9] Data 0.010 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [36][0/32] Data 0.135 (0.135) Elapsed 0m 1s (remain 0m 44s) Loss: 0.9184(0.9184) Grad Norm: 7.0893  LR: 1.8385e-04  
Epoch: [36][31/32] Data 0.130 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0181(0.3732) Grad Norm: 4.8121  LR: 1.8385e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 8  3  6  4 10]
preds: [8 3 0 4 2]
Epoch 36 - avg_train_loss: 0.3732  lr: 1.8385e-04  time: 49s
Epoch 36 - Score: 0.6462


EVAL: [8/9] Data 0.010 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [37][0/32] Data 0.133 (0.133) Elapsed 0m 1s (remain 0m 43s) Loss: 0.9191(0.9191) Grad Norm: 4.4904  LR: 1.6030e-04  
Epoch: [37][31/32] Data 0.133 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0021(0.4862) Grad Norm: 0.0713  LR: 1.6030e-04  
EVAL: [0/9] Data 0.081 (0.081) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 8  3  6  4 10]
preds: [8 3 0 0 2]
Epoch 37 - avg_train_loss: 0.4862  lr: 1.6030e-04  time: 49s
Epoch 37 - Score: 0.6231


EVAL: [8/9] Data 0.010 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [38][0/32] Data 0.123 (0.123) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0008(0.0008) Grad Norm: 0.0283  LR: 1.3809e-04  
Epoch: [38][31/32] Data 0.114 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0020(0.3902) Grad Norm: 0.1068  LR: 1.3809e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 8  3  6  4 10]
preds: [8 3 0 0 2]
Epoch 38 - avg_train_loss: 0.3902  lr: 1.3809e-04  time: 49s
Epoch 38 - Score: 0.6308


EVAL: [8/9] Data 0.010 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [39][0/32] Data 0.139 (0.139) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0021(0.0021) Grad Norm: 0.0731  LR: 1.1732e-04  
Epoch: [39][31/32] Data 0.132 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0010(0.3150) Grad Norm: 0.0575  LR: 1.1732e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 8  3  6  4 10]
preds: [8 3 0 0 3]
Epoch 39 - avg_train_loss: 0.3150  lr: 1.1732e-04  time: 49s
Epoch 39 - Score: 0.6385


EVAL: [8/9] Data 0.010 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [40][0/32] Data 0.127 (0.127) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0009(0.0009) Grad Norm: 0.0590  LR: 9.8058e-05  
Epoch: [40][31/32] Data 0.121 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.9370(0.4875) Grad Norm: 3.9741  LR: 9.8058e-05  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 8  3  6  4 10]
preds: [8 3 0 0 2]
Epoch 40 - avg_train_loss: 0.4875  lr: 9.8058e-05  time: 49s
Epoch 40 - Score: 0.6231


EVAL: [8/9] Data 0.012 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [41][0/32] Data 0.129 (0.129) Elapsed 0m 1s (remain 0m 43s) Loss: 1.0017(1.0017) Grad Norm: 4.5506  LR: 8.0390e-05  
Epoch: [41][31/32] Data 0.127 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.6968(0.4620) Grad Norm: 4.0741  LR: 8.0390e-05  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 8  3  6  4 10]
preds: [8 3 0 0 3]
Epoch 41 - avg_train_loss: 0.4620  lr: 8.0390e-05  time: 49s
Epoch 41 - Score: 0.6231


EVAL: [8/9] Data 0.010 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [42][0/32] Data 0.131 (0.131) Elapsed 0m 1s (remain 0m 43s) Loss: 0.8611(0.8611) Grad Norm: 5.0898  LR: 6.4381e-05  
Epoch: [42][31/32] Data 0.121 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0014(0.2883) Grad Norm: 0.1159  LR: 6.4381e-05  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 8  3  6  4 10]
preds: [8 3 0 0 2]
Epoch 42 - avg_train_loss: 0.2883  lr: 6.4381e-05  time: 49s
Epoch 42 - Score: 0.6308


EVAL: [8/9] Data 0.010 (0.069) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [43][0/32] Data 0.130 (0.130) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0046(0.0046) Grad Norm: 0.9069  LR: 5.0093e-05  
Epoch: [43][31/32] Data 0.114 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.9550(0.4784) Grad Norm: 5.0281  LR: 5.0093e-05  
EVAL: [0/9] Data 0.081 (0.081) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 8  3  6  4 10]
preds: [8 3 0 0 3]
Epoch 43 - avg_train_loss: 0.4784  lr: 5.0093e-05  time: 49s
Epoch 43 - Score: 0.6385


EVAL: [8/9] Data 0.009 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [44][0/32] Data 0.118 (0.118) Elapsed 0m 1s (remain 0m 43s) Loss: 0.5786(0.5786) Grad Norm: 3.2630  LR: 3.7578e-05  
Epoch: [44][31/32] Data 0.119 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0007(0.3190) Grad Norm: 0.0231  LR: 3.7578e-05  
EVAL: [0/9] Data 0.076 (0.076) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 8  3  6  4 10]
preds: [8 3 0 0 3]
Epoch 44 - avg_train_loss: 0.3190  lr: 3.7578e-05  time: 49s
Epoch 44 - Score: 0.6538


EVAL: [8/9] Data 0.009 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [45][0/32] Data 0.135 (0.135) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0008(0.0008) Grad Norm: 0.1422  LR: 2.6881e-05  
Epoch: [45][31/32] Data 0.139 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0010(0.3340) Grad Norm: 0.0363  LR: 2.6881e-05  
EVAL: [0/9] Data 0.080 (0.080) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 8  3  6  4 10]
preds: [8 3 0 0 2]
Epoch 45 - avg_train_loss: 0.3340  lr: 2.6881e-05  time: 50s
Epoch 45 - Score: 0.6692
Epoch 45 - Save Best Score: 0.6692 Model


EVAL: [8/9] Data 0.010 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [46][0/32] Data 0.147 (0.147) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0104(0.0104) Grad Norm: 0.4981  LR: 1.8039e-05  
Epoch: [46][31/32] Data 0.139 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0003(0.0015) Grad Norm: 0.0109  LR: 1.8039e-05  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 8  3  6  4 10]
preds: [8 3 0 0 2]
Epoch 46 - avg_train_loss: 0.0015  lr: 1.8039e-05  time: 49s
Epoch 46 - Score: 0.6692


EVAL: [8/9] Data 0.010 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [47][0/32] Data 0.133 (0.133) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0010(0.0010) Grad Norm: 0.0492  LR: 1.1073e-05  
Epoch: [47][31/32] Data 0.131 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0007(0.0024) Grad Norm: 0.0215  LR: 1.1073e-05  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 8  3  6  4 10]
preds: [8 3 0 0 2]
Epoch 47 - avg_train_loss: 0.0024  lr: 1.1073e-05  time: 49s
Epoch 47 - Score: 0.6692


EVAL: [8/9] Data 0.010 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [48][0/32] Data 0.134 (0.134) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0007(0.0007) Grad Norm: 0.0384  LR: 5.9882e-06  
Epoch: [48][31/32] Data 0.133 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0013(0.0057) Grad Norm: 0.0854  LR: 5.9882e-06  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 8  3  6  4 10]
preds: [8 3 0 0 2]
Epoch 48 - avg_train_loss: 0.0057  lr: 5.9882e-06  time: 49s
Epoch 48 - Score: 0.6615


EVAL: [8/9] Data 0.010 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [49][0/32] Data 0.131 (0.131) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0008(0.0008) Grad Norm: 0.0245  LR: 2.7534e-06  
Epoch: [49][31/32] Data 0.108 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0005(0.0012) Grad Norm: 0.0190  LR: 2.7534e-06  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 8  3  6  4 10]
preds: [8 3 0 0 2]
Epoch 49 - avg_train_loss: 0.0012  lr: 2.7534e-06  time: 49s
Epoch 49 - Score: 0.6615


EVAL: [8/9] Data 0.010 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [50][0/32] Data 0.132 (0.132) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0007(0.0007) Grad Norm: 0.0296  LR: 1.2467e-06  
Epoch: [50][31/32] Data 0.122 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0008(0.0019) Grad Norm: 0.0451  LR: 1.2467e-06  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 8  3  6  4 10]
preds: [8 3 0 0 2]
Epoch 50 - avg_train_loss: 0.0019  lr: 1.2467e-06  time: 49s
Epoch 50 - Score: 0.6615

oof score: 0.6636085626911316



EVAL: [8/9] Data 0.010 (0.066) Elapsed 0m 4s (remain 0m 0s) 
fold
0    131
1    131
2    131
3    131
4    130
dtype: int64
Epoch: [1][0/32] Data 0.124 (0.124) Elapsed 0m 1s (remain 0m 43s) Loss: 2.5153(2.5153) Grad Norm: inf  LR: 1.0000e-03  
Epoch: [1][31/32] Data 0.121 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 2.6010(2.5670) Grad Norm: 8.5620  LR: 1.0000e-03  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  2 10  0  8]
preds: [5 9 9 9 2]
Epoch 1 - avg_train_loss: 2.5670  lr: 1.0000e-03  time: 49s
Epoch 1 - Score: 0.1679
Epoch 1 - Save Best Score: 0.1679 Model


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [2][0/32] Data 0.149 (0.149) Elapsed 0m 1s (remain 0m 45s) Loss: 2.4939(2.4939) Grad Norm: 8.5786  LR: 9.9803e-04  
Epoch: [2][31/32] Data 0.136 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 2.5461(2.4064) Grad Norm: 7.8436  LR: 9.9803e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  2 10  0  8]
preds: [2 0 2 0 2]
Epoch 2 - avg_train_loss: 2.4064  lr: 9.9803e-04  time: 50s
Epoch 2 - Score: 0.3282
Epoch 2 - Save Best Score: 0.3282 Model


EVAL: [8/9] Data 0.017 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [3][0/32] Data 0.138 (0.138) Elapsed 0m 1s (remain 0m 45s) Loss: 2.1090(2.1090) Grad Norm: 8.2260  LR: 9.9312e-04  
Epoch: [3][31/32] Data 0.121 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 2.0679(2.0896) Grad Norm: 12.5659  LR: 9.9312e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  2 10  0  8]
preds: [6 2 2 6 8]
Epoch 3 - avg_train_loss: 2.0896  lr: 9.9312e-04  time: 50s
Epoch 3 - Score: 0.4122
Epoch 3 - Save Best Score: 0.4122 Model


EVAL: [8/9] Data 0.014 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [4][0/32] Data 0.158 (0.158) Elapsed 0m 1s (remain 0m 45s) Loss: 1.2631(1.2631) Grad Norm: 14.4973  LR: 9.8627e-04  
Epoch: [4][31/32] Data 0.131 (0.127) Elapsed 0m 45s (remain 0m 0s) Loss: 2.0234(1.8166) Grad Norm: 9.1903  LR: 9.8627e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  2 10  0  8]
preds: [4 2 4 0 8]
Epoch 4 - avg_train_loss: 1.8166  lr: 9.8627e-04  time: 50s
Epoch 4 - Score: 0.4809
Epoch 4 - Save Best Score: 0.4809 Model


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [5][0/32] Data 0.146 (0.146) Elapsed 0m 1s (remain 0m 45s) Loss: 0.9181(0.9181) Grad Norm: 12.2112  LR: 9.7751e-04  
Epoch: [5][31/32] Data 0.113 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 1.3418(1.5028) Grad Norm: 13.9157  LR: 9.7751e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  2 10  0  8]
preds: [2 2 2 0 8]
Epoch 5 - avg_train_loss: 1.5028  lr: 9.7751e-04  time: 50s
Epoch 5 - Score: 0.4962
Epoch 5 - Save Best Score: 0.4962 Model


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [6][0/32] Data 0.124 (0.124) Elapsed 0m 1s (remain 0m 44s) Loss: 1.8272(1.8272) Grad Norm: 7.9175  LR: 9.6688e-04  
Epoch: [6][31/32] Data 0.127 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 1.5521(1.2795) Grad Norm: 13.1786  LR: 9.6688e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  2 10  0  8]
preds: [2 2 2 0 8]
Epoch 6 - avg_train_loss: 1.2795  lr: 9.6688e-04  time: 50s
Epoch 6 - Score: 0.5038
Epoch 6 - Save Best Score: 0.5038 Model


EVAL: [8/9] Data 0.014 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [7][0/32] Data 0.129 (0.129) Elapsed 0m 1s (remain 0m 44s) Loss: 0.9956(0.9956) Grad Norm: 14.4102  LR: 9.5441e-04  
Epoch: [7][31/32] Data 0.126 (0.127) Elapsed 0m 45s (remain 0m 0s) Loss: 0.6610(1.0910) Grad Norm: 8.6337  LR: 9.5441e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  2 10  0  8]
preds: [2 2 2 0 8]
Epoch 7 - avg_train_loss: 1.0910  lr: 9.5441e-04  time: 50s
Epoch 7 - Score: 0.5573
Epoch 7 - Save Best Score: 0.5573 Model


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [8][0/32] Data 0.121 (0.121) Elapsed 0m 1s (remain 0m 44s) Loss: 0.5765(0.5765) Grad Norm: 9.3934  LR: 9.4016e-04  
Epoch: [8][31/32] Data 0.125 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 0.6271(0.9662) Grad Norm: 9.6186  LR: 9.4016e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  2 10  0  8]
preds: [2 2 1 0 8]
Epoch 8 - avg_train_loss: 0.9662  lr: 9.4016e-04  time: 50s
Epoch 8 - Score: 0.5115


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [9][0/32] Data 0.141 (0.141) Elapsed 0m 1s (remain 0m 44s) Loss: 1.2136(1.2136) Grad Norm: 9.0144  LR: 9.2418e-04  
Epoch: [9][31/32] Data 0.120 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.6433(0.9506) Grad Norm: 19.8184  LR: 9.2418e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  2 10  0  8]
preds: [5 2 1 0 8]
Epoch 9 - avg_train_loss: 0.9506  lr: 9.2418e-04  time: 50s
Epoch 9 - Score: 0.4809


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [10][0/32] Data 0.123 (0.123) Elapsed 0m 1s (remain 0m 44s) Loss: 1.0838(1.0838) Grad Norm: 22.2019  LR: 9.0654e-04  
Epoch: [10][31/32] Data 0.138 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 1.4092(1.1893) Grad Norm: 5.7598  LR: 9.0654e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  2 10  0  8]
preds: [5 2 5 0 8]
Epoch 10 - avg_train_loss: 1.1893  lr: 9.0654e-04  time: 50s
Epoch 10 - Score: 0.5725
Epoch 10 - Save Best Score: 0.5725 Model


EVAL: [8/9] Data 0.015 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [11][0/32] Data 0.125 (0.125) Elapsed 0m 1s (remain 0m 44s) Loss: 1.0419(1.0419) Grad Norm: 6.2577  LR: 8.8730e-04  
Epoch: [11][31/32] Data 0.134 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.5384(1.1170) Grad Norm: 11.4577  LR: 8.8730e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  2 10  0  8]
preds: [2 2 2 0 8]
Epoch 11 - avg_train_loss: 1.1170  lr: 8.8730e-04  time: 50s
Epoch 11 - Score: 0.5496


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [12][0/32] Data 0.125 (0.125) Elapsed 0m 1s (remain 0m 43s) Loss: 0.1041(0.1041) Grad Norm: 3.5259  LR: 8.6655e-04  
Epoch: [12][31/32] Data 0.136 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 1.0224(1.0569) Grad Norm: 7.1114  LR: 8.6655e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  2 10  0  8]
preds: [2 2 2 0 8]
Epoch 12 - avg_train_loss: 1.0569  lr: 8.6655e-04  time: 50s
Epoch 12 - Score: 0.5573


EVAL: [8/9] Data 0.015 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [13][0/32] Data 0.136 (0.136) Elapsed 0m 1s (remain 0m 44s) Loss: 1.3847(1.3847) Grad Norm: 5.1197  LR: 8.4436e-04  
Epoch: [13][31/32] Data 0.131 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 1.1137(1.0117) Grad Norm: 27.7717  LR: 8.4436e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  2 10  0  8]
preds: [11  2  5  0  8]
Epoch 13 - avg_train_loss: 1.0117  lr: 8.4436e-04  time: 50s
Epoch 13 - Score: 0.5038


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [14][0/32] Data 0.127 (0.127) Elapsed 0m 1s (remain 0m 43s) Loss: 1.3505(1.3505) Grad Norm: 6.8005  LR: 8.2081e-04  
Epoch: [14][31/32] Data 0.118 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.7479(0.8675) Grad Norm: 14.7256  LR: 8.2081e-04  
EVAL: [0/9] Data 0.075 (0.075) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  2 10  0  8]
preds: [11  2  1  0  8]
Epoch 14 - avg_train_loss: 0.8675  lr: 8.2081e-04  time: 50s
Epoch 14 - Score: 0.5191


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [15][0/32] Data 0.131 (0.131) Elapsed 0m 1s (remain 0m 44s) Loss: 0.1726(0.1726) Grad Norm: 7.1547  LR: 7.9601e-04  
Epoch: [15][31/32] Data 0.121 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 1.4770(0.8670) Grad Norm: 13.2852  LR: 7.9601e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  2 10  0  8]
preds: [2 2 2 0 8]
Epoch 15 - avg_train_loss: 0.8670  lr: 7.9601e-04  time: 50s
Epoch 15 - Score: 0.5496


EVAL: [8/9] Data 0.015 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [16][0/32] Data 0.138 (0.138) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0247(0.0247) Grad Norm: 1.5383  LR: 7.7006e-04  
Epoch: [16][31/32] Data 0.122 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 1.4763(0.9309) Grad Norm: 5.9284  LR: 7.7006e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  2 10  0  8]
preds: [11  2  4  0  8]
Epoch 16 - avg_train_loss: 0.9309  lr: 7.7006e-04  time: 50s
Epoch 16 - Score: 0.5115


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [17][0/32] Data 0.138 (0.138) Elapsed 0m 1s (remain 0m 44s) Loss: 1.3438(1.3438) Grad Norm: 8.0203  LR: 7.4304e-04  
Epoch: [17][31/32] Data 0.114 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 1.7828(0.6731) Grad Norm: 20.9740  LR: 7.4304e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  2 10  0  8]
preds: [0 2 2 0 8]
Epoch 17 - avg_train_loss: 0.6731  lr: 7.4304e-04  time: 50s
Epoch 17 - Score: 0.5344


EVAL: [8/9] Data 0.015 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [18][0/32] Data 0.135 (0.135) Elapsed 0m 1s (remain 0m 44s) Loss: 1.1040(1.1040) Grad Norm: 5.4835  LR: 7.1508e-04  
Epoch: [18][31/32] Data 0.123 (0.127) Elapsed 0m 45s (remain 0m 0s) Loss: 0.4309(0.7677) Grad Norm: 16.5211  LR: 7.1508e-04  
EVAL: [0/9] Data 0.078 (0.078) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  2 10  0  8]
preds: [6 2 2 0 8]
Epoch 18 - avg_train_loss: 0.7677  lr: 7.1508e-04  time: 50s
Epoch 18 - Score: 0.5802
Epoch 18 - Save Best Score: 0.5802 Model


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [19][0/32] Data 0.141 (0.141) Elapsed 0m 1s (remain 0m 44s) Loss: 1.2107(1.2107) Grad Norm: 5.2844  LR: 6.8627e-04  
Epoch: [19][31/32] Data 0.112 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 1.4126(0.7632) Grad Norm: 6.8713  LR: 6.8627e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  2 10  0  8]
preds: [11  2  2  0  8]
Epoch 19 - avg_train_loss: 0.7632  lr: 6.8627e-04  time: 50s
Epoch 19 - Score: 0.5267


EVAL: [8/9] Data 0.015 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [20][0/32] Data 0.137 (0.137) Elapsed 0m 1s (remain 0m 44s) Loss: 1.4506(1.4506) Grad Norm: 7.2421  LR: 6.5674e-04  
Epoch: [20][31/32] Data 0.128 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 1.0668(0.5971) Grad Norm: 10.5081  LR: 6.5674e-04  
EVAL: [0/9] Data 0.075 (0.075) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  2 10  0  8]
preds: [11  2  4  0  8]
Epoch 20 - avg_train_loss: 0.5971  lr: 6.5674e-04  time: 50s
Epoch 20 - Score: 0.5420


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [21][0/32] Data 0.117 (0.117) Elapsed 0m 1s (remain 0m 43s) Loss: 0.9966(0.9966) Grad Norm: 6.6693  LR: 6.2661e-04  
Epoch: [21][31/32] Data 0.126 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.1113(0.7030) Grad Norm: 6.8741  LR: 6.2661e-04  
EVAL: [0/9] Data 0.077 (0.077) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  2 10  0  8]
preds: [0 2 2 0 8]
Epoch 21 - avg_train_loss: 0.7030  lr: 6.2661e-04  time: 50s
Epoch 21 - Score: 0.4885


EVAL: [8/9] Data 0.015 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [22][0/32] Data 0.128 (0.128) Elapsed 0m 1s (remain 0m 43s) Loss: 0.5421(0.5421) Grad Norm: 10.1909  LR: 5.9598e-04  
Epoch: [22][31/32] Data 0.127 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0274(0.7911) Grad Norm: 1.3597  LR: 5.9598e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  2 10  0  8]
preds: [2 2 2 0 8]
Epoch 22 - avg_train_loss: 0.7911  lr: 5.9598e-04  time: 50s
Epoch 22 - Score: 0.5420


EVAL: [8/9] Data 0.015 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [23][0/32] Data 0.135 (0.135) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0535(0.0535) Grad Norm: 4.0516  LR: 5.6498e-04  
Epoch: [23][31/32] Data 0.128 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.1994(0.6136) Grad Norm: 6.2232  LR: 5.6498e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  2 10  0  8]
preds: [11  2  2  0  8]
Epoch 23 - avg_train_loss: 0.6136  lr: 5.6498e-04  time: 49s
Epoch 23 - Score: 0.5496


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [24][0/32] Data 0.134 (0.134) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0089(0.0089) Grad Norm: 0.4705  LR: 5.3373e-04  
Epoch: [24][31/32] Data 0.134 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 1.1807(0.6139) Grad Norm: 9.7364  LR: 5.3373e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  2 10  0  8]
preds: [5 2 4 0 8]
Epoch 24 - avg_train_loss: 0.6139  lr: 5.3373e-04  time: 49s
Epoch 24 - Score: 0.5267


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [25][0/32] Data 0.127 (0.127) Elapsed 0m 1s (remain 0m 44s) Loss: 0.1401(0.1401) Grad Norm: 22.0908  LR: 5.0236e-04  
Epoch: [25][31/32] Data 0.122 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0053(0.7404) Grad Norm: 0.2971  LR: 5.0236e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  2 10  0  8]
preds: [5 2 2 0 8]
Epoch 25 - avg_train_loss: 0.7404  lr: 5.0236e-04  time: 50s
Epoch 25 - Score: 0.5267


EVAL: [8/9] Data 0.014 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [26][0/32] Data 0.139 (0.139) Elapsed 0m 1s (remain 0m 44s) Loss: 0.8904(0.8904) Grad Norm: 5.5150  LR: 4.7099e-04  
Epoch: [26][31/32] Data 0.136 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 1.0824(0.5385) Grad Norm: 5.2331  LR: 4.7099e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  2 10  0  8]
preds: [11  2  2  0  8]
Epoch 26 - avg_train_loss: 0.5385  lr: 4.7099e-04  time: 49s
Epoch 26 - Score: 0.5725


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [27][0/32] Data 0.141 (0.141) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0293(0.0293) Grad Norm: 3.2266  LR: 4.3974e-04  
Epoch: [27][31/32] Data 0.117 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0376(0.5707) Grad Norm: 2.3954  LR: 4.3974e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  2 10  0  8]
preds: [11  2  2  0  8]
Epoch 27 - avg_train_loss: 0.5707  lr: 4.3974e-04  time: 49s
Epoch 27 - Score: 0.5878
Epoch 27 - Save Best Score: 0.5878 Model


EVAL: [8/9] Data 0.015 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [28][0/32] Data 0.136 (0.136) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0011(0.0011) Grad Norm: 0.0927  LR: 4.0874e-04  
Epoch: [28][31/32] Data 0.124 (0.128) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0331(0.4057) Grad Norm: 3.0617  LR: 4.0874e-04  
EVAL: [0/9] Data 0.075 (0.075) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  2 10  0  8]
preds: [11  2  2  0  8]
Epoch 28 - avg_train_loss: 0.4057  lr: 4.0874e-04  time: 50s
Epoch 28 - Score: 0.5573


EVAL: [8/9] Data 0.015 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [29][0/32] Data 0.135 (0.135) Elapsed 0m 1s (remain 0m 44s) Loss: 0.9644(0.9644) Grad Norm: 8.2616  LR: 3.7811e-04  
Epoch: [29][31/32] Data 0.122 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 0.9736(0.5290) Grad Norm: 7.4732  LR: 3.7811e-04  
EVAL: [0/9] Data 0.076 (0.076) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  2 10  0  8]
preds: [11  2  2  0  8]
Epoch 29 - avg_train_loss: 0.5290  lr: 3.7811e-04  time: 50s
Epoch 29 - Score: 0.5725


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [30][0/32] Data 0.137 (0.137) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0066(0.0066) Grad Norm: 2.4160  LR: 3.4797e-04  
Epoch: [30][31/32] Data 0.138 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 0.1895(0.5754) Grad Norm: 5.5632  LR: 3.4797e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  2 10  0  8]
preds: [11  2  1  0  8]
Epoch 30 - avg_train_loss: 0.5754  lr: 3.4797e-04  time: 50s
Epoch 30 - Score: 0.5496


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [31][0/32] Data 0.131 (0.131) Elapsed 0m 1s (remain 0m 43s) Loss: 0.7838(0.7838) Grad Norm: 3.2023  LR: 3.1843e-04  
Epoch: [31][31/32] Data 0.128 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0034(0.5540) Grad Norm: 0.1699  LR: 3.1843e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  2 10  0  8]
preds: [11  2  2  0  8]
Epoch 31 - avg_train_loss: 0.5540  lr: 3.1843e-04  time: 50s
Epoch 31 - Score: 0.5802


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [32][0/32] Data 0.135 (0.135) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0032(0.0032) Grad Norm: 0.1667  LR: 2.8962e-04  
Epoch: [32][31/32] Data 0.121 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.8793(0.4679) Grad Norm: 3.5778  LR: 2.8962e-04  
EVAL: [0/9] Data 0.078 (0.078) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  2 10  0  8]
preds: [11  2  2  0  8]
Epoch 32 - avg_train_loss: 0.4679  lr: 2.8962e-04  time: 50s
Epoch 32 - Score: 0.5725


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [33][0/32] Data 0.125 (0.125) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0759(0.0759) Grad Norm: 3.0893  LR: 2.6165e-04  
Epoch: [33][31/32] Data 0.119 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0044(0.5071) Grad Norm: 0.2228  LR: 2.6165e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  2 10  0  8]
preds: [11  2  2  0  8]
Epoch 33 - avg_train_loss: 0.5071  lr: 2.6165e-04  time: 49s
Epoch 33 - Score: 0.5420


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [34][0/32] Data 0.142 (0.142) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0009(0.0009) Grad Norm: 0.0347  LR: 2.3463e-04  
Epoch: [34][31/32] Data 0.136 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.8490(0.4590) Grad Norm: 5.2173  LR: 2.3463e-04  
EVAL: [0/9] Data 0.076 (0.076) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  2 10  0  8]
preds: [11  2  2  0  8]
Epoch 34 - avg_train_loss: 0.4590  lr: 2.3463e-04  time: 50s
Epoch 34 - Score: 0.5954
Epoch 34 - Save Best Score: 0.5954 Model


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [35][0/32] Data 0.136 (0.136) Elapsed 0m 1s (remain 0m 45s) Loss: 0.0027(0.0027) Grad Norm: 0.1013  LR: 2.0866e-04  
Epoch: [35][31/32] Data 0.137 (0.129) Elapsed 0m 45s (remain 0m 0s) Loss: 0.8504(0.5627) Grad Norm: 3.1316  LR: 2.0866e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  2 10  0  8]
preds: [11  2  4  0  8]
Epoch 35 - avg_train_loss: 0.5627  lr: 2.0866e-04  time: 50s
Epoch 35 - Score: 0.6031
Epoch 35 - Save Best Score: 0.6031 Model


EVAL: [8/9] Data 0.015 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [36][0/32] Data 0.151 (0.151) Elapsed 0m 1s (remain 0m 44s) Loss: 0.7077(0.7077) Grad Norm: 2.0435  LR: 1.8385e-04  
Epoch: [36][31/32] Data 0.132 (0.128) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0017(0.4844) Grad Norm: 0.0555  LR: 1.8385e-04  
EVAL: [0/9] Data 0.078 (0.078) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  2 10  0  8]
preds: [11  2  4  0  8]
Epoch 36 - avg_train_loss: 0.4844  lr: 1.8385e-04  time: 50s
Epoch 36 - Score: 0.5954


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [37][0/32] Data 0.128 (0.128) Elapsed 0m 1s (remain 0m 43s) Loss: 0.7322(0.7322) Grad Norm: 3.1040  LR: 1.6030e-04  
Epoch: [37][31/32] Data 0.128 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0071(0.4928) Grad Norm: 1.1050  LR: 1.6030e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  2 10  0  8]
preds: [11  2  2  0  8]
Epoch 37 - avg_train_loss: 0.4928  lr: 1.6030e-04  time: 50s
Epoch 37 - Score: 0.5954


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [38][0/32] Data 0.140 (0.140) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0012(0.0012) Grad Norm: 0.1190  LR: 1.3809e-04  
Epoch: [38][31/32] Data 0.127 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.5996(0.4354) Grad Norm: 3.2008  LR: 1.3809e-04  
EVAL: [0/9] Data 0.080 (0.080) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  2 10  0  8]
preds: [11  2  2  0  8]
Epoch 38 - avg_train_loss: 0.4354  lr: 1.3809e-04  time: 49s
Epoch 38 - Score: 0.5649


EVAL: [8/9] Data 0.014 (0.069) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [39][0/32] Data 0.144 (0.144) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0005(0.0005) Grad Norm: 0.0261  LR: 1.1732e-04  
Epoch: [39][31/32] Data 0.126 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0075(0.4268) Grad Norm: 0.6670  LR: 1.1732e-04  
EVAL: [0/9] Data 0.080 (0.080) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  2 10  0  8]
preds: [11  2  2  0  8]
Epoch 39 - avg_train_loss: 0.4268  lr: 1.1732e-04  time: 50s
Epoch 39 - Score: 0.5802


EVAL: [8/9] Data 0.017 (0.069) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [40][0/32] Data 0.132 (0.132) Elapsed 0m 1s (remain 0m 44s) Loss: 0.7885(0.7885) Grad Norm: 4.3154  LR: 9.8058e-05  
Epoch: [40][31/32] Data 0.130 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.6975(0.4679) Grad Norm: 3.0404  LR: 9.8058e-05  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  2 10  0  8]
preds: [11  2  2  0  8]
Epoch 40 - avg_train_loss: 0.4679  lr: 9.8058e-05  time: 49s
Epoch 40 - Score: 0.5725


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [41][0/32] Data 0.144 (0.144) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0011(0.0011) Grad Norm: 0.0908  LR: 8.0390e-05  
Epoch: [41][31/32] Data 0.128 (0.128) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0005(0.4056) Grad Norm: 0.0248  LR: 8.0390e-05  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  2 10  0  8]
preds: [11  2  2  0  8]
Epoch 41 - avg_train_loss: 0.4056  lr: 8.0390e-05  time: 50s
Epoch 41 - Score: 0.6031


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [42][0/32] Data 0.139 (0.139) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0036(0.0036) Grad Norm: 0.2570  LR: 6.4381e-05  
Epoch: [42][31/32] Data 0.125 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 0.7189(0.4797) Grad Norm: 3.5480  LR: 6.4381e-05  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  2 10  0  8]
preds: [11  2  2  0  8]
Epoch 42 - avg_train_loss: 0.4797  lr: 6.4381e-05  time: 50s
Epoch 42 - Score: 0.6031


EVAL: [8/9] Data 0.015 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [43][0/32] Data 0.143 (0.143) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0010(0.0010) Grad Norm: 0.1366  LR: 5.0093e-05  
Epoch: [43][31/32] Data 0.127 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0012(0.3798) Grad Norm: 0.0698  LR: 5.0093e-05  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  2 10  0  8]
preds: [11  2  2  0  8]
Epoch 43 - avg_train_loss: 0.3798  lr: 5.0093e-05  time: 49s
Epoch 43 - Score: 0.6183
Epoch 43 - Save Best Score: 0.6183 Model


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [44][0/32] Data 0.130 (0.130) Elapsed 0m 1s (remain 0m 44s) Loss: 0.9545(0.9545) Grad Norm: 4.5733  LR: 3.7578e-05  
Epoch: [44][31/32] Data 0.120 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0006(0.3668) Grad Norm: 0.0354  LR: 3.7578e-05  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  2 10  0  8]
preds: [11  2  2  0  8]
Epoch 44 - avg_train_loss: 0.3668  lr: 3.7578e-05  time: 49s
Epoch 44 - Score: 0.5802


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [45][0/32] Data 0.130 (0.130) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0008(0.0008) Grad Norm: 0.0317  LR: 2.6881e-05  
Epoch: [45][31/32] Data 0.120 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.8832(0.3569) Grad Norm: 5.6988  LR: 2.6881e-05  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  2 10  0  8]
preds: [11  2  2  0  8]
Epoch 45 - avg_train_loss: 0.3569  lr: 2.6881e-05  time: 49s
Epoch 45 - Score: 0.5649


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [46][0/32] Data 0.115 (0.115) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0017(0.0017) Grad Norm: 0.1396  LR: 1.8039e-05  
Epoch: [46][31/32] Data 0.124 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0004(0.0072) Grad Norm: 0.0125  LR: 1.8039e-05  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  2 10  0  8]
preds: [11  2  2  0  8]
Epoch 46 - avg_train_loss: 0.0072  lr: 1.8039e-05  time: 49s
Epoch 46 - Score: 0.5802


EVAL: [8/9] Data 0.015 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [47][0/32] Data 0.139 (0.139) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0004(0.0004) Grad Norm: 0.0327  LR: 1.1073e-05  
Epoch: [47][31/32] Data 0.110 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0005(0.0016) Grad Norm: 0.0140  LR: 1.1073e-05  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  2 10  0  8]
preds: [11  2  2  0  8]
Epoch 47 - avg_train_loss: 0.0016  lr: 1.1073e-05  time: 50s
Epoch 47 - Score: 0.5802


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [48][0/32] Data 0.130 (0.130) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0003(0.0003) Grad Norm: 0.0153  LR: 5.9882e-06  
Epoch: [48][31/32] Data 0.122 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0004(0.0030) Grad Norm: 0.0210  LR: 5.9882e-06  
EVAL: [0/9] Data 0.075 (0.075) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  2 10  0  8]
preds: [11  2  2  0  8]
Epoch 48 - avg_train_loss: 0.0030  lr: 5.9882e-06  time: 49s
Epoch 48 - Score: 0.5802


EVAL: [8/9] Data 0.014 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [49][0/32] Data 0.145 (0.145) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0013(0.0013) Grad Norm: 0.0389  LR: 2.7534e-06  
Epoch: [49][31/32] Data 0.114 (0.127) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0003(0.0015) Grad Norm: 0.0109  LR: 2.7534e-06  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  2 10  0  8]
preds: [11  2  2  0  8]
Epoch 49 - avg_train_loss: 0.0015  lr: 2.7534e-06  time: 50s
Epoch 49 - Score: 0.5802


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [50][0/32] Data 0.137 (0.137) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0006(0.0006) Grad Norm: 0.0474  LR: 1.2467e-06  
Epoch: [50][31/32] Data 0.117 (0.127) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0007(0.0011) Grad Norm: 0.0454  LR: 1.2467e-06  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  2 10  0  8]
preds: [11  2  2  0  8]
Epoch 50 - avg_train_loss: 0.0011  lr: 1.2467e-06  time: 50s
Epoch 50 - Score: 0.5802


EVAL: [8/9] Data 0.015 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [1][0/32] Data 0.129 (0.129) Elapsed 0m 1s (remain 0m 43s) Loss: 2.7845(2.7845) Grad Norm: inf  LR: 1.0000e-03  
Epoch: [1][31/32] Data 0.123 (0.128) Elapsed 0m 45s (remain 0m 0s) Loss: 2.5302(2.5514) Grad Norm: 7.3470  LR: 1.0000e-03  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [12 11  2  2 11]
preds: [2 2 2 2 2]
Epoch 1 - avg_train_loss: 2.5514  lr: 1.0000e-03  time: 50s
Epoch 1 - Score: 0.2290
Epoch 1 - Save Best Score: 0.2290 Model


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [2][0/32] Data 0.140 (0.140) Elapsed 0m 1s (remain 0m 44s) Loss: 2.3208(2.3208) Grad Norm: 11.5835  LR: 9.9803e-04  
Epoch: [2][31/32] Data 0.119 (0.128) Elapsed 0m 45s (remain 0m 0s) Loss: 2.2167(2.3437) Grad Norm: 17.2702  LR: 9.9803e-04  
EVAL: [0/9] Data 0.075 (0.075) Elapsed 0m 0s (remain 0m 4s) 


labels: [12 11  2  2 11]
preds: [3 6 2 3 3]
Epoch 2 - avg_train_loss: 2.3437  lr: 9.9803e-04  time: 50s
Epoch 2 - Score: 0.2977
Epoch 2 - Save Best Score: 0.2977 Model


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [3][0/32] Data 0.128 (0.128) Elapsed 0m 1s (remain 0m 44s) Loss: 2.3730(2.3730) Grad Norm: 17.6231  LR: 9.9312e-04  
Epoch: [3][31/32] Data 0.127 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 2.3305(2.0448) Grad Norm: 16.3902  LR: 9.9312e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [12 11  2  2 11]
preds: [2 7 2 2 0]
Epoch 3 - avg_train_loss: 2.0448  lr: 9.9312e-04  time: 50s
Epoch 3 - Score: 0.4656
Epoch 3 - Save Best Score: 0.4656 Model


EVAL: [8/9] Data 0.015 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [4][0/32] Data 0.127 (0.127) Elapsed 0m 1s (remain 0m 44s) Loss: 1.6338(1.6338) Grad Norm: 16.8157  LR: 9.8627e-04  
Epoch: [4][31/32] Data 0.122 (0.127) Elapsed 0m 45s (remain 0m 0s) Loss: 1.1608(1.7714) Grad Norm: 11.7238  LR: 9.8627e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [12 11  2  2 11]
preds: [ 8 11  2  2  2]
Epoch 4 - avg_train_loss: 1.7714  lr: 9.8627e-04  time: 50s
Epoch 4 - Score: 0.3664


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [5][0/32] Data 0.131 (0.131) Elapsed 0m 1s (remain 0m 44s) Loss: 1.2065(1.2065) Grad Norm: 18.5475  LR: 9.7751e-04  
Epoch: [5][31/32] Data 0.131 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 1.7051(1.5482) Grad Norm: 9.6605  LR: 9.7751e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [12 11  2  2 11]
preds: [11 11  6  2 11]
Epoch 5 - avg_train_loss: 1.5482  lr: 9.7751e-04  time: 50s
Epoch 5 - Score: 0.4656


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [6][0/32] Data 0.136 (0.136) Elapsed 0m 1s (remain 0m 44s) Loss: 2.0951(2.0951) Grad Norm: 12.2424  LR: 9.6688e-04  
Epoch: [6][31/32] Data 0.118 (0.127) Elapsed 0m 45s (remain 0m 0s) Loss: 0.6649(1.3780) Grad Norm: 13.1759  LR: 9.6688e-04  
EVAL: [0/9] Data 0.075 (0.075) Elapsed 0m 0s (remain 0m 4s) 


labels: [12 11  2  2 11]
preds: [12 11  0  0 11]
Epoch 6 - avg_train_loss: 1.3780  lr: 9.6688e-04  time: 50s
Epoch 6 - Score: 0.5573
Epoch 6 - Save Best Score: 0.5573 Model


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [7][0/32] Data 0.130 (0.130) Elapsed 0m 1s (remain 0m 44s) Loss: 1.8130(1.8130) Grad Norm: 15.9456  LR: 9.5441e-04  
Epoch: [7][31/32] Data 0.118 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.8147(1.2739) Grad Norm: 13.2433  LR: 9.5441e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [12 11  2  2 11]
preds: [ 2  1  0  2 11]
Epoch 7 - avg_train_loss: 1.2739  lr: 9.5441e-04  time: 50s
Epoch 7 - Score: 0.4885


EVAL: [8/9] Data 0.015 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [8][0/32] Data 0.136 (0.136) Elapsed 0m 1s (remain 0m 44s) Loss: 0.4768(0.4768) Grad Norm: 10.0248  LR: 9.4016e-04  
Epoch: [8][31/32] Data 0.128 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.9875(1.2629) Grad Norm: 15.8183  LR: 9.4016e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [12 11  2  2 11]
preds: [ 9 11  1  0 11]
Epoch 8 - avg_train_loss: 1.2629  lr: 9.4016e-04  time: 50s
Epoch 8 - Score: 0.5649
Epoch 8 - Save Best Score: 0.5649 Model


EVAL: [8/9] Data 0.015 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [9][0/32] Data 0.155 (0.155) Elapsed 0m 1s (remain 0m 45s) Loss: 1.5250(1.5250) Grad Norm: 7.9721  LR: 9.2418e-04  
Epoch: [9][31/32] Data 0.137 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 1.6451(1.1740) Grad Norm: 9.9734  LR: 9.2418e-04  
EVAL: [0/9] Data 0.079 (0.079) Elapsed 0m 0s (remain 0m 4s) 


labels: [12 11  2  2 11]
preds: [ 9 11  1  4 11]
Epoch 9 - avg_train_loss: 1.1740  lr: 9.2418e-04  time: 50s
Epoch 9 - Score: 0.5573


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [10][0/32] Data 0.130 (0.130) Elapsed 0m 1s (remain 0m 43s) Loss: 0.3283(0.3283) Grad Norm: 6.4238  LR: 9.0654e-04  
Epoch: [10][31/32] Data 0.120 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 0.7159(1.0038) Grad Norm: 18.8488  LR: 9.0654e-04  
EVAL: [0/9] Data 0.076 (0.076) Elapsed 0m 0s (remain 0m 4s) 


labels: [12 11  2  2 11]
preds: [ 9  1  1  0 11]
Epoch 10 - avg_train_loss: 1.0038  lr: 9.0654e-04  time: 50s
Epoch 10 - Score: 0.5573


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [11][0/32] Data 0.137 (0.137) Elapsed 0m 1s (remain 0m 44s) Loss: 0.3251(0.3251) Grad Norm: 11.3458  LR: 8.8730e-04  
Epoch: [11][31/32] Data 0.134 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 1.4284(1.2411) Grad Norm: 6.2725  LR: 8.8730e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [12 11  2  2 11]
preds: [11 11  0  0 11]
Epoch 11 - avg_train_loss: 1.2411  lr: 8.8730e-04  time: 50s
Epoch 11 - Score: 0.5420


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [12][0/32] Data 0.136 (0.136) Elapsed 0m 1s (remain 0m 44s) Loss: 1.7481(1.7481) Grad Norm: 8.1283  LR: 8.6655e-04  
Epoch: [12][31/32] Data 0.131 (0.128) Elapsed 0m 45s (remain 0m 0s) Loss: 0.7232(0.9737) Grad Norm: 12.5905  LR: 8.6655e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [12 11  2  2 11]
preds: [12 11  0  0 11]
Epoch 12 - avg_train_loss: 0.9737  lr: 8.6655e-04  time: 50s
Epoch 12 - Score: 0.5267


EVAL: [8/9] Data 0.015 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [13][0/32] Data 0.124 (0.124) Elapsed 0m 1s (remain 0m 43s) Loss: 0.5922(0.5922) Grad Norm: 9.6173  LR: 8.4436e-04  
Epoch: [13][31/32] Data 0.121 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 1.8932(0.9805) Grad Norm: 9.5022  LR: 8.4436e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [12 11  2  2 11]
preds: [12 11  2  3 11]
Epoch 13 - avg_train_loss: 0.9805  lr: 8.4436e-04  time: 50s
Epoch 13 - Score: 0.6260
Epoch 13 - Save Best Score: 0.6260 Model


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [14][0/32] Data 0.133 (0.133) Elapsed 0m 1s (remain 0m 44s) Loss: 1.3535(1.3535) Grad Norm: 6.2646  LR: 8.2081e-04  
Epoch: [14][31/32] Data 0.111 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0863(0.8944) Grad Norm: 7.5550  LR: 8.2081e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [12 11  2  2 11]
preds: [12 11  0  0 11]
Epoch 14 - avg_train_loss: 0.8944  lr: 8.2081e-04  time: 50s
Epoch 14 - Score: 0.5878


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [15][0/32] Data 0.132 (0.132) Elapsed 0m 1s (remain 0m 44s) Loss: 1.8934(1.8934) Grad Norm: 10.3852  LR: 7.9601e-04  
Epoch: [15][31/32] Data 0.125 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 1.9113(1.0992) Grad Norm: 16.3127  LR: 7.9601e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [12 11  2  2 11]
preds: [ 9 11  0  4 11]
Epoch 15 - avg_train_loss: 1.0992  lr: 7.9601e-04  time: 50s
Epoch 15 - Score: 0.4809


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [16][0/32] Data 0.138 (0.138) Elapsed 0m 1s (remain 0m 44s) Loss: 0.3895(0.3895) Grad Norm: 10.5612  LR: 7.7006e-04  
Epoch: [16][31/32] Data 0.126 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 0.1206(0.7429) Grad Norm: 4.6414  LR: 7.7006e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [12 11  2  2 11]
preds: [11 11  1  0 11]
Epoch 16 - avg_train_loss: 0.7429  lr: 7.7006e-04  time: 50s
Epoch 16 - Score: 0.5267


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [17][0/32] Data 0.145 (0.145) Elapsed 0m 1s (remain 0m 44s) Loss: 0.3161(0.3161) Grad Norm: 17.6134  LR: 7.4304e-04  
Epoch: [17][31/32] Data 0.124 (0.127) Elapsed 0m 45s (remain 0m 0s) Loss: 1.1293(0.6675) Grad Norm: 4.3957  LR: 7.4304e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [12 11  2  2 11]
preds: [11 11  2  4 11]
Epoch 17 - avg_train_loss: 0.6675  lr: 7.4304e-04  time: 50s
Epoch 17 - Score: 0.5115


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [18][0/32] Data 0.117 (0.117) Elapsed 0m 1s (remain 0m 43s) Loss: 1.5514(1.5514) Grad Norm: 9.4253  LR: 7.1508e-04  
Epoch: [18][31/32] Data 0.125 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 2.0373(0.8811) Grad Norm: 9.8787  LR: 7.1508e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [12 11  2  2 11]
preds: [11 11  0  3 11]
Epoch 18 - avg_train_loss: 0.8811  lr: 7.1508e-04  time: 50s
Epoch 18 - Score: 0.5649


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [19][0/32] Data 0.137 (0.137) Elapsed 0m 1s (remain 0m 44s) Loss: 1.2651(1.2651) Grad Norm: 6.6419  LR: 6.8627e-04  
Epoch: [19][31/32] Data 0.119 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 1.2730(0.8629) Grad Norm: 5.9632  LR: 6.8627e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [12 11  2  2 11]
preds: [11 11  2  3 11]
Epoch 19 - avg_train_loss: 0.8629  lr: 6.8627e-04  time: 50s
Epoch 19 - Score: 0.5649


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [20][0/32] Data 0.127 (0.127) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0824(0.0824) Grad Norm: 6.4004  LR: 6.5674e-04  
Epoch: [20][31/32] Data 0.124 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.1435(0.6770) Grad Norm: 4.4438  LR: 6.5674e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [12 11  2  2 11]
preds: [12 11  2  2 11]
Epoch 20 - avg_train_loss: 0.6770  lr: 6.5674e-04  time: 50s
Epoch 20 - Score: 0.5878


EVAL: [8/9] Data 0.015 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [21][0/32] Data 0.134 (0.134) Elapsed 0m 1s (remain 0m 44s) Loss: 0.1901(0.1901) Grad Norm: 7.2601  LR: 6.2661e-04  
Epoch: [21][31/32] Data 0.126 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 1.1862(0.6870) Grad Norm: 6.3211  LR: 6.2661e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [12 11  2  2 11]
preds: [11 11  2  0 11]
Epoch 21 - avg_train_loss: 0.6870  lr: 6.2661e-04  time: 50s
Epoch 21 - Score: 0.6031


EVAL: [8/9] Data 0.014 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [22][0/32] Data 0.139 (0.139) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0417(0.0417) Grad Norm: 3.2592  LR: 5.9598e-04  
Epoch: [22][31/32] Data 0.123 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0892(0.6985) Grad Norm: 4.6299  LR: 5.9598e-04  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [12 11  2  2 11]
preds: [11 11  4  7 11]
Epoch 22 - avg_train_loss: 0.6985  lr: 5.9598e-04  time: 49s
Epoch 22 - Score: 0.5954


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [23][0/32] Data 0.132 (0.132) Elapsed 0m 1s (remain 0m 43s) Loss: 1.0333(1.0333) Grad Norm: 4.8061  LR: 5.6498e-04  
Epoch: [23][31/32] Data 0.118 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.1735(0.6105) Grad Norm: 8.0720  LR: 5.6498e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [12 11  2  2 11]
preds: [ 9 11  1  8 11]
Epoch 23 - avg_train_loss: 0.6105  lr: 5.6498e-04  time: 50s
Epoch 23 - Score: 0.5725


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [24][0/32] Data 0.137 (0.137) Elapsed 0m 1s (remain 0m 44s) Loss: 0.1586(0.1586) Grad Norm: 4.1295  LR: 5.3373e-04  
Epoch: [24][31/32] Data 0.107 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 1.0406(0.6088) Grad Norm: 10.4547  LR: 5.3373e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [12 11  2  2 11]
preds: [ 4 11  4  7  5]
Epoch 24 - avg_train_loss: 0.6088  lr: 5.3373e-04  time: 49s
Epoch 24 - Score: 0.5267


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [25][0/32] Data 0.129 (0.129) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0072(0.0072) Grad Norm: 0.2158  LR: 5.0236e-04  
Epoch: [25][31/32] Data 0.127 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 0.1666(0.5065) Grad Norm: 2.9671  LR: 5.0236e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [12 11  2  2 11]
preds: [ 9 11  4  0  5]
Epoch 25 - avg_train_loss: 0.5065  lr: 5.0236e-04  time: 49s
Epoch 25 - Score: 0.5496


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [26][0/32] Data 0.134 (0.134) Elapsed 0m 1s (remain 0m 44s) Loss: 1.1342(1.1342) Grad Norm: 6.8342  LR: 4.7099e-04  
Epoch: [26][31/32] Data 0.109 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.6818(0.7357) Grad Norm: 2.6407  LR: 4.7099e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [12 11  2  2 11]
preds: [11 11  4  0 11]
Epoch 26 - avg_train_loss: 0.7357  lr: 4.7099e-04  time: 49s
Epoch 26 - Score: 0.5802


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [27][0/32] Data 0.146 (0.146) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0077(0.0077) Grad Norm: 0.8322  LR: 4.3974e-04  
Epoch: [27][31/32] Data 0.124 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0062(0.5015) Grad Norm: 0.2853  LR: 4.3974e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [12 11  2  2 11]
preds: [11 11  6  0 11]
Epoch 27 - avg_train_loss: 0.5015  lr: 4.3974e-04  time: 49s
Epoch 27 - Score: 0.5725


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [28][0/32] Data 0.122 (0.122) Elapsed 0m 1s (remain 0m 43s) Loss: 0.1164(0.1164) Grad Norm: 4.4451  LR: 4.0874e-04  
Epoch: [28][31/32] Data 0.128 (0.120) Elapsed 0m 45s (remain 0m 0s) Loss: 1.5587(0.6746) Grad Norm: 8.3678  LR: 4.0874e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [12 11  2  2 11]
preds: [11 11  6  0 11]
Epoch 28 - avg_train_loss: 0.6746  lr: 4.0874e-04  time: 49s
Epoch 28 - Score: 0.5802


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [29][0/32] Data 0.122 (0.122) Elapsed 0m 1s (remain 0m 43s) Loss: 1.0622(1.0622) Grad Norm: 5.1537  LR: 3.7811e-04  
Epoch: [29][31/32] Data 0.114 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0368(0.4911) Grad Norm: 2.4440  LR: 3.7811e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [12 11  2  2 11]
preds: [11 11  0  3 11]
Epoch 29 - avg_train_loss: 0.4911  lr: 3.7811e-04  time: 49s
Epoch 29 - Score: 0.5878


EVAL: [8/9] Data 0.013 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [30][0/32] Data 0.147 (0.147) Elapsed 0m 1s (remain 0m 44s) Loss: 0.8922(0.8922) Grad Norm: 4.2284  LR: 3.4797e-04  
Epoch: [30][31/32] Data 0.119 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.9664(0.5864) Grad Norm: 4.9399  LR: 3.4797e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [12 11  2  2 11]
preds: [ 4 11  4  7 11]
Epoch 30 - avg_train_loss: 0.5864  lr: 3.4797e-04  time: 49s
Epoch 30 - Score: 0.5878


EVAL: [8/9] Data 0.013 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [31][0/32] Data 0.145 (0.145) Elapsed 0m 1s (remain 0m 44s) Loss: 0.7662(0.7662) Grad Norm: 3.7210  LR: 3.1843e-04  
Epoch: [31][31/32] Data 0.116 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0138(0.5754) Grad Norm: 2.1284  LR: 3.1843e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [12 11  2  2 11]
preds: [ 4 11  1  3 11]
Epoch 31 - avg_train_loss: 0.5754  lr: 3.1843e-04  time: 49s
Epoch 31 - Score: 0.6031


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [32][0/32] Data 0.122 (0.122) Elapsed 0m 1s (remain 0m 43s) Loss: 0.6282(0.6282) Grad Norm: 2.3763  LR: 2.8962e-04  
Epoch: [32][31/32] Data 0.121 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0250(0.4631) Grad Norm: 2.9638  LR: 2.8962e-04  
EVAL: [0/9] Data 0.079 (0.079) Elapsed 0m 0s (remain 0m 4s) 


labels: [12 11  2  2 11]
preds: [11 11  1  3 11]
Epoch 32 - avg_train_loss: 0.4631  lr: 2.8962e-04  time: 49s
Epoch 32 - Score: 0.6183


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [33][0/32] Data 0.142 (0.142) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0073(0.0073) Grad Norm: 0.9268  LR: 2.6165e-04  
Epoch: [33][31/32] Data 0.106 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0126(0.5517) Grad Norm: 0.8099  LR: 2.6165e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [12 11  2  2 11]
preds: [ 4 11  1  3 11]
Epoch 33 - avg_train_loss: 0.5517  lr: 2.6165e-04  time: 49s
Epoch 33 - Score: 0.5878


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [34][0/32] Data 0.132 (0.132) Elapsed 0m 1s (remain 0m 43s) Loss: 0.8146(0.8146) Grad Norm: 4.8661  LR: 2.3463e-04  
Epoch: [34][31/32] Data 0.131 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.5688(0.5402) Grad Norm: 3.0247  LR: 2.3463e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [12 11  2  2 11]
preds: [ 4 11  1  3 11]
Epoch 34 - avg_train_loss: 0.5402  lr: 2.3463e-04  time: 49s
Epoch 34 - Score: 0.5954


EVAL: [8/9] Data 0.014 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [35][0/32] Data 0.140 (0.140) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0018(0.0018) Grad Norm: 0.1021  LR: 2.0866e-04  
Epoch: [35][31/32] Data 0.120 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0020(0.4868) Grad Norm: 0.0792  LR: 2.0866e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [12 11  2  2 11]
preds: [ 9 11  0  3 11]
Epoch 35 - avg_train_loss: 0.4868  lr: 2.0866e-04  time: 49s
Epoch 35 - Score: 0.6107


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [36][0/32] Data 0.128 (0.128) Elapsed 0m 1s (remain 0m 44s) Loss: 0.8078(0.8078) Grad Norm: 4.8590  LR: 1.8385e-04  
Epoch: [36][31/32] Data 0.118 (0.120) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0011(0.4782) Grad Norm: 0.0615  LR: 1.8385e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [12 11  2  2 11]
preds: [ 4 11  1  3 11]
Epoch 36 - avg_train_loss: 0.4782  lr: 1.8385e-04  time: 49s
Epoch 36 - Score: 0.6412
Epoch 36 - Save Best Score: 0.6412 Model


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [37][0/32] Data 0.150 (0.150) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0055(0.0055) Grad Norm: 0.3041  LR: 1.6030e-04  
Epoch: [37][31/32] Data 0.118 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0018(0.5430) Grad Norm: 0.3293  LR: 1.6030e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [12 11  2  2 11]
preds: [ 9 11  1  0 11]
Epoch 37 - avg_train_loss: 0.5430  lr: 1.6030e-04  time: 49s
Epoch 37 - Score: 0.6031


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [38][0/32] Data 0.130 (0.130) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0024(0.0024) Grad Norm: 0.0790  LR: 1.3809e-04  
Epoch: [38][31/32] Data 0.120 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.7739(0.4600) Grad Norm: 3.4461  LR: 1.3809e-04  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [12 11  2  2 11]
preds: [ 4 11  1  0 11]
Epoch 38 - avg_train_loss: 0.4600  lr: 1.3809e-04  time: 49s
Epoch 38 - Score: 0.5802


EVAL: [8/9] Data 0.013 (0.063) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [39][0/32] Data 0.137 (0.137) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0021(0.0021) Grad Norm: 0.0571  LR: 1.1732e-04  
Epoch: [39][31/32] Data 0.126 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0047(0.3446) Grad Norm: 0.2765  LR: 1.1732e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [12 11  2  2 11]
preds: [ 4 11  1  3 11]
Epoch 39 - avg_train_loss: 0.3446  lr: 1.1732e-04  time: 49s
Epoch 39 - Score: 0.5802


EVAL: [8/9] Data 0.013 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [40][0/32] Data 0.135 (0.135) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0011(0.0011) Grad Norm: 0.0389  LR: 9.8058e-05  
Epoch: [40][31/32] Data 0.126 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.6463(0.4211) Grad Norm: 2.8091  LR: 9.8058e-05  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [12 11  2  2 11]
preds: [11 11  1  3 11]
Epoch 40 - avg_train_loss: 0.4211  lr: 9.8058e-05  time: 49s
Epoch 40 - Score: 0.5954


EVAL: [8/9] Data 0.013 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [41][0/32] Data 0.124 (0.124) Elapsed 0m 1s (remain 0m 44s) Loss: 0.8795(0.8795) Grad Norm: 4.2399  LR: 8.0390e-05  
Epoch: [41][31/32] Data 0.118 (0.120) Elapsed 0m 45s (remain 0m 0s) Loss: 0.6631(0.5421) Grad Norm: 10.2454  LR: 8.0390e-05  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [12 11  2  2 11]
preds: [ 4 11  1  0 11]
Epoch 41 - avg_train_loss: 0.5421  lr: 8.0390e-05  time: 49s
Epoch 41 - Score: 0.5954


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [42][0/32] Data 0.122 (0.122) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0008(0.0008) Grad Norm: 0.0408  LR: 6.4381e-05  
Epoch: [42][31/32] Data 0.131 (0.120) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0008(0.4477) Grad Norm: 0.0485  LR: 6.4381e-05  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [12 11  2  2 11]
preds: [ 4 11  1  0 11]
Epoch 42 - avg_train_loss: 0.4477  lr: 6.4381e-05  time: 49s
Epoch 42 - Score: 0.6031


EVAL: [8/9] Data 0.013 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [43][0/32] Data 0.123 (0.123) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0011(0.0011) Grad Norm: 0.0380  LR: 5.0093e-05  
Epoch: [43][31/32] Data 0.124 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0024(0.4017) Grad Norm: 0.1874  LR: 5.0093e-05  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [12 11  2  2 11]
preds: [ 4 11  1  0 11]
Epoch 43 - avg_train_loss: 0.4017  lr: 5.0093e-05  time: 49s
Epoch 43 - Score: 0.5954


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [44][0/32] Data 0.135 (0.135) Elapsed 0m 1s (remain 0m 44s) Loss: 0.7109(0.7109) Grad Norm: 2.9108  LR: 3.7578e-05  
Epoch: [44][31/32] Data 0.132 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.1130(0.2817) Grad Norm: 15.1547  LR: 3.7578e-05  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [12 11  2  2 11]
preds: [ 4 11  1  0 11]
Epoch 44 - avg_train_loss: 0.2817  lr: 3.7578e-05  time: 49s
Epoch 44 - Score: 0.5878


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [45][0/32] Data 0.122 (0.122) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0004(0.0004) Grad Norm: 0.0108  LR: 2.6881e-05  
Epoch: [45][31/32] Data 0.137 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0009(0.3563) Grad Norm: 0.0399  LR: 2.6881e-05  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [12 11  2  2 11]
preds: [ 4 11  1  0 11]
Epoch 45 - avg_train_loss: 0.3563  lr: 2.6881e-05  time: 49s
Epoch 45 - Score: 0.6031


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [46][0/32] Data 0.136 (0.136) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0049(0.0049) Grad Norm: 0.2660  LR: 1.8039e-05  
Epoch: [46][31/32] Data 0.122 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0013(0.0023) Grad Norm: 0.1160  LR: 1.8039e-05  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [12 11  2  2 11]
preds: [ 4 11  1  0 11]
Epoch 46 - avg_train_loss: 0.0023  lr: 1.8039e-05  time: 49s
Epoch 46 - Score: 0.6031


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [47][0/32] Data 0.133 (0.133) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0007(0.0007) Grad Norm: 0.0209  LR: 1.1073e-05  
Epoch: [47][31/32] Data 0.118 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0014(0.0019) Grad Norm: 0.0605  LR: 1.1073e-05  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [12 11  2  2 11]
preds: [ 4 11  1  0 11]
Epoch 47 - avg_train_loss: 0.0019  lr: 1.1073e-05  time: 49s
Epoch 47 - Score: 0.6031


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [48][0/32] Data 0.133 (0.133) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0005(0.0005) Grad Norm: 0.0290  LR: 5.9882e-06  
Epoch: [48][31/32] Data 0.121 (0.121) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0004(0.0050) Grad Norm: 0.0193  LR: 5.9882e-06  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [12 11  2  2 11]
preds: [ 4 11  1  0 11]
Epoch 48 - avg_train_loss: 0.0050  lr: 5.9882e-06  time: 49s
Epoch 48 - Score: 0.5954


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [49][0/32] Data 0.124 (0.124) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0004(0.0004) Grad Norm: 0.0310  LR: 2.7534e-06  
Epoch: [49][31/32] Data 0.125 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0003(0.0075) Grad Norm: 0.0136  LR: 2.7534e-06  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [12 11  2  2 11]
preds: [ 4 11  1  0 11]
Epoch 49 - avg_train_loss: 0.0075  lr: 2.7534e-06  time: 49s
Epoch 49 - Score: 0.5878


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [50][0/32] Data 0.139 (0.139) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0038(0.0038) Grad Norm: 0.2334  LR: 1.2467e-06  
Epoch: [50][31/32] Data 0.126 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0034(0.0118) Grad Norm: 0.2186  LR: 1.2467e-06  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [12 11  2  2 11]
preds: [ 4 11  1  0 11]
Epoch 50 - avg_train_loss: 0.0118  lr: 1.2467e-06  time: 49s
Epoch 50 - Score: 0.5878


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [1][0/32] Data 0.129 (0.129) Elapsed 0m 1s (remain 0m 42s) Loss: 2.6959(2.6959) Grad Norm: inf  LR: 1.0000e-03  
Epoch: [1][31/32] Data 0.113 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 2.3183(2.4990) Grad Norm: 12.4096  LR: 1.0000e-03  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 7 4 6 2]
preds: [2 2 2 9 5]
Epoch 1 - avg_train_loss: 2.4990  lr: 1.0000e-03  time: 49s
Epoch 1 - Score: 0.2519
Epoch 1 - Save Best Score: 0.2519 Model


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [2][0/32] Data 0.135 (0.135) Elapsed 0m 1s (remain 0m 44s) Loss: 2.4302(2.4302) Grad Norm: 10.3244  LR: 9.9803e-04  
Epoch: [2][31/32] Data 0.117 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 2.3944(2.3411) Grad Norm: 7.1899  LR: 9.9803e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 7 4 6 2]
preds: [8 2 2 9 6]
Epoch 2 - avg_train_loss: 2.3411  lr: 9.9803e-04  time: 50s
Epoch 2 - Score: 0.3969
Epoch 2 - Save Best Score: 0.3969 Model


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [3][0/32] Data 0.151 (0.151) Elapsed 0m 1s (remain 0m 45s) Loss: 1.9626(1.9626) Grad Norm: 7.7010  LR: 9.9312e-04  
Epoch: [3][31/32] Data 0.119 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 1.9218(2.0557) Grad Norm: 9.7419  LR: 9.9312e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 7 4 6 2]
preds: [8 7 4 1 6]
Epoch 3 - avg_train_loss: 2.0557  lr: 9.9312e-04  time: 50s
Epoch 3 - Score: 0.4351
Epoch 3 - Save Best Score: 0.4351 Model


EVAL: [8/9] Data 0.014 (0.069) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [4][0/32] Data 0.152 (0.152) Elapsed 0m 1s (remain 0m 45s) Loss: 2.2562(2.2562) Grad Norm: 72.5009  LR: 9.8627e-04  
Epoch: [4][31/32] Data 0.126 (0.127) Elapsed 0m 45s (remain 0m 0s) Loss: 1.3809(1.8094) Grad Norm: 13.9970  LR: 9.8627e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 7 4 6 2]
preds: [8 4 4 1 6]
Epoch 4 - avg_train_loss: 1.8094  lr: 9.8627e-04  time: 50s
Epoch 4 - Score: 0.5191
Epoch 4 - Save Best Score: 0.5191 Model


EVAL: [8/9] Data 0.016 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [5][0/32] Data 0.138 (0.138) Elapsed 0m 1s (remain 0m 45s) Loss: 0.7870(0.7870) Grad Norm: 8.4407  LR: 9.7751e-04  
Epoch: [5][31/32] Data 0.126 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 1.7916(1.5053) Grad Norm: 8.8923  LR: 9.7751e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 7 4 6 2]
preds: [8 4 4 6 5]
Epoch 5 - avg_train_loss: 1.5053  lr: 9.7751e-04  time: 50s
Epoch 5 - Score: 0.5573
Epoch 5 - Save Best Score: 0.5573 Model


EVAL: [8/9] Data 0.013 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [6][0/32] Data 0.124 (0.124) Elapsed 0m 1s (remain 0m 44s) Loss: 1.7690(1.7690) Grad Norm: 9.1179  LR: 9.6688e-04  
Epoch: [6][31/32] Data 0.115 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 1.6309(1.4177) Grad Norm: 15.3781  LR: 9.6688e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 7 4 6 2]
preds: [8 4 2 9 6]
Epoch 6 - avg_train_loss: 1.4177  lr: 9.6688e-04  time: 50s
Epoch 6 - Score: 0.5038


EVAL: [8/9] Data 0.014 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [7][0/32] Data 0.131 (0.131) Elapsed 0m 1s (remain 0m 43s) Loss: 0.8753(0.8753) Grad Norm: 12.7587  LR: 9.5441e-04  
Epoch: [7][31/32] Data 0.125 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.8968(1.1704) Grad Norm: 13.5069  LR: 9.5441e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 7 4 6 2]
preds: [8 4 3 6 6]
Epoch 7 - avg_train_loss: 1.1704  lr: 9.5441e-04  time: 50s
Epoch 7 - Score: 0.5496


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [8][0/32] Data 0.136 (0.136) Elapsed 0m 1s (remain 0m 44s) Loss: 1.2156(1.2156) Grad Norm: 7.7409  LR: 9.4016e-04  
Epoch: [8][31/32] Data 0.119 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 1.6501(1.1374) Grad Norm: 8.4476  LR: 9.4016e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 7 4 6 2]
preds: [8 2 3 3 2]
Epoch 8 - avg_train_loss: 1.1374  lr: 9.4016e-04  time: 50s
Epoch 8 - Score: 0.5878
Epoch 8 - Save Best Score: 0.5878 Model


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [9][0/32] Data 0.149 (0.149) Elapsed 0m 1s (remain 0m 45s) Loss: 0.4024(0.4024) Grad Norm: 9.4966  LR: 9.2418e-04  
Epoch: [9][31/32] Data 0.129 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 1.2298(1.1491) Grad Norm: 7.5723  LR: 9.2418e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 7 4 6 2]
preds: [ 8  4 10  6  6]
Epoch 9 - avg_train_loss: 1.1491  lr: 9.2418e-04  time: 50s
Epoch 9 - Score: 0.5496


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [10][0/32] Data 0.138 (0.138) Elapsed 0m 1s (remain 0m 44s) Loss: 1.5800(1.5800) Grad Norm: 6.4410  LR: 9.0654e-04  
Epoch: [10][31/32] Data 0.132 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.5036(0.8839) Grad Norm: 7.3451  LR: 9.0654e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 7 4 6 2]
preds: [8 2 2 6 5]
Epoch 10 - avg_train_loss: 0.8839  lr: 9.0654e-04  time: 50s
Epoch 10 - Score: 0.5954
Epoch 10 - Save Best Score: 0.5954 Model


EVAL: [8/9] Data 0.017 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [11][0/32] Data 0.130 (0.130) Elapsed 0m 1s (remain 0m 44s) Loss: 1.6320(1.6320) Grad Norm: 7.1247  LR: 8.8730e-04  
Epoch: [11][31/32] Data 0.129 (0.127) Elapsed 0m 45s (remain 0m 0s) Loss: 2.0311(0.9296) Grad Norm: 9.3466  LR: 8.8730e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 7 4 6 2]
preds: [8 7 2 9 5]
Epoch 11 - avg_train_loss: 0.9296  lr: 8.8730e-04  time: 50s
Epoch 11 - Score: 0.5725


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [12][0/32] Data 0.126 (0.126) Elapsed 0m 1s (remain 0m 43s) Loss: 1.8675(1.8675) Grad Norm: 30.6822  LR: 8.6655e-04  
Epoch: [12][31/32] Data 0.132 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.4604(1.1264) Grad Norm: 10.6426  LR: 8.6655e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 7 4 6 2]
preds: [8 2 4 9 5]
Epoch 12 - avg_train_loss: 1.1264  lr: 8.6655e-04  time: 50s
Epoch 12 - Score: 0.6260
Epoch 12 - Save Best Score: 0.6260 Model


EVAL: [8/9] Data 0.013 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [13][0/32] Data 0.153 (0.153) Elapsed 0m 1s (remain 0m 45s) Loss: 1.6539(1.6539) Grad Norm: 7.2403  LR: 8.4436e-04  
Epoch: [13][31/32] Data 0.119 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 1.8099(0.7732) Grad Norm: 10.5421  LR: 8.4436e-04  
EVAL: [0/9] Data 0.078 (0.078) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 7 4 6 2]
preds: [8 7 9 3 5]
Epoch 13 - avg_train_loss: 0.7732  lr: 8.4436e-04  time: 50s
Epoch 13 - Score: 0.5038


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [14][0/32] Data 0.133 (0.133) Elapsed 0m 1s (remain 0m 44s) Loss: 0.9910(0.9910) Grad Norm: 17.8983  LR: 8.2081e-04  
Epoch: [14][31/32] Data 0.128 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 0.3357(1.0415) Grad Norm: 8.8827  LR: 8.2081e-04  
EVAL: [0/9] Data 0.078 (0.078) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 7 4 6 2]
preds: [8 7 3 6 5]
Epoch 14 - avg_train_loss: 1.0415  lr: 8.2081e-04  time: 50s
Epoch 14 - Score: 0.6336
Epoch 14 - Save Best Score: 0.6336 Model


EVAL: [8/9] Data 0.014 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [15][0/32] Data 0.138 (0.138) Elapsed 0m 1s (remain 0m 45s) Loss: 1.6189(1.6189) Grad Norm: 10.0345  LR: 7.9601e-04  
Epoch: [15][31/32] Data 0.123 (0.127) Elapsed 0m 45s (remain 0m 0s) Loss: 1.1554(0.9921) Grad Norm: 9.0133  LR: 7.9601e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 7 4 6 2]
preds: [8 4 4 6 1]
Epoch 15 - avg_train_loss: 0.9921  lr: 7.9601e-04  time: 50s
Epoch 15 - Score: 0.6718
Epoch 15 - Save Best Score: 0.6718 Model


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [16][0/32] Data 0.148 (0.148) Elapsed 0m 1s (remain 0m 44s) Loss: 0.1920(0.1920) Grad Norm: 14.8242  LR: 7.7006e-04  
Epoch: [16][31/32] Data 0.120 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.1414(0.8290) Grad Norm: 8.6865  LR: 7.7006e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 7 4 6 2]
preds: [8 4 3 3 1]
Epoch 16 - avg_train_loss: 0.8290  lr: 7.7006e-04  time: 50s
Epoch 16 - Score: 0.5878


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [17][0/32] Data 0.134 (0.134) Elapsed 0m 1s (remain 0m 44s) Loss: 0.3531(0.3531) Grad Norm: 9.9417  LR: 7.4304e-04  
Epoch: [17][31/32] Data 0.117 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.9332(0.7654) Grad Norm: 18.3246  LR: 7.4304e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 7 4 6 2]
preds: [8 2 3 9 1]
Epoch 17 - avg_train_loss: 0.7654  lr: 7.4304e-04  time: 49s
Epoch 17 - Score: 0.5573


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [18][0/32] Data 0.141 (0.141) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0309(0.0309) Grad Norm: 1.7156  LR: 7.1508e-04  
Epoch: [18][31/32] Data 0.126 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 1.4079(0.6760) Grad Norm: 5.7904  LR: 7.1508e-04  
EVAL: [0/9] Data 0.075 (0.075) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 7 4 6 2]
preds: [8 2 4 0 5]
Epoch 18 - avg_train_loss: 0.6760  lr: 7.1508e-04  time: 50s
Epoch 18 - Score: 0.6412


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [19][0/32] Data 0.127 (0.127) Elapsed 0m 1s (remain 0m 43s) Loss: 0.9049(0.9049) Grad Norm: 8.8108  LR: 6.8627e-04  
Epoch: [19][31/32] Data 0.120 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0765(0.9009) Grad Norm: 3.2933  LR: 6.8627e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 7 4 6 2]
preds: [8 4 4 5 0]
Epoch 19 - avg_train_loss: 0.9009  lr: 6.8627e-04  time: 50s
Epoch 19 - Score: 0.5725


EVAL: [8/9] Data 0.013 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [20][0/32] Data 0.116 (0.116) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0371(0.0371) Grad Norm: 1.5444  LR: 6.5674e-04  
Epoch: [20][31/32] Data 0.128 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0815(0.4965) Grad Norm: 3.2449  LR: 6.5674e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 7 4 6 2]
preds: [8 4 4 0 5]
Epoch 20 - avg_train_loss: 0.4965  lr: 6.5674e-04  time: 49s
Epoch 20 - Score: 0.5954


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [21][0/32] Data 0.144 (0.144) Elapsed 0m 1s (remain 0m 44s) Loss: 1.2623(1.2623) Grad Norm: 12.0436  LR: 6.2661e-04  
Epoch: [21][31/32] Data 0.138 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 0.3298(0.8156) Grad Norm: 14.6479  LR: 6.2661e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 7 4 6 2]
preds: [ 8  2 11  3 11]
Epoch 21 - avg_train_loss: 0.8156  lr: 6.2661e-04  time: 50s
Epoch 21 - Score: 0.5496


EVAL: [8/9] Data 0.015 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [22][0/32] Data 0.150 (0.150) Elapsed 0m 1s (remain 0m 44s) Loss: 1.3929(1.3929) Grad Norm: 10.2263  LR: 5.9598e-04  
Epoch: [22][31/32] Data 0.139 (0.127) Elapsed 0m 45s (remain 0m 0s) Loss: 1.5635(0.7753) Grad Norm: 17.0438  LR: 5.9598e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 7 4 6 2]
preds: [8 4 4 0 1]
Epoch 22 - avg_train_loss: 0.7753  lr: 5.9598e-04  time: 50s
Epoch 22 - Score: 0.6336


EVAL: [8/9] Data 0.013 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [23][0/32] Data 0.134 (0.134) Elapsed 0m 1s (remain 0m 43s) Loss: 0.7425(0.7425) Grad Norm: 4.1598  LR: 5.6498e-04  
Epoch: [23][31/32] Data 0.124 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0137(0.7245) Grad Norm: 0.6139  LR: 5.6498e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 7 4 6 2]
preds: [8 7 4 0 6]
Epoch 23 - avg_train_loss: 0.7245  lr: 5.6498e-04  time: 49s
Epoch 23 - Score: 0.6336


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [24][0/32] Data 0.138 (0.138) Elapsed 0m 1s (remain 0m 44s) Loss: 1.3092(1.3092) Grad Norm: 5.8395  LR: 5.3373e-04  
Epoch: [24][31/32] Data 0.138 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0835(0.5383) Grad Norm: 3.7435  LR: 5.3373e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 7 4 6 2]
preds: [8 4 3 0 6]
Epoch 24 - avg_train_loss: 0.5383  lr: 5.3373e-04  time: 49s
Epoch 24 - Score: 0.5038


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [25][0/32] Data 0.138 (0.138) Elapsed 0m 1s (remain 0m 44s) Loss: 1.0037(1.0037) Grad Norm: 5.0742  LR: 5.0236e-04  
Epoch: [25][31/32] Data 0.113 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.1264(0.6549) Grad Norm: 7.0517  LR: 5.0236e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 7 4 6 2]
preds: [8 4 4 0 1]
Epoch 25 - avg_train_loss: 0.6549  lr: 5.0236e-04  time: 50s
Epoch 25 - Score: 0.6412


EVAL: [8/9] Data 0.015 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [26][0/32] Data 0.140 (0.140) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0352(0.0352) Grad Norm: 3.1569  LR: 4.7099e-04  
Epoch: [26][31/32] Data 0.135 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0211(0.5210) Grad Norm: 1.6287  LR: 4.7099e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 7 4 6 2]
preds: [ 8  2 11  0  5]
Epoch 26 - avg_train_loss: 0.5210  lr: 4.7099e-04  time: 49s
Epoch 26 - Score: 0.5954


EVAL: [8/9] Data 0.013 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [27][0/32] Data 0.119 (0.119) Elapsed 0m 1s (remain 0m 43s) Loss: 0.7981(0.7981) Grad Norm: 3.3042  LR: 4.3974e-04  
Epoch: [27][31/32] Data 0.116 (0.120) Elapsed 0m 45s (remain 0m 0s) Loss: 1.1878(0.6029) Grad Norm: 5.9894  LR: 4.3974e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 7 4 6 2]
preds: [8 4 4 0 6]
Epoch 27 - avg_train_loss: 0.6029  lr: 4.3974e-04  time: 49s
Epoch 27 - Score: 0.6183


EVAL: [8/9] Data 0.013 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [28][0/32] Data 0.135 (0.135) Elapsed 0m 1s (remain 0m 44s) Loss: 0.8788(0.8788) Grad Norm: 4.5742  LR: 4.0874e-04  
Epoch: [28][31/32] Data 0.127 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0476(0.3662) Grad Norm: 4.4091  LR: 4.0874e-04  
EVAL: [0/9] Data 0.077 (0.077) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 7 4 6 2]
preds: [8 4 4 0 5]
Epoch 28 - avg_train_loss: 0.3662  lr: 4.0874e-04  time: 49s
Epoch 28 - Score: 0.6641


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [29][0/32] Data 0.129 (0.129) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0042(0.0042) Grad Norm: 0.1717  LR: 3.7811e-04  
Epoch: [29][31/32] Data 0.123 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.8781(0.5998) Grad Norm: 5.0885  LR: 3.7811e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 7 4 6 2]
preds: [ 8  4  4 11  6]
Epoch 29 - avg_train_loss: 0.5998  lr: 3.7811e-04  time: 49s
Epoch 29 - Score: 0.5878


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [30][0/32] Data 0.124 (0.124) Elapsed 0m 1s (remain 0m 44s) Loss: 0.1698(0.1698) Grad Norm: 10.5887  LR: 3.4797e-04  
Epoch: [30][31/32] Data 0.130 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.8254(0.5178) Grad Norm: 3.3096  LR: 3.4797e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 7 4 6 2]
preds: [ 8  4 11  0  5]
Epoch 30 - avg_train_loss: 0.5178  lr: 3.4797e-04  time: 49s
Epoch 30 - Score: 0.6412


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [31][0/32] Data 0.143 (0.143) Elapsed 0m 1s (remain 0m 44s) Loss: 0.7726(0.7726) Grad Norm: 6.2742  LR: 3.1843e-04  
Epoch: [31][31/32] Data 0.128 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0067(0.4473) Grad Norm: 0.4109  LR: 3.1843e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 7 4 6 2]
preds: [8 4 4 7 1]
Epoch 31 - avg_train_loss: 0.4473  lr: 3.1843e-04  time: 49s
Epoch 31 - Score: 0.6412


EVAL: [8/9] Data 0.013 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [32][0/32] Data 0.129 (0.129) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0070(0.0070) Grad Norm: 0.3029  LR: 2.8962e-04  
Epoch: [32][31/32] Data 0.124 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.7124(0.4199) Grad Norm: 23.3320  LR: 2.8962e-04  
EVAL: [0/9] Data 0.075 (0.075) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 7 4 6 2]
preds: [8 4 4 6 5]
Epoch 32 - avg_train_loss: 0.4199  lr: 2.8962e-04  time: 49s
Epoch 32 - Score: 0.6641


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [33][0/32] Data 0.121 (0.121) Elapsed 0m 1s (remain 0m 43s) Loss: 0.7600(0.7600) Grad Norm: 2.5842  LR: 2.6165e-04  
Epoch: [33][31/32] Data 0.120 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0023(0.4865) Grad Norm: 0.1159  LR: 2.6165e-04  
EVAL: [0/9] Data 0.076 (0.076) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 7 4 6 2]
preds: [8 4 4 6 5]
Epoch 33 - avg_train_loss: 0.4865  lr: 2.6165e-04  time: 49s
Epoch 33 - Score: 0.5878


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [34][0/32] Data 0.140 (0.140) Elapsed 0m 1s (remain 0m 44s) Loss: 1.0964(1.0964) Grad Norm: 5.9370  LR: 2.3463e-04  
Epoch: [34][31/32] Data 0.116 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0026(0.4627) Grad Norm: 0.2175  LR: 2.3463e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 7 4 6 2]
preds: [8 4 2 6 5]
Epoch 34 - avg_train_loss: 0.4627  lr: 2.3463e-04  time: 49s
Epoch 34 - Score: 0.6412


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [35][0/32] Data 0.134 (0.134) Elapsed 0m 1s (remain 0m 44s) Loss: 0.9073(0.9073) Grad Norm: 7.3654  LR: 2.0866e-04  
Epoch: [35][31/32] Data 0.121 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.6868(0.3679) Grad Norm: 2.5526  LR: 2.0866e-04  
EVAL: [0/9] Data 0.077 (0.077) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 7 4 6 2]
preds: [8 4 4 6 5]
Epoch 35 - avg_train_loss: 0.3679  lr: 2.0866e-04  time: 49s
Epoch 35 - Score: 0.6489


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [36][0/32] Data 0.148 (0.148) Elapsed 0m 1s (remain 0m 44s) Loss: 1.0814(1.0814) Grad Norm: 6.8891  LR: 1.8385e-04  
Epoch: [36][31/32] Data 0.116 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0011(0.4002) Grad Norm: 0.1149  LR: 1.8385e-04  
EVAL: [0/9] Data 0.075 (0.075) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 7 4 6 2]
preds: [8 4 4 0 5]
Epoch 36 - avg_train_loss: 0.4002  lr: 1.8385e-04  time: 49s
Epoch 36 - Score: 0.6641


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [37][0/32] Data 0.136 (0.136) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0013(0.0013) Grad Norm: 0.0649  LR: 1.6030e-04  
Epoch: [37][31/32] Data 0.118 (0.128) Elapsed 0m 45s (remain 0m 0s) Loss: 0.8684(0.3368) Grad Norm: 4.3490  LR: 1.6030e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 7 4 6 2]
preds: [8 4 4 6 5]
Epoch 37 - avg_train_loss: 0.3368  lr: 1.6030e-04  time: 50s
Epoch 37 - Score: 0.6565


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [38][0/32] Data 0.147 (0.147) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0010(0.0010) Grad Norm: 0.0386  LR: 1.3809e-04  
Epoch: [38][31/32] Data 0.126 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0003(0.3073) Grad Norm: 0.0180  LR: 1.3809e-04  
EVAL: [0/9] Data 0.075 (0.075) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 7 4 6 2]
preds: [8 4 4 6 5]
Epoch 38 - avg_train_loss: 0.3073  lr: 1.3809e-04  time: 49s
Epoch 38 - Score: 0.6641


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [39][0/32] Data 0.137 (0.137) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0018(0.0018) Grad Norm: 0.2246  LR: 1.1732e-04  
Epoch: [39][31/32] Data 0.120 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.7383(0.4803) Grad Norm: 5.8608  LR: 1.1732e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 7 4 6 2]
preds: [8 4 4 6 5]
Epoch 39 - avg_train_loss: 0.4803  lr: 1.1732e-04  time: 49s
Epoch 39 - Score: 0.6870
Epoch 39 - Save Best Score: 0.6870 Model


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [40][0/32] Data 0.127 (0.127) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0023(0.0023) Grad Norm: 0.2091  LR: 9.8058e-05  
Epoch: [40][31/32] Data 0.123 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0010(0.4388) Grad Norm: 0.1262  LR: 9.8058e-05  
EVAL: [0/9] Data 0.075 (0.075) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 7 4 6 2]
preds: [8 4 4 6 5]
Epoch 40 - avg_train_loss: 0.4388  lr: 9.8058e-05  time: 49s
Epoch 40 - Score: 0.7023
Epoch 40 - Save Best Score: 0.7023 Model


EVAL: [8/9] Data 0.013 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [41][0/32] Data 0.135 (0.135) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0052(0.0052) Grad Norm: 0.3235  LR: 8.0390e-05  
Epoch: [41][31/32] Data 0.115 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 0.7966(0.4077) Grad Norm: 3.8168  LR: 8.0390e-05  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 7 4 6 2]
preds: [8 4 4 0 5]
Epoch 41 - avg_train_loss: 0.4077  lr: 8.0390e-05  time: 49s
Epoch 41 - Score: 0.6794


EVAL: [8/9] Data 0.014 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [42][0/32] Data 0.145 (0.145) Elapsed 0m 1s (remain 0m 44s) Loss: 0.5324(0.5324) Grad Norm: 2.5449  LR: 6.4381e-05  
Epoch: [42][31/32] Data 0.125 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.5697(0.3607) Grad Norm: 3.1355  LR: 6.4381e-05  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 7 4 6 2]
preds: [8 4 4 0 5]
Epoch 42 - avg_train_loss: 0.3607  lr: 6.4381e-05  time: 49s
Epoch 42 - Score: 0.6794


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [43][0/32] Data 0.135 (0.135) Elapsed 0m 1s (remain 0m 44s) Loss: 0.7318(0.7318) Grad Norm: 2.7846  LR: 5.0093e-05  
Epoch: [43][31/32] Data 0.115 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0003(0.4160) Grad Norm: 0.0157  LR: 5.0093e-05  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 7 4 6 2]
preds: [8 4 4 0 5]
Epoch 43 - avg_train_loss: 0.4160  lr: 5.0093e-05  time: 49s
Epoch 43 - Score: 0.6947


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [44][0/32] Data 0.136 (0.136) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0011(0.0011) Grad Norm: 0.1496  LR: 3.7578e-05  
Epoch: [44][31/32] Data 0.127 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0011(0.4369) Grad Norm: 0.0562  LR: 3.7578e-05  
EVAL: [0/9] Data 0.075 (0.075) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 7 4 6 2]
preds: [8 4 4 0 5]
Epoch 44 - avg_train_loss: 0.4369  lr: 3.7578e-05  time: 49s
Epoch 44 - Score: 0.6947


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [45][0/32] Data 0.119 (0.119) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0002(0.0002) Grad Norm: 0.0866  LR: 2.6881e-05  
Epoch: [45][31/32] Data 0.125 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0004(0.3399) Grad Norm: 0.0183  LR: 2.6881e-05  
EVAL: [0/9] Data 0.080 (0.080) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 7 4 6 2]
preds: [8 4 4 0 5]
Epoch 45 - avg_train_loss: 0.3399  lr: 2.6881e-05  time: 49s
Epoch 45 - Score: 0.6794


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [46][0/32] Data 0.127 (0.127) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0003(0.0003) Grad Norm: 0.0151  LR: 1.8039e-05  
Epoch: [46][31/32] Data 0.121 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0003(0.0143) Grad Norm: 0.0078  LR: 1.8039e-05  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 7 4 6 2]
preds: [8 4 4 6 5]
Epoch 46 - avg_train_loss: 0.0143  lr: 1.8039e-05  time: 49s
Epoch 46 - Score: 0.6870


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [47][0/32] Data 0.118 (0.118) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0005(0.0005) Grad Norm: 0.0165  LR: 1.1073e-05  
Epoch: [47][31/32] Data 0.116 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0003(0.0006) Grad Norm: 0.0114  LR: 1.1073e-05  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 7 4 6 2]
preds: [8 4 4 6 5]
Epoch 47 - avg_train_loss: 0.0006  lr: 1.1073e-05  time: 49s
Epoch 47 - Score: 0.6870


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [48][0/32] Data 0.133 (0.133) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0009(0.0009) Grad Norm: 0.0991  LR: 5.9882e-06  
Epoch: [48][31/32] Data 0.129 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0002(0.0021) Grad Norm: 0.0186  LR: 5.9882e-06  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 7 4 6 2]
preds: [8 4 4 6 5]
Epoch 48 - avg_train_loss: 0.0021  lr: 5.9882e-06  time: 49s
Epoch 48 - Score: 0.6870


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [49][0/32] Data 0.145 (0.145) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0043(0.0043) Grad Norm: 0.3823  LR: 2.7534e-06  
Epoch: [49][31/32] Data 0.125 (0.127) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0004(0.0011) Grad Norm: 0.0396  LR: 2.7534e-06  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 7 4 6 2]
preds: [8 4 4 6 5]
Epoch 49 - avg_train_loss: 0.0011  lr: 2.7534e-06  time: 49s
Epoch 49 - Score: 0.6870


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [50][0/32] Data 0.144 (0.144) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0005(0.0005) Grad Norm: 0.0237  LR: 1.2467e-06  
Epoch: [50][31/32] Data 0.116 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0002(0.0016) Grad Norm: 0.0062  LR: 1.2467e-06  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 7 4 6 2]
preds: [8 4 4 6 5]
Epoch 50 - avg_train_loss: 0.0016  lr: 1.2467e-06  time: 49s
Epoch 50 - Score: 0.6870


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [1][0/32] Data 0.134 (0.134) Elapsed 0m 1s (remain 0m 42s) Loss: 2.6890(2.6890) Grad Norm: inf  LR: 1.0000e-03  
Epoch: [1][31/32] Data 0.116 (0.122) Elapsed 0m 44s (remain 0m 0s) Loss: 2.6287(2.4969) Grad Norm: 16.9896  LR: 1.0000e-03  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2  0  6 11]
preds: [2 5 2 2 2]
Epoch 1 - avg_train_loss: 2.4969  lr: 1.0000e-03  time: 49s
Epoch 1 - Score: 0.2214
Epoch 1 - Save Best Score: 0.2214 Model


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [2][0/32] Data 0.135 (0.135) Elapsed 0m 1s (remain 0m 44s) Loss: 2.2584(2.2584) Grad Norm: 17.9357  LR: 9.9803e-04  
Epoch: [2][31/32] Data 0.129 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 2.3209(2.3251) Grad Norm: 11.9732  LR: 9.9803e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2  0  6 11]
preds: [ 6  5 11  6  6]
Epoch 2 - avg_train_loss: 2.3251  lr: 9.9803e-04  time: 49s
Epoch 2 - Score: 0.2366
Epoch 2 - Save Best Score: 0.2366 Model


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [3][0/32] Data 0.157 (0.157) Elapsed 0m 1s (remain 0m 45s) Loss: 2.3601(2.3601) Grad Norm: 11.2178  LR: 9.9312e-04  
Epoch: [3][31/32] Data 0.112 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 2.1518(2.0646) Grad Norm: 10.1023  LR: 9.9312e-04  
EVAL: [0/9] Data 0.077 (0.077) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2  0  6 11]
preds: [ 5  5 11  5 10]
Epoch 3 - avg_train_loss: 2.0646  lr: 9.9312e-04  time: 50s
Epoch 3 - Score: 0.4733
Epoch 3 - Save Best Score: 0.4733 Model


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [4][0/32] Data 0.142 (0.142) Elapsed 0m 1s (remain 0m 44s) Loss: 1.6245(1.6245) Grad Norm: 12.8054  LR: 9.8627e-04  
Epoch: [4][31/32] Data 0.125 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 1.5444(1.7413) Grad Norm: 19.1331  LR: 9.8627e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2  0  6 11]
preds: [5 5 0 0 0]
Epoch 4 - avg_train_loss: 1.7413  lr: 9.8627e-04  time: 50s
Epoch 4 - Score: 0.5038
Epoch 4 - Save Best Score: 0.5038 Model


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [5][0/32] Data 0.140 (0.140) Elapsed 0m 1s (remain 0m 44s) Loss: 1.9112(1.9112) Grad Norm: 9.4867  LR: 9.7751e-04  
Epoch: [5][31/32] Data 0.127 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 1.9267(1.4865) Grad Norm: 8.3362  LR: 9.7751e-04  
EVAL: [0/9] Data 0.079 (0.079) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2  0  6 11]
preds: [ 5  5  3  5 10]
Epoch 5 - avg_train_loss: 1.4865  lr: 9.7751e-04  time: 50s
Epoch 5 - Score: 0.4656


EVAL: [8/9] Data 0.014 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [6][0/32] Data 0.141 (0.141) Elapsed 0m 1s (remain 0m 44s) Loss: 0.8238(0.8238) Grad Norm: 9.8809  LR: 9.6688e-04  
Epoch: [6][31/32] Data 0.126 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 1.1952(1.4348) Grad Norm: 8.6941  LR: 9.6688e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2  0  6 11]
preds: [ 5  5  0  5 11]
Epoch 6 - avg_train_loss: 1.4348  lr: 9.6688e-04  time: 49s
Epoch 6 - Score: 0.5038


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [7][0/32] Data 0.135 (0.135) Elapsed 0m 1s (remain 0m 44s) Loss: 2.1528(2.1528) Grad Norm: 12.5256  LR: 9.5441e-04  
Epoch: [7][31/32] Data 0.123 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 1.2373(1.1186) Grad Norm: 22.6871  LR: 9.5441e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2  0  6 11]
preds: [ 5  5  0  5 11]
Epoch 7 - avg_train_loss: 1.1186  lr: 9.5441e-04  time: 50s
Epoch 7 - Score: 0.5649
Epoch 7 - Save Best Score: 0.5649 Model


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [8][0/32] Data 0.137 (0.137) Elapsed 0m 1s (remain 0m 45s) Loss: 1.8271(1.8271) Grad Norm: 11.4166  LR: 9.4016e-04  
Epoch: [8][31/32] Data 0.125 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 1.8864(1.1923) Grad Norm: 7.0670  LR: 9.4016e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2  0  6 11]
preds: [ 5  5  0  6 10]
Epoch 8 - avg_train_loss: 1.1923  lr: 9.4016e-04  time: 50s
Epoch 8 - Score: 0.5496


EVAL: [8/9] Data 0.013 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [9][0/32] Data 0.138 (0.138) Elapsed 0m 1s (remain 0m 44s) Loss: 0.4089(0.4089) Grad Norm: 9.1432  LR: 9.2418e-04  
Epoch: [9][31/32] Data 0.129 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 1.5260(1.1071) Grad Norm: 8.6338  LR: 9.2418e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2  0  6 11]
preds: [11  0  0  0 10]
Epoch 9 - avg_train_loss: 1.1071  lr: 9.2418e-04  time: 49s
Epoch 9 - Score: 0.4504


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [10][0/32] Data 0.123 (0.123) Elapsed 0m 1s (remain 0m 43s) Loss: 1.7981(1.7981) Grad Norm: 11.9667  LR: 9.0654e-04  
Epoch: [10][31/32] Data 0.124 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 1.7992(1.0632) Grad Norm: 8.1236  LR: 9.0654e-04  
EVAL: [0/9] Data 0.075 (0.075) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2  0  6 11]
preds: [ 5  5  0  6 10]
Epoch 10 - avg_train_loss: 1.0632  lr: 9.0654e-04  time: 50s
Epoch 10 - Score: 0.5878
Epoch 10 - Save Best Score: 0.5878 Model


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [11][0/32] Data 0.144 (0.144) Elapsed 0m 1s (remain 0m 44s) Loss: 0.2263(0.2263) Grad Norm: 5.2438  LR: 8.8730e-04  
Epoch: [11][31/32] Data 0.119 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 1.2140(1.1268) Grad Norm: 8.9245  LR: 8.8730e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2  0  6 11]
preds: [ 5 12  0  0  8]
Epoch 11 - avg_train_loss: 1.1268  lr: 8.8730e-04  time: 50s
Epoch 11 - Score: 0.5802


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [12][0/32] Data 0.128 (0.128) Elapsed 0m 1s (remain 0m 43s) Loss: 1.5088(1.5088) Grad Norm: 8.7706  LR: 8.6655e-04  
Epoch: [12][31/32] Data 0.133 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 1.5334(1.0914) Grad Norm: 11.3219  LR: 8.6655e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2  0  6 11]
preds: [ 5  5  0  0 11]
Epoch 12 - avg_train_loss: 1.0914  lr: 8.6655e-04  time: 49s
Epoch 12 - Score: 0.5802


EVAL: [8/9] Data 0.015 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [13][0/32] Data 0.119 (0.119) Elapsed 0m 1s (remain 0m 44s) Loss: 1.4213(1.4213) Grad Norm: 7.9079  LR: 8.4436e-04  
Epoch: [13][31/32] Data 0.107 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 1.3965(0.9467) Grad Norm: 6.2446  LR: 8.4436e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2  0  6 11]
preds: [5 5 0 0 6]
Epoch 13 - avg_train_loss: 0.9467  lr: 8.4436e-04  time: 49s
Epoch 13 - Score: 0.6336
Epoch 13 - Save Best Score: 0.6336 Model


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [14][0/32] Data 0.152 (0.152) Elapsed 0m 1s (remain 0m 45s) Loss: 1.4096(1.4096) Grad Norm: 9.9561  LR: 8.2081e-04  
Epoch: [14][31/32] Data 0.122 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 1.2895(1.0083) Grad Norm: 12.6068  LR: 8.2081e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2  0  6 11]
preds: [ 2  5  0  6 10]
Epoch 14 - avg_train_loss: 1.0083  lr: 8.2081e-04  time: 50s
Epoch 14 - Score: 0.5802


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [15][0/32] Data 0.139 (0.139) Elapsed 0m 1s (remain 0m 44s) Loss: 0.3556(0.3556) Grad Norm: 6.6057  LR: 7.9601e-04  
Epoch: [15][31/32] Data 0.135 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 1.0795(1.0149) Grad Norm: 11.9377  LR: 7.9601e-04  
EVAL: [0/9] Data 0.077 (0.077) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2  0  6 11]
preds: [ 5  5  0  0 10]
Epoch 15 - avg_train_loss: 1.0149  lr: 7.9601e-04  time: 50s
Epoch 15 - Score: 0.6260


EVAL: [8/9] Data 0.013 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [16][0/32] Data 0.145 (0.145) Elapsed 0m 1s (remain 0m 44s) Loss: 1.1702(1.1702) Grad Norm: 5.2827  LR: 7.7006e-04  
Epoch: [16][31/32] Data 0.109 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0500(0.6919) Grad Norm: 3.1300  LR: 7.7006e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2  0  6 11]
preds: [ 5  5  0  0 10]
Epoch 16 - avg_train_loss: 0.6919  lr: 7.7006e-04  time: 49s
Epoch 16 - Score: 0.6336


EVAL: [8/9] Data 0.015 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [17][0/32] Data 0.129 (0.129) Elapsed 0m 1s (remain 0m 44s) Loss: 1.8656(1.8656) Grad Norm: 12.5592  LR: 7.4304e-04  
Epoch: [17][31/32] Data 0.118 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0951(0.7694) Grad Norm: 7.3193  LR: 7.4304e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2  0  6 11]
preds: [ 5  5  0  5 11]
Epoch 17 - avg_train_loss: 0.7694  lr: 7.4304e-04  time: 49s
Epoch 17 - Score: 0.5420


EVAL: [8/9] Data 0.013 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [18][0/32] Data 0.133 (0.133) Elapsed 0m 1s (remain 0m 44s) Loss: 1.3677(1.3677) Grad Norm: 8.2186  LR: 7.1508e-04  
Epoch: [18][31/32] Data 0.128 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 1.2481(0.8241) Grad Norm: 4.8126  LR: 7.1508e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2  0  6 11]
preds: [5 5 0 0 6]
Epoch 18 - avg_train_loss: 0.8241  lr: 7.1508e-04  time: 49s
Epoch 18 - Score: 0.5954


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [19][0/32] Data 0.122 (0.122) Elapsed 0m 1s (remain 0m 43s) Loss: 1.1096(1.1096) Grad Norm: 6.6258  LR: 6.8627e-04  
Epoch: [19][31/32] Data 0.120 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 1.3343(0.8502) Grad Norm: 7.9476  LR: 6.8627e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2  0  6 11]
preds: [ 3  5  0  0 11]
Epoch 19 - avg_train_loss: 0.8502  lr: 6.8627e-04  time: 49s
Epoch 19 - Score: 0.5649


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [20][0/32] Data 0.131 (0.131) Elapsed 0m 1s (remain 0m 43s) Loss: 1.3815(1.3815) Grad Norm: 6.5737  LR: 6.5674e-04  
Epoch: [20][31/32] Data 0.129 (0.119) Elapsed 0m 45s (remain 0m 0s) Loss: 0.2720(0.6286) Grad Norm: 11.2275  LR: 6.5674e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2  0  6 11]
preds: [ 3  5  0  0 11]
Epoch 20 - avg_train_loss: 0.6286  lr: 6.5674e-04  time: 49s
Epoch 20 - Score: 0.6336


EVAL: [8/9] Data 0.013 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [21][0/32] Data 0.135 (0.135) Elapsed 0m 1s (remain 0m 43s) Loss: 0.1221(0.1221) Grad Norm: 4.3253  LR: 6.2661e-04  
Epoch: [21][31/32] Data 0.129 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 0.2410(0.5836) Grad Norm: 6.9587  LR: 6.2661e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2  0  6 11]
preds: [ 5  5 11  5  7]
Epoch 21 - avg_train_loss: 0.5836  lr: 6.2661e-04  time: 49s
Epoch 21 - Score: 0.5496


EVAL: [8/9] Data 0.016 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [22][0/32] Data 0.139 (0.139) Elapsed 0m 1s (remain 0m 44s) Loss: 1.1648(1.1648) Grad Norm: 4.8933  LR: 5.9598e-04  
Epoch: [22][31/32] Data 0.128 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.1153(0.7934) Grad Norm: 4.3636  LR: 5.9598e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2  0  6 11]
preds: [ 4  5  0  0 11]
Epoch 22 - avg_train_loss: 0.7934  lr: 5.9598e-04  time: 49s
Epoch 22 - Score: 0.5878


EVAL: [8/9] Data 0.013 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [23][0/32] Data 0.135 (0.135) Elapsed 0m 1s (remain 0m 44s) Loss: 0.3690(0.3690) Grad Norm: 10.3938  LR: 5.6498e-04  
Epoch: [23][31/32] Data 0.120 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.9362(0.4712) Grad Norm: 7.6615  LR: 5.6498e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2  0  6 11]
preds: [ 5  5  0  0 11]
Epoch 23 - avg_train_loss: 0.4712  lr: 5.6498e-04  time: 49s
Epoch 23 - Score: 0.6260


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [24][0/32] Data 0.128 (0.128) Elapsed 0m 1s (remain 0m 43s) Loss: 1.0610(1.0610) Grad Norm: 10.1460  LR: 5.3373e-04  
Epoch: [24][31/32] Data 0.121 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 1.4289(0.6367) Grad Norm: 12.6565  LR: 5.3373e-04  
EVAL: [0/9] Data 0.075 (0.075) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2  0  6 11]
preds: [ 5  5  0  0 11]
Epoch 24 - avg_train_loss: 0.6367  lr: 5.3373e-04  time: 49s
Epoch 24 - Score: 0.6412
Epoch 24 - Save Best Score: 0.6412 Model


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [25][0/32] Data 0.149 (0.149) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0133(0.0133) Grad Norm: 0.5493  LR: 5.0236e-04  
Epoch: [25][31/32] Data 0.117 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 1.0541(0.7219) Grad Norm: 3.4173  LR: 5.0236e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2  0  6 11]
preds: [ 5  5  0  6 11]
Epoch 25 - avg_train_loss: 0.7219  lr: 5.0236e-04  time: 49s
Epoch 25 - Score: 0.6183


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [26][0/32] Data 0.126 (0.126) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0106(0.0106) Grad Norm: 0.4915  LR: 4.7099e-04  
Epoch: [26][31/32] Data 0.122 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.1804(0.6708) Grad Norm: 8.4775  LR: 4.7099e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2  0  6 11]
preds: [ 5  5  0  6 11]
Epoch 26 - avg_train_loss: 0.6708  lr: 4.7099e-04  time: 49s
Epoch 26 - Score: 0.6870
Epoch 26 - Save Best Score: 0.6870 Model


EVAL: [8/9] Data 0.013 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [27][0/32] Data 0.155 (0.155) Elapsed 0m 1s (remain 0m 45s) Loss: 0.0069(0.0069) Grad Norm: 0.5797  LR: 4.3974e-04  
Epoch: [27][31/32] Data 0.124 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 1.1291(0.5681) Grad Norm: 4.9978  LR: 4.3974e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2  0  6 11]
preds: [ 5  5  0  0 11]
Epoch 27 - avg_train_loss: 0.5681  lr: 4.3974e-04  time: 49s
Epoch 27 - Score: 0.6031


EVAL: [8/9] Data 0.015 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [28][0/32] Data 0.141 (0.141) Elapsed 0m 1s (remain 0m 44s) Loss: 1.5294(1.5294) Grad Norm: 8.0446  LR: 4.0874e-04  
Epoch: [28][31/32] Data 0.115 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 0.8398(0.5622) Grad Norm: 3.9043  LR: 4.0874e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2  0  6 11]
preds: [ 2  5  0  0 11]
Epoch 28 - avg_train_loss: 0.5622  lr: 4.0874e-04  time: 49s
Epoch 28 - Score: 0.5725


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [29][0/32] Data 0.127 (0.127) Elapsed 0m 1s (remain 0m 43s) Loss: 0.8195(0.8195) Grad Norm: 4.4851  LR: 3.7811e-04  
Epoch: [29][31/32] Data 0.126 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 1.5778(0.5745) Grad Norm: 19.4506  LR: 3.7811e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2  0  6 11]
preds: [ 3  5  0  6 11]
Epoch 29 - avg_train_loss: 0.5745  lr: 3.7811e-04  time: 49s
Epoch 29 - Score: 0.6183


EVAL: [8/9] Data 0.015 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [30][0/32] Data 0.135 (0.135) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0035(0.0035) Grad Norm: 0.2390  LR: 3.4797e-04  
Epoch: [30][31/32] Data 0.122 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0075(0.4761) Grad Norm: 0.4293  LR: 3.4797e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2  0  6 11]
preds: [ 3  5  0  6 11]
Epoch 30 - avg_train_loss: 0.4761  lr: 3.4797e-04  time: 49s
Epoch 30 - Score: 0.6031


EVAL: [8/9] Data 0.015 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [31][0/32] Data 0.124 (0.124) Elapsed 0m 1s (remain 0m 43s) Loss: 0.9610(0.9610) Grad Norm: 5.5032  LR: 3.1843e-04  
Epoch: [31][31/32] Data 0.120 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.8209(0.5620) Grad Norm: 5.5761  LR: 3.1843e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2  0  6 11]
preds: [ 5  5  0  6 11]
Epoch 31 - avg_train_loss: 0.5620  lr: 3.1843e-04  time: 49s
Epoch 31 - Score: 0.6641


EVAL: [8/9] Data 0.013 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [32][0/32] Data 0.134 (0.134) Elapsed 0m 1s (remain 0m 43s) Loss: 0.7905(0.7905) Grad Norm: 3.6949  LR: 2.8962e-04  
Epoch: [32][31/32] Data 0.115 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 1.0226(0.3338) Grad Norm: 6.3585  LR: 2.8962e-04  
EVAL: [0/9] Data 0.075 (0.075) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2  0  6 11]
preds: [ 5  5  0  0 11]
Epoch 32 - avg_train_loss: 0.3338  lr: 2.8962e-04  time: 49s
Epoch 32 - Score: 0.5496


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [33][0/32] Data 0.129 (0.129) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0025(0.0025) Grad Norm: 0.1066  LR: 2.6165e-04  
Epoch: [33][31/32] Data 0.117 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.9407(0.4886) Grad Norm: 5.2638  LR: 2.6165e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2  0  6 11]
preds: [ 4  5  0  6 11]
Epoch 33 - avg_train_loss: 0.4886  lr: 2.6165e-04  time: 49s
Epoch 33 - Score: 0.5878


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [34][0/32] Data 0.128 (0.128) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0200(0.0200) Grad Norm: 3.2302  LR: 2.3463e-04  
Epoch: [34][31/32] Data 0.121 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0105(0.3835) Grad Norm: 0.6499  LR: 2.3463e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2  0  6 11]
preds: [ 5  5  0  5 11]
Epoch 34 - avg_train_loss: 0.3835  lr: 2.3463e-04  time: 49s
Epoch 34 - Score: 0.5802


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [35][0/32] Data 0.140 (0.140) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0023(0.0023) Grad Norm: 0.0537  LR: 2.0866e-04  
Epoch: [35][31/32] Data 0.115 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.8008(0.5845) Grad Norm: 3.8671  LR: 2.0866e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2  0  6 11]
preds: [ 5  5  0  5 11]
Epoch 35 - avg_train_loss: 0.5845  lr: 2.0866e-04  time: 49s
Epoch 35 - Score: 0.6183


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [36][0/32] Data 0.131 (0.131) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0012(0.0012) Grad Norm: 0.0494  LR: 1.8385e-04  
Epoch: [36][31/32] Data 0.132 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0005(0.3571) Grad Norm: 0.0159  LR: 1.8385e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2  0  6 11]
preds: [ 5  5  0  5 11]
Epoch 36 - avg_train_loss: 0.3571  lr: 1.8385e-04  time: 49s
Epoch 36 - Score: 0.6412


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [37][0/32] Data 0.143 (0.143) Elapsed 0m 1s (remain 0m 44s) Loss: 0.6364(0.6364) Grad Norm: 3.5041  LR: 1.6030e-04  
Epoch: [37][31/32] Data 0.117 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0033(0.4473) Grad Norm: 0.1680  LR: 1.6030e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2  0  6 11]
preds: [ 5  5  0  5 11]
Epoch 37 - avg_train_loss: 0.4473  lr: 1.6030e-04  time: 49s
Epoch 37 - Score: 0.6183


EVAL: [8/9] Data 0.013 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [38][0/32] Data 0.125 (0.125) Elapsed 0m 1s (remain 0m 44s) Loss: 0.6347(0.6347) Grad Norm: 2.0749  LR: 1.3809e-04  
Epoch: [38][31/32] Data 0.122 (0.127) Elapsed 0m 45s (remain 0m 0s) Loss: 0.3963(0.5032) Grad Norm: 1.9030  LR: 1.3809e-04  
EVAL: [0/9] Data 0.076 (0.076) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2  0  6 11]
preds: [ 5  5  0  0 11]
Epoch 38 - avg_train_loss: 0.5032  lr: 1.3809e-04  time: 50s
Epoch 38 - Score: 0.6260


EVAL: [8/9] Data 0.014 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [39][0/32] Data 0.153 (0.153) Elapsed 0m 1s (remain 0m 44s) Loss: 1.1167(1.1167) Grad Norm: 7.0655  LR: 1.1732e-04  
Epoch: [39][31/32] Data 0.130 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 0.7398(0.4748) Grad Norm: 3.3881  LR: 1.1732e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2  0  6 11]
preds: [ 5  5  0  0 11]
Epoch 39 - avg_train_loss: 0.4748  lr: 1.1732e-04  time: 50s
Epoch 39 - Score: 0.6260


EVAL: [8/9] Data 0.015 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [40][0/32] Data 0.139 (0.139) Elapsed 0m 1s (remain 0m 44s) Loss: 0.6639(0.6639) Grad Norm: 2.9350  LR: 9.8058e-05  
Epoch: [40][31/32] Data 0.132 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 1.0012(0.4996) Grad Norm: 9.0339  LR: 9.8058e-05  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2  0  6 11]
preds: [ 5  5  0  0 11]
Epoch 40 - avg_train_loss: 0.4996  lr: 9.8058e-05  time: 49s
Epoch 40 - Score: 0.6031


EVAL: [8/9] Data 0.014 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [41][0/32] Data 0.130 (0.130) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0038(0.0038) Grad Norm: 0.2670  LR: 8.0390e-05  
Epoch: [41][31/32] Data 0.129 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 0.6362(0.4922) Grad Norm: 3.8664  LR: 8.0390e-05  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2  0  6 11]
preds: [ 5  5  0  0 11]
Epoch 41 - avg_train_loss: 0.4922  lr: 8.0390e-05  time: 50s
Epoch 41 - Score: 0.6336


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [42][0/32] Data 0.139 (0.139) Elapsed 0m 1s (remain 0m 44s) Loss: 0.6652(0.6652) Grad Norm: 3.7579  LR: 6.4381e-05  
Epoch: [42][31/32] Data 0.142 (0.127) Elapsed 0m 45s (remain 0m 0s) Loss: 0.7971(0.3444) Grad Norm: 5.0313  LR: 6.4381e-05  
EVAL: [0/9] Data 0.078 (0.078) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2  0  6 11]
preds: [ 5  5  0  6 11]
Epoch 42 - avg_train_loss: 0.3444  lr: 6.4381e-05  time: 50s
Epoch 42 - Score: 0.6718


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [43][0/32] Data 0.128 (0.128) Elapsed 0m 1s (remain 0m 44s) Loss: 0.5773(0.5773) Grad Norm: 2.0498  LR: 5.0093e-05  
Epoch: [43][31/32] Data 0.126 (0.127) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0008(0.4775) Grad Norm: 0.0215  LR: 5.0093e-05  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2  0  6 11]
preds: [ 5  5  0  0 11]
Epoch 43 - avg_train_loss: 0.4775  lr: 5.0093e-05  time: 50s
Epoch 43 - Score: 0.6260


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [44][0/32] Data 0.136 (0.136) Elapsed 0m 1s (remain 0m 44s) Loss: 0.9269(0.9269) Grad Norm: 3.8621  LR: 3.7578e-05  
Epoch: [44][31/32] Data 0.129 (0.129) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0006(0.4364) Grad Norm: 0.0205  LR: 3.7578e-05  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2  0  6 11]
preds: [ 5  5  0  6 11]
Epoch 44 - avg_train_loss: 0.4364  lr: 3.7578e-05  time: 50s
Epoch 44 - Score: 0.6489


EVAL: [8/9] Data 0.014 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [45][0/32] Data 0.128 (0.128) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0013(0.0013) Grad Norm: 0.0896  LR: 2.6881e-05  
Epoch: [45][31/32] Data 0.127 (0.127) Elapsed 0m 45s (remain 0m 0s) Loss: 0.9211(0.3756) Grad Norm: 3.8724  LR: 2.6881e-05  
EVAL: [0/9] Data 0.075 (0.075) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2  0  6 11]
preds: [ 5  5  0  6 11]
Epoch 45 - avg_train_loss: 0.3756  lr: 2.6881e-05  time: 50s
Epoch 45 - Score: 0.6336


EVAL: [8/9] Data 0.014 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [46][0/32] Data 0.132 (0.132) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0012(0.0012) Grad Norm: 0.0767  LR: 1.8039e-05  
Epoch: [46][31/32] Data 0.124 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0003(0.0068) Grad Norm: 0.0135  LR: 1.8039e-05  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2  0  6 11]
preds: [ 5  5  0  6 11]
Epoch 46 - avg_train_loss: 0.0068  lr: 1.8039e-05  time: 50s
Epoch 46 - Score: 0.6260


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [47][0/32] Data 0.141 (0.141) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0039(0.0039) Grad Norm: 0.2935  LR: 1.1073e-05  
Epoch: [47][31/32] Data 0.119 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0004(0.0017) Grad Norm: 0.0124  LR: 1.1073e-05  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2  0  6 11]
preds: [ 5  5  0  6 11]
Epoch 47 - avg_train_loss: 0.0017  lr: 1.1073e-05  time: 49s
Epoch 47 - Score: 0.6260


EVAL: [8/9] Data 0.014 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [48][0/32] Data 0.145 (0.145) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0016(0.0016) Grad Norm: 0.1114  LR: 5.9882e-06  
Epoch: [48][31/32] Data 0.129 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0007(0.0030) Grad Norm: 0.0220  LR: 5.9882e-06  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2  0  6 11]
preds: [ 5  5  0  6 11]
Epoch 48 - avg_train_loss: 0.0030  lr: 5.9882e-06  time: 49s
Epoch 48 - Score: 0.6260


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [49][0/32] Data 0.141 (0.141) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0116(0.0116) Grad Norm: 1.8718  LR: 2.7534e-06  
Epoch: [49][31/32] Data 0.122 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0354(0.0037) Grad Norm: 2.9729  LR: 2.7534e-06  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2  0  6 11]
preds: [ 5  5  0  6 11]
Epoch 49 - avg_train_loss: 0.0037  lr: 2.7534e-06  time: 49s
Epoch 49 - Score: 0.6260


EVAL: [8/9] Data 0.013 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [50][0/32] Data 0.120 (0.120) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0006(0.0006) Grad Norm: 0.0200  LR: 1.2467e-06  
Epoch: [50][31/32] Data 0.111 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0021(0.0105) Grad Norm: 0.0558  LR: 1.2467e-06  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2  0  6 11]
preds: [ 5  5  0  6 11]
Epoch 50 - avg_train_loss: 0.0105  lr: 1.2467e-06  time: 49s
Epoch 50 - Score: 0.6260


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [1][0/32] Data 0.112 (0.112) Elapsed 0m 1s (remain 0m 42s) Loss: 2.7487(2.7487) Grad Norm: inf  LR: 1.0000e-03  
Epoch: [1][31/32] Data 0.116 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 2.7504(2.5011) Grad Norm: 15.1824  LR: 1.0000e-03  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  4  3 12  3]
preds: [2 2 2 2 6]
Epoch 1 - avg_train_loss: 2.5011  lr: 1.0000e-03  time: 49s
Epoch 1 - Score: 0.2462
Epoch 1 - Save Best Score: 0.2462 Model


EVAL: [8/9] Data 0.010 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [2][0/32] Data 0.136 (0.136) Elapsed 0m 1s (remain 0m 44s) Loss: 2.3441(2.3441) Grad Norm: 23.8331  LR: 9.9803e-04  
Epoch: [2][31/32] Data 0.119 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 2.2034(2.4021) Grad Norm: 17.7723  LR: 9.9803e-04  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  4  3 12  3]
preds: [2 0 2 2 0]
Epoch 2 - avg_train_loss: 2.4021  lr: 9.9803e-04  time: 49s
Epoch 2 - Score: 0.3538
Epoch 2 - Save Best Score: 0.3538 Model


EVAL: [8/9] Data 0.009 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [3][0/32] Data 0.124 (0.124) Elapsed 0m 1s (remain 0m 44s) Loss: 2.1611(2.1611) Grad Norm: 16.4520  LR: 9.9312e-04  
Epoch: [3][31/32] Data 0.114 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 2.0686(2.1228) Grad Norm: 15.0417  LR: 9.9312e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  4  3 12  3]
preds: [ 2 11  0  8  1]
Epoch 3 - avg_train_loss: 2.1228  lr: 9.9312e-04  time: 49s
Epoch 3 - Score: 0.5154
Epoch 3 - Save Best Score: 0.5154 Model


EVAL: [8/9] Data 0.009 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [4][0/32] Data 0.154 (0.154) Elapsed 0m 1s (remain 0m 45s) Loss: 1.9822(1.9822) Grad Norm: 18.9855  LR: 9.8627e-04  
Epoch: [4][31/32] Data 0.126 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 2.3076(1.8314) Grad Norm: 9.8073  LR: 9.8627e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  4  3 12  3]
preds: [5 4 4 9 1]
Epoch 4 - avg_train_loss: 1.8314  lr: 9.8627e-04  time: 49s
Epoch 4 - Score: 0.4615


EVAL: [8/9] Data 0.009 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [5][0/32] Data 0.135 (0.135) Elapsed 0m 1s (remain 0m 44s) Loss: 0.8156(0.8156) Grad Norm: 9.8285  LR: 9.7751e-04  
Epoch: [5][31/32] Data 0.128 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 2.1601(1.5255) Grad Norm: 13.9588  LR: 9.7751e-04  
EVAL: [0/9] Data 0.075 (0.075) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  4  3 12  3]
preds: [11  3  4 12  1]
Epoch 5 - avg_train_loss: 1.5255  lr: 9.7751e-04  time: 50s
Epoch 5 - Score: 0.5615
Epoch 5 - Save Best Score: 0.5615 Model


EVAL: [8/9] Data 0.009 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [6][0/32] Data 0.124 (0.124) Elapsed 0m 1s (remain 0m 44s) Loss: 2.0063(2.0063) Grad Norm: 10.2039  LR: 9.6688e-04  
Epoch: [6][31/32] Data 0.133 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.6041(1.3727) Grad Norm: 8.3799  LR: 9.6688e-04  
EVAL: [0/9] Data 0.077 (0.077) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  4  3 12  3]
preds: [11  4  0 12  1]
Epoch 6 - avg_train_loss: 1.3727  lr: 9.6688e-04  time: 50s
Epoch 6 - Score: 0.4231


EVAL: [8/9] Data 0.011 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [7][0/32] Data 0.141 (0.141) Elapsed 0m 1s (remain 0m 44s) Loss: 0.9682(0.9682) Grad Norm: 12.5160  LR: 9.5441e-04  
Epoch: [7][31/32] Data 0.126 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 2.0617(1.2656) Grad Norm: 12.9644  LR: 9.5441e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  4  3 12  3]
preds: [11  4  4  9  3]
Epoch 7 - avg_train_loss: 1.2656  lr: 9.5441e-04  time: 50s
Epoch 7 - Score: 0.3615


EVAL: [8/9] Data 0.009 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [8][0/32] Data 0.130 (0.130) Elapsed 0m 1s (remain 0m 44s) Loss: 1.7913(1.7913) Grad Norm: 8.2934  LR: 9.4016e-04  
Epoch: [8][31/32] Data 0.110 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.3721(1.2801) Grad Norm: 38.1177  LR: 9.4016e-04  
EVAL: [0/9] Data 0.075 (0.075) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  4  3 12  3]
preds: [ 5  4  3 12  3]
Epoch 8 - avg_train_loss: 1.2801  lr: 9.4016e-04  time: 50s
Epoch 8 - Score: 0.5615


EVAL: [8/9] Data 0.010 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [9][0/32] Data 0.126 (0.126) Elapsed 0m 1s (remain 0m 43s) Loss: 1.7833(1.7833) Grad Norm: 14.6973  LR: 9.2418e-04  
Epoch: [9][31/32] Data 0.128 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 1.4348(1.1685) Grad Norm: 8.4162  LR: 9.2418e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  4  3 12  3]
preds: [ 5  1  4 12  4]
Epoch 9 - avg_train_loss: 1.1685  lr: 9.2418e-04  time: 49s
Epoch 9 - Score: 0.4615


EVAL: [8/9] Data 0.010 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [10][0/32] Data 0.135 (0.135) Elapsed 0m 1s (remain 0m 44s) Loss: 0.6349(0.6349) Grad Norm: 13.7494  LR: 9.0654e-04  
Epoch: [10][31/32] Data 0.113 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 2.1537(1.0801) Grad Norm: 27.4300  LR: 9.0654e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  4  3 12  3]
preds: [11  3  3 12  3]
Epoch 10 - avg_train_loss: 1.0801  lr: 9.0654e-04  time: 50s
Epoch 10 - Score: 0.5385


EVAL: [8/9] Data 0.010 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [11][0/32] Data 0.132 (0.132) Elapsed 0m 1s (remain 0m 44s) Loss: 0.4821(0.4821) Grad Norm: 11.7774  LR: 8.8730e-04  
Epoch: [11][31/32] Data 0.120 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 1.6029(0.9187) Grad Norm: 7.9393  LR: 8.8730e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  4  3 12  3]
preds: [12  4  3 12  3]
Epoch 11 - avg_train_loss: 0.9187  lr: 8.8730e-04  time: 50s
Epoch 11 - Score: 0.5385


EVAL: [8/9] Data 0.010 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [12][0/32] Data 0.122 (0.122) Elapsed 0m 1s (remain 0m 43s) Loss: 1.4320(1.4320) Grad Norm: 7.2946  LR: 8.6655e-04  
Epoch: [12][31/32] Data 0.131 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 2.0525(0.9277) Grad Norm: 15.3352  LR: 8.6655e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  4  3 12  3]
preds: [11  0  3  8  1]
Epoch 12 - avg_train_loss: 0.9277  lr: 8.6655e-04  time: 50s
Epoch 12 - Score: 0.5154


EVAL: [8/9] Data 0.009 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [13][0/32] Data 0.116 (0.116) Elapsed 0m 1s (remain 0m 44s) Loss: 1.5783(1.5783) Grad Norm: 8.8776  LR: 8.4436e-04  
Epoch: [13][31/32] Data 0.127 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0342(0.9124) Grad Norm: 2.5884  LR: 8.4436e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  4  3 12  3]
preds: [10  4  3 12  3]
Epoch 13 - avg_train_loss: 0.9124  lr: 8.4436e-04  time: 49s
Epoch 13 - Score: 0.5385


EVAL: [8/9] Data 0.009 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [14][0/32] Data 0.140 (0.140) Elapsed 0m 1s (remain 0m 44s) Loss: 0.3732(0.3732) Grad Norm: 8.8232  LR: 8.2081e-04  
Epoch: [14][31/32] Data 0.127 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 1.4594(0.9802) Grad Norm: 6.5751  LR: 8.2081e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  4  3 12  3]
preds: [5 4 3 9 3]
Epoch 14 - avg_train_loss: 0.9802  lr: 8.2081e-04  time: 49s
Epoch 14 - Score: 0.5308


EVAL: [8/9] Data 0.009 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [15][0/32] Data 0.125 (0.125) Elapsed 0m 1s (remain 0m 43s) Loss: 1.2653(1.2653) Grad Norm: 6.8140  LR: 7.9601e-04  
Epoch: [15][31/32] Data 0.125 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 1.4183(1.0799) Grad Norm: 7.1657  LR: 7.9601e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  4  3 12  3]
preds: [ 0  4  3 12  0]
Epoch 15 - avg_train_loss: 1.0799  lr: 7.9601e-04  time: 49s
Epoch 15 - Score: 0.5231


EVAL: [8/9] Data 0.009 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [16][0/32] Data 0.147 (0.147) Elapsed 0m 1s (remain 0m 44s) Loss: 0.2709(0.2709) Grad Norm: 13.9207  LR: 7.7006e-04  
Epoch: [16][31/32] Data 0.120 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.2110(0.8346) Grad Norm: 9.7652  LR: 7.7006e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  4  3 12  3]
preds: [ 5  4  3 12  1]
Epoch 16 - avg_train_loss: 0.8346  lr: 7.7006e-04  time: 49s
Epoch 16 - Score: 0.5538


EVAL: [8/9] Data 0.009 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [17][0/32] Data 0.120 (0.120) Elapsed 0m 1s (remain 0m 43s) Loss: 1.3884(1.3884) Grad Norm: 23.5432  LR: 7.4304e-04  
Epoch: [17][31/32] Data 0.138 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 1.3056(0.9242) Grad Norm: 11.1685  LR: 7.4304e-04  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  4  3 12  3]
preds: [ 5  3  3 12  3]
Epoch 17 - avg_train_loss: 0.9242  lr: 7.4304e-04  time: 49s
Epoch 17 - Score: 0.5385


EVAL: [8/9] Data 0.009 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [18][0/32] Data 0.148 (0.148) Elapsed 0m 1s (remain 0m 44s) Loss: 0.2547(0.2547) Grad Norm: 6.2035  LR: 7.1508e-04  
Epoch: [18][31/32] Data 0.120 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 1.1615(0.6234) Grad Norm: 9.2130  LR: 7.1508e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  4  3 12  3]
preds: [11  4  1 12  3]
Epoch 18 - avg_train_loss: 0.6234  lr: 7.1508e-04  time: 49s
Epoch 18 - Score: 0.5846
Epoch 18 - Save Best Score: 0.5846 Model


EVAL: [8/9] Data 0.010 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [19][0/32] Data 0.129 (0.129) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0437(0.0437) Grad Norm: 1.9472  LR: 6.8627e-04  
Epoch: [19][31/32] Data 0.130 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.4606(0.8291) Grad Norm: 16.0711  LR: 6.8627e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  4  3 12  3]
preds: [11  4  3 12  3]
Epoch 19 - avg_train_loss: 0.8291  lr: 6.8627e-04  time: 50s
Epoch 19 - Score: 0.5154


EVAL: [8/9] Data 0.010 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [20][0/32] Data 0.138 (0.138) Elapsed 0m 1s (remain 0m 44s) Loss: 0.2427(0.2427) Grad Norm: 8.2562  LR: 6.5674e-04  
Epoch: [20][31/32] Data 0.133 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 1.0258(0.7212) Grad Norm: 4.1883  LR: 6.5674e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  4  3 12  3]
preds: [11  4  3 12  3]
Epoch 20 - avg_train_loss: 0.7212  lr: 6.5674e-04  time: 50s
Epoch 20 - Score: 0.5385


EVAL: [8/9] Data 0.009 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [21][0/32] Data 0.119 (0.119) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0137(0.0137) Grad Norm: 0.4521  LR: 6.2661e-04  
Epoch: [21][31/32] Data 0.129 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0735(0.6174) Grad Norm: 2.5486  LR: 6.2661e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  4  3 12  3]
preds: [11  4  3 12  3]
Epoch 21 - avg_train_loss: 0.6174  lr: 6.2661e-04  time: 49s
Epoch 21 - Score: 0.5615


EVAL: [8/9] Data 0.010 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [22][0/32] Data 0.134 (0.134) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0253(0.0253) Grad Norm: 1.2294  LR: 5.9598e-04  
Epoch: [22][31/32] Data 0.121 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0081(0.7881) Grad Norm: 0.3448  LR: 5.9598e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  4  3 12  3]
preds: [12  4  3 12  3]
Epoch 22 - avg_train_loss: 0.7881  lr: 5.9598e-04  time: 49s
Epoch 22 - Score: 0.5385


EVAL: [8/9] Data 0.009 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [23][0/32] Data 0.144 (0.144) Elapsed 0m 1s (remain 0m 44s) Loss: 0.9497(0.9497) Grad Norm: 4.7734  LR: 5.6498e-04  
Epoch: [23][31/32] Data 0.119 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 1.2907(0.5386) Grad Norm: 5.8010  LR: 5.6498e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  4  3 12  3]
preds: [ 5  4  3 12  3]
Epoch 23 - avg_train_loss: 0.5386  lr: 5.6498e-04  time: 50s
Epoch 23 - Score: 0.5462


EVAL: [8/9] Data 0.010 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [24][0/32] Data 0.142 (0.142) Elapsed 0m 1s (remain 0m 44s) Loss: 0.2690(0.2690) Grad Norm: 10.2075  LR: 5.3373e-04  
Epoch: [24][31/32] Data 0.123 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 1.4651(0.3990) Grad Norm: 8.4197  LR: 5.3373e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  4  3 12  3]
preds: [11  0  1 12  3]
Epoch 24 - avg_train_loss: 0.3990  lr: 5.3373e-04  time: 49s
Epoch 24 - Score: 0.5231


EVAL: [8/9] Data 0.010 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [25][0/32] Data 0.125 (0.125) Elapsed 0m 1s (remain 0m 43s) Loss: 1.1359(1.1359) Grad Norm: 7.7845  LR: 5.0236e-04  
Epoch: [25][31/32] Data 0.114 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0483(0.5612) Grad Norm: 5.7330  LR: 5.0236e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  4  3 12  3]
preds: [11  4  3 12  3]
Epoch 25 - avg_train_loss: 0.5612  lr: 5.0236e-04  time: 49s
Epoch 25 - Score: 0.5615


EVAL: [8/9] Data 0.010 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [26][0/32] Data 0.135 (0.135) Elapsed 0m 1s (remain 0m 44s) Loss: 1.3814(1.3814) Grad Norm: 7.7534  LR: 4.7099e-04  
Epoch: [26][31/32] Data 0.131 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0164(0.6874) Grad Norm: 1.5736  LR: 4.7099e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  4  3 12  3]
preds: [11  6  3  9  3]
Epoch 26 - avg_train_loss: 0.6874  lr: 4.7099e-04  time: 49s
Epoch 26 - Score: 0.5308


EVAL: [8/9] Data 0.009 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [27][0/32] Data 0.138 (0.138) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0195(0.0195) Grad Norm: 1.6308  LR: 4.3974e-04  
Epoch: [27][31/32] Data 0.117 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 1.0033(0.5503) Grad Norm: 8.1198  LR: 4.3974e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  4  3 12  3]
preds: [11  3  3 12  3]
Epoch 27 - avg_train_loss: 0.5503  lr: 4.3974e-04  time: 49s
Epoch 27 - Score: 0.5462


EVAL: [8/9] Data 0.009 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [28][0/32] Data 0.136 (0.136) Elapsed 0m 1s (remain 0m 44s) Loss: 1.1241(1.1241) Grad Norm: 5.6171  LR: 4.0874e-04  
Epoch: [28][31/32] Data 0.112 (0.120) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0147(0.6884) Grad Norm: 0.5469  LR: 4.0874e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  4  3 12  3]
preds: [ 5 10  3  9  3]
Epoch 28 - avg_train_loss: 0.6884  lr: 4.0874e-04  time: 49s
Epoch 28 - Score: 0.5615


EVAL: [8/9] Data 0.010 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [29][0/32] Data 0.141 (0.141) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0305(0.0305) Grad Norm: 1.2745  LR: 3.7811e-04  
Epoch: [29][31/32] Data 0.129 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0010(0.6113) Grad Norm: 0.0374  LR: 3.7811e-04  
EVAL: [0/9] Data 0.068 (0.068) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  4  3 12  3]
preds: [11  1  3 12  3]
Epoch 29 - avg_train_loss: 0.6113  lr: 3.7811e-04  time: 49s
Epoch 29 - Score: 0.5846


EVAL: [8/9] Data 0.009 (0.063) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [30][0/32] Data 0.132 (0.132) Elapsed 0m 1s (remain 0m 43s) Loss: 0.9091(0.9091) Grad Norm: 4.4225  LR: 3.4797e-04  
Epoch: [30][31/32] Data 0.109 (0.120) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0034(0.5300) Grad Norm: 0.1312  LR: 3.4797e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  4  3 12  3]
preds: [11  1  3 12  3]
Epoch 30 - avg_train_loss: 0.5300  lr: 3.4797e-04  time: 49s
Epoch 30 - Score: 0.5923
Epoch 30 - Save Best Score: 0.5923 Model


EVAL: [8/9] Data 0.010 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [31][0/32] Data 0.147 (0.147) Elapsed 0m 1s (remain 0m 45s) Loss: 0.0081(0.0081) Grad Norm: 0.6098  LR: 3.1843e-04  
Epoch: [31][31/32] Data 0.119 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0027(0.4250) Grad Norm: 0.1201  LR: 3.1843e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  4  3 12  3]
preds: [5 1 3 9 3]
Epoch 31 - avg_train_loss: 0.4250  lr: 3.1843e-04  time: 49s
Epoch 31 - Score: 0.5538


EVAL: [8/9] Data 0.009 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [32][0/32] Data 0.135 (0.135) Elapsed 0m 1s (remain 0m 43s) Loss: 0.8912(0.8912) Grad Norm: 3.6325  LR: 2.8962e-04  
Epoch: [32][31/32] Data 0.130 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0056(0.5804) Grad Norm: 0.2865  LR: 2.8962e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  4  3 12  3]
preds: [5 0 3 9 3]
Epoch 32 - avg_train_loss: 0.5804  lr: 2.8962e-04  time: 49s
Epoch 32 - Score: 0.5846


EVAL: [8/9] Data 0.010 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [33][0/32] Data 0.136 (0.136) Elapsed 0m 1s (remain 0m 44s) Loss: 1.0301(1.0301) Grad Norm: 5.5896  LR: 2.6165e-04  
Epoch: [33][31/32] Data 0.127 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0069(0.3747) Grad Norm: 0.2836  LR: 2.6165e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  4  3 12  3]
preds: [11  4  3 12  3]
Epoch 33 - avg_train_loss: 0.3747  lr: 2.6165e-04  time: 49s
Epoch 33 - Score: 0.6077
Epoch 33 - Save Best Score: 0.6077 Model


EVAL: [8/9] Data 0.010 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [34][0/32] Data 0.144 (0.144) Elapsed 0m 1s (remain 0m 45s) Loss: 1.0978(1.0978) Grad Norm: 7.8169  LR: 2.3463e-04  
Epoch: [34][31/32] Data 0.128 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 0.7750(0.4416) Grad Norm: 4.1847  LR: 2.3463e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  4  3 12  3]
preds: [11  4  3 12  3]
Epoch 34 - avg_train_loss: 0.4416  lr: 2.3463e-04  time: 49s
Epoch 34 - Score: 0.6308
Epoch 34 - Save Best Score: 0.6308 Model


EVAL: [8/9] Data 0.010 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [35][0/32] Data 0.124 (0.124) Elapsed 0m 1s (remain 0m 44s) Loss: 0.9005(0.9005) Grad Norm: 6.9783  LR: 2.0866e-04  
Epoch: [35][31/32] Data 0.117 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0013(0.3489) Grad Norm: 0.1289  LR: 2.0866e-04  
EVAL: [0/9] Data 0.080 (0.080) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  4  3 12  3]
preds: [11  4  3 12  3]
Epoch 35 - avg_train_loss: 0.3489  lr: 2.0866e-04  time: 49s
Epoch 35 - Score: 0.6462
Epoch 35 - Save Best Score: 0.6462 Model


EVAL: [8/9] Data 0.010 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [36][0/32] Data 0.149 (0.149) Elapsed 0m 1s (remain 0m 44s) Loss: 1.5123(1.5123) Grad Norm: 8.3004  LR: 1.8385e-04  
Epoch: [36][31/32] Data 0.119 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0016(0.4753) Grad Norm: 0.1616  LR: 1.8385e-04  
EVAL: [0/9] Data 0.078 (0.078) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  4  3 12  3]
preds: [11  4  3 12  3]
Epoch 36 - avg_train_loss: 0.4753  lr: 1.8385e-04  time: 49s
Epoch 36 - Score: 0.6462


EVAL: [8/9] Data 0.010 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [37][0/32] Data 0.126 (0.126) Elapsed 0m 1s (remain 0m 43s) Loss: 1.0673(1.0673) Grad Norm: 4.4464  LR: 1.6030e-04  
Epoch: [37][31/32] Data 0.120 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.9904(0.5163) Grad Norm: 5.3388  LR: 1.6030e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  4  3 12  3]
preds: [ 5  4  3 10  3]
Epoch 37 - avg_train_loss: 0.5163  lr: 1.6030e-04  time: 49s
Epoch 37 - Score: 0.5769


EVAL: [8/9] Data 0.010 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [38][0/32] Data 0.137 (0.137) Elapsed 0m 1s (remain 0m 44s) Loss: 0.5977(0.5977) Grad Norm: 3.4895  LR: 1.3809e-04  
Epoch: [38][31/32] Data 0.128 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.8443(0.6522) Grad Norm: 4.4358  LR: 1.3809e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  4  3 12  3]
preds: [11  4  3 12  3]
Epoch 38 - avg_train_loss: 0.6522  lr: 1.3809e-04  time: 49s
Epoch 38 - Score: 0.6462


EVAL: [8/9] Data 0.010 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [39][0/32] Data 0.123 (0.123) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0018(0.0018) Grad Norm: 0.0465  LR: 1.1732e-04  
Epoch: [39][31/32] Data 0.129 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0013(0.3751) Grad Norm: 0.0897  LR: 1.1732e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  4  3 12  3]
preds: [11  4  3 12  3]
Epoch 39 - avg_train_loss: 0.3751  lr: 1.1732e-04  time: 49s
Epoch 39 - Score: 0.6077


EVAL: [8/9] Data 0.010 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [40][0/32] Data 0.128 (0.128) Elapsed 0m 1s (remain 0m 43s) Loss: 0.8079(0.8079) Grad Norm: 3.7891  LR: 9.8058e-05  
Epoch: [40][31/32] Data 0.123 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0034(0.3947) Grad Norm: 0.5839  LR: 9.8058e-05  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  4  3 12  3]
preds: [11  4  3 12  3]
Epoch 40 - avg_train_loss: 0.3947  lr: 9.8058e-05  time: 49s
Epoch 40 - Score: 0.6308


EVAL: [8/9] Data 0.009 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [41][0/32] Data 0.141 (0.141) Elapsed 0m 1s (remain 0m 44s) Loss: 0.8282(0.8282) Grad Norm: 4.1402  LR: 8.0390e-05  
Epoch: [41][31/32] Data 0.112 (0.120) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0013(0.5250) Grad Norm: 0.0351  LR: 8.0390e-05  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  4  3 12  3]
preds: [11 10  3 12  3]
Epoch 41 - avg_train_loss: 0.5250  lr: 8.0390e-05  time: 49s
Epoch 41 - Score: 0.6231


EVAL: [8/9] Data 0.010 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [42][0/32] Data 0.124 (0.124) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0167(0.0167) Grad Norm: 1.6159  LR: 6.4381e-05  
Epoch: [42][31/32] Data 0.131 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0007(0.3090) Grad Norm: 0.0214  LR: 6.4381e-05  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  4  3 12  3]
preds: [11  4  3 12  3]
Epoch 42 - avg_train_loss: 0.3090  lr: 6.4381e-05  time: 49s
Epoch 42 - Score: 0.6231


EVAL: [8/9] Data 0.009 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [43][0/32] Data 0.140 (0.140) Elapsed 0m 1s (remain 0m 44s) Loss: 0.7422(0.7422) Grad Norm: 2.8340  LR: 5.0093e-05  
Epoch: [43][31/32] Data 0.117 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.8405(0.3505) Grad Norm: 5.0098  LR: 5.0093e-05  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  4  3 12  3]
preds: [11 10  3 12  3]
Epoch 43 - avg_train_loss: 0.3505  lr: 5.0093e-05  time: 49s
Epoch 43 - Score: 0.6308


EVAL: [8/9] Data 0.010 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [44][0/32] Data 0.129 (0.129) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0007(0.0007) Grad Norm: 0.0311  LR: 3.7578e-05  
Epoch: [44][31/32] Data 0.131 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0008(0.2958) Grad Norm: 0.0264  LR: 3.7578e-05  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  4  3 12  3]
preds: [11 10  3 12  3]
Epoch 44 - avg_train_loss: 0.2958  lr: 3.7578e-05  time: 49s
Epoch 44 - Score: 0.6308


EVAL: [8/9] Data 0.010 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [45][0/32] Data 0.123 (0.123) Elapsed 0m 1s (remain 0m 43s) Loss: 0.9121(0.9121) Grad Norm: 4.2948  LR: 2.6881e-05  
Epoch: [45][31/32] Data 0.133 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.6574(0.4557) Grad Norm: 2.1419  LR: 2.6881e-05  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  4  3 12  3]
preds: [11 10  3 12  3]
Epoch 45 - avg_train_loss: 0.4557  lr: 2.6881e-05  time: 49s
Epoch 45 - Score: 0.6308


EVAL: [8/9] Data 0.009 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [46][0/32] Data 0.133 (0.133) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0008(0.0008) Grad Norm: 0.1024  LR: 1.8039e-05  
Epoch: [46][31/32] Data 0.125 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0006(0.0016) Grad Norm: 0.0191  LR: 1.8039e-05  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  4  3 12  3]
preds: [11 10  3 12  3]
Epoch 46 - avg_train_loss: 0.0016  lr: 1.8039e-05  time: 49s
Epoch 46 - Score: 0.6308


EVAL: [8/9] Data 0.010 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [47][0/32] Data 0.125 (0.125) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0010(0.0010) Grad Norm: 0.0314  LR: 1.1073e-05  
Epoch: [47][31/32] Data 0.119 (0.121) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0008(0.0018) Grad Norm: 0.0282  LR: 1.1073e-05  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  4  3 12  3]
preds: [11 10  3 12  3]
Epoch 47 - avg_train_loss: 0.0018  lr: 1.1073e-05  time: 49s
Epoch 47 - Score: 0.6308


EVAL: [8/9] Data 0.009 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [48][0/32] Data 0.146 (0.146) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0010(0.0010) Grad Norm: 0.0377  LR: 5.9882e-06  
Epoch: [48][31/32] Data 0.116 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0013(0.0066) Grad Norm: 0.0451  LR: 5.9882e-06  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  4  3 12  3]
preds: [11 10  3 12  3]
Epoch 48 - avg_train_loss: 0.0066  lr: 5.9882e-06  time: 49s
Epoch 48 - Score: 0.6308


EVAL: [8/9] Data 0.009 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [49][0/32] Data 0.124 (0.124) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0007(0.0007) Grad Norm: 0.0197  LR: 2.7534e-06  
Epoch: [49][31/32] Data 0.135 (0.127) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0006(0.0019) Grad Norm: 0.0173  LR: 2.7534e-06  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  4  3 12  3]
preds: [11 10  3 12  3]
Epoch 49 - avg_train_loss: 0.0019  lr: 2.7534e-06  time: 49s
Epoch 49 - Score: 0.6308


EVAL: [8/9] Data 0.009 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [50][0/32] Data 0.121 (0.121) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0016(0.0016) Grad Norm: 0.1379  LR: 1.2467e-06  
Epoch: [50][31/32] Data 0.122 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0079(0.0027) Grad Norm: 0.8662  LR: 1.2467e-06  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  4  3 12  3]
preds: [11 10  3 12  3]
Epoch 50 - avg_train_loss: 0.0027  lr: 1.2467e-06  time: 49s
Epoch 50 - Score: 0.6308

oof score: 0.6590214067278287



EVAL: [8/9] Data 0.009 (0.066) Elapsed 0m 4s (remain 0m 0s) 
fold
0    131
1    131
2    131
3    131
4    130
dtype: int64
Epoch: [1][0/32] Data 0.129 (0.129) Elapsed 0m 1s (remain 0m 42s) Loss: 2.7934(2.7934) Grad Norm: inf  LR: 1.0000e-03  
Epoch: [1][31/32] Data 0.130 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 2.0345(2.4901) Grad Norm: 11.4773  LR: 1.0000e-03  
EVAL: [0/9] Data 0.075 (0.075) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  4  0  3 12]
preds: [2 2 2 2 2]
Epoch 1 - avg_train_loss: 2.4901  lr: 1.0000e-03  time: 49s
Epoch 1 - Score: 0.2214
Epoch 1 - Save Best Score: 0.2214 Model


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [2][0/32] Data 0.142 (0.142) Elapsed 0m 1s (remain 0m 44s) Loss: 2.2035(2.2035) Grad Norm: 12.4008  LR: 9.9803e-04  
Epoch: [2][31/32] Data 0.113 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 2.0891(2.2281) Grad Norm: 19.1902  LR: 9.9803e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  4  0  3 12]
preds: [ 2 10 10  5  9]
Epoch 2 - avg_train_loss: 2.2281  lr: 9.9803e-04  time: 50s
Epoch 2 - Score: 0.4046
Epoch 2 - Save Best Score: 0.4046 Model


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [3][0/32] Data 0.145 (0.145) Elapsed 0m 1s (remain 0m 45s) Loss: 1.7512(1.7512) Grad Norm: 23.6322  LR: 9.9312e-04  
Epoch: [3][31/32] Data 0.130 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 2.1753(1.9481) Grad Norm: 10.5336  LR: 9.9312e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  4  0  3 12]
preds: [2 4 0 4 8]
Epoch 3 - avg_train_loss: 1.9481  lr: 9.9312e-04  time: 50s
Epoch 3 - Score: 0.4656
Epoch 3 - Save Best Score: 0.4656 Model


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [4][0/32] Data 0.164 (0.164) Elapsed 0m 1s (remain 0m 45s) Loss: 1.8853(1.8853) Grad Norm: 12.2957  LR: 9.8627e-04  
Epoch: [4][31/32] Data 0.131 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 1.5739(1.6493) Grad Norm: 15.1984  LR: 9.8627e-04  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  4  0  3 12]
preds: [0 4 3 3 9]
Epoch 4 - avg_train_loss: 1.6493  lr: 9.8627e-04  time: 49s
Epoch 4 - Score: 0.4351


EVAL: [8/9] Data 0.014 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [5][0/32] Data 0.131 (0.131) Elapsed 0m 1s (remain 0m 43s) Loss: 1.9943(1.9943) Grad Norm: 27.0376  LR: 9.7751e-04  
Epoch: [5][31/32] Data 0.124 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 2.4404(1.5475) Grad Norm: 13.5300  LR: 9.7751e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  4  0  3 12]
preds: [2 4 0 4 9]
Epoch 5 - avg_train_loss: 1.5475  lr: 9.7751e-04  time: 49s
Epoch 5 - Score: 0.5344
Epoch 5 - Save Best Score: 0.5344 Model


EVAL: [8/9] Data 0.013 (0.063) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [6][0/32] Data 0.145 (0.145) Elapsed 0m 1s (remain 0m 44s) Loss: 1.4645(1.4645) Grad Norm: 7.1633  LR: 9.6688e-04  
Epoch: [6][31/32] Data 0.120 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 1.1285(1.3284) Grad Norm: 14.5700  LR: 9.6688e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  4  0  3 12]
preds: [1 4 0 3 9]
Epoch 6 - avg_train_loss: 1.3284  lr: 9.6688e-04  time: 49s
Epoch 6 - Score: 0.5649
Epoch 6 - Save Best Score: 0.5649 Model


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [7][0/32] Data 0.146 (0.146) Elapsed 0m 1s (remain 0m 44s) Loss: 0.3654(0.3654) Grad Norm: 5.7340  LR: 9.5441e-04  
Epoch: [7][31/32] Data 0.119 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 1.3273(1.1890) Grad Norm: 18.7706  LR: 9.5441e-04  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  4  0  3 12]
preds: [6 4 6 3 9]
Epoch 7 - avg_train_loss: 1.1890  lr: 9.5441e-04  time: 49s
Epoch 7 - Score: 0.4733


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [8][0/32] Data 0.134 (0.134) Elapsed 0m 1s (remain 0m 44s) Loss: 0.7308(0.7308) Grad Norm: 15.6582  LR: 9.4016e-04  
Epoch: [8][31/32] Data 0.126 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 1.3357(1.3316) Grad Norm: 6.9275  LR: 9.4016e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  4  0  3 12]
preds: [2 4 0 3 8]
Epoch 8 - avg_train_loss: 1.3316  lr: 9.4016e-04  time: 49s
Epoch 8 - Score: 0.6107
Epoch 8 - Save Best Score: 0.6107 Model


EVAL: [8/9] Data 0.013 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [9][0/32] Data 0.147 (0.147) Elapsed 0m 1s (remain 0m 44s) Loss: 1.5846(1.5846) Grad Norm: 8.2384  LR: 9.2418e-04  
Epoch: [9][31/32] Data 0.111 (0.120) Elapsed 0m 45s (remain 0m 0s) Loss: 1.8416(1.1065) Grad Norm: 17.9622  LR: 9.2418e-04  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  4  0  3 12]
preds: [10  1  0  3  9]
Epoch 9 - avg_train_loss: 1.1065  lr: 9.2418e-04  time: 49s
Epoch 9 - Score: 0.5115


EVAL: [8/9] Data 0.014 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [10][0/32] Data 0.123 (0.123) Elapsed 0m 1s (remain 0m 43s) Loss: 1.9397(1.9397) Grad Norm: 8.9026  LR: 9.0654e-04  
Epoch: [10][31/32] Data 0.106 (0.118) Elapsed 0m 45s (remain 0m 0s) Loss: 2.1501(0.9444) Grad Norm: 10.0789  LR: 9.0654e-04  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  4  0  3 12]
preds: [2 3 0 3 8]
Epoch 10 - avg_train_loss: 0.9444  lr: 9.0654e-04  time: 49s
Epoch 10 - Score: 0.6107


EVAL: [8/9] Data 0.013 (0.063) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [11][0/32] Data 0.130 (0.130) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0921(0.0921) Grad Norm: 2.2384  LR: 8.8730e-04  
Epoch: [11][31/32] Data 0.110 (0.117) Elapsed 0m 44s (remain 0m 0s) Loss: 1.5918(0.9709) Grad Norm: 8.1947  LR: 8.8730e-04  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  4  0  3 12]
preds: [5 4 0 3 8]
Epoch 11 - avg_train_loss: 0.9709  lr: 8.8730e-04  time: 49s
Epoch 11 - Score: 0.5496


EVAL: [8/9] Data 0.014 (0.063) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [12][0/32] Data 0.129 (0.129) Elapsed 0m 1s (remain 0m 43s) Loss: 0.9020(0.9020) Grad Norm: 5.5286  LR: 8.6655e-04  
Epoch: [12][31/32] Data 0.119 (0.119) Elapsed 0m 45s (remain 0m 0s) Loss: 0.4700(0.9818) Grad Norm: 6.5867  LR: 8.6655e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  4  0  3 12]
preds: [11  3  0  3 12]
Epoch 12 - avg_train_loss: 0.9818  lr: 8.6655e-04  time: 49s
Epoch 12 - Score: 0.5115


EVAL: [8/9] Data 0.013 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [13][0/32] Data 0.131 (0.131) Elapsed 0m 1s (remain 0m 44s) Loss: 1.3823(1.3823) Grad Norm: 11.8364  LR: 8.4436e-04  
Epoch: [13][31/32] Data 0.122 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 0.1489(0.8234) Grad Norm: 6.0011  LR: 8.4436e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  4  0  3 12]
preds: [ 5  4  0  3 12]
Epoch 13 - avg_train_loss: 0.8234  lr: 8.4436e-04  time: 49s
Epoch 13 - Score: 0.6107


EVAL: [8/9] Data 0.013 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [14][0/32] Data 0.116 (0.116) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0914(0.0914) Grad Norm: 4.4952  LR: 8.2081e-04  
Epoch: [14][31/32] Data 0.098 (0.117) Elapsed 0m 44s (remain 0m 0s) Loss: 1.9236(0.9017) Grad Norm: 8.5542  LR: 8.2081e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  4  0  3 12]
preds: [2 4 7 3 8]
Epoch 14 - avg_train_loss: 0.9017  lr: 8.2081e-04  time: 49s
Epoch 14 - Score: 0.5496


EVAL: [8/9] Data 0.014 (0.063) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [15][0/32] Data 0.133 (0.133) Elapsed 0m 1s (remain 0m 43s) Loss: 1.5739(1.5739) Grad Norm: 7.4040  LR: 7.9601e-04  
Epoch: [15][31/32] Data 0.116 (0.119) Elapsed 0m 45s (remain 0m 0s) Loss: 0.4329(0.8827) Grad Norm: 15.2477  LR: 7.9601e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  4  0  3 12]
preds: [ 2  4  0  3 10]
Epoch 15 - avg_train_loss: 0.8827  lr: 7.9601e-04  time: 49s
Epoch 15 - Score: 0.6107


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [16][0/32] Data 0.141 (0.141) Elapsed 0m 1s (remain 0m 44s) Loss: 1.2608(1.2608) Grad Norm: 4.5323  LR: 7.7006e-04  
Epoch: [16][31/32] Data 0.114 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 1.3143(0.7995) Grad Norm: 5.0799  LR: 7.7006e-04  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  4  0  3 12]
preds: [5 0 0 3 8]
Epoch 16 - avg_train_loss: 0.7995  lr: 7.7006e-04  time: 49s
Epoch 16 - Score: 0.5954


EVAL: [8/9] Data 0.014 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [17][0/32] Data 0.134 (0.134) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0547(0.0547) Grad Norm: 2.4355  LR: 7.4304e-04  
Epoch: [17][31/32] Data 0.115 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 0.2050(0.7054) Grad Norm: 7.4720  LR: 7.4304e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  4  0  3 12]
preds: [11  1  0  3  8]
Epoch 17 - avg_train_loss: 0.7054  lr: 7.4304e-04  time: 49s
Epoch 17 - Score: 0.6031


EVAL: [8/9] Data 0.014 (0.070) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [18][0/32] Data 0.131 (0.131) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0486(0.0486) Grad Norm: 2.4671  LR: 7.1508e-04  
Epoch: [18][31/32] Data 0.118 (0.119) Elapsed 0m 45s (remain 0m 0s) Loss: 0.1496(0.7311) Grad Norm: 6.1417  LR: 7.1508e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  4  0  3 12]
preds: [2 4 0 3 8]
Epoch 18 - avg_train_loss: 0.7311  lr: 7.1508e-04  time: 49s
Epoch 18 - Score: 0.5649


EVAL: [8/9] Data 0.014 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [19][0/32] Data 0.150 (0.150) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0475(0.0475) Grad Norm: 3.4996  LR: 6.8627e-04  
Epoch: [19][31/32] Data 0.118 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 1.1075(0.6867) Grad Norm: 8.2728  LR: 6.8627e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  4  0  3 12]
preds: [2 5 0 3 8]
Epoch 19 - avg_train_loss: 0.6867  lr: 6.8627e-04  time: 49s
Epoch 19 - Score: 0.5038


EVAL: [8/9] Data 0.014 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [20][0/32] Data 0.146 (0.146) Elapsed 0m 1s (remain 0m 44s) Loss: 0.3531(0.3531) Grad Norm: 20.8714  LR: 6.5674e-04  
Epoch: [20][31/32] Data 0.126 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0968(0.8033) Grad Norm: 3.3553  LR: 6.5674e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  4  0  3 12]
preds: [ 0  4  0  4 10]
Epoch 20 - avg_train_loss: 0.8033  lr: 6.5674e-04  time: 50s
Epoch 20 - Score: 0.4962


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [21][0/32] Data 0.141 (0.141) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0123(0.0123) Grad Norm: 0.5740  LR: 6.2661e-04  
Epoch: [21][31/32] Data 0.126 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0290(0.7159) Grad Norm: 4.1701  LR: 6.2661e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  4  0  3 12]
preds: [2 4 0 3 8]
Epoch 21 - avg_train_loss: 0.7159  lr: 6.2661e-04  time: 49s
Epoch 21 - Score: 0.5725


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [22][0/32] Data 0.135 (0.135) Elapsed 0m 1s (remain 0m 44s) Loss: 1.4130(1.4130) Grad Norm: 15.2360  LR: 5.9598e-04  
Epoch: [22][31/32] Data 0.115 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0054(0.8086) Grad Norm: 0.2166  LR: 5.9598e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  4  0  3 12]
preds: [11  4  0  3  8]
Epoch 22 - avg_train_loss: 0.8086  lr: 5.9598e-04  time: 50s
Epoch 22 - Score: 0.5802


EVAL: [8/9] Data 0.013 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [23][0/32] Data 0.137 (0.137) Elapsed 0m 1s (remain 0m 44s) Loss: 1.2907(1.2907) Grad Norm: 13.8826  LR: 5.6498e-04  
Epoch: [23][31/32] Data 0.116 (0.120) Elapsed 0m 45s (remain 0m 0s) Loss: 1.2257(0.7115) Grad Norm: 6.0390  LR: 5.6498e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  4  0  3 12]
preds: [11  4  0  3  8]
Epoch 23 - avg_train_loss: 0.7115  lr: 5.6498e-04  time: 49s
Epoch 23 - Score: 0.6260
Epoch 23 - Save Best Score: 0.6260 Model


EVAL: [8/9] Data 0.013 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [24][0/32] Data 0.159 (0.159) Elapsed 0m 1s (remain 0m 45s) Loss: 1.0673(1.0673) Grad Norm: 5.7351  LR: 5.3373e-04  
Epoch: [24][31/32] Data 0.124 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 1.2362(0.6878) Grad Norm: 5.9028  LR: 5.3373e-04  
EVAL: [0/9] Data 0.067 (0.067) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  4  0  3 12]
preds: [2 4 0 3 8]
Epoch 24 - avg_train_loss: 0.6878  lr: 5.3373e-04  time: 49s
Epoch 24 - Score: 0.5725


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [25][0/32] Data 0.127 (0.127) Elapsed 0m 1s (remain 0m 43s) Loss: 0.2192(0.2192) Grad Norm: 9.6155  LR: 5.0236e-04  
Epoch: [25][31/32] Data 0.124 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.1741(0.5913) Grad Norm: 7.8509  LR: 5.0236e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  4  0  3 12]
preds: [0 0 0 3 8]
Epoch 25 - avg_train_loss: 0.5913  lr: 5.0236e-04  time: 50s
Epoch 25 - Score: 0.5802


EVAL: [8/9] Data 0.017 (0.069) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [26][0/32] Data 0.148 (0.148) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0946(0.0946) Grad Norm: 3.3670  LR: 4.7099e-04  
Epoch: [26][31/32] Data 0.124 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 1.0950(0.4949) Grad Norm: 8.6483  LR: 4.7099e-04  
EVAL: [0/9] Data 0.079 (0.079) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  4  0  3 12]
preds: [2 4 0 3 8]
Epoch 26 - avg_train_loss: 0.4949  lr: 4.7099e-04  time: 50s
Epoch 26 - Score: 0.5725


EVAL: [8/9] Data 0.014 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [27][0/32] Data 0.145 (0.145) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0105(0.0105) Grad Norm: 0.5116  LR: 4.3974e-04  
Epoch: [27][31/32] Data 0.130 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0838(0.6047) Grad Norm: 4.5173  LR: 4.3974e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  4  0  3 12]
preds: [6 4 0 3 8]
Epoch 27 - avg_train_loss: 0.6047  lr: 4.3974e-04  time: 50s
Epoch 27 - Score: 0.6183


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [28][0/32] Data 0.146 (0.146) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0145(0.0145) Grad Norm: 0.6433  LR: 4.0874e-04  
Epoch: [28][31/32] Data 0.130 (0.128) Elapsed 0m 45s (remain 0m 0s) Loss: 0.1097(0.6227) Grad Norm: 7.0335  LR: 4.0874e-04  
EVAL: [0/9] Data 0.076 (0.076) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  4  0  3 12]
preds: [5 4 0 3 8]
Epoch 28 - avg_train_loss: 0.6227  lr: 4.0874e-04  time: 50s
Epoch 28 - Score: 0.5878


EVAL: [8/9] Data 0.014 (0.069) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [29][0/32] Data 0.138 (0.138) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0073(0.0073) Grad Norm: 0.7294  LR: 3.7811e-04  
Epoch: [29][31/32] Data 0.122 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0472(0.4728) Grad Norm: 4.5339  LR: 3.7811e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  4  0  3 12]
preds: [11  2  0  3  8]
Epoch 29 - avg_train_loss: 0.4728  lr: 3.7811e-04  time: 50s
Epoch 29 - Score: 0.5878


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [30][0/32] Data 0.120 (0.120) Elapsed 0m 1s (remain 0m 43s) Loss: 1.1461(1.1461) Grad Norm: 9.4012  LR: 3.4797e-04  
Epoch: [30][31/32] Data 0.135 (0.120) Elapsed 0m 45s (remain 0m 0s) Loss: 0.1365(0.4854) Grad Norm: 15.0626  LR: 3.4797e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  4  0  3 12]
preds: [2 4 0 3 8]
Epoch 30 - avg_train_loss: 0.4854  lr: 3.4797e-04  time: 49s
Epoch 30 - Score: 0.5878


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [31][0/32] Data 0.131 (0.131) Elapsed 0m 1s (remain 0m 44s) Loss: 0.8654(0.8654) Grad Norm: 4.4389  LR: 3.1843e-04  
Epoch: [31][31/32] Data 0.130 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0080(0.5081) Grad Norm: 0.6315  LR: 3.1843e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  4  0  3 12]
preds: [2 4 0 3 8]
Epoch 31 - avg_train_loss: 0.5081  lr: 3.1843e-04  time: 49s
Epoch 31 - Score: 0.5954


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [32][0/32] Data 0.128 (0.128) Elapsed 0m 1s (remain 0m 43s) Loss: 1.2311(1.2311) Grad Norm: 6.1045  LR: 2.8962e-04  
Epoch: [32][31/32] Data 0.132 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0210(0.5214) Grad Norm: 3.0516  LR: 2.8962e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  4  0  3 12]
preds: [11  2  0  3  9]
Epoch 32 - avg_train_loss: 0.5214  lr: 2.8962e-04  time: 49s
Epoch 32 - Score: 0.5802


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [33][0/32] Data 0.135 (0.135) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0043(0.0043) Grad Norm: 0.1729  LR: 2.6165e-04  
Epoch: [33][31/32] Data 0.133 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0542(0.3935) Grad Norm: 7.9630  LR: 2.6165e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  4  0  3 12]
preds: [11  2  0  3  8]
Epoch 33 - avg_train_loss: 0.3935  lr: 2.6165e-04  time: 50s
Epoch 33 - Score: 0.5725


EVAL: [8/9] Data 0.015 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [34][0/32] Data 0.137 (0.137) Elapsed 0m 1s (remain 0m 44s) Loss: 0.7691(0.7691) Grad Norm: 4.5375  LR: 2.3463e-04  
Epoch: [34][31/32] Data 0.122 (0.129) Elapsed 0m 45s (remain 0m 0s) Loss: 0.9455(0.5395) Grad Norm: 4.9236  LR: 2.3463e-04  
EVAL: [0/9] Data 0.079 (0.079) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  4  0  3 12]
preds: [11  2  0  3  9]
Epoch 34 - avg_train_loss: 0.5395  lr: 2.3463e-04  time: 50s
Epoch 34 - Score: 0.5954


EVAL: [8/9] Data 0.015 (0.069) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [35][0/32] Data 0.143 (0.143) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0162(0.0162) Grad Norm: 1.7743  LR: 2.0866e-04  
Epoch: [35][31/32] Data 0.118 (0.128) Elapsed 0m 45s (remain 0m 0s) Loss: 0.8096(0.4330) Grad Norm: 4.6626  LR: 2.0866e-04  
EVAL: [0/9] Data 0.076 (0.076) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  4  0  3 12]
preds: [11  2  0  3 10]
Epoch 35 - avg_train_loss: 0.4330  lr: 2.0866e-04  time: 50s
Epoch 35 - Score: 0.5954


EVAL: [8/9] Data 0.015 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [36][0/32] Data 0.122 (0.122) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0084(0.0084) Grad Norm: 1.2564  LR: 1.8385e-04  
Epoch: [36][31/32] Data 0.130 (0.128) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0024(0.4563) Grad Norm: 0.0865  LR: 1.8385e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  4  0  3 12]
preds: [6 2 0 3 8]
Epoch 36 - avg_train_loss: 0.4563  lr: 1.8385e-04  time: 50s
Epoch 36 - Score: 0.5954


EVAL: [8/9] Data 0.016 (0.069) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [37][0/32] Data 0.148 (0.148) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0044(0.0044) Grad Norm: 0.3236  LR: 1.6030e-04  
Epoch: [37][31/32] Data 0.117 (0.127) Elapsed 0m 45s (remain 0m 0s) Loss: 0.6422(0.5626) Grad Norm: 2.0999  LR: 1.6030e-04  
EVAL: [0/9] Data 0.075 (0.075) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  4  0  3 12]
preds: [11  2  0  3 10]
Epoch 37 - avg_train_loss: 0.5626  lr: 1.6030e-04  time: 50s
Epoch 37 - Score: 0.6031


EVAL: [8/9] Data 0.014 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [38][0/32] Data 0.137 (0.137) Elapsed 0m 1s (remain 0m 44s) Loss: 0.6391(0.6391) Grad Norm: 3.4616  LR: 1.3809e-04  
Epoch: [38][31/32] Data 0.127 (0.127) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0007(0.4998) Grad Norm: 0.0443  LR: 1.3809e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  4  0  3 12]
preds: [ 6  4  0  3 10]
Epoch 38 - avg_train_loss: 0.4998  lr: 1.3809e-04  time: 50s
Epoch 38 - Score: 0.6489
Epoch 38 - Save Best Score: 0.6489 Model


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [39][0/32] Data 0.137 (0.137) Elapsed 0m 1s (remain 0m 44s) Loss: 1.0542(1.0542) Grad Norm: 3.7367  LR: 1.1732e-04  
Epoch: [39][31/32] Data 0.118 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0053(0.4274) Grad Norm: 0.4297  LR: 1.1732e-04  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  4  0  3 12]
preds: [11  4  0  3  8]
Epoch 39 - avg_train_loss: 0.4274  lr: 1.1732e-04  time: 49s
Epoch 39 - Score: 0.6336


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [40][0/32] Data 0.123 (0.123) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0003(0.0003) Grad Norm: 0.0111  LR: 9.8058e-05  
Epoch: [40][31/32] Data 0.110 (0.119) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0004(0.3611) Grad Norm: 0.0125  LR: 9.8058e-05  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  4  0  3 12]
preds: [ 6  2  0  3 10]
Epoch 40 - avg_train_loss: 0.3611  lr: 9.8058e-05  time: 49s
Epoch 40 - Score: 0.6336


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [41][0/32] Data 0.143 (0.143) Elapsed 0m 1s (remain 0m 44s) Loss: 1.0162(1.0162) Grad Norm: 4.6662  LR: 8.0390e-05  
Epoch: [41][31/32] Data 0.118 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0007(0.3916) Grad Norm: 0.0455  LR: 8.0390e-05  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  4  0  3 12]
preds: [ 6  2  0  3 10]
Epoch 41 - avg_train_loss: 0.3916  lr: 8.0390e-05  time: 49s
Epoch 41 - Score: 0.6260


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [42][0/32] Data 0.147 (0.147) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0012(0.0012) Grad Norm: 0.0905  LR: 6.4381e-05  
Epoch: [42][31/32] Data 0.122 (0.121) Elapsed 0m 44s (remain 0m 0s) Loss: 0.7848(0.4311) Grad Norm: 3.7296  LR: 6.4381e-05  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  4  0  3 12]
preds: [ 6  4  0  3 10]
Epoch 42 - avg_train_loss: 0.4311  lr: 6.4381e-05  time: 49s
Epoch 42 - Score: 0.6336


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [43][0/32] Data 0.129 (0.129) Elapsed 0m 1s (remain 0m 43s) Loss: 0.8199(0.8199) Grad Norm: 3.6864  LR: 5.0093e-05  
Epoch: [43][31/32] Data 0.124 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 0.6110(0.5564) Grad Norm: 3.4091  LR: 5.0093e-05  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  4  0  3 12]
preds: [ 6  4  0  3 12]
Epoch 43 - avg_train_loss: 0.5564  lr: 5.0093e-05  time: 49s
Epoch 43 - Score: 0.6412


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [44][0/32] Data 0.130 (0.130) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0009(0.0009) Grad Norm: 0.1245  LR: 3.7578e-05  
Epoch: [44][31/32] Data 0.116 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0030(0.4106) Grad Norm: 0.2270  LR: 3.7578e-05  
EVAL: [0/9] Data 0.075 (0.075) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  4  0  3 12]
preds: [ 6  2  0  3 12]
Epoch 44 - avg_train_loss: 0.4106  lr: 3.7578e-05  time: 49s
Epoch 44 - Score: 0.6641
Epoch 44 - Save Best Score: 0.6641 Model


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [45][0/32] Data 0.135 (0.135) Elapsed 0m 1s (remain 0m 44s) Loss: 0.6757(0.6757) Grad Norm: 3.5781  LR: 2.6881e-05  
Epoch: [45][31/32] Data 0.122 (0.121) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0061(0.4503) Grad Norm: 0.6464  LR: 2.6881e-05  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  4  0  3 12]
preds: [ 6  4  0  3 12]
Epoch 45 - avg_train_loss: 0.4503  lr: 2.6881e-05  time: 49s
Epoch 45 - Score: 0.6718
Epoch 45 - Save Best Score: 0.6718 Model


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [46][0/32] Data 0.137 (0.137) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0012(0.0012) Grad Norm: 0.1386  LR: 1.8039e-05  
Epoch: [46][31/32] Data 0.117 (0.121) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0004(0.0014) Grad Norm: 0.0156  LR: 1.8039e-05  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  4  0  3 12]
preds: [ 6  4  0  3 12]
Epoch 46 - avg_train_loss: 0.0014  lr: 1.8039e-05  time: 49s
Epoch 46 - Score: 0.6718


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [47][0/32] Data 0.133 (0.133) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0005(0.0005) Grad Norm: 0.0559  LR: 1.1073e-05  
Epoch: [47][31/32] Data 0.122 (0.120) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0041(0.0036) Grad Norm: 0.2660  LR: 1.1073e-05  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  4  0  3 12]
preds: [ 6  4  0  3 12]
Epoch 47 - avg_train_loss: 0.0036  lr: 1.1073e-05  time: 49s
Epoch 47 - Score: 0.6794
Epoch 47 - Save Best Score: 0.6794 Model


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [48][0/32] Data 0.144 (0.144) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0014(0.0014) Grad Norm: 0.0803  LR: 5.9882e-06  
Epoch: [48][31/32] Data 0.119 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0004(0.0023) Grad Norm: 0.0272  LR: 5.9882e-06  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  4  0  3 12]
preds: [ 6  4  0  3 12]
Epoch 48 - avg_train_loss: 0.0023  lr: 5.9882e-06  time: 49s
Epoch 48 - Score: 0.6794


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [49][0/32] Data 0.120 (0.120) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0004(0.0004) Grad Norm: 0.0164  LR: 2.7534e-06  
Epoch: [49][31/32] Data 0.122 (0.121) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0009(0.0013) Grad Norm: 0.0319  LR: 2.7534e-06  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  4  0  3 12]
preds: [ 6  4  0  3 12]
Epoch 49 - avg_train_loss: 0.0013  lr: 2.7534e-06  time: 49s
Epoch 49 - Score: 0.6794


EVAL: [8/9] Data 0.013 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [50][0/32] Data 0.128 (0.128) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0003(0.0003) Grad Norm: 0.0100  LR: 1.2467e-06  
Epoch: [50][31/32] Data 0.130 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0006(0.0009) Grad Norm: 0.0300  LR: 1.2467e-06  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 6  4  0  3 12]
preds: [ 6  4  0  3 12]
Epoch 50 - avg_train_loss: 0.0009  lr: 1.2467e-06  time: 49s
Epoch 50 - Score: 0.6794


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [1][0/32] Data 0.106 (0.106) Elapsed 0m 1s (remain 0m 42s) Loss: 2.6616(2.6616) Grad Norm: inf  LR: 1.0000e-03  
Epoch: [1][31/32] Data 0.126 (0.118) Elapsed 0m 44s (remain 0m 0s) Loss: 2.6445(2.5474) Grad Norm: 10.3461  LR: 1.0000e-03  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 2  4  2  2 12]
preds: [2 2 2 2 2]
Epoch 1 - avg_train_loss: 2.5474  lr: 1.0000e-03  time: 49s
Epoch 1 - Score: 0.2672
Epoch 1 - Save Best Score: 0.2672 Model


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [2][0/32] Data 0.125 (0.125) Elapsed 0m 1s (remain 0m 44s) Loss: 2.4899(2.4899) Grad Norm: 13.4748  LR: 9.9803e-04  
Epoch: [2][31/32] Data 0.119 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 2.3279(2.2775) Grad Norm: 13.5316  LR: 9.9803e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 2  4  2  2 12]
preds: [2 2 2 2 2]
Epoch 2 - avg_train_loss: 2.2775  lr: 9.9803e-04  time: 49s
Epoch 2 - Score: 0.4580
Epoch 2 - Save Best Score: 0.4580 Model


EVAL: [8/9] Data 0.013 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [3][0/32] Data 0.135 (0.135) Elapsed 0m 1s (remain 0m 45s) Loss: 2.0085(2.0085) Grad Norm: 20.6053  LR: 9.9312e-04  
Epoch: [3][31/32] Data 0.129 (0.120) Elapsed 0m 45s (remain 0m 0s) Loss: 2.2147(2.0377) Grad Norm: 12.2927  LR: 9.9312e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 2  4  2  2 12]
preds: [ 2  7  0  2 12]
Epoch 3 - avg_train_loss: 2.0377  lr: 9.9312e-04  time: 49s
Epoch 3 - Score: 0.5420
Epoch 3 - Save Best Score: 0.5420 Model


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [4][0/32] Data 0.148 (0.148) Elapsed 0m 1s (remain 0m 45s) Loss: 1.1911(1.1911) Grad Norm: 10.2186  LR: 9.8627e-04  
Epoch: [4][31/32] Data 0.123 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 1.7981(1.8128) Grad Norm: 7.8165  LR: 9.8627e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 2  4  2  2 12]
preds: [2 2 0 2 0]
Epoch 4 - avg_train_loss: 1.8128  lr: 9.8627e-04  time: 49s
Epoch 4 - Score: 0.5496
Epoch 4 - Save Best Score: 0.5496 Model


EVAL: [8/9] Data 0.013 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [5][0/32] Data 0.130 (0.130) Elapsed 0m 1s (remain 0m 44s) Loss: 1.8498(1.8498) Grad Norm: 7.2087  LR: 9.7751e-04  
Epoch: [5][31/32] Data 0.119 (0.120) Elapsed 0m 45s (remain 0m 0s) Loss: 1.8125(1.6581) Grad Norm: 7.8859  LR: 9.7751e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 2  4  2  2 12]
preds: [2 2 5 2 9]
Epoch 5 - avg_train_loss: 1.6581  lr: 9.7751e-04  time: 49s
Epoch 5 - Score: 0.4962


EVAL: [8/9] Data 0.013 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [6][0/32] Data 0.126 (0.126) Elapsed 0m 1s (remain 0m 44s) Loss: 2.2166(2.2166) Grad Norm: 10.9080  LR: 9.6688e-04  
Epoch: [6][31/32] Data 0.123 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 0.8136(1.3136) Grad Norm: 10.9984  LR: 9.6688e-04  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 2  4  2  2 12]
preds: [ 2  4 11  2  9]
Epoch 6 - avg_train_loss: 1.3136  lr: 9.6688e-04  time: 49s
Epoch 6 - Score: 0.5496


EVAL: [8/9] Data 0.013 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [7][0/32] Data 0.125 (0.125) Elapsed 0m 1s (remain 0m 43s) Loss: 1.2890(1.2890) Grad Norm: 7.5729  LR: 9.5441e-04  
Epoch: [7][31/32] Data 0.122 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.8786(1.1910) Grad Norm: 9.9963  LR: 9.5441e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 2  4  2  2 12]
preds: [2 3 5 2 9]
Epoch 7 - avg_train_loss: 1.1910  lr: 9.5441e-04  time: 49s
Epoch 7 - Score: 0.5725
Epoch 7 - Save Best Score: 0.5725 Model


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [8][0/32] Data 0.129 (0.129) Elapsed 0m 1s (remain 0m 44s) Loss: 1.7497(1.7497) Grad Norm: 9.1813  LR: 9.4016e-04  
Epoch: [8][31/32] Data 0.123 (0.120) Elapsed 0m 45s (remain 0m 0s) Loss: 1.6115(1.2390) Grad Norm: 10.1227  LR: 9.4016e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 2  4  2  2 12]
preds: [2 4 5 2 9]
Epoch 8 - avg_train_loss: 1.2390  lr: 9.4016e-04  time: 49s
Epoch 8 - Score: 0.6183
Epoch 8 - Save Best Score: 0.6183 Model


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [9][0/32] Data 0.167 (0.167) Elapsed 0m 1s (remain 0m 45s) Loss: 1.2832(1.2832) Grad Norm: 8.7224  LR: 9.2418e-04  
Epoch: [9][31/32] Data 0.123 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 1.3078(1.2976) Grad Norm: 9.5605  LR: 9.2418e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 2  4  2  2 12]
preds: [ 2  4 11  2  9]
Epoch 9 - avg_train_loss: 1.2976  lr: 9.2418e-04  time: 49s
Epoch 9 - Score: 0.5344


EVAL: [8/9] Data 0.014 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [10][0/32] Data 0.126 (0.126) Elapsed 0m 1s (remain 0m 43s) Loss: 0.9889(0.9889) Grad Norm: 6.4948  LR: 9.0654e-04  
Epoch: [10][31/32] Data 0.115 (0.120) Elapsed 0m 45s (remain 0m 0s) Loss: 0.6736(0.9632) Grad Norm: 11.5128  LR: 9.0654e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 2  4  2  2 12]
preds: [2 3 5 2 2]
Epoch 10 - avg_train_loss: 0.9632  lr: 9.0654e-04  time: 49s
Epoch 10 - Score: 0.5344


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [11][0/32] Data 0.140 (0.140) Elapsed 0m 1s (remain 0m 44s) Loss: 1.4440(1.4440) Grad Norm: 8.6129  LR: 8.8730e-04  
Epoch: [11][31/32] Data 0.135 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 0.4059(1.0065) Grad Norm: 10.2023  LR: 8.8730e-04  
EVAL: [0/9] Data 0.075 (0.075) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 2  4  2  2 12]
preds: [2 2 1 2 2]
Epoch 11 - avg_train_loss: 1.0065  lr: 8.8730e-04  time: 49s
Epoch 11 - Score: 0.5649


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [12][0/32] Data 0.128 (0.128) Elapsed 0m 1s (remain 0m 43s) Loss: 1.5524(1.5524) Grad Norm: 11.0822  LR: 8.6655e-04  
Epoch: [12][31/32] Data 0.118 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 0.3770(0.9031) Grad Norm: 11.2329  LR: 8.6655e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 2  4  2  2 12]
preds: [ 2  1 11  2 11]
Epoch 12 - avg_train_loss: 0.9031  lr: 8.6655e-04  time: 49s
Epoch 12 - Score: 0.5191


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [13][0/32] Data 0.143 (0.143) Elapsed 0m 1s (remain 0m 44s) Loss: 0.3202(0.3202) Grad Norm: 12.2681  LR: 8.4436e-04  
Epoch: [13][31/32] Data 0.120 (0.118) Elapsed 0m 45s (remain 0m 0s) Loss: 0.3730(0.7148) Grad Norm: 22.5621  LR: 8.4436e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 2  4  2  2 12]
preds: [2 4 5 2 2]
Epoch 13 - avg_train_loss: 0.7148  lr: 8.4436e-04  time: 49s
Epoch 13 - Score: 0.5573


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [14][0/32] Data 0.127 (0.127) Elapsed 0m 1s (remain 0m 43s) Loss: 1.2438(1.2438) Grad Norm: 9.9997  LR: 8.2081e-04  
Epoch: [14][31/32] Data 0.105 (0.119) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0907(0.8447) Grad Norm: 3.1266  LR: 8.2081e-04  
EVAL: [0/9] Data 0.065 (0.065) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 2  4  2  2 12]
preds: [ 2 11  5  2 11]
Epoch 14 - avg_train_loss: 0.8447  lr: 8.2081e-04  time: 49s
Epoch 14 - Score: 0.5649


EVAL: [8/9] Data 0.013 (0.061) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [15][0/32] Data 0.128 (0.128) Elapsed 0m 1s (remain 0m 43s) Loss: 1.2262(1.2262) Grad Norm: 7.1274  LR: 7.9601e-04  
Epoch: [15][31/32] Data 0.121 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 1.5067(1.0720) Grad Norm: 7.6895  LR: 7.9601e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 2  4  2  2 12]
preds: [ 2  3  9  2 11]
Epoch 15 - avg_train_loss: 1.0720  lr: 7.9601e-04  time: 49s
Epoch 15 - Score: 0.6107


EVAL: [8/9] Data 0.013 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [16][0/32] Data 0.142 (0.142) Elapsed 0m 1s (remain 0m 44s) Loss: 0.2515(0.2515) Grad Norm: 7.0665  LR: 7.7006e-04  
Epoch: [16][31/32] Data 0.106 (0.119) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0819(0.9185) Grad Norm: 6.1615  LR: 7.7006e-04  
EVAL: [0/9] Data 0.068 (0.068) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 2  4  2  2 12]
preds: [2 2 1 2 2]
Epoch 16 - avg_train_loss: 0.9185  lr: 7.7006e-04  time: 49s
Epoch 16 - Score: 0.6031


EVAL: [8/9] Data 0.013 (0.063) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [17][0/32] Data 0.134 (0.134) Elapsed 0m 1s (remain 0m 43s) Loss: 0.1170(0.1170) Grad Norm: 4.0727  LR: 7.4304e-04  
Epoch: [17][31/32] Data 0.109 (0.116) Elapsed 0m 44s (remain 0m 0s) Loss: 1.6028(0.7749) Grad Norm: 9.5397  LR: 7.4304e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 2  4  2  2 12]
preds: [2 2 5 2 2]
Epoch 17 - avg_train_loss: 0.7749  lr: 7.4304e-04  time: 49s
Epoch 17 - Score: 0.5573


EVAL: [8/9] Data 0.013 (0.063) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [18][0/32] Data 0.126 (0.126) Elapsed 0m 1s (remain 0m 43s) Loss: 1.6583(1.6583) Grad Norm: 9.3952  LR: 7.1508e-04  
Epoch: [18][31/32] Data 0.122 (0.118) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0378(0.7114) Grad Norm: 1.6573  LR: 7.1508e-04  
EVAL: [0/9] Data 0.066 (0.066) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 2  4  2  2 12]
preds: [ 2  2  1  2 12]
Epoch 18 - avg_train_loss: 0.7114  lr: 7.1508e-04  time: 49s
Epoch 18 - Score: 0.6336
Epoch 18 - Save Best Score: 0.6336 Model


EVAL: [8/9] Data 0.014 (0.062) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [19][0/32] Data 0.126 (0.126) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0278(0.0278) Grad Norm: 1.5498  LR: 6.8627e-04  
Epoch: [19][31/32] Data 0.102 (0.116) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0847(0.7176) Grad Norm: 3.4106  LR: 6.8627e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 2  4  2  2 12]
preds: [2 0 5 2 4]
Epoch 19 - avg_train_loss: 0.7176  lr: 6.8627e-04  time: 49s
Epoch 19 - Score: 0.6031


EVAL: [8/9] Data 0.013 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [20][0/32] Data 0.128 (0.128) Elapsed 0m 1s (remain 0m 43s) Loss: 1.3799(1.3799) Grad Norm: 5.8959  LR: 6.5674e-04  
Epoch: [20][31/32] Data 0.114 (0.119) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0587(0.7959) Grad Norm: 4.3002  LR: 6.5674e-04  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 2  4  2  2 12]
preds: [ 2  2  1  2 11]
Epoch 20 - avg_train_loss: 0.7959  lr: 6.5674e-04  time: 49s
Epoch 20 - Score: 0.5649


EVAL: [8/9] Data 0.014 (0.063) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [21][0/32] Data 0.126 (0.126) Elapsed 0m 1s (remain 0m 44s) Loss: 1.4481(1.4481) Grad Norm: 11.4920  LR: 6.2661e-04  
Epoch: [21][31/32] Data 0.120 (0.118) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0445(0.7090) Grad Norm: 2.5165  LR: 6.2661e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 2  4  2  2 12]
preds: [2 4 5 2 9]
Epoch 21 - avg_train_loss: 0.7090  lr: 6.2661e-04  time: 49s
Epoch 21 - Score: 0.5725


EVAL: [8/9] Data 0.014 (0.063) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [22][0/32] Data 0.139 (0.139) Elapsed 0m 1s (remain 0m 44s) Loss: 1.0022(1.0022) Grad Norm: 8.8008  LR: 5.9598e-04  
Epoch: [22][31/32] Data 0.113 (0.117) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0297(0.6420) Grad Norm: 2.1341  LR: 5.9598e-04  
EVAL: [0/9] Data 0.066 (0.066) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 2  4  2  2 12]
preds: [2 4 2 2 2]
Epoch 22 - avg_train_loss: 0.6420  lr: 5.9598e-04  time: 49s
Epoch 22 - Score: 0.6031


EVAL: [8/9] Data 0.014 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [23][0/32] Data 0.134 (0.134) Elapsed 0m 1s (remain 0m 43s) Loss: 1.1054(1.1054) Grad Norm: 7.1432  LR: 5.6498e-04  
Epoch: [23][31/32] Data 0.122 (0.118) Elapsed 0m 44s (remain 0m 0s) Loss: 0.3806(0.7292) Grad Norm: 14.8521  LR: 5.6498e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 2  4  2  2 12]
preds: [ 2  3  5  2 11]
Epoch 23 - avg_train_loss: 0.7292  lr: 5.6498e-04  time: 49s
Epoch 23 - Score: 0.5878


EVAL: [8/9] Data 0.013 (0.063) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [24][0/32] Data 0.127 (0.127) Elapsed 0m 1s (remain 0m 43s) Loss: 1.2229(1.2229) Grad Norm: 6.0625  LR: 5.3373e-04  
Epoch: [24][31/32] Data 0.116 (0.116) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0091(0.7295) Grad Norm: 0.4639  LR: 5.3373e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 2  4  2  2 12]
preds: [2 4 5 2 2]
Epoch 24 - avg_train_loss: 0.7295  lr: 5.3373e-04  time: 49s
Epoch 24 - Score: 0.5802


EVAL: [8/9] Data 0.012 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [25][0/32] Data 0.120 (0.120) Elapsed 0m 1s (remain 0m 43s) Loss: 0.3237(0.3237) Grad Norm: 7.2845  LR: 5.0236e-04  
Epoch: [25][31/32] Data 0.113 (0.116) Elapsed 0m 44s (remain 0m 0s) Loss: 1.1373(0.6911) Grad Norm: 8.3549  LR: 5.0236e-04  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 2  4  2  2 12]
preds: [ 2  3  5  2 11]
Epoch 25 - avg_train_loss: 0.6911  lr: 5.0236e-04  time: 49s
Epoch 25 - Score: 0.5649


EVAL: [8/9] Data 0.014 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [26][0/32] Data 0.122 (0.122) Elapsed 0m 1s (remain 0m 44s) Loss: 1.6875(1.6875) Grad Norm: 11.3738  LR: 4.7099e-04  
Epoch: [26][31/32] Data 0.116 (0.118) Elapsed 0m 44s (remain 0m 0s) Loss: 0.9104(0.6553) Grad Norm: 4.6005  LR: 4.7099e-04  
EVAL: [0/9] Data 0.066 (0.066) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 2  4  2  2 12]
preds: [ 2  4 11  2 11]
Epoch 26 - avg_train_loss: 0.6553  lr: 4.7099e-04  time: 49s
Epoch 26 - Score: 0.6031


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [27][0/32] Data 0.116 (0.116) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0729(0.0729) Grad Norm: 7.8134  LR: 4.3974e-04  
Epoch: [27][31/32] Data 0.110 (0.118) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0614(0.4998) Grad Norm: 7.5015  LR: 4.3974e-04  
EVAL: [0/9] Data 0.068 (0.068) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 2  4  2  2 12]
preds: [ 2  9  5  2 11]
Epoch 27 - avg_train_loss: 0.4998  lr: 4.3974e-04  time: 49s
Epoch 27 - Score: 0.6183


EVAL: [8/9] Data 0.015 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [28][0/32] Data 0.123 (0.123) Elapsed 0m 1s (remain 0m 43s) Loss: 1.0497(1.0497) Grad Norm: 4.3005  LR: 4.0874e-04  
Epoch: [28][31/32] Data 0.130 (0.118) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0478(0.5228) Grad Norm: 2.4975  LR: 4.0874e-04  
EVAL: [0/9] Data 0.068 (0.068) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 2  4  2  2 12]
preds: [ 2  2  5  2 11]
Epoch 28 - avg_train_loss: 0.5228  lr: 4.0874e-04  time: 49s
Epoch 28 - Score: 0.5649


EVAL: [8/9] Data 0.013 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [29][0/32] Data 0.131 (0.131) Elapsed 0m 1s (remain 0m 44s) Loss: 1.0107(1.0107) Grad Norm: 6.1260  LR: 3.7811e-04  
Epoch: [29][31/32] Data 0.115 (0.116) Elapsed 0m 44s (remain 0m 0s) Loss: 0.8673(0.6472) Grad Norm: 3.9113  LR: 3.7811e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 2  4  2  2 12]
preds: [ 2  4  5  2 11]
Epoch 29 - avg_train_loss: 0.6472  lr: 3.7811e-04  time: 49s
Epoch 29 - Score: 0.6260


EVAL: [8/9] Data 0.013 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [30][0/32] Data 0.140 (0.140) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0029(0.0029) Grad Norm: 0.1541  LR: 3.4797e-04  
Epoch: [30][31/32] Data 0.129 (0.118) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0619(0.4187) Grad Norm: 4.8055  LR: 3.4797e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 2  4  2  2 12]
preds: [ 2  0  5  2 11]
Epoch 30 - avg_train_loss: 0.4187  lr: 3.4797e-04  time: 49s
Epoch 30 - Score: 0.6107


EVAL: [8/9] Data 0.016 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [31][0/32] Data 0.137 (0.137) Elapsed 0m 1s (remain 0m 44s) Loss: 1.1657(1.1657) Grad Norm: 5.1558  LR: 3.1843e-04  
Epoch: [31][31/32] Data 0.111 (0.116) Elapsed 0m 44s (remain 0m 0s) Loss: 1.1252(0.5820) Grad Norm: 5.0561  LR: 3.1843e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 2  4  2  2 12]
preds: [ 2 11  5  2  9]
Epoch 31 - avg_train_loss: 0.5820  lr: 3.1843e-04  time: 49s
Epoch 31 - Score: 0.6183


EVAL: [8/9] Data 0.014 (0.063) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [32][0/32] Data 0.134 (0.134) Elapsed 0m 1s (remain 0m 43s) Loss: 1.1643(1.1643) Grad Norm: 7.9312  LR: 2.8962e-04  
Epoch: [32][31/32] Data 0.117 (0.117) Elapsed 0m 44s (remain 0m 0s) Loss: 1.0638(0.6239) Grad Norm: 5.7973  LR: 2.8962e-04  
EVAL: [0/9] Data 0.066 (0.066) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 2  4  2  2 12]
preds: [ 2  9  1  2 11]
Epoch 32 - avg_train_loss: 0.6239  lr: 2.8962e-04  time: 49s
Epoch 32 - Score: 0.6107


EVAL: [8/9] Data 0.013 (0.062) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [33][0/32] Data 0.126 (0.126) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0011(0.0011) Grad Norm: 0.0385  LR: 2.6165e-04  
Epoch: [33][31/32] Data 0.114 (0.117) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0040(0.4569) Grad Norm: 0.1468  LR: 2.6165e-04  
EVAL: [0/9] Data 0.065 (0.065) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 2  4  2  2 12]
preds: [ 2  9  5  2 11]
Epoch 33 - avg_train_loss: 0.4569  lr: 2.6165e-04  time: 49s
Epoch 33 - Score: 0.5954


EVAL: [8/9] Data 0.013 (0.061) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [34][0/32] Data 0.126 (0.126) Elapsed 0m 1s (remain 0m 43s) Loss: 1.0101(1.0101) Grad Norm: 7.4487  LR: 2.3463e-04  
Epoch: [34][31/32] Data 0.111 (0.116) Elapsed 0m 44s (remain 0m 0s) Loss: 0.9714(0.6533) Grad Norm: 4.5995  LR: 2.3463e-04  
EVAL: [0/9] Data 0.068 (0.068) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 2  4  2  2 12]
preds: [ 2 11  5  2  9]
Epoch 34 - avg_train_loss: 0.6533  lr: 2.3463e-04  time: 49s
Epoch 34 - Score: 0.6336


EVAL: [8/9] Data 0.013 (0.062) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [35][0/32] Data 0.125 (0.125) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0026(0.0026) Grad Norm: 0.1561  LR: 2.0866e-04  
Epoch: [35][31/32] Data 0.115 (0.115) Elapsed 0m 44s (remain 0m 0s) Loss: 0.8246(0.5726) Grad Norm: 3.8486  LR: 2.0866e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 2  4  2  2 12]
preds: [ 2 11  5  2  9]
Epoch 35 - avg_train_loss: 0.5726  lr: 2.0866e-04  time: 49s
Epoch 35 - Score: 0.6336


EVAL: [8/9] Data 0.014 (0.063) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [36][0/32] Data 0.124 (0.124) Elapsed 0m 1s (remain 0m 43s) Loss: 0.9243(0.9243) Grad Norm: 4.0953  LR: 1.8385e-04  
Epoch: [36][31/32] Data 0.120 (0.120) Elapsed 0m 45s (remain 0m 0s) Loss: 0.7937(0.5313) Grad Norm: 6.4369  LR: 1.8385e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 2  4  2  2 12]
preds: [2 9 5 2 9]
Epoch 36 - avg_train_loss: 0.5313  lr: 1.8385e-04  time: 49s
Epoch 36 - Score: 0.6336


EVAL: [8/9] Data 0.013 (0.063) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [37][0/32] Data 0.141 (0.141) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0017(0.0017) Grad Norm: 0.0568  LR: 1.6030e-04  
Epoch: [37][31/32] Data 0.117 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 0.7593(0.4516) Grad Norm: 3.8979  LR: 1.6030e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 2  4  2  2 12]
preds: [2 9 5 2 9]
Epoch 37 - avg_train_loss: 0.4516  lr: 1.6030e-04  time: 49s
Epoch 37 - Score: 0.6031


EVAL: [8/9] Data 0.013 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [38][0/32] Data 0.126 (0.126) Elapsed 0m 1s (remain 0m 43s) Loss: 1.0183(1.0183) Grad Norm: 6.0028  LR: 1.3809e-04  
Epoch: [38][31/32] Data 0.123 (0.119) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0008(0.4744) Grad Norm: 0.0539  LR: 1.3809e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 2  4  2  2 12]
preds: [2 9 5 2 9]
Epoch 38 - avg_train_loss: 0.4744  lr: 1.3809e-04  time: 49s
Epoch 38 - Score: 0.6183


EVAL: [8/9] Data 0.014 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [39][0/32] Data 0.139 (0.139) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0026(0.0026) Grad Norm: 0.5359  LR: 1.1732e-04  
Epoch: [39][31/32] Data 0.104 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.7290(0.4833) Grad Norm: 3.5892  LR: 1.1732e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 2  4  2  2 12]
preds: [2 4 5 2 9]
Epoch 39 - avg_train_loss: 0.4833  lr: 1.1732e-04  time: 49s
Epoch 39 - Score: 0.6260


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [40][0/32] Data 0.146 (0.146) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0044(0.0044) Grad Norm: 0.3566  LR: 9.8058e-05  
Epoch: [40][31/32] Data 0.114 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0029(0.2785) Grad Norm: 0.1667  LR: 9.8058e-05  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 2  4  2  2 12]
preds: [2 4 5 2 9]
Epoch 40 - avg_train_loss: 0.2785  lr: 9.8058e-05  time: 49s
Epoch 40 - Score: 0.6107


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [41][0/32] Data 0.141 (0.141) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0016(0.0016) Grad Norm: 0.1806  LR: 8.0390e-05  
Epoch: [41][31/32] Data 0.131 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0006(0.3559) Grad Norm: 0.0144  LR: 8.0390e-05  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 2  4  2  2 12]
preds: [2 4 5 2 9]
Epoch 41 - avg_train_loss: 0.3559  lr: 8.0390e-05  time: 49s
Epoch 41 - Score: 0.6183


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [42][0/32] Data 0.131 (0.131) Elapsed 0m 1s (remain 0m 43s) Loss: 0.7674(0.7674) Grad Norm: 4.6810  LR: 6.4381e-05  
Epoch: [42][31/32] Data 0.117 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 0.8954(0.3824) Grad Norm: 6.3226  LR: 6.4381e-05  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 2  4  2  2 12]
preds: [2 4 5 2 9]
Epoch 42 - avg_train_loss: 0.3824  lr: 6.4381e-05  time: 49s
Epoch 42 - Score: 0.6336


EVAL: [8/9] Data 0.013 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [43][0/32] Data 0.133 (0.133) Elapsed 0m 1s (remain 0m 43s) Loss: 0.8107(0.8107) Grad Norm: 4.2991  LR: 5.0093e-05  
Epoch: [43][31/32] Data 0.122 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0007(0.4764) Grad Norm: 0.2522  LR: 5.0093e-05  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 2  4  2  2 12]
preds: [2 4 5 2 9]
Epoch 43 - avg_train_loss: 0.4764  lr: 5.0093e-05  time: 49s
Epoch 43 - Score: 0.6336


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [44][0/32] Data 0.132 (0.132) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0016(0.0016) Grad Norm: 0.0991  LR: 3.7578e-05  
Epoch: [44][31/32] Data 0.123 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.7945(0.3125) Grad Norm: 3.1856  LR: 3.7578e-05  
EVAL: [0/9] Data 0.068 (0.068) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 2  4  2  2 12]
preds: [2 4 5 2 9]
Epoch 44 - avg_train_loss: 0.3125  lr: 3.7578e-05  time: 49s
Epoch 44 - Score: 0.6260


EVAL: [8/9] Data 0.013 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [45][0/32] Data 0.126 (0.126) Elapsed 0m 1s (remain 0m 43s) Loss: 0.8891(0.8891) Grad Norm: 5.7303  LR: 2.6881e-05  
Epoch: [45][31/32] Data 0.115 (0.119) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0021(0.5079) Grad Norm: 0.0850  LR: 2.6881e-05  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 2  4  2  2 12]
preds: [2 4 5 2 9]
Epoch 45 - avg_train_loss: 0.5079  lr: 2.6881e-05  time: 49s
Epoch 45 - Score: 0.6260


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [46][0/32] Data 0.140 (0.140) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0012(0.0012) Grad Norm: 0.0758  LR: 1.8039e-05  
Epoch: [46][31/32] Data 0.135 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0012(0.0053) Grad Norm: 0.5738  LR: 1.8039e-05  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 2  4  2  2 12]
preds: [2 4 5 2 9]
Epoch 46 - avg_train_loss: 0.0053  lr: 1.8039e-05  time: 49s
Epoch 46 - Score: 0.6412
Epoch 46 - Save Best Score: 0.6412 Model


EVAL: [8/9] Data 0.018 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [47][0/32] Data 0.129 (0.129) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0011(0.0011) Grad Norm: 0.1969  LR: 1.1073e-05  
Epoch: [47][31/32] Data 0.112 (0.121) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0009(0.0015) Grad Norm: 0.0374  LR: 1.1073e-05  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 2  4  2  2 12]
preds: [2 4 5 2 9]
Epoch 47 - avg_train_loss: 0.0015  lr: 1.1073e-05  time: 49s
Epoch 47 - Score: 0.6412


EVAL: [8/9] Data 0.014 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [48][0/32] Data 0.143 (0.143) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0007(0.0007) Grad Norm: 0.0647  LR: 5.9882e-06  
Epoch: [48][31/32] Data 0.118 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0007(0.0028) Grad Norm: 0.0231  LR: 5.9882e-06  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 2  4  2  2 12]
preds: [2 4 5 2 9]
Epoch 48 - avg_train_loss: 0.0028  lr: 5.9882e-06  time: 49s
Epoch 48 - Score: 0.6412


EVAL: [8/9] Data 0.013 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [49][0/32] Data 0.117 (0.117) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0007(0.0007) Grad Norm: 0.0247  LR: 2.7534e-06  
Epoch: [49][31/32] Data 0.116 (0.121) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0019(0.0021) Grad Norm: 0.1747  LR: 2.7534e-06  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 2  4  2  2 12]
preds: [2 4 5 2 9]
Epoch 49 - avg_train_loss: 0.0021  lr: 2.7534e-06  time: 49s
Epoch 49 - Score: 0.6412


EVAL: [8/9] Data 0.013 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [50][0/32] Data 0.135 (0.135) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0009(0.0009) Grad Norm: 0.0254  LR: 1.2467e-06  
Epoch: [50][31/32] Data 0.118 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0016(0.0016) Grad Norm: 0.0409  LR: 1.2467e-06  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 2  4  2  2 12]
preds: [2 4 5 2 9]
Epoch 50 - avg_train_loss: 0.0016  lr: 1.2467e-06  time: 49s
Epoch 50 - Score: 0.6412


EVAL: [8/9] Data 0.013 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [1][0/32] Data 0.117 (0.117) Elapsed 0m 1s (remain 0m 42s) Loss: 2.7067(2.7067) Grad Norm: nan  LR: 1.0000e-03  
Epoch: [1][31/32] Data 0.127 (0.121) Elapsed 0m 44s (remain 0m 0s) Loss: 2.4872(2.5297) Grad Norm: 30.1152  LR: 1.0000e-03  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  2  3  7  6]
preds: [2 2 5 2 2]
Epoch 1 - avg_train_loss: 2.5297  lr: 1.0000e-03  time: 49s
Epoch 1 - Score: 0.2214
Epoch 1 - Save Best Score: 0.2214 Model


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [2][0/32] Data 0.142 (0.142) Elapsed 0m 1s (remain 0m 45s) Loss: 2.3084(2.3084) Grad Norm: 9.4021  LR: 9.9803e-04  
Epoch: [2][31/32] Data 0.125 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 2.3240(2.3835) Grad Norm: 15.9964  LR: 9.9803e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  2  3  7  6]
preds: [2 2 5 2 6]
Epoch 2 - avg_train_loss: 2.3835  lr: 9.9803e-04  time: 50s
Epoch 2 - Score: 0.3740
Epoch 2 - Save Best Score: 0.3740 Model


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [3][0/32] Data 0.133 (0.133) Elapsed 0m 1s (remain 0m 44s) Loss: 2.4833(2.4833) Grad Norm: 9.9723  LR: 9.9312e-04  
Epoch: [3][31/32] Data 0.119 (0.120) Elapsed 0m 45s (remain 0m 0s) Loss: 2.3198(2.1722) Grad Norm: 10.8226  LR: 9.9312e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  2  3  7  6]
preds: [2 2 3 0 0]
Epoch 3 - avg_train_loss: 2.1722  lr: 9.9312e-04  time: 49s
Epoch 3 - Score: 0.4351
Epoch 3 - Save Best Score: 0.4351 Model


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [4][0/32] Data 0.143 (0.143) Elapsed 0m 1s (remain 0m 44s) Loss: 1.6744(1.6744) Grad Norm: 14.1765  LR: 9.8627e-04  
Epoch: [4][31/32] Data 0.125 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 2.2652(1.9003) Grad Norm: 9.7997  LR: 9.8627e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  2  3  7  6]
preds: [12  2  1  4  1]
Epoch 4 - avg_train_loss: 1.9003  lr: 9.8627e-04  time: 49s
Epoch 4 - Score: 0.4427
Epoch 4 - Save Best Score: 0.4427 Model


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [5][0/32] Data 0.133 (0.133) Elapsed 0m 1s (remain 0m 45s) Loss: 1.9107(1.9107) Grad Norm: 14.9775  LR: 9.7751e-04  
Epoch: [5][31/32] Data 0.120 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 0.7037(1.6377) Grad Norm: 6.4787  LR: 9.7751e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  2  3  7  6]
preds: [10  2  3  2  0]
Epoch 5 - avg_train_loss: 1.6377  lr: 9.7751e-04  time: 49s
Epoch 5 - Score: 0.5496
Epoch 5 - Save Best Score: 0.5496 Model


EVAL: [8/9] Data 0.013 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [6][0/32] Data 0.133 (0.133) Elapsed 0m 1s (remain 0m 44s) Loss: 1.3511(1.3511) Grad Norm: 8.2803  LR: 9.6688e-04  
Epoch: [6][31/32] Data 0.128 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 1.4969(1.4280) Grad Norm: 14.5991  LR: 9.6688e-04  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  2  3  7  6]
preds: [3 2 3 4 0]
Epoch 6 - avg_train_loss: 1.4280  lr: 9.6688e-04  time: 49s
Epoch 6 - Score: 0.5496


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [7][0/32] Data 0.141 (0.141) Elapsed 0m 1s (remain 0m 44s) Loss: 1.6859(1.6859) Grad Norm: 12.1598  LR: 9.5441e-04  
Epoch: [7][31/32] Data 0.131 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.9894(1.3455) Grad Norm: 11.8796  LR: 9.5441e-04  
EVAL: [0/9] Data 0.075 (0.075) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  2  3  7  6]
preds: [12  2  3  7  0]
Epoch 7 - avg_train_loss: 1.3455  lr: 9.5441e-04  time: 50s
Epoch 7 - Score: 0.5954
Epoch 7 - Save Best Score: 0.5954 Model


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [8][0/32] Data 0.133 (0.133) Elapsed 0m 1s (remain 0m 44s) Loss: 1.4500(1.4500) Grad Norm: 10.9522  LR: 9.4016e-04  
Epoch: [8][31/32] Data 0.132 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 1.5935(1.0691) Grad Norm: 7.0181  LR: 9.4016e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  2  3  7  6]
preds: [12  2  3  4  0]
Epoch 8 - avg_train_loss: 1.0691  lr: 9.4016e-04  time: 49s
Epoch 8 - Score: 0.5573


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [9][0/32] Data 0.124 (0.124) Elapsed 0m 1s (remain 0m 43s) Loss: 1.1199(1.1199) Grad Norm: 5.2795  LR: 9.2418e-04  
Epoch: [9][31/32] Data 0.125 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 1.5135(1.1760) Grad Norm: 8.4233  LR: 9.2418e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  2  3  7  6]
preds: [11  2  3  4  6]
Epoch 9 - avg_train_loss: 1.1760  lr: 9.2418e-04  time: 50s
Epoch 9 - Score: 0.5496


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [10][0/32] Data 0.128 (0.128) Elapsed 0m 1s (remain 0m 44s) Loss: 1.5563(1.5563) Grad Norm: 10.3591  LR: 9.0654e-04  
Epoch: [10][31/32] Data 0.130 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.3397(1.0394) Grad Norm: 9.1635  LR: 9.0654e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  2  3  7  6]
preds: [11  2  3  4  1]
Epoch 10 - avg_train_loss: 1.0394  lr: 9.0654e-04  time: 50s
Epoch 10 - Score: 0.5573


EVAL: [8/9] Data 0.015 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [11][0/32] Data 0.133 (0.133) Elapsed 0m 1s (remain 0m 44s) Loss: 1.3120(1.3120) Grad Norm: 7.0437  LR: 8.8730e-04  
Epoch: [11][31/32] Data 0.108 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 1.5485(1.0585) Grad Norm: 7.4335  LR: 8.8730e-04  
EVAL: [0/9] Data 0.075 (0.075) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  2  3  7  6]
preds: [11  2  3  0  0]
Epoch 11 - avg_train_loss: 1.0585  lr: 8.8730e-04  time: 50s
Epoch 11 - Score: 0.5725


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [12][0/32] Data 0.129 (0.129) Elapsed 0m 1s (remain 0m 43s) Loss: 0.6314(0.6314) Grad Norm: 9.8587  LR: 8.6655e-04  
Epoch: [12][31/32] Data 0.131 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.4207(0.9345) Grad Norm: 10.2361  LR: 8.6655e-04  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  2  3  7  6]
preds: [12  2  3  4  7]
Epoch 12 - avg_train_loss: 0.9345  lr: 8.6655e-04  time: 50s
Epoch 12 - Score: 0.6183
Epoch 12 - Save Best Score: 0.6183 Model


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [13][0/32] Data 0.130 (0.130) Elapsed 0m 1s (remain 0m 44s) Loss: 1.4907(1.4907) Grad Norm: 7.8152  LR: 8.4436e-04  
Epoch: [13][31/32] Data 0.132 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 1.8120(0.9240) Grad Norm: 11.0146  LR: 8.4436e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  2  3  7  6]
preds: [12  2  3  3  6]
Epoch 13 - avg_train_loss: 0.9240  lr: 8.4436e-04  time: 50s
Epoch 13 - Score: 0.5649


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [14][0/32] Data 0.130 (0.130) Elapsed 0m 1s (remain 0m 43s) Loss: 0.1806(0.1806) Grad Norm: 4.0478  LR: 8.2081e-04  
Epoch: [14][31/32] Data 0.118 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 2.0258(0.7781) Grad Norm: 21.7433  LR: 8.2081e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  2  3  7  6]
preds: [11  2  3  4  4]
Epoch 14 - avg_train_loss: 0.7781  lr: 8.2081e-04  time: 49s
Epoch 14 - Score: 0.4962


EVAL: [8/9] Data 0.013 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [15][0/32] Data 0.119 (0.119) Elapsed 0m 1s (remain 0m 44s) Loss: 0.7511(0.7511) Grad Norm: 15.0999  LR: 7.9601e-04  
Epoch: [15][31/32] Data 0.125 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 1.2973(0.8733) Grad Norm: 9.2006  LR: 7.9601e-04  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  2  3  7  6]
preds: [11  2  3  3 11]
Epoch 15 - avg_train_loss: 0.8733  lr: 7.9601e-04  time: 49s
Epoch 15 - Score: 0.6412
Epoch 15 - Save Best Score: 0.6412 Model


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [16][0/32] Data 0.128 (0.128) Elapsed 0m 1s (remain 0m 44s) Loss: 1.4499(1.4499) Grad Norm: 8.0993  LR: 7.7006e-04  
Epoch: [16][31/32] Data 0.136 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.3342(0.7488) Grad Norm: 11.0828  LR: 7.7006e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  2  3  7  6]
preds: [5 2 3 4 0]
Epoch 16 - avg_train_loss: 0.7488  lr: 7.7006e-04  time: 49s
Epoch 16 - Score: 0.5649


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [17][0/32] Data 0.136 (0.136) Elapsed 0m 1s (remain 0m 44s) Loss: 0.1982(0.1982) Grad Norm: 6.2105  LR: 7.4304e-04  
Epoch: [17][31/32] Data 0.118 (0.120) Elapsed 0m 45s (remain 0m 0s) Loss: 0.2107(0.7134) Grad Norm: 14.0846  LR: 7.4304e-04  
EVAL: [0/9] Data 0.075 (0.075) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  2  3  7  6]
preds: [11  2  3  4  0]
Epoch 17 - avg_train_loss: 0.7134  lr: 7.4304e-04  time: 49s
Epoch 17 - Score: 0.6031


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [18][0/32] Data 0.134 (0.134) Elapsed 0m 1s (remain 0m 44s) Loss: 0.2081(0.2081) Grad Norm: 5.8818  LR: 7.1508e-04  
Epoch: [18][31/32] Data 0.112 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 1.2056(0.8362) Grad Norm: 4.9306  LR: 7.1508e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  2  3  7  6]
preds: [11  2  3  4  1]
Epoch 18 - avg_train_loss: 0.8362  lr: 7.1508e-04  time: 49s
Epoch 18 - Score: 0.5954


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [19][0/32] Data 0.133 (0.133) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0995(0.0995) Grad Norm: 6.9266  LR: 6.8627e-04  
Epoch: [19][31/32] Data 0.120 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.3150(0.7115) Grad Norm: 7.7733  LR: 6.8627e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  2  3  7  6]
preds: [12  2  4  4  1]
Epoch 19 - avg_train_loss: 0.7115  lr: 6.8627e-04  time: 49s
Epoch 19 - Score: 0.5115


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [20][0/32] Data 0.130 (0.130) Elapsed 0m 1s (remain 0m 44s) Loss: 0.6350(0.6350) Grad Norm: 14.5456  LR: 6.5674e-04  
Epoch: [20][31/32] Data 0.124 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 1.2396(0.7057) Grad Norm: 5.4095  LR: 6.5674e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  2  3  7  6]
preds: [11  2  4  4  3]
Epoch 20 - avg_train_loss: 0.7057  lr: 6.5674e-04  time: 49s
Epoch 20 - Score: 0.6489
Epoch 20 - Save Best Score: 0.6489 Model


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [21][0/32] Data 0.136 (0.136) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0793(0.0793) Grad Norm: 5.1433  LR: 6.2661e-04  
Epoch: [21][31/32] Data 0.116 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 1.1366(0.7699) Grad Norm: 4.8699  LR: 6.2661e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  2  3  7  6]
preds: [0 2 3 7 1]
Epoch 21 - avg_train_loss: 0.7699  lr: 6.2661e-04  time: 50s
Epoch 21 - Score: 0.5954


EVAL: [8/9] Data 0.013 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [22][0/32] Data 0.129 (0.129) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0384(0.0384) Grad Norm: 2.4750  LR: 5.9598e-04  
Epoch: [22][31/32] Data 0.129 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0588(0.7640) Grad Norm: 3.2258  LR: 5.9598e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  2  3  7  6]
preds: [11  2  3  4  1]
Epoch 22 - avg_train_loss: 0.7640  lr: 5.9598e-04  time: 49s
Epoch 22 - Score: 0.6336


EVAL: [8/9] Data 0.013 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [23][0/32] Data 0.131 (0.131) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0243(0.0243) Grad Norm: 1.3230  LR: 5.6498e-04  
Epoch: [23][31/32] Data 0.112 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0039(0.6348) Grad Norm: 0.2459  LR: 5.6498e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  2  3  7  6]
preds: [11  2  3  7  1]
Epoch 23 - avg_train_loss: 0.6348  lr: 5.6498e-04  time: 49s
Epoch 23 - Score: 0.5878


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [24][0/32] Data 0.152 (0.152) Elapsed 0m 1s (remain 0m 44s) Loss: 1.0487(1.0487) Grad Norm: 6.4008  LR: 5.3373e-04  
Epoch: [24][31/32] Data 0.128 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 1.3745(0.6995) Grad Norm: 7.4613  LR: 5.3373e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  2  3  7  6]
preds: [11  2  3  4  3]
Epoch 24 - avg_train_loss: 0.6995  lr: 5.3373e-04  time: 49s
Epoch 24 - Score: 0.6107


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [25][0/32] Data 0.144 (0.144) Elapsed 0m 1s (remain 0m 44s) Loss: 1.1539(1.1539) Grad Norm: 5.7762  LR: 5.0236e-04  
Epoch: [25][31/32] Data 0.125 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 1.0482(0.5996) Grad Norm: 5.9704  LR: 5.0236e-04  
EVAL: [0/9] Data 0.076 (0.076) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  2  3  7  6]
preds: [11  2  3  4  4]
Epoch 25 - avg_train_loss: 0.5996  lr: 5.0236e-04  time: 50s
Epoch 25 - Score: 0.6336


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [26][0/32] Data 0.135 (0.135) Elapsed 0m 1s (remain 0m 44s) Loss: 1.1128(1.1128) Grad Norm: 5.7871  LR: 4.7099e-04  
Epoch: [26][31/32] Data 0.120 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0415(0.4359) Grad Norm: 2.9375  LR: 4.7099e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  2  3  7  6]
preds: [11  2  3  4  1]
Epoch 26 - avg_train_loss: 0.4359  lr: 4.7099e-04  time: 49s
Epoch 26 - Score: 0.6183


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [27][0/32] Data 0.135 (0.135) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0031(0.0031) Grad Norm: 0.1470  LR: 4.3974e-04  
Epoch: [27][31/32] Data 0.111 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0195(0.4776) Grad Norm: 1.7830  LR: 4.3974e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  2  3  7  6]
preds: [9 2 1 4 9]
Epoch 27 - avg_train_loss: 0.4776  lr: 4.3974e-04  time: 49s
Epoch 27 - Score: 0.6336


EVAL: [8/9] Data 0.013 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [28][0/32] Data 0.138 (0.138) Elapsed 0m 1s (remain 0m 44s) Loss: 1.1730(1.1730) Grad Norm: 9.0838  LR: 4.0874e-04  
Epoch: [28][31/32] Data 0.129 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.2133(0.5138) Grad Norm: 16.9482  LR: 4.0874e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  2  3  7  6]
preds: [11  2  3  7  6]
Epoch 28 - avg_train_loss: 0.5138  lr: 4.0874e-04  time: 49s
Epoch 28 - Score: 0.6489


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [29][0/32] Data 0.135 (0.135) Elapsed 0m 1s (remain 0m 43s) Loss: 0.2886(0.2886) Grad Norm: 11.3853  LR: 3.7811e-04  
Epoch: [29][31/32] Data 0.122 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 1.2112(0.3884) Grad Norm: 10.3893  LR: 3.7811e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  2  3  7  6]
preds: [11  2  3  4 11]
Epoch 29 - avg_train_loss: 0.3884  lr: 3.7811e-04  time: 49s
Epoch 29 - Score: 0.5725


EVAL: [8/9] Data 0.014 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [30][0/32] Data 0.120 (0.120) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0546(0.0546) Grad Norm: 3.2296  LR: 3.4797e-04  
Epoch: [30][31/32] Data 0.136 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.8690(0.4356) Grad Norm: 10.9486  LR: 3.4797e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  2  3  7  6]
preds: [11  2  3  7  9]
Epoch 30 - avg_train_loss: 0.4356  lr: 3.4797e-04  time: 49s
Epoch 30 - Score: 0.6718
Epoch 30 - Save Best Score: 0.6718 Model


EVAL: [8/9] Data 0.014 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [31][0/32] Data 0.159 (0.159) Elapsed 0m 1s (remain 0m 45s) Loss: 0.0044(0.0044) Grad Norm: 0.1534  LR: 3.1843e-04  
Epoch: [31][31/32] Data 0.126 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0080(0.5828) Grad Norm: 0.9665  LR: 3.1843e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  2  3  7  6]
preds: [11  2  3  7 11]
Epoch 31 - avg_train_loss: 0.5828  lr: 3.1843e-04  time: 49s
Epoch 31 - Score: 0.6718


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [32][0/32] Data 0.134 (0.134) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0036(0.0036) Grad Norm: 0.1583  LR: 2.8962e-04  
Epoch: [32][31/32] Data 0.129 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.9406(0.4891) Grad Norm: 4.4705  LR: 2.8962e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  2  3  7  6]
preds: [12  2  3  4  4]
Epoch 32 - avg_train_loss: 0.4891  lr: 2.8962e-04  time: 49s
Epoch 32 - Score: 0.5878


EVAL: [8/9] Data 0.013 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [33][0/32] Data 0.127 (0.127) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0567(0.0567) Grad Norm: 3.4424  LR: 2.6165e-04  
Epoch: [33][31/32] Data 0.121 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 1.0547(0.6114) Grad Norm: 6.3891  LR: 2.6165e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  2  3  7  6]
preds: [11  2  3  4 11]
Epoch 33 - avg_train_loss: 0.6114  lr: 2.6165e-04  time: 49s
Epoch 33 - Score: 0.6336


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [34][0/32] Data 0.135 (0.135) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0262(0.0262) Grad Norm: 2.1276  LR: 2.3463e-04  
Epoch: [34][31/32] Data 0.121 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0060(0.3916) Grad Norm: 0.5275  LR: 2.3463e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  2  3  7  6]
preds: [11  2  3  4  6]
Epoch 34 - avg_train_loss: 0.3916  lr: 2.3463e-04  time: 49s
Epoch 34 - Score: 0.6489


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [35][0/32] Data 0.133 (0.133) Elapsed 0m 1s (remain 0m 43s) Loss: 0.7791(0.7791) Grad Norm: 3.1259  LR: 2.0866e-04  
Epoch: [35][31/32] Data 0.121 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0054(0.4281) Grad Norm: 0.3151  LR: 2.0866e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  2  3  7  6]
preds: [11  2  3  7  9]
Epoch 35 - avg_train_loss: 0.4281  lr: 2.0866e-04  time: 49s
Epoch 35 - Score: 0.6947
Epoch 35 - Save Best Score: 0.6947 Model


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [36][0/32] Data 0.160 (0.160) Elapsed 0m 1s (remain 0m 45s) Loss: 0.9700(0.9700) Grad Norm: 9.0166  LR: 1.8385e-04  
Epoch: [36][31/32] Data 0.131 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0178(0.4440) Grad Norm: 1.4454  LR: 1.8385e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  2  3  7  6]
preds: [11  2  3  7  7]
Epoch 36 - avg_train_loss: 0.4440  lr: 1.8385e-04  time: 49s
Epoch 36 - Score: 0.6870


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [37][0/32] Data 0.137 (0.137) Elapsed 0m 1s (remain 0m 44s) Loss: 0.6629(0.6629) Grad Norm: 3.8357  LR: 1.6030e-04  
Epoch: [37][31/32] Data 0.121 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0031(0.4369) Grad Norm: 0.2976  LR: 1.6030e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  2  3  7  6]
preds: [11  2  3  4 11]
Epoch 37 - avg_train_loss: 0.4369  lr: 1.6030e-04  time: 49s
Epoch 37 - Score: 0.6336


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [38][0/32] Data 0.137 (0.137) Elapsed 0m 1s (remain 0m 44s) Loss: 0.6486(0.6486) Grad Norm: 2.6924  LR: 1.3809e-04  
Epoch: [38][31/32] Data 0.130 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0009(0.4257) Grad Norm: 0.0234  LR: 1.3809e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  2  3  7  6]
preds: [11  2  3  4  7]
Epoch 38 - avg_train_loss: 0.4257  lr: 1.3809e-04  time: 49s
Epoch 38 - Score: 0.6565


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [39][0/32] Data 0.109 (0.109) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0010(0.0010) Grad Norm: 0.0338  LR: 1.1732e-04  
Epoch: [39][31/32] Data 0.126 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 0.7447(0.4249) Grad Norm: 3.8168  LR: 1.1732e-04  
EVAL: [0/9] Data 0.076 (0.076) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  2  3  7  6]
preds: [11  2  3  4 11]
Epoch 39 - avg_train_loss: 0.4249  lr: 1.1732e-04  time: 49s
Epoch 39 - Score: 0.6489


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [40][0/32] Data 0.134 (0.134) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0017(0.0017) Grad Norm: 0.1552  LR: 9.8058e-05  
Epoch: [40][31/32] Data 0.132 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0013(0.3870) Grad Norm: 0.2743  LR: 9.8058e-05  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  2  3  7  6]
preds: [11  2  3  4  9]
Epoch 40 - avg_train_loss: 0.3870  lr: 9.8058e-05  time: 49s
Epoch 40 - Score: 0.6718


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [41][0/32] Data 0.148 (0.148) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0011(0.0011) Grad Norm: 0.0321  LR: 8.0390e-05  
Epoch: [41][31/32] Data 0.121 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0015(0.4352) Grad Norm: 0.0871  LR: 8.0390e-05  
EVAL: [0/9] Data 0.075 (0.075) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  2  3  7  6]
preds: [11  2  3  4  9]
Epoch 41 - avg_train_loss: 0.4352  lr: 8.0390e-05  time: 49s
Epoch 41 - Score: 0.6718


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [42][0/32] Data 0.150 (0.150) Elapsed 0m 1s (remain 0m 44s) Loss: 0.8271(0.8271) Grad Norm: 5.3360  LR: 6.4381e-05  
Epoch: [42][31/32] Data 0.119 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.8198(0.5017) Grad Norm: 5.5085  LR: 6.4381e-05  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  2  3  7  6]
preds: [11  2  3  4  7]
Epoch 42 - avg_train_loss: 0.5017  lr: 6.4381e-05  time: 49s
Epoch 42 - Score: 0.6718


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [43][0/32] Data 0.138 (0.138) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0019(0.0019) Grad Norm: 0.0941  LR: 5.0093e-05  
Epoch: [43][31/32] Data 0.121 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0006(0.1980) Grad Norm: 0.0204  LR: 5.0093e-05  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  2  3  7  6]
preds: [11  2  3  4 11]
Epoch 43 - avg_train_loss: 0.1980  lr: 5.0093e-05  time: 49s
Epoch 43 - Score: 0.6641


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [44][0/32] Data 0.145 (0.145) Elapsed 0m 1s (remain 0m 44s) Loss: 0.7823(0.7823) Grad Norm: 5.0572  LR: 3.7578e-05  
Epoch: [44][31/32] Data 0.121 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0016(0.4532) Grad Norm: 0.1855  LR: 3.7578e-05  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  2  3  7  6]
preds: [11  2  3  4 11]
Epoch 44 - avg_train_loss: 0.4532  lr: 3.7578e-05  time: 49s
Epoch 44 - Score: 0.6870


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [45][0/32] Data 0.125 (0.125) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0005(0.0005) Grad Norm: 0.0170  LR: 2.6881e-05  
Epoch: [45][31/32] Data 0.113 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0003(0.3231) Grad Norm: 0.0101  LR: 2.6881e-05  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  2  3  7  6]
preds: [11  2  3  4 11]
Epoch 45 - avg_train_loss: 0.3231  lr: 2.6881e-05  time: 49s
Epoch 45 - Score: 0.6718


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [46][0/32] Data 0.143 (0.143) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0022(0.0022) Grad Norm: 0.1208  LR: 1.8039e-05  
Epoch: [46][31/32] Data 0.120 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0004(0.0060) Grad Norm: 0.0353  LR: 1.8039e-05  
EVAL: [0/9] Data 0.075 (0.075) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  2  3  7  6]
preds: [11  2  3  4 11]
Epoch 46 - avg_train_loss: 0.0060  lr: 1.8039e-05  time: 49s
Epoch 46 - Score: 0.6641


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [47][0/32] Data 0.147 (0.147) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0007(0.0007) Grad Norm: 0.0286  LR: 1.1073e-05  
Epoch: [47][31/32] Data 0.118 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0004(0.0026) Grad Norm: 0.0212  LR: 1.1073e-05  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  2  3  7  6]
preds: [11  2  3  4 11]
Epoch 47 - avg_train_loss: 0.0026  lr: 1.1073e-05  time: 49s
Epoch 47 - Score: 0.6565


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [48][0/32] Data 0.142 (0.142) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0003(0.0003) Grad Norm: 0.0141  LR: 5.9882e-06  
Epoch: [48][31/32] Data 0.123 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0004(0.0020) Grad Norm: 0.0224  LR: 5.9882e-06  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  2  3  7  6]
preds: [11  2  3  4 11]
Epoch 48 - avg_train_loss: 0.0020  lr: 5.9882e-06  time: 49s
Epoch 48 - Score: 0.6565


EVAL: [8/9] Data 0.014 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [49][0/32] Data 0.134 (0.134) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0003(0.0003) Grad Norm: 0.0384  LR: 2.7534e-06  
Epoch: [49][31/32] Data 0.121 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0047(0.0021) Grad Norm: 0.3163  LR: 2.7534e-06  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  2  3  7  6]
preds: [11  2  3  4 11]
Epoch 49 - avg_train_loss: 0.0021  lr: 2.7534e-06  time: 49s
Epoch 49 - Score: 0.6565


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [50][0/32] Data 0.134 (0.134) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0007(0.0007) Grad Norm: 0.0372  LR: 1.2467e-06  
Epoch: [50][31/32] Data 0.113 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0007(0.0017) Grad Norm: 0.0467  LR: 1.2467e-06  
EVAL: [0/9] Data 0.077 (0.077) Elapsed 0m 0s (remain 0m 4s) 


labels: [11  2  3  7  6]
preds: [11  2  3  4 11]
Epoch 50 - avg_train_loss: 0.0017  lr: 1.2467e-06  time: 49s
Epoch 50 - Score: 0.6565


EVAL: [8/9] Data 0.015 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [1][0/32] Data 0.139 (0.139) Elapsed 0m 1s (remain 0m 43s) Loss: 2.7199(2.7199) Grad Norm: nan  LR: 1.0000e-03  
Epoch: [1][31/32] Data 0.134 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 2.6432(2.5217) Grad Norm: 11.3302  LR: 1.0000e-03  
EVAL: [0/9] Data 0.075 (0.075) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2 11 10  8]
preds: [2 5 5 2 5]
Epoch 1 - avg_train_loss: 2.5217  lr: 1.0000e-03  time: 50s
Epoch 1 - Score: 0.2290
Epoch 1 - Save Best Score: 0.2290 Model


EVAL: [8/9] Data 0.014 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [2][0/32] Data 0.129 (0.129) Elapsed 0m 1s (remain 0m 44s) Loss: 2.7625(2.7625) Grad Norm: 14.0415  LR: 9.9803e-04  
Epoch: [2][31/32] Data 0.129 (0.127) Elapsed 0m 45s (remain 0m 0s) Loss: 2.6121(2.2591) Grad Norm: 13.5557  LR: 9.9803e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2 11 10  8]
preds: [10  5 11  2  8]
Epoch 2 - avg_train_loss: 2.2591  lr: 9.9803e-04  time: 50s
Epoch 2 - Score: 0.4122
Epoch 2 - Save Best Score: 0.4122 Model


EVAL: [8/9] Data 0.014 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [3][0/32] Data 0.136 (0.136) Elapsed 0m 1s (remain 0m 44s) Loss: 1.5962(1.5962) Grad Norm: 11.8800  LR: 9.9312e-04  
Epoch: [3][31/32] Data 0.123 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 1.4346(1.9166) Grad Norm: 12.7361  LR: 9.9312e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2 11 10  8]
preds: [ 4  5 10  4  8]
Epoch 3 - avg_train_loss: 1.9166  lr: 9.9312e-04  time: 50s
Epoch 3 - Score: 0.4885
Epoch 3 - Save Best Score: 0.4885 Model


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [4][0/32] Data 0.129 (0.129) Elapsed 0m 1s (remain 0m 44s) Loss: 2.0623(2.0623) Grad Norm: 21.0005  LR: 9.8627e-04  
Epoch: [4][31/32] Data 0.112 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 2.3192(1.7518) Grad Norm: 10.5329  LR: 9.8627e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2 11 10  8]
preds: [ 1  5 11  2  8]
Epoch 4 - avg_train_loss: 1.7518  lr: 9.8627e-04  time: 50s
Epoch 4 - Score: 0.4046


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [5][0/32] Data 0.138 (0.138) Elapsed 0m 1s (remain 0m 44s) Loss: 1.3330(1.3330) Grad Norm: 17.1956  LR: 9.7751e-04  
Epoch: [5][31/32] Data 0.123 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 1.1204(1.4840) Grad Norm: 10.7378  LR: 9.7751e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2 11 10  8]
preds: [10  5 10  2  8]
Epoch 5 - avg_train_loss: 1.4840  lr: 9.7751e-04  time: 50s
Epoch 5 - Score: 0.4885


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [6][0/32] Data 0.145 (0.145) Elapsed 0m 1s (remain 0m 44s) Loss: 0.5929(0.5929) Grad Norm: 8.6752  LR: 9.6688e-04  
Epoch: [6][31/32] Data 0.117 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.7346(1.3675) Grad Norm: 9.9057  LR: 9.6688e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2 11 10  8]
preds: [11 11 11  2  8]
Epoch 6 - avg_train_loss: 1.3675  lr: 9.6688e-04  time: 50s
Epoch 6 - Score: 0.5191
Epoch 6 - Save Best Score: 0.5191 Model


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [7][0/32] Data 0.135 (0.135) Elapsed 0m 1s (remain 0m 44s) Loss: 1.5397(1.5397) Grad Norm: 9.6764  LR: 9.5441e-04  
Epoch: [7][31/32] Data 0.113 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.9166(1.3243) Grad Norm: 12.9924  LR: 9.5441e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2 11 10  8]
preds: [ 1  5 11  2  8]
Epoch 7 - avg_train_loss: 1.3243  lr: 9.5441e-04  time: 50s
Epoch 7 - Score: 0.5725
Epoch 7 - Save Best Score: 0.5725 Model


EVAL: [8/9] Data 0.013 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [8][0/32] Data 0.138 (0.138) Elapsed 0m 1s (remain 0m 44s) Loss: 0.5011(0.5011) Grad Norm: 8.9713  LR: 9.4016e-04  
Epoch: [8][31/32] Data 0.117 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.7442(1.1551) Grad Norm: 10.7460  LR: 9.4016e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2 11 10  8]
preds: [ 1  5 10  1  8]
Epoch 8 - avg_train_loss: 1.1551  lr: 9.4016e-04  time: 50s
Epoch 8 - Score: 0.5496


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [9][0/32] Data 0.135 (0.135) Elapsed 0m 1s (remain 0m 44s) Loss: 1.3234(1.3234) Grad Norm: 8.3744  LR: 9.2418e-04  
Epoch: [9][31/32] Data 0.132 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 1.9995(1.0963) Grad Norm: 11.5937  LR: 9.2418e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2 11 10  8]
preds: [ 1  9 10  1  8]
Epoch 9 - avg_train_loss: 1.0963  lr: 9.2418e-04  time: 50s
Epoch 9 - Score: 0.5038


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [10][0/32] Data 0.139 (0.139) Elapsed 0m 1s (remain 0m 44s) Loss: 1.5862(1.5862) Grad Norm: 6.9593  LR: 9.0654e-04  
Epoch: [10][31/32] Data 0.125 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.6608(1.0434) Grad Norm: 13.7150  LR: 9.0654e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2 11 10  8]
preds: [ 5  5 11  5  8]
Epoch 10 - avg_train_loss: 1.0434  lr: 9.0654e-04  time: 50s
Epoch 10 - Score: 0.4962


EVAL: [8/9] Data 0.015 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [11][0/32] Data 0.152 (0.152) Elapsed 0m 1s (remain 0m 44s) Loss: 1.4150(1.4150) Grad Norm: 13.4510  LR: 8.8730e-04  
Epoch: [11][31/32] Data 0.102 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.2820(0.8904) Grad Norm: 11.5658  LR: 8.8730e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2 11 10  8]
preds: [ 1  9 11  3  8]
Epoch 11 - avg_train_loss: 0.8904  lr: 8.8730e-04  time: 50s
Epoch 11 - Score: 0.5038


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [12][0/32] Data 0.124 (0.124) Elapsed 0m 1s (remain 0m 43s) Loss: 0.4972(0.4972) Grad Norm: 15.5974  LR: 8.6655e-04  
Epoch: [12][31/32] Data 0.120 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.2162(0.9462) Grad Norm: 6.6053  LR: 8.6655e-04  
EVAL: [0/9] Data 0.076 (0.076) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2 11 10  8]
preds: [11  9 11  2  8]
Epoch 12 - avg_train_loss: 0.9462  lr: 8.6655e-04  time: 49s
Epoch 12 - Score: 0.5496


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [13][0/32] Data 0.135 (0.135) Elapsed 0m 1s (remain 0m 44s) Loss: 1.3725(1.3725) Grad Norm: 6.0310  LR: 8.4436e-04  
Epoch: [13][31/32] Data 0.124 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.9238(0.8950) Grad Norm: 8.5095  LR: 8.4436e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2 11 10  8]
preds: [ 2 12 11  2  8]
Epoch 13 - avg_train_loss: 0.8950  lr: 8.4436e-04  time: 49s
Epoch 13 - Score: 0.4885


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [14][0/32] Data 0.142 (0.142) Elapsed 0m 1s (remain 0m 44s) Loss: 0.4170(0.4170) Grad Norm: 9.8615  LR: 8.2081e-04  
Epoch: [14][31/32] Data 0.123 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 1.6501(0.7542) Grad Norm: 11.2062  LR: 8.2081e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2 11 10  8]
preds: [2 5 8 2 8]
Epoch 14 - avg_train_loss: 0.7542  lr: 8.2081e-04  time: 50s
Epoch 14 - Score: 0.4656


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [15][0/32] Data 0.137 (0.137) Elapsed 0m 1s (remain 0m 44s) Loss: 0.3484(0.3484) Grad Norm: 8.4930  LR: 7.9601e-04  
Epoch: [15][31/32] Data 0.121 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 1.1895(1.0130) Grad Norm: 4.8678  LR: 7.9601e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2 11 10  8]
preds: [ 5  5 11  3  8]
Epoch 15 - avg_train_loss: 1.0130  lr: 7.9601e-04  time: 49s
Epoch 15 - Score: 0.5496


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [16][0/32] Data 0.139 (0.139) Elapsed 0m 1s (remain 0m 44s) Loss: 1.4401(1.4401) Grad Norm: 29.5959  LR: 7.7006e-04  
Epoch: [16][31/32] Data 0.120 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.2549(0.9439) Grad Norm: 4.7771  LR: 7.7006e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2 11 10  8]
preds: [ 5  5 11  2  8]
Epoch 16 - avg_train_loss: 0.9439  lr: 7.7006e-04  time: 49s
Epoch 16 - Score: 0.5496


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [17][0/32] Data 0.142 (0.142) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0521(0.0521) Grad Norm: 1.4660  LR: 7.4304e-04  
Epoch: [17][31/32] Data 0.132 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 1.4984(0.7872) Grad Norm: 9.1954  LR: 7.4304e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2 11 10  8]
preds: [ 5 11 11  2  8]
Epoch 17 - avg_train_loss: 0.7872  lr: 7.4304e-04  time: 50s
Epoch 17 - Score: 0.5420


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [18][0/32] Data 0.148 (0.148) Elapsed 0m 1s (remain 0m 44s) Loss: 1.3693(1.3693) Grad Norm: 6.6487  LR: 7.1508e-04  
Epoch: [18][31/32] Data 0.126 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 1.3407(0.7210) Grad Norm: 7.8021  LR: 7.1508e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2 11 10  8]
preds: [5 5 6 2 8]
Epoch 18 - avg_train_loss: 0.7210  lr: 7.1508e-04  time: 50s
Epoch 18 - Score: 0.4733


EVAL: [8/9] Data 0.013 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [19][0/32] Data 0.131 (0.131) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0254(0.0254) Grad Norm: 1.7247  LR: 6.8627e-04  
Epoch: [19][31/32] Data 0.129 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.2732(0.7437) Grad Norm: 6.0650  LR: 6.8627e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2 11 10  8]
preds: [5 5 6 2 8]
Epoch 19 - avg_train_loss: 0.7437  lr: 6.8627e-04  time: 50s
Epoch 19 - Score: 0.5344


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [20][0/32] Data 0.129 (0.129) Elapsed 0m 1s (remain 0m 44s) Loss: 1.1905(1.1905) Grad Norm: 5.5728  LR: 6.5674e-04  
Epoch: [20][31/32] Data 0.134 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 1.2173(0.6865) Grad Norm: 5.1758  LR: 6.5674e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2 11 10  8]
preds: [ 5  5 11  3  8]
Epoch 20 - avg_train_loss: 0.6865  lr: 6.5674e-04  time: 49s
Epoch 20 - Score: 0.5573


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [21][0/32] Data 0.134 (0.134) Elapsed 0m 1s (remain 0m 44s) Loss: 1.2814(1.2814) Grad Norm: 6.0704  LR: 6.2661e-04  
Epoch: [21][31/32] Data 0.126 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.1522(0.5389) Grad Norm: 9.0680  LR: 6.2661e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2 11 10  8]
preds: [ 0  5 11  2  8]
Epoch 21 - avg_train_loss: 0.5389  lr: 6.2661e-04  time: 49s
Epoch 21 - Score: 0.5267


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [22][0/32] Data 0.138 (0.138) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0685(0.0685) Grad Norm: 6.1323  LR: 5.9598e-04  
Epoch: [22][31/32] Data 0.133 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.5040(0.7966) Grad Norm: 15.6204  LR: 5.9598e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2 11 10  8]
preds: [11  5 11  3  8]
Epoch 22 - avg_train_loss: 0.7966  lr: 5.9598e-04  time: 50s
Epoch 22 - Score: 0.5573


EVAL: [8/9] Data 0.015 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [23][0/32] Data 0.129 (0.129) Elapsed 0m 1s (remain 0m 43s) Loss: 1.1185(1.1185) Grad Norm: 4.0440  LR: 5.6498e-04  
Epoch: [23][31/32] Data 0.118 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 0.9084(0.7049) Grad Norm: 5.5400  LR: 5.6498e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2 11 10  8]
preds: [ 5  5 11  3  8]
Epoch 23 - avg_train_loss: 0.7049  lr: 5.6498e-04  time: 49s
Epoch 23 - Score: 0.5496


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [24][0/32] Data 0.137 (0.137) Elapsed 0m 1s (remain 0m 44s) Loss: 0.1855(0.1855) Grad Norm: 5.1606  LR: 5.3373e-04  
Epoch: [24][31/32] Data 0.129 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0513(0.4974) Grad Norm: 2.4053  LR: 5.3373e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2 11 10  8]
preds: [11  5 11  3  8]
Epoch 24 - avg_train_loss: 0.4974  lr: 5.3373e-04  time: 49s
Epoch 24 - Score: 0.4962


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [25][0/32] Data 0.129 (0.129) Elapsed 0m 1s (remain 0m 44s) Loss: 0.2132(0.2132) Grad Norm: 10.3650  LR: 5.0236e-04  
Epoch: [25][31/32] Data 0.117 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0699(0.4689) Grad Norm: 2.4613  LR: 5.0236e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2 11 10  8]
preds: [ 5  5 11  3  8]
Epoch 25 - avg_train_loss: 0.4689  lr: 5.0236e-04  time: 49s
Epoch 25 - Score: 0.5191


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [26][0/32] Data 0.137 (0.137) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0022(0.0022) Grad Norm: 0.0666  LR: 4.7099e-04  
Epoch: [26][31/32] Data 0.131 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0041(0.5177) Grad Norm: 0.2446  LR: 4.7099e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2 11 10  8]
preds: [ 5  5 11  3  0]
Epoch 26 - avg_train_loss: 0.5177  lr: 4.7099e-04  time: 49s
Epoch 26 - Score: 0.5191


EVAL: [8/9] Data 0.013 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [27][0/32] Data 0.143 (0.143) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0216(0.0216) Grad Norm: 0.9433  LR: 4.3974e-04  
Epoch: [27][31/32] Data 0.125 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 1.1909(0.6225) Grad Norm: 11.5755  LR: 4.3974e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2 11 10  8]
preds: [ 5  5 11  3  8]
Epoch 27 - avg_train_loss: 0.6225  lr: 4.3974e-04  time: 50s
Epoch 27 - Score: 0.5115


EVAL: [8/9] Data 0.015 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [28][0/32] Data 0.137 (0.137) Elapsed 0m 1s (remain 0m 44s) Loss: 1.3326(1.3326) Grad Norm: 10.2656  LR: 4.0874e-04  
Epoch: [28][31/32] Data 0.123 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0191(0.5581) Grad Norm: 1.1633  LR: 4.0874e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2 11 10  8]
preds: [ 3  5 11  3  8]
Epoch 28 - avg_train_loss: 0.5581  lr: 4.0874e-04  time: 49s
Epoch 28 - Score: 0.5267


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [29][0/32] Data 0.149 (0.149) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0540(0.0540) Grad Norm: 2.6883  LR: 3.7811e-04  
Epoch: [29][31/32] Data 0.119 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 1.2654(0.6147) Grad Norm: 7.0554  LR: 3.7811e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2 11 10  8]
preds: [11  5 11  2  8]
Epoch 29 - avg_train_loss: 0.6147  lr: 3.7811e-04  time: 50s
Epoch 29 - Score: 0.5115


EVAL: [8/9] Data 0.015 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [30][0/32] Data 0.134 (0.134) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0069(0.0069) Grad Norm: 0.4197  LR: 3.4797e-04  
Epoch: [30][31/32] Data 0.125 (0.128) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0159(0.5427) Grad Norm: 1.5720  LR: 3.4797e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2 11 10  8]
preds: [11  5 11  3  8]
Epoch 30 - avg_train_loss: 0.5427  lr: 3.4797e-04  time: 50s
Epoch 30 - Score: 0.5115


EVAL: [8/9] Data 0.015 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [31][0/32] Data 0.146 (0.146) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0010(0.0010) Grad Norm: 0.0352  LR: 3.1843e-04  
Epoch: [31][31/32] Data 0.127 (0.127) Elapsed 0m 45s (remain 0m 0s) Loss: 1.1185(0.3658) Grad Norm: 5.9876  LR: 3.1843e-04  
EVAL: [0/9] Data 0.076 (0.076) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2 11 10  8]
preds: [ 5  5 11  5  8]
Epoch 31 - avg_train_loss: 0.3658  lr: 3.1843e-04  time: 50s
Epoch 31 - Score: 0.5496


EVAL: [8/9] Data 0.014 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [32][0/32] Data 0.141 (0.141) Elapsed 0m 1s (remain 0m 44s) Loss: 0.1481(0.1481) Grad Norm: 10.1610  LR: 2.8962e-04  
Epoch: [32][31/32] Data 0.134 (0.127) Elapsed 0m 45s (remain 0m 0s) Loss: 1.0904(0.5592) Grad Norm: 7.1968  LR: 2.8962e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2 11 10  8]
preds: [ 5  5 11  3  8]
Epoch 32 - avg_train_loss: 0.5592  lr: 2.8962e-04  time: 50s
Epoch 32 - Score: 0.5725


EVAL: [8/9] Data 0.016 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [33][0/32] Data 0.147 (0.147) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0311(0.0311) Grad Norm: 2.6700  LR: 2.6165e-04  
Epoch: [33][31/32] Data 0.114 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 0.8599(0.3746) Grad Norm: 4.4034  LR: 2.6165e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2 11 10  8]
preds: [ 5  5 11  2  8]
Epoch 33 - avg_train_loss: 0.3746  lr: 2.6165e-04  time: 50s
Epoch 33 - Score: 0.5649


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [34][0/32] Data 0.129 (0.129) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0025(0.0025) Grad Norm: 0.1802  LR: 2.3463e-04  
Epoch: [34][31/32] Data 0.127 (0.128) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0018(0.4635) Grad Norm: 0.1254  LR: 2.3463e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2 11 10  8]
preds: [ 5  5 11  2  8]
Epoch 34 - avg_train_loss: 0.4635  lr: 2.3463e-04  time: 50s
Epoch 34 - Score: 0.5725


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [35][0/32] Data 0.136 (0.136) Elapsed 0m 1s (remain 0m 44s) Loss: 1.1014(1.1014) Grad Norm: 8.2803  LR: 2.0866e-04  
Epoch: [35][31/32] Data 0.116 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.9677(0.5458) Grad Norm: 4.4905  LR: 2.0866e-04  
EVAL: [0/9] Data 0.075 (0.075) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2 11 10  8]
preds: [ 5  5 11  2  8]
Epoch 35 - avg_train_loss: 0.5458  lr: 2.0866e-04  time: 49s
Epoch 35 - Score: 0.5649


EVAL: [8/9] Data 0.013 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [36][0/32] Data 0.145 (0.145) Elapsed 0m 1s (remain 0m 44s) Loss: 0.9076(0.9076) Grad Norm: 4.1788  LR: 1.8385e-04  
Epoch: [36][31/32] Data 0.125 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0023(0.3576) Grad Norm: 0.0821  LR: 1.8385e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2 11 10  8]
preds: [ 5  5 11  2  8]
Epoch 36 - avg_train_loss: 0.3576  lr: 1.8385e-04  time: 49s
Epoch 36 - Score: 0.5649


EVAL: [8/9] Data 0.015 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [37][0/32] Data 0.139 (0.139) Elapsed 0m 1s (remain 0m 44s) Loss: 0.9757(0.9757) Grad Norm: 5.6815  LR: 1.6030e-04  
Epoch: [37][31/32] Data 0.114 (0.127) Elapsed 0m 45s (remain 0m 0s) Loss: 0.6627(0.4350) Grad Norm: 4.1919  LR: 1.6030e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2 11 10  8]
preds: [ 5  5 11  2  8]
Epoch 37 - avg_train_loss: 0.4350  lr: 1.6030e-04  time: 50s
Epoch 37 - Score: 0.5802
Epoch 37 - Save Best Score: 0.5802 Model


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [38][0/32] Data 0.128 (0.128) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0027(0.0027) Grad Norm: 0.1644  LR: 1.3809e-04  
Epoch: [38][31/32] Data 0.124 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0015(0.4764) Grad Norm: 0.0915  LR: 1.3809e-04  
EVAL: [0/9] Data 0.076 (0.076) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2 11 10  8]
preds: [ 5  5 11  3  8]
Epoch 38 - avg_train_loss: 0.4764  lr: 1.3809e-04  time: 49s
Epoch 38 - Score: 0.5802


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [39][0/32] Data 0.144 (0.144) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0076(0.0076) Grad Norm: 1.6186  LR: 1.1732e-04  
Epoch: [39][31/32] Data 0.115 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 0.7079(0.4131) Grad Norm: 4.1240  LR: 1.1732e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2 11 10  8]
preds: [ 5  5 11  2  8]
Epoch 39 - avg_train_loss: 0.4131  lr: 1.1732e-04  time: 49s
Epoch 39 - Score: 0.5802


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [40][0/32] Data 0.140 (0.140) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0170(0.0170) Grad Norm: 0.7875  LR: 9.8058e-05  
Epoch: [40][31/32] Data 0.114 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0416(0.3657) Grad Norm: 4.1031  LR: 9.8058e-05  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2 11 10  8]
preds: [ 5  5 11  2  8]
Epoch 40 - avg_train_loss: 0.3657  lr: 9.8058e-05  time: 49s
Epoch 40 - Score: 0.5802


EVAL: [8/9] Data 0.015 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [41][0/32] Data 0.135 (0.135) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0013(0.0013) Grad Norm: 0.0521  LR: 8.0390e-05  
Epoch: [41][31/32] Data 0.118 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0002(0.3055) Grad Norm: 0.0052  LR: 8.0390e-05  
EVAL: [0/9] Data 0.078 (0.078) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2 11 10  8]
preds: [ 5  5 11  3  8]
Epoch 41 - avg_train_loss: 0.3055  lr: 8.0390e-05  time: 49s
Epoch 41 - Score: 0.5802


EVAL: [8/9] Data 0.014 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [42][0/32] Data 0.131 (0.131) Elapsed 0m 1s (remain 0m 43s) Loss: 0.9410(0.9410) Grad Norm: 5.9104  LR: 6.4381e-05  
Epoch: [42][31/32] Data 0.122 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 0.7972(0.5751) Grad Norm: 4.5344  LR: 6.4381e-05  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2 11 10  8]
preds: [ 5  5 11  3  8]
Epoch 42 - avg_train_loss: 0.5751  lr: 6.4381e-05  time: 49s
Epoch 42 - Score: 0.5878
Epoch 42 - Save Best Score: 0.5878 Model


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [43][0/32] Data 0.154 (0.154) Elapsed 0m 1s (remain 0m 44s) Loss: 0.8205(0.8205) Grad Norm: 3.3507  LR: 5.0093e-05  
Epoch: [43][31/32] Data 0.126 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 0.7356(0.4025) Grad Norm: 4.5594  LR: 5.0093e-05  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2 11 10  8]
preds: [ 5  5 11  3  8]
Epoch 43 - avg_train_loss: 0.4025  lr: 5.0093e-05  time: 49s
Epoch 43 - Score: 0.5649


EVAL: [8/9] Data 0.013 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [44][0/32] Data 0.144 (0.144) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0008(0.0008) Grad Norm: 0.0210  LR: 3.7578e-05  
Epoch: [44][31/32] Data 0.130 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0009(0.3968) Grad Norm: 0.0310  LR: 3.7578e-05  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2 11 10  8]
preds: [ 5  5 11  3  8]
Epoch 44 - avg_train_loss: 0.3968  lr: 3.7578e-05  time: 49s
Epoch 44 - Score: 0.5649


EVAL: [8/9] Data 0.015 (0.070) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [45][0/32] Data 0.136 (0.136) Elapsed 0m 1s (remain 0m 44s) Loss: 1.1456(1.1456) Grad Norm: 6.4133  LR: 2.6881e-05  
Epoch: [45][31/32] Data 0.129 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0004(0.3469) Grad Norm: 0.0136  LR: 2.6881e-05  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2 11 10  8]
preds: [ 5  5 11  2  8]
Epoch 45 - avg_train_loss: 0.3469  lr: 2.6881e-05  time: 49s
Epoch 45 - Score: 0.5725


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [46][0/32] Data 0.144 (0.144) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0332(0.0332) Grad Norm: 3.3251  LR: 1.8039e-05  
Epoch: [46][31/32] Data 0.126 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0004(0.0020) Grad Norm: 0.0273  LR: 1.8039e-05  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2 11 10  8]
preds: [ 5  5 11  2  8]
Epoch 46 - avg_train_loss: 0.0020  lr: 1.8039e-05  time: 49s
Epoch 46 - Score: 0.5649


EVAL: [8/9] Data 0.014 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [47][0/32] Data 0.133 (0.133) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0022(0.0022) Grad Norm: 0.1040  LR: 1.1073e-05  
Epoch: [47][31/32] Data 0.128 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0013(0.0014) Grad Norm: 0.2907  LR: 1.1073e-05  
EVAL: [0/9] Data 0.079 (0.079) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2 11 10  8]
preds: [ 5  5 11  2  8]
Epoch 47 - avg_train_loss: 0.0014  lr: 1.1073e-05  time: 49s
Epoch 47 - Score: 0.5649


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [48][0/32] Data 0.124 (0.124) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0018(0.0018) Grad Norm: 0.0713  LR: 5.9882e-06  
Epoch: [48][31/32] Data 0.121 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0004(0.0056) Grad Norm: 0.0153  LR: 5.9882e-06  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2 11 10  8]
preds: [ 5  5 11  2  8]
Epoch 48 - avg_train_loss: 0.0056  lr: 5.9882e-06  time: 49s
Epoch 48 - Score: 0.5649


EVAL: [8/9] Data 0.015 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [49][0/32] Data 0.134 (0.134) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0004(0.0004) Grad Norm: 0.0150  LR: 2.7534e-06  
Epoch: [49][31/32] Data 0.120 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0003(0.0017) Grad Norm: 0.0096  LR: 2.7534e-06  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2 11 10  8]
preds: [ 5  5 11  2  8]
Epoch 49 - avg_train_loss: 0.0017  lr: 2.7534e-06  time: 49s
Epoch 49 - Score: 0.5649


EVAL: [8/9] Data 0.014 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [50][0/32] Data 0.134 (0.134) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0004(0.0004) Grad Norm: 0.0270  LR: 1.2467e-06  
Epoch: [50][31/32] Data 0.121 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0015(0.0054) Grad Norm: 0.0369  LR: 1.2467e-06  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [ 5  2 11 10  8]
preds: [ 5  5 11  2  8]
Epoch 50 - avg_train_loss: 0.0054  lr: 1.2467e-06  time: 49s
Epoch 50 - Score: 0.5802


EVAL: [8/9] Data 0.015 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [1][0/32] Data 0.112 (0.112) Elapsed 0m 1s (remain 0m 42s) Loss: 2.4089(2.4089) Grad Norm: inf  LR: 1.0000e-03  
Epoch: [1][31/32] Data 0.117 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 2.4033(2.4920) Grad Norm: 12.2481  LR: 1.0000e-03  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 6 2 3 6]
preds: [6 2 2 2 2]
Epoch 1 - avg_train_loss: 2.4920  lr: 1.0000e-03  time: 49s
Epoch 1 - Score: 0.2462
Epoch 1 - Save Best Score: 0.2462 Model


EVAL: [8/9] Data 0.010 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [2][0/32] Data 0.131 (0.131) Elapsed 0m 1s (remain 0m 45s) Loss: 2.1805(2.1805) Grad Norm: 8.9978  LR: 9.9803e-04  
Epoch: [2][31/32] Data 0.117 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 1.7727(2.2449) Grad Norm: 10.2102  LR: 9.9803e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 6 2 3 6]
preds: [8 2 2 2 7]
Epoch 2 - avg_train_loss: 2.2449  lr: 9.9803e-04  time: 49s
Epoch 2 - Score: 0.4000
Epoch 2 - Save Best Score: 0.4000 Model


EVAL: [8/9] Data 0.011 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [3][0/32] Data 0.128 (0.128) Elapsed 0m 1s (remain 0m 44s) Loss: 2.3563(2.3563) Grad Norm: 8.8625  LR: 9.9312e-04  
Epoch: [3][31/32] Data 0.104 (0.120) Elapsed 0m 45s (remain 0m 0s) Loss: 1.2690(1.8838) Grad Norm: 12.6723  LR: 9.9312e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 6 2 3 6]
preds: [8 2 2 2 9]
Epoch 3 - avg_train_loss: 1.8838  lr: 9.9312e-04  time: 49s
Epoch 3 - Score: 0.4538
Epoch 3 - Save Best Score: 0.4538 Model


EVAL: [8/9] Data 0.009 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [4][0/32] Data 0.143 (0.143) Elapsed 0m 1s (remain 0m 44s) Loss: 1.8211(1.8211) Grad Norm: 10.2890  LR: 9.8627e-04  
Epoch: [4][31/32] Data 0.115 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 2.2325(1.6906) Grad Norm: 15.8858  LR: 9.8627e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 6 2 3 6]
preds: [8 2 2 2 7]
Epoch 4 - avg_train_loss: 1.6906  lr: 9.8627e-04  time: 49s
Epoch 4 - Score: 0.4615
Epoch 4 - Save Best Score: 0.4615 Model


EVAL: [8/9] Data 0.009 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [5][0/32] Data 0.144 (0.144) Elapsed 0m 1s (remain 0m 45s) Loss: 1.7413(1.7413) Grad Norm: 19.4380  LR: 9.7751e-04  
Epoch: [5][31/32] Data 0.119 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 2.1266(1.4559) Grad Norm: 16.1953  LR: 9.7751e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 6 2 3 6]
preds: [8 0 5 6 8]
Epoch 5 - avg_train_loss: 1.4559  lr: 9.7751e-04  time: 49s
Epoch 5 - Score: 0.5462
Epoch 5 - Save Best Score: 0.5462 Model


EVAL: [8/9] Data 0.009 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [6][0/32] Data 0.144 (0.144) Elapsed 0m 1s (remain 0m 44s) Loss: 0.9256(0.9256) Grad Norm: 13.3548  LR: 9.6688e-04  
Epoch: [6][31/32] Data 0.124 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 0.8429(1.2820) Grad Norm: 15.6972  LR: 9.6688e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 6 2 3 6]
preds: [8 6 5 4 7]
Epoch 6 - avg_train_loss: 1.2820  lr: 9.6688e-04  time: 49s
Epoch 6 - Score: 0.5385


EVAL: [8/9] Data 0.010 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [7][0/32] Data 0.127 (0.127) Elapsed 0m 1s (remain 0m 43s) Loss: 1.8029(1.8029) Grad Norm: 11.2995  LR: 9.5441e-04  
Epoch: [7][31/32] Data 0.131 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.6718(1.3805) Grad Norm: 14.6416  LR: 9.5441e-04  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 6 2 3 6]
preds: [8 2 2 2 8]
Epoch 7 - avg_train_loss: 1.3805  lr: 9.5441e-04  time: 49s
Epoch 7 - Score: 0.4692


EVAL: [8/9] Data 0.009 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [8][0/32] Data 0.136 (0.136) Elapsed 0m 1s (remain 0m 44s) Loss: 0.8795(0.8795) Grad Norm: 18.5405  LR: 9.4016e-04  
Epoch: [8][31/32] Data 0.117 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 1.6691(1.0782) Grad Norm: 12.2697  LR: 9.4016e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 6 2 3 6]
preds: [8 6 2 0 9]
Epoch 8 - avg_train_loss: 1.0782  lr: 9.4016e-04  time: 49s
Epoch 8 - Score: 0.6231
Epoch 8 - Save Best Score: 0.6231 Model


EVAL: [8/9] Data 0.009 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [9][0/32] Data 0.140 (0.140) Elapsed 0m 1s (remain 0m 44s) Loss: 1.5978(1.5978) Grad Norm: 9.1656  LR: 9.2418e-04  
Epoch: [9][31/32] Data 0.114 (0.120) Elapsed 0m 45s (remain 0m 0s) Loss: 1.2573(1.0024) Grad Norm: 9.6238  LR: 9.2418e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 6 2 3 6]
preds: [8 2 2 2 7]
Epoch 9 - avg_train_loss: 1.0024  lr: 9.2418e-04  time: 49s
Epoch 9 - Score: 0.5462


EVAL: [8/9] Data 0.009 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [10][0/32] Data 0.129 (0.129) Elapsed 0m 1s (remain 0m 43s) Loss: 0.2190(0.2190) Grad Norm: 8.5442  LR: 9.0654e-04  
Epoch: [10][31/32] Data 0.115 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.4654(0.7399) Grad Norm: 11.6474  LR: 9.0654e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 6 2 3 6]
preds: [8 0 5 4 0]
Epoch 10 - avg_train_loss: 0.7399  lr: 9.0654e-04  time: 49s
Epoch 10 - Score: 0.5923


EVAL: [8/9] Data 0.009 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [11][0/32] Data 0.129 (0.129) Elapsed 0m 1s (remain 0m 43s) Loss: 0.3231(0.3231) Grad Norm: 8.6443  LR: 8.8730e-04  
Epoch: [11][31/32] Data 0.129 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.1959(1.0979) Grad Norm: 6.6072  LR: 8.8730e-04  
EVAL: [0/9] Data 0.068 (0.068) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 6 2 3 6]
preds: [8 6 2 2 2]
Epoch 11 - avg_train_loss: 1.0979  lr: 8.8730e-04  time: 49s
Epoch 11 - Score: 0.6077


EVAL: [8/9] Data 0.009 (0.063) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [12][0/32] Data 0.142 (0.142) Elapsed 0m 1s (remain 0m 44s) Loss: 0.3030(0.3030) Grad Norm: 7.5716  LR: 8.6655e-04  
Epoch: [12][31/32] Data 0.122 (0.120) Elapsed 0m 45s (remain 0m 0s) Loss: 0.3530(0.9797) Grad Norm: 8.5461  LR: 8.6655e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 6 2 3 6]
preds: [8 0 2 3 0]
Epoch 12 - avg_train_loss: 0.9797  lr: 8.6655e-04  time: 49s
Epoch 12 - Score: 0.5615


EVAL: [8/9] Data 0.009 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [13][0/32] Data 0.140 (0.140) Elapsed 0m 1s (remain 0m 44s) Loss: 0.1356(0.1356) Grad Norm: 5.0849  LR: 8.4436e-04  
Epoch: [13][31/32] Data 0.119 (0.120) Elapsed 0m 45s (remain 0m 0s) Loss: 1.1916(1.0036) Grad Norm: 8.2658  LR: 8.4436e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 6 2 3 6]
preds: [8 6 5 2 2]
Epoch 13 - avg_train_loss: 1.0036  lr: 8.4436e-04  time: 49s
Epoch 13 - Score: 0.5385


EVAL: [8/9] Data 0.009 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [14][0/32] Data 0.141 (0.141) Elapsed 0m 1s (remain 0m 44s) Loss: 1.5102(1.5102) Grad Norm: 7.7621  LR: 8.2081e-04  
Epoch: [14][31/32] Data 0.117 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 1.5375(0.8248) Grad Norm: 8.5856  LR: 8.2081e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 6 2 3 6]
preds: [8 6 5 4 0]
Epoch 14 - avg_train_loss: 0.8248  lr: 8.2081e-04  time: 49s
Epoch 14 - Score: 0.5615


EVAL: [8/9] Data 0.009 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [15][0/32] Data 0.125 (0.125) Elapsed 0m 1s (remain 0m 43s) Loss: 0.3668(0.3668) Grad Norm: 10.0654  LR: 7.9601e-04  
Epoch: [15][31/32] Data 0.108 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 1.9546(1.0519) Grad Norm: 17.1827  LR: 7.9601e-04  
EVAL: [0/9] Data 0.068 (0.068) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 6 2 3 6]
preds: [8 6 5 4 2]
Epoch 15 - avg_train_loss: 1.0519  lr: 7.9601e-04  time: 49s
Epoch 15 - Score: 0.5462


EVAL: [8/9] Data 0.009 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [16][0/32] Data 0.141 (0.141) Elapsed 0m 1s (remain 0m 44s) Loss: 1.3403(1.3403) Grad Norm: 6.3739  LR: 7.7006e-04  
Epoch: [16][31/32] Data 0.110 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.1064(0.8082) Grad Norm: 3.6549  LR: 7.7006e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 6 2 3 6]
preds: [8 9 5 2 7]
Epoch 16 - avg_train_loss: 0.8082  lr: 7.7006e-04  time: 49s
Epoch 16 - Score: 0.5385


EVAL: [8/9] Data 0.009 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [17][0/32] Data 0.134 (0.134) Elapsed 0m 1s (remain 0m 44s) Loss: 1.3186(1.3186) Grad Norm: 7.4262  LR: 7.4304e-04  
Epoch: [17][31/32] Data 0.126 (0.120) Elapsed 0m 45s (remain 0m 0s) Loss: 1.4612(0.9350) Grad Norm: 10.1498  LR: 7.4304e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 6 2 3 6]
preds: [8 6 5 2 2]
Epoch 17 - avg_train_loss: 0.9350  lr: 7.4304e-04  time: 49s
Epoch 17 - Score: 0.5385


EVAL: [8/9] Data 0.009 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [18][0/32] Data 0.141 (0.141) Elapsed 0m 1s (remain 0m 44s) Loss: 0.5219(0.5219) Grad Norm: 14.6485  LR: 7.1508e-04  
Epoch: [18][31/32] Data 0.128 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 1.2923(0.8068) Grad Norm: 7.3319  LR: 7.1508e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 6 2 3 6]
preds: [8 0 5 4 0]
Epoch 18 - avg_train_loss: 0.8068  lr: 7.1508e-04  time: 49s
Epoch 18 - Score: 0.5462


EVAL: [8/9] Data 0.009 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [19][0/32] Data 0.131 (0.131) Elapsed 0m 1s (remain 0m 44s) Loss: 0.3487(0.3487) Grad Norm: 13.2197  LR: 6.8627e-04  
Epoch: [19][31/32] Data 0.134 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 1.3113(0.8559) Grad Norm: 7.7861  LR: 6.8627e-04  
EVAL: [0/9] Data 0.075 (0.075) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 6 2 3 6]
preds: [8 0 5 5 2]
Epoch 19 - avg_train_loss: 0.8559  lr: 6.8627e-04  time: 49s
Epoch 19 - Score: 0.5462


EVAL: [8/9] Data 0.009 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [20][0/32] Data 0.138 (0.138) Elapsed 0m 1s (remain 0m 44s) Loss: 1.6230(1.6230) Grad Norm: 9.1619  LR: 6.5674e-04  
Epoch: [20][31/32] Data 0.121 (0.120) Elapsed 0m 45s (remain 0m 0s) Loss: 0.1600(0.6028) Grad Norm: 5.9840  LR: 6.5674e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 6 2 3 6]
preds: [8 0 5 4 7]
Epoch 20 - avg_train_loss: 0.6028  lr: 6.5674e-04  time: 49s
Epoch 20 - Score: 0.5692


EVAL: [8/9] Data 0.009 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [21][0/32] Data 0.149 (0.149) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0172(0.0172) Grad Norm: 0.6003  LR: 6.2661e-04  
Epoch: [21][31/32] Data 0.115 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0630(0.6554) Grad Norm: 3.7055  LR: 6.2661e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 6 2 3 6]
preds: [8 0 2 2 7]
Epoch 21 - avg_train_loss: 0.6554  lr: 6.2661e-04  time: 49s
Epoch 21 - Score: 0.5846


EVAL: [8/9] Data 0.009 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [22][0/32] Data 0.139 (0.139) Elapsed 0m 1s (remain 0m 44s) Loss: 1.0281(1.0281) Grad Norm: 3.5847  LR: 5.9598e-04  
Epoch: [22][31/32] Data 0.125 (0.120) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0754(0.6437) Grad Norm: 3.4018  LR: 5.9598e-04  
EVAL: [0/9] Data 0.068 (0.068) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 6 2 3 6]
preds: [8 6 4 4 0]
Epoch 22 - avg_train_loss: 0.6437  lr: 5.9598e-04  time: 49s
Epoch 22 - Score: 0.5692


EVAL: [8/9] Data 0.009 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [23][0/32] Data 0.139 (0.139) Elapsed 0m 1s (remain 0m 44s) Loss: 1.3897(1.3897) Grad Norm: 6.3737  LR: 5.6498e-04  
Epoch: [23][31/32] Data 0.120 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.8090(0.6917) Grad Norm: 5.2471  LR: 5.6498e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 6 2 3 6]
preds: [8 6 5 4 2]
Epoch 23 - avg_train_loss: 0.6917  lr: 5.6498e-04  time: 49s
Epoch 23 - Score: 0.5615


EVAL: [8/9] Data 0.009 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [24][0/32] Data 0.131 (0.131) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0147(0.0147) Grad Norm: 0.7613  LR: 5.3373e-04  
Epoch: [24][31/32] Data 0.123 (0.119) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0316(0.5655) Grad Norm: 3.7551  LR: 5.3373e-04  
EVAL: [0/9] Data 0.070 (0.070) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 6 2 3 6]
preds: [8 0 2 4 0]
Epoch 24 - avg_train_loss: 0.5655  lr: 5.3373e-04  time: 49s
Epoch 24 - Score: 0.5615


EVAL: [8/9] Data 0.009 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [25][0/32] Data 0.141 (0.141) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0325(0.0325) Grad Norm: 3.2744  LR: 5.0236e-04  
Epoch: [25][31/32] Data 0.104 (0.120) Elapsed 0m 45s (remain 0m 0s) Loss: 0.9223(0.6335) Grad Norm: 6.0025  LR: 5.0236e-04  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 6 2 3 6]
preds: [8 0 5 2 0]
Epoch 25 - avg_train_loss: 0.6335  lr: 5.0236e-04  time: 49s
Epoch 25 - Score: 0.5923


EVAL: [8/9] Data 0.009 (0.063) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [26][0/32] Data 0.120 (0.120) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0299(0.0299) Grad Norm: 2.2144  LR: 4.7099e-04  
Epoch: [26][31/32] Data 0.122 (0.118) Elapsed 0m 44s (remain 0m 0s) Loss: 0.4852(0.6021) Grad Norm: 10.2158  LR: 4.7099e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 6 2 3 6]
preds: [8 6 2 4 0]
Epoch 26 - avg_train_loss: 0.6021  lr: 4.7099e-04  time: 49s
Epoch 26 - Score: 0.6231


EVAL: [8/9] Data 0.009 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [27][0/32] Data 0.139 (0.139) Elapsed 0m 1s (remain 0m 44s) Loss: 1.0444(1.0444) Grad Norm: 6.5160  LR: 4.3974e-04  
Epoch: [27][31/32] Data 0.118 (0.120) Elapsed 0m 45s (remain 0m 0s) Loss: 1.3987(0.5049) Grad Norm: 10.0787  LR: 4.3974e-04  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 6 2 3 6]
preds: [8 6 5 4 3]
Epoch 27 - avg_train_loss: 0.5049  lr: 4.3974e-04  time: 49s
Epoch 27 - Score: 0.5385


EVAL: [8/9] Data 0.010 (0.062) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [28][0/32] Data 0.135 (0.135) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0222(0.0222) Grad Norm: 1.6765  LR: 4.0874e-04  
Epoch: [28][31/32] Data 0.113 (0.116) Elapsed 0m 44s (remain 0m 0s) Loss: 0.6993(0.5471) Grad Norm: 3.1095  LR: 4.0874e-04  
EVAL: [0/9] Data 0.068 (0.068) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 6 2 3 6]
preds: [8 6 5 4 3]
Epoch 28 - avg_train_loss: 0.5471  lr: 4.0874e-04  time: 49s
Epoch 28 - Score: 0.5692


EVAL: [8/9] Data 0.009 (0.063) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [29][0/32] Data 0.133 (0.133) Elapsed 0m 1s (remain 0m 43s) Loss: 1.0546(1.0546) Grad Norm: 7.5403  LR: 3.7811e-04  
Epoch: [29][31/32] Data 0.130 (0.114) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0117(0.6032) Grad Norm: 0.3614  LR: 3.7811e-04  
EVAL: [0/9] Data 0.068 (0.068) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 6 2 3 6]
preds: [8 0 2 4 0]
Epoch 29 - avg_train_loss: 0.6032  lr: 3.7811e-04  time: 49s
Epoch 29 - Score: 0.5923


EVAL: [8/9] Data 0.009 (0.062) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [30][0/32] Data 0.115 (0.115) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0164(0.0164) Grad Norm: 1.8976  LR: 3.4797e-04  
Epoch: [30][31/32] Data 0.104 (0.118) Elapsed 0m 44s (remain 0m 0s) Loss: 1.0715(0.5448) Grad Norm: 5.1480  LR: 3.4797e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 6 2 3 6]
preds: [8 6 5 4 0]
Epoch 30 - avg_train_loss: 0.5448  lr: 3.4797e-04  time: 49s
Epoch 30 - Score: 0.5538


EVAL: [8/9] Data 0.009 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [31][0/32] Data 0.139 (0.139) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0044(0.0044) Grad Norm: 0.2850  LR: 3.1843e-04  
Epoch: [31][31/32] Data 0.118 (0.119) Elapsed 0m 44s (remain 0m 0s) Loss: 0.0119(0.4399) Grad Norm: 2.7256  LR: 3.1843e-04  
EVAL: [0/9] Data 0.069 (0.069) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 6 2 3 6]
preds: [8 6 2 2 0]
Epoch 31 - avg_train_loss: 0.4399  lr: 3.1843e-04  time: 49s
Epoch 31 - Score: 0.5769


EVAL: [8/9] Data 0.009 (0.063) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [32][0/32] Data 0.138 (0.138) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0085(0.0085) Grad Norm: 0.6042  LR: 2.8962e-04  
Epoch: [32][31/32] Data 0.122 (0.119) Elapsed 0m 44s (remain 0m 0s) Loss: 1.1308(0.5440) Grad Norm: 7.4347  LR: 2.8962e-04  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 6 2 3 6]
preds: [8 6 2 4 2]
Epoch 32 - avg_train_loss: 0.5440  lr: 2.8962e-04  time: 49s
Epoch 32 - Score: 0.5923


EVAL: [8/9] Data 0.009 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [33][0/32] Data 0.134 (0.134) Elapsed 0m 1s (remain 0m 44s) Loss: 0.9522(0.9522) Grad Norm: 12.0809  LR: 2.6165e-04  
Epoch: [33][31/32] Data 0.124 (0.122) Elapsed 0m 45s (remain 0m 0s) Loss: 0.8521(0.5848) Grad Norm: 5.7731  LR: 2.6165e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 6 2 3 6]
preds: [8 6 2 4 0]
Epoch 33 - avg_train_loss: 0.5848  lr: 2.6165e-04  time: 49s
Epoch 33 - Score: 0.6385
Epoch 33 - Save Best Score: 0.6385 Model


EVAL: [8/9] Data 0.009 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [34][0/32] Data 0.165 (0.165) Elapsed 0m 1s (remain 0m 45s) Loss: 1.2147(1.2147) Grad Norm: 7.3031  LR: 2.3463e-04  
Epoch: [34][31/32] Data 0.117 (0.121) Elapsed 0m 45s (remain 0m 0s) Loss: 0.7205(0.4980) Grad Norm: 3.3384  LR: 2.3463e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 6 2 3 6]
preds: [8 6 5 2 0]
Epoch 34 - avg_train_loss: 0.4980  lr: 2.3463e-04  time: 49s
Epoch 34 - Score: 0.6231


EVAL: [8/9] Data 0.009 (0.064) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [35][0/32] Data 0.132 (0.132) Elapsed 0m 1s (remain 0m 43s) Loss: 0.7676(0.7676) Grad Norm: 3.0427  LR: 2.0866e-04  
Epoch: [35][31/32] Data 0.121 (0.119) Elapsed 0m 44s (remain 0m 0s) Loss: 0.7838(0.6516) Grad Norm: 4.3802  LR: 2.0866e-04  
EVAL: [0/9] Data 0.068 (0.068) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 6 2 3 6]
preds: [8 6 2 2 0]
Epoch 35 - avg_train_loss: 0.6516  lr: 2.0866e-04  time: 49s
Epoch 35 - Score: 0.6308


EVAL: [8/9] Data 0.010 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [36][0/32] Data 0.131 (0.131) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0029(0.0029) Grad Norm: 0.0895  LR: 1.8385e-04  
Epoch: [36][31/32] Data 0.132 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0026(0.4218) Grad Norm: 0.1068  LR: 1.8385e-04  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 6 2 3 6]
preds: [8 6 2 2 0]
Epoch 36 - avg_train_loss: 0.4218  lr: 1.8385e-04  time: 49s
Epoch 36 - Score: 0.6154


EVAL: [8/9] Data 0.010 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [37][0/32] Data 0.128 (0.128) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0026(0.0026) Grad Norm: 0.2293  LR: 1.6030e-04  
Epoch: [37][31/32] Data 0.138 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 0.9603(0.4544) Grad Norm: 7.9249  LR: 1.6030e-04  
EVAL: [0/9] Data 0.075 (0.075) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 6 2 3 6]
preds: [8 6 5 2 0]
Epoch 37 - avg_train_loss: 0.4544  lr: 1.6030e-04  time: 49s
Epoch 37 - Score: 0.6385


EVAL: [8/9] Data 0.009 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [38][0/32] Data 0.131 (0.131) Elapsed 0m 1s (remain 0m 44s) Loss: 1.0092(1.0092) Grad Norm: 4.7100  LR: 1.3809e-04  
Epoch: [38][31/32] Data 0.131 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.6029(0.3902) Grad Norm: 1.5835  LR: 1.3809e-04  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 6 2 3 6]
preds: [8 6 5 2 0]
Epoch 38 - avg_train_loss: 0.3902  lr: 1.3809e-04  time: 49s
Epoch 38 - Score: 0.6615
Epoch 38 - Save Best Score: 0.6615 Model


EVAL: [8/9] Data 0.010 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [39][0/32] Data 0.136 (0.136) Elapsed 0m 1s (remain 0m 45s) Loss: 1.1251(1.1251) Grad Norm: 6.0453  LR: 1.1732e-04  
Epoch: [39][31/32] Data 0.127 (0.123) Elapsed 0m 45s (remain 0m 0s) Loss: 0.7622(0.4351) Grad Norm: 3.7622  LR: 1.1732e-04  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 6 2 3 6]
preds: [8 6 2 2 0]
Epoch 39 - avg_train_loss: 0.4351  lr: 1.1732e-04  time: 49s
Epoch 39 - Score: 0.6385


EVAL: [8/9] Data 0.010 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [40][0/32] Data 0.125 (0.125) Elapsed 0m 1s (remain 0m 44s) Loss: 0.8969(0.8969) Grad Norm: 5.8446  LR: 9.8058e-05  
Epoch: [40][31/32] Data 0.113 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0013(0.2328) Grad Norm: 0.0418  LR: 9.8058e-05  
EVAL: [0/9] Data 0.071 (0.071) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 6 2 3 6]
preds: [8 6 5 2 0]
Epoch 40 - avg_train_loss: 0.2328  lr: 9.8058e-05  time: 49s
Epoch 40 - Score: 0.6385


EVAL: [8/9] Data 0.009 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [41][0/32] Data 0.133 (0.133) Elapsed 0m 1s (remain 0m 44s) Loss: 0.9671(0.9671) Grad Norm: 5.4712  LR: 8.0390e-05  
Epoch: [41][31/32] Data 0.122 (0.124) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0008(0.4249) Grad Norm: 0.0217  LR: 8.0390e-05  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 6 2 3 6]
preds: [8 6 5 2 0]
Epoch 41 - avg_train_loss: 0.4249  lr: 8.0390e-05  time: 49s
Epoch 41 - Score: 0.6462


EVAL: [8/9] Data 0.010 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [42][0/32] Data 0.142 (0.142) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0009(0.0009) Grad Norm: 0.0272  LR: 6.4381e-05  
Epoch: [42][31/32] Data 0.127 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 0.9138(0.3191) Grad Norm: 8.7679  LR: 6.4381e-05  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 6 2 3 6]
preds: [8 6 5 2 0]
Epoch 42 - avg_train_loss: 0.3191  lr: 6.4381e-05  time: 49s
Epoch 42 - Score: 0.6385


EVAL: [8/9] Data 0.009 (0.065) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [43][0/32] Data 0.142 (0.142) Elapsed 0m 1s (remain 0m 44s) Loss: 0.6632(0.6632) Grad Norm: 3.2159  LR: 5.0093e-05  
Epoch: [43][31/32] Data 0.130 (0.131) Elapsed 0m 45s (remain 0m 0s) Loss: 0.6821(0.5121) Grad Norm: 4.3187  LR: 5.0093e-05  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 6 2 3 6]
preds: [8 6 5 2 0]
Epoch 43 - avg_train_loss: 0.5121  lr: 5.0093e-05  time: 50s
Epoch 43 - Score: 0.6615


EVAL: [8/9] Data 0.010 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [44][0/32] Data 0.132 (0.132) Elapsed 0m 1s (remain 0m 43s) Loss: 0.6855(0.6855) Grad Norm: 2.7638  LR: 3.7578e-05  
Epoch: [44][31/32] Data 0.125 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 0.6563(0.5036) Grad Norm: 3.1979  LR: 3.7578e-05  
EVAL: [0/9] Data 0.073 (0.073) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 6 2 3 6]
preds: [8 6 5 2 0]
Epoch 44 - avg_train_loss: 0.5036  lr: 3.7578e-05  time: 50s
Epoch 44 - Score: 0.6615


EVAL: [8/9] Data 0.010 (0.067) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [45][0/32] Data 0.150 (0.150) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0010(0.0010) Grad Norm: 0.0484  LR: 2.6881e-05  
Epoch: [45][31/32] Data 0.127 (0.127) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0013(0.2840) Grad Norm: 0.0478  LR: 2.6881e-05  
EVAL: [0/9] Data 0.074 (0.074) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 6 2 3 6]
preds: [8 6 5 2 0]
Epoch 45 - avg_train_loss: 0.2840  lr: 2.6881e-05  time: 50s
Epoch 45 - Score: 0.6462


EVAL: [8/9] Data 0.009 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [46][0/32] Data 0.126 (0.126) Elapsed 0m 1s (remain 0m 43s) Loss: 0.0015(0.0015) Grad Norm: 0.0611  LR: 1.8039e-05  
Epoch: [46][31/32] Data 0.130 (0.126) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0009(0.0027) Grad Norm: 0.0270  LR: 1.8039e-05  
EVAL: [0/9] Data 0.072 (0.072) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 6 2 3 6]
preds: [8 6 5 2 0]
Epoch 46 - avg_train_loss: 0.0027  lr: 1.8039e-05  time: 50s
Epoch 46 - Score: 0.6385


EVAL: [8/9] Data 0.009 (0.066) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [47][0/32] Data 0.139 (0.139) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0022(0.0022) Grad Norm: 0.0895  LR: 1.1073e-05  
Epoch: [47][31/32] Data 0.111 (0.125) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0008(0.0091) Grad Norm: 0.0433  LR: 1.1073e-05  
EVAL: [0/9] Data 0.081 (0.081) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 6 2 3 6]
preds: [8 6 5 2 0]
Epoch 47 - avg_train_loss: 0.0091  lr: 1.1073e-05  time: 49s
Epoch 47 - Score: 0.6462


EVAL: [8/9] Data 0.010 (0.069) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [48][0/32] Data 0.137 (0.137) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0095(0.0095) Grad Norm: 0.5868  LR: 5.9882e-06  
Epoch: [48][31/32] Data 0.145 (0.127) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0100(0.0015) Grad Norm: 0.9758  LR: 5.9882e-06  
EVAL: [0/9] Data 0.079 (0.079) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 6 2 3 6]
preds: [8 6 5 2 0]
Epoch 48 - avg_train_loss: 0.0015  lr: 5.9882e-06  time: 50s
Epoch 48 - Score: 0.6462


EVAL: [8/9] Data 0.011 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [49][0/32] Data 0.146 (0.146) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0009(0.0009) Grad Norm: 0.0287  LR: 2.7534e-06  
Epoch: [49][31/32] Data 0.136 (0.130) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0014(0.0011) Grad Norm: 0.0453  LR: 2.7534e-06  
EVAL: [0/9] Data 0.077 (0.077) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 6 2 3 6]
preds: [8 6 5 2 0]
Epoch 49 - avg_train_loss: 0.0011  lr: 2.7534e-06  time: 50s
Epoch 49 - Score: 0.6462


EVAL: [8/9] Data 0.009 (0.068) Elapsed 0m 4s (remain 0m 0s) 
Epoch: [50][0/32] Data 0.141 (0.141) Elapsed 0m 1s (remain 0m 44s) Loss: 0.0003(0.0003) Grad Norm: 0.0112  LR: 1.2467e-06  
Epoch: [50][31/32] Data 0.125 (0.128) Elapsed 0m 45s (remain 0m 0s) Loss: 0.0009(0.0016) Grad Norm: 0.1049  LR: 1.2467e-06  
EVAL: [0/9] Data 0.075 (0.075) Elapsed 0m 0s (remain 0m 4s) 


labels: [8 6 2 3 6]
preds: [8 6 5 2 0]
Epoch 50 - avg_train_loss: 0.0016  lr: 1.2467e-06  time: 50s
Epoch 50 - Score: 0.6462

oof score: 0.6529051987767585



EVAL: [8/9] Data 0.010 (0.068) Elapsed 0m 4s (remain 0m 0s) 



train finish!!!
