# Experiment 3: Optimizer

- Adam
- AdamW
- MadGrad

In [1]:
import os
import sys
import pickle
import glob
import time
from tqdm import tqdm
from collections import Counter

# scikit-learn
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import StratifiedKFold

# Data preprocessing
import cv2
import numpy as np
import pandas as pd

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
%matplotlib inline

# pytorch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
import torchvision
from torchvision import datasets, transforms
torch.manual_seed(0)
print(f'PyTorch version: {torch.__version__}')

# device setting
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(f'This notebook use {device}')

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

PyTorch version: 1.7.1
This notebook use cuda:0


In [2]:
# 파일 경로 사용자 정의
class path:
    data = '/opt/ml/input/original_data'
    train = f'{data}/train'
    train_img = f'{train}/images'
    train_df = f'{train}/train.csv'
    test = f'{data}/eval'
    test_img = f'{test}/images'
    test_df = f'{test}/info.csv'

In [3]:
BATCH_SIZE = 16
NUM_WORKERS = 2
LEARNING_RATE = 1e-4
EPOCHS = 3

## 1. Dataset

In [4]:
class MaskDataset(Dataset):
    def __init__(self, df, transform=None):
        self.df = df
        self.transform = transform
        
    def set_transform(self, transform):
        self.transform = transform
        
    def __getitem__(self, idx):
        data = self.df.iloc[idx]
        target = data.target
        image = Image.open(data.path)
        
        if self.transform:
            image = self.transform(image)
            
        return image, target
    
    def __len__(self):
        return len(self.df)

In [5]:
class AddGaussianNoise(object):
    def __init__(self, mean=0., std=1.):
        self.std = std
        self.mean = mean
        
    def __call__(self, tensor):
        return tensor + torch.randn(tensor.size()) * self.std + self.mean
    
    def __repr__(self):
        return self.__class__.__name__ + '(mean={0}, std={1})'.format(self.mean, self.std)

In [6]:
train_transforms = transforms.Compose([
    transforms.CenterCrop(384),
    transforms.Resize(224),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.ColorJitter(brightness=0.5, saturation=0.5, hue=0.5),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.548, 0.504, 0.479], std=[0.237, 0.247, 0.246]),
    AddGaussianNoise(0., 1.),
])

In [7]:
valid_transforms = transforms.Compose([
    transforms.CenterCrop(384),
    transforms.Resize(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.548, 0.504, 0.479], std=[0.237, 0.247, 0.246]),
])

## 2. Modeling

### 2.1. Adam

In [8]:
# adam
model1 = torchvision.models.resnet18(pretrained=False)
n_features = model1.fc.in_features
model1.fc = nn.Linear(n_features, 18)
model1 = model1.cuda()

optimizer1 = optim.Adam(model1.parameters(), lr=LEARNING_RATE)
criterion1 = nn.CrossEntropyLoss().to(device)

### 2.2. AdamW

In [10]:
# adamw
model2 = torchvision.models.resnet18(pretrained=False)
n_features = model2.fc.in_features
model2.fc = nn.Linear(n_features, 18)
model2 = model2.cuda()

optimizer2 = optim.AdamW(model2.parameters(), lr=LEARNING_RATE)
criterion2 = nn.CrossEntropyLoss().to(device)

### 2.3. MadGrad

In [12]:
import math
from typing import TYPE_CHECKING, Any, Callable, Optional

import torch
import torch.optim

if TYPE_CHECKING:
    from torch.optim.optimizer import _params_t
else:
    _params_t = Any
    
class MADGRAD(torch.optim.Optimizer):
    """
    MADGRAD_: A Momentumized, Adaptive, Dual Averaged Gradient Method for Stochastic 
    Optimization.
    .. _MADGRAD: https://arxiv.org/abs/2101.11075
    MADGRAD is a general purpose optimizer that can be used in place of SGD or
    Adam may converge faster and generalize better. Currently GPU-only.
    Typically, the same learning rate schedule that is used for SGD or Adam may
    be used. The overall learning rate is not comparable to either method and
    should be determined by a hyper-parameter sweep.
    MADGRAD requires less weight decay than other methods, often as little as
    zero. Momentum values used for SGD or Adam's beta1 should work here also.
    On sparse problems both weight_decay and momentum should be set to 0.
    Arguments:
        params (iterable): 
            Iterable of parameters to optimize or dicts defining parameter groups.
        lr (float): 
            Learning rate (default: 1e-2).
        momentum (float): 
            Momentum value in  the range [0,1) (default: 0.9).
        weight_decay (float): 
            Weight decay, i.e. a L2 penalty (default: 0).
        eps (float): 
            Term added to the denominator outside of the root operation to improve numerical stability. (default: 1e-6).
    """

    def __init__(
        self, params: _params_t, lr: float = 1e-2, momentum: float = 0.9, weight_decay: float = 0, eps: float = 1e-6,
    ):
        if momentum < 0 or momentum >= 1:
            raise ValueError(f"Momentum {momentum} must be in the range [0,1]")
        if lr <= 0:
            raise ValueError(f"Learning rate {lr} must be positive")
        if weight_decay < 0:
            raise ValueError(f"Weight decay {weight_decay} must be non-negative")
        if eps < 0:
            raise ValueError(f"Eps must be non-negative")

        defaults = dict(lr=lr, eps=eps, momentum=momentum, weight_decay=weight_decay)
        super().__init__(params, defaults)

    @property
    def supports_memory_efficient_fp16(self) -> bool:
        return False

    @property
    def supports_flat_params(self) -> bool:
        return True

    def step(self, closure: Optional[Callable[[], float]] = None) -> Optional[float]:
        """Performs a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        # step counter must be stored in state to ensure correct behavior under
        # optimizer sharding
        if 'k' not in self.state:
            self.state['k'] = torch.tensor([0], dtype=torch.long)
        k = self.state['k'].item()

        for group in self.param_groups:
            eps = group["eps"]
            lr = group["lr"] + eps
            decay = group["weight_decay"]
            momentum = group["momentum"]

            ck = 1 - momentum
            lamb = lr * math.pow(k + 1, 0.5)

            for p in group["params"]:
                if p.grad is None:
                    continue
                grad = p.grad.data
                state = self.state[p]

                if "grad_sum_sq" not in state:
                    state["grad_sum_sq"] = torch.zeros_like(p.data).detach()
                    state["s"] = torch.zeros_like(p.data).detach()
                    if momentum != 0:
                        state["x0"] = torch.clone(p.data).detach()

                if momentum != 0.0 and grad.is_sparse:
                    raise RuntimeError("momentum != 0 is not compatible with sparse gradients")

                grad_sum_sq = state["grad_sum_sq"]
                s = state["s"]

                # Apply weight decay
                if decay != 0:
                    if grad.is_sparse:
                        raise RuntimeError("weight_decay option is not compatible with sparse gradients")

                    grad.add_(p.data, alpha=decay)

                if grad.is_sparse:
                    grad = grad.coalesce()
                    grad_val = grad._values()

                    p_masked = p.sparse_mask(grad)
                    grad_sum_sq_masked = grad_sum_sq.sparse_mask(grad)
                    s_masked = s.sparse_mask(grad)

                    # Compute x_0 from other known quantities
                    rms_masked_vals = grad_sum_sq_masked._values().pow(1 / 3).add_(eps)
                    x0_masked_vals = p_masked._values().addcdiv(s_masked._values(), rms_masked_vals, value=1)

                    # Dense + sparse op
                    grad_sq = grad * grad
                    grad_sum_sq.add_(grad_sq, alpha=lamb)
                    grad_sum_sq_masked.add_(grad_sq, alpha=lamb)

                    rms_masked_vals = grad_sum_sq_masked._values().pow_(1 / 3).add_(eps)

                    s.add_(grad, alpha=lamb)
                    s_masked._values().add_(grad_val, alpha=lamb)

                    # update masked copy of p
                    p_kp1_masked_vals = x0_masked_vals.addcdiv(s_masked._values(), rms_masked_vals, value=-1)
                    # Copy updated masked p to dense p using an add operation
                    p_masked._values().add_(p_kp1_masked_vals, alpha=-1)
                    p.data.add_(p_masked, alpha=-1)
                else:
                    if momentum == 0:
                        # Compute x_0 from other known quantities
                        rms = grad_sum_sq.pow(1 / 3).add_(eps)
                        x0 = p.data.addcdiv(s, rms, value=1)
                    else:
                        x0 = state["x0"]

                    # Accumulate second moments
                    grad_sum_sq.addcmul_(grad, grad, value=lamb)
                    rms = grad_sum_sq.pow(1 / 3).add_(eps)

                    # Update s
                    s.data.add_(grad, alpha=lamb)

                    # Step
                    if momentum == 0:
                        p.data.copy_(x0.addcdiv(s, rms, value=-1))
                    else:
                        z = x0.addcdiv(s, rms, value=-1)

                        # p is a moving average of z
                        p.data.mul_(1 - ck).add_(z, alpha=ck)


        self.state['k'] += 1
        return loss

In [13]:
# madgrad
model3 = torchvision.models.resnet18(pretrained=False)
n_features = model3.fc.in_features
model3.fc = nn.Linear(n_features, 18)
model3 = model3.cuda()

optimizer3 = MADGRAD(model3.parameters(), lr=LEARNING_RATE)
criterion3 = nn.CrossEntropyLoss().to(device)

## 3. Training

In [14]:
def test_eval(model, valid_dataset):
    model.eval()
    with torch.no_grad():
        y_true, y_pred = [], []
        for image, label in tqdm(valid_dataset):
            X = image.float().to(device)
            y = label.item()
            _, pred = torch.max(model(X), 1)
            pred = pred.item()
            y_true.append(y)
            y_pred.append(pred)
        y_true, y_pred = np.array(y_true), np.array(y_pred)
        f1 = f1_score(y_true, y_pred, average='macro')
        accuracy = accuracy_score(y_true, y_pred)
    model.train()
    return f1, accuracy

In [15]:
def train_model(train, test, model, criterion, optimizer, print_every=1):
    print(f"============ Training Starts! ============")
    best_accuracy = 0
    for epoch in range(EPOCHS):
        loss_sum = 0
        for images, label in tqdm(train):
            X = images.float().to(device)
            y = label.to(device)
            
            y_pred = model(X)
            loss = criterion(y_pred, y)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            loss_sum += loss
            
        if ((epoch % print_every) == 0) or (epoch == (EPOCHS - 1)):
            loss_avg = loss_sum / len(train)
            f1, accuracy = test_eval(model, test)
            print(f">> epoch:[{epoch + 1}/{EPOCHS}] cost: {loss_avg:5.3f} test_accuracy: {accuracy:5.3f} test_f1_score: {f1:5.3f}")
            
    print(f"============ Training Done! ============")

In [16]:
def cross_validation(df, model, criterion, optimizer, k_folds=5):
    skf = StratifiedKFold(n_splits=5)
    for n_iter, (train_idx, valid_idx) in enumerate(skf.split(df, df.target), start=1):
        print(f'>> Cross Validation {n_iter} Starts!')
        train, valid = df.loc[train_idx], df.loc[valid_idx]
        train_dataset, valid_dataset = MaskDataset(train), MaskDataset(valid)
        
        # augmentation 설정
        train_dataset.set_transform(train_transforms)
        valid_dataset.set_transform(valid_transforms)
        
        # DataLoader 생성
        train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, shuffle=True)
        valid_loader = DataLoader(valid_dataset, shuffle=False)
        
        train_model(train_loader, valid_loader, model, criterion, optimizer)
        print()

In [17]:
df = pd.read_csv(f'{path.train}/train_modified.csv')[['path', 'target']]

### 3.1 Adam

In [18]:
cross_validation(df, model1, criterion1, optimizer1)

  0%|          | 0/945 [00:00<?, ?it/s]

>> Cross Validation 1 Starts!


100%|██████████| 945/945 [02:10<00:00,  7.23it/s]
100%|██████████| 3780/3780 [01:22<00:00, 45.62it/s]
  0%|          | 0/945 [00:00<?, ?it/s]

>> epoch:[1/3] cost: 1.619 test_accuracy: 0.372 test_f1_score: 0.194


100%|██████████| 945/945 [02:13<00:00,  7.10it/s]
100%|██████████| 3780/3780 [01:18<00:00, 48.13it/s]
  0%|          | 0/945 [00:00<?, ?it/s]

>> epoch:[2/3] cost: 0.911 test_accuracy: 0.437 test_f1_score: 0.305


100%|██████████| 945/945 [02:11<00:00,  7.16it/s]
100%|██████████| 3780/3780 [01:16<00:00, 49.36it/s]
  0%|          | 0/945 [00:00<?, ?it/s]

>> epoch:[3/3] cost: 0.677 test_accuracy: 0.488 test_f1_score: 0.291

>> Cross Validation 2 Starts!


100%|██████████| 945/945 [02:13<00:00,  7.06it/s]
100%|██████████| 3780/3780 [01:18<00:00, 47.93it/s]
  0%|          | 0/945 [00:00<?, ?it/s]

>> epoch:[1/3] cost: 0.581 test_accuracy: 0.469 test_f1_score: 0.271


100%|██████████| 945/945 [02:13<00:00,  7.07it/s]
100%|██████████| 3780/3780 [01:24<00:00, 44.58it/s]
  0%|          | 0/945 [00:00<?, ?it/s]

>> epoch:[2/3] cost: 0.477 test_accuracy: 0.743 test_f1_score: 0.543


100%|██████████| 945/945 [02:18<00:00,  6.83it/s]
100%|██████████| 3780/3780 [01:23<00:00, 45.21it/s]
  0%|          | 0/945 [00:00<?, ?it/s]

>> epoch:[3/3] cost: 0.398 test_accuracy: 0.671 test_f1_score: 0.520

>> Cross Validation 3 Starts!


100%|██████████| 945/945 [02:18<00:00,  6.83it/s]
100%|██████████| 3780/3780 [01:21<00:00, 46.31it/s]
  0%|          | 0/945 [00:00<?, ?it/s]

>> epoch:[1/3] cost: 0.367 test_accuracy: 0.722 test_f1_score: 0.612


100%|██████████| 945/945 [02:14<00:00,  7.01it/s]
100%|██████████| 3780/3780 [01:20<00:00, 47.18it/s]
  0%|          | 0/945 [00:00<?, ?it/s]

>> epoch:[2/3] cost: 0.292 test_accuracy: 0.756 test_f1_score: 0.594


100%|██████████| 945/945 [02:15<00:00,  7.00it/s]
100%|██████████| 3780/3780 [00:45<00:00, 83.30it/s]
  0%|          | 0/945 [00:00<?, ?it/s]

>> epoch:[3/3] cost: 0.241 test_accuracy: 0.796 test_f1_score: 0.642

>> Cross Validation 4 Starts!


100%|██████████| 945/945 [01:48<00:00,  8.71it/s]
100%|██████████| 3780/3780 [00:46<00:00, 81.74it/s]
  0%|          | 0/945 [00:00<?, ?it/s]

>> epoch:[1/3] cost: 0.254 test_accuracy: 0.719 test_f1_score: 0.673


100%|██████████| 945/945 [01:46<00:00,  8.85it/s]
100%|██████████| 3780/3780 [00:45<00:00, 82.26it/s]
  0%|          | 0/945 [00:00<?, ?it/s]

>> epoch:[2/3] cost: 0.205 test_accuracy: 0.810 test_f1_score: 0.720


100%|██████████| 945/945 [01:48<00:00,  8.72it/s]
100%|██████████| 3780/3780 [00:46<00:00, 81.00it/s]
  0%|          | 0/945 [00:00<?, ?it/s]

>> epoch:[3/3] cost: 0.174 test_accuracy: 0.765 test_f1_score: 0.660

>> Cross Validation 5 Starts!


100%|██████████| 945/945 [01:47<00:00,  8.80it/s]
100%|██████████| 3780/3780 [00:45<00:00, 82.49it/s]
  0%|          | 0/945 [00:00<?, ?it/s]

>> epoch:[1/3] cost: 0.197 test_accuracy: 0.921 test_f1_score: 0.865


100%|██████████| 945/945 [01:45<00:00,  8.94it/s]
100%|██████████| 3780/3780 [00:46<00:00, 81.57it/s]
  0%|          | 0/945 [00:00<?, ?it/s]

>> epoch:[2/3] cost: 0.143 test_accuracy: 0.835 test_f1_score: 0.788


100%|██████████| 945/945 [01:46<00:00,  8.87it/s]
100%|██████████| 3780/3780 [00:46<00:00, 81.16it/s]


>> epoch:[3/3] cost: 0.125 test_accuracy: 0.833 test_f1_score: 0.746



### 3.2 AdamW

In [19]:
cross_validation(df, model2, criterion3, optimizer2)

  0%|          | 0/945 [00:00<?, ?it/s]

>> Cross Validation 1 Starts!


100%|██████████| 945/945 [01:47<00:00,  8.81it/s]
100%|██████████| 3780/3780 [00:43<00:00, 86.17it/s]
  0%|          | 0/945 [00:00<?, ?it/s]

>> epoch:[1/3] cost: 1.640 test_accuracy: 0.202 test_f1_score: 0.125


100%|██████████| 945/945 [01:46<00:00,  8.84it/s]
100%|██████████| 3780/3780 [00:45<00:00, 82.97it/s]
  0%|          | 0/945 [00:00<?, ?it/s]

>> epoch:[2/3] cost: 0.926 test_accuracy: 0.090 test_f1_score: 0.033


100%|██████████| 945/945 [01:48<00:00,  8.69it/s]
100%|██████████| 3780/3780 [00:46<00:00, 82.16it/s]
  0%|          | 0/945 [00:00<?, ?it/s]

>> epoch:[3/3] cost: 0.684 test_accuracy: 0.273 test_f1_score: 0.162

>> Cross Validation 2 Starts!


100%|██████████| 945/945 [01:48<00:00,  8.70it/s]
100%|██████████| 3780/3780 [00:46<00:00, 81.49it/s]
  0%|          | 0/945 [00:00<?, ?it/s]

>> epoch:[1/3] cost: 0.589 test_accuracy: 0.542 test_f1_score: 0.311


100%|██████████| 945/945 [01:47<00:00,  8.76it/s]
100%|██████████| 3780/3780 [00:47<00:00, 79.07it/s]
  0%|          | 0/945 [00:00<?, ?it/s]

>> epoch:[2/3] cost: 0.483 test_accuracy: 0.591 test_f1_score: 0.420


100%|██████████| 945/945 [01:51<00:00,  8.46it/s]
100%|██████████| 3780/3780 [00:47<00:00, 79.45it/s]
  0%|          | 0/945 [00:00<?, ?it/s]

>> epoch:[3/3] cost: 0.390 test_accuracy: 0.516 test_f1_score: 0.412

>> Cross Validation 3 Starts!


100%|██████████| 945/945 [01:51<00:00,  8.51it/s]
100%|██████████| 3780/3780 [00:48<00:00, 77.70it/s]
  0%|          | 0/945 [00:00<?, ?it/s]

>> epoch:[1/3] cost: 0.365 test_accuracy: 0.531 test_f1_score: 0.424


100%|██████████| 945/945 [01:48<00:00,  8.74it/s]
100%|██████████| 3780/3780 [00:47<00:00, 78.80it/s]
  0%|          | 0/945 [00:00<?, ?it/s]

>> epoch:[2/3] cost: 0.295 test_accuracy: 0.712 test_f1_score: 0.551


100%|██████████| 945/945 [01:51<00:00,  8.51it/s]
100%|██████████| 3780/3780 [00:48<00:00, 77.68it/s]
  0%|          | 0/945 [00:00<?, ?it/s]

>> epoch:[3/3] cost: 0.238 test_accuracy: 0.706 test_f1_score: 0.565

>> Cross Validation 4 Starts!


100%|██████████| 945/945 [01:48<00:00,  8.67it/s]
100%|██████████| 3780/3780 [00:47<00:00, 80.37it/s]
  0%|          | 0/945 [00:00<?, ?it/s]

>> epoch:[1/3] cost: 0.260 test_accuracy: 0.829 test_f1_score: 0.720


100%|██████████| 945/945 [01:48<00:00,  8.68it/s]
100%|██████████| 3780/3780 [00:47<00:00, 79.95it/s]
  0%|          | 0/945 [00:00<?, ?it/s]

>> epoch:[2/3] cost: 0.204 test_accuracy: 0.832 test_f1_score: 0.715


100%|██████████| 945/945 [01:50<00:00,  8.59it/s]
100%|██████████| 3780/3780 [00:46<00:00, 81.40it/s]
  0%|          | 0/945 [00:00<?, ?it/s]

>> epoch:[3/3] cost: 0.173 test_accuracy: 0.825 test_f1_score: 0.710

>> Cross Validation 5 Starts!


100%|██████████| 945/945 [01:47<00:00,  8.76it/s]
100%|██████████| 3780/3780 [00:46<00:00, 81.06it/s]
  0%|          | 0/945 [00:00<?, ?it/s]

>> epoch:[1/3] cost: 0.189 test_accuracy: 0.863 test_f1_score: 0.783


100%|██████████| 945/945 [01:47<00:00,  8.80it/s]
100%|██████████| 3780/3780 [00:46<00:00, 81.08it/s]
  0%|          | 0/945 [00:00<?, ?it/s]

>> epoch:[2/3] cost: 0.144 test_accuracy: 0.812 test_f1_score: 0.706


100%|██████████| 945/945 [01:50<00:00,  8.54it/s]
100%|██████████| 3780/3780 [00:48<00:00, 78.29it/s]


>> epoch:[3/3] cost: 0.124 test_accuracy: 0.837 test_f1_score: 0.721



### 3.3 MadGrad

In [20]:
cross_validation(df, model3, criterion3, optimizer3)

  0%|          | 0/945 [00:00<?, ?it/s]

>> Cross Validation 1 Starts!


100%|██████████| 945/945 [01:49<00:00,  8.65it/s]
100%|██████████| 3780/3780 [00:45<00:00, 82.97it/s]
  0%|          | 0/945 [00:00<?, ?it/s]

>> epoch:[1/3] cost: 1.548 test_accuracy: 0.569 test_f1_score: 0.434


100%|██████████| 945/945 [01:47<00:00,  8.79it/s]
100%|██████████| 3780/3780 [00:47<00:00, 79.73it/s]
  0%|          | 0/945 [00:00<?, ?it/s]

>> epoch:[2/3] cost: 0.891 test_accuracy: 0.723 test_f1_score: 0.491


100%|██████████| 945/945 [01:49<00:00,  8.59it/s]
100%|██████████| 3780/3780 [00:46<00:00, 80.71it/s]
  0%|          | 0/945 [00:00<?, ?it/s]

>> epoch:[3/3] cost: 0.685 test_accuracy: 0.733 test_f1_score: 0.538

>> Cross Validation 2 Starts!


100%|██████████| 945/945 [01:48<00:00,  8.68it/s]
100%|██████████| 3780/3780 [00:46<00:00, 80.81it/s]
  0%|          | 0/945 [00:00<?, ?it/s]

>> epoch:[1/3] cost: 0.598 test_accuracy: 0.788 test_f1_score: 0.633


100%|██████████| 945/945 [01:48<00:00,  8.70it/s]
100%|██████████| 3780/3780 [00:48<00:00, 77.61it/s]
  0%|          | 0/945 [00:00<?, ?it/s]

>> epoch:[2/3] cost: 0.503 test_accuracy: 0.845 test_f1_score: 0.697


100%|██████████| 945/945 [01:50<00:00,  8.58it/s]
100%|██████████| 3780/3780 [00:47<00:00, 79.30it/s]
  0%|          | 0/945 [00:00<?, ?it/s]

>> epoch:[3/3] cost: 0.438 test_accuracy: 0.812 test_f1_score: 0.706

>> Cross Validation 3 Starts!


100%|██████████| 945/945 [01:48<00:00,  8.68it/s]
100%|██████████| 3780/3780 [00:48<00:00, 78.16it/s]
  0%|          | 0/945 [00:00<?, ?it/s]

>> epoch:[1/3] cost: 0.388 test_accuracy: 0.824 test_f1_score: 0.635


100%|██████████| 945/945 [01:49<00:00,  8.65it/s]
100%|██████████| 3780/3780 [00:47<00:00, 79.89it/s]
  0%|          | 0/945 [00:00<?, ?it/s]

>> epoch:[2/3] cost: 0.341 test_accuracy: 0.829 test_f1_score: 0.707


100%|██████████| 945/945 [01:50<00:00,  8.53it/s]
100%|██████████| 3780/3780 [00:45<00:00, 82.49it/s]
  0%|          | 0/945 [00:00<?, ?it/s]

>> epoch:[3/3] cost: 0.288 test_accuracy: 0.828 test_f1_score: 0.649

>> Cross Validation 4 Starts!


100%|██████████| 945/945 [01:47<00:00,  8.76it/s]
100%|██████████| 3780/3780 [00:46<00:00, 80.66it/s]
  0%|          | 0/945 [00:00<?, ?it/s]

>> epoch:[1/3] cost: 0.314 test_accuracy: 0.815 test_f1_score: 0.621


100%|██████████| 945/945 [01:47<00:00,  8.81it/s]
100%|██████████| 3780/3780 [00:46<00:00, 80.50it/s]
  0%|          | 0/945 [00:00<?, ?it/s]

>> epoch:[2/3] cost: 0.253 test_accuracy: 0.858 test_f1_score: 0.763


100%|██████████| 945/945 [01:48<00:00,  8.69it/s]
100%|██████████| 3780/3780 [00:47<00:00, 80.42it/s]
  0%|          | 0/945 [00:00<?, ?it/s]

>> epoch:[3/3] cost: 0.202 test_accuracy: 0.876 test_f1_score: 0.789

>> Cross Validation 5 Starts!


100%|██████████| 945/945 [01:48<00:00,  8.71it/s]
100%|██████████| 3780/3780 [00:45<00:00, 82.81it/s]
  0%|          | 0/945 [00:00<?, ?it/s]

>> epoch:[1/3] cost: 0.247 test_accuracy: 0.907 test_f1_score: 0.854


100%|██████████| 945/945 [01:47<00:00,  8.80it/s]
100%|██████████| 3780/3780 [00:46<00:00, 81.35it/s]
  0%|          | 0/945 [00:00<?, ?it/s]

>> epoch:[2/3] cost: 0.178 test_accuracy: 0.878 test_f1_score: 0.817


100%|██████████| 945/945 [01:47<00:00,  8.81it/s]
100%|██████████| 3780/3780 [00:46<00:00, 82.11it/s]


>> epoch:[3/3] cost: 0.170 test_accuracy: 0.890 test_f1_score: 0.818

