In [112]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from math import ceil
import os
import pandas as pd
import numpy as np
import time
import random
from PIL import Image
from torch.utils.data import Dataset
from torchvision import transforms
from torch.utils.data.sampler import SubsetRandomSampler
from torch.utils.data import DataLoader
import torch.nn.functional as F
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
%matplotlib inline

In [113]:
####################################
### SETTINGS
####################################

# Initializing expand_ratio, channels, repeats, stride, kernel_size
# for base 

base_model = [
    [1, 16, 1, 1, 3],
    [6, 24, 2, 2, 3],
    [6, 40, 2, 2, 5],
    [6, 80, 3, 2, 3],
    [6, 112, 3, 1, 5],
    [6, 192, 4, 2, 5],
    [6, 320, 1, 1, 3]
]

# Initializing config for each verion (phi_value, resolution, drop_rate)
version_config = {
    'b0': (0, 224, 0.2),
    'b1': (0.5, 240, 0.2),
    'b2': (1, 260, 0.3),
    'b3': (2, 300, 0.3),
    'b4': (3, 380, 0.4),
    'b5': (4, 456, 0.4)
}

# Hyperparameter setting
RANDOM_SEED = 1
NUM_CLASSES = 10 # using CIFAR-10
BATCH_SIZE = 128
NUM_EPOCHS = 20
version = "b0"
phi, res, drop_rate = version_config[version]
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

####################################
### LEARNING RATE SETTINGS
####################################

iter_per_ep = len(train_loader.sampler.indices) // train_loader.batch_size
base_lr = 0.01
max_lr = 0.1
batch_step = -1
cur_lr = base_lr

In [114]:
####################################
### CIFAR-10 DATASET
####################################

# Scale inout images to 0-1 range using transforms.ToTensor()

np.random.seed(RANDOM_SEED)
idx = np.arange(50000)
np.random.shuffle(idx)
val_idx, train_idx = idx[:1000], idx[1000:]
train_sampler = SubsetRandomSampler(train_idx)
val_sampler = SubsetRandomSampler(val_idx)

train_dataset = datasets.CIFAR10(root = 'data',
                                 train = True,
                                 transform = transforms.ToTensor(),
                                 download = True)

test_dataset = datasets.CIFAR10(root = 'data',
                                 train = False,
                                 transform = transforms.ToTensor())

train_loader = DataLoader(dataset = train_dataset,
                         batch_size = BATCH_SIZE,
                         #shuffle = True,
                         sampler = train_sampler)

val_loader = DataLoader(dataset = train_dataset,
                       batch_size = BATCH_SIZE,
                       sampler = val_sampler)

test_loader = DataLoader(dataset = test_dataset,
                         batch_size = BATCH_SIZE,
                         shuffle = False)

# Checking the dataset
for images, labels in train_loader:
    print('Image Batch Dimensions:', images.shape)
    print('Image Label Dimensions:', labels.shape)
    break

Files already downloaded and verified
Image Batch Dimensions: torch.Size([128, 3, 32, 32])
Image Label Dimensions: torch.Size([128])


In [115]:
####################################
### MODEL
####################################

# 1. Start with Convolutional Neural Network block
# Con -> Batch_norm -> ReLU
class CNN_block(nn.Module):
    def __init__(self, in_ch, out_ch, kernel_size,
                 stride, padding, groups = 1):
        
        super(CNN_block, self).__init__()
        self.cnn = nn.Conv2d(in_ch, out_ch, kernel_size,
                             stride, padding, groups = groups,
                             bias = False) # If we set group = 1 -> normal conv, groups = in_ch -> Depthwise conv
        self.batch_norm = nn.BatchNorm2d(out_ch)
        self.silu = nn.SiLU() # SiLU == Swish
        
    def forward(self, x):
        
        out = self.cnn(x)
        out = self.batch_norm(out)
        out = self.silu(out)
        
        return out

# 2. Squeeze Excitation trick used inside Inverted Res block
class Squeeze_Excitation(nn.Module):
    def __init__(self, in_ch, reduced_dim):
        
        super(Squeeze_Excitation, self).__init__()
        self.se = nn.Sequential(
            nn.AdaptiveAvgPool2d(1), # C x H x W -> C x 1 x 1
            nn.Conv2d(in_ch, reduced_dim, 1),
            nn.SiLU(),
            nn.Conv2d(reduced_dim, in_ch, 1),
            nn.Sigmoid()
        )
        
    def forward(self, x):
        return x * self.se(x) # each channel, multiply the values from sequential (How much we should prioritize the channels)

# 3. Inverted Residual Block
class Inverted_Residual_block(nn.Module):
    # expand_ratio takes inputs and expands to higher number of channels
    # reduce by 1/4 (for squeeze excitation)
    # survival_prob is for stochastic depth
    def __init__(self, in_ch, out_ch, kernel_size,
                stride, padding, expand_ratio, reduction = 4,
                survival_prob = 0.8):
        
        super(Inverted_Residual_block, self).__init__()
        self.survival_prob = 0.8
        self.use_residual = in_ch == out_ch and stride == 1
        hidden_dim = in_ch * expand_ratio
        self.expand = in_ch != hidden_dim
        reduced_dim = int(in_ch/reduction)
        
        if self.expand:
            self.expand_conv = CNN_block(in_ch,
                                        hidden_dim,
                                        kernel_size = 3,
                                        stride = 1,
                                        padding = 1)
        
        self.conv = nn.Sequential(
            CNN_block(
                hidden_dim, hidden_dim, kernel_size, stride,
                padding, groups = hidden_dim
            ),
            Squeeze_Excitation(hidden_dim, reduced_dim),
            nn.Conv2d(hidden_dim, out_ch, 1, bias = False),
            nn.BatchNorm2d(out_ch)
        )
    
    def Stochastic_Depth(self, x):
        if not self.training: # like-dropout, randomly remove certain layer
            return x
        # compute value 0 or 1 for each example
        binary_tensor = torch.rand(x.shape[0], 1, 1, 1, device = x.device) < self.survival_prob
        
        return torch.div(x, self.survival_prob) * binary_tensor # try to maintain mean and sd in the batch
        
    def forward(self, x):
        
        out = self.expand_conv(x) if self.expand else x
        
        if self.use_residual:
            return self.Stochastic_Depth(self.conv(out)) + x
        else: # if we down sampled or if the channel changed
            return self.conv(out)
        
class EfficientNet(nn.Module):
    def __init__(self, version, num_classes):
        super(EfficientNet, self).__init__()
        width_factor, depth_factor, dropout_rate = self.calculate_factors(version)
        last_channels = ceil(1280 * width_factor)
        self.pool = nn.AdaptiveAvgPool2d(1)
        self.features = self.feature_extraction(width_factor, depth_factor, last_channels)
        self.classifier = nn.Sequential(
            nn.Dropout(dropout_rate),
            nn.Linear(last_channels, num_classes)
        )
        
    def calculate_factors(self, version, alpha = 1.2, beta = 1.1):
        phi, resolution, drop_rate = version_config[version]
        depth_factor = alpha ** phi # how many layers we should increase for each stage
        width_factor = beta ** phi # how much larger the channels should be
        return depth_factor, width_factor, drop_rate
    
    def feature_extraction(self, width_factor, depth_factor, last_channels):
        channels = int(32 * width_factor)
        features = [CNN_block(3, channels, 3, stride = 2, padding = 1)]
        in_ch = channels
        
        for expand_ratio, channels, repeats, stride, kernel_size in base_model:
            out_ch = 4 *ceil(int(channels*width_factor)/4) # modulus of 4 since we identify reduction = 4 in SE
            layers_repeats = ceil(repeats * depth_factor)
            
            for layer in range(layers_repeats):
                features.append(
                    Inverted_Residual_block(
                        in_ch,
                        out_ch,
                        expand_ratio = expand_ratio,
                        stride = stride if layer == 0 else 1,
                        kernel_size = kernel_size,
                        padding = kernel_size // 2, # if k=1 -> pad=0, k=3 -> pad=1, k=5-> pad=2
                    )
                )
                in_ch = out_ch
        features.append(
            CNN_block(in_ch, last_channels, kernel_size = 1,
                     stride = 1, padding = 0)
        )
        
        return nn.Sequential(*features)
    
    def forward(self, x):
        
        out = self.features(x)
        out = self.pool(out)
        
        return self.classifier(out.view(out.shape[0], -1))
    
def test():
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    version = "b0"
    phi, res, drop_rate = version_config[version]
    num_examples, num_classes = 4,10
    x = torch.randn((num_examples, 3, res, res)).to(DEVICE)
    model = EfficientNet(version = version,
                        num_classes = num_classes).to(DEVICE)
    
    print(model(x).shape) # (num_examples, num_classes)
    
#test()

In [116]:
####################################
### CYCLICAL LEARNING RATE
####################################

def cyclical_learning_rate(batch_step,
                           step_size,
                           base_lr=0.001,
                           max_lr=0.006,
                           mode='triangular',
                           gamma=0.999995):

    cycle = np.floor(1 + batch_step / (2. * step_size))
    x = np.abs(batch_step / float(step_size) - 2 * cycle + 1)

    lr_delta = (max_lr - base_lr) * np.maximum(0, (1 - x))
    
    if mode == 'triangular':
        pass
    elif mode == 'triangular2':
        lr_delta = lr_delta * 1 / (2. ** (cycle - 1))
    elif mode == 'exp_range':
        lr_delta = lr_delta * (gamma**(batch_step))
    else:
        raise ValueError('mode must be "triangular", "triangular2", or "exp_range"')
        
    lr = base_lr + lr_delta
    
    return lr

In [117]:
torch.manual_seed(RANDOM_SEED)

########################
### COST AND OPTIMIZER
########################

model = EfficientNet(version = version, num_classes = NUM_CLASSES).to(DEVICE)
model.to(DEVICE)

optimizer = torch.optim.SGD(model.parameters(), lr = base_lr, momentum = 0.9)

In [119]:
def compute_accuracy_and_loss(model, data_loader, device):
    correct_pred, num_examples = 0, 0
    cross_entropy = 0.
    for i, (features, targets) in enumerate(data_loader):
            
        features = features.to(device)
        targets = targets.to(device)

        outputs = model(features)
        _, preds = torch.max(outputs, 1)
        cross_entropy += F.cross_entropy(outputs, targets)
        num_examples += targets.size(0)
        correct_pred += (preds == targets).sum()
    return correct_pred.float()/num_examples * 100, cross_entropy/num_examples
    
collect = {'epoch': [], 'cost': [], 'train_cost':[], 
           'val_cost': [], 'train_acc': [], 'val_acc': [],
           'learn_rate': []}
start_time = time.time()
train_acc_lst, test_acc_lst = [], []
train_loss_lst, test_loss_lst = [], []
f1_score_lst = []

for epoch in range(NUM_EPOCHS):
    collect['learn_rate'].append(base_lr)
    epoch_avg_cost = 0.
    model.train()
    
    for batch_idx, (features, targets) in enumerate(train_loader):
        
        batch_step += 1
        ### PREPARE MINIBATCH
        features = features.to(DEVICE)
        targets = targets.to(DEVICE)
            
        ### FORWARD AND BACK PROP
        outputs = model(features)
        _, preds = torch.max(outputs, 1)
        cost = F.cross_entropy(outputs, targets)
        optimizer.zero_grad()
        
        cost.backward()
        f1_score_lst.append(f1_score(targets, preds, average = "macro"))
        
        ### UPDATE MODEL PARAMETERS
        optimizer.step()
        
        epoch_avg_cost += cost
        
        ### LOGGING
        if not batch_idx % 20:
            print (f'Epoch: {epoch+1:03d}/{NUM_EPOCHS:03d} | '
                   f'Batch {batch_idx:03d}/{len(train_loader):03d} |' 
                   f' Cost: {cost:.4f}')

    # no need to build the computation graph for backprop when computing accuracy
    model.eval()
    with torch.set_grad_enabled(False):
        train_acc, train_loss = compute_accuracy_and_loss(model, train_loader, device=DEVICE)
        val_acc, val_loss = compute_accuracy_and_loss(model, val_loader, device=DEVICE)
        #test_acc, test_loss = compute_accuracy_and_loss(model, test_loader, device=DEVICE)
        epoch_avg_cost /= batch_idx + 1
        collect['epoch'].append(epoch+1)
        collect['train_acc'].append(train_acc_fixed)
        collect['train_cost'].append(train_loss_fixed)
        collect['val_acc'].append(val_acc_fixed)
        collect['val_cost'].append(val_loss_fixed)
        collect['cost'].append(epoch_avg_cost / iter_per_ep)
        print('base_lr: ', base_lr)
        print(f'Epoch: {epoch+1:03d}/{NUM_EPOCHS:03d} Train Acc.: {train_acc:.2f}%'
              f' | Validation Acc.: {val_acc:.2f}%')
    
    #############################################
    # Update learning rate
    base_lr = cyclical_learning_rate(batch_step=batch_step,
                                     step_size=NUM_EPOCHS*iter_per_ep,
                                     base_lr=base_lr,
                                     max_lr=max_lr)
    
    for g in optimizer.param_groups:
        g['lr'] = base_lr
    #############################################
    
    elapsed = (time.time() - start_time)/60
    print(f'Time elapsed: {elapsed:.2f} min')
    
  
elapsed = (time.time() - start_time)/60
print(f'Total Training Time: {elapsed:.2f} min')

Epoch: 001/020 | Batch 000/383 | Cost: 1.4604
Epoch: 001/020 | Batch 020/383 | Cost: 1.5650
Epoch: 001/020 | Batch 040/383 | Cost: 1.6830
Epoch: 001/020 | Batch 060/383 | Cost: 1.7872
Epoch: 001/020 | Batch 080/383 | Cost: 1.6068
Epoch: 001/020 | Batch 100/383 | Cost: 1.9296
Epoch: 001/020 | Batch 120/383 | Cost: 1.6093
Epoch: 001/020 | Batch 140/383 | Cost: 1.7641
Epoch: 001/020 | Batch 160/383 | Cost: 1.5573
Epoch: 001/020 | Batch 180/383 | Cost: 1.6447
Epoch: 001/020 | Batch 200/383 | Cost: 1.5172
Epoch: 001/020 | Batch 220/383 | Cost: 1.5761
Epoch: 001/020 | Batch 240/383 | Cost: 1.4257
Epoch: 001/020 | Batch 260/383 | Cost: 1.5313
Epoch: 001/020 | Batch 280/383 | Cost: 1.6719
Epoch: 001/020 | Batch 300/383 | Cost: 1.6395
Epoch: 001/020 | Batch 320/383 | Cost: 1.5684
Epoch: 001/020 | Batch 340/383 | Cost: 1.5570
Epoch: 001/020 | Batch 360/383 | Cost: 1.4578
Epoch: 001/020 | Batch 380/383 | Cost: 1.4346
base_lr:  0.01
Epoch: 001/020 Train Acc.: 50.58% | Validation Acc.: 48.60%
Time 

Epoch: 009/020 | Batch 000/383 | Cost: 0.8153
Epoch: 009/020 | Batch 020/383 | Cost: 0.7454
Epoch: 009/020 | Batch 040/383 | Cost: 0.7204
Epoch: 009/020 | Batch 060/383 | Cost: 0.5954
Epoch: 009/020 | Batch 080/383 | Cost: 0.6474
Epoch: 009/020 | Batch 100/383 | Cost: 0.5983
Epoch: 009/020 | Batch 120/383 | Cost: 0.6855
Epoch: 009/020 | Batch 140/383 | Cost: 0.6470
Epoch: 009/020 | Batch 160/383 | Cost: 0.5842
Epoch: 009/020 | Batch 180/383 | Cost: 0.6448
Epoch: 009/020 | Batch 200/383 | Cost: 0.7179
Epoch: 009/020 | Batch 220/383 | Cost: 0.6976
Epoch: 009/020 | Batch 240/383 | Cost: 0.7008
Epoch: 009/020 | Batch 260/383 | Cost: 0.6277
Epoch: 009/020 | Batch 280/383 | Cost: 0.5007
Epoch: 009/020 | Batch 300/383 | Cost: 0.6372
Epoch: 009/020 | Batch 320/383 | Cost: 0.5949
Epoch: 009/020 | Batch 340/383 | Cost: 0.7252
Epoch: 009/020 | Batch 360/383 | Cost: 0.7733
Epoch: 009/020 | Batch 380/383 | Cost: 0.6899
base_lr:  0.09384193154228651
Epoch: 009/020 Train Acc.: 81.92% | Validation Acc

Epoch: 017/020 | Batch 000/383 | Cost: 0.4195
Epoch: 017/020 | Batch 020/383 | Cost: 0.2722
Epoch: 017/020 | Batch 040/383 | Cost: 0.3789
Epoch: 017/020 | Batch 060/383 | Cost: 0.2587
Epoch: 017/020 | Batch 080/383 | Cost: 0.4078
Epoch: 017/020 | Batch 100/383 | Cost: 0.3788
Epoch: 017/020 | Batch 120/383 | Cost: 0.3396
Epoch: 017/020 | Batch 140/383 | Cost: 0.3821
Epoch: 017/020 | Batch 160/383 | Cost: 0.2213
Epoch: 017/020 | Batch 180/383 | Cost: 0.4078
Epoch: 017/020 | Batch 200/383 | Cost: 0.5205
Epoch: 017/020 | Batch 220/383 | Cost: 0.2368
Epoch: 017/020 | Batch 240/383 | Cost: 0.3161
Epoch: 017/020 | Batch 260/383 | Cost: 0.3893
Epoch: 017/020 | Batch 280/383 | Cost: 0.3889
Epoch: 017/020 | Batch 300/383 | Cost: 0.2667
Epoch: 017/020 | Batch 320/383 | Cost: 0.2396
Epoch: 017/020 | Batch 340/383 | Cost: 0.4693
Epoch: 017/020 | Batch 360/383 | Cost: 0.4676
Epoch: 017/020 | Batch 380/383 | Cost: 0.2765
base_lr:  0.09999958497640907
Epoch: 017/020 Train Acc.: 91.68% | Validation Acc

In [120]:
## Now we test for the case when the learning rate is fixed

model_fixed = EfficientNet(version = version, num_classes = NUM_CLASSES).to(DEVICE)
model_fixed.to(DEVICE)

# base_lr = 0.01
optimizer_fixed = torch.optim.SGD(model_fixed.parameters(), lr = 0.01, momentum = 0.9)

f1_score_fixed = []
collect_fixed = {'epoch': [], 'train_cost': [], 'val_cost': [], 'train_acc': [], 'val_acc': []}
start_time = time.time()

for epoch in range(NUM_EPOCHS):
    epoch_avg_cost_fixed = 0.
    model_fixed.train()
    
    for batch_idx, (features, targets) in enumerate(train_loader):
        
        batch_step += 1
        ### PREPARE MINIBATCH
        features = features.to(DEVICE)
        targets = targets.to(DEVICE)
            
        ### FORWARD AND BACK PROP
        outputs_fixed = model_fixed(features)
        _, preds_fixed = torch.max(outputs_fixed, 1)
        cost_fixed = F.cross_entropy(outputs_fixed, targets)
        optimizer_fixed.zero_grad()
        
        cost_fixed.backward()
        f1_score_fixed.append(f1_score(targets, preds_fixed, average = "macro"))
        
        ### UPDATE MODEL PARAMETERS
        optimizer_fixed.step()
        
        epoch_avg_cost_fixed += cost
        
        ### LOGGING
        if not batch_idx % 20:
            print (f'Epoch: {epoch+1:03d}/{NUM_EPOCHS:03d} | '
                   f'Batch {batch_idx:03d}/{len(train_loader):03d} |' 
                   f' Cost: {cost_fixed:.4f}')

    # no need to build the computation graph for backprop when computing accuracy
    model_fixed.eval()
    with torch.set_grad_enabled(False):
        train_acc_fixed, train_loss_fixed = compute_accuracy_and_loss(model_fixed, train_loader, device=DEVICE)
        val_acc_fixed, val_loss_fixed = compute_accuracy_and_loss(model_fixed, val_loader, device=DEVICE)
        #test_acc_fixed, test_loss_fixed = compute_accuracy_and_loss(model_fixed, test_loader, device=DEVICE)
        epoch_avg_cost_fixed /= batch_idx + 1
        collect_fixed['epoch'].append(epoch+1)
        collect_fixed['train_acc'].append(train_acc_fixed)
        collect_fixed['train_cost'].append(train_loss_fixed)
        collect_fixed['val_acc'].append(val_acc_fixed)
        collect_fixed['val_cost'].append(val_loss_fixed)
        #collect_fixed['cost'].append(epoch_avg_cost_fixed / iter_per_ep)
        print(f'Epoch: {epoch+1:03d}/{NUM_EPOCHS:03d} Train Acc.: {train_acc_fixed:.2f}%'
              f' | Validation Acc.: {val_acc_fixed:.2f}%')
    
    elapsed = (time.time() - start_time)/60
    print(f'Time elapsed: {elapsed:.2f} min')
  
elapsed = (time.time() - start_time)/60
print(f'Total Training Time: {elapsed:.2f} min')

Epoch: 001/020 | Batch 000/383 | Cost: 2.3470
Epoch: 001/020 | Batch 020/383 | Cost: 2.3932
Epoch: 001/020 | Batch 040/383 | Cost: 2.4622
Epoch: 001/020 | Batch 060/383 | Cost: 1.9749
Epoch: 001/020 | Batch 080/383 | Cost: 1.8989
Epoch: 001/020 | Batch 100/383 | Cost: 1.8282
Epoch: 001/020 | Batch 120/383 | Cost: 1.9167
Epoch: 001/020 | Batch 140/383 | Cost: 2.2750
Epoch: 001/020 | Batch 160/383 | Cost: 1.7853
Epoch: 001/020 | Batch 180/383 | Cost: 1.9189
Epoch: 001/020 | Batch 200/383 | Cost: 1.6980
Epoch: 001/020 | Batch 220/383 | Cost: 1.5968
Epoch: 001/020 | Batch 240/383 | Cost: 1.7430
Epoch: 001/020 | Batch 260/383 | Cost: 1.5054
Epoch: 001/020 | Batch 280/383 | Cost: 1.6981
Epoch: 001/020 | Batch 300/383 | Cost: 1.5849
Epoch: 001/020 | Batch 320/383 | Cost: 1.5796
Epoch: 001/020 | Batch 340/383 | Cost: 1.5837
Epoch: 001/020 | Batch 360/383 | Cost: 1.4933
Epoch: 001/020 | Batch 380/383 | Cost: 1.6130
Epoch: 001/020 Train Acc.: 43.03% | Validation Acc.: 40.80%
Time elapsed: 15.00 

Epoch: 009/020 | Batch 080/383 | Cost: 0.5530
Epoch: 009/020 | Batch 100/383 | Cost: 0.6758
Epoch: 009/020 | Batch 120/383 | Cost: 0.6280
Epoch: 009/020 | Batch 140/383 | Cost: 0.6007
Epoch: 009/020 | Batch 160/383 | Cost: 0.5875
Epoch: 009/020 | Batch 180/383 | Cost: 0.5827
Epoch: 009/020 | Batch 200/383 | Cost: 0.6474
Epoch: 009/020 | Batch 220/383 | Cost: 0.5734
Epoch: 009/020 | Batch 240/383 | Cost: 0.8382
Epoch: 009/020 | Batch 260/383 | Cost: 0.6538
Epoch: 009/020 | Batch 280/383 | Cost: 0.6257
Epoch: 009/020 | Batch 300/383 | Cost: 0.6309
Epoch: 009/020 | Batch 320/383 | Cost: 0.5927
Epoch: 009/020 | Batch 340/383 | Cost: 0.6204
Epoch: 009/020 | Batch 360/383 | Cost: 0.5535
Epoch: 009/020 | Batch 380/383 | Cost: 0.5335
Epoch: 009/020 Train Acc.: 84.32% | Validation Acc.: 72.30%
Time elapsed: 584.65 min
Epoch: 010/020 | Batch 000/383 | Cost: 0.5623
Epoch: 010/020 | Batch 020/383 | Cost: 0.5536
Epoch: 010/020 | Batch 040/383 | Cost: 0.5808
Epoch: 010/020 | Batch 060/383 | Cost: 0.

Epoch: 017/020 | Batch 160/383 | Cost: 0.2116
Epoch: 017/020 | Batch 180/383 | Cost: 0.1944
Epoch: 017/020 | Batch 200/383 | Cost: 0.4448
Epoch: 017/020 | Batch 220/383 | Cost: 0.2084
Epoch: 017/020 | Batch 240/383 | Cost: 0.1657
Epoch: 017/020 | Batch 260/383 | Cost: 0.3287
Epoch: 017/020 | Batch 280/383 | Cost: 0.3617
Epoch: 017/020 | Batch 300/383 | Cost: 0.3303
Epoch: 017/020 | Batch 320/383 | Cost: 0.2784
Epoch: 017/020 | Batch 340/383 | Cost: 0.2115
Epoch: 017/020 | Batch 360/383 | Cost: 0.2102
Epoch: 017/020 | Batch 380/383 | Cost: 0.3110
Epoch: 017/020 Train Acc.: 95.12% | Validation Acc.: 72.60%
Time elapsed: 704.38 min
Epoch: 018/020 | Batch 000/383 | Cost: 0.2541
Epoch: 018/020 | Batch 020/383 | Cost: 0.1302
Epoch: 018/020 | Batch 040/383 | Cost: 0.3168
Epoch: 018/020 | Batch 060/383 | Cost: 0.1540
Epoch: 018/020 | Batch 080/383 | Cost: 0.1390
Epoch: 018/020 | Batch 100/383 | Cost: 0.2234
Epoch: 018/020 | Batch 120/383 | Cost: 0.1487
Epoch: 018/020 | Batch 140/383 | Cost: 0.