# Train/ Valid/ Test

## Configurations

### Install and import necessary libraries

In [10]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from PIL import Image
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.metrics import confusion_matrix

import os
import random
import itertools
from datetime import datetime

import torch
from torch.utils.data import Dataset, DataLoader,SubsetRandomSampler
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter

import torchvision.models as models
from torchvision import transforms
from torchsummary import summary

#### GPU or not?

In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

### Hyperparameters

In [12]:
args = {}
args['DATA_DIR'] = 'data'
args['LABEL_DIR'] = 'label2.csv'
args['IMAGE_DIR'] = os.path.join(args['DATA_DIR'], 'DaanForestPark')

args['NUM_WORKERS'] = 4
args['EPOCHES'] = 50
args['BATCH_SIZE'] = 32
args['PATIENCE'] = 5
args['VALID_RATIO'] = .2
args['LR'] = 1e-2
args['MIN_LR'] = 1e-5
args['L1_ratio'] = 1e-4
args['L2_ratio'] = 1e-3
args['CLIPPING'] = .9
args['W_DECAY'] = .9

In [13]:
# ToPILImage() -> Resize() -> ToTensor()
transform = transforms.Compose([
        transforms.Resize(256),
        transforms.RandomCrop(224),
#         transforms.CenterCrop(224),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.RandomVerticalFlip(p=0.3),
#         transforms.ColorJitter(),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])        
        ])

### Customized DataSet and Official DataLoader 

In [14]:

class MyDataset(Dataset):
    def __init__(self, image_dir, label_dir, transform=None):
        _images, _labels = [], []
        # total amount of dataset 
        _number = 0
        # Reading the categorical file
        label_df = pd.read_csv(label_dir)
        
        # Iterate all files including .jpg inages  
        for subdir, dirs, files in tqdm(os.walk(image_dir)):
            for filename in files:
                corr_label = label_df[label_df['dirpath']==subdir[len(args['DATA_DIR'])+1:]]['label'].values
                if corr_label.size!= 0 and filename.endswith(('jpg')):
                    _images.append(subdir + os.sep + filename)
                    _labels.append(corr_label)
                    _number+=1
        
        # Randomly arrange data pairs
        mapIndexPosition = list(zip(_images, _labels))
        random.shuffle(mapIndexPosition)
        _images, _labels = zip(*mapIndexPosition)
        
        self._image = iter(_images)
        self._labels = iter(_labels)
        self._number = _number
        self._category = label_df['label'].nunique()
        self._corr_name2id = {label_df[label_df['label'] == n]['target'].unique()[0]:n for n in label_df['label'].unique()}
        self._corr_id2name = {v:k for k,v in self._corr_name2id.items()}
        self.transform = transform
        
    def __len__(self):
        return self._number

    def __getitem__(self, index):    
        img = next(self._image)
        lab = next(self._labels)
        
        img = self._loadimage(img)
        if self.transform:
            img = self.transform(img)        
        return img, lab
     
    def _categorical(self, label):
        return np.arange(self._category) == label[:,None]
    
    def _loadimage(self, file):
        return Image.open(file).convert('RGB')
    
    def get_categorical_nums(self):
        return self._category

    def get_name2id_dict(self):
        return self._corr_name2id
    
    def get_id2name_dict(self):
        return self._corr_id2name    

In [15]:
train_dataset = MyDataset(args['IMAGE_DIR'], args['LABEL_DIR'], transform=transform)

valid_size = args['VALID_RATIO']
num_train = len(train_dataset)
indices = list(range(num_train))
split = int(np.floor(valid_size * num_train))
train_idx, valid_idx = indices[split:], indices[:split]
train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

train_loader = DataLoader(dataset=train_dataset, batch_size=args['BATCH_SIZE'], num_workers=args['NUM_WORKERS'], sampler=train_sampler, drop_last=True)
valid_loader = DataLoader(dataset=train_dataset, batch_size=args['BATCH_SIZE'], num_workers=args['NUM_WORKERS'], sampler=valid_sampler, drop_last=True)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [16]:
# def init_weights(m):
#     if type(m) == nn.Conv2d:
#         nn.init.xavier_uniform(m.weight)
#         m.bias.data.fill_(0.01)
#     if type(m) == nn.Linear:
#         nn.init.uniform_(m.weight)
#         m.bias.data.fill_(0.01)   
# class SimpleCNN(nn.Module):
    
#     def __init__(self, target):
#         super(SimpleCNN, self).__init__()
#         # Input size (3, 1136, 640)
# #         self.imgzipper  = nn.AvgPool2d(kernel_size=ZIPSIZE)
#         # Input size (3, 284, 160)
#         self.conv1 = nn.Sequential(
#             nn.Conv2d(in_channels=CHANNEL_NUMS,
#                         out_channels=FILTER_NUMS,
#                         kernel_size=KERNEL_SIZE,
#                         stride=STRIDE,
#                         padding=(KERNEL_SIZE-STRIDE)//2 # padding=(kernel_size-stride)/2 -> original size
#                     ),
#             nn.Dropout(0.5),
#             nn.ReLU(),
#             # (8, 1136, 640)
#             nn.MaxPool2d(kernel_size=KERNEL_SIZE)
#             # (8, 87, 49)
#             # zipper (8, 21, 12)
#         ).apply(init_weights)
        
#         # (8, 87, 49)
#         self.conv2 = nn.Sequential(
#             nn.Conv2d(in_channels=FILTER_NUMS,
#                         out_channels=FILTER_NUMS2,
#                         kernel_size=KERNEL_SIZE,
#                         stride=STRIDE,
#                         padding=(KERNEL_SIZE-STRIDE)//2 # padding=(kernel_size-stride)/2 -> original size
#                     ),
#             nn.Dropout(0.5),
#             nn.ReLU(),
#             # (16, 87, 49)
#             nn.MaxPool2d(kernel_size=5)
#             # (16, 6, 3)
#         ).apply(init_weights)
#         self.MLP = nn.Sequential(
#             nn.Linear(128, 81),
#             nn.ReLU(),
#             nn.Linear(81, 81),
#             nn.ReLU(),
#             nn.Linear(81, target)
#         ).apply(init_weights)
#     def forward(self, x):
#         x = self.imgzipper(x)
#         x = self.conv1(x)
#         x = self.conv2(x)
#         x = x.view(x.size(0), -1)
#         x = self.MLP(x)
#         return x

### Model

In [17]:
# model = SimpleCNN(train_dataset.get_categorical_nums()).to(device)
model = models.shufflenet_v2_x1_0(pretrained=True)
# model = models.shufflenet_v2_x1_0(pretrained=True)
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, train_dataset.get_categorical_nums())

#### Network Summary

In [18]:
# channels, H, W
model = model.to(device=device)
summary(model, input_size=(3, 224, 224))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 24, 112, 112]             648
       BatchNorm2d-2         [-1, 24, 112, 112]              48
              ReLU-3         [-1, 24, 112, 112]               0
         MaxPool2d-4           [-1, 24, 56, 56]               0
            Conv2d-5           [-1, 24, 28, 28]             216
       BatchNorm2d-6           [-1, 24, 28, 28]              48
            Conv2d-7           [-1, 58, 28, 28]           1,392
       BatchNorm2d-8           [-1, 58, 28, 28]             116
              ReLU-9           [-1, 58, 28, 28]               0
           Conv2d-10           [-1, 58, 56, 56]           1,392
      BatchNorm2d-11           [-1, 58, 56, 56]             116
             ReLU-12           [-1, 58, 56, 56]               0
           Conv2d-13           [-1, 58, 28, 28]             522
      BatchNorm2d-14           [-1, 58,

### Dynamically tunning Learning Rate

In [10]:
""" 根據epoch調整 lr """
def adjust_learning_rate(optimizer, epoch):
    """Sets the learning rate to the initial LR decayed by 10 every 30 epochs"""
    lr = args['LR'] * (0.1 ** (epoch // 30))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
        
def get_lr(optimizer):
    for param_group in optimizer.param_groups:
        return param_group['lr']

In [11]:
""" 分層設定 lr """
# large_lr_layers = list(map(id,model.fc.parameters()))
# small_lr_layers = filter(lambda p:id(p) not in large_lr_layers,model.parameters())
# optimizer = torch.optim.SGD([
#             {"params":large_lr_layers},
#             {"params":small_lr_layers,"lr":1e-4}
#             ],lr = 1e-2,momenum=0.9)

' 分層設定 lr '

### Saving a checkpoint 

In [12]:
def save_checkpoint(model, state, filename, ckptname='resNet.ckpt'):
    if not os.path.isdir(filename):
        try: 
            os.mkdir(filename) 
            print('Create {}'.format(filename))
        except OSError as err: 
            raise err

    _ckpt = os.path.join(filename, ckptname)
    torch.save(state, _ckpt)
    print('Saving the {} in {}'.format(ckptname, filename))

    """ If you wwanna save the whole model, Uncomment below """
#     _model = os.path.join(filename, 'model_best.ckpt')
#     torch.save(model, _model)

In [13]:
def vis_confusion(writer, step, matrix, class_dict):
    """
    Visualization of confusion matrix

    Parameters:
        writer (tensorboard.SummaryWriter): TensorBoard SummaryWriter instance.
        step (int): Counter usually specifying steps/epochs/time.
        matrix (numpy.array): Square-shaped array of size class x class.
            Should specify cross-class accuracies/confusion in percent
            values (range 0-1).
        class_dict (dict): Dictionary specifying class names as keys and
            corresponding integer labels/targets as values.
    """
    all_categories = sorted(class_dict, key=class_dict.get)

    # Normalize by dividing every row by its sum
    matrix = matrix.astype(float)
    for i in range(len(class_dict)):
        matrix[i] = matrix[i] / matrix[i].sum()

    # Create the figure
    fig = plt.figure(figsize=(15,15))
    ax = fig.add_subplot(111)

    # Show the matrix and define a discretized color bar
    cax = ax.matshow(matrix, cmap=plt.cm.get_cmap('Oranges'))
    fig.colorbar(cax)

    # Set up axes. Rotate the x ticks by 90 degrees.
    ax.set_xticklabels([''] + all_categories, rotation=90)
    ax.set_yticklabels([''] + all_categories)
    
    # Force label at every tick
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    thresh = matrix.max() / 2
    for i, j in itertools.product(range(matrix.shape[0]), range(matrix.shape[1])):
        plt.text(j, i, matrix[i, j],
             horizontalalignment="center",
             color="white" if matrix[i, j] > thresh else "black")
    
    
    # Turn off the grid for this plot. Enforce a tight layout to reduce white margins
    ax.grid(False)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

    # Call our auxiliary to TensorBoard function to render the figure 
    plot_to_tensorboard(writer, fig, step)
    
    
def plot_to_tensorboard(writer, fig, step):
    """
    Takes a matplotlib figure handle and converts it using
    canvas and string-casts to a numpy array that can be
    visualized in TensorBoard using the add_image function

    Parameters:
        writer (tensorboard.SummaryWriter): TensorBoard SummaryWriter instance.
        fig (matplotlib.pyplot.fig): Matplotlib figure handle.
        step (int): counter usually specifying steps/epochs/time.
    """

    # Draw figure on canvas
    fig.canvas.draw()

    # Convert the figure to numpy array, read the pixel values and reshape the array
    img = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='')
    img = img.reshape(fig.canvas.get_width_height()[::-1] + (3,))
    # Normalize into 0-1 range for TensorBoard(X). Swap axes for newer versions where API expects colors in first dim
    img = img / 255.0
    img = np.transpose(img, (2,0,1))
    # img = np.swapaxes(img, 0, 2) # if your TensorFlow + TensorBoard version are >= 1.8
    # Add figure in numpy "image" to TensorBoard writer
    writer.add_image('valid/confusion_matrix', img, step)
    plt.close(fig)

### Training Phase

In [19]:
optimizer = torch.optim.Adam(model.parameters(), lr=args['LR'])   # optimize all cnn parameters
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', 
                                                       factor=args['W_DECAY'], 
                                                       patience=0, 
                                                       min_lr=args['MIN_LR'], 
                                                       verbose=True)
criterion = nn.CrossEntropyLoss().to(device=device)

# early stopping
min_val_loss = np.Inf
patience = args['PATIENCE']
global_step = 1
# build the writer file
write_file = 'runs/experiment_{}'.format(datetime.now().strftime('%f'))
writer = SummaryWriter(write_file)
# define the ckpt path
writer_ckpt_path = os.path.join(write_file, 'ckpt')

model.train()

for epoch in range(args['EPOCHES']):
    for i, (img_batch, label_batch) in tqdm(enumerate(train_loader)):    
        optimizer.zero_grad()      
        img_batch = img_batch.to(device=device)
        label_batch = label_batch.to(device=device)  
        output = model(img_batch)
        loss = criterion(output, label_batch.squeeze())
        
        # l2 Regularization loss
        l1_regularization = 0
        l2_regularization = 0
        for p in model.parameters():
            l1_regularization += torch.norm(p, 1)
            l2_regularization += torch.norm(p, 2)
        loss = loss + (args['L2_ratio'] * l2_regularization)

        loss.backward()
#         clip the grandient value for avoiding explosion
        nn.utils.clip_grad_norm_(model.parameters(), args['CLIPPING']) 
        optimizer.step()

        # Compute accuracy
        _, predicted = torch.max(output.cpu().data, 1)
        accuracy = torch.sum(predicted == label_batch.cpu().data.view(-1), dtype=torch.float32) / args['BATCH_SIZE']
        
        # Write tensorboard
        writer.add_scalar('train/Accuracy', accuracy.item(), global_step)
        writer.add_scalar('train/Loss', loss.item(), global_step)
        writer.add_scalar('train/L1RegLoss', l1_regularization.item(), global_step)
        writer.add_scalar('train/L2RegLoss', l2_regularization.item(), global_step)
        writer.add_scalar('train/LR', get_lr(optimizer), global_step)
        
        global_step += 1
        
        if i % 50== 0:
            print('epoch {}, step {}, \
            total_loss={:.3f}, \
            accuracy={:.3f}'.format(epoch+1, i, loss.item(), accuracy.item()))
    
    
    print('--- Validation phase ---')
    eval_loss = 0
    eval_acc = 0
    # classifier recored
    val_actual_record = {i:0 for i in range(train_dataset.get_categorical_nums())}
    val_pred_record = val_actual_record.copy()
    true_positive_record = val_actual_record.copy()
    
    with torch.no_grad():
        numpy_label_batch, numpy_predicted = None, None
        for i, (img_batch, label_batch) in enumerate(valid_loader):
            output = model(img_batch.to(device))
            _, predicted = torch.max(output.cpu().data, 1)
            loss = criterion(output, label_batch.to(device).squeeze())
            accuracy = torch.sum(predicted == label_batch.data.view(-1), dtype=torch.float32) / args['BATCH_SIZE']
            eval_loss += loss.item()
            eval_acc += accuracy.item()
            # Write to tensorboard
#             for cat in range(train_dataset.get_categorical_nums()):
#                 writer.add_pr_curve('valid/pr_curve_{}'.format(cat), (label_batch == cat).int().squeeze(),
#                                     (predicted == cat).int().squeeze(), epoch*len(valid_loader)+i)
#             writer.add_images('valid/image_batch', img_batch, epoch*len(valid_loader)+i)
            
            # compute confusion matrix
            numpy_label_batch = label_batch.squeeze().data.numpy()
            numpy_predicted  = predicted.data.numpy()
            
            # compute precision and recall
            for i, j in zip(numpy_label_batch, numpy_predicted):
                if i == j:
                    true_positive_record[i]+=1
                val_actual_record[i]+=1
                val_pred_record[i]+=1
            
            writer.add_histogram('valid/actual', label_batch, epoch*len(valid_loader)+i)
            writer.add_histogram('valid/pred', predicted, epoch*len(valid_loader)+i)
            writer.add_scalar('valid/Accuracy', accuracy.item(), epoch*len(valid_loader)+i)
            writer.add_scalar('valid/Loss', loss.item(), epoch*len(valid_loader)+i)
        
        print("confusion matrix drawing")
        cm = confusion_matrix(numpy_label_batch, numpy_predicted)
        batch_dict = { i:train_dataset.get_id2name_dict()[i] 
                 for i in np.unique(np.concatenate([numpy_label_batch, numpy_predicted])).astype(int)}
        # write confusion matrix to tensorboard 
        vis_confusion(writer, epoch*len(valid_loader)+i, cm, batch_dict)
    
    
    precision = {}
    recall = {}
    for k in true_positive_record.keys():
        precision[train_dataset.get_id2name_dict()[k]] = true_positive_record[k] / (val_pred_record[k] + 0.1)
        recall[train_dataset.get_id2name_dict()[k]] = true_positive_record[k] / (val_actual_record[k] + 0.1)
    writer.add_scalars('precision', precision, epoch+1)
    writer.add_scalars('recall', recall, epoch+1)
    
    eval_loss = eval_loss / len(valid_loader)
    eval_acc = eval_acc / len(valid_loader)

    scheduler.step(eval_loss)
    
    writer.add_hparams(args, {'hparam/epoch': epoch+1,
                              'hparam/lr_':get_lr(optimizer),
                              'hparam/eval_loss':eval_loss,
                              'hparam/accuracy':eval_acc})
    print('epoch {}, val_loss={:.3f}'.format(epoch+1, eval_loss))

    ## Early Stopping
    if eval_loss < min_val_loss:
        save_checkpoint(model, 
            {
            'epoch': epoch+1,
            'state_dict': model.state_dict(),
            'best_loss': eval_loss,
            'optimizer' :optimizer.state_dict(),
            }, writer_ckpt_path)
        min_val_loss = eval_loss
    else:
        patience-=1
    
    if patience == 0:
        print('Early stopping')
        break
         
writer.flush()
writer.close()
print('Finish all training !')

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

epoch 1, step 0,             total_loss=1.019,             accuracy=0.938
epoch 1, step 50,             total_loss=0.689,             accuracy=0.969
epoch 1, step 100,             total_loss=0.866,             accuracy=0.906
epoch 1, step 150,             total_loss=1.051,             accuracy=0.875
epoch 1, step 200,             total_loss=0.876,             accuracy=0.906
epoch 1, step 250,             total_loss=0.723,             accuracy=0.969

--- Validation phase ---
confusion matrix drawing




epoch 1, val_loss=0.512
Create runs/experiment_836147/ckpt
Saving the resNet.ckpt in runs/experiment_836147/ckpt


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

epoch 2, step 0,             total_loss=1.071,             accuracy=0.875
epoch 2, step 50,             total_loss=0.738,             accuracy=0.938
epoch 2, step 100,             total_loss=0.926,             accuracy=0.875
epoch 2, step 150,             total_loss=0.863,             accuracy=0.906
epoch 2, step 200,             total_loss=1.218,             accuracy=0.781
epoch 2, step 250,             total_loss=0.967,             accuracy=0.906

--- Validation phase ---
confusion matrix drawing
Epoch     2: reducing learning rate of group 0 to 9.0000e-03.
epoch 2, val_loss=0.514


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

epoch 3, step 0,             total_loss=0.892,             accuracy=0.875
epoch 3, step 50,             total_loss=0.663,             accuracy=1.000
epoch 3, step 100,             total_loss=0.805,             accuracy=0.906
epoch 3, step 150,             total_loss=1.019,             accuracy=0.875
epoch 3, step 200,             total_loss=0.917,             accuracy=0.875
epoch 3, step 250,             total_loss=0.678,             accuracy=0.969

--- Validation phase ---
confusion matrix drawing
epoch 3, val_loss=0.327
Saving the resNet.ckpt in runs/experiment_836147/ckpt


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

epoch 4, step 0,             total_loss=0.816,             accuracy=0.938
epoch 4, step 50,             total_loss=0.698,             accuracy=0.969
epoch 4, step 100,             total_loss=0.681,             accuracy=1.000
epoch 4, step 150,             total_loss=0.727,             accuracy=0.969
epoch 4, step 200,             total_loss=0.724,             accuracy=0.938
epoch 4, step 250,             total_loss=0.704,             accuracy=0.938

--- Validation phase ---
confusion matrix drawing
Epoch     4: reducing learning rate of group 0 to 8.1000e-03.
epoch 4, val_loss=0.348


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

epoch 5, step 0,             total_loss=1.151,             accuracy=0.812
epoch 5, step 50,             total_loss=0.742,             accuracy=0.938
epoch 5, step 100,             total_loss=0.792,             accuracy=0.938
epoch 5, step 150,             total_loss=0.747,             accuracy=0.969
epoch 5, step 200,             total_loss=0.700,             accuracy=0.906
epoch 5, step 250,             total_loss=0.641,             accuracy=0.969

--- Validation phase ---
confusion matrix drawing
epoch 5, val_loss=0.261
Saving the resNet.ckpt in runs/experiment_836147/ckpt


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

epoch 6, step 0,             total_loss=0.710,             accuracy=0.906
epoch 6, step 50,             total_loss=0.657,             accuracy=0.969
epoch 6, step 100,             total_loss=0.629,             accuracy=0.938
epoch 6, step 150,             total_loss=0.737,             accuracy=0.906
epoch 6, step 200,             total_loss=0.858,             accuracy=0.906
epoch 6, step 250,             total_loss=0.723,             accuracy=0.906

--- Validation phase ---
confusion matrix drawing
Epoch     6: reducing learning rate of group 0 to 7.2900e-03.
epoch 6, val_loss=0.363


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

epoch 7, step 0,             total_loss=0.765,             accuracy=0.906
epoch 7, step 50,             total_loss=0.725,             accuracy=0.906
epoch 7, step 100,             total_loss=0.606,             accuracy=1.000
epoch 7, step 150,             total_loss=0.687,             accuracy=0.906
epoch 7, step 200,             total_loss=0.634,             accuracy=0.969
epoch 7, step 250,             total_loss=0.576,             accuracy=0.969

--- Validation phase ---
confusion matrix drawing
epoch 7, val_loss=0.197
Saving the resNet.ckpt in runs/experiment_836147/ckpt


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

epoch 8, step 0,             total_loss=0.665,             accuracy=0.969
epoch 8, step 50,             total_loss=0.630,             accuracy=0.969
epoch 8, step 100,             total_loss=0.607,             accuracy=0.969
epoch 8, step 150,             total_loss=0.700,             accuracy=0.969
epoch 8, step 200,             total_loss=0.772,             accuracy=0.875
epoch 8, step 250,             total_loss=0.588,             accuracy=1.000

--- Validation phase ---
confusion matrix drawing
Epoch     8: reducing learning rate of group 0 to 6.5610e-03.
epoch 8, val_loss=0.361


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

epoch 9, step 0,             total_loss=0.624,             accuracy=0.969
epoch 9, step 50,             total_loss=0.769,             accuracy=0.875
epoch 9, step 100,             total_loss=0.559,             accuracy=1.000
epoch 9, step 150,             total_loss=0.780,             accuracy=0.844
epoch 9, step 200,             total_loss=0.642,             accuracy=0.969
epoch 9, step 250,             total_loss=0.635,             accuracy=0.938

--- Validation phase ---
confusion matrix drawing
Epoch     9: reducing learning rate of group 0 to 5.9049e-03.
epoch 9, val_loss=0.261
Early stopping
Finish all training !


### Validation Phase

In [16]:
# CKPT_PATH = 'runs/experiment_837515/ckpt/resNet.ckpt'
# model.load_state_dict(torch.load(CKPT_PATH)['state_dict'])
model.eval()
acc = 0
for i, (img_batch, label_batch) in enumerate(valid_loader):
    output = model(img_batch.to(device))
    _, predicted = torch.max(output.cpu().data, 1)
    accuracy = torch.sum(predicted == label_batch.data.view(-1), dtype=torch.float32) / args['BATCH_SIZE']
    acc += accuracy
print('accuracy={}'.format(acc/len(valid_loader)))

accuracy=0.9116848111152649


### Save the whole model consisting in architecture

In [None]:
# torch.save(model, 'arch_w_dict.ckpt')

In [None]:
# validate
# model2 = torch.load('arch_w_dict.ckpt')
# model2

### Release CUDA cache memory

In [10]:
# torch.cuda.empty_cache()

### From dict to the whole model

In [None]:
path = 'kaggle/runs/experiment_505653/ckpt/resNet.ckpt'
weights = torch.load(path)['state_dict']
model.load_state_dict(weights)

# torch.save(model, 'best_model96.pt')
# torch.save(model, 'best_model96.pth')