<h2>Training the Model</h2>

In [1]:
# Restrict the process to one GPU. Both variables are set before TensorFlow is
# imported below, so keep these three lines at the very top of the cell.
# NOTE(review): the recorded output of this cell lists only CPU devices, so
# confirm that GPU index "2" actually exists on the machine being used.
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="2"


# Print the devices TensorFlow can see as a quick sanity check.
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())
import shutil
import numpy as np
import tensorflow as tf
# TF1-compat session configured with allow_growth, created as a side effect of
# running this cell.
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession
config=ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)

# Project-local YOLO helpers.
from input_log_utils_others.yolov3.dataset import Dataset
from input_log_utils_others.yolov3.yolov4 import Create_Yolo, compute_loss
from input_log_utils_others.yolov3.utils import load_yolo_weights
# Star import: presumably the source of the upper-case settings (YOLO_TYPE,
# TRAIN_*, YOLO_*) that the rest of the notebook uses without defining them.
from input_log_utils_others.yolov3.configs import *
from input_log_utils_others.evaluate_mAP import get_mAP

# Choose the Darknet weight file that matches the configured architecture
# (YOLO_TYPE) and model size (TRAIN_YOLO_TINY). Darknet_weights stays undefined
# when YOLO_TYPE is neither "yolov4" nor "yolov3".
if YOLO_TYPE == "yolov4":
    if TRAIN_YOLO_TINY:
        Darknet_weights = YOLO_V4_TINY_WEIGHTS
    else:
        Darknet_weights = YOLO_V4_WEIGHTS
elif YOLO_TYPE == "yolov3":
    if TRAIN_YOLO_TINY:
        Darknet_weights = YOLO_V3_TINY_WEIGHTS
    else:
        Darknet_weights = YOLO_V3_WEIGHTS

# Tiny models get a "_Tiny" suffix on the model name used for checkpoints.
if TRAIN_YOLO_TINY:
    TRAIN_MODEL_NAME = TRAIN_MODEL_NAME + "_Tiny"

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 1449011547201874508
, name: "/device:XLA_CPU:0"
device_type: "XLA_CPU"
memory_limit: 17179869184
locality {
}
incarnation: 14523623013445680984
physical_device_desc: "device: XLA_CPU device"
]


In [2]:
# Ask TensorFlow to grow GPU memory on demand on every visible GPU.
# The setting is applied to all devices, not just the first one, because
# TensorFlow requires memory growth to be configured identically across GPUs.
gpus = tf.config.experimental.list_physical_devices('GPU')
print(f'GPUs {gpus}')
for gpu in gpus:
    try:
        tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as err:
        # Best-effort: report the failure and continue rather than hiding it.
        print(f'Could not enable memory growth on {gpu}: {err}')

# Every run starts with an empty TensorBoard log directory; any existing
# directory at TRAIN_LOGDIR is deleted first.
if os.path.exists(TRAIN_LOGDIR):
    shutil.rmtree(TRAIN_LOGDIR)
writer = tf.summary.create_file_writer(TRAIN_LOGDIR)

GPUs []


In [3]:
# Dataset objects for both splits; they are iterated as (image_data, target)
# pairs by the training loop.
trainset = Dataset("train")
testset = Dataset("test")

# Step bookkeeping consumed by the learning-rate schedule in train_step.
# len(trainset) is taken as the number of steps in one epoch.
steps_per_epoch = len(trainset)
total_steps = steps_per_epoch * TRAIN_EPOCHS
warmup_steps = steps_per_epoch * TRAIN_WARMUP_EPOCHS
global_steps = tf.Variable(initial_value=1, dtype=tf.int64, trainable=False)

# When transfer learning is enabled, build a reference model with the
# YOLO_COCO_CLASSES class list and fill it from the Darknet weight file; its
# layer weights are copied into `yolo` in a later cell.
if TRAIN_TRANSFER:
    Darknet = Create_Yolo(CLASSES=YOLO_COCO_CLASSES, input_size=YOLO_INPUT_SIZE)
    load_yolo_weights(Darknet, Darknet_weights)

# The model that is actually trained, built for TRAIN_CLASSES.
yolo = Create_Yolo(CLASSES=TRAIN_CLASSES, input_size=YOLO_INPUT_SIZE, training=True)

annotations #:  25


KeyError: '.\\frames\\3.png does not exist ... '

In [None]:
# Optionally resume from a previous checkpoint. If loading raises ValueError,
# fall back to the Darknet transfer path below by clearing the flag.
if TRAIN_FROM_CHECKPOINT:
    try:
        # Same path construction as the save_weights calls in the training loop.
        yolo.load_weights(os.path.join(TRAIN_CHECKPOINTS_FOLDER, TRAIN_MODEL_NAME))
    except ValueError:
        print("Shapes are incompatible, transfering from Darknet weights")
        TRAIN_FROM_CHECKPOINT = False

# Transfer learning: copy weights layer by layer, by position, from the Darknet
# reference model into the model being trained.
if TRAIN_TRANSFER and not TRAIN_FROM_CHECKPOINT:
    for i, l in enumerate(Darknet.layers):
        layer_weights = l.get_weights()
        if not layer_weights:
            # Layer has no weights to transfer.
            continue
        try:
            yolo.layers[i].set_weights(layer_weights)
        except ValueError:
            # Layers whose weights do not fit are skipped and keep their
            # initial values. Only ValueError is caught, so interrupts and
            # unrelated bugs are no longer silently swallowed.
            print("skipping", yolo.layers[i].name)

optimizer = tf.keras.optimizers.Adam()

In [5]:
def train_step(image_data, target):
    """Run one optimisation step on a single batch and log it to TensorBoard.

    Uses the notebook-level `yolo`, `optimizer`, `global_steps`, `writer`,
    `warmup_steps` and `total_steps`.

    Args:
        image_data: Model input for one batch, passed straight to `yolo`.
        target: Indexable per-output-branch targets; `target[i]` is unpacked
            into `compute_loss` for branch `i`.

    Returns:
        Tuple `(global_step, learning_rate, giou_loss, conf_loss, prob_loss,
        total_loss)` with every entry converted via `.numpy()`.
    """
    # Only the forward pass and the loss need to be recorded on the tape.
    with tf.GradientTape() as tape:
        pred_result = yolo(image_data, training=True)
        giou_loss = conf_loss = prob_loss = 0

        # pred_result interleaves (conv, pred) per output branch:
        # two branches for the tiny model, three otherwise.
        grid = 3 if not TRAIN_YOLO_TINY else 2
        for i in range(grid):
            conv, pred = pred_result[i*2], pred_result[i*2+1]
            loss_items = compute_loss(pred, conv, *target[i], i, CLASSES=TRAIN_CLASSES)
            giou_loss += loss_items[0]
            conf_loss += loss_items[1]
            prob_loss += loss_items[2]
        total_loss = giou_loss + conf_loss + prob_loss

    # Gradient computation and everything after it run outside the tape context,
    # so the tape does not keep recording ops while gradients are computed,
    # applied and logged.
    gradients = tape.gradient(total_loss, yolo.trainable_variables)
    optimizer.apply_gradients(zip(gradients, yolo.trainable_variables))

    # Learning-rate schedule: linear warm-up to TRAIN_LR_INIT over warmup_steps,
    # then cosine decay towards TRAIN_LR_END until total_steps.
    global_steps.assign_add(1)
    if global_steps < warmup_steps:
        lr = global_steps / warmup_steps * TRAIN_LR_INIT
    else:
        lr = TRAIN_LR_END + 0.5 * (TRAIN_LR_INIT - TRAIN_LR_END) * ((1 + tf.cos((global_steps - warmup_steps) / (total_steps - warmup_steps) * (np.pi))))

    # `learning_rate` is the canonical attribute name; `lr` is only an alias.
    optimizer.learning_rate.assign(lr.numpy())

    with writer.as_default():
        tf.summary.scalar("lr", optimizer.learning_rate, step=global_steps)
        tf.summary.scalar("loss/total_loss", total_loss, step=global_steps)
        tf.summary.scalar("loss/giou_loss", giou_loss, step=global_steps)
        tf.summary.scalar("loss/conf_loss", conf_loss, step=global_steps)
        tf.summary.scalar("loss/prob_loss", prob_loss, step=global_steps)
    writer.flush()

    return global_steps.numpy(), optimizer.learning_rate.numpy(), giou_loss.numpy(), conf_loss.numpy(), prob_loss.numpy(), total_loss.numpy()

In [6]:
# Summary writer for the validation scalars; it writes into the same
# TRAIN_LOGDIR as the training writer.
validate_writer = tf.summary.create_file_writer(TRAIN_LOGDIR)
def validate_step(image_data, target):
    """Compute the losses for one validation batch without updating the model.

    No gradients are needed here, so the forward pass runs without a
    GradientTape.

    Args:
        image_data: Model input for one batch, passed straight to `yolo`.
        target: Indexable per-output-branch targets; `target[i]` is unpacked
            into `compute_loss` for branch `i`.

    Returns:
        Tuple `(giou_loss, conf_loss, prob_loss, total_loss)` with every entry
        converted via `.numpy()`.
    """
    pred_result = yolo(image_data, training=False)
    giou_loss = conf_loss = prob_loss = 0

    # pred_result interleaves (conv, pred) per output branch:
    # two branches for the tiny model, three otherwise.
    grid = 3 if not TRAIN_YOLO_TINY else 2
    for i in range(grid):
        conv, pred = pred_result[i*2], pred_result[i*2+1]
        loss_items = compute_loss(pred, conv, *target[i], i, CLASSES=TRAIN_CLASSES)
        giou_loss += loss_items[0]
        conf_loss += loss_items[1]
        prob_loss += loss_items[2]
    total_loss = giou_loss + conf_loss + prob_loss
    return giou_loss.numpy(), conf_loss.numpy(), prob_loss.numpy(), total_loss.numpy()

In [7]:
# Second model instance built for TRAIN_CLASSES without training=True.
# NOTE(review): presumably intended for mAP evaluation (get_mAP is imported);
# it is not referenced by the cells visible here.
mAP_model = Create_Yolo(input_size=YOLO_INPUT_SIZE, CLASSES=TRAIN_CLASSES)

# Lowest mean validation loss seen so far. Starting from infinity guarantees
# that the first evaluated epoch always counts as an improvement, however
# large its loss is.
best_val_loss = float("inf")

In [8]:
# Main loop: one training pass and one validation pass per epoch, followed by
# checkpointing according to TRAIN_SAVE_CHECKPOINT / TRAIN_SAVE_BEST_ONLY.
for epoch in range(TRAIN_EPOCHS):
    # --- training pass ---
    for image_data, target in trainset:
        results = train_step(image_data, target)
        cur_step = results[0]%steps_per_epoch
        print("epoch:{:2.0f} step:{:5.0f}/{}, lr:{:.6f}, giou_loss:{:7.2f}, conf_loss:{:7.2f}, prob_loss:{:7.2f}, total_loss:{:7.2f}".format(epoch, cur_step, steps_per_epoch, results[1], results[2], results[3], results[4], results[5]))

    # Without validation data there is nothing to compare, so just save the
    # current weights and move on to the next epoch.
    if len(testset) == 0:
        print("configure TEST options to validate model")
        # Uses os.path.join and TRAIN_CHECKPOINTS_FOLDER, the same call and
        # constant as every other save/load in the notebook.
        yolo.save_weights(os.path.join(TRAIN_CHECKPOINTS_FOLDER, TRAIN_MODEL_NAME))
        continue

    # --- validation pass: accumulate the per-batch losses ---
    count, giou_val, conf_val, prob_val, total_val = 0.,0,0,0,0
    for image_data, target in testset:
        results = validate_step(image_data, target)
        count += 1
        giou_val += results[0]
        conf_val += results[1]
        prob_val += results[2]
        total_val+= results[3]
        print("total_val", total_val)

    # Means over the validation batches; count > 0 because testset is non-empty.
    mean_giou_val = giou_val/count
    mean_conf_val = conf_val/count
    mean_prob_val = prob_val/count
    mean_total_val = total_val/count

    with validate_writer.as_default():
        tf.summary.scalar("validate_loss/total_val", mean_total_val, step=epoch)
        tf.summary.scalar("validate_loss/giou_val", mean_giou_val, step=epoch)
        tf.summary.scalar("validate_loss/conf_val", mean_conf_val, step=epoch)
        tf.summary.scalar("validate_loss/prob_val", mean_prob_val, step=epoch)
    validate_writer.flush()
    print("\n\ngiou_val_loss:{:7.2f}, conf_val_loss:{:7.2f}, prob_val_loss:{:7.2f}, total_val_loss:{:7.2f}\n\n"
          .format(mean_giou_val, mean_conf_val, mean_prob_val, mean_total_val))

    # --- checkpointing ---
    if TRAIN_SAVE_CHECKPOINT and not TRAIN_SAVE_BEST_ONLY:
        # One checkpoint per epoch, tagged with its validation loss.
        # NOTE: the width-7 format pads values below 1000.00 with leading
        # spaces, so the resulting name can contain spaces.
        save_directory = os.path.join(TRAIN_CHECKPOINTS_FOLDER, TRAIN_MODEL_NAME+"_val_loss_{:7.2f}".format(mean_total_val))
        yolo.save_weights(save_directory)
    if TRAIN_SAVE_BEST_ONLY and best_val_loss>mean_total_val:
        # Overwrite the single checkpoint only when validation loss improved.
        save_directory = os.path.join(TRAIN_CHECKPOINTS_FOLDER, TRAIN_MODEL_NAME)
        yolo.save_weights(save_directory)
        best_val_loss = mean_total_val
    if not TRAIN_SAVE_BEST_ONLY and not TRAIN_SAVE_CHECKPOINT:
        # Neither option set: keep only the latest weights.
        save_directory = os.path.join(TRAIN_CHECKPOINTS_FOLDER, TRAIN_MODEL_NAME)
        yolo.save_weights(save_directory)

epoch: 0 step:    0/2, lr:0.000020, giou_loss:  51.49, conf_loss:1790.72, prob_loss:  10.62, total_loss:1852.83
epoch: 0 step:    1/2, lr:0.000030, giou_loss:  49.97, conf_loss:1483.07, prob_loss:   9.76, total_loss:1542.80
total_val 1740.6976318359375
total_val 3493.574462890625
total_val 5265.015625


giou_val_loss:  58.65, conf_val_loss:1684.39, prob_val_loss:  11.97, total_val_loss:1755.01


epoch: 1 step:    0/2, lr:0.000040, giou_loss:  56.97, conf_loss:1477.79, prob_loss:  11.13, total_loss:1545.89
epoch: 1 step:    1/2, lr:0.000050, giou_loss:  47.95, conf_loss:1468.97, prob_loss:   9.10, total_loss:1526.02
total_val 1753.56787109375
total_val 3502.33447265625
total_val 5203.497314453125


giou_val_loss:  58.61, conf_val_loss:1663.96, prob_val_loss:  11.93, total_val_loss:1734.50


epoch: 2 step:    0/2, lr:0.000060, giou_loss:  55.84, conf_loss:1456.79, prob_loss:  10.83, total_loss:1523.46
epoch: 2 step:    1/2, lr:0.000070, giou_loss:  49.79, conf_loss:1437.78, prob_loss:   

total_val 870.9093627929688
total_val 1685.7755737304688
total_val 2494.6201171875


giou_val_loss:  54.75, conf_val_loss: 767.69, prob_val_loss:   9.10, total_val_loss: 831.54


epoch:21 step:    0/2, lr:0.000084, giou_loss:  44.89, conf_loss: 685.83, prob_loss:   6.92, total_loss: 737.63
epoch:21 step:    1/2, lr:0.000083, giou_loss:  45.13, conf_loss: 672.84, prob_loss:   6.87, total_loss: 724.83
total_val 826.6541748046875
total_val 1612.9417114257812
total_val 2415.8468017578125


giou_val_loss:  54.37, conf_val_loss: 742.04, prob_val_loss:   8.88, total_val_loss: 805.28


epoch:22 step:    0/2, lr:0.000082, giou_loss:  41.02, conf_loss: 666.46, prob_loss:   5.30, total_loss: 712.78
epoch:22 step:    1/2, lr:0.000081, giou_loss:  42.41, conf_loss: 666.20, prob_loss:   6.52, total_loss: 715.13
total_val 794.3175048828125
total_val 1555.833251953125
total_val 2341.1673583984375


giou_val_loss:  53.98, conf_val_loss: 717.75, prob_val_loss:   8.66, total_val_loss: 780.39


epoch:23 s

epoch:41 step:    0/2, lr:0.000040, giou_loss:  41.98, conf_loss: 385.16, prob_loss:   5.18, total_loss: 432.32
epoch:41 step:    1/2, lr:0.000039, giou_loss:  36.59, conf_loss: 369.77, prob_loss:   4.29, total_loss: 410.64
total_val 525.1842651367188
total_val 1066.2894287109375
total_val 1606.546630859375


giou_val_loss:  49.57, conf_val_loss: 479.08, prob_val_loss:   6.87, total_val_loss: 535.52


epoch:42 step:    0/2, lr:0.000037, giou_loss:  41.06, conf_loss: 368.83, prob_loss:   4.75, total_loss: 414.64
epoch:42 step:    1/2, lr:0.000036, giou_loss:  40.44, conf_loss: 364.37, prob_loss:   4.78, total_loss: 409.59
total_val 532.4258422851562
total_val 1056.5205688476562
total_val 1586.8721923828125


giou_val_loss:  49.42, conf_val_loss: 472.73, prob_val_loss:   6.80, total_val_loss: 528.96


epoch:43 step:    0/2, lr:0.000035, giou_loss:  39.29, conf_loss: 368.76, prob_loss:   4.90, total_loss: 412.95
epoch:43 step:    1/2, lr:0.000034, giou_loss:  39.18, conf_loss: 358.20, pro

epoch:61 step:    1/2, lr:0.000004, giou_loss:  40.89, conf_loss: 320.93, prob_loss:   4.93, total_loss: 366.75
total_val 444.5010070800781
total_val 921.2498168945312
total_val 1408.1270751953125


giou_val_loss:  48.59, conf_val_loss: 414.47, prob_val_loss:   6.31, total_val_loss: 469.38


epoch:62 step:    0/2, lr:0.000004, giou_loss:  37.49, conf_loss: 316.67, prob_loss:   4.83, total_loss: 358.99
epoch:62 step:    1/2, lr:0.000003, giou_loss:  37.47, conf_loss: 312.46, prob_loss:   4.11, total_loss: 354.05
total_val 459.283447265625
total_val 919.3467407226562
total_val 1403.19140625


giou_val_loss:  48.56, conf_val_loss: 412.88, prob_val_loss:   6.29, total_val_loss: 467.73


epoch:63 step:    0/2, lr:0.000003, giou_loss:  41.93, conf_loss: 314.63, prob_loss:   4.98, total_loss: 361.54
epoch:63 step:    1/2, lr:0.000003, giou_loss:  42.68, conf_loss: 323.25, prob_loss:   5.47, total_loss: 371.40
total_val 468.7588195800781
total_val 950.5907897949219
total_val 1398.6309509277344