In [None]:
import tempfile
import os
import numpy as np
from sklearn.model_selection import KFold
from glob import iglob
import torch
import logging
import sys
import itertools
from shutil import rmtree

import torch
from torch.utils.data import DataLoader, ConcatDataset
from torch.optim.lr_scheduler import CosineAnnealingLR, MultiStepLR

from vision.utils.misc import str2bool, Timer, freeze_net_layers, store_labels
from vision.ssd.ssd import MatchPrior
from vision.ssd.vgg_ssd import create_vgg_ssd
from vision.ssd.mobilenetv1_ssd import create_mobilenetv1_ssd
from vision.ssd.mobilenetv1_ssd_lite import create_mobilenetv1_ssd_lite
from vision.ssd.mobilenet_v2_ssd_lite import create_mobilenetv2_ssd_lite
from vision.ssd.squeezenet_ssd_lite import create_squeezenet_ssd_lite
from vision.datasets.voc_dataset import VOCDataset
from vision.datasets.open_images import OpenImagesDataset
from vision.nn.multibox_loss import MultiboxLoss
from vision.ssd.config import vgg_ssd_config
from vision.ssd.config import mobilenetv1_ssd_config
from vision.ssd.config import squeezenet_ssd_config
from vision.ssd.data_preprocessing import TrainAugmentation, TestTransform

In [None]:
class Arguments:
    """Attribute bag holding the training settings read by ``fold_train``.

    Every option is a plain instance attribute initialised to a default in
    ``__init__`` and meant to be overridden by assignment.  The learning rate
    is stored as ``learning_rate`` and is also reachable as ``lr`` (the
    spelling the training code reads); both names always hold the same value.
    """

    def __init__(self):
        # Dataset selection.
        self.dataset_type = 'voc'
        self.datasets = None
        self.validation_dataset = None
        self.balance_data = False
        # Network selection and layer freezing.
        self.net = 'vgg16-ssd'
        self.freeze_base_net = False
        self.freeze_net = False
        self.mb2_width_mult = 1.0
        # Optimizer settings.
        self.learning_rate = 1e-3
        self.momentum = 0.9
        self.weight_decay = 5e-4
        self.gamma = 0.1
        self.base_net_lr = None
        self.extra_layers_lr = None
        # Weight initialisation sources.
        self.base_net = None
        self.pretrained_ssd = None
        self.resume = None
        # Learning-rate schedule; `milestones` is a comma-separated string.
        self.scheduler = 'multi-step'
        self.milestones = '80,100'
        self.t_max = 120
        # Training loop settings.
        self.batch_size = 32
        self.num_epochs = 120
        self.num_workers = 4
        self.validation_epochs = 5
        self.debug_steps = 100
        self.use_cuda = True
        self.checkpoint_folder = 'models/'

    @property
    def lr(self):
        """Alias of ``learning_rate``, so ``args.lr`` works on a fresh instance."""
        return self.learning_rate

    @lr.setter
    def lr(self, value):
        self.learning_rate = value

In [None]:
# Root folder of the source dataset; it is expected to contain JPEGImages/,
# Annotations/ and labels.txt.  Set the ORI_DATASET_DIR environment variable
# to point at another location without editing the notebook.
ori_dataset_dir = (
    os.environ.get('ORI_DATASET_DIR') or '/home/laps-100/Documentos/dataset'
)

In [None]:
# Experiment configuration: start from the defaults and override what this
# run changes.
args = Arguments()
args.net = 'mb1-ssd'
args.pretrained_ssd = 'models/mobilenet-v1-ssd-mp-0_675.pth'
args.batch_size = 24
args.num_epochs = 300
args.scheduler = 'cosine'
# Arguments declares `learning_rate` while the training code reads `lr`;
# set both so the two spellings never disagree.
args.learning_rate = 0.01
args.lr = args.learning_rate
args.base_net_lr = 0.001
args.t_max = 300

In [None]:
def _get_frames_keys(frames_folder):
    frames_template = os.path.join(frames_folder, '*.jpg')

    frames_keys = []

    for frame_path in iglob(frames_template):
        basename = os.path.basename(frame_path)
        frames_keys.append(os.path.splitext(basename)[0])

    np.random.shuffle(frames_keys)
    return frames_keys

In [None]:
def create_cross_dirs(dataset_dir, folds):
    """Build one dataset folder per cross-validation fold.

    A ``cross_validation`` folder is (re)created next to ``dataset_dir``.
    Each ``fold<i>`` sub-folder symlinks the original ``JPEGImages``,
    ``Annotations`` and ``labels.txt`` and gets its own
    ``ImageSets/Main/trainval.txt`` and ``ImageSets/Main/test.txt`` listing
    the image keys of that fold's train and test split.

    Args:
        dataset_dir: folder holding ``JPEGImages``, ``Annotations`` and
            ``labels.txt``.
        folds: number of splits handed to ``KFold``.

    Returns:
        The list of created fold folders, in fold order.
    """
    # Symlink targets are stored verbatim, so a relative path would be
    # resolved against the link's own folder and dangle.  Normalising also
    # drops a trailing slash, which would otherwise make dirname() return the
    # dataset folder itself instead of its parent.
    dataset_dir = os.path.abspath(dataset_dir)

    JPEGImages_dir = os.path.join(dataset_dir, 'JPEGImages')
    Annotations_dir = os.path.join(dataset_dir, 'Annotations')
    labels_file = os.path.join(dataset_dir, 'labels.txt')

    frames_keys = np.array(_get_frames_keys(JPEGImages_dir))

    # Start from a clean slate so folds from a previous run cannot leak in.
    cross_val_dir = os.path.join(os.path.dirname(dataset_dir), 'cross_validation')
    if os.path.exists(cross_val_dir):
        rmtree(cross_val_dir, ignore_errors=True)

    cross_dirs = []

    kf = KFold(n_splits=folds, shuffle=True)
    for i, (train_index, test_index) in enumerate(kf.split(frames_keys)):
        dir_name = os.path.join(cross_val_dir, f'fold{i}')
        os.makedirs(dir_name)
        cross_dirs.append(dir_name)

        # Share the images, annotations and label list through symlinks.
        os.symlink(
            JPEGImages_dir,
            os.path.join(dir_name, 'JPEGImages'),
            target_is_directory=True,
        )
        os.symlink(
            Annotations_dir,
            os.path.join(dir_name, 'Annotations'),
            target_is_directory=True,
        )
        os.symlink(labels_file, os.path.join(dir_name, 'labels.txt'))

        sets_dir = os.path.join(dir_name, 'ImageSets', 'Main')
        os.makedirs(sets_dir)

        # One image key per line in each image-set file.
        image_sets = (
            ('trainval.txt', frames_keys[train_index]),
            ('test.txt', frames_keys[test_index]),
        )
        for set_file_name, set_keys in image_sets:
            with open(os.path.join(sets_dir, set_file_name), 'w') as f:
                f.writelines(f'{img_key}\n' for img_key in set_keys)

    return cross_dirs

In [None]:
# Root logger: records go to stdout with a timestamped format, and the
# logger level ends up at DEBUG.
logger = logging.getLogger()
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    stream=sys.stdout,
)
logger.setLevel(logging.DEBUG)

In [None]:
def train(loader, net, criterion, optimizer, device, debug_steps=100, epoch=-1):
    """Run one training pass over ``loader`` and log running loss averages.

    Args:
        loader: iterable yielding ``(images, boxes, labels)`` batches.
        net: model called as ``net(images)``, returning
            ``(confidence, locations)``.
        criterion: callable returning ``(regression_loss, classification_loss)``
            for ``(confidence, locations, labels, boxes)``.
        optimizer: optimizer stepped once per batch.
        device: device every batch tensor is moved to.
        debug_steps: a log line is emitted whenever the batch index is a
            non-zero multiple of this value.
        epoch: epoch number, used only in the log message.
    """
    net.train(True)
    running_loss = 0.0
    running_regression_loss = 0.0
    running_classification_loss = 0.0
    # Batches accumulated since the last report.  The first report covers
    # indices 0..debug_steps (debug_steps + 1 batches), so the averages must
    # be divided by the real window size rather than by debug_steps.
    window_steps = 0
    for i, (images, boxes, labels) in enumerate(loader):
        images = images.to(device)
        boxes = boxes.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        confidence, locations = net(images)
        regression_loss, classification_loss = criterion(
            confidence, locations, labels, boxes
        )  # TODO CHANGE BOXES
        loss = regression_loss + classification_loss
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        running_regression_loss += regression_loss.item()
        running_classification_loss += classification_loss.item()
        window_steps += 1
        if i and i % debug_steps == 0:
            avg_loss = running_loss / window_steps
            avg_reg_loss = running_regression_loss / window_steps
            avg_clf_loss = running_classification_loss / window_steps
            logging.info(
                f"Epoch: {epoch}, Step: {i}, "
                + f"Average Loss: {avg_loss:.4f}, "
                + f"Average Regression Loss {avg_reg_loss:.4f}, "
                + f"Average Classification Loss: {avg_clf_loss:.4f}"
            )
            running_loss = 0.0
            running_regression_loss = 0.0
            running_classification_loss = 0.0
            window_steps = 0

In [None]:
def test(loader, net, criterion, device):
    net.eval()
    running_loss = 0.0
    running_regression_loss = 0.0
    running_classification_loss = 0.0
    num = 0
    for _, data in enumerate(loader):
        images, boxes, labels = data
        images = images.to(device)
        boxes = boxes.to(device)
        labels = labels.to(device)
        num += 1

        with torch.no_grad():
            confidence, locations = net(images)
            regression_loss, classification_loss = criterion(
                confidence, locations, labels, boxes
            )
            loss = regression_loss + classification_loss

        running_loss += loss.item()
        running_regression_loss += regression_loss.item()
        running_classification_loss += classification_loss.item()
    return (
        running_loss / num,
        running_regression_loss / num,
        running_classification_loss / num,
    )

In [None]:
def fold_train(args, iter_i):
    """Train one model on a cross-validation fold and save its weights.

    The function builds the training and validation datasets, creates the
    network selected by ``args.net``, trains it for ``args.num_epochs`` epochs
    (validating every ``args.validation_epochs`` epochs and after the last
    one) and finally saves the weights under
    ``<args.checkpoint_folder>/<fold folder name>/``.

    Args:
        args: settings object; the attributes read here include ``net``,
            ``datasets``, ``validation_dataset``, ``dataset_type``, ``lr``,
            ``scheduler`` and ``checkpoint_folder``.
        iter_i: repetition index, used in log messages and in the saved
            model's file name.

    Raises:
        ValueError: if ``args.net``, ``args.dataset_type`` or
            ``args.scheduler`` is not supported, or if ``args.datasets`` is
            empty.
    """
    timer = Timer()

    fold_name = os.path.basename(args.validation_dataset)
    # Drop the first four characters of the folder name (the 'fold' prefix of
    # folders made by create_cross_dirs) to get the fold number for the log.
    fold_i = fold_name[4:]
    logging.info(f'Starting fold {fold_i} -- iteration {iter_i}')
    logging.info(f'Dataset folder: {args.validation_dataset}')

    if args.net == 'vgg16-ssd':
        create_net = create_vgg_ssd
        config = vgg_ssd_config
    elif args.net == 'mb1-ssd':
        create_net = create_mobilenetv1_ssd
        config = mobilenetv1_ssd_config
    elif args.net == 'mb1-ssd-lite':
        create_net = create_mobilenetv1_ssd_lite
        config = mobilenetv1_ssd_config
    elif args.net == 'sq-ssd-lite':
        create_net = create_squeezenet_ssd_lite
        config = squeezenet_ssd_config
    elif args.net == 'mb2-ssd-lite':
        create_net = lambda num: create_mobilenetv2_ssd_lite(   # noqa: E731
            num, width_mult=args.mb2_width_mult
        )
        config = mobilenetv1_ssd_config
    else:
        # There is no argument parser in this notebook, so report the problem
        # the same way an unsupported dataset type is reported below.
        logging.fatal("The net type is wrong.")
        raise ValueError(f"Net type {args.net} is not supported.")
    train_transform = TrainAugmentation(
        config.image_size, config.image_mean, config.image_std
    )
    target_transform = MatchPrior(
        config.priors, config.center_variance, config.size_variance, 0.5
    )

    test_transform = TestTransform(
        config.image_size, config.image_mean, config.image_std
    )

    logging.info("Prepare training datasets.")
    # Without at least one dataset neither the label file nor the number of
    # classes would be defined further down.
    if not args.datasets:
        raise ValueError("No training datasets were provided in args.datasets.")
    datasets = []
    for dataset_path in args.datasets:
        if args.dataset_type == 'voc':
            dataset = VOCDataset(
                dataset_path,
                transform=train_transform,
                target_transform=target_transform,
            )
            label_file = os.path.join(args.checkpoint_folder, "voc-model-labels.txt")
            store_labels(label_file, dataset.class_names)
            num_classes = len(dataset.class_names)
        elif args.dataset_type == 'open_images':
            dataset = OpenImagesDataset(
                dataset_path,
                transform=train_transform,
                target_transform=target_transform,
                dataset_type="train",
                balance_data=args.balance_data,
            )
            label_file = os.path.join(
                args.checkpoint_folder, "open-images-model-labels.txt"
            )
            store_labels(label_file, dataset.class_names)
            logging.info(dataset)
            num_classes = len(dataset.class_names)

        else:
            raise ValueError(f"Dataset type {args.dataset_type} is not supported.")
        datasets.append(dataset)
    logging.info(f"Stored labels into file {label_file}.")
    train_dataset = ConcatDataset(datasets)
    logging.info("Train dataset size: {}".format(len(train_dataset)))
    train_loader = DataLoader(
        train_dataset, args.batch_size, num_workers=args.num_workers, shuffle=True
    )
    logging.info("Prepare Validation datasets.")
    if args.dataset_type == "voc":
        val_dataset = VOCDataset(
            args.validation_dataset,
            transform=test_transform,
            target_transform=target_transform,
            is_test=True,
        )
    elif args.dataset_type == 'open_images':
        # NOTE(review): this uses the last training dataset path rather than
        # args.validation_dataset -- confirm that this is intended.
        val_dataset = OpenImagesDataset(
            dataset_path,
            transform=test_transform,
            target_transform=target_transform,
            dataset_type="test",
        )
        logging.info(val_dataset)
    logging.info("validation dataset size: {}".format(len(val_dataset)))

    val_loader = DataLoader(
        val_dataset, args.batch_size, num_workers=args.num_workers, shuffle=False
    )
    logging.info("Build network.")
    net = create_net(num_classes)
    last_epoch = -1

    # Per-group learning rates fall back to the global one when unset.
    base_net_lr = args.base_net_lr if args.base_net_lr is not None else args.lr
    extra_layers_lr = (
        args.extra_layers_lr if args.extra_layers_lr is not None else args.lr
    )
    if args.freeze_base_net:
        logging.info("Freeze base net.")
        freeze_net_layers(net.base_net)
        params = [
            {
                'params': itertools.chain(
                    net.source_layer_add_ons.parameters(), net.extras.parameters()
                ),
                'lr': extra_layers_lr,
            },
            {
                'params': itertools.chain(
                    net.regression_headers.parameters(),
                    net.classification_headers.parameters(),
                )
            },
        ]
    elif args.freeze_net:
        freeze_net_layers(net.base_net)
        freeze_net_layers(net.source_layer_add_ons)
        freeze_net_layers(net.extras)
        params = itertools.chain(
            net.regression_headers.parameters(), net.classification_headers.parameters()
        )
        logging.info("Freeze all the layers except prediction heads.")
    else:
        params = [
            {'params': net.base_net.parameters(), 'lr': base_net_lr},
            {
                'params': itertools.chain(
                    net.source_layer_add_ons.parameters(), net.extras.parameters()
                ),
                'lr': extra_layers_lr,
            },
            {
                'params': itertools.chain(
                    net.regression_headers.parameters(),
                    net.classification_headers.parameters(),
                )
            },
        ]

    timer.start("Load Model")
    if args.resume:
        logging.info(f"Resume from the model {args.resume}")
        net.load(args.resume)
    elif args.base_net:
        logging.info(f"Init from base net {args.base_net}")
        net.init_from_base_net(args.base_net)
    elif args.pretrained_ssd:
        logging.info(f"Init from pretrained ssd {args.pretrained_ssd}")
        net.init_from_pretrained_ssd(args.pretrained_ssd)
    logging.info(f'Took {timer.end("Load Model"):.2f} seconds to load the model.')

    # DEVICE is a notebook-level global defined in a separate cell.
    net.to(DEVICE)

    criterion = MultiboxLoss(
        config.priors,
        iou_threshold=0.5,
        neg_pos_ratio=3,
        center_variance=0.1,
        size_variance=0.2,
        device=DEVICE,
    )
    optimizer = torch.optim.SGD(
        params, lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay
    )
    logging.info(
        f"Learning rate: {args.lr}, Base net learning rate: {base_net_lr}, "
        + f"Extra Layers learning rate: {extra_layers_lr}."
    )

    if args.scheduler == 'multi-step':
        logging.info("Uses MultiStepLR scheduler.")
        milestones = [int(v.strip()) for v in args.milestones.split(",")]
        scheduler = MultiStepLR(
            optimizer, milestones=milestones, gamma=args.gamma, last_epoch=last_epoch
        )
    elif args.scheduler == 'cosine':
        logging.info("Uses CosineAnnealingLR scheduler.")
        scheduler = CosineAnnealingLR(optimizer, args.t_max, last_epoch=last_epoch)
    else:
        logging.fatal(f"Unsupported Scheduler: {args.scheduler}.")
        raise ValueError(f"Scheduler {args.scheduler} is not supported.")

    # Stays NaN only when the loop below runs zero epochs, so the checkpoint
    # name can always be built.
    val_loss = float('nan')

    logging.info(f"Start training from epoch {last_epoch + 1}.")
    for epoch in range(last_epoch + 1, args.num_epochs):
        train(
            train_loader,
            net,
            criterion,
            optimizer,
            device=DEVICE,
            debug_steps=args.debug_steps,
            epoch=epoch,
        )
        # Advance the schedule only after the epoch's optimizer steps, so the
        # first epoch trains with the initial learning rate.
        scheduler.step()

        if epoch % args.validation_epochs == 0 or epoch == args.num_epochs - 1:
            val_loss, val_regression_loss, val_classification_loss = test(
                val_loader, net, criterion, DEVICE
            )
            logging.info(
                f"Epoch: {epoch}, "
                + f"Validation Loss: {val_loss:.4f}, "
                + f"Validation Regression Loss {val_regression_loss:.4f}, "
                + f"Validation Classification Loss: {val_classification_loss:.4f}"
            )

    # One checkpoint sub-folder per fold.
    save_path = os.path.join(args.checkpoint_folder, fold_name)
    os.makedirs(save_path, exist_ok=True)

    model_path = os.path.join(
        save_path, f"{args.net}-{iter_i}-Loss-{val_loss}.pth"
    )

    net.save(model_path)
    logging.info(f"Saved model {model_path}")

    logging.info(f"Iteration {iter_i} finished!")

In [None]:
# Use the first CUDA device when CUDA is both available and requested;
# otherwise fall back to the CPU.
if torch.cuda.is_available() and args.use_cuda:
    DEVICE = torch.device("cuda:0")
    torch.backends.cudnn.benchmark = True
    logging.info("Use Cuda.")
else:
    DEVICE = torch.device("cpu")

In [None]:
# Split the dataset into 5 folds, then train 10 models per fold.
cross_dirs = create_cross_dirs(ori_dataset_dir, 5)

for fold_i, dataset_folder in enumerate(cross_dirs):
    # The same fold folder provides both the training and the validation data.
    args.validation_dataset = dataset_folder
    args.datasets = [dataset_folder]

    for iter_i in range(10):
        fold_train(args, iter_i)

In [None]:
# Remove the generated cross-validation folder (the parent of the fold
# folders).  It holds only symlinks and image-set lists; the trained models
# are saved under args.checkpoint_folder and are not touched.
if cross_dirs:
    rmtree(os.path.dirname(cross_dirs[0]), ignore_errors=True)