In [1]:
import os, sys
import argparse
import logging
import mxnet as mx
import random
from mxnet.io import DataBatch, DataIter
import numpy as np
import time
import subprocess
import errno
print(mx.__version__)

1.2.0


In [2]:
def add_data_args(parser):
    data = parser.add_argument_group('Data', 'the input images')
    data.add_argument('--data-train', type=str, help='the training data')
    data.add_argument('--data-val', type=str, help='the validation data')
    data.add_argument('--rgb-mean', type=str, default='123.68,116.779,103.939',
                      help='a tuple of size 3 for the mean rgb')
    data.add_argument('--pad-size', type=int, default=0,
                      help='padding the input image')
    data.add_argument('--image-shape', type=str,
                      help='the image shape feed into the network, e.g. (3,224,224)')
    data.add_argument('--num-classes', type=int, help='the number of classes')
    data.add_argument('--num-examples', type=int, help='the number of training examples')
    data.add_argument('--data-nthreads', type=int, default=4,
                      help='number of threads for data decoding')
    return data

In [8]:
def get_rec_iter(args, kv=None):
    image_shape = tuple([int(l) for l in args.image_shape.split(',')])
    dtype = np.float32

    if kv:
        (rank, nworker) = (kv.rank, kv.num_workers)
    else:
        (rank, nworker) = (0, 1)

    rgb_mean = [float(i) for i in args.rgb_mean.split(',')]

    train = mx.io.ImageRecordIter(
        path_imgrec         = args.data_train,
        label_width         = 1,
        mean_r              = rgb_mean[0],
        mean_g              = rgb_mean[1],
        mean_b              = rgb_mean[2],
        data_name           = 'data',
        label_name          = 'softmax_label',
        data_shape          = image_shape,
        batch_size          = args.batch_size,
        pad                 = args.pad_size,
        fill_value          = 127,
        preprocess_threads  = args.data_nthreads,
        shuffle             = True,
        num_parts           = nworker,
        part_index          = rank)
    if args.data_val is None:
        return (train, None)
    val = mx.io.ImageRecordIter(
        path_imgrec         = args.data_val,
        label_width         = 1,
        mean_r              = rgb_mean[0],
        mean_g              = rgb_mean[1],
        mean_b              = rgb_mean[2],
        data_name           = 'data',
        label_name          = 'softmax_label',
        batch_size          = args.batch_size,
        data_shape          = image_shape,
        preprocess_threads  = args.data_nthreads,
        num_parts           = nworker,
        part_index          = rank)
    return (train, val)

In [9]:
def _save_model(args, rank=0):
    if args.model_prefix is None:
        return None

    return mx.callback.do_checkpoint(args.model_prefix if rank == 0 else "%s-%d" % (
        args.model_prefix, rank))

In [10]:
def add_fit_args(parser):
    """
    parser : argparse.ArgumentParser
    return a parser added with args required by fit
    """
    train = parser.add_argument_group('Training', 'model training')
    train.add_argument('--network', type=str,
                       help='the neural network to use')
    train.add_argument('--num-layers', type=int,
                       help='number of layers in the neural network, required by some networks such as resnet')
    train.add_argument('--gpus', type=str,
                       help='list of gpus to run, e.g. 0 or 0,2,5. empty means using cpu')
    train.add_argument('--kv-store', type=str, default='device',
                       help='key-value store type')
    train.add_argument('--num-epochs', type=int, default=100,
                       help='max num of epochs')
    train.add_argument('--lr', type=float, default=0.1,
                       help='initial learning rate')
    train.add_argument('--optimizer', type=str, default='sgd',
                       help='the optimizer type')
    train.add_argument('--mom', type=float, default=0.9,
                       help='momentum for sgd')
    train.add_argument('--wd', type=float, default=0.0001,
                       help='weight decay for sgd')
    train.add_argument('--batch-size', type=int, default=128,
                       help='the batch size')
    train.add_argument('--disp-batches', type=int, default=40,
                       help='show progress for every n batches')
    train.add_argument('--model-prefix', type=str,
                       help='model prefix')
    parser.add_argument('--monitor', dest='monitor', type=int, default=0,
                        help='log network parameters every N iters if larger than 0')
    return train

def fit(args, network, data_loader, **kwargs):
    """
    train a model
    args : argparse returns
    network : the symbol definition of the neural network
    data_loader : function that returns the train and val data iterators
    """
    # kvstore
    kv = mx.kvstore.create(args.kv_store)
    print("args kvstore is %s"%(kv))

    # logging
    head = '%(asctime)-15s Node[' + str(kv.rank) + '] %(message)s'
    logging.basicConfig(level=logging.DEBUG, format=head)
    logging.info('start with arguments %s', args)

    # data iterators
    (train, val) = data_loader(args, kv)



    # save model
    checkpoint = _save_model(args, kv.rank)

    # devices for training
    devs = mx.gpu(0)

    # create model
    model = mx.mod.Module(
        context       = devs,
        symbol        = network
    )

    optimizer_params = {
            'learning_rate': args.lr,
            'momentum' : args.mom,
            'wd' : args.wd}

    monitor = mx.mon.Monitor(args.monitor, pattern=".*") if args.monitor > 0 else None

    if args.network == 'alexnet':
        # AlexNet will not converge using Xavier
        initializer = mx.init.Normal()
    else:
        initializer = mx.init.Xavier(
            rnd_type='gaussian', factor_type="in", magnitude=2)

    # evaluation metrices
    eval_metrics = ['accuracy']

    # callbacks that run after each batch
    batch_end_callbacks = [mx.callback.Speedometer(args.batch_size, args.disp_batches)]

    # run
    model.fit(train,
        num_epoch          = args.num_epochs,
        eval_data          = val,
        eval_metric        = eval_metrics,
        kvstore            = kv,
        optimizer          = args.optimizer,
        optimizer_params   = optimizer_params,
        initializer        = initializer,
        batch_end_callback = batch_end_callbacks,
        epoch_end_callback = checkpoint,
        allow_missing      = True,
        monitor            = monitor)

In [11]:
logger = logging.getLogger()

if logger.handlers:
    logger.handlers[0].close()
    logger.handlers = []

fhandler = logging.FileHandler(filename='lab2.log', mode='w')
console = logging.StreamHandler()

# tell the handler to use this format
formatter = logging.Formatter('%(asctime)s - %(message)s')
fhandler.setFormatter(formatter)
console.setFormatter(formatter)
# add the handler to the root logger
logger.addHandler(fhandler)
logger.addHandler(console)
console.setLevel(logging.DEBUG)
logger.setLevel(logging.DEBUG)

if __name__ == '__main__':
    # download data
    (train_fname, val_fname) = ('./cifar10_train.rec','./cifar10_val.rec')

    # parse args
    parser = argparse.ArgumentParser(description="train cifar10",
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    add_fit_args(parser)
    add_data_args(parser)
    parser.set_defaults(
        # network
        network        = 'resnet', #Network Name
        num_layers     = 50, #Number of layers in the network

        # data
        data_train     = train_fname, # Training dataset
        data_val       = val_fname, # Validation data
        num_classes    = 10, # number of classes
        num_examples  = 50000,
        image_shape    = '3,28,28',
        pad_size       = 4,

        # train
        batch_size     = 128,
        num_epochs     = 5,
        lr             = .01
    )
    args = parser.parse_args('--model-prefix model'.split())

    # load network
    print(args.network)
    from importlib import import_module
    net = import_module('symbols.'+args.network)
    sym = net.get_symbol(**vars(args))
    print(sym)

    model_prefix = 'mx_resnet'
    checkpoint = mx.callback.do_checkpoint(model_prefix)
    # train
    fit(args, sym, get_rec_iter, epoch_end_callback=checkpoint)

2018-07-13 05:31:36,224 - start with arguments Namespace(batch_size=128, data_nthreads=4, data_train='./cifar10_train.rec', data_val='./cifar10_val.rec', disp_batches=40, gpus=None, image_shape='3,28,28', kv_store='device', lr=0.01, model_prefix='model', mom=0.9, monitor=0, network='resnet', num_classes=10, num_epochs=5, num_examples=50000, num_layers=50, optimizer='sgd', pad_size=4, rgb_mean='123.68,116.779,103.939', wd=0.0001)


resnet
<Symbol softmax>
args kvstore is <mxnet.kvstore.KVStore object at 0x7fc280e28b70>


MXNetError: [05:31:36] src/io/input_split_base.cc:173: Check failed: files_.size() != 0U (0 vs. 0) Cannot find any files that matches the URI pattern ./cifar10_train.rec

Stack trace returned 10 entries:
[bt] (0) /home/ubuntu/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x1c05f2) [0x7fc25bc3f5f2]
[bt] (1) /home/ubuntu/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x1c0bd8) [0x7fc25bc3fbd8]
[bt] (2) /home/ubuntu/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2de7a0f) [0x7fc25e866a0f]
[bt] (3) /home/ubuntu/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2de7ec7) [0x7fc25e866ec7]
[bt] (4) /home/ubuntu/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2daaf79) [0x7fc25e829f79]
[bt] (5) /home/ubuntu/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2dab52d) [0x7fc25e82a52d]
[bt] (6) /home/ubuntu/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2871fe5) [0x7fc25e2f0fe5]
[bt] (7) /home/ubuntu/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x287277c) [0x7fc25e2f177c]
[bt] (8) /home/ubuntu/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(MXDataIterCreateIter+0x1d0) [0x7fc25e198f30]
[bt] (9) /home/ubuntu/anaconda3/envs/mxnet_p36/lib/python3.6/lib-dynload/../../libffi.so.6(ffi_call_unix64+0x4c) [0x7fc295769ec0]

