# Import libraries

In [None]:
# these packages may be useful

import os
import sys
import time
import random
import argparse
import traceback

import numpy as np
import matplotlib.pyplot as plt

## GPU environment variables

In a notebook, using the `%env` magic may be a better solution, but it is not a general one (it only works under IPython/Jupyter).

In [None]:
# Index of the GPU this notebook is allowed to see.
gpu = "1"

# Apply all GPU/logging related environment settings in one place.
os.environ.update(
    {
        # TensorFlow C++ log level (open question: which TF versions honour it?)
        "TF_CPP_MIN_LOG_LEVEL": "2",
        # Enumerate devices by PCI bus id, then expose only the selected one.
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
        "CUDA_VISIBLE_DEVICES": gpu,
    }
)

# import code

%load filename

%%writefile

In [None]:
%load_ext autoreload
%autoreload 2

from ExpUtils import *

# model arch

In [None]:
# the model can also be loaded from a file

# define args

In [None]:
def parse_args(argv=None):
    """Build the experiment configuration from command-line style arguments.

    Besides parsing, this function has side effects: it sets the
    ``CUDA_DEVICE_ORDER`` and ``CUDA_VISIBLE_DEVICES`` environment variables,
    logs the final arguments through ``wlog`` and, when ``--vis`` is given,
    attaches a file logger and a ``SummaryWriter`` bound to a per-run
    directory.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) keeps
            the original behaviour of reading ``sys.argv[1:]``. Pass an
            explicit list (e.g. ``[]``) when ``sys.argv`` does not belong to
            this script, for instance when calling from a notebook cell.

    Returns:
        A ``(args, kwargs)`` tuple. ``args`` is the parsed namespace extended
        with ``dir_path``, ``cuda``, ``device`` and (only with ``--vis``)
        ``writer``. ``kwargs`` is ``{'num_workers': 1, 'pin_memory': True}``
        when CUDA is used and ``{}`` otherwise (presumably meant for a data
        loader -- confirm against the caller).
    """
    parser = argparse.ArgumentParser(description='PyTorch Playground')
    parser.add_argument('--dataset', type=str, default='cifar10', help='cifar10, svhn (default: cifar10)')
    parser.add_argument('--data-dir', type=str, default='./dataset', help='default: ./dataset')
    parser.add_argument('--trainer', type=str, default='ce', help='ae, be, ce, de (default: ce)')
    parser.add_argument('--size', type=int, default=1000, help='size of training data set (default: 1000)')
    parser.add_argument('--arch', type=str, default='CNN', help='MLP, CNN, ResNet, VGG')
    parser.add_argument('--num-epochs', type=int, default=100, metavar='N', help='number of epochs (default: 100)')

    parser.add_argument('--seed', type=int, default=1, metavar='N', help='random seed (default: 1)')
    parser.add_argument('--no-cuda', action='store_true', default=False, help='disables CUDA training')
    parser.add_argument('--gpu-id', type=str, default="", metavar='N', help='gpu id list (default: auto select)')
    parser.add_argument('--log-interval', type=int, default=1, metavar='N', help='iterations to wait before logging status, (default: 1)')
    parser.add_argument('--batch-size', type=int, default=32, help='batch size of training data set (default: 32)')

    parser.add_argument('--lr', type=float, default=0.001, help='learning rate (default: 0.001)')
    parser.add_argument('--lr-decay', type=float, default=0.95, help='learning rate decay (default: 0.95)')
    parser.add_argument('--epoch-decay-start', type=float, default=80, help='start to decay learning rate (default: 80)')
    parser.add_argument('--affine', action='store_true', default=False, help='batch norm affine configuration')

    parser.add_argument('--drop', type=float, default=0.5, help='dropout rate, (default: 0.5)')
    parser.add_argument('--log-dir', type=str, default='', metavar='S', help='tensorboard directory, (default: an absolute path)')
    parser.add_argument('--log-arg', type=str, default='', metavar='S', help='show the arguments in directory name')
    parser.add_argument('--debug', action='store_true', default=False, help='compare log side by side')
    parser.add_argument('--vis', action='store_true', default=False, help='visual by tensor board')
    parser.add_argument('-r', '--resume', type=str, default='', metavar='S', help='resume from pth file')

    # argv=None makes argparse fall back to sys.argv[1:], as before.
    args = parser.parse_args(argv)
    args.dir_path = None

    if args.gpu_id == "":
        args.gpu_id = auto_select_gpu()
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    # os.environ only accepts strings; coerce in case the auto-selected id
    # is not already a str.
    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu_id)

    if not args.log_arg:
        args.log_arg = "trainer-size-lr"

    if args.vis:
        # Mark the run with selected argument values, the pid and the start
        # time so that concurrent experiments get distinct directories.
        args_dict = vars(args)
        run_time = time.strftime('%d%H%M', time.localtime(time.time()))
        exp_marker = "-".join("%s=%s" % (e, str(args_dict.get(e, "None"))) for e in args.log_arg.split("-"))
        exp_marker = "NameABC/%s/%s_%d_%s" % (args.dataset, exp_marker, os.getpid(), run_time)
        # expanduser('~') honours $HOME when set and still resolves a home
        # directory when it is not, instead of raising KeyError.
        base_dir = args.log_dir if args.log_dir else os.path.join(os.path.expanduser('~'), 'project/runs')
        dir_path = os.path.join(base_dir, exp_marker)
        args.dir_path = dir_path
        set_file_logger(logger, args)
        args.writer = SummaryWriter(log_dir=dir_path)
    wlog("args in this experiment:\n%s" % '\n'.join(str(e) for e in sorted(vars(args).items())))

    # CUDA is used only when it is both requested and available.
    args.cuda = not args.no_cuda and torch.cuda.is_available()

    device = torch.device("cuda" if args.cuda else "cpu")
    args.device = device

    kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
    return args, kwargs

# train function

In [None]:
def train(train_set, test_set, ..., args):    
    """Template training loop (pseudo-code: placeholders must be filled in).

    Looks up the model class named by ``args.arch`` on ``CNN``, optimises it
    with Adam at ``args.lr``, optionally restores model/optimizer state when
    ``args.resume`` is set, and evaluates every ``args.log_interval`` steps.
    With ``args.vis`` it also writes checkpoints under ``args.dir_path`` and
    logs the training loss to ``args.writer``.

    Placeholders that are not valid Python yet: the ``...`` parameter in the
    signature, the ``epoch/iteration`` loop variable, ``total_steps`` and the
    loss computation that should define ``total_loss``.

    NOTE(review): ``exp_marker`` and ``test_loader`` are used below but are
    not defined in this function -- confirm where they should come from.
    NOTE(review): ``start_epoch`` is set (and restored on resume) but the
    loop below does not read it.
    """
    arch = getattr(CNN, args.arch)
    model = arch(args)
    
    if args.debug:
        # for pytorch, or weight initialization in the model
        # weights init is based on numpy, so only need np.random.seed()
        np.random.seed(args.seed)
        model.apply(weights_init_uniform)
        
    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    # use linear lr decay or some other schedulers
    # scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=args.lr_decay)
    start_epoch = 0
    if args.resume:
        # resume example
        # NOTE(review): exp_marker is a local of parse_args, not visible here.
        checkpoint = load_checkpoint_by_marker(args, exp_marker)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        start_epoch = checkpoint['epoch']
        # move every tensor held in the restored optimizer state to the target device
        for state in optimizer.state.values():
            for k, v in state.items():
                if isinstance(v, torch.Tensor):
                    state[k] = v.to(args.device)
    model = model.to(args.device)
    
    # placeholder loop header: choose either `epoch` or `iteration` as the variable
    for epoch/iteration in range(total_steps):
        images, labels = train_set
        
        # loss = ...
        # (placeholder: this is where total_loss is expected to be computed)
        
        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()
        
        if epoch/iteration % args.log_interval == 0:
            # NOTE(review): test_loader is not defined in this function.
            n_err, test_loss = evaluate_classifier(model, test_loader, args.device)
            acc = 1 - n_err / len(test_set)
            print("Epoch %d, test acc: %.5f" % (epoch, acc))
            if args.vis:
                if epoch % 5 == 0:
                    # every 5 epochs, overwrite the rolling "model.pth" checkpoint
                    torch.save({'epoch': epoch, 'model_state_dict': model.state_dict(), 'optimizer_state_dict': optimizer.state_dict(), 'loss': test_loss, 'acc': acc},
                               "%s/%s.pth" % (args.dir_path, "model"))
                    if epoch % 50 == 0:
                        # every 50 epochs, additionally keep a per-epoch milestone file
                        torch.save({'epoch': epoch, 'model_state_dict': model.state_dict(), 'optimizer_state_dict': optimizer.state_dict(), 'loss': test_loss, 'acc': acc},
                                   "%s/model_%d.pth" % (args.dir_path, epoch))
                        
                # record the training loss for this step in the summary writer
                args.writer.add_scalar("Train/total_loss", total_loss, epoch)

In [None]:
def main():
    """Entry point: parse arguments, seed the framework, load data and train.

    The ``...`` passed to ``train`` is the template's placeholder for extra
    arguments and should be replaced together with the matching parameter in
    ``train``'s signature.
    """
    # kwargs is returned by parse_args but not consumed here yet.
    args, kwargs = parse_args()
    set_framework_seed(args.seed, args.debug)
    train_set, test_set = load_dataset(args.data_dir, valid=False, dataset_seed=args.seed)
    # The original format string had no replacement field, so the size was
    # silently dropped from the log message.
    wlog("size of dataset: {}".format(train_set.size()))
    train(train_set, test_set, ..., args)