In [2]:
import argparse
import datetime
import glob
import os
from pathlib import Path
from test import repeat_eval_ckpt, eval_single_ckpt
from noise import add_noise_to_weights
import numba
import logging
import time

from bayes_opt import BayesianOptimization
from bayes_opt.logger import JSONLogger
from bayes_opt.event import Events

import torch
import torch.distributed as dist
import torch.nn as nn
from tensorboardX import SummaryWriter

from pcdet.config import cfg, cfg_from_list, cfg_from_yaml_file, log_config_to_file
from pcdet.datasets import build_dataloader
from pcdet.models_multinomial import build_network, model_fn_decorator
from pcdet.utils import common_utils
from train_utils.optimization import build_optimizer, build_scheduler
from train_utils.train_utils import train_model, model_save
from eval_utils import eval_utils_multinomial
import numpy as np

import matplotlib.pyplot as plt
import matplotlib
matplotlib.use('Agg')


def parse_config(argv=None):
    """Parse the command-line options and load the experiment config.

    Args:
        argv: Optional list of argument strings to parse. ``None`` (the
            default) makes argparse read ``sys.argv[1:]``, which is the
            original behaviour. Parsing uses ``parse_known_args``, so
            unrecognised flags (for example ones injected by a notebook
            kernel) are ignored instead of aborting.

    Returns:
        ``(args, cfg)``: the parsed ``argparse.Namespace`` and the module-level
        ``cfg`` object after ``cfg_from_yaml_file`` has loaded ``args.cfg_file``
        into it and any ``--set`` overrides were applied via ``cfg_from_list``.
    """
    parser = argparse.ArgumentParser(description='arg parser')
    parser.add_argument('--cfg_file', type=str, default='./cfgs/kitti_models/pointpillar_bayes.yaml',
                        help='specify the config for training')
    parser.add_argument('--batch_size', type=int, default=8, required=False, help='batch size for training')
    parser.add_argument('--epochs', type=int, default=80, required=False, help='number of epochs to train for')
    parser.add_argument('--workers', type=int, default=64, help='number of workers for dataloader')
    parser.add_argument('--extra_tag', type=str, default='multinomial', help='extra tag for this experiment')
    parser.add_argument('--ckpt', type=str, default='checkpoint_epoch_80_bayes.pth',
                        help='checkpoint to start from')

    # NOTE(review): declared as a string option but defaults to the boolean
    # True; left untouched because code elsewhere may test `is True`.
    parser.add_argument('--pretrained_model', type=str, default=True, help='pretrained_model')
    parser.add_argument('--launcher', choices=['none', 'pytorch', 'slurm'], default='none')
    parser.add_argument('--tcp_port', type=int, default=18888, help='tcp port for distrbuted training')
    parser.add_argument('--sync_bn', action='store_true', default=False, help='whether to use sync bn')
    # NOTE(review): store_true with default=True means this flag is always on.
    parser.add_argument('--fix_random_seed', action='store_true', default=True, help='')
    parser.add_argument('--ckpt_save_interval', type=int, default=1, help='number of training epochs')
    parser.add_argument('--local_rank', type=int, default=0, help='local rank for distributed training')
    parser.add_argument('--max_ckpt_save_num', type=int, default=99999, help='max number of saved checkpoint')
    parser.add_argument('--merge_all_iters_to_one_epoch', action='store_true', default=False, help='')
    parser.add_argument('--set', dest='set_cfgs', default=None, nargs=argparse.REMAINDER,
                        help='set extra config keys if needed')

    parser.add_argument('--max_waiting_mins', type=int, default=0, help='max waiting minutes')
    parser.add_argument('--start_epoch', type=int, default=0, help='')
    parser.add_argument('--save_to_file', action='store_true', default=False, help='')

    args = parser.parse_known_args(argv)[0]

    cfg_from_yaml_file(args.cfg_file, cfg)
    cfg_path = Path(args.cfg_file)
    cfg.TAG = cfg_path.stem
    # Remove the leading 'cfgs' directory and the trailing 'xxxx.yaml'.
    # Path.parts normalises a leading './' and doubled slashes, so the default
    # './cfgs/...' path really loses 'cfgs' (a plain str.split('/') dropped the
    # '.' instead and kept 'cfgs' in the group path).
    cfg.EXP_GROUP_PATH = '/'.join(cfg_path.parts[1:-1])

    if args.set_cfgs is not None:
        cfg_from_list(args.set_cfgs, cfg)

    return args, cfg

class Opt():
    """Objective wrapper used for Bayesian optimisation over ``(p1, p2, p3)``.

    The constructor stores the train/test datasets, loaders and samplers.
    ``opt_function`` additionally relies on notebook-level globals that must
    exist before it is called: ``model``, ``optimizer``, ``args``, ``cfg``,
    ``logger``, ``tb_log``, ``save_path``, ``eval_output_dir`` and
    ``dist_train``.
    """

    def __init__(self, train_set, train_loader, train_sampler, test_set, test_loader, sampler):
        """Keep references to the datasets, loaders and samplers."""
        self.train_set = train_set
        self.train_loader = train_loader
        self.train_sampler = train_sampler
        self.test_set = test_set
        self.test_loader = test_loader
        self.sampler = sampler

    def opt_function(self, p1, p2, p3):
        """Train the shared global model, evaluate it and return the score.

        NOTE(review): the per-trial ``build_network(..., p1=p1, p2=p2, p3=p3)``
        and checkpoint-loading steps that used to live here are disabled, so
        ``p1``/``p2``/``p3`` are only written to ``result.txt`` and forwarded
        to ``eval_simple``; the global ``model`` is reused (and trained again
        from epoch 0) on every call. Confirm this is intended.

        Args:
            p1, p2, p3: The hyper-parameters proposed for this trial.

        Returns:
            Whatever ``eval_utils_multinomial.eval_simple`` returns for this
            trial.
        """
        # `n` is a notebook-level call counter forwarded to eval_simple.
        # Start it at 1 when the notebook never initialised it instead of
        # failing with NameError on the first trial.
        global n
        try:
            n += 1
        except NameError:
            n = 1

        print("=============")
        print(p1, p2, p3)
        print("=============")

        # Every trial starts from epoch/iteration 0 with a fresh scheduler.
        start_epoch = it = 0
        last_epoch = -1

        # Switch to train mode before any DistributedDataParallel wrapping so
        # that fixing some parameters stays supported (wrapping is disabled).
        model.train()
        model.cuda()

        # Use the loaders/samplers handed to the constructor rather than
        # same-named notebook globals.
        lr_scheduler, lr_warmup_scheduler = build_scheduler(
            optimizer, total_iters_each_epoch=len(self.train_loader), total_epochs=args.epochs,
            last_epoch=last_epoch, optim_cfg=cfg.OPTIMIZATION
        )

        # -----------------------start training---------------------------
        logger.info('**********************Start training %s/%s(%s)**********************'
                    % (cfg.EXP_GROUP_PATH, cfg.TAG, args.extra_tag))

        output_dir = cfg.ROOT_DIR / 'tools' / 'save_path' / args.extra_tag
        ckpt_dir = output_dir / 'ckpt'
        output_dir.mkdir(parents=True, exist_ok=True)
        ckpt_dir.mkdir(parents=True, exist_ok=True)

        print(ckpt_dir)

        train_model(
            model,
            optimizer,
            self.train_loader,
            model_func=model_fn_decorator(),
            lr_scheduler=lr_scheduler,
            optim_cfg=cfg.OPTIMIZATION,
            start_epoch=start_epoch,
            total_epochs=args.epochs,
            start_iter=it,
            rank=cfg.LOCAL_RANK,
            tb_log=tb_log,
            ckpt_save_dir=ckpt_dir,
            train_sampler=self.train_sampler,
            lr_warmup_scheduler=lr_warmup_scheduler,
            ckpt_save_interval=args.ckpt_save_interval,
            max_ckpt_save_num=args.max_ckpt_save_num,
            merge_all_iters_to_one_epoch=args.merge_all_iters_to_one_epoch
        )

        logger.info('**********************End training**********************')

        # Append a header for this trial; the context manager closes the file
        # even if the write fails.
        with open(save_path+'result.txt', "a+") as f:
            f.write('----------------{}-{}-{}---------------\n'.format(p1, p2, p3))

        acc1 = eval_utils_multinomial.eval_simple(args.ckpt, p1, p2, p3, 0, n, cfg, model, self.test_loader, logger, save_path, dist_test=dist_train, save_to_file=args.save_to_file, result_dir=eval_output_dir)
        print("----------")
        print(acc1)
        print("----------")

        logger.info('**********************End evaluation**********************')

        return acc1




if __name__ == '__main__':
    # Script section: set up logging/output directories, build the KITTI
    # dataloaders and the network, then train it once with fixed p1/p2/p3.

    # Local import: only this section needs it (used to copy the config file).
    import shutil

    torch.cuda.set_device(0)

    args, cfg = parse_config()
    # Single-process run; the launcher-based distributed setup is disabled.
    dist_train = False

    if args.fix_random_seed:
        common_utils.set_random_seed(666)

    output_dir = cfg.ROOT_DIR / 'output' / cfg.EXP_GROUP_PATH / 'bayes' / args.extra_tag
    ckpt_dir = output_dir / 'ckpt'
    output_dir.mkdir(parents=True, exist_ok=True)
    ckpt_dir.mkdir(parents=True, exist_ok=True)

    # Log and result files go to a path relative to the working directory.
    save_path = './save_path/bayes/'
    os.makedirs(save_path, exist_ok=True)

    logger = common_utils.create_logger(save_path+'log.txt', rank=cfg.LOCAL_RANK)

    # Start a fresh result file for this run (truncates any previous one).
    with open(save_path+'result.txt', 'w') as result_file:
        result_file.write('results\n')

    # log to file
    logger.info('**********************Start logging**********************')
    gpu_list = os.environ.get('CUDA_VISIBLE_DEVICES', 'ALL')
    logger.info('CUDA_VISIBLE_DEVICES=%s' % gpu_list)

    for key, val in vars(args).items():
        logger.info('{:16} {}'.format(key, val))
    log_config_to_file(cfg, logger=logger)
    if cfg.LOCAL_RANK == 0:
        # Keep a copy of the config next to the outputs. shutil.copy avoids
        # building a shell command from a user-supplied path.
        shutil.copy(args.cfg_file, output_dir)

    tb_log = SummaryWriter(log_dir=str(output_dir / 'tensorboard')) if cfg.LOCAL_RANK == 0 else None

    eval_output_dir = output_dir / 'eval' / 'eval_with_train'
    eval_output_dir.mkdir(parents=True, exist_ok=True)
    args.start_epoch = max(args.epochs - 10, 0)  # Only evaluate the last 10 epochs

    train_set, train_loader, train_sampler = build_dataloader(
        dataset_cfg=cfg.DATA_CONFIG,
        class_names=cfg.CLASS_NAMES,
        batch_size=args.batch_size,
        dist=dist_train, workers=args.workers,
        logger=logger,
        training=True,
        merge_all_iters_to_one_epoch=args.merge_all_iters_to_one_epoch,
        total_epochs=args.epochs
    )

    test_set, test_loader, sampler = build_dataloader(
        dataset_cfg=cfg.DATA_CONFIG,
        class_names=cfg.CLASS_NAMES,
        batch_size=args.batch_size,
        dist=dist_train, workers=args.workers, logger=logger, training=False
    )

    # Fixed p1/p2/p3 for this single training run.
    model = build_network(model_cfg=cfg.MODEL, num_class=len(cfg.CLASS_NAMES), p1=0.23, p2=0.77, p3=0.68, dataset=train_set)
    model.cuda()
    optimizer = build_optimizer(model, cfg.OPTIMIZATION)

    # Train from scratch: no checkpoint is loaded in this section.
    start_epoch = 0
    it = 0
    last_epoch = -1

    lr_scheduler, lr_warmup_scheduler = build_scheduler(
        optimizer, total_iters_each_epoch=len(train_loader), total_epochs=args.epochs,
        last_epoch=last_epoch, optim_cfg=cfg.OPTIMIZATION
    )

    train_model(
        model,
        optimizer,
        train_loader,
        model_func=model_fn_decorator(),
        lr_scheduler=lr_scheduler,
        optim_cfg=cfg.OPTIMIZATION,
        start_epoch=start_epoch,
        total_epochs=args.epochs,
        start_iter=it,
        rank=cfg.LOCAL_RANK,
        tb_log=tb_log,
        ckpt_save_dir=ckpt_dir,
        train_sampler=train_sampler,
        lr_warmup_scheduler=lr_warmup_scheduler,
        ckpt_save_interval=args.ckpt_save_interval,
        max_ckpt_save_num=args.max_ckpt_save_num,
        merge_all_iters_to_one_epoch=args.merge_all_iters_to_one_epoch
    )

    
# #     # -----------------------start training---------------------------
#     logger.info('**********************Start training %s/%s(%s)**********************'
#                 % (cfg.EXP_GROUP_PATH, cfg.TAG, args.extra_tag))


#     logger.info('----------------Bayes Optimization----------------')
#     for sigma in np.linspace(1e-31, 1.0, 21):

#         # opt_function(0.11, 0.11)
#         print("=============")
#         p1 = 0.42
#         p2 = 0.23
#         p3 = 0.11
#         print(p1, p2, p3)
#         print("=============")
        
#         opt= Opt(sigma, train_set, train_loader, train_sampler, test_set, test_loader, sampler)
#         opt_function = opt.opt_function
        
#         # Bounded region of parameter space
#         pbounds = {'p1': (0.1, 0.9), 'p2': (0.1, 0.9), 'p3': (0.1, 0.9)}


#         optimizer_bayes = BayesianOptimization(
#             f=opt_function,
#             pbounds=pbounds,
#             verbose=2,  # verbose = 1 prints only when a maximum is observed, verbose = 0 is silent
#             random_state=1,
#         )
#         optimizer_bayes.probe(
#             params={'p1': 0.68, 'p2': 0.77, 'p3': 0.23},
#             lazy=True,
#         )

#         logger_bayes = JSONLogger(path=save_path+"logs2.json")
#         optimizer_bayes.subscribe(Events.OPTIMIZATION_STEP, logger_bayes)


#         n = 0
#         optimizer_bayes.maximize(
#             init_points=3,
#             n_iter=1,
#         )
#     print("=======end========")





RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [None]:
def add_noise_to_weights(mean, std, model):
    """Return a copy of ``model`` with multiplicative log-normal weight noise.

    Every parameter of the copy is multiplied element-wise by ``exp(eps)``
    with ``eps ~ Normal(mean, std)``. The model passed in is left untouched.

    NOTE: this definition shadows the ``add_noise_to_weights`` imported from
    ``noise`` in the first cell.

    Args:
        mean: Mean of the Gaussian sampled in log-space.
        std: Standard deviation of that Gaussian. ``0`` applies the
            deterministic factor ``exp(mean)`` (``Normal`` itself rejects a
            zero scale).
        model: The ``torch.nn.Module`` to perturb.

    Returns:
        A deep copy of ``model`` whose parameters carry the noise.
    """
    # Local import: `copy` is not imported at notebook level.
    import copy

    noisy_model = copy.deepcopy(model)
    noise_dist = torch.distributions.Normal(mean, std) if std != 0 else None
    with torch.no_grad():
        for param in noisy_model.parameters():
            if noise_dist is None:
                log_factor = torch.full(param.size(), float(mean))
            else:
                log_factor = noise_dist.sample(param.size())
            # Follow each parameter's own device/dtype instead of assuming
            # CUDA, so CPU models (and mixed placements) work too.
            factor = torch.exp(log_factor).to(device=param.device, dtype=param.dtype)
            param.mul_(factor)
    return noisy_model

In [None]:
# Rebuild the train/test dataloaders and a model, then restore the weights and
# optimizer state from `args.ckpt`.
# Relies on names created by earlier cells: cfg, args, dist_train, logger.
# NOTE(review): p1, p2 and p3 are not defined at notebook level in the visible
# cells, so this cell presumably depends on leftover kernel state - confirm
# which values are intended before running on a fresh kernel.
train_set, train_loader, train_sampler = build_dataloader(
    dataset_cfg=cfg.DATA_CONFIG,
    class_names=cfg.CLASS_NAMES,
    batch_size=args.batch_size,
    dist=dist_train,
    workers=args.workers,
    logger=logger,
    training=True,
    merge_all_iters_to_one_epoch=args.merge_all_iters_to_one_epoch,
    total_epochs=args.epochs,
)

test_set, test_loader, sampler = build_dataloader(
    dataset_cfg=cfg.DATA_CONFIG,
    class_names=cfg.CLASS_NAMES,
    batch_size=args.batch_size,
    dist=dist_train,
    workers=args.workers,
    logger=logger,
    training=False,
)

# Epoch/iteration bookkeeping for a run that starts from the beginning.
it = 0
start_epoch = it
last_epoch = -1

model = build_network(
    model_cfg=cfg.MODEL,
    num_class=len(cfg.CLASS_NAMES),
    p1=p1,
    p2=p2,
    p3=p3,
    dataset=train_set,
)

optimizer = build_optimizer(model, cfg.OPTIMIZATION)
# NOTE(review): `dist` here is the `torch.distributed` module imported in the
# first cell (always truthy), not a flag - presumably `dist_train` was meant;
# confirm before changing.
model.load_params_with_optimizer(
    args.ckpt,
    to_cpu=dist,
    optimizer=optimizer,
    logger=logger,
)

In [None]:
# Sanity check for add_noise_to_weights: the original model must stay
# untouched while the returned copy carries perturbed weights.
import torch.nn


def first_parameter(net):
    """Return the first parameter tensor yielded by ``net.parameters()``."""
    return next(iter(net.parameters()))


model.cuda()

# Snapshot the first parameter so the no-mutation check below is exact.
first_before = first_parameter(model).detach().clone()
print(first_parameter(model))

model1 = add_noise_to_weights(0.0, 0.3, model)

# Original model after the call, then the noisy copy.
print(first_parameter(model))
print(first_parameter(model1))

assert torch.equal(first_before, first_parameter(model)), \
    "add_noise_to_weights modified the original model in place"

In [None]:
# Show the 7th of 21 evenly spaced values between 1e-31 and 1.0 (the same grid
# as the commented-out sigma sweep in the training cell).
np.linspace(start=1e-31, stop=1.0, num=21)[6]