torch.load('./checkpoint_epoch_80_multinomial.pth', map_location=torch.device('cuda:0'))['model_state']

找到具有正确输出的预训练模型的 loss 和 mAP 的对应关系

已完成：调整学习率，降低代码复杂度



In [1]:
import tqdm
from torch.nn.utils import clip_grad_norm_
from pcdet.models_multinomial_half.detectors.pointpillar import PointPillar

import argparse
import datetime
import glob
import os
from pathlib import Path
from test import repeat_eval_ckpt, eval_single_ckpt
from noise import add_noise_to_weights
import numba
import logging
import time

from bayes_opt import BayesianOptimization
from bayes_opt.logger import JSONLogger
from bayes_opt.event import Events

import torch
import torch.distributed as dist
import torch.nn as nn
from tensorboardX import SummaryWriter

from pcdet.config import cfg, cfg_from_list, cfg_from_yaml_file, log_config_to_file
from pcdet.datasets import build_dataloader
from pcdet.models_multinomial_half import build_network, model_fn_decorator
from pcdet.utils import common_utils
from train_utils.optimization import build_optimizer, build_scheduler
# from train_utils.train_utils import train_model
# from eval_utils import eval_utils_multinomial
import numpy as np

import matplotlib.pyplot as plt
import matplotlib
from eval_utils import eval_utils_multinomial_finalv_half as eval_utils



def load_data_to_gpu(batch_dict):
    for key, val in batch_dict.items():
        if not isinstance(val, np.ndarray):
            continue
        elif key in ['frame_id', 'metadata', 'calib']:
            continue
        elif key in ['images']:
            batch_dict[key] = image_to_tensor(val).float().cuda().contiguous()
        elif key in ['image_shape']:
            batch_dict[key] = torch.from_numpy(val).int().cuda()
        else:
            batch_dict[key] = torch.from_numpy(val).float().cuda()


def parse_config():
    parser = argparse.ArgumentParser(description='arg parser')
    parser.add_argument('--cfg_file', type=str, default='./cfgs/kitti_models/pointpillar_bayes.yaml', help='specify the config for training')
    parser.add_argument('--batch_size', type=int, default=8, required=False, help='batch size for training')
    parser.add_argument('--workers', type=int, default=32, help='number of workers for dataloader')
    parser.add_argument('--extra_tag', type=str, default='default', help='extra tag for this experiment')
    parser.add_argument('--pretrained_model', type=str, default=True, help='pretrained_model')
    parser.add_argument('--launcher', choices=['none', 'pytorch', 'slurm'], default='none')
    parser.add_argument('--tcp_port', type=int, default=18888, help='tcp port for distrbuted training')
    parser.add_argument('--sync_bn', action='store_true', default=False, help='whether to use sync bn')
    parser.add_argument('--fix_random_seed', action='store_true', default=True, help='')
    parser.add_argument('--ckpt_save_interval', type=int, default=80, help='number of training epochs')
    parser.add_argument('--local_rank', type=int, default=0, help='local rank for distributed training')
    parser.add_argument('--max_ckpt_save_num', type=int, default=81, help='max number of saved checkpoint')
    parser.add_argument('--merge_all_iters_to_one_epoch', action='store_true', default=False, help='')
    parser.add_argument('--set', dest='set_cfgs', default=None, nargs=argparse.REMAINDER, help='set extra config keys if needed')
    parser.add_argument('--max_waiting_mins', type=int, default=0, help='max waiting minutes')
    parser.add_argument('--start_epoch', type=int, default=0, help='')
    parser.add_argument('--save_to_file', action='store_true', default=False, help='')
    
    
    parser.add_argument('--epochs', type=int, default=800, required=False, help='number of epochs to train for')
    parser.add_argument('--ckpt', type=str, default='checkpoint_epoch_80_multinomial.pth', help='checkpoint to start from')

    args = parser.parse_known_args()[0]

    cfg_from_yaml_file(args.cfg_file, cfg)
    cfg.TAG = Path(args.cfg_file).stem
    cfg.EXP_GROUP_PATH = '/'.join(args.cfg_file.split('/')[1: -1])  # remove 'cfgs' and 'xxxx.yaml'

    if args.set_cfgs is not None:
        cfg_from_list(args.set_cfgs, cfg)

    return args, cfg




if __name__ == '__main__':
    torch.cuda.set_device(0)

    args, cfg = parse_config()
    dist_train = False
    total_gpus = 1
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.system('nvidia-smi -q -d Memory |grep -A4 GPU|grep Free >tmp')
    memory_gpu = [int(x.split()[2]) for x in open('tmp', 'r').readlines()]
    print('Using GPU:' + str(np.argmax(memory_gpu)))
    os.environ["CUDA_VISIBLE_DEVICES"] = str(np.argmax(memory_gpu))
    os.system('rm tmp')


    if args.batch_size is None:
        args.batch_size = cfg.OPTIMIZATION.BATCH_SIZE_PER_GPU
    else:
        assert args.batch_size % total_gpus == 0, 'Batch size should match the number of gpus'
        args.batch_size = args.batch_size // total_gpus

    args.epochs = cfg.OPTIMIZATION.NUM_EPOCHS if args.epochs is None else args.epochs
    
    common_utils.set_random_seed(666)
    save_path = './save_path/logger/'#/bayes/pointpillar/'+time.strftime('%m%d-%H%M',time.localtime(time.time()))+'/'

    logger = common_utils.create_logger(save_path+'log.txt', rank=cfg.LOCAL_RANK)
    
    print("=============")
    p1 = 0.23
    p2 = 0.77
    p3 = 0.68
    print(p1, p2, p3)
    print("=============")
    

    train_set, train_loader, train_sampler = build_dataloader(
        dataset_cfg=cfg.DATA_CONFIG,
        class_names=cfg.CLASS_NAMES,
        batch_size=args.batch_size,
        dist=dist_train, workers=args.workers,
        logger=logger,
        training=True,
        merge_all_iters_to_one_epoch=args.merge_all_iters_to_one_epoch,
        total_epochs=args.epochs
    )

    model = PointPillar(
        model_cfg=cfg.MODEL, num_class=len(cfg.CLASS_NAMES), p1=p1, p2=p2, p3=p3, dataset=train_set
    )
    
    pth = torch.load('./checkpoint_epoch_80_multinomial.pth', map_location=torch.device('cuda:0'))['model_state']
    model.load_state_dict(pth)#['model_state']
    # torch.load('./checkpoint_epoch_33_multinomial.pth')['model_state']
    # model.cuda()

    optim_cfg = cfg.OPTIMIZATION
    optim_cfg.LR = 0.00000001
    optimizer = torch.optim.Adam(model.parameters(), lr=optim_cfg.LR, weight_decay=optim_cfg.WEIGHT_DECAY)

    start_epoch = it = 0
    last_epoch = -1

    # model.train()  # before wrap to DistributedDataParallel to support fixed some parameters
    # for m in model.modules():
    #     if isinstance(m, nn.BatchNorm2d):
    #         m.eval()
    def fix_bn(m):
        classname = m.__class__.__name__
        if classname.find('BatchNorm') != -1:
            m.eval()

    # model = models.resnet50(pretrained=True)
    model.cuda()
    model.train()
    model.apply(fix_bn)  # fix batchnorm

    # lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, 20, eta_min=0, last_epoch=-1)
    

#     # -----------------------start training---------------------------
    logger.info('**********************Start training %s/%s(%s)**********************'
                % (cfg.EXP_GROUP_PATH, cfg.TAG, args.extra_tag))
    
    optim_cfg=cfg.OPTIMIZATION
    total_epochs=args.epochs
    rank=cfg.LOCAL_RANK
    
    with tqdm.trange(start_epoch, total_epochs, desc='epochs', dynamic_ncols=True, leave=(rank==0)) as tbar:
        total_it_each_epoch = len(train_loader)
        for cur_epoch in tbar:
            dataloader_iter = iter(train_loader)
            loss_list = []

            for cur_it in range(total_it_each_epoch):
                batch = next(dataloader_iter)
                model.train()
                optimizer.zero_grad()
                
                load_data_to_gpu(batch)
                ret_dict, tb_dict, disp_dict = model(batch)

                loss = ret_dict['loss'].mean()
        
                loss_list.append(loss)
                loss_avg = sum(loss_list) / len(loss_list)

                loss.backward()
                # clip_grad_norm_(model.parameters(), optim_cfg.GRAD_NORM_CLIP)
                
                optimizer.step()
                # lr_scheduler.step()

                print('cur_epoch: {}/{}, cur_it: {}/{}, lr: {:.6}, loss: {:.6}'.format(cur_epoch+1, total_epochs, cur_it+1, total_it_each_epoch, optim_cfg.LR, loss_avg))
            
            logger.info('**********************Start testing**********************')
            test_set, test_loader, sampler = build_dataloader(
                                            dataset_cfg=cfg.DATA_CONFIG,
                                            class_names=cfg.CLASS_NAMES,
                                            batch_size=args.batch_size,
                                            dist=dist_train, workers=args.workers, logger=logger, training=False
                                        )

            acc1, acc2, ret = eval_utils.eval_(cfg, model, test_loader)
            print(acc1, acc2, ret)

            filename = './save_path/0722/ckpt_epoch{}.pth'.format(cur_epoch+1)
            last_filename = filename
            torch.save(model.state_dict(), filename)
        
        
    # filename = './save_path/0722/ckpt_epoch{}.pth'.format(0+1)
    # torch.save(model.state_dict(), filename)        
    
#     logger.info('**********************Start testing %s/%s(%s)**********************'
#                 % (cfg.EXP_GROUP_PATH, cfg.TAG, args.extra_tag))
#     test_set, test_loader, sampler = build_dataloader(
#                                     dataset_cfg=cfg.DATA_CONFIG,
#                                     class_names=cfg.CLASS_NAMES,
#                                     batch_size=args.batch_size,
#                                     dist=dist_train, workers=args.workers, logger=logger, training=False
#                                 )

#     acc1, acc2, ret = eval_utils.eval_(cfg, model, test_loader)
#     print(acc1, ac2, ret)

    # logger.info('----------------Bayes Optimization----------------')
    # Bounded region of parameter space
    # pbounds = {'p1': (0.1, 0.9), 'p2': (0.1, 0.9)}
    

#     optimizer = BayesianOptimization(
#         f=opt_function,
#         pbounds=pbounds,
#         verbose=2,  # verbose = 1 prints only when a maximum is observed, verbose = 0 is silent
#         random_state=1,
#     )
#     optimizer.probe(
#         params={'p1': 0.11, 'p2': 0.11},
#         lazy=True,
#     )

#     logger_bayes = JSONLogger(path=save_path+"logs2.json")
#     optimizer.subscribe(Events.OPTIMIZATION_STEP, logger_bayes)
    
    
#     n = 0
#     optimizer.maximize(
#         init_points=3,
#         n_iter=10,
#     )
    print("=======end========")





2023-07-22 23:55:24,239   INFO  Database filter by min points Car: 14357 => 13532
2023-07-22 23:55:24,240   INFO  Database filter by min points Pedestrian: 2207 => 2168
2023-07-22 23:55:24,241   INFO  Database filter by min points Cyclist: 734 => 705
2023-07-22 23:55:24,256   INFO  Database filter by difficulty Car: 13532 => 10759
2023-07-22 23:55:24,258   INFO  Database filter by difficulty Pedestrian: 2168 => 2075
2023-07-22 23:55:24,259   INFO  Database filter by difficulty Cyclist: 705 => 581
2023-07-22 23:55:24,264   INFO  Loading KITTI dataset


Using GPU:0
0.23 0.77 0.68


2023-07-22 23:55:24,343   INFO  Total samples for KITTI dataset: 3712
  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
2023-07-22 23:55:26,144   INFO  **********************Start training cfgs/kitti_models/pointpillar_bayes(default)**********************
epochs:   0%|          | 0/800 [00:00<?, ?it/s]

cur_epoch: 1/800, cur_it: 1/464, lr: 1e-08, loss: 4.69438
cur_epoch: 1/800, cur_it: 2/464, lr: 1e-08, loss: 4.57229
cur_epoch: 1/800, cur_it: 3/464, lr: 1e-08, loss: 4.98539
cur_epoch: 1/800, cur_it: 4/464, lr: 1e-08, loss: 4.83517
cur_epoch: 1/800, cur_it: 5/464, lr: 1e-08, loss: 4.75756
cur_epoch: 1/800, cur_it: 6/464, lr: 1e-08, loss: 4.6181
cur_epoch: 1/800, cur_it: 7/464, lr: 1e-08, loss: 4.54114
cur_epoch: 1/800, cur_it: 8/464, lr: 1e-08, loss: 4.5293
cur_epoch: 1/800, cur_it: 9/464, lr: 1e-08, loss: 4.48194
cur_epoch: 1/800, cur_it: 10/464, lr: 1e-08, loss: 4.40207
cur_epoch: 1/800, cur_it: 11/464, lr: 1e-08, loss: 4.41913
cur_epoch: 1/800, cur_it: 12/464, lr: 1e-08, loss: 4.3883
cur_epoch: 1/800, cur_it: 13/464, lr: 1e-08, loss: 4.37268
cur_epoch: 1/800, cur_it: 14/464, lr: 1e-08, loss: 4.3427
cur_epoch: 1/800, cur_it: 15/464, lr: 1e-08, loss: 4.31392
cur_epoch: 1/800, cur_it: 16/464, lr: 1e-08, loss: 4.30032
cur_epoch: 1/800, cur_it: 17/464, lr: 1e-08, loss: 4.29488
cur_epoch:

2023-07-23 00:01:09,648   INFO  **********************Start testing**********************
2023-07-23 00:01:09,650   INFO  Loading KITTI dataset
2023-07-23 00:01:09,744   INFO  Total samples for KITTI dataset: 3769


cur_epoch: 1/800, cur_it: 464/464, lr: 1e-08, loss: 4.54344
Average predicted number of objects(3769 samples): 0.000


The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see https://numba.readthedocs.io/en/stable/user/parallel.html#diagnostics for help.
[1m
File "../pcdet/datasets/kitti/kitti_object_eval_python/eval.py", line 122:[0m
[1m@numba.jit(nopython=True, parallel=True)
[1mdef d3_box_overlap_kernel(boxes, qboxes, rinc, criterion=-1):
[0m[1m^[0m[0m
[0m
epochs:   0%|          | 1/800 [08:59<119:48:07, 539.78s/it]

0.0 0.0 {'recall/roi_0.3': 0.0, 'recall/rcnn_0.3': 0.0, 'recall/roi_0.5': 0.0, 'recall/rcnn_0.5': 0.0, 'recall/roi_0.7': 0.0, 'recall/rcnn_0.7': 0.0, 'Car_3d/easy_R40': 0.0, 'Car_3d/moderate_R40': 0.0, 'Car_3d/hard_R40': 0.0, 'Car_bev/easy_R40': 0.0, 'Car_bev/moderate_R40': 0.0, 'Car_bev/hard_R40': 0.0, 'Car_image/easy_R40': 0.0, 'Car_image/moderate_R40': 0.0, 'Car_image/hard_R40': 0.0, 'Pedestrian_3d/easy_R40': 0.0, 'Pedestrian_3d/moderate_R40': 0.0, 'Pedestrian_3d/hard_R40': 0.0, 'Pedestrian_bev/easy_R40': 0.0, 'Pedestrian_bev/moderate_R40': 0.0, 'Pedestrian_bev/hard_R40': 0.0, 'Pedestrian_image/easy_R40': 0.0, 'Pedestrian_image/moderate_R40': 0.0, 'Pedestrian_image/hard_R40': 0.0, 'Cyclist_3d/easy_R40': 0.0, 'Cyclist_3d/moderate_R40': 0.0, 'Cyclist_3d/hard_R40': 0.0, 'Cyclist_bev/easy_R40': 0.0, 'Cyclist_bev/moderate_R40': 0.0, 'Cyclist_bev/hard_R40': 0.0, 'Cyclist_image/easy_R40': 0.0, 'Cyclist_image/moderate_R40': 0.0, 'Cyclist_image/hard_R40': 0.0}
cur_epoch: 2/800, cur_it: 1/464

2023-07-23 00:10:05,674   INFO  **********************Start testing**********************
2023-07-23 00:10:05,676   INFO  Loading KITTI dataset
2023-07-23 00:10:05,773   INFO  Total samples for KITTI dataset: 3769


cur_epoch: 2/800, cur_it: 464/464, lr: 1e-08, loss: 4.63953
Average predicted number of objects(3769 samples): 0.000


epochs:   0%|          | 2/800 [17:52<118:39:55, 535.33s/it]

0.0 0.0 {'recall/roi_0.3': 0.0, 'recall/rcnn_0.3': 0.0, 'recall/roi_0.5': 0.0, 'recall/rcnn_0.5': 0.0, 'recall/roi_0.7': 0.0, 'recall/rcnn_0.7': 0.0, 'Car_3d/easy_R40': 0.0, 'Car_3d/moderate_R40': 0.0, 'Car_3d/hard_R40': 0.0, 'Car_bev/easy_R40': 0.0, 'Car_bev/moderate_R40': 0.0, 'Car_bev/hard_R40': 0.0, 'Car_image/easy_R40': 0.0, 'Car_image/moderate_R40': 0.0, 'Car_image/hard_R40': 0.0, 'Pedestrian_3d/easy_R40': 0.0, 'Pedestrian_3d/moderate_R40': 0.0, 'Pedestrian_3d/hard_R40': 0.0, 'Pedestrian_bev/easy_R40': 0.0, 'Pedestrian_bev/moderate_R40': 0.0, 'Pedestrian_bev/hard_R40': 0.0, 'Pedestrian_image/easy_R40': 0.0, 'Pedestrian_image/moderate_R40': 0.0, 'Pedestrian_image/hard_R40': 0.0, 'Cyclist_3d/easy_R40': 0.0, 'Cyclist_3d/moderate_R40': 0.0, 'Cyclist_3d/hard_R40': 0.0, 'Cyclist_bev/easy_R40': 0.0, 'Cyclist_bev/moderate_R40': 0.0, 'Cyclist_bev/hard_R40': 0.0, 'Cyclist_image/easy_R40': 0.0, 'Cyclist_image/moderate_R40': 0.0, 'Cyclist_image/hard_R40': 0.0}
cur_epoch: 3/800, cur_it: 1/464

epochs:   0%|          | 2/800 [21:55<145:50:44, 657.95s/it]


KeyboardInterrupt: 