In [1]:
import argparse

import numpy as np
import torch.utils.tensorboard
from torch.nn.utils import clip_grad_norm_

from pointCloud.utils.dataset import *
from pointCloud.utils.misc import *
from pointCloud.utils.data import *
from pointCloud.utils.transform import *
from pointCloud.models.autoencoder import *
from pointCloud.evaluation import EMD_CD

In [10]:
# Arguments
# All hyper-parameters live on an argparse namespace so the notebook mirrors a
# command-line training script; parse_args([]) below means only the defaults
# declared here are ever used.
parser = argparse.ArgumentParser()

# Model arguments
parser.add_argument('--latent_dim', type=int, default=256)
parser.add_argument('--num_steps', type=int, default=200)
parser.add_argument('--beta_1', type=float, default=1e-4)
parser.add_argument('--beta_T', type=float, default=0.05)
parser.add_argument('--sched_mode', type=str, default='linear')
parser.add_argument('--flexibility', type=float, default=0.0)
# NOTE(review): `type=eval` evaluates arbitrary text. It is harmless here only
# because parse_args([]) ignores sys.argv; replace it with an explicit bool
# parser before feeding these flags real command-line input.
parser.add_argument('--residual', type=eval, default=True, choices=[True, False])

# Checkpoint to resume from. Set the default to None to train from scratch.
# The path is a raw string: the previous plain literal contained backslash
# sequences such as '\P' and '\l', which are invalid escapes and trigger a
# SyntaxWarning on recent Python versions. The resulting value is unchanged.
# NOTE(review): machine-specific absolute path; make it configurable.
parser.add_argument(
    '--resume',
    type=str,
    default=r'D:\PycharmProjects\Replay_continual_learning_2\pointCloud\logs_ae\AE_2023_06_17__17_18_04\ckpt_0.000548_1000.pt',
)

# Datasets and loaders
parser.add_argument('--dataset_path', type=str, default='./data/shapenet.hdf5')
parser.add_argument('--categories', type=str_list, default=['airplane'])
parser.add_argument('--scale_mode', type=str, default='shape_unit')
parser.add_argument('--train_batch_size', type=int, default=128)
parser.add_argument('--val_batch_size', type=int, default=32)
parser.add_argument('--rotate', type=eval, default=False, choices=[True, False])

# Optimizer and scheduler
parser.add_argument('--lr', type=float, default=1e-3)
parser.add_argument('--weight_decay', type=float, default=0)
parser.add_argument('--max_grad_norm', type=float, default=10)
parser.add_argument('--end_lr', type=float, default=1e-4)
# THOUSAND is not defined in this notebook; presumably it comes from one of the
# star imports above (the logged namespace shows 150000 / 300000).
parser.add_argument('--sched_start_epoch', type=int, default=150*THOUSAND)
parser.add_argument('--sched_end_epoch', type=int, default=300*THOUSAND)

# Training
parser.add_argument('--seed', type=int, default=2020)
parser.add_argument('--logging', type=eval, default=True, choices=[True, False])
parser.add_argument('--log_root', type=str, default='./logs_ae')
parser.add_argument('--device', type=str, default='cuda')
# The default is float('inf'), so the main loop runs until it is interrupted.
parser.add_argument('--max_iters', type=int, default=float('inf'))
parser.add_argument('--val_freq', type=float, default=1000)
parser.add_argument('--tag', type=str, default=None)
parser.add_argument('--num_val_batches', type=int, default=-1)
parser.add_argument('--num_inspect_batches', type=int, default=1)
parser.add_argument('--num_inspect_pointclouds', type=int, default=4)

# Parse an empty argv so that only the defaults above apply inside the notebook.
args = parser.parse_args([])
# Seed the random number generators (via the project's seed_all helper) so that
# repeated runs are reproducible.
seed_all(args.seed)

In [7]:
# Logging
# Choose the logging sinks: real ones when --logging is enabled, stand-ins
# otherwise.
if not args.logging:
    # Logging disabled: logger without a log directory, and BlackHole objects in
    # place of the TensorBoard writer and checkpoint manager (presumably no-op
    # placeholders; see the project's misc utilities).
    logger = get_logger('train', None)
    writer = BlackHole()
    ckpt_mgr = BlackHole()
else:
    # Create a fresh run directory named 'AE_...' with an optional '_<tag>' suffix.
    log_dir = get_new_log_dir(
        args.log_root,
        prefix='AE_',
        postfix=('_' + args.tag) if args.tag is not None else '',
    )
    # Logger named 'train' bound to the run directory.
    logger = get_logger('train', log_dir)
    # TensorBoard writer that stores its event files in the run directory.
    writer = torch.utils.tensorboard.SummaryWriter(log_dir)
    # Manager used later to save checkpoints into the run directory.
    ckpt_mgr = CheckpointManager(log_dir)

# Record the complete run configuration in the log.
logger.info(args)


[2023-06-17 17:32:12,353::train::INFO] Namespace(latent_dim=256, num_steps=200, beta_1=0.0001, beta_T=0.05, sched_mode='linear', flexibility=0.0, residual=True, resume='D:\\PycharmProjects\\Replay_continual_learning_2\\pointCloud\\logs_ae\\AE_2023_06_17__17_18_04\\ckpt_0.000548_1000.pt', dataset_path='./data/shapenet.hdf5', categories=['airplane'], scale_mode='shape_unit', train_batch_size=128, val_batch_size=32, rotate=False, lr=0.001, weight_decay=0, max_grad_norm=10, end_lr=0.0001, sched_start_epoch=150000, sched_end_epoch=300000, seed=2020, logging=True, log_root='./logs_ae', device='cuda', max_iters=inf, val_freq=1000, tag=None, num_val_batches=-1, num_inspect_batches=1, num_inspect_pointclouds=4)
[2023-06-17 17:32:12,353::train::INFO] Namespace(latent_dim=256, num_steps=200, beta_1=0.0001, beta_T=0.05, sched_mode='linear', flexibility=0.0, residual=True, resume='D:\\PycharmProjects\\Replay_continual_learning_2\\pointCloud\\logs_ae\\AE_2023_06_17__17_18_04\\ckpt_0.000548_1000.pt',

In [8]:
# Datasets and loaders
# Random rotation is opt-in through --rotate; with the default (False) no
# transform is applied.
transform = RandomRotate(180, ['pointcloud'], axis=1) if args.rotate else None
logger.info('Transform: %s' % repr(transform))
logger.info('Loading datasets...')

# Scale modes, as described by the original author's note (NOTE(review): verify
# the supported names and their meaning against ShapeNetCore):
#   "shape_unit"                 - scale each cloud to a unit-size range.
#   "shape_sphere"               - scale each cloud into the unit sphere.
#   "shape_sphere_uniform_scale" - like "shape_sphere", keeping the aspect ratio.
# The default used by this notebook is "shape_unit".

# Build the training and validation splits (in that order) with identical
# settings; only the split name differs.
train_dset, val_dset = (
    ShapeNetCore(
        path=args.dataset_path,
        cates=args.categories,
        split=split,
        scale_mode=args.scale_mode,
        transform=transform,
    )
    for split in ('train', 'val')
)

# Wrap the training loader with get_data_iterator so that train() can keep
# calling next() on it; per the original note the iterator restarts the loader
# whenever it is exhausted, giving an endless stream of batches.
train_iter = get_data_iterator(
    DataLoader(train_dset, batch_size=args.train_batch_size, num_workers=0)
)

# Plain loader over the validation split.
val_loader = DataLoader(
    val_dset,
    batch_size=args.val_batch_size,
    num_workers=0,
)

[2023-06-17 17:32:12,575::train::INFO] Transform: None
[2023-06-17 17:32:12,575::train::INFO] Transform: None
[2023-06-17 17:32:12,577::train::INFO] Loading datasets...
[2023-06-17 17:32:12,577::train::INFO] Loading datasets...


In [9]:
# Model
logger.info('Building model...')
if args.resume is not None:
    # Resume: rebuild the network from the hyper-parameters stored in the
    # checkpoint (not from the current `args`) and restore its weights.
    logger.info('Resuming from checkpoint...')
    # map_location lets a checkpoint saved on one device (e.g. CUDA) be loaded
    # when args.device is a different one (e.g. CPU); without it torch.load
    # tries to restore tensors onto the device they were saved from.
    # NOTE(review): torch.load unpickles the file - only load trusted checkpoints.
    ckpt = torch.load(args.resume, map_location=args.device)
    model = AutoEncoder(ckpt['args']).to(args.device)
    model.load_state_dict(ckpt['state_dict'])
    # NOTE(review): only the model weights are restored here; the optimizer and
    # scheduler are created fresh in the next cell.
else:
    # No checkpoint given: build a new model from the current configuration.
    model = AutoEncoder(args).to(args.device)
logger.info(repr(model))

[2023-06-17 17:32:15,618::train::INFO] Building model...
[2023-06-17 17:32:15,618::train::INFO] Building model...
[2023-06-17 17:32:15,620::train::INFO] Resuming from checkpoint...
[2023-06-17 17:32:15,620::train::INFO] Resuming from checkpoint...
[2023-06-17 17:32:17,882::train::INFO] AutoEncoder(
  (encoder): PointNetEncoder(
    (conv1): Conv1d(3, 128, kernel_size=(1,), stride=(1,))
    (conv2): Conv1d(128, 128, kernel_size=(1,), stride=(1,))
    (conv3): Conv1d(128, 256, kernel_size=(1,), stride=(1,))
    (conv4): Conv1d(256, 512, kernel_size=(1,), stride=(1,))
    (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (bn3): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (bn4): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (fc1_m): Linear(in_features=512, out_features=256, bias=

In [10]:
# Optimizer and scheduler
# Adam over all model parameters, using the configured learning rate and
# weight decay.
optimizer = torch.optim.Adam(
    model.parameters(), lr=args.lr, weight_decay=args.weight_decay
)

# Linear learning-rate schedule going from args.lr to args.end_lr between the
# configured start and end steps (exact semantics are defined by the project's
# get_linear_scheduler helper).
scheduler = get_linear_scheduler(
    optimizer,
    start_lr=args.lr,
    end_lr=args.end_lr,
    start_epoch=args.sched_start_epoch,
    end_epoch=args.sched_end_epoch,
)

In [11]:
# Train, validate
def train(it):
    """Run one optimisation step on the next training batch and log it.

    Uses the notebook-level ``train_iter``, ``model``, ``optimizer``,
    ``scheduler``, ``logger`` and ``writer``.

    Args:
        it: Current iteration number; used in the log line and as the
            TensorBoard step.
    """
    # Pull the next batch and move its point clouds to the training device.
    points = next(train_iter)['pointcloud'].to(args.device)

    # Put the model in training mode and clear gradients from the previous step.
    model.train()
    optimizer.zero_grad()

    # Forward pass: the model computes its own training loss.
    loss = model.get_loss(points)

    # Backward pass. Gradients are clipped to args.max_grad_norm;
    # clip_grad_norm_ returns the norm measured before clipping.
    loss.backward()
    orig_grad_norm = clip_grad_norm_(model.parameters(), args.max_grad_norm)
    optimizer.step()   # apply the parameter update
    scheduler.step()   # advance the learning-rate schedule

    # Report the step to the text log and to TensorBoard.
    logger.info('[Train] Iter %04d | Loss %.6f | Grad %.4f ' % (it, loss.item(), orig_grad_norm))
    for tag, value in (
        ('train/loss', loss),
        ('train/lr', optimizer.param_groups[0]['lr']),
        ('train/grad_norm', orig_grad_norm),
    ):
        writer.add_scalar(tag, value, it)
    writer.flush()  # push pending events to disk

In [12]:
def validate_loss(it):
    """Reconstruct validation point clouds and report CD / EMD.

    Every validation batch (or only the first ``args.num_val_batches`` when
    that value is positive) is encoded and decoded. References and
    reconstructions are mapped back with the batch's ``scale`` and ``shift``
    before the metrics are computed.

    Args:
        it: Current iteration number; used in the log line and as the
            TensorBoard step.

    Returns:
        The 'MMD-CD' metric as a Python float.
    """
    ref_clouds, recon_clouds = [], []
    for batch_idx, batch in enumerate(tqdm(val_loader, desc='Validate')):
        # Honour the optional cap on the number of validation batches.
        if 0 < args.num_val_batches <= batch_idx:
            break

        ref = batch['pointcloud'].to(args.device)
        shift = batch['shift'].to(args.device)
        scale = batch['scale'].to(args.device)

        # Encode then decode without tracking gradients, in evaluation mode.
        with torch.no_grad():
            model.eval()
            recons = model.decode(
                model.encode(ref), ref.size(1), flexibility=args.flexibility
            )

        # Undo the per-sample scaling and shifting (presumably the dataset's
        # normalisation) so the metrics are computed on restored coordinates.
        ref_clouds.append(ref * scale + shift)
        recon_clouds.append(recons * scale + shift)

    # Concatenate the batches and compute the metrics over all collected clouds.
    metrics = EMD_CD(
        torch.cat(recon_clouds, dim=0),
        torch.cat(ref_clouds, dim=0),
        batch_size=args.val_batch_size,
    )
    cd = metrics['MMD-CD'].item()
    emd = metrics['MMD-EMD'].item()

    # Report to the text log and to TensorBoard.
    logger.info('[Val] Iter %04d | CD %.6f | EMD %.6f  ' % (it, cd, emd))
    for tag, value in (('val/cd', cd), ('val/emd', emd)):
        writer.add_scalar(tag, value, it)
    writer.flush()

    return cd

In [13]:
def validate_inspect(it):
    """Log a few reconstructed validation point clouds to TensorBoard.

    Encodes and decodes the first ``args.num_inspect_batches`` validation
    batches and writes up to ``args.num_inspect_pointclouds`` reconstructions
    from the last of those batches under the tag ``val/pointcloud``.

    Changes from the previous version:
      * the batch limit is checked before a batch is processed, so exactly
        ``num_inspect_batches`` batches are reconstructed (it used to be one
        more than requested);
      * the unused ``sum_n`` / ``sum_chamfer`` accumulators are gone;
      * inference runs under ``torch.no_grad()`` regardless of the caller;
      * nothing is logged (instead of raising NameError) when no batch was
        processed.

    Args:
        it: Current iteration number, used as the TensorBoard global step.
    """
    recons = None
    model.eval()
    with torch.no_grad():
        for i, batch in enumerate(tqdm(val_loader, desc='Inspect')):
            # Stop once the requested number of batches has been inspected.
            if i >= args.num_inspect_batches:
                break
            x = batch['pointcloud'].to(args.device)
            code = model.encode(x)
            # Only the reconstruction of the last inspected batch is kept.
            recons = model.decode(code, x.size(1), flexibility=args.flexibility).detach()

    if recons is None:
        # Empty loader or num_inspect_batches <= 0: nothing to visualise.
        return

    # Visualise the first few reconstructions as a point-cloud mesh.
    writer.add_mesh('val/pointcloud', recons[:args.num_inspect_pointclouds], global_step=it)
    writer.flush()

In [12]:
# Main loop
# Train until args.max_iters is reached (infinite by default) or the user
# interrupts the kernel.
logger.info('Start training...')
try:
    it = 1  # iterations are counted from 1
    while True:
        if it > args.max_iters:
            break

        # One optimisation step.
        train(it)

        # Validate and checkpoint every args.val_freq iterations and on the
        # final iteration.
        if it == args.max_iters or it % args.val_freq == 0:
            with torch.no_grad():
                cd_loss = validate_loss(it)   # CD on the validation split
                validate_inspect(it)          # log sample reconstructions
            # Optimizer and scheduler states are handed to the checkpoint
            # manager alongside the model, the arguments and the CD value.
            opt_states = dict(
                optimizer=optimizer.state_dict(),
                scheduler=scheduler.state_dict(),
            )
            ckpt_mgr.save(model, args, cd_loss, opt_states, step=it)

        it += 1

except KeyboardInterrupt:
    # A manual interrupt ends training without a traceback.
    logger.info('Terminating...')

[2023-06-17 16:48:59,238::train::INFO] Start training...
[2023-06-17 16:49:02,661::train::INFO] [Train] Iter 0001 | Loss 0.592925 | Grad 1.6427 
[2023-06-17 16:49:03,395::train::INFO] [Train] Iter 0002 | Loss 0.584033 | Grad 2.1974 
[2023-06-17 16:49:04,379::train::INFO] [Train] Iter 0003 | Loss 0.608077 | Grad 1.0924 
[2023-06-17 16:49:05,287::train::INFO] [Train] Iter 0004 | Loss 0.558488 | Grad 0.5156 
[2023-06-17 16:49:06,447::train::INFO] [Train] Iter 0005 | Loss 0.564405 | Grad 0.3766 
[2023-06-17 16:49:07,490::train::INFO] [Train] Iter 0006 | Loss 0.523171 | Grad 0.6145 
[2023-06-17 16:49:08,558::train::INFO] [Train] Iter 0007 | Loss 0.673214 | Grad 0.5378 
[2023-06-17 16:49:10,619::train::INFO] [Train] Iter 0008 | Loss 0.569181 | Grad 0.3076 
[2023-06-17 16:49:12,353::train::INFO] [Train] Iter 0009 | Loss 0.370654 | Grad 0.2336 
[2023-06-17 16:49:15,482::train::INFO] [Train] Iter 0010 | Loss 0.540885 | Grad 0.3158 
[2023-06-17 16:49:18,425::train::INFO] [Train] Iter 0011 | Loss

In [25]:
# Debug cell: walk the validation loader once and print each batch index to
# see how many batches it yields.
for i, batch in enumerate(tqdm(val_loader, desc='Inspect')):
    print(i)

Inspect:   0%|          | 0/19 [00:00<?, ?it/s]

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18


In [4]:
import open3d as o3d
import numpy as np
# Load the generated point clouds and open the first one in the Open3D viewer.
# NOTE(review): machine-specific absolute path; make it configurable.
gen_pcs = np.load(r'D:\PycharmProjects\Replay_continual_learning_2\pointCloud\results\GEN_Ours_car_1687072132\out.npy')
# Slicing to one element shows only the first cloud (nothing if the file is empty).
for x in gen_pcs[:1]:
    pcd = o3d.geometry.PointCloud(o3d.utility.Vector3dVector(x))
    o3d.visualization.draw_geometries([pcd])

Jupyter environment detected. Enabling Open3D WebVisualizer.
[Open3D INFO] WebRTC GUI backend enabled.
[Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.


In [11]:
from pointCloud.utils.dataset import ShapeNetCore
# Training split of the 'car' category, loaded with the same scale mode as the
# main experiment and without any transform; used by the inspection cells below.
train_d = ShapeNetCore(
    path='./data/shapenet.hdf5', cates=['car'], split='train',
    scale_mode='shape_unit', transform=None,
)


In [6]:
# Take the first sample of the car training split for inspection.
x = train_d[0]

In [12]:
# Number of samples in the car training split.
len(train_d)

2986

In [15]:
# Shape of the sample's point tensor; the recorded output is
# torch.Size([2048, 3]), i.e. 2048 points with 3 coordinates each.
x['pointcloud'].shape

torch.Size([2048, 3])