- 1. 如果根据使用技术区分
    - 1. 第一个阶段为使用pytorch、torch.autograd.Function编写的自定义函数
         - LSS
    - 2. 第二个阶段为使用了C++扩展
    - 3. 第三个阶段为使用了C++ 以及 cuda扩展
         - bevfusion
         - nvidia-bevfusion
        
https://pytorch.org/tutorials/advanced/cpp_extension.html#writing-a-mixed-c-cuda-extension

# 第一阶段LSS中的bev_pool
- 技术点：使用了custom function

In [2]:
import torch
import pickle

In [3]:
"""
x.pkl，是第一个sample的，QuickCumsumCuda中导出的x的数据
geom_feats.pkl，是第一个sample的，QuickCumsumCuda中导出的x的数据
ranks.pkl，是第一个sample的，QuickCumsumCuda中导出的x的数据
"""
with open("x.pkl", "rb") as f:
    content = f.read()
    x = pickle.loads(content)
x = x.to("cpu")
print(x.shape)

with open("geom_feats.pkl", "rb") as f:
    content = f.read()
    geom_feats = pickle.loads(content)
geom_feats = geom_feats.to("cpu")
print(geom_feats.shape)

with open("ranks.pkl", "rb") as f:
    content = f.read()
    ranks = pickle.loads(content)
ranks = ranks.to("cpu")
print(ranks.shape)

torch.Size([1834995, 80])
torch.Size([1834995, 4])
torch.Size([1834995])


In [5]:
class QuickCumsum(torch.autograd.Function):
    """Sum rows of ``x`` over runs of consecutive equal ``ranks`` via a cumsum.

    Equal ranks are expected to be adjacent (i.e. ``ranks`` sorted) for the
    result to be a per-rank sum -- TODO confirm against the caller.
    """

    @staticmethod
    def forward(ctx, x, geom_feats, ranks):
        """Return the per-run sums of ``x`` and the matching ``geom_feats`` rows.

        A row is kept when the next rank differs from its own; the final row
        is always kept.
        """
        running = x.cumsum(0)

        # True at the last row of every run of equal ranks.
        is_last = torch.ones(running.size(0), dtype=torch.bool, device=running.device)
        is_last[:-1] = ranks[1:] != ranks[:-1]

        run_ends = running[is_last]
        geom_feats = geom_feats[is_last]

        # Differences of consecutive run-end cumsums are the per-run sums;
        # the first run keeps its cumsum unchanged.
        pooled = torch.cat((run_ends[:1], run_ends[1:] - run_ends[:-1]))

        # Only the mask is needed in backward; x itself is not used there.
        ctx.save_for_backward(is_last)

        # geom_feats receives no gradient.
        ctx.mark_non_differentiable(geom_feats)

        return pooled, geom_feats

    @staticmethod
    def backward(ctx, gradx, gradgeom):
        """Give every input row the gradient of the pooled row of its run."""
        (is_last,) = ctx.saved_tensors

        # Inclusive count of run ends up to each row; subtracting the mask
        # shifts the run-end rows back so all rows of a run share one index.
        ends_so_far = torch.cumsum(is_last, 0)
        run_index = ends_so_far - is_last.to(ends_so_far.dtype)

        return gradx[run_index], None, None

In [6]:
# Sum rows of x over runs of consecutive equal ranks. x and geom_feats are
# rebound to the pooled results, so re-running this cell without reloading the
# inputs would pass the already-pooled x together with the original ranks.
x, geom_feats = QuickCumsum.apply(x, geom_feats, ranks)

In [7]:
# Shapes after pooling: one row per position kept by QuickCumsum.
print(x.shape)
print(geom_feats.shape)

torch.Size([107208, 80])
torch.Size([107208, 4])


# 后期CPU时间

In [None]:
import time

import torch

# Benchmark problem size.
batch_size = 16
input_features = 32
state_size = 128

X = torch.randn(batch_size, input_features)
h = torch.randn(batch_size, state_size)
C = torch.randn(batch_size, state_size)

# NOTE(review): LLTM is neither defined nor imported in this cell; it must
# already exist in the kernel namespace -- confirm where it comes from.
rnn = LLTM(input_features, state_size)

n_iters = 100000
forward = 0
backward = 0
for _ in range(n_iters):
    # perf_counter is the monotonic, high-resolution clock meant for
    # measuring short intervals; time.time() can jump with system clock changes.
    start = time.perf_counter()
    new_h, new_C = rnn(X, (h, C))
    forward += time.perf_counter() - start

    start = time.perf_counter()
    (new_h.sum() + new_C.sum()).backward()
    backward += time.perf_counter() - start

# Totals accumulated over all iterations, in seconds.
print('Forward: {:.3f} s | Backward {:.3f} s'.format(forward, backward))

# 后期GPU时间

In [None]:
# Imported here so the cell does not depend on an earlier cell having run.
import time

import torch

assert torch.cuda.is_available()
cuda_device = torch.device("cuda")  # device object representing GPU

# Benchmark problem size.
batch_size = 16
input_features = 32
state_size = 128

# Note the device=cuda_device arguments here
X = torch.randn(batch_size, input_features, device=cuda_device)
h = torch.randn(batch_size, state_size, device=cuda_device)
C = torch.randn(batch_size, state_size, device=cuda_device)

# NOTE(review): LLTM is neither defined nor imported in this cell; it must
# already exist in the kernel namespace -- confirm where it comes from.
rnn = LLTM(input_features, state_size).to(cuda_device)

# Wait for the setup work queued above so it is not charged to the first
# timed iteration.
torch.cuda.synchronize()

n_iters = 100000
forward = 0
backward = 0
for _ in range(n_iters):
    # perf_counter is the monotonic, high-resolution clock meant for
    # measuring short intervals.
    start = time.perf_counter()
    new_h, new_C = rnn(X, (h, C))
    # Wait for the queued GPU work to finish before stopping the timer.
    torch.cuda.synchronize()
    forward += time.perf_counter() - start

    start = time.perf_counter()
    (new_h.sum() + new_C.sum()).backward()
    torch.cuda.synchronize()
    backward += time.perf_counter() - start

# Mean time per iteration, in microseconds.
print('Forward: {:.3f} us | Backward {:.3f} us'.format(forward * 1e6 / n_iters, backward * 1e6 / n_iters))

ModuleNotFoundError: No module named 'bev_pool_ext'