In [1]:
import torch
import torch.nn.functional as F
import pytest
from sparsetriton.tensor import SparseTensor, randn
from sparsetriton.nn.functional import sparse_conv3d

@pytest.mark.parametrize("C_in, C_out, kernel_size, stride, padding, dilation", [
    (8, 16, 3, 1, 1, 1),
    (4, 8, 3, 2, 1, 1),
    (16, 16, 5, 1, 2, 1),
])
def test_sparse_conv3d_vs_torch_dense(C_in, C_out, kernel_size, stride, padding, dilation):
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # 1. Create input sparse tensor
    spatial_shape = (3, 3, 3)
    st_tensor = randn(spatial_shape, batch_size=1, channels=C_in, nnz=27, device=device)

    # st_tensor.F = torch.ones_like(st_tensor.F)

    # 2. Create convolution weight (K, C_in, C_out)
    weight = torch.rand(kernel_size**3, C_in, C_out, device=device)
    weight.requires_grad = True
    

    # 3. Run sparsetriton convolution (submanifold=False for full comparison)
    st_out_tensor = sparse_conv3d(
        st_tensor.half(),
        weight.half(),
        kernel_size=kernel_size,
        stride=stride,
        padding=padding,
        dilation=dilation,
        submanifold=False,
        transposed=True
    ).float()

    st_out_tensor.F.sum().backward()
    grad1 = weight.grad.clone()
    print(grad1[0, :10, 0])
    weight.grad = None
    
    # 4. Run torch dense convolution
    # Weight: (K, C_in, C_out) -> (C_out, C_in, k, k, k)
    k = kernel_size
    weight_torch = weight.view(k, k, k, C_in, C_out).permute(3, 4, 0, 1, 2).contiguous()
    # weight_torch = weight.view(k, k, k, C_in, C_out).permute(4, 3, 1, 0, 2).contiguous()
    # weight_torch = weight.view(k, k, k, C_in, C_out).permute(4, 3, 0, 2, 1).contiguous()
    # weight_torch = weight.view(k, k, k, C_in, C_out).permute(4, 3, 2, 1, 0).contiguous()
    # weight_torch = weight.view(k, k, k, C_in, C_out).permute(4, 3, 2, 0, 1).contiguous()
    # weight_torch = weight.view(k, k, k, C_in, C_out).permute(4, 3, 1, 2, 0).contiguous()
    
    # Input: Sparse -> Dense (N, D, H, W, C) -> (N, C, D, H, W)
    dense_input = st_tensor.dense().permute(0, 4, 1, 2, 3).contiguous()
    
    dense_output = F.conv_transpose3d(
        dense_input.half(),
        weight_torch.half(),
        stride=stride,
        padding=padding,
        dilation=dilation
    ).float()
    dense_output.sum().backward()
    grad2 = weight.grad.clone()
    print(grad2[0, :10, 0])
    # 5. Compare dense results
    # st_out_tensor.dense() is (N, D_out, H_out, W_out, C_out)
    # dense_output is (N, C_out, D_out, H_out, W_out)
    st_dense_output = st_out_tensor.dense()
    torch_dense_output = dense_output.permute(0, 2, 3, 4, 1).contiguous()
    # print(st_dense_output.nonzero())
    # print(torch_dense_output.nonzero())
    # print(st_dense_output[..., -1])
    # print(torch_dense_output[..., -1])
    print((st_dense_output - torch_dense_output).abs().max())
    assert st_dense_output.shape == torch_dense_output.shape, \
        f"Shape mismatch: {st_dense_output.shape} vs {torch_dense_output.shape}"
        
    assert torch.allclose(st_dense_output, torch_dense_output, atol=1e-3, rtol=1e-3), \
        f"Feature values mismatch. Max diff: {(st_dense_output - torch_dense_output).abs().max()}"
    
test_sparse_conv3d_vs_torch_dense(* (16, 16, 3, 2, 1, 1))

tensor([-5.5508, -5.9297, -0.9688,  1.6465,  2.9766, -0.1832, -0.6606,  3.7852,
        -1.1699, -2.2598], device='cuda:0')
tensor([-5.5508, -5.9297, -0.9688,  1.6465,  2.9766, -0.1832, -0.6606,  3.7852,
        -1.1699, -2.2598], device='cuda:0')
tensor(0., device='cuda:0', grad_fn=<MaxBackward1>)


In [2]:
from sparsetriton.nn.modules.conv import Conv3d
from sparsetriton.nn.modules.activation import ReLU
from sparsetriton import SparseTensor
from sparsetriton.tensor import randn
from tqdm import tqdm
from torch import nn
import torch

class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = Conv3d(16, 64, (3, 3, 3), 1, 1)
        self.relu1 = ReLU()
        self.conv2 = Conv3d(64, 64, (3, 3, 3), 1, 1)
        self.relu2 = ReLU()
        self.conv3 = Conv3d(64, 64, (3, 3, 3), 1, 1)
        self.relu3 = ReLU()
        self.conv4 = Conv3d(64, 64, (3, 3, 3), 1, 1)
        self.relu4 = ReLU()
        self.conv5 = Conv3d(64, 1, (3, 3, 3), 1, 1) # 변수명 중복 수정

    def forward(self, x):
        # SparseTensor의 특징값(.F)에 대해서만 ReLU를 적용하고 교체(replace)합니다.
        x = self.conv1(x)
        x = self.relu1(x)
        
        x = self.conv2(x)
        x = self.relu2(x)
        
        x = self.conv3(x)
        x = self.relu3(x)
        
        x = self.conv4(x)
        x = self.relu4(x)
        
        x = self.conv5(x)
        return x
net = Net().to("cuda")

In [3]:
optim = torch.optim.Adam(net.parameters())

# x = randn((512, 512, 512), 10, 16, 512**3 // 100).to("cuda")
y = 10

for _ in tqdm(range(10000)):
    x = randn((512, 512, 512), 1, 16, 512**3 // 30, device = "cuda")
    optim.zero_grad()
    out = net(x)
    loss = (out.F - y) ** 2
    loss = loss.mean()
    loss.backward()
    optim.step()

  0%|          | 7/10000 [00:31<12:17:35,  4.43s/it]


KeyboardInterrupt: 

In [3]:
conv = Conv3d(16, 64, (3, 3, 3), 1, 1).to("cuda")

In [4]:
conv(test_tensor)


AcceleratorError: CUDA error: an illegal memory access was encountered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [15]:
torch.iinfo(torch.uint32).max / (1536**3)

1.1851851849092379

In [2]:
from sparsetriton.tensor import SparseTensor, randn
from sparsetriton.utils.hash import HashTable, flatten_coord, hash_coords, unflatten_coord
from sparsetriton.nn.functional.conv.kmap import get_neighbor_map
from torchsparse import SparseTensor as TSparseTensor
from torchsparse.nn import Conv3d

In [3]:
import torch
x = randn(batch_size=1, spatial_shape=(512, 512, 512), nnz=512 ** 3 // 10, device='cuda')

sp_x = TSparseTensor(
    x.F.clone(),
    coords=x.C.int().clone(),
    
)

In [4]:
sp_conv3d = Conv3d(1, 32, kernel_size=3, stride=2, padding=0).cuda()

In [5]:
sp_out = sp_conv3d(sp_x)

In [6]:
kmaps = list(sp_x._caches.kmaps.values())[0]


In [7]:
sp_x.C.shape

torch.Size([13421772, 4])

In [12]:
15469544 / 13421772

1.152570912395174

In [8]:
kmaps["coords"].shape

torch.Size([15469544, 4])

In [14]:
sp_out.C.shape

torch.Size([13421772, 4])

In [12]:
kmaps["out_in_map"].shape

torch.Size([13421824, 27])

In [5]:
sp_conv3d.named_parameters().__next__()[1].shape

torch.Size([27, 1, 32])

In [5]:
import torchsparse
import torchsparse.tensor
from tqdm import tqdm

In [6]:
512 ** 3 // 10

13421772

In [6]:
for _ in tqdm(range(1)):
    sp_out = sp_conv3d(sp_x)

NameError: name 'tqdm' is not defined

In [None]:
13421824 / 13421772

1.0000038743021413

In [9]:
list(sp_x._caches.kmaps.values())[0]["out_in_map"].shape

torch.Size([13421824, 27])

In [6]:
import triton
import triton.language as tl

@triton.jit

def implicit_gemm_kernel(
    features_ptr, weights_ptr, in_out_map, out_ptr,
    N, C_in, C_out,
    BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_C_OUT: tl.constexpr,
    BLOCK_SIZE_C_IN: tl.constexpr, K_VOL: tl.constexpr
):
    """
    Sparse Convolution Implicit GEMM Kernel
    """
    # 프로그램 ID (N축과 C_out축에 대한 타일링)
    pid_n = tl.program_id(0)
    pid_cout = tl.program_id(1)

    off_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    off_cout = pid_cout * BLOCK_SIZE_C_OUT + tl.arange(0, BLOCK_SIZE_C_OUT)
    
    mask_n = off_n < N
    mask_cout = off_cout < C_out

    # 누적할 출력 버퍼 초기화 (FP32 권장)
    acc = tl.zeros((BLOCK_SIZE_N, BLOCK_SIZE_C_OUT), dtype=tl.float32)

    # Kernel Volume(K^3) 루프
    for k in range(K_VOL):
        # 1. 현재 커널 위치에서의 이웃 인덱스 로드
        # neighbor_map_ptr shape: (N, K_VOL)
        n_idx = tl.load(in_out_map + off_n * K_VOL + k, mask=mask_n, other=-1)
        
        # 2. 유효한 이웃이 있는 경우만 연산 (Masking)
        # n_idx가 -1인 경우 실제 연산에서 제외되도록 마스크 생성
        valid_mask = (n_idx >= 0) & mask_n

        # 3. Inner Loop: C_in 방향으로 Dot Product (GEMM)
        for c_in_off in range(0, C_in, BLOCK_SIZE_C_IN):
            off_cin = c_in_off + tl.arange(0, BLOCK_SIZE_C_IN)
            mask_cin = off_cin < C_in

            # Features 로드: (BLOCK_SIZE_N, BLOCK_SIZE_C_IN)
            # n_idx는 각 n에 대한 이웃의 절대 인덱스임
            f_tile = tl.load(
                features_ptr + n_idx[:, None] * C_in + off_cin[None, :],
                mask=valid_mask[:, None] & mask_cin[None, :],
                other=0.0
            )

            # Weights 로드: (BLOCK_SIZE_C_IN, BLOCK_SIZE_C_OUT)
            # Weight shape: (K_VOL, C_in, C_out)
            w_tile = tl.load(
                weights_ptr + (k * C_in * C_out) + (off_cin[:, None] * C_out + off_cout[None, :]),
                mask=mask_cin[:, None] & mask_cout[None, :],
                other=0.0
            )

            # Matrix Multiply-Accumulate
            acc += tl.dot(f_tile, w_tile)

    # 최종 결과 저장
    out_off = off_n[:, None] * C_out + off_cout[None, :]
    tl.store(out_ptr + out_off, acc.to(out_ptr.dtype.element_ty), mask=mask_n[:, None] & mask_cout[None, :])

def sparse_conv_implicit_gemm(features, weights, neighbor_map):
    N, C_in = features.shape
    K_vol, _, C_out = weights.shape
    
    output = torch.empty((N, C_out), device=features.device, dtype=features.dtype)

    # 튜닝이 필요한 하이퍼파라미터
    BLOCK_SIZE_N = 128
    BLOCK_SIZE_C_OUT = 64
    BLOCK_SIZE_C_IN = 32

    grid = (
        triton.cdiv(N, BLOCK_SIZE_N),
        triton.cdiv(C_out, BLOCK_SIZE_C_OUT)
    )

    implicit_gemm_kernel[grid](
        features, weights, neighbor_map, output,
        N, C_in, C_out,
        BLOCK_SIZE_N=BLOCK_SIZE_N,
        BLOCK_SIZE_C_OUT=BLOCK_SIZE_C_OUT,
        BLOCK_SIZE_C_IN=BLOCK_SIZE_C_IN,
        K_VOL=K_vol
    )
    return output

In [9]:
sp_out.F.shape

torch.Size([13421772, 32])

In [38]:
weights = next(sp_conv3d.named_parameters())[1]

In [39]:
for _ in tqdm(range(1)):
    sp_out = sp_conv3d(sp_x)

100%|██████████| 1/1 [00:00<00:00, 101.57it/s]


In [45]:
import torch

ht = HashTable(capacity=len(x.coords) * 2, device='cuda')
ht.insert(x.coords)
near_map = get_neighbor_map(x.coords, ht, 3)

for _ in tqdm(range(1)):
    new_feats = sparse_conv_implicit_gemm(
        x.feats, 
        weights,
        near_map
    )

100%|██████████| 1/1 [00:00<00:00, 2910.69it/s]


In [52]:
x.C[9377932], x.C[13421771]

(tensor([  0, 467, 324,  96], device='cuda:0', dtype=torch.int16),
 tensor([  0, 468, 325,  95], device='cuda:0', dtype=torch.int16))

In [49]:
near_map[-1]

tensor([      -1,       -1,  9377932,       -1,       -1,       -1,       -1,
              -1,       -1,       -1,       -1,       -1,       -1, 13421771,
              -1,       -1,       -1,       -1,       -1,  3634089,       -1,
        13180638,       -1,       -1,       -1,       -1,       -1],
       device='cuda:0')

In [54]:
x.feats

tensor([[-1.0958],
        [-0.2044],
        [-0.1956],
        ...,
        [ 0.2063],
        [ 1.8992],
        [-0.2374]], device='cuda:0')

In [46]:
new_feats

tensor([[ 0.1162, -0.1180,  0.0080,  ...,  0.0388,  0.0006, -0.1098],
        [-0.0447,  0.0252, -0.0847,  ...,  0.1661, -0.1428,  0.0587],
        [-0.1634,  0.2396, -0.0615,  ...,  0.2173, -0.5398, -0.2531],
        ...,
        [-0.0839,  0.1356,  0.0647,  ...,  0.0978,  0.0993,  0.1051],
        [-0.1454,  0.2487,  0.0021,  ..., -0.1271,  0.0118,  0.2781],
        [-0.3452,  0.3124, -0.3670,  ...,  0.0354,  0.0554,  0.2061]],
       device='cuda:0')

In [43]:
sp_out.F

tensor([[ 0.0839, -0.1435, -0.0012,  ...,  0.0734, -0.0068, -0.1605],
        [ 0.0499, -0.1520, -0.0876,  ...,  0.0707,  0.1188, -0.0122],
        [ 0.3973, -0.0955,  0.0447,  ...,  0.0290, -0.0794, -0.2195],
        ...,
        [ 0.1722, -0.0536,  0.1265,  ..., -0.2103, -0.1825, -0.0393],
        [-0.1454,  0.2487,  0.0021,  ..., -0.1271,  0.0118,  0.2781],
        [-0.2518, -0.0438, -0.0515,  ...,  0.1507,  0.1332, -0.3083]],
       device='cuda:0', grad_fn=<ImplicitGEMMConvolutionFuntionBackward>)

In [37]:
x.C

tensor([[  0, 428,  78,  93],
        [  0, 491, 265, 260],
        [  0,  78,  15, 461],
        ...,
        [  0, 481, 225, 405],
        [  0, 335, 247, 354],
        [  0, 468, 325,  95]], device='cuda:0', dtype=torch.int16)

In [53]:
sp_out.F

tensor([[ 0.0839, -0.1435, -0.0012,  ...,  0.0734, -0.0068, -0.1605],
        [ 0.0499, -0.1520, -0.0876,  ...,  0.0707,  0.1188, -0.0122],
        [ 0.3973, -0.0955,  0.0447,  ...,  0.0290, -0.0794, -0.2195],
        ...,
        [ 0.1722, -0.0536,  0.1265,  ..., -0.2103, -0.1825, -0.0393],
        [-0.1454,  0.2487,  0.0021,  ..., -0.1271,  0.0118,  0.2781],
        [-0.2518, -0.0438, -0.0515,  ...,  0.1507,  0.1332, -0.3083]],
       device='cuda:0', grad_fn=<ImplicitGEMMConvolutionFuntionBackward>)

In [34]:
sp_out.F - new_feats[0]

tensor([[-0.0323, -0.0255, -0.0092,  ...,  0.0345, -0.0074, -0.0507],
        [ 0.0946, -0.1772, -0.0030,  ..., -0.0953,  0.2616, -0.0709],
        [ 0.5607, -0.3351,  0.1062,  ..., -0.1883,  0.4604,  0.0336],
        ...,
        [ 0.2560, -0.1893,  0.0618,  ..., -0.3081, -0.2817, -0.1444],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0934, -0.3563,  0.3154,  ...,  0.1153,  0.0777, -0.5143]],
       device='cuda:0', grad_fn=<SubBackward0>)

In [44]:
new_feats[0]

tensor([[ 0.1162, -0.1180,  0.0080,  ...,  0.0388,  0.0006, -0.1098],
        [-0.0447,  0.0252, -0.0847,  ...,  0.1661, -0.1428,  0.0587],
        [-0.1634,  0.2396, -0.0615,  ...,  0.2173, -0.5398, -0.2531],
        ...,
        [-0.0839,  0.1356,  0.0647,  ...,  0.0978,  0.0993,  0.1051],
        [-0.1454,  0.2487,  0.0021,  ..., -0.1271,  0.0118,  0.2781],
        [-0.3452,  0.3124, -0.3670,  ...,  0.0354,  0.0554,  0.2061]],
       device='cuda:0')

In [None]:
for _ in tqdm(range(10000)):
    new_feats = sparse_conv_implicit_gemm(
        x.feats, 
        weights,
        near_map
    )

  0%|          | 0/10000 [00:00<?, ?it/s]


AcceleratorError: CUDA error: an illegal memory access was encountered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
ht = HashTable(capacity=len(x.coords) * 2, device='cuda')
ht.insert(x.coords)
near_map = get_neighbor_map(x.coords, ht, 3)

In [None]:
x.feats

tensor([[ 0.3570],
        [ 0.7091],
        [-0.4729],
        ...,
        [-1.3130],
        [-0.6882],
        [ 0.4037]], device='cuda:0')

In [None]:
import torch
x = SparseTensor(
    feats=torch.ones(3, 1, device='cuda'),
    coords=torch.tensor([[0,0,0,0],[0,0,0,10],[0,0,1,0]], device='cuda')
)
ht = HashTable(capacity=len(x.coords) * 2, device='cuda')
ht.insert(x.coords)
near_map = get_neighbor_map(x.coords, ht, 3)
weights = torch.ones(27, x.feats.shape[1], 64, device='cuda')
new_feats = sparse_conv_implicit_gemm(
    x.feats, 
    weights,
    near_map
)

In [None]:
x.coords

tensor([[ 0,  0,  0,  0],
        [ 0,  0,  0, 10],
        [ 0,  0,  1,  0]], device='cuda:0', dtype=torch.int16)

In [None]:
new_feats

tensor([[2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
         2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
         2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
         2., 2., 2., 2., 2., 2., 2., 2., 2., 2.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
         2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
         2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
         2., 2., 2., 2., 2., 2., 2., 2., 2., 2.]], device='cuda:0')

In [None]:
from torch.nn.functional import conv3d

dx = x.dense()
dx_input = dx.permute(0, 4, 1, 2, 3).contiguous()
print(dx_input.stride())
weights_5d = weights.view(3, 3, 3, x.feats.shape[1], 64)

# 4. 차원 순서 변경 (Permute)
# PyTorch 규격: (Out_Channels, In_Channels, kD, kH, kW)
# 현재: (kD, kH, kW, In, Out) -> [3, 3, 3, 1, 64]
# 목표 인덱스 순서: (4, 3, 0, 1, 2)
weights_5d = weights_5d.permute(4, 3, 0, 1, 2).contiguous()
output = conv3d(dx_input, weights_5d, padding=1, stride=1)

(262144, 1, 4096, 64, 1)


In [None]:
new_feats

tensor([[ 0.2876,  0.1467,  0.5692,  ..., -0.6065, -0.0976,  0.4856],
        [ 0.5712,  0.2915,  1.1306,  ..., -1.2047, -0.1939,  0.9645],
        [-0.3810, -0.1944, -0.7541,  ...,  0.8036,  0.1293, -0.6433],
        ...,
        [-1.0574, -0.5396, -2.0931,  ...,  2.2302,  0.3589, -1.7855],
        [-0.5543, -0.2829, -1.0972,  ...,  1.1690,  0.1881, -0.9359],
        [ 0.3251,  0.1659,  0.6436,  ..., -0.6857, -0.1103,  0.5490]],
       device='cuda:0')

In [None]:
output.transpose

<function Tensor.transpose>

In [None]:
print(f"Input shape: {dx_input.shape}")   # (B, C_in, D, H, W) 형태여야 함
print(f"Weight shape: {weights.shape}")    # (C_out, C_in, k, k, k) 형태여야 함

Input shape: torch.Size([10, 1, 64, 64, 64])
Weight shape: torch.Size([27, 1, 64])


In [None]:
dx.shape.t

torch.Size([10, 64, 64, 64, 1])

In [None]:
new_feats

tensor([[ 0.2876,  0.1467,  0.5692,  ..., -0.6065, -0.0976,  0.4856],
        [ 0.5712,  0.2915,  1.1306,  ..., -1.2047, -0.1939,  0.9645],
        [-0.3810, -0.1944, -0.7541,  ...,  0.8036,  0.1293, -0.6433],
        ...,
        [-1.0574, -0.5396, -2.0931,  ...,  2.2302,  0.3589, -1.7855],
        [-0.5543, -0.2829, -1.0972,  ...,  1.1690,  0.1881, -0.9359],
        [ 0.3251,  0.1659,  0.6436,  ..., -0.6857, -0.1103,  0.5490]],
       device='cuda:0')

In [None]:
(near_map[:, 0] != -1).float().mean()

tensor(0.0040, device='cuda:0')

In [None]:
len(set((flatten_coord(x.coords) % (1024 * 20)).cpu().numpy().tolist()))

320

In [None]:
len(set((flatten_coord(x.coords)).cpu().numpy().tolist()))

10219

In [None]:
import torch
from tqdm import tqdm
hts = []

for _ in tqdm(range(1)):
    x = randn(batch_size=10, spatial_shape=(512, 512, 512), nnz=512**3 // 10, device='cuda')
    ht = HashTable(capacity=512**3 // 10 * 4,  device='cuda')
    ht.insert(x.coords)
    ht.table_keys
    ht.to("cuda")
    ht.query(x.coords)


100%|██████████| 1/1 [00:00<00:00, 11.42it/s]


In [None]:
ht.query(x.coords)

tensor([       0,        1,        2,  ..., 13421769, 13421770, 13421771],
       device='cuda:0')

In [None]:
x = randn(batch_size=10, spatial_shape=(64, 64, 64), nnz=1024 * 100, device='cuda')
ht = HashTable(capacity=1024*200,  device='cpu')
ht.insert(x.coords)

NameError: name 'randn' is not defined

In [None]:
import torch
from tqdm import tqdm
for _ in tqdm(range(1000)):
    x = randn(batch_size=10, spatial_shape=(64, 64, 64), nnz=1024 * 1000, device='cuda')
    ht = HashTable(capacity=1024*20000,  device='cuda')
    ht.insert(x.coords)
    ht.table_keys
    # ht.query(x.coords)
torch.cuda.empty_cache()

100%|██████████| 1000/1000 [00:05<00:00, 181.18it/s]


In [None]:
import torch
from tqdm import tqdm
for _ in tqdm(range(10)):
    x = randn(batch_size=10, spatial_shape=(64, 64, 64), nnz=1024 * 1000, device='cuda')
    ht = HashTable(capacity=1024*20000,  device='cpu')
    ht.insert(x.coords)
    ht.table_keys
    # ht.query(x.coords)
torch.cuda.empty_cache()

100%|██████████| 10/10 [00:01<00:00,  9.92it/s]


In [None]:
ht.table_values.device


device(type='cuda', index=0)

In [None]:
torch.cuda.empty_cache()

In [None]:
import torch
from tqdm import tqdm
for _ in tqdm(range(100000)):
    x = randn(batch_size=10, spatial_shape=(64, 64, 64), nnz=1024 * 10, device='cuda')
    ht = HashTable(capacity=1024*12,  device='cuda')
    ht.insert(x.coords)
    ht.table_keys
    ht.query(x.coords)

  0%|          | 114/100000 [00:00<02:55, 567.65it/s]



  0%|          | 369/100000 [00:00<01:33, 1069.96it/s]



  1%|          | 674/100000 [00:00<01:14, 1332.27it/s]



  1%|          | 988/100000 [00:00<01:07, 1460.44it/s]



  1%|▏         | 1293/100000 [00:01<01:06, 1474.35it/s]



  2%|▏         | 1590/100000 [00:01<01:07, 1466.72it/s]



  2%|▏         | 1892/100000 [00:01<01:05, 1488.35it/s]



  2%|▏         | 2194/100000 [00:01<01:05, 1496.16it/s]



  2%|▏         | 2493/100000 [00:01<01:05, 1481.83it/s]



  3%|▎         | 2799/100000 [00:02<01:04, 1502.27it/s]



  3%|▎         | 3100/100000 [00:02<01:04, 1494.71it/s]



  3%|▎         | 3412/100000 [00:02<01:03, 1524.71it/s]



  4%|▎         | 3726/100000 [00:02<01:02, 1529.14it/s]



  4%|▍         | 4032/100000 [00:02<01:02, 1525.89it/s]



  4%|▍         | 4341/100000 [00:03<01:02, 1529.32it/s]



  5%|▍         | 4797/100000 [00:03<01:03, 1489.88it/s]



  5%|▌         | 5104/100000 [00:03<01:02, 1511.41it/s]



  5%|▌         | 5412/100000 [00:03<01:02, 1525.04it/s]



  6%|▌         | 5718/100000 [00:03<01:01, 1521.22it/s]



  6%|▌         | 6017/100000 [00:04<01:04, 1452.08it/s]






KeyboardInterrupt: 

In [None]:
(ht.query(x.coords) == -1).float().mean()

tensor(0., device='cuda:0')

In [None]:
481506304001 % 1024*20

20

In [None]:
ht.table_keys

tensor([-1, -1, -1,  ..., -1, -1, -1], device='cuda:0')

In [None]:
10215

10215

In [None]:
(ht.table_values != -1).sum()

tensor(10216, device='cuda:0')

In [None]:
ht.table_values

tensor([413273899086,           -1,           -1,  ...,           -1,
                  -1,           -1], device='cuda:0')

In [None]:
1048576 / (262144*4)

1.0

In [None]:
262144 / 165813

1.5809616857544342

In [None]:
(ht.table_values != -1).sum()

tensor(10216, device='cuda:0')

In [None]:
ht.table_values

tensor([413273899086,           -1,           -1,  ...,           -1,
                  -1,           -1], device='cuda:0')

In [None]:
x.coords[225747]

IndexError: index 225747 is out of bounds for dimension 0 with size 10240

In [None]:
import torch
unflatten_coord(torch.tensor([90194313218]))

tensor([[ 0, 21,  0,  2]], dtype=torch.int16)

In [None]:
30064771072

30064771072

In [None]:
ht.table_values

tensor([ 3488, 33248,  1487,  ...,    -1,    -1,    -1], device='cuda:0')

In [None]:
3056 / 128

23.875

In [None]:
(ht.table_keys != -1).sum()

tensor(69568, device='cuda:0')

In [None]:
ht.table_keys

tensor([207114731520,    117440513, 618861166594,  ...,           -1,
                  -1,           -1], device='cuda:0')

In [None]:
ht.table_values

tensor([ 3488, 33248,  1487,  ...,    -1,    -1,    -1], device='cuda:0')

In [None]:
x.coords.shape

len(set(hash_coords(x.coords)))

262144

In [None]:
q_coords = x.coords.clone()
q_coords[:, 1:] += 1  # Shift spatial coordinates by 1

In [None]:
ht.table_values

tensor([ 3488, 33248,  1487,  ...,    -1,    -1,    -1], device='cuda:0')

In [None]:
(ht.table_keys != -1).sum()

tensor(69568, device='cuda:0')

In [None]:
q_coords

tensor([[ 3, 31, 24, 46],
        [ 3, 43, 21,  5],
        [ 7, 50, 54, 37],
        ...,
        [ 6, 46, 15,  6],
        [ 7, 25, 29,  6],
        [ 6, 22,  5,  3]], device='cuda:0', dtype=torch.int16)

In [None]:
x.coords[2004]

tensor([ 8,  6, 53, 10], device='cuda:0', dtype=torch.int16)

In [None]:
result = ht.query(q_coords)


In [None]:
x.coords[result[result != -1]]

tensor([], device='cuda:0', size=(0, 4), dtype=torch.int16)

In [None]:
import torch

torch.argmax((ht.query(q_coords) != -1).float())

tensor(0, device='cuda:0')

In [None]:
ht.query(q_coords)[128]

tensor(-1, device='cuda:0')

In [None]:
x.C[470]

tensor([ 1, 39, 35, 24], device='cuda:0', dtype=torch.int16)

In [None]:
q_coords[128]

tensor([ 8, 48, 41, 59], device='cuda:0', dtype=torch.int16)