In [1]:
import torch

In [58]:
@torch.jit.script
def benchmark_model(input_tensor: torch.Tensor, width: int = 8192, iterations: int = 8) -> torch.Tensor:
    """Synthetic busy-work: repeated dense matmul + softmax on the input's device.

    Args:
        input_tensor: 1-D tensor (the batch). Every tensor created here takes
            its device and dtype from it. ``torch.outer`` requires a 1-D
            input, and a floating-point dtype is expected because softmax is
            applied to the intermediate results.
        width: Side length of the square working matrix. The default of 8192
            gives a modestly large workload; a small value makes the function
            cheap enough to run on CPU.
        iterations: Number of matmul + softmax rounds applied to the
            ``width x width`` working matrix.

    Returns:
        A 1-D tensor of length ``width``. The length does not depend on the
        length of ``input_tensor``.
    """
    # The ramp [0, 1, ..., width - 1] is used both to expand the input and to
    # project the result back to a vector. It is never modified, so it is
    # built once and reused.
    ramp = torch.arange(width, device=input_tensor.device, dtype=input_tensor.dtype)
    expanded = torch.outer(ramp, input_tensor)  # (width, len(input_tensor))
    self_product = torch.matmul(expanded, expanded.t())  # (width, width)
    for _ in range(iterations):
        # softmax over dim=0 normalizes each column to sum to 1, which keeps
        # the values bounded from one round to the next.
        self_product = torch.softmax(torch.matmul(self_product, self_product.t()), dim=0)
    # Project the working matrix back down to a single vector of length `width`.
    return torch.matmul(self_product, ramp).flatten()

In [59]:
# Run the scripted function once on the GPU with a random 4096-element float32 input.
# %time measures the whole statement: building the input, the call, and printing the result.
%time print(benchmark_model(torch.rand(4096, device="cuda", dtype=torch.float32)))

tensor([4095.5000, 4095.5000, 4095.5000,  ..., 4095.5000, 4095.5000,
        4095.5000], device='cuda:0')
CPU times: user 1.24 s, sys: 122 µs, total: 1.24 s
Wall time: 1.35 s


In [60]:
# Serialize the scripted function to disk. The path is relative to the current working directory.
torch.jit.save(benchmark_model, "src/resources/busywork.pt")

In [61]:
# Load the serialized TorchScript from the same relative path and bind it to `l`.
l = torch.jit.load("src/resources/busywork.pt")

In [62]:
%time print(benchmark_model(torch.rand(1000, device="cuda", dtype=torch.float32)))

tensor([4095.5000, 4095.5000, 4095.5000,  ..., 4095.5000, 4095.5000,
        4095.5000], device='cuda:0')
CPU times: user 1.01 s, sys: 137 µs, total: 1.01 s
Wall time: 1.1 s
