In [1]:
import MLC.mlc as mlc
import numpy as np
import torch
import MLC.demo.model.resnet as Resnet
from tvm import relax
import tvm

In [2]:
# Build the model and one random input, and place a copy of the input on the
# GPU for each framework (a torch tensor and a TVM NDArray).

# Seed both RNGs so the randomly initialised weights and the random input --
# and therefore the final numerical comparison -- are reproducible after a
# kernel restart.
torch.manual_seed(0)
rng = np.random.default_rng(0)

resnet = Resnet.resnet18()
# Put the model in inference mode *before* it is handed to the importer in the
# next cell, so the traced model and the torch reference used later are in the
# same mode.
# NOTE(review): whether mlc.from_fx consults the training flag is not visible
# from this notebook -- confirm.
resnet.eval()

hw = 384  # input height == width
# Single-image batch with one channel, values uniform in [0, 1), float32.
x_np = rng.random((1, 1, hw, hw)).astype('float32')
img_torch = torch.from_numpy(x_np).cuda(0)

img_nd = tvm.nd.array(x_np, tvm.cuda(0))
# x_nd = tvm.nd.array(x_np, tvm.cpu(0))

In [3]:
# Import the torch model through MLC's FX front end.
# The input shape is taken from the actual input array instead of repeating the
# literal (1, 1, 384, 384), so changing `hw` in the setup cell cannot leave the
# traced model and the benchmark input out of sync.
input_shapes = [tuple(x_np.shape)]
resnet_fx_module = mlc.from_fx(resnet, input_shapes)

In [4]:
# Instantiate the dense+add fusion pass, then run it over the imported module.
fuse_dense_add_pass = mlc.FuseDenseAddPass()
resnet_fused = fuse_dense_add_pass(resnet_fx_module)

In [5]:
# Instantiate the lowering pass, then apply it to the fused module.
lower_to_tir_pass = mlc.LowerToTensorIRPass()
lowresnet = lower_to_tir_pass(resnet_fused)

In [6]:
# Run Relax's FuseTIR transform on the lowered module.
fuse_tir_pass = relax.transform.FuseTIR()
resnetFinal = fuse_tir_pass(lowresnet)

In [7]:
# Tune the module's TIR for the CUDA target described below; the tuner's log
# and summary table are printed as this cell's output.
cuda_target = "cuda --max_threads_per_block=1024 --max_shared_memory_per_block=49152"
tunedResnet = mlc.mlc_tune_tir(resnetFinal, target=cuda_target)
# tunedResnet.show()

2023-02-15 22:14:00 [INFO] [task_scheduler.cc:260] Task #0 has finished. Remaining task(s): 0


Unnamed: 0,Name,FLOP,Weight,Speed (GFLOPS),Latency (us),Weighted Latency (us),Trials,Done
0,main,294912,1,91.7766,3.2134,3.2134,1,Y


2023-02-15 22:14:00 [DEBUG] [task_scheduler.cc:318] 
 ID | Name |   FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done 
-----------------------------------------------------------------------------------------------------
  0 | main | 294912 |      1 |        91.7766 |       3.2134 |                3.2134 |      1 |    Y 
-----------------------------------------------------------------------------------------------------
Total trials: 1
Total latency (us): 3.21337



In [8]:
# Compile the tuned module for CUDA and load it into a Relax virtual machine
# bound to GPU 0.
gpu_dev = tvm.cuda(0)
ex = relax.vm.build(tunedResnet, 'cuda')
vm = relax.VirtualMachine(ex, gpu_dev)
# CPU alternative (untuned module):
# ex = relax.vm.build(resnetFinal, 'llvm')
# vm = relax.VirtualMachine(ex, tvm.cpu(0))

In [12]:
# Time the tuned model's 'main' function on GPU 0.
n_repeat = 1000  # also reused by the torch benchmark cell below
f_timer_tuned_resnet = vm.time_evaluator('main', tvm.cuda(0), number=n_repeat)
# f_timer_tuned_resnet = vm.time_evaluator('main', tvm.cpu(0), number=n_repeat)

# The mean is scaled by 1e3 for display in milliseconds.
tuned_mean_ms = f_timer_tuned_resnet(img_nd).mean * 1e3
print("tvm tuned model's time cost : %f ms" % tuned_mean_ms)

tvm tuned model's time cost : 2.876463 ms


In [13]:
# torch cuda time cost
# Benchmark the eager PyTorch model on GPU 0 using CUDA events.
resnet = resnet.cuda(0)
resnet.eval()

# method 1: CUDA-event timing
start, end = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)

# Both the warm-up and the timed loop run under no_grad so that no autograd
# state is recorded in either phase; the warm-up then exercises the same code
# path that is measured afterwards.
with torch.no_grad():
    # Warm-up: 10 untimed-per-iteration forward passes, reported as a total.
    start.record()
    for _ in range(10):
        resnet(img_torch)
    end.record()
    # elapsed_time() may only be read once both events have completed.
    torch.cuda.synchronize()
    warm_up_time = start.elapsed_time(end)
    print('gpu warm up time %f ms' % warm_up_time)

    # Timed runs: one event pair per forward pass, one sample (in ms) each.
    # NOTE(review): each iteration is synchronised individually here, whereas
    # the TVM timer above is given number=n_repeat -- confirm the two
    # measurements are comparable before drawing conclusions.
    timings = []
    for _ in range(n_repeat):
        start.record()
        resnet(img_torch)
        end.record()
        torch.cuda.synchronize()
        # Keep full precision; the samples are only averaged below.
        timings.append(start.elapsed_time(end))


print("torch model's time cost : %f ms" % (np.mean(timings)))

# method 2
# with torch.autograd.profiler.profile(enabled=True, use_cuda=True) as prof:
#     for _ in range(n_repeat):
#         resnet(img_torch)

# print(prof.key_averages().table())

gpu warm up time 58.591934 ms
torch model's time cost : 3.868004 ms


In [14]:
# Numerical check: the compiled TVM model must reproduce the eager torch output
# for the same input to a relative tolerance of 1e-5.

# TVM result for the GPU-resident input.
res_nd = vm['main'](img_nd)

# Torch reference, computed in inference mode on the GPU without autograd.
resnet.eval()
resnet.cuda()
with torch.no_grad():
    res_torch = resnet(img_torch)

# Bring both results back to host memory as NumPy arrays before comparing.
tvm_out = res_nd.numpy()
torch_out = res_torch.detach().cpu().numpy()
np.testing.assert_allclose(tvm_out, torch_out, rtol=1e-5)