In [14]:
import MLC.mlc as mlc
import numpy as np
import torch
import torch.nn as nn
import tvm
import MLC.demo.model.resnet as Resnet

In [15]:
# Fix the RNG seeds so the randomly initialised weights and the random input
# are identical on every Restart & Run All; otherwise the tensors differ
# between runs.
torch.manual_seed(0)
rng = np.random.default_rng(0)

# ResNet-18 from the MLC demo package, configured with 10 output classes.
resnet = Resnet.resnet18(num_classes=10)

# One random float32 input of shape (1, 1, 384, 384), values in [0, 1).
# The same host array backs both the TVM and the PyTorch device copies so the
# two runtimes are fed the same data.
img_np = rng.random((1, 1, 384, 384), dtype=np.float32)
img_nd = tvm.nd.array(img_np, tvm.cuda(0))
img_torch = torch.from_numpy(img_np).cuda(0)


In [6]:
# Symbolically trace the PyTorch model into a torch.fx GraphModule and dump
# its graph as a table (one row per node: opcode, name, target, args, kwargs).
fx_module = torch.fx.symbolic_trace(resnet)
fx_module.graph.print_tabular()


opcode         name                   target                                                      args                                   kwargs
-------------  ---------------------  ----------------------------------------------------------  -------------------------------------  --------
placeholder    x                      x                                                           ()                                     {}
call_module    conv1                  conv1                                                       (x,)                                   {}
call_module    bn1                    bn1                                                         (conv1,)                               {}
call_module    relu                   relu                                                        (bn1,)                                 {}
call_module    maxpool                maxpool                                                     (relu,)                                {}
call_modul

In [7]:
# Hand the traced FX module to mlc.from_fx together with the list of input
# shapes (a single (1, 1, 384, 384) input), then display the result.
# NOTE(review): presumably from_fx returns a TVM IRModule — confirm in MLC.mlc.
resnet_fx_module = mlc.from_fx(fx_module, [(1, 1, 384, 384)])
resnet_fx_module.show()

To print formatted TVM script, please install the formatter 'Black':
/staff/qiaoliang/anaconda3/envs/MLC/bin/python -m pip install "black==22.3.0" --upgrade --user


In [8]:
# Run the project's FuseDenseAddPass over the imported module and display the
# transformed module.
# NOTE(review): judging by the name this fuses dense + add patterns — confirm
# against the pass implementation in MLC.mlc.
resnet_fused = mlc.FuseDenseAddPass()(resnet_fx_module)
resnet_fused.show()

To print formatted TVM script, please install the formatter 'Black':
/staff/qiaoliang/anaconda3/envs/MLC/bin/python -m pip install "black==22.3.0" --upgrade --user


In [9]:
# Run the project's LowerToTensorIRPass over the fused module and display the
# lowered module.
# NOTE(review): presumably lowers high-level ops to TensorIR functions —
# confirm against the pass implementation in MLC.mlc.
lowresnet = mlc.LowerToTensorIRPass()(resnet_fused)
lowresnet.show()

To print formatted TVM script, please install the formatter 'Black':
/staff/qiaoliang/anaconda3/envs/MLC/bin/python -m pip install "black==22.3.0" --upgrade --user


In [10]:
# relax is imported here rather than in the top import cell; later cells
# (relax.vm.build, relax.VirtualMachine) rely on this import having run.
from tvm import relax
# Apply TVM's Relax FuseTIR transform to the lowered module.
resnetFinal = relax.transform.FuseTIR()(lowresnet)

In [11]:
# Tune the module with the project's mlc_tune_tir helper for a CUDA target
# limited to 1024 threads per block and 49152 bytes of shared memory per block.
tunedResnet = mlc.mlc_tune_tir(resnetFinal, target="cuda --max_threads_per_block=1024 --max_shared_memory_per_block=49152")

2023-02-15 13:22:27 [INFO] [task_scheduler.cc:260] Task #0 has finished. Remaining task(s): 0


Unnamed: 0,Name,FLOP,Weight,Speed (GFLOPS),Latency (us),Weighted Latency (us),Trials,Done
0,main,73728,1,30.0494,2.4536,2.4536,6,Y


2023-02-15 13:22:27 [DEBUG] [task_scheduler.cc:318] 
 ID | Name |  FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done 
----------------------------------------------------------------------------------------------------
  0 | main | 73728 |      1 |        30.0494 |       2.4536 |                2.4536 |      6 |    Y 
----------------------------------------------------------------------------------------------------
Total trials: 6
Total latency (us): 2.45356



In [13]:
# tvm is already imported in the first cell; this repeated import is redundant
# but harmless.
import tvm
# Build the tuned module for the CUDA target and wrap the resulting executable
# in a Relax virtual machine bound to GPU 0.
ex = relax.vm.build(tunedResnet, target='cuda')
vm = relax.VirtualMachine(ex, tvm.cuda(0))

In [19]:
# Number of invocations per measurement; the PyTorch benchmark cell reuses
# this value for its timed loop.
n_repeat = 1000

# Timer for the VM's 'main' function on GPU 0.
f_timer_tuned_resnet = vm.time_evaluator(
    'main',
    tvm.cuda(0),
    number=n_repeat,
)

# Run the timer on the TVM input; `.mean` is scaled by 1e3 and reported as ms.
tuned_timing = f_timer_tuned_resnet(img_nd)
print("tvm tuned model's time cost : %f ms" % (tuned_timing.mean * 1e3))

tvm tuned model's time cost : 3.653016 ms


In [23]:
# torch cuda time cost
# Move the model to GPU 0 and switch it to inference behaviour.
resnet = resnet.cuda(0)
resnet.eval()

# method 1: time with CUDA events, which measure elapsed GPU time between the
# two record() points on the stream.
start, end = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)

# Warm-up: 10 untimed-per-iteration forward passes.
# Run them under no_grad, exactly like the measured loop below, so the warm-up
# exercises the same inference path and does not build autograd state, which
# would also inflate the reported warm-up time.
with torch.no_grad():
    start.record()
    for _ in range(10):
        _ = resnet(img_torch)
    end.record()
    # elapsed_time() is only valid once both events have completed.
    torch.cuda.synchronize()
    warn_up_time = start.elapsed_time(end)
print('gpu warm up time %f ms' % warn_up_time)

# Measured loop: one event pair per forward pass, synchronised each iteration
# so every sample covers exactly one inference. elapsed_time() returns ms.
timings = []
with torch.no_grad():
    for _ in range(n_repeat):
        start.record()
        _ = resnet(img_torch)
        end.record()
        torch.cuda.synchronize()
        curr_timing = start.elapsed_time(end)
        timings.append(round(curr_timing, 5))


print("torch model's time cost : %f ms" % (np.mean(timings)))

# method 2 (alternative, disabled): profile with the autograd profiler.
# with torch.autograd.profiler.profile(enabled=True, use_cuda=True) as prof:
#     for _ in range(n_repeat):
#         resnet(img_torch)

# print(prof.key_averages().table())

gpu warm up time 67.352318
torch model's time cost : 4.358024 ms
