# Relay 下的 AutoTVM

参考：[autotvm_relay](https://daobook.github.io/tvm/docs/tutorial/autotvm_relay_x86.html)

In [1]:
from PIL import Image
import numpy as np
import tvm
from tvm import relay
from tvm.contrib.download import download_testdata
from tvm.contrib import graph_executor
import warnings

# Install a blanket "ignore" filter: every Python warning raised from here on
# is suppressed, regardless of category or originating module.
# NOTE(review): this also hides deprecation notices from the imported
# libraries — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

## 加载前端模型和数据

In [2]:
from torchvision import models


def get_model(model_name="resnet50", pretrained=True):
    """Build a torchvision model selected by its constructor name.

    Args:
        model_name: Name of the constructor attribute to look up on
            ``torchvision.models`` (for example ``"resnet50"``).
        pretrained: Forwarded unchanged to the constructor as its
            ``pretrained`` keyword argument.

    Returns:
        Whatever the selected constructor returns.

    Raises:
        AttributeError: If ``models`` has no attribute called ``model_name``.
    """
    builder = getattr(models, model_name)
    return builder(pretrained=pretrained)

# Instantiate ResNet-50 through get_model; pretrained=True is forwarded to the
# torchvision constructor.
torch_model = get_model("resnet50", pretrained=True)

下载图片：

In [3]:
# Fetch the sample image through TVM's test-data download helper.
img_url = "https://s3.amazonaws.com/model-server/inputs/kitten.jpg"
img_path = download_testdata(img_url, "imagenet_cat.png", module="data")

# Resize to the 224x224 spatial size used for the model input.
with Image.open(img_path) as im:
    resized_image = im.resize((224, 224))

# Convert the pixels to float32.
img_data = np.asarray(resized_image).astype("float32")

# The image is laid out as HWC, while the PyTorch model is fed CHW tensors.
img_data = np.transpose(img_data, (2, 0, 1))

# Normalize with the ImageNet per-channel statistics. The statistics are
# created as float32 on purpose: with NumPy's default float64 arrays the
# arithmetic below would silently promote the whole tensor to float64.
imagenet_mean = np.array([0.485, 0.456, 0.406], dtype="float32").reshape((3, 1, 1))
imagenet_stddev = np.array([0.229, 0.224, 0.225], dtype="float32").reshape((3, 1, 1))
norm_img_data = (img_data / 255 - imagenet_mean) / imagenet_stddev

# Add the batch dimension so the final input is 4-D NCHW: (1, 3, 224, 224).
img_data = np.expand_dims(norm_img_data, axis=0)

## 前端模型转换为 Relay 表示

In [4]:
import torch
from tvm.relay.frontend.pytorch import from_pytorch

# Fixed input shape used for tracing and for the Relay input declaration.
dshape = 1, 3, 224, 224

# Reuse the ResNet-50 instance created earlier instead of calling get_model()
# again, which would build the network and load its pretrained weights a
# second time. eval() puts the module in inference mode before tracing.
model = torch_model.eval()

# torch.jit.trace only needs an example tensor of the right shape; its values
# are irrelevant for the recorded graph here.
random_input = torch.randn(dshape)
trace_module = torch.jit.trace(model, random_input).eval()

# Import the traced module into Relay; the single graph input is named "data".
mod, params = from_pytorch(trace_module, [("data", dshape)])

## 编译

In [6]:
# Compilation target string handed to relay.build.
target = "llvm"

# Build the Relay module at optimisation level 3. No tuning records are
# applied at this point, so this is the baseline (untuned) library.
baseline_pass_ctx = tvm.transform.PassContext(opt_level=3)
with baseline_pass_ctx:
    lib = relay.build(mod, target=target, params=params)

One or more operators have not been tuned. Please tune your model for better performance. Use DEBUG logging level to see more details.


## 运行时

In [7]:
# Device 0 of the kind named by the build target.
dev = tvm.device(str(target), 0)

# lib["default"] is the factory entry of the compiled library; calling it with
# the device yields the module that GraphModule wraps.
create_graph_module = lib["default"]
module = graph_executor.GraphModule(create_graph_module(dev))

运行时推理：

In [8]:
# Name of the graph input and the expected shape of the classifier output.
input_name = "data"
output_shape = (1, 1000)

# Feed the preprocessed image, execute the graph once, and copy output 0 back
# into a NumPy array.
module.set_input(input_name, img_data)
module.run()
tvm_output = module.get_output(0).numpy()

## 性能评估

In [9]:
import timeit

# Each of the `timing_repeat` samples times `timing_number` consecutive runs.
timing_number = 10
timing_repeat = 10

# Timer.repeat returns the total seconds per sample; scale to milliseconds and
# divide by the number of runs to get the latency of a single inference.
baseline_timer = timeit.Timer(lambda: module.run())
baseline_totals_s = baseline_timer.repeat(repeat=timing_repeat, number=timing_number)
baseline_latency_ms = np.array(baseline_totals_s) * 1000 / timing_number

# Summary statistics of the per-run latency (milliseconds).
unoptimized = {
    "mean": np.mean(baseline_latency_ms),
    "median": np.median(baseline_latency_ms),
    "std": np.std(baseline_latency_ms),
}

print(unoptimized)

{'mean': 73.69392811204307, 'median': 77.21288839820772, 'std': 11.900445828441978}


## 后处理

In [10]:
from scipy.special import softmax

from gluoncv.data.imagenet.classification import ImageNet1kAttr

# Human-readable ImageNet-1k class names, indexed by class id.
imagenet_1k_attr = ImageNet1kAttr()
labels = imagenet_1k_attr.classes_long

# Turn the raw model output into probabilities and drop the batch axis.
scores = np.squeeze(softmax(tvm_output))

# Class ids sorted from most to least probable; report the top five.
ranks = np.argsort(scores)[::-1]
for rank in ranks[:5]:
    print(f"class='{labels[rank]}' with probability={scores[rank]:f}")

class='tiger cat' with probability=0.476104
class='tabby, tabby cat' with probability=0.466828
class='Egyptian cat' with probability=0.046270
class='plastic bag' with probability=0.002098
class='carton' with probability=0.000687


## 模型调优

TVM 中的调优是指对模型进行优化以在给定目标上更快地运行的过程。这与训练或微调不同，因为它不影响模型的准确性，而只影响运行时的性能。作为调优过程的一部分，TVM 将尝试运行许多不同的算子实现变体，以观察哪些算子表现最佳。这些运行的结果被储存在调优记录文件中。

In [11]:
from tvm import auto_scheduler
from tvm.autotvm.tuner import XGBTuner
from tvm import autotvm

为运行器设置一些基本参数。运行器采用一组特定参数生成的编译代码，并测量其性能。`number` 指定每次测量中把生成的代码运行多少次并取平均值，而 `repeat` 指定这样的测量重复多少次。`min_repeat_ms` 指定每次测量所需的最短运行时间（毫秒）；如果 `number` 次运行的总耗时低于这个值，`number` 将被自动增大。这个选项对于在 GPU 上进行精确的调优是必要的，而对于 CPU 的调优则不需要。把这个值设置为 `0` 可以禁用它。`timeout` 为每个被测配置运行测试代码的时间设置了上限（单位为秒）。

In [12]:
# Measurement settings for the AutoTVM runner.
number = 10  # runs of the generated code averaged into one measurement
repeat = 1  # how many such measurements are taken per configuration
min_repeat_ms = 0  # CPU tuning: 0 disables the minimum-duration adjustment
timeout = 10  # upper bound, in seconds, for running one configuration

# Collect the keyword arguments first, then create the local runner.
# NOTE(review): TVM's LocalRunner documentation says `number` should be 1 for
# enable_cpu_cache_flush to be effective — confirm the intended combination.
runner_settings = dict(
    number=number,
    repeat=repeat,
    timeout=timeout,
    min_repeat_ms=min_repeat_ms,
    enable_cpu_cache_flush=True,
)
runner = autotvm.LocalRunner(**runner_settings)

使用 XGBoost 算法来指导搜索。这里把试验次数（`trials`）设置为 `1500`；对于生产作业，CPU 推荐 `1500`，GPU 推荐 `3000-4000`。所需的试验次数可能取决于特定的模型和处理器，因此值得花一些时间来评估各种数值的性能，以找到调优时间和模型优化之间的最佳平衡。`early_stopping` 参数指定提前停止的条件：如果连续这么多次试验都没有找到更好的配置，就提前结束该任务的搜索。`measure_option` 选项表示将在哪里构建试验代码，以及将在哪里运行它。在这种情况下，使用刚刚创建的 `LocalRunner` 和一个 `LocalBuilder`。`tuning_records` 选项指定了用来写入调优数据的文件。

In [13]:
# Build locally with the default build function and measure with the runner
# configured above.
local_builder = autotvm.LocalBuilder(build_func="default")
measure_option = autotvm.measure_option(builder=local_builder, runner=runner)

# All knobs of the tuning session in one place; the tuning loop reads
# "trials", "early_stopping", "measure_option" and "tuning_records".
tuning_option = dict(
    tuner="xgb",
    trials=1500,
    early_stopping=100,
    measure_option=measure_option,
    tuning_records="resnet-50-autotuning.json",
)

运行调优：

In [14]:
# Extract the tunable tasks from the Relay module (imported from the traced
# PyTorch model) for the chosen target.
tasks = autotvm.task.extract_from_program(mod["main"], target=target, params=params)

# Tune the extracted tasks sequentially.
for i, task in enumerate(tasks):
    prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))

    # A task cannot run more trials than it has candidate configurations, so
    # clamp the budget once and use the same number for the tuner and for the
    # progress bar. Passing the unclamped budget to the progress bar would make
    # its displayed total disagree with the number of trials actually requested.
    n_trial = min(tuning_option["trials"], len(task.config_space))

    tuner_obj = XGBTuner(task, loss_type="rank")
    tuner_obj.tune(
        n_trial=n_trial,
        early_stopping=tuning_option["early_stopping"],
        measure_option=tuning_option["measure_option"],
        callbacks=[
            autotvm.callback.progress_bar(n_trial, prefix=prefix),
            # Write every measurement to the tuning-records file.
            autotvm.callback.log_to_file(tuning_option["tuning_records"]),
        ],
    )

[Task  1/25]  Current/Best:  120.51/ 307.69 GFLOPS | Progress: (144/1500) | 59.15 s Done.
[Task  2/25]  Current/Best:   92.82/ 251.66 GFLOPS | Progress: (288/1500) | 88.11 s Done.
[Task  3/25]  Current/Best:   70.36/ 392.87 GFLOPS | Progress: (288/1500) | 94.46 s Done.
[Task  4/25]  Current/Best:  128.78/ 355.21 GFLOPS | Progress: (336/1500) | 123.97 s Done.
[Task  5/25]  Current/Best:  211.86/ 330.45 GFLOPS | Progress: (240/1500) | 82.58 s Done.
[Task  6/25]  Current/Best:  109.52/ 314.38 GFLOPS | Progress: (288/1500) | 104.54 s Done.
[Task  7/25]  Current/Best:  206.38/ 316.17 GFLOPS | Progress: (240/1500) | 80.51 s Done.
[Task  8/25]  Current/Best:  187.30/ 281.83 GFLOPS | Progress: (336/1500) | 137.05 s Done.
[Task 10/25]  Current/Best:  105.97/ 280.33 GFLOPS | Progress: (48/1500) | 13.52 s s Done.
[Task 10/25]  Current/Best:  123.63/ 421.92 GFLOPS | Progress: (240/1500) | 76.72 s Done.
[Task 11/25]  Current/Best:  127.37/ 289.90 GFLOPS | Progress: (144/1500) | 53.38 s Done.
[Task 

## 用调优数据编译优化后的模型

上述调优过程的输出是存储在 `resnet-50-autotuning.json` 中的调优记录。编译器将使用这些结果，在你指定的目标上为模型生成高性能代码。

现在，模型的调优数据已经收集完毕，可以使用优化的算子重新编译模型，以加快计算速度。

In [15]:
# Rebuild the model with the best configurations found in the tuning records
# applied; both context managers are active for the duration of the build.
best_history = autotvm.apply_history_best(tuning_option["tuning_records"])
with best_history, tvm.transform.PassContext(opt_level=3, config={}):
    lib = relay.build(mod, target=target, params=params)

# Recreate the device handle and the graph executor from the tuned library.
dev = tvm.device(str(target), 0)
tuned_factory = lib["default"]
module = graph_executor.GraphModule(tuned_factory(dev))

 Done.
 Done.


验证优化后的模型是否运行并产生相同的结果：

In [16]:
dtype = "float32"
output_shape = (1, 1000)

# Run the tuned module on the same preprocessed image as the baseline.
module.set_input(input_name, img_data)
module.run()

# Hand get_output a preallocated TVM array of the expected shape to receive
# output 0, then convert it to NumPy.
output_buffer = tvm.nd.empty(output_shape)
tvm_output = module.get_output(0, output_buffer).numpy()

# Probabilities without the batch axis, then the five most likely classes.
scores = np.squeeze(softmax(tvm_output))
ranks = np.argsort(scores)[::-1]
for rank in ranks[:5]:
    print("class='%s' with probability=%f" % (labels[rank], scores[rank]))

class='tiger cat' with probability=0.476103
class='tabby, tabby cat' with probability=0.466829
class='Egyptian cat' with probability=0.046270
class='plastic bag' with probability=0.002098
class='carton' with probability=0.000687


## 比较调优后和未调优的模型

收集一些与这个优化模型相关的基本性能数据，将其与未优化的模型进行比较。根据你的底层硬件、迭代次数和其他因素，你应该看到优化后的模型与未优化的模型相比有性能的提高。

In [17]:
import timeit

# Same sampling scheme as the baseline benchmark: `timing_repeat` samples of
# `timing_number` consecutive runs each.
timing_number = 10
timing_repeat = 10

# Timer.repeat returns the total seconds per sample; scale to milliseconds and
# divide by the number of runs to get the latency of a single inference.
tuned_timer = timeit.Timer(lambda: module.run())
tuned_totals_s = tuned_timer.repeat(repeat=timing_repeat, number=timing_number)
tuned_latency_ms = np.array(tuned_totals_s) * 1000 / timing_number

# Summary statistics of the per-run latency (milliseconds).
optimized = {
    "mean": np.mean(tuned_latency_ms),
    "median": np.median(tuned_latency_ms),
    "std": np.std(tuned_latency_ms),
}

# Print the tuned and baseline statistics one after the other for comparison.
print("optimized: %s" % (optimized))
print("unoptimized: %s" % (unoptimized))

optimized: {'mean': 45.72459139046259, 'median': 42.24954635137692, 'std': 6.151925270695667}
unoptimized: {'mean': 73.69392811204307, 'median': 77.21288839820772, 'std': 11.900445828441978}
