# 自动调度

In [1]:
import numpy as np
import tvm
from tvm import te, auto_scheduler

工作负载（workload）实际上是由 TVM DSL 定义的计算 DAG。但是，序列化计算 DAG 并对其进行高效匹配并不容易。因此，使用字符串 `[func_name, [args...]]` 来编码计算 DAG，该 DAG 即 `func_name(*args)` 的返回值。

这些字符串对于序列化/匹配非常有效，而且不会太长。

当需要计算 DAG 时，解码字符串并调用对应的函数 `func_name(*args)`，由该函数返回 DAG。

In [2]:
@auto_scheduler.register_workload  # note the auto_scheduler decorator
def matmul_add(N, L, M, dtype):
    """Describe the computation ``out = A x B + C`` as TE tensors.

    Parameters
    ----------
    N, L, M
        Matrix dimensions: ``A`` is ``(N, L)``, ``B`` is ``(L, M)``, and
        ``C`` and ``out`` are ``(N, M)``.
    dtype
        Element dtype passed to all three placeholders.

    Returns
    -------
    list
        ``[A, B, C, out]``: the three input placeholders followed by the
        output tensor.
    """
    shape_out = (N, M)

    lhs = te.placeholder((N, L), name="A", dtype=dtype)
    rhs = te.placeholder((L, M), name="B", dtype=dtype)
    addend = te.placeholder(shape_out, name="C", dtype=dtype)

    # Reduction axis running over the shared dimension L.
    red = te.reduce_axis((0, L), name="k")

    def _dot(i, j):
        # Sum of lhs[i, :] * rhs[:, j] over the reduction axis.
        return te.sum(lhs[i, red] * rhs[red, j], axis=red)

    product = te.compute(
        shape_out,
        _dot,
        name="matmul",
        # Enable automatic layout transformation for tensor B.
        attrs={"layout_free_placeholders": [rhs]},
    )

    def _plus_addend(i, j):
        # Element-wise addition of the third input to the matmul result.
        return product[i, j] + addend[i, j]

    result = te.compute(shape_out, _plus_addend, name="out")
    return [lhs, rhs, addend, result]

## 搜索任务

定义函数之后，即可为 `auto_scheduler` 创建搜索任务。这里为该矩阵乘法指定具体参数：`N=L=M=1024` 和 `dtype="float32"`。

In [3]:
# LLVM target with -mcpu=skylake-avx512; the generic alternative is
# tvm.target.Target("llvm").
target = tvm.target.Target("llvm -mcpu=skylake-avx512")

# All three matrix dimensions are 1024.
N, L, M = 1024, 1024, 1024
task = auto_scheduler.SearchTask(
    func=matmul_add,
    args=(N, L, M, "float32"),
    target=target,
)

# Inspect the compute DAG of the task.
print("Computational DAG:", task.compute_dag, sep="\n")

Computational DAG:
A = PLACEHOLDER [1024, 1024]
B = PLACEHOLDER [1024, 1024]
matmul(i, j) += (A[i, k]*B[k, j])
C = PLACEHOLDER [1024, 1024]
out(i, j) = (matmul[i, j] + C[i, j])



### 设置参数

In [4]:
log_file = "matmul.json"
tune_option = auto_scheduler.TuningOptions(
    verbose=2,
    # Measured records are written to log_file through this callback.
    measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
    # Number of measurement trials the search may use; 1000 is a good value
    # for the search to converge.
    num_measure_trials=10,
)

### 运行搜索

In [5]:
# Run the auto-tuning search for this task, configured by tune_option.
task.tune(tune_option)
# Apply the best schedule recorded in log_file; yields the schedule and the
# tensor arguments that are passed to tvm.lower afterwards.
sch, args = task.apply_best(log_file)

  from pandas import MultiIndex, Int64Index


----------------------------------------------------------------------
------------------------------  [ Search ]
----------------------------------------------------------------------
Generate Sketches		#s: 3
Sample Initial Population	#s: 2014	fail_ct: 2	Time elapsed: 0.58
GA Iter: 0	Max score: 0.9991	Min score: 0.9335	#Pop: 128	#M+: 0	#M-: 0
GA Iter: 4	Max score: 0.9998	Min score: 0.9859	#Pop: 128	#M+: 1384	#M-: 74
EvolutionarySearch		#s: 128	Time elapsed: 2.40
----------------------------------------------------------------------
------------------------------  [ Measure ]
----------------------------------------------------------------------
Get 10 programs to measure:
..........*E*E*E*E***E*E*E*E
No: 1	GFLOPS: 0.00 / 0.00	results: MeasureResult(error_type:RuntimeDeviceError, error_msg:Traceback (most recent call last):
  File "/media/pc/data/4tb/lxw/libs/anaconda3/envs/py38/lib/python3.8/site-packages/tvm/auto_scheduler/utils.py", line 295, in call_func_with_timeout
    res = work

In [7]:
# Lower the tuned schedule to TIR (simple_mode=True) and display the module.
print("Lowered TIR:")
mod = tvm.lower(sch, args, simple_mode=True)
mod.show()

Lowered TIR:
@tvm.script.ir_module
class Module:
    @T.prim_func
    def main(A: T.Buffer[1048576, "float32"], B: T.Buffer[1048576, "float32"], C: T.Buffer[1048576, "float32"], out: T.Buffer[1048576, "float32"]) -> None:
        # function attr dict
        T[38;5;129;01m.[39;00mfunc_attr({[38;5;124m"[39m[38;5;124mfrom_legacy_te_schedule[39m[38;5;124m"[39m: [38;5;28;01mTrue[39;00m, [