<a href="https://colab.research.google.com/github/seongheechoi/education/blob/main/%EC%8B%A4%EC%8A%B5_3_1_optimizing_operators_with_schedule_templates_and_AutoTVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **TVM 실습자료 3.1: Optimizing Operators with Schedule Templates and AutoTVM**

In [None]:
!pip install numpy==1.26.4
import numpy as np
print(np.__version__)
!pip list | grep numpy

1.26.4
numpy                                 1.26.4


In [None]:
# Linux/MacOS CPU build only!
# See tlcpack.ai for other pre-built binaries including CUDA
!python -m pip install --upgrade pip
!pip install apache-tvm

Collecting pip
  Downloading pip-25.1.1-py3-none-any.whl.metadata (3.6 kB)
Downloading pip-25.1.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.1.1
Collecting apache-tvm
  Downloading apache_tvm-0.14.dev273-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.3 kB)
Downloading apache_tvm-0.14.dev273-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (69.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.1/69.1 MB[0m [31m56.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: apache-tvm
Successfully installed apache-tvm-0.14.dev273


**Install dependencies**

In [None]:
!pip3 install --user psutil xgboost cloudpickle



In [None]:
import logging
import sys

import numpy as np
import tvm
from tvm import te
import tvm.testing

# the module is called `autotvm`
from tvm import autotvm

**Basic Matrix Multiplication with TE**

In [None]:
def matmul_basic(N, L, M, dtype):
    """Declare a matrix multiplication in TE with a fixed, hand-written schedule.

    Args:
        N: number of rows of ``A`` and of the result ``C``.
        L: shared (reduction) dimension of ``A`` and ``B``.
        M: number of columns of ``B`` and of the result ``C``.
        dtype: element type given to both input placeholders.

    Returns:
        A tuple ``(schedule, [A, B, C])``: the schedule plus the input and
        output tensors.
    """
    # Declare the computation: C[i, j] = sum_k A[i, k] * B[k, j].
    lhs = te.placeholder((N, L), name="A", dtype=dtype)
    rhs = te.placeholder((L, M), name="B", dtype=dtype)
    red = te.reduce_axis((0, L), name="k")
    out = te.compute(
        (N, M),
        lambda i, j: te.sum(lhs[i, red] * rhs[red, j], axis=red),
        name="C",
    )

    sched = te.create_schedule(out.op)
    stage = sched[out]

    # Fetch the loop axes from the scheduled stage.
    row, col = stage.op.axis
    red_axis = stage.op.reduce_axis[0]

    # Tile both spatial axes with a hard-coded factor of 8.
    row_outer, row_inner = stage.split(row, factor=8)
    col_outer, col_inner = stage.split(col, factor=8)

    # Outer tiles first, then the reduction, then the inner tile loops.
    stage.reorder(row_outer, col_outer, red_axis, row_inner, col_inner)

    return sched, [lhs, rhs, out]

**A Basic Matrix Multiplication Template**

In [None]:
# Matmul V1: the tile sizes are chosen from an explicit list of candidates.
@autotvm.template("tutorial/matmul_v1")  # step 1: register the function as a template
def matmul_v1(N, L, M, dtype):
    """Matmul template whose two tile sizes are tunable knobs.

    Args:
        N: number of rows of ``A`` and of the result ``C``.
        L: shared (reduction) dimension of ``A`` and ``B``.
        M: number of columns of ``B`` and of the result ``C``.
        dtype: element type given to both input placeholders.

    Returns:
        A tuple ``(schedule, [A, B, C])``.
    """
    lhs = te.placeholder((N, L), name="A", dtype=dtype)
    rhs = te.placeholder((L, M), name="B", dtype=dtype)
    red = te.reduce_axis((0, L), name="k")
    out = te.compute(
        (N, M),
        lambda i, j: te.sum(lhs[i, red] * rhs[red, j], axis=red),
        name="C",
    )

    sched = te.create_schedule(out.op)
    stage = sched[out]

    row, col = stage.op.axis
    red_axis = stage.op.reduce_axis[0]

    # step 2: obtain the config object for this template invocation
    cfg = autotvm.get_config()

    # step 3: declare the search space (tile_y first, then tile_x)
    cfg.define_knob("tile_y", [1, 2, 4, 8, 16])
    cfg.define_knob("tile_x", [1, 2, 4, 8, 16])

    # step 4: apply whichever knob values the config currently holds
    row_outer, row_inner = stage.split(row, factor=cfg["tile_y"].val)
    col_outer, col_inner = stage.split(col, factor=cfg["tile_x"].val)

    stage.reorder(row_outer, col_outer, red_axis, row_inner, col_inner)

    return sched, [lhs, rhs, out]

**A Matrix Multiplication Template with the Advanced Parameter API**

In [None]:
@autotvm.template("tutorial/matmul")
def matmul(N, L, M, dtype):
    """Matmul template whose tiling is described with ``define_split``.

    Args:
        N: number of rows of ``A`` and of the result ``C``.
        L: shared (reduction) dimension of ``A`` and ``B``.
        M: number of columns of ``B`` and of the result ``C``.
        dtype: element type given to both input placeholders.

    Returns:
        A tuple ``(schedule, [A, B, C])``.
    """
    lhs = te.placeholder((N, L), name="A", dtype=dtype)
    rhs = te.placeholder((L, M), name="B", dtype=dtype)
    red = te.reduce_axis((0, L), name="k")
    out = te.compute(
        (N, M),
        lambda i, j: te.sum(lhs[i, red] * rhs[red, j], axis=red),
        name="C",
    )

    sched = te.create_schedule(out.op)

    row, col = sched[out].op.axis
    red_axis = sched[out].op.reduce_axis[0]

    # ---- search space: split each spatial axis into two parts ----
    cfg = autotvm.get_config()
    cfg.define_split("tile_y", row, num_outputs=2)
    cfg.define_split("tile_x", col, num_outputs=2)
    # ---- end of search space ----

    # Let the selected split entities perform the actual splitting.
    row_outer, row_inner = cfg["tile_y"].apply(sched, out, row)
    col_outer, col_inner = cfg["tile_x"].apply(sched, out, col)

    sched[out].reorder(row_outer, col_outer, red_axis, row_inner, col_inner)

    return sched, [lhs, rhs, out]

**Auto-tuners in TVM**

In [None]:
# Problem size: all three matmul dimensions are 512.
N = L = M = 512

# Instantiate the registered "tutorial/matmul" template for the LLVM target.
task = autotvm.task.create(
    "tutorial/matmul",
    args=(N, L, M, "float32"),
    target="llvm",
)

# Show the search space generated by the template's define_split calls.
print(task.config_space)

ConfigSpace (len=100, range_length=100, space_map=
   0 tile_y: Split(policy=factors, product=512, num_outputs=2) len=10
   1 tile_x: Split(policy=factors, product=512, num_outputs=2) len=10
)


In [None]:
# Logging config (for printing the tuning log to the screen).
def enable_autotvm_stdout_logging(level=logging.DEBUG):
    """Route the ``autotvm`` logger to stdout without duplicating handlers.

    Re-running a cell that calls ``addHandler`` unconditionally attaches one
    more handler each time, so every log line is printed once per re-run.
    The handler installed here carries a fixed name and is attached only if
    no handler with that name is already present, which makes the cell safe
    to execute repeatedly.

    Args:
        level: logging level to set on the ``autotvm`` logger.

    Returns:
        The configured ``autotvm`` logger.
    """
    handler_name = "autotvm-stdout"
    logger = logging.getLogger("autotvm")
    logger.setLevel(level)

    if not any(h.get_name() == handler_name for h in logger.handlers):
        handler = logging.StreamHandler(sys.stdout)
        handler.set_name(handler_name)
        logger.addHandler(handler)

    return logger


enable_autotvm_stdout_logging()

In [None]:
# Measurement setup: build candidates locally and run them on a local
# runner configured with number=5.
runner = autotvm.LocalRunner(number=5)
measure_option = autotvm.measure_option(builder="local", runner=runner)

# Tune with RandomTuner for 10 trials, writing each record to `matmul.log`.
# Other tuners such as XGBTuner can be used instead.
log_callback = autotvm.callback.log_to_file("matmul.log")
tuner = autotvm.tuner.RandomTuner(task)
tuner.tune(
    n_trial=10,
    measure_option=measure_option,
    callbacks=[log_callback],
)

waiting for device...


DEBUG:autotvm:waiting for device...


device available


DEBUG:autotvm:device available


Get devices for measurement successfully!


INFO:autotvm:Get devices for measurement successfully!


No: 1	GFLOPS: 9.50/9.50	result: MeasureResult(costs=(0.0282559126,), error_no=0, all_cost=1.5390675067901611, timestamp=1752211667.752968)	[('tile_y', [-1, 2]), ('tile_x', [-1, 128])],None,71


DEBUG:autotvm:No: 1	GFLOPS: 9.50/9.50	result: MeasureResult(costs=(0.0282559126,), error_no=0, all_cost=1.5390675067901611, timestamp=1752211667.752968)	[('tile_y', [-1, 2]), ('tile_x', [-1, 128])],None,71


No: 2	GFLOPS: 2.68/9.50	result: MeasureResult(costs=(0.1001951884,), error_no=0, all_cost=2.0600357055664062, timestamp=1752211669.627498)	[('tile_y', [-1, 64]), ('tile_x', [-1, 8])],None,36


DEBUG:autotvm:No: 2	GFLOPS: 2.68/9.50	result: MeasureResult(costs=(0.1001951884,), error_no=0, all_cost=2.0600357055664062, timestamp=1752211669.627498)	[('tile_y', [-1, 64]), ('tile_x', [-1, 8])],None,36


No: 3	GFLOPS: 1.52/9.50	result: MeasureResult(costs=(0.1761797076,), error_no=0, all_cost=3.138453245162964, timestamp=1752211672.9152539)	[('tile_y', [-1, 512]), ('tile_x', [-1, 4])],None,29


DEBUG:autotvm:No: 3	GFLOPS: 1.52/9.50	result: MeasureResult(costs=(0.1761797076,), error_no=0, all_cost=3.138453245162964, timestamp=1752211672.9152539)	[('tile_y', [-1, 512]), ('tile_x', [-1, 4])],None,29


No: 4	GFLOPS: 1.43/9.50	result: MeasureResult(costs=(0.1879282398,), error_no=0, all_cost=3.3045709133148193, timestamp=1752211676.173456)	[('tile_y', [-1, 64]), ('tile_x', [-1, 4])],None,26


DEBUG:autotvm:No: 4	GFLOPS: 1.43/9.50	result: MeasureResult(costs=(0.1879282398,), error_no=0, all_cost=3.3045709133148193, timestamp=1752211676.173456)	[('tile_y', [-1, 64]), ('tile_x', [-1, 4])],None,26


No: 5	GFLOPS: 6.51/9.50	result: MeasureResult(costs=(0.0412316204,), error_no=0, all_cost=1.3278255462646484, timestamp=1752211677.710042)	[('tile_y', [-1, 16]), ('tile_x', [-1, 512])],None,94


DEBUG:autotvm:No: 5	GFLOPS: 6.51/9.50	result: MeasureResult(costs=(0.0412316204,), error_no=0, all_cost=1.3278255462646484, timestamp=1752211677.710042)	[('tile_y', [-1, 16]), ('tile_x', [-1, 512])],None,94


No: 6	GFLOPS: 11.58/11.58	result: MeasureResult(costs=(0.023189751,), error_no=0, all_cost=1.0741856098175049, timestamp=1752211678.3375347)	[('tile_y', [-1, 32]), ('tile_x', [-1, 64])],None,65


DEBUG:autotvm:No: 6	GFLOPS: 11.58/11.58	result: MeasureResult(costs=(0.023189751,), error_no=0, all_cost=1.0741856098175049, timestamp=1752211678.3375347)	[('tile_y', [-1, 32]), ('tile_x', [-1, 64])],None,65


No: 7	GFLOPS: 10.81/11.58	result: MeasureResult(costs=(0.0248224996,), error_no=0, all_cost=0.7989456653594971, timestamp=1752211679.2609417)	[('tile_y', [-1, 128]), ('tile_x', [-1, 64])],None,67


DEBUG:autotvm:No: 7	GFLOPS: 10.81/11.58	result: MeasureResult(costs=(0.0248224996,), error_no=0, all_cost=0.7989456653594971, timestamp=1752211679.2609417)	[('tile_y', [-1, 128]), ('tile_x', [-1, 64])],None,67


No: 8	GFLOPS: 10.84/11.58	result: MeasureResult(costs=(0.024761730399999998,), error_no=0, all_cost=0.8073360919952393, timestamp=1752211679.9346428)	[('tile_y', [-1, 32]), ('tile_x', [-1, 256])],None,85


DEBUG:autotvm:No: 8	GFLOPS: 10.84/11.58	result: MeasureResult(costs=(0.024761730399999998,), error_no=0, all_cost=0.8073360919952393, timestamp=1752211679.9346428)	[('tile_y', [-1, 32]), ('tile_x', [-1, 256])],None,85


No: 9	GFLOPS: 0.77/11.58	result: MeasureResult(costs=(0.3473268688,), error_no=0, all_cost=5.884622573852539, timestamp=1752211686.1256928)	[('tile_y', [-1, 32]), ('tile_x', [-1, 2])],None,15


DEBUG:autotvm:No: 9	GFLOPS: 0.77/11.58	result: MeasureResult(costs=(0.3473268688,), error_no=0, all_cost=5.884622573852539, timestamp=1752211686.1256928)	[('tile_y', [-1, 32]), ('tile_x', [-1, 2])],None,15


No: 10	GFLOPS: 9.57/11.58	result: MeasureResult(costs=(0.028058876600000004,), error_no=0, all_cost=0.9659364223480225, timestamp=1752211686.8450854)	[('tile_y', [-1, 256]), ('tile_x', [-1, 128])],None,78


DEBUG:autotvm:No: 10	GFLOPS: 9.57/11.58	result: MeasureResult(costs=(0.028058876600000004,), error_no=0, all_cost=0.9659364223480225, timestamp=1752211686.8450854)	[('tile_y', [-1, 256]), ('tile_x', [-1, 128])],None,78


In [None]:
# Rebuild the kernel while the records in `matmul.log` are active, so the
# template is instantiated from the tuning history.
with autotvm.apply_history_best("matmul.log"), tvm.target.Target("llvm"):
    s, arg_bufs = matmul(N, L, M, "float32")
    func = tvm.build(s, arg_bufs)

# Correctness check: compare the compiled kernel against NumPy on random
# float32 inputs.
a_np = np.random.uniform(size=(N, L)).astype(np.float32)
b_np = np.random.uniform(size=(L, M)).astype(np.float32)
c_np = np.dot(a_np, b_np)

# Output buffer with the same shape as the NumPy reference.
c_tvm = tvm.nd.empty(c_np.shape)
func(tvm.nd.array(a_np), tvm.nd.array(b_np), c_tvm)

tvm.testing.assert_allclose(c_np, c_tvm.numpy(), rtol=1e-4)

Finish loading 10 records


DEBUG:autotvm:Finish loading 10 records


In [None]:
# Lower the tuned schedule and print the resulting IR so the chosen loop
# structure can be inspected.
lowered_mod = tvm.lower(s, arg_bufs, simple_mode=True)
print(lowered_mod)

# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.Buffer((512, 512), "float32"), B: T.Buffer((512, 512), "float32"), C: T.Buffer((512, 512), "float32")):
        T.func_attr({"from_legacy_te_schedule": T.bool(True), "tir.noalias": T.bool(True)})
        for i_outer, j_outer in T.grid(16, 8):
            C_1 = T.Buffer((262144,), data=C.data)
            for i_inner_init, j_inner_init in T.grid(32, 64):
                C_1[i_outer * 16384 + i_inner_init * 512 + j_outer * 64 + j_inner_init] = T.float32(0)
            for k, i_inner, j_inner in T.grid(512, 32, 64):
                cse_var_3: T.int32 = j_outer * 64
                cse_var_2: T.int32 = i_outer * 16384 + i_inner * 512
                cse_var_1: T.int32 = cse_var_2 + cse_var_3 + j_inner
                A_1 = T.Buffer((262144,), data=A.data)
                B_1 = T.Buffer((262144,), data=B.data)
                C_1[cse_var_1] = C_1[cse_var_1] + A_1