<a href="https://colab.research.google.com/github/seongheechoi/education/blob/main/%EC%8B%A4%EC%8A%B5_2_2_work_with_tensor_expression_and_schedules.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **TVM 실습자료 2.2: Working with Tensor Expression and Schedules**

In [None]:
# Pin NumPy to 1.26.4 for this notebook.
# NOTE(review): in the recorded run below, `np.__version__` still printed 2.0.2
# while `pip list` reported 1.26.4 -- presumably NumPy was already loaded in the
# running kernel, so the runtime has to be restarted after this cell for the
# pin to take effect (TODO confirm on a fresh runtime).
!pip install numpy==1.26.4
import numpy as np
print(np.__version__)
!pip list | grep numpy

Collecting numpy==1.26.4
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m111.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstalled numpy-2.0.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
opencv-python-headless 4.12.0.88 requires numpy

2.0.2
numpy                                 1.26.4


In [None]:
# Install the prebuilt TVM wheel from PyPI (Linux/macOS, CPU-only build).
# See tlcpack.ai for other pre-built binaries, including CUDA-enabled ones.
# NOTE(review): the package is not version-pinned; the recorded run of this
# cell installed apache-tvm 0.14.dev273.
!python -m pip install --upgrade pip
!pip install apache-tvm

Collecting pip
  Downloading pip-25.1.1-py3-none-any.whl.metadata (3.6 kB)
Downloading pip-25.1.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.1.1
Collecting apache-tvm
  Downloading apache_tvm-0.14.dev273-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.3 kB)
Downloading apache_tvm-0.14.dev273-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (69.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.1/69.1 MB[0m [31m52.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: apache-tvm
Successfully installed apache-tvm-0.14.dev273


# **Schedule Primitives in TVM**

In [None]:
from __future__ import absolute_import, print_function


import tvm
from tvm import te
import numpy as np

In [None]:
# Symbolic dimensions; later cells reuse `m` and `n` for their own tensors.
n = te.var("n")
m = te.var("m")

# Declare an element-wise multiplication of two (m, n) matrices.
A = te.placeholder((m, n), name="A")
B = te.placeholder((m, n), name="B")
C = te.compute((m, n), lambda i, j: A[i, j] * B[i, j], name="C")

# Create a schedule for C; no schedule primitive is applied yet.
s = te.create_schedule([C.op])

In [None]:
# `tvm.lower` transforms the scheduled computation from its definition into a
# lowered function. With `simple_mode=True` the result is a readable listing
# of the generated loop nest, so printing it is a convenient way to inspect
# what a schedule does.
print(tvm.lower(s, [A, B, C], simple_mode=True))

# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.handle, B: T.handle, C: T.handle):
        T.func_attr({"from_legacy_te_schedule": T.bool(True), "tir.noalias": T.bool(True)})
        m, n = T.int32(), T.int32()
        A_1 = T.match_buffer(A, (m, n), strides=("stride", "stride"), buffer_type="auto")
        B_1 = T.match_buffer(B, (m, n), strides=("stride", "stride"), buffer_type="auto")
        C_1 = T.match_buffer(C, (m, n), strides=("stride", "stride"), buffer_type="auto")
        for i, j in T.grid(m, n):
            C_2 = T.Buffer((C_1.strides[0] * m,), data=C_1.data, buffer_type="auto")
            A_2 = T.Buffer((A_1.strides[0] * m,), data=A_1.data, buffer_type="auto")
            B_2 = T.Buffer((B_1.strides[0] * m,), data=B_1.data, buffer_type="auto")
            C_2[i * C_1.strides[0] + j * C_1.strides[1]] = A_2[i * A_1.strides[0] + j * A_1.strides[1]] * B_2[i * B_1.strides[0] + j * B_1.strides[1

**split**

In [None]:
# 1-D example: B[i] = A[i] * 2.
A = te.placeholder((m,), name="A")
B = te.compute((m,), lambda i: A[i] * 2, name="B")

s = te.create_schedule(B.op)
# Split B's only axis with factor=32. In the listing printed below this shows
# up as an outer loop of extent (m + 31) // 32 and an inner loop of extent 32,
# guarded by a bounds check against m.
xo, xi = s[B].split(B.op.axis[0], factor=32)
print(tvm.lower(s, [A, B], simple_mode=True))

# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.handle, B: T.handle):
        T.func_attr({"from_legacy_te_schedule": T.bool(True), "tir.noalias": T.bool(True)})
        m = T.int32()
        A_1 = T.match_buffer(A, (m,), strides=("stride",), buffer_type="auto")
        B_1 = T.match_buffer(B, (m,), strides=("stride",), buffer_type="auto")
        for i_outer, i_inner in T.grid((m + 31) // 32, 32):
            if T.likely(i_outer * 32 + i_inner < m):
                B_2 = T.Buffer((B_1.strides[0] * m,), data=B_1.data, buffer_type="auto")
                A_2 = T.Buffer((A_1.strides[0] * m,), data=A_1.data, buffer_type="auto")
                cse_var_1: T.int32 = i_outer * 32 + i_inner
                B_2[cse_var_1 * B_1.strides[0]] = A_2[cse_var_1 * A_1.strides[0]] * T.float32(2)


In [None]:
# 1-D copy: B[i] = A[i].
A = te.placeholder((m,), name="A")
B = te.compute((m,), lambda i: A[i], name="B")

s = te.create_schedule(B.op)
# Split with nparts=32 instead of factor: in the listing below the outer loop
# has the fixed extent 32 and the inner loop has extent (m + 31) // 32.
bx, tx = s[B].split(B.op.axis[0], nparts=32)
print(tvm.lower(s, [A, B], simple_mode=True))

# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.handle, B: T.handle):
        T.func_attr({"from_legacy_te_schedule": T.bool(True), "tir.noalias": T.bool(True)})
        m = T.int32()
        A_1 = T.match_buffer(A, (m,), strides=("stride",), buffer_type="auto")
        B_1 = T.match_buffer(B, (m,), strides=("stride",), buffer_type="auto")
        for i_outer, i_inner in T.grid(32, (m + 31) // 32):
            if T.likely(i_inner + i_outer * ((m + 31) // 32) < m):
                B_2 = T.Buffer((B_1.strides[0] * m,), data=B_1.data, buffer_type="auto")
                A_2 = T.Buffer((A_1.strides[0] * m,), data=A_1.data, buffer_type="auto")
                B_2[(i_inner + i_outer * ((m + 31) // 32)) * B_1.strides[0]] = A_2[(i_inner + i_outer * ((m + 31) // 32)) * A_1.strides[0]]


**tile**

In [None]:
# 2-D copy: B[i, j] = A[i, j].
A = te.placeholder((m, n), name="A")
B = te.compute((m, n), lambda i, j: A[i, j], name="B")

s = te.create_schedule(B.op)
# Tile both axes (10 along i, 5 along j). The four resulting axes appear in the
# listing below as i_outer, j_outer, i_inner (extent 10) and j_inner (extent 5).
xo, yo, xi, yi = s[B].tile(B.op.axis[0], B.op.axis[1], x_factor=10, y_factor=5)
print(tvm.lower(s, [A, B], simple_mode=True))

# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.handle, B: T.handle):
        T.func_attr({"from_legacy_te_schedule": T.bool(True), "tir.noalias": T.bool(True)})
        m, n = T.int32(), T.int32()
        A_1 = T.match_buffer(A, (m, n), strides=("stride", "stride"), buffer_type="auto")
        B_1 = T.match_buffer(B, (m, n), strides=("stride", "stride"), buffer_type="auto")
        for i_outer, j_outer, i_inner in T.grid((m + 9) // 10, (n + 4) // 5, 10):
            if T.likely(i_outer * 10 + i_inner < m):
                for j_inner in range(5):
                    if T.likely(j_outer * 5 + j_inner < n):
                        cse_var_2: T.int32 = j_outer * 5 + j_inner
                        cse_var_1: T.int32 = i_outer * 10 + i_inner
                        B_2 = T.Buffer((B_1.strides[0] * m,), data=B_1.data, buffer_type="auto")
                        A_2 = T.Buffer((A_1.strides[0] * m,), data=A_1.d

**fuse**

In [None]:
# 2-D copy: B[i, j] = A[i, j].
A = te.placeholder((m, n), name="A")
B = te.compute((m, n), lambda i, j: A[i, j], name="B")

s = te.create_schedule(B.op)
# Tile into four axes first: (i.outer, j.outer, i.inner, j.inner).
xo, yo, xi, yi = s[B].tile(B.op.axis[0], B.op.axis[1], x_factor=10, y_factor=5)
# Then fuse (i.inner, j.inner) into one axis: (i.inner.j.inner.fused).
# In the listing below the fused loop has extent 50 (10 * 5) and the original
# indices are recovered with // 5 and % 5.
fused = s[B].fuse(xi, yi)
print(tvm.lower(s, [A, B], simple_mode=True))

# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.handle, B: T.handle):
        T.func_attr({"from_legacy_te_schedule": T.bool(True), "tir.noalias": T.bool(True)})
        m, n = T.int32(), T.int32()
        A_1 = T.match_buffer(A, (m, n), strides=("stride", "stride"), buffer_type="auto")
        B_1 = T.match_buffer(B, (m, n), strides=("stride", "stride"), buffer_type="auto")
        for i_outer, j_outer, i_inner_j_inner_fused in T.grid((m + 9) // 10, (n + 4) // 5, 50):
            if T.likely(i_outer * 10 + i_inner_j_inner_fused // 5 < m):
                if T.likely(j_outer * 5 + i_inner_j_inner_fused % 5 < n):
                    cse_var_2: T.int32 = j_outer * 5 + i_inner_j_inner_fused % 5
                    cse_var_1: T.int32 = i_outer * 10 + i_inner_j_inner_fused // 5
                    B_2 = T.Buffer((B_1.strides[0] * m,), data=B_1.data, buffer_type="auto")
                    A_2 = T.Buffer((A_1.s

In [None]:
# Print the lowered form of the fused schedule from the previous cell again.
# NOTE(review): the output recorded below uses a different textual IR format
# (`primfn(...)`) than the other cells -- presumably left over from a run with
# an older TVM; re-execute the cell to refresh it.
print(tvm.lower(s, [A, B], simple_mode=True))

@main = primfn(A_1: handle, B_1: handle) -> ()
  attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
  buffers = {A: Buffer(A_2: Pointer(float32), float32, [m: int32, n: int32], [stride: int32, stride_1: int32], type="auto"),
             B: Buffer(B_2: Pointer(float32), float32, [m, n], [stride_2: int32, stride_3: int32], type="auto")}
  buffer_map = {A_1: A, B_1: B} {
  for (i.outer: int32, 0, floordiv((m + 9), 10)) {
    for (j.outer: int32, 0, floordiv((n + 4), 5)) {
      for (i.inner.j.inner.fused: int32, 0, 50) {
        if @tir.likely((((i.outer*10) + floordiv(i.inner.j.inner.fused, 5)) < m), dtype=bool) {
          if @tir.likely((((j.outer*5) + floormod(i.inner.j.inner.fused, 5)) < n), dtype=bool) {
            let cse_var_2: int32 = ((j.outer*5) + floormod(i.inner.j.inner.fused, 5))
            let cse_var_1: int32 = ((i.outer*10) + floordiv(i.inner.j.inner.fused, 5))
            B_3: Buffer(B_2, float32, [(stride_2*m)], [], type="auto")[((

**reorder**

In [None]:
# 2-D copy: B[i, j] = A[i, j].
A = te.placeholder((m, n), name="A")
B = te.compute((m, n), lambda i, j: A[i, j], name="B")

s = te.create_schedule(B.op)
# Tile into four axes first: (i.outer, j.outer, i.inner, j.inner).
xo, yo, xi, yi = s[B].tile(B.op.axis[0], B.op.axis[1], x_factor=10, y_factor=5)
# Then reorder the axes to (i.inner, j.outer, i.outer, j.inner); the loop nest
# in the listing below follows exactly this order.
s[B].reorder(xi, yo, xo, yi)
print(tvm.lower(s, [A, B], simple_mode=True))

# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.handle, B: T.handle):
        T.func_attr({"from_legacy_te_schedule": T.bool(True), "tir.noalias": T.bool(True)})
        m, n = T.int32(), T.int32()
        A_1 = T.match_buffer(A, (m, n), strides=("stride", "stride"), buffer_type="auto")
        B_1 = T.match_buffer(B, (m, n), strides=("stride", "stride"), buffer_type="auto")
        for i_inner, j_outer, i_outer in T.grid(10, (n + 4) // 5, (m + 9) // 10):
            if T.likely(i_outer * 10 + i_inner < m):
                for j_inner in range(5):
                    if T.likely(j_outer * 5 + j_inner < n):
                        cse_var_2: T.int32 = j_outer * 5 + j_inner
                        cse_var_1: T.int32 = i_outer * 10 + i_inner
                        B_2 = T.Buffer((B_1.strides[0] * m,), data=B_1.data, buffer_type="auto")
                        A_2 = T.Buffer((A_1.strides[0] * m,), data=A_1.d

**bind**

In [None]:
# 1-D example: B[i] = A[i] * 2.
A = te.placeholder((n,), name="A")
B = te.compute(A.shape, lambda i: A[i] * 2, name="B")

s = te.create_schedule(B.op)
bx, tx = s[B].split(B.op.axis[0], factor=64)
# Bind the outer/inner axes to the thread axes "blockIdx.x" / "threadIdx.x".
# In the listing below the two loops are replaced by T.launch_thread with
# extents (n + 63) // 64 and 64.
s[B].bind(bx, te.thread_axis("blockIdx.x"))
s[B].bind(tx, te.thread_axis("threadIdx.x"))
print(tvm.lower(s, [A, B], simple_mode=True))

# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.handle, B: T.handle):
        T.func_attr({"from_legacy_te_schedule": T.bool(True), "tir.noalias": T.bool(True)})
        n = T.int32()
        A_1 = T.match_buffer(A, (n,), strides=("stride",), buffer_type="auto")
        B_1 = T.match_buffer(B, (n,), strides=("stride",), buffer_type="auto")
        blockIdx_x = T.launch_thread("blockIdx.x", (n + 63) // 64)
        threadIdx_x = T.launch_thread("threadIdx.x", 64)
        if T.likely(blockIdx_x * 64 + threadIdx_x < n):
            B_2 = T.Buffer((B_1.strides[0] * n,), data=B_1.data, buffer_type="auto")
            A_2 = T.Buffer((A_1.strides[0] * n,), data=A_1.data, buffer_type="auto")
            B_2[(blockIdx_x * 64 + threadIdx_x) * B_1.strides[0]] = A_2[(blockIdx_x * 64 + threadIdx_x) * A_1.strides[0]] * T.float32(2)


**compute_at**

In [None]:
# Two chained stages: B[i] = A[i] + 1, then C[i] = B[i] * 2.
A = te.placeholder((m,), name="A")
B = te.compute((m,), lambda i: A[i] + 1, name="B")
C = te.compute((m,), lambda i: B[i] * 2, name="C")

# Default schedule: in the listing below B and C are computed in two separate
# loops over m.
s = te.create_schedule(C.op)
print(tvm.lower(s, [A, B, C], simple_mode=True))

# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.handle, B: T.handle, C: T.handle):
        T.func_attr({"from_legacy_te_schedule": T.bool(True), "tir.noalias": T.bool(True)})
        m = T.int32()
        A_1 = T.match_buffer(A, (m,), strides=("stride",), buffer_type="auto")
        B_1 = T.match_buffer(B, (m,), strides=("stride",), buffer_type="auto")
        C_1 = T.match_buffer(C, (m,), strides=("stride",), buffer_type="auto")
        B_2 = T.Buffer((B_1.strides[0] * m,), data=B_1.data, buffer_type="auto")
        for i in range(m):
            A_2 = T.Buffer((A_1.strides[0] * m,), data=A_1.data, buffer_type="auto")
            B_2[i * B_1.strides[0]] = A_2[i * A_1.strides[0]] + T.float32(1)
        for i in range(m):
            C_2 = T.Buffer((C_1.strides[0] * m,), data=C_1.data, buffer_type="auto")
            C_2[i * C_1.strides[0]] = B_2[i * B_1.strides[0]] * T.float32(2)


In [None]:
# Two chained stages: B[i] = A[i] + 1, then C[i] = B[i] * 2.
A = te.placeholder((m,), name="A")
B = te.compute((m,), lambda i: A[i] + 1, name="B")
C = te.compute((m,), lambda i: B[i] * 2, name="C")

s = te.create_schedule(C.op)
# Attach the computation of B to C's first axis: in the listing below both
# stores happen inside a single loop over m.
s[B].compute_at(s[C], C.op.axis[0])
print(tvm.lower(s, [A, B, C], simple_mode=True))

# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.handle, B: T.handle, C: T.handle):
        T.func_attr({"from_legacy_te_schedule": T.bool(True), "tir.noalias": T.bool(True)})
        m = T.int32()
        A_1 = T.match_buffer(A, (m,), strides=("stride",), buffer_type="auto")
        B_1 = T.match_buffer(B, (m,), strides=("stride",), buffer_type="auto")
        C_1 = T.match_buffer(C, (m,), strides=("stride",), buffer_type="auto")
        for i in range(m):
            B_2 = T.Buffer((B_1.strides[0] * m,), data=B_1.data, buffer_type="auto")
            A_2 = T.Buffer((A_1.strides[0] * m,), data=A_1.data, buffer_type="auto")
            B_2[i * B_1.strides[0]] = A_2[i * A_1.strides[0]] + T.float32(1)
            C_2 = T.Buffer((C_1.strides[0] * m,), data=C_1.data, buffer_type="auto")
            C_2[i * C_1.strides[0]] = B_2[i * B_1.strides[0]] * T.float32(2)


**compute_inline**

In [None]:
# Two chained stages: B[i] = A[i] + 1, then C[i] = B[i] * 2.
A = te.placeholder((m,), name="A")
B = te.compute((m,), lambda i: A[i] + 1, name="B")
C = te.compute((m,), lambda i: B[i] * 2, name="C")

s = te.create_schedule(C.op)
# Inline B into its consumer: the listing below has no store to B and C is
# computed directly as (A[i] + 1) * 2.
s[B].compute_inline()
print(tvm.lower(s, [A, B, C], simple_mode=True))

# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.handle, B: T.handle, C: T.handle):
        T.func_attr({"from_legacy_te_schedule": T.bool(True), "tir.noalias": T.bool(True)})
        m = T.int32()
        A_1 = T.match_buffer(A, (m,), strides=("stride",), buffer_type="auto")
        B_1 = T.match_buffer(B, (m,), strides=("stride",), buffer_type="auto")
        C_1 = T.match_buffer(C, (m,), strides=("stride",), buffer_type="auto")
        for i in range(m):
            C_2 = T.Buffer((C_1.strides[0] * m,), data=C_1.data, buffer_type="auto")
            A_2 = T.Buffer((A_1.strides[0] * m,), data=A_1.data, buffer_type="auto")
            C_2[i * C_1.strides[0]] = (A_2[i * A_1.strides[0]] + T.float32(1)) * T.float32(2)


**compute_root**

In [None]:
# Two chained stages: B[i] = A[i] + 1, then C[i] = B[i] * 2.
A = te.placeholder((m,), name="A")
B = te.compute((m,), lambda i: A[i] + 1, name="B")
C = te.compute((m,), lambda i: B[i] * 2, name="C")

s = te.create_schedule(C.op)
s[B].compute_at(s[C], C.op.axis[0])
# compute_root undoes the compute_at above: the listing below again shows B
# and C in two separate loops, as in the default schedule.
s[B].compute_root()
print(tvm.lower(s, [A, B, C], simple_mode=True))

# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.handle, B: T.handle, C: T.handle):
        T.func_attr({"from_legacy_te_schedule": T.bool(True), "tir.noalias": T.bool(True)})
        m = T.int32()
        A_1 = T.match_buffer(A, (m,), strides=("stride",), buffer_type="auto")
        B_1 = T.match_buffer(B, (m,), strides=("stride",), buffer_type="auto")
        C_1 = T.match_buffer(C, (m,), strides=("stride",), buffer_type="auto")
        B_2 = T.Buffer((B_1.strides[0] * m,), data=B_1.data, buffer_type="auto")
        for i in range(m):
            A_2 = T.Buffer((A_1.strides[0] * m,), data=A_1.data, buffer_type="auto")
            B_2[i * B_1.strides[0]] = A_2[i * A_1.strides[0]] + T.float32(1)
        for i in range(m):
            C_2 = T.Buffer((C_1.strides[0] * m,), data=C_1.data, buffer_type="auto")
            C_2[i * C_1.strides[0]] = B_2[i * B_1.strides[0]] * T.float32(2)


# **Reduction**

In [None]:
from __future__ import absolute_import, print_function


import tvm
import tvm.testing
from tvm import te
import numpy as np

**Describe Sum of Rows**

In [None]:
# Row sums of an (n, m) matrix: B[i] = sum over k of A[i, k], with the
# reduction axis k ranging over [0, m).
n = te.var("n")
m = te.var("m")
A = te.placeholder((n, m), name="A")
k = te.reduce_axis((0, m), "k")
B = te.compute((n,), lambda i: te.sum(A[i, k], axis=k), name="B")

**Schedule the Reduction**

In [None]:
# Default schedule for the reduction: in the listing below each B[i] is first
# set to 0 and then accumulated in an inner loop over k.
s = te.create_schedule(B.op)
print(tvm.lower(s, [A, B], simple_mode=True))

# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.handle, B: T.handle):
        T.func_attr({"from_legacy_te_schedule": T.bool(True), "tir.noalias": T.bool(True)})
        n, m = T.int32(), T.int32()
        A_1 = T.match_buffer(A, (n, m), strides=("stride", "stride"), buffer_type="auto")
        B_1 = T.match_buffer(B, (n,), strides=("stride",), buffer_type="auto")
        for i in range(n):
            B_2 = T.Buffer((B_1.strides[0] * n,), data=B_1.data, buffer_type="auto")
            B_2[i * B_1.strides[0]] = T.float32(0)
            for k in range(m):
                A_2 = T.Buffer((A_1.strides[0] * n,), data=A_1.data, buffer_type="auto")
                B_2[i * B_1.strides[0]] = B_2[i * B_1.strides[0]] + A_2[i * A_1.strides[0] + k * A_1.strides[1]]


In [None]:
# Start from a fresh schedule so this cell can be re-run on its own: the
# previous cell's schedule had no primitives applied, so the result is the
# same, but splitting an axis that an earlier run of this cell already split
# is rejected by TVM.
s = te.create_schedule(B.op)
# Split the reduction axis by 16 and the row axis by 32.
ko, ki = s[B].split(B.op.reduce_axis[0], factor=16)
xo, xi = s[B].split(B.op.axis[0], factor=32)
print(tvm.lower(s, [A, B], simple_mode=True))

# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.handle, B: T.handle):
        T.func_attr({"from_legacy_te_schedule": T.bool(True), "tir.noalias": T.bool(True)})
        n, m = T.int32(), T.int32()
        A_1 = T.match_buffer(A, (n, m), strides=("stride", "stride"), buffer_type="auto")
        B_1 = T.match_buffer(B, (n,), strides=("stride",), buffer_type="auto")
        for i_outer, i_inner in T.grid((n + 31) // 32, 32):
            B_2 = T.Buffer((B_1.strides[0] * n,), data=B_1.data, buffer_type="auto")
            if T.likely(i_outer * 32 + i_inner < n):
                B_2[(i_outer * 32 + i_inner) * B_1.strides[0]] = T.float32(0)
            if T.likely(i_outer * 32 + i_inner < n):
                for k_outer, k_inner in T.grid((m + 15) // 16, 16):
                    if T.likely(k_outer * 16 + k_inner < m):
                        A_2 = T.Buffer((A_1.strides[0] * n,), data=A_1.data, buffer_type="aut

**Reduction Factoring and Parallelization**

In [None]:
# Fresh schedule: split the reduction axis by 16, then factor the reduction
# along the inner part (ki) with rfactor. In the listing below this adds an
# intermediate buffer B.rf with 16 * n elements, filled by loops over
# k_inner (extent 16) and i.
s = te.create_schedule(B.op)
ko, ki = s[B].split(B.op.reduce_axis[0], factor=16)
BF = s.rfactor(B, ki)
print(tvm.lower(s, [A, B], simple_mode=True))

# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.handle, B: T.handle):
        T.func_attr({"from_legacy_te_schedule": T.bool(True), "tir.noalias": T.bool(True)})
        n, m = T.int32(), T.int32()
        A_1 = T.match_buffer(A, (n, m), strides=("stride", "stride"), buffer_type="auto")
        B_1 = T.match_buffer(B, (n,), strides=("stride",), buffer_type="auto")
        B_rf = T.allocate([n * 16], "float32", "global")
        B_rf_1 = T.Buffer((16 * n,), data=B_rf)
        for k_inner, i in T.grid(16, n):
            B_rf_1[k_inner * n + i] = T.float32(0)
            for k_outer in range((m + 15) // 16):
                if T.likely(k_outer * 16 + k_inner < m):
                    A_2 = T.Buffer((A_1.strides[0] * n,), data=A_1.data, buffer_type="auto")
                    B_rf_1[k_inner * n + i] = B_rf_1[k_inner * n + i] + A_2[i * A_1.strides[0] + (k_outer * 16 + k_inner) * A_1.strides[1]]
        for ax

In [None]:
# Inspect B's compute body after rfactor: as printed below, it is now a sum
# over B.rf along an axis of extent 16.
print(s[B].op.body)

[T.reduce(T.comm_reducer(lambda x, y: x + y, [T.float32(0)]), source=[B.rf[k_inner_v, ax0]], init=[], axis=[T.iter_var(k_inner_v, T.Range(0, 16), "CommReduce", "")], condition=T.bool(True), value_index=0)]


**Describe Convolution via 2D Reduction**

In [None]:
# 2-D convolution of an (n, n) input with a 3x3 filter, written as a reduction
# over two axes (di, dj). No padding is applied, so the output is
# (n - 2, n - 2).
n = te.var("n")
Input = te.placeholder((n, n), name="Input")
Filter = te.placeholder((3, 3), name="Filter")
di = te.reduce_axis((0, 3), name="di")
dj = te.reduce_axis((0, 3), name="dj")
Output = te.compute(
    (n - 2, n - 2),
    lambda i, j: te.sum(Input[i + di, j + dj] * Filter[di, dj], axis=[di, dj]),
    name="Output",
)
s = te.create_schedule(Output.op)
print(tvm.lower(s, [Input, Filter, Output], simple_mode=True))

# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(Input: T.handle, Filter: T.Buffer((3, 3), "float32"), Output: T.handle):
        T.func_attr({"from_legacy_te_schedule": T.bool(True), "tir.noalias": T.bool(True)})
        n = T.int32()
        Input_1 = T.match_buffer(Input, (n, n), strides=("stride", "stride"), buffer_type="auto")
        Output_1 = T.match_buffer(Output, (n - 2, n - 2))
        for i, j in T.grid(n - 2, n - 2):
            Output_2 = T.Buffer(((n - 2) * (n - 2),), data=Output_1.data)
            Output_2[i * (n - 2) + j] = T.float32(0)
            for di, dj in T.grid(3, 3):
                Input_2 = T.Buffer((Input_1.strides[0] * n,), data=Input_1.data, buffer_type="auto")
                Filter_1 = T.Buffer((9,), data=Filter.data)
                Output_2[i * (n - 2) + j] = Output_2[i * (n - 2) + j] + Input_2[(i + di) * Input_1.strides[0] + (j + dj) * Input_1.strides[1]] * Filter_1[di * 3 +

**Define General Commutative Reduction Operation**

In [None]:
n = te.var("n")
m = te.var("m")
# Custom commutative reducer: combines two values with x * y and uses the
# constant 1 (in the operand dtype) as its initial value.
product = te.comm_reducer(lambda x, y: x * y, lambda t: tvm.tir.const(1, dtype=t), name="product")
A = te.placeholder((n, m), name="A")
k = te.reduce_axis((0, m), name="k")
# B[i] = product over k of A[i, k].
B = te.compute((n,), lambda i: product(A[i, k], axis=k), name="B")

# **Intrinsics and Math Functions**

In [None]:
from __future__ import absolute_import, print_function

import numpy as np

import tvm
from tvm import te
from tvm.ir import register_op_attr, register_intrin_lowering

**Direct Declare Extern Math Call**

In [None]:
# Call the extern function `__expf` directly through a pure extern call.
# NOTE(review): this builds for the "cuda" target, while the install cell at
# the top of the notebook describes the pip wheel as a CPU-only build, so this
# cell presumably fails on that install -- confirm, or use a CUDA-enabled TVM.
n = te.var("n")
A = te.placeholder((n,), name="A")
B = te.compute(A.shape, lambda i: tvm.tir.call_pure_extern("float32", "__expf", A[i]), name="B")
s = te.create_schedule(B.op)
num_thread = 64
# Split into chunks of num_thread elements and bind them to block/thread axes.
bx, tx = s[B].split(B.op.axis[0], factor=num_thread)
s[B].bind(bx, te.thread_axis("blockIdx.x"))
s[B].bind(tx, te.thread_axis("threadIdx.x"))
f = tvm.build(s, [A, B], "cuda", name="myexp")
# Print the source of the first imported module of the built function.
print(f.imported_modules[0].get_source())

**Unified Intrinsic Call**

In [None]:
# Same computation, but expressed with the `te.exp` intrinsic instead of a
# hard-coded extern function name, so the same schedule can be built for more
# than one target (cuda here, opencl in the next cell).
# NOTE(review): presumably requires a CUDA-enabled TVM build -- confirm.
n = te.var("n")
A = te.placeholder((n,), name="A")
B = te.compute(A.shape, lambda i: te.exp(A[i]), name="B")
s = te.create_schedule(B.op)
num_thread = 64
bx, tx = s[B].split(B.op.axis[0], factor=num_thread)
s[B].bind(bx, te.thread_axis("blockIdx.x"))
s[B].bind(tx, te.thread_axis("threadIdx.x"))
fcuda = tvm.build(s, [A, B], "cuda", name="myexp")
print(fcuda.imported_modules[0].get_source())

In [None]:
# Build the same schedule for the "opencl" target and print its source.
# NOTE(review): presumably requires a TVM build with OpenCL enabled -- confirm.
fopencl = tvm.build(s, [A, B], "opencl", name="myexp")
print(fopencl.imported_modules[0].get_source())

**Intrinsic Lowering Rule**

In [None]:
def my_cuda_math_rule(op):
    """Customized CUDA intrinsic lowering rule.

    Rewrites a ``tir.<name>`` intrinsic call on its first argument into a pure
    extern call: ``<name>f`` for float32 and ``<name>`` for float64.

    Parameters
    ----------
    op : tvm.tir.Call
        The intrinsic call to lower; its op name must start with ``"tir."``.

    Returns
    -------
    The extern call for float32/float64 operands; ``op`` itself for any other
    dtype (no translation is attempted).
    """
    assert isinstance(op, tvm.tir.Call)
    name = op.op.name
    assert name.startswith("tir.")
    dispatch_name = name[4:]  # drop the "tir." prefix
    if op.dtype == "float32":
        # call float function
        return tvm.tir.call_pure_extern("float32", "%sf" % dispatch_name, op.args[0])
    elif op.dtype == "float64":
        # call double function; the call's result dtype has to be float64 as
        # well (it was previously declared as float32 here)
        return tvm.tir.call_pure_extern("float64", dispatch_name, op.args[0])
    else:
        # cannot do translation, return self.
        return op


# Register my_cuda_math_rule as the lowering of `tir.exp` for the "cuda" target.
# NOTE(review): level=99 presumably gives it priority over the built-in rule --
# confirm against the register_intrin_lowering documentation.
register_intrin_lowering("tir.exp", target="cuda", f=my_cuda_math_rule, level=99)

In [None]:
# Rebuild the `te.exp` schedule for CUDA and print the source again, so the
# effect of the lowering rule registered for `tir.exp` can be inspected.
fcuda = tvm.build(s, [A, B], "cuda", name="myexp")
print(fcuda.imported_modules[0].get_source())

**Add Your Own Intrinsic**

In [None]:
def mylog(x):
    """Customized log intrinsic.

    Wraps ``x`` in a call to the ``tir.mylog`` intrinsic; the call carries the
    same dtype as ``x``.
    """
    result_dtype = x.dtype
    return tvm.tir.call_intrin(result_dtype, "tir.mylog", x)


def my_cuda_mylog_rule(op):
    """CUDA lowering rule for log.

    Maps a float32 call to the extern function ``logf`` and a float64 call to
    ``log``, both applied to the call's first argument. Calls of any other
    dtype are returned unchanged.
    """
    extern_by_dtype = (("float32", "logf"), ("float64", "log"))
    for dtype, extern_name in extern_by_dtype:
        if op.dtype == dtype:
            return tvm.tir.call_pure_extern(dtype, extern_name, op.args[0])
    return op


# Registering an attribute of the op is what triggers registration of the new
# op itself; here `tir.mylog` is declared as a pure call.
register_op_attr("tir.mylog", "TCallEffectKind", tvm.tir.CallEffectKind.Pure)
# Use my_cuda_mylog_rule to lower `tir.mylog` on the "cuda" target.
register_intrin_lowering("tir.mylog", target="cuda", f=my_cuda_mylog_rule, level=99)

# B[i] = mylog(A[i]), scheduled the same way as the exp examples.
# NOTE(review): the build below targets "cuda" and presumably needs a
# CUDA-enabled TVM build -- confirm.
n = te.var("n")
A = te.placeholder((n,), name="A")
B = te.compute(A.shape, lambda i: mylog(A[i]), name="B")
s = te.create_schedule(B.op)
num_thread = 64
bx, tx = s[B].split(B.op.axis[0], factor=num_thread)
s[B].bind(bx, te.thread_axis("blockIdx.x"))
s[B].bind(tx, te.thread_axis("threadIdx.x"))
fcuda = tvm.build(s, [A, B], "cuda", name="mylog")
print(fcuda.imported_modules[0].get_source())

# **Use Tensorize to Leverage Hardware Intrinsics**

In [None]:
from __future__ import absolute_import, print_function


import tvm
from tvm import te
import tvm.testing
import numpy as np

**Define Matrix Multiplication**

In [None]:
# A is (N, L) and B is (M, L); C[i, j] = sum over k of A[i, k] * B[j, k],
# i.e. A multiplied by the transpose of B.
N, M, L = 1024, 512, 64
A = te.placeholder((N, L), name="A")
B = te.placeholder((M, L), name="B")
k = te.reduce_axis((0, L), name="k")
C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[j, k], axis=k), name="C")
# Default schedule; the listing below is the plain three-level loop nest.
s = te.create_schedule(C.op)
print(tvm.lower(s, [A, B, C], simple_mode=True))

# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.Buffer((1024, 64), "float32"), B: T.Buffer((512, 64), "float32"), C: T.Buffer((1024, 512), "float32")):
        T.func_attr({"from_legacy_te_schedule": T.bool(True), "tir.noalias": T.bool(True)})
        for i, j in T.grid(1024, 512):
            C_1 = T.Buffer((524288,), data=C.data)
            C_1[i * 512 + j] = T.float32(0)
            for k in range(64):
                cse_var_1: T.int32 = i * 512 + j
                A_1 = T.Buffer((65536,), data=A.data)
                B_1 = T.Buffer((32768,), data=B.data)
                C_1[cse_var_1] = C_1[cse_var_1] + A_1[i * 64 + k] * B_1[j * 64 + k]


**Schedule the Matmul**

In [None]:
# Start from a fresh schedule so this cell can be re-run on its own: the
# schedule created in the previous cell had no primitives applied, so the
# result is the same, but splitting an axis that an earlier run of this cell
# already split is rejected by TVM.
s = te.create_schedule(C.op)
factor = 16
x, y = C.op.axis
(z,) = C.op.reduce_axis
# Split the j axis into blocks of `factor` columns and keep the reduction axis
# innermost; in the listing below the loops are i, j_outer (32), j_inner (16), k.
yo, yi = s[C].split(y, factor=factor)
s[C].reorder(x, yo, yi, z)
print(tvm.lower(s, [A, B, C], simple_mode=True))

# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.Buffer((1024, 64), "float32"), B: T.Buffer((512, 64), "float32"), C: T.Buffer((1024, 512), "float32")):
        T.func_attr({"from_legacy_te_schedule": T.bool(True), "tir.noalias": T.bool(True)})
        for i, j_outer, j_inner in T.grid(1024, 32, 16):
            C_1 = T.Buffer((524288,), data=C.data)
            C_1[i * 512 + j_outer * 16 + j_inner] = T.float32(0)
            for k in range(64):
                cse_var_1: T.int32 = i * 512 + j_outer * 16 + j_inner
                A_1 = T.Buffer((65536,), data=A.data)
                B_1 = T.Buffer((32768,), data=B.data)
                C_1[cse_var_1] = C_1[cse_var_1] + A_1[i * 64 + k] * B_1[j_outer * 1024 + j_inner * 64 + k]


**Define GEMV Tensorization Intrinsic**

In [None]:
def intrin_gemv(m, l):
    """Declare a tensor intrinsic for a matrix-vector product.

    The declared computation is ``c[i] = sum_k a[k] * b[i, k]`` with ``a`` of
    shape ``(l,)``, ``b`` of shape ``(m, l)`` and ``c`` of shape ``(m,)``. Its
    implementation is a single extern call to ``gemv_update``, which receives
    pointers to c, a and b followed by ``m``, ``l`` and the row stride of b.

    Parameters
    ----------
    m : number of rows of ``b`` (length of the output vector).
    l : length of the reduction (columns of ``b``, length of ``a``).

    Returns
    -------
    The result of ``te.decl_tensor_intrin`` for this computation.
    """
    vec = te.placeholder((l,), name="a")
    mat = te.placeholder((m, l), name="b")
    red = te.reduce_axis((0, l), name="k")
    out = te.compute((m,), lambda i: te.sum(vec[red] * mat[i, red], axis=red), name="c")

    def _declare(tensor, label, strides):
        # Buffer with the tensor's own shape/dtype and explicitly given strides.
        return tvm.tir.decl_buffer(
            tensor.shape, tensor.dtype, name=label, offset_factor=1, strides=strides
        )

    vec_buf = _declare(vec, "A", [1])
    # The row stride of b is left symbolic ("s1"); it is passed on to gemv_update.
    mat_buf = _declare(mat, "B", [te.var("s1"), 1])
    out_buf = _declare(out, "C", [1])

    def intrin_func(ins, outs):
        vec_in, mat_in = ins
        out_ref = outs[0]
        update_call = tvm.tir.call_extern(
            "int32",
            "gemv_update",
            out_ref.access_ptr("w"),
            vec_in.access_ptr("r"),
            mat_in.access_ptr("r"),
            m,
            l,
            mat_in.strides[0],
        )
        builder = tvm.tir.ir_builder.create()
        builder.emit(update_call)
        return builder.get()

    return te.decl_tensor_intrin(
        out.op, intrin_func, binds={vec: vec_buf, mat: mat_buf, out: out_buf}
    )

In [None]:
# Build the GEMV intrinsic for a (factor, L) tile and tensorize the yi axis
# with it. In the listing below everything under j_outer is replaced by a
# single call_extern to "gemv_update".
gemv = intrin_gemv(factor, L)
s[C].tensorize(yi, gemv)
print(tvm.lower(s, [A, B, C], simple_mode=True))

# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.Buffer((1024, 64), "float32"), B: T.Buffer((512, 64), "float32"), C: T.Buffer((1024, 512), "float32")):
        T.func_attr({"from_legacy_te_schedule": T.bool(True), "tir.noalias": T.bool(True)})
        for i, j_outer in T.grid(1024, 32):
            T.call_extern("int32", "gemv_update", T.tvm_access_ptr(T.type_annotation("float32"), C.data, i * 512 + j_outer * 16, 16, 2), T.tvm_access_ptr(T.type_annotation("float32"), A.data, i * 64, 64, 1), T.tvm_access_ptr(T.type_annotation("float32"), B.data, j_outer * 1024, 1024, 1), 16, 64, 64)


In [None]:
def gemv_impl():
    """Compile the C implementation of ``gemv_update`` with clang.

    ``gemv_update`` accumulates (``+=``) the product of the ``m`` x ``l``
    matrix ``bb`` (row stride ``stride``) and the vector ``aa`` into ``cc``.

    Returns
    -------
    The value returned by ``clang.create_llvm`` for that source, with
    ``temp.ll`` inside a temporary directory as the output path.
    """
    from tvm.contrib import utils, clang

    kernel_source = """
      extern "C" int gemv_update(float *cc, float *aa, float *bb, int m, int l, int stride) {
        for (int i = 0; i < m; ++i) {
            for (int j = 0; j < l; ++j) {
                cc[i] += aa[j] * bb[i * stride + j];
            }
        }
        return 0;
      }
    """
    # Keep the temp-dir object bound to a local while clang runs.
    # NOTE(review): presumably the directory is removed once the object is
    # released -- confirm against tvm.contrib.utils.
    workdir = utils.tempdir()
    # Create LLVM ir from c source code
    return clang.create_llvm(kernel_source, output=workdir.relpath("temp.ll"))

In [None]:
# Attach the LLVM IR returned by gemv_impl() to axis x through the
# "import_llvm" pragma; it shows up as a `pragma_import_llvm` attribute in the
# listing below. Presumably this is what lets the build resolve the
# `gemv_update` symbol -- confirm against the TVM tensorize tutorial.
s[C].pragma(x, "import_llvm", gemv_impl())
print(tvm.lower(s, [A, B, C], simple_mode=True))

# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.Buffer((1024, 64), "float32"), B: T.Buffer((512, 64), "float32"), C: T.Buffer((1024, 512), "float32")):
        T.func_attr({"from_legacy_te_schedule": T.bool(True), "tir.noalias": T.bool(True)})
        i = T.int32()
        T.attr(T.iter_var(i, None, "DataPar", ""), "pragma_import_llvm", metadata["tir.StringImm"][0])
        for i, j_outer in T.grid(1024, 32):
            T.call_extern("int32", "gemv_update", T.tvm_access_ptr(T.type_annotation("float32"), C.data, i * 512 + j_outer * 16, 16, 2), T.tvm_access_ptr(T.type_annotation("float32"), A.data, i * 64, 64, 1), T.tvm_access_ptr(T.type_annotation("float32"), B.data, j_outer * 1024, 1024, 1), 16, 64, 64)

# Metadata omitted. Use show_meta=True in script() method to show it.


In [None]:
# Build the scheduled computation into a callable for the LLVM target.
func = tvm.build(s, [A, B, C], target="llvm", name="gemv")

from tvm.topi.utils import get_const_tuple

# Random inputs with the placeholders' shapes and dtype; the output buffer
# starts as zeros on the CPU device.
dtype = A.dtype
dev = tvm.device("cpu", 0)
a = np.random.uniform(size=get_const_tuple(A.shape)).astype(dtype)
b = np.random.uniform(size=get_const_tuple(B.shape)).astype(dtype)
c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=dtype), dev)
# Run the kernel and compare against the NumPy reference a . b^T.
func(tvm.nd.array(a, dev), tvm.nd.array(b, dev), c)
tvm.testing.assert_allclose(c.numpy(), np.dot(a, b.T), rtol=1e-3)

**Reduce-update for Tensorize**

In [None]:
# Split axis `z` by `factor` (presumably the reduction axis, given this
# section's title -- confirm against the cell that defines `z`), then reorder
# the loops so that `yi` and `zi` form the innermost pair.
# `z`, `x`, `yo`, `yi` and `factor` come from earlier cells.
zo, zi = s[C].split(z, factor=factor)
s[C].reorder(x, yo, zo, yi, zi)

In [None]:
def gemv_impl():
    """Compile the C ``gemv_update`` and ``gemv_reset`` kernels to LLVM IR.

    ``gemv_update`` adds, for each row ``i`` below ``m``, the dot product of
    ``aa`` (length ``l``) with row ``i`` of ``bb`` into ``cc[i]``; rows of
    ``bb`` are ``stride`` floats apart. ``gemv_reset`` writes zero to the
    first ``m`` entries of ``cc``.

    Returns:
        The value produced by ``clang.create_llvm`` for the kernel source.
    """
    from tvm.contrib import clang, utils

    kernel_source = """
      extern "C" int gemv_update(float *cc, float *aa, float *bb, int m, int l, int stride) {
        for (int i = 0; i < m; ++i) {
            for (int j = 0; j < l; ++j) {
                cc[i] += aa[j] * bb[i * stride + j];
            }
        }
        return 0;
      }
      extern "C" int gemv_reset(float *cc, int m) {
        for (int i = 0; i < m; ++i) {
            cc[i] = 0.0;
        }
        return 0;
      }
    """

    # Keep the temp-dir handle bound to a local until clang has finished:
    # NOTE(review): the directory is presumably removed once the handle is
    # released, so it must not be created inline as a temporary.
    scratch_dir = utils.tempdir()
    ir_file = scratch_dir.relpath("temp.ll")

    # Create LLVM IR from the C source code.
    return clang.create_llvm(kernel_source, output=ir_file)


def intrin_gemv(m, l):
    """Declare a tensor intrinsic for an ``(m, l)`` matrix-vector product.

    The intrinsic describes ``c[i] = sum_k a[k] * b[i, k]`` and lowers it to
    calls of the external functions ``gemv_update`` and ``gemv_reset``.

    Args:
        m: Number of output elements (rows of ``b``).
        l: Length of the reduction (columns of ``b``, length of ``a``).

    Returns:
        The result of ``te.decl_tensor_intrin`` for the computation above.
    """
    vec = te.placeholder((l,), name="a")
    mat = te.placeholder((m, l), name="b")
    red = te.reduce_axis((0, l), name="k")
    out = te.compute((m,), lambda i: te.sum(vec[red] * mat[i, red], axis=red), name="c")

    def _bind(tensor, label, strides):
        # Buffer declaration shared by all three operands; only the name and
        # the stride list differ between them.
        return tvm.tir.decl_buffer(
            tensor.shape, tensor.dtype, name=label, offset_factor=1, strides=strides
        )

    vec_buf = _bind(vec, "A", [1])
    # The row stride of the matrix is symbolic so it is supplied per call site.
    mat_buf = _bind(mat, "B", [te.var("s1"), 1])
    out_buf = _bind(out, "C", [1])

    def _lower(ins, outs):
        lhs, rhs = ins
        dst = outs[0]

        def _extern_stmt(symbol, *call_args):
            # Wrap a single int32-returning extern call in its own statement.
            builder = tvm.tir.ir_builder.create()
            builder.emit(tvm.tir.call_extern("int32", symbol, *call_args))
            return builder.get()

        def _update():
            return _extern_stmt(
                "gemv_update",
                dst.access_ptr("w"),
                lhs.access_ptr("r"),
                rhs.access_ptr("r"),
                m,
                l,
                rhs.strides[0],
            )

        def _reset():
            return _extern_stmt("gemv_reset", dst.access_ptr("w"), m)

        # (body, reduce_reset, reduce_update): the update step reuses the same
        # extern call as the plain body, built as a separate statement.
        return _update(), _reset(), _update()

    return te.decl_tensor_intrin(
        out.op, _lower, binds={vec: vec_buf, mat: mat_buf, out: out_buf}
    )

In [None]:
# Declare a factor-by-factor GEMV intrinsic, tensorize the loop nest rooted at
# `yi` with it, and import the LLVM IR of the C kernels at axis `yo`.
# `s`, `C`, `yi`, `yo` and `factor` come from earlier cells.
gemv = intrin_gemv(factor, factor)
s[C].tensorize(yi, gemv)
s[C].pragma(yo, "import_llvm", gemv_impl())

# Rebuild, run on fresh random inputs, and compare with the NumPy reference
# a . b^T. `get_const_tuple`, `dtype` and `dev` are reused from an earlier cell.
func = tvm.build(s, [A, B, C], target="llvm", name="gemv")
a = np.random.uniform(size=get_const_tuple(A.shape)).astype(dtype)
b = np.random.uniform(size=get_const_tuple(B.shape)).astype(dtype)
c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=dtype), dev)
func(tvm.nd.array(a, dev), tvm.nd.array(b, dev), c)
tvm.testing.assert_allclose(c.numpy(), np.dot(a, b.T), rtol=1e-3)

# **Compute and Reduce with Tuple Inputs**

In [None]:
from __future__ import absolute_import, print_function


import tvm
from tvm import te
import numpy as np

**Describe Batchwise Computation**

In [None]:
# Symbolic extents shared by both inputs.
n = te.var("n")
m = te.var("m")
A0 = te.placeholder((m, n), name="A0")
A1 = te.placeholder((m, n), name="A1")
# A compute whose lambda returns a tuple yields one output tensor per tuple
# element; here B0 = A0 + 2 and B1 = A1 * 3 come from the same operation.
B0, B1 = te.compute((m, n), lambda i, j: (A0[i, j] + 2, A1[i, j] * 3), name="B")

# The generated IR code would be:
# (the schedule is created from B0.op, and both outputs are passed to lower)
s = te.create_schedule(B0.op)
print(tvm.lower(s, [A0, A1, B0, B1], simple_mode=True))

# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A0: T.handle, A1: T.handle, B: T.handle, B_1: T.handle):
        T.func_attr({"from_legacy_te_schedule": T.bool(True), "tir.noalias": T.bool(True)})
        m, n = T.int32(), T.int32()
        A0_1 = T.match_buffer(A0, (m, n), strides=("stride", "stride"), buffer_type="auto")
        A1_1 = T.match_buffer(A1, (m, n), strides=("stride", "stride"), buffer_type="auto")
        B_2 = T.match_buffer(B, (m, n), strides=("stride", "stride"), buffer_type="auto")
        B_3 = T.match_buffer(B_1, (m, n), strides=("stride", "stride"), buffer_type="auto")
        for i, j in T.grid(m, n):
            B_4 = T.Buffer((B_2.strides[0] * m,), data=B_2.data, buffer_type="auto")
            A0_2 = T.Buffer((A0_1.strides[0] * m,), data=A0_1.data, buffer_type="auto")
            B_4[i * B_2.strides[0] + j * B_2.strides[1]] = A0_2[i * A0_1.strides[0] + j * A0_1.strides[1]] + T.float3

**Describe Reduction with Collaborative Inputs**

In [None]:
def fcombine(x, y):
    """Combine two reduction operands, each an ``(index, value)`` pair.

    Returns the pair whose value is larger; when the values compare equal
    (``x[1] >= y[1]`` holds) the pair from ``x`` is selected.
    """
    x_wins = x[1] >= y[1]
    picked_index = tvm.tir.Select(x_wins, x[0], y[0])
    picked_value = tvm.tir.Select(x_wins, x[1], y[1])
    return picked_index, picked_value


def fidentity(t0, t1):
    """Build the identity element of the reduction as an ``(index, value)`` pair.

    The identity also needs to be a tuple, so two dtypes are accepted: ``t0``
    for the index (initialised to -1) and ``t1`` for the value (initialised to
    that dtype's minimum value).
    """
    initial_index = tvm.tir.const(-1, t0)
    initial_value = tvm.te.min_value(t1)
    return initial_index, initial_value


# Register the custom reducer built from the combiner and identity functions.
argmax = te.comm_reducer(fcombine, fidentity, name="argmax")

# describe the reduction computation
m = te.var("m")
n = te.var("n")
idx = te.placeholder((m, n), name="idx", dtype="int32")
val = te.placeholder((m, n), name="val", dtype="int32")
k = te.reduce_axis((0, n), "k")
# The reducer consumes (index, value) pairs along k and yields two outputs per
# row: T0 carries the selected index and T1 the selected value.
T0, T1 = te.compute((m,), lambda i: argmax((idx[i, k], val[i, k]), axis=k), name="T")

# the generated IR code would be:
s = te.create_schedule(T0.op)
print(tvm.lower(s, [idx, val, T0, T1], simple_mode=True))

# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(idx: T.handle, val: T.handle, T: T.handle, T_1: T.handle):
        T.func_attr({"from_legacy_te_schedule": T.bool(True), "tir.noalias": T.bool(True)})
        m, n = T.int32(), T.int32()
        idx_1 = T.match_buffer(idx, (m, n), "int32", strides=("stride", "stride"), buffer_type="auto")
        val_1 = T.match_buffer(val, (m, n), "int32", strides=("stride", "stride"), buffer_type="auto")
        T_2 = T.match_buffer(T, (m,), "int32", strides=("stride",), buffer_type="auto")
        T_3 = T.match_buffer(T_1, (m,), "int32", strides=("stride",), buffer_type="auto")
        for i in range(m):
            T_4 = T.Buffer((T_2.strides[0] * m,), "int32", data=T_2.data, buffer_type="auto")
            T_4[i * T_2.strides[0]] = -1
            T_5 = T.Buffer((T_3.strides[0] * m,), "int32", data=T_3.data, buffer_type="auto")
            T_5[i * T_3.strides[0]] = -214748364

**Schedule Operation with Tuple Inputs**

In [None]:
# Symbolic extents; note that this cell rebinds n, m, A0, A1, B0 and B1 from
# the earlier cells.
n = te.var("n")
m = te.var("m")
A0 = te.placeholder((m, n), name="A0")
# Both tuple outputs are derived from A0 here.
B0, B1 = te.compute((m, n), lambda i, j: (A0[i, j] + 2, A0[i, j] * 3), name="B")
A1 = te.placeholder((m, n), name="A1")
# Only B0 is consumed downstream.
C = te.compute((m, n), lambda i, j: A1[i, j] + B0[i, j], name="C")

s = te.create_schedule(C.op)
# Attach the stage of B0 under the outermost axis of C.
s[B0].compute_at(s[C], C.op.axis[0])
# The lowered IR printed below allocates buffers for both B_v0 and B_v1 even
# though only B0 was scheduled explicitly.
print(tvm.lower(s, [A0, A1, C], simple_mode=True))

# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A0: T.handle, A1: T.handle, C: T.handle):
        T.func_attr({"from_legacy_te_schedule": T.bool(True), "tir.noalias": T.bool(True)})
        m, n = T.int32(), T.int32()
        A0_1 = T.match_buffer(A0, (m, n), strides=("stride", "stride"), buffer_type="auto")
        A1_1 = T.match_buffer(A1, (m, n), strides=("stride", "stride"), buffer_type="auto")
        C_1 = T.match_buffer(C, (m, n), strides=("stride", "stride"), buffer_type="auto")
        B_v0 = T.allocate([n], "float32", "global")
        B_v1 = T.allocate([n], "float32", "global")
        for i in range(m):
            B_v0_1 = T.Buffer((n,), data=B_v0)
            for j in range(n):
                A0_2 = T.Buffer((A0_1.strides[0] * m,), data=A0_1.data, buffer_type="auto")
                B_v0_1[j] = A0_2[i * A0_1.strides[0] + j * A0_1.strides[1]] + T.float32(2)
                B_v1_1 = T.Buffer((n,),

# **Use Tensor Expression Debug Display (TEDD) for Visualization**

In [None]:
import tvm
from tvm import te
from tvm import topi
from tvm.contrib import tedd

In [None]:
# Convolution problem size used for the visualisation example.
batch = 1
in_channel = 256
in_size = 32
num_filter = 256
kernel = 3
stride = 1
padding = "SAME"
dilation = 1

# Input is laid out as (height, width, channel, batch) and the weights as
# (kernel_h, kernel_w, in_channel, num_filter), matching conv2d_hwcn below.
A = te.placeholder((in_size, in_size, in_channel, batch), name="A")
W = te.placeholder((kernel, kernel, in_channel, num_filter), name="W")
# Bias tensor of shape (1, num_filter, 1), added to the convolution output.
B = te.placeholder((1, num_filter, 1), name="bias")

# Build conv -> bias add -> ReLU and take the generic HWCN schedule for the
# final tensor, all under the "llvm" target context.
with tvm.target.Target("llvm"):
    t_conv = topi.nn.conv2d_hwcn(A, W, stride, padding, dilation)
    t_bias = topi.add(t_conv, B)
    t_relu = topi.nn.relu(t_bias)
    s = topi.generic.schedule_conv2d_hwcn([t_relu])

In [None]:
# Write the dataflow graph of schedule `s` to a DOT file.
tedd.viz_dataflow_graph(s, dot_file_path="/tmp/dfg.dot")
# Alternative call using show_svg instead of a DOT file (presumably renders
# the graph inline in the notebook):
# tedd.viz_dataflow_graph(s, show_svg = True)

<img src="https://github.com/dmlc/web-data/raw/main/tvm/tutorial/tedd_dfg.png" align="center">


In [None]:
# Write the schedule tree of `s` (before normalization) to a DOT file.
tedd.viz_schedule_tree(s, dot_file_path="/tmp/scheduletree.dot")
# Alternative call using show_svg instead of a DOT file (presumably renders
# the tree inline in the notebook):
# tedd.viz_schedule_tree(s, show_svg = True)

In [None]:
# Rebind `s` to the normalized schedule and write its tree to a second DOT
# file so it can be compared with the pre-normalization tree.
s = s.normalize()
tedd.viz_schedule_tree(s, dot_file_path="/tmp/scheduletree2.dot")
# Alternative call using show_svg instead of a DOT file (presumably renders
# the tree inline in the notebook):
# tedd.viz_schedule_tree(s, show_svg = True)

<img src="https://github.com/dmlc/web-data/raw/main/tvm/tutorial/tedd_st.png" align="center">

In [None]:
# Write the IterVar relationship graph of `s` to a DOT file. At this point `s`
# is the normalized schedule produced by the previous cell.
tedd.viz_itervar_relationship_graph(s, dot_file_path="/tmp/itervar.dot")
# Alternative call using show_svg instead of a DOT file (presumably renders
# the graph inline in the notebook):
# tedd.viz_itervar_relationship_graph(s, show_svg = True)

<img src="https://github.com/dmlc/web-data/raw/main/tvm/tutorial/tedd_itervar_rel.png" align="center">