In [1]:
import torch
import gc

import numpy as np
from torch import nn
from torch.nn.utils import weight_norm
from torch.profiler import profile, record_function, ProfilerActivity

import timeit

N = 1
iH = 220500
C = 128
wH = 7


def WNConv1d(*args, **kwargs):
    return weight_norm(nn.Conv1d(*args, **kwargs))


conv1d = WNConv1d(C, C, wH, dilation=1, padding=3).to('mps')

# torch.manual_seed(0)
a_np = np.random.uniform(0, 0.5, (N, iH, C)).astype("float32")
data = torch.from_numpy(a_np.transpose((0, 2, 1))).to("mps")

# with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof:
#     with record_function("model_inference"):
#         out = conv1d(data)

# print(out)

# print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))
# print("\n\n")
# print(prof.key_averages(group_by_input_shape=True).table(sort_by="cpu_time_total", row_limit=10))

import torch.utils.benchmark as benchmark

# Create a timer for the Conv1D operation
t = benchmark.Timer(
    stmt="conv1d(data)",
    setup="from __main__ import conv1d, data",
    num_threads=torch.get_num_threads(),
    globals={"conv1d": conv1d, "data": data},
)

# Run the benchmark
result = t.timeit(1000)
print(result)


# # time
# execution_time = timeit.timeit(lambda: conv1d(data), number=1000)

# print(f"Average execution time: {execution_time / 1000:.6f} seconds per call")

  WeightNorm.apply(module, name, dim)


<torch.utils.benchmark.utils.common.Measurement object at 0x11cb64510>
conv1d(data)
setup: from __main__ import conv1d, data
  4.81 ms
  1 measurement, 1000 runs , 16 threads


In [1]:
import tvm

from tvm import relax
from tvm.relax.frontend import nn

from typing import Optional
from tvm import te
from tvm import dlight as dl
from tvm.target import Target
import numpy as np
import tempfile

import timeit

from mlc_dac.layers import CachedWNConv1d

conv1d = CachedWNConv1d(512, 512, 7, stride=1, dilation=9, padding=0)
mod, params = conv1d.export_tvm(
    {"forward": {"x": nn.spec.Tensor((1, 512, 62), "float32")}},
    debug=True
)

target = Target.from_device("metal")
seq = tvm.transform.Sequential(
    [
        tvm.relax.transform.LegalizeOps(),
        tvm.relax.transform.AnnotateTIROpPattern(),
        tvm.relax.transform.FoldConstant(),
        tvm.relax.transform.FuseOps(),
        tvm.relax.transform.FuseTIR(),
        dl.ApplyDefaultSchedule(
            dl.gpu.Matmul(),
            dl.gpu.GEMV(),
            dl.gpu.Reduction(),
            dl.gpu.GeneralReduction(),
            dl.gpu.Fallback(),
        ),
    ]
)

with target:
    mod = seq(mod)

device = tvm.metal()
ex = relax.build(mod, target)
vm = relax.VirtualMachine(ex, device, profile=True)
effects = vm.module["_initialize_effect"]()

params = [np.random.randn(*param.shape).astype("float32") for _, param in params]
params = [tvm.nd.array(param, device=device) for param in params]

np.random.seed(0)
audio_data = np.random.randn(1, 512, 62).astype("float32")
audio_data = tvm.nd.array(audio_data, device=device)

time_eval = vm.time_evaluator("forward", device, 10, 5)(audio_data, *effects, *params)
print(time_eval)

report = vm.profile("forward", audio_data, *effects, *params)
print(report)

[23:04:20] /Users/cfruan/Documents/tvm-unity/src/target/llvm/llvm_instance.cc:226: Error: Using LLVM 19.1.1 with `-mcpu=apple-latest` is not valid in `-mtriple=arm64-apple-macos`, using default `-mcpu=generic`
[23:04:20] /Users/cfruan/Documents/tvm-unity/src/target/llvm/llvm_instance.cc:226: Error: Using LLVM 19.1.1 with `-mcpu=apple-latest` is not valid in `-mtriple=arm64-apple-macos`, using default `-mcpu=generic`
[23:04:20] /Users/cfruan/Documents/tvm-unity/src/target/llvm/llvm_instance.cc:226: Error: Using LLVM 19.1.1 with `-mcpu=apple-latest` is not valid in `-mtriple=arm64-apple-macos`, using default `-mcpu=generic`


Execution time summary:
 mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)  
   0.7746       0.7521       0.8622       0.7485       0.0439                  
Name                                 Duration (us)  Percent  Device  Count                                                                     Argument Shapes  
vm.builtin.check_tensor_info               1875.12    43.98  metal0      1                                                                 float32[1, 512, 62]  
fused_conv1d_add                           1040.79    24.41  metal0      1   float32[1, 512, 62], float32[512, 512, 7], float32[1, 512, 1], float32[1, 512, 8]  
fused_tir_square_sum                        299.46     7.02  metal0      1                                            float32[512, 512, 7], float32[512, 1, 1]  
fused_tir_sqrt_divide_multiply              271.75     6.37  metal0      1  float32[512, 1, 1], float32[512, 512, 7], float32[512, 1, 1], float32[512, 512, 7]  
vm.builtin.cached_padding_

In [None]:
import tvm

from tvm import relax
from tvm.relax.frontend import nn

from typing import Optional
from tvm import te
from tvm import dlight as dl
from tvm.target import Target
import numpy as np
import tempfile

import timeit

from mlc_dac.layers import CachedWNConv1d

conv1d = CachedWNConv1d(512, 512, 7, stride=1, dilation=9, padding=0)
mod, params = conv1d.export_tvm(
    {"forward": {"x": nn.spec.Tensor((1, 512, 62), "float32")}},
    debug=True
)

trials = 5000
target = Target.from_device("metal")

with target, tempfile.TemporaryDirectory() as tmp_dir:
    seq = tvm.transform.Sequential(
        [
            relax.get_pipeline("zero"),
            relax.transform.MetaScheduleTuneTIR(work_dir=tmp_dir, max_trials_global=trials),
            relax.transform.MetaScheduleApplyDatabase(work_dir=tmp_dir),
        ]
    )

    mod = seq(mod)

mod.show()

In [None]:
device = tvm.metal()
with target:
    seq = dl.ApplyDefaultSchedule(
        dl.gpu.Fallback(),
    )
    vm_mod = seq(mod)

# vm_mod.show()
ex = relax.build(vm_mod, target)
vm = relax.VirtualMachine(ex, device, profile=True)
effects = vm.module["_initialize_effect"]()

tvm_params = [np.random.randn(*param.shape).astype("float32") for _, param in params]
tvm_params = [tvm.nd.array(param, device=device) for param in tvm_params]

np.random.seed(0)
audio_data = np.random.randn(1, 512, 62).astype("float32")
audio_data = tvm.nd.array(audio_data, device=device)

time_eval = vm.time_evaluator("forward", device, 10, 5)(audio_data, *effects, *tvm_params)
print(time_eval)

In [None]:
import tvm

from tvm import relax
from tvm.relax.frontend import nn

from typing import Optional
from tvm import te
from tvm import dlight as dl
from tvm.target import Target
import numpy as np
import tempfile

from mlc_dac.layers import WNConv1d

conv1d = nn.Conv1D(128, 128, 7, dilation=1, padding=3)
mod, params = conv1d.export_tvm(
    {"forward": {"x": nn.spec.Tensor((1, 128, "seq_len"), "float32")}}
)

trials = 2000
target = Target.from_device("metal")

with target, tempfile.TemporaryDirectory() as tmp_dir:
    seq = tvm.transform.Sequential(
        [
            relax.get_pipeline("zero"),
            relax.transform.MetaScheduleTuneTIR(work_dir=tmp_dir, max_trials_global=trials),
            relax.transform.MetaScheduleApplyDatabase(work_dir=tmp_dir),
        ]
    )

    mod = seq(mod)

mod.show()

In [24]:
ex = relax.build(mod, target)
device = tvm.metal()

np.random.seed(0)
vm = relax.VirtualMachine(ex, device, profile=True)
tvm_data = tvm.nd.array(data.cpu(), device=device)
tvm_params = [np.random.randn(*param.shape).astype("float32") for _, param in params]
tvm_params = [tvm.nd.array(param, device=device) for param in tvm_params]

# output_tvm = vm["forward"](tvm_data, *tvm_params)
# output_tvm = output_tvm.asnumpy()

# output_tvm

report = vm.profile("forward", tvm_data, *tvm_params)
csv = report.csv()

with open("profile_conv1d.csv", "w", encoding="utf-8") as f:
    f.write(csv)
    print("Profile saved to profile_conv1d.csv")


Profile saved to profile_conv1d.csv


In [33]:
vm_eval = relax.VirtualMachine(ex, device)
timing_res = vm_eval.time_evaluator("forward", device, number=3, repeat=10, min_repeat_ms=100)(tvm_data, *tvm_params)
print(timing_res)

Execution time summary:
 mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)  
   6.4725       6.4723       6.5175       6.4398       0.0245                  


In [2]:
import tvm

from tvm import relax
from tvm.relax.frontend import nn

from typing import Optional
from tvm import te
from tvm import dlight as dl
from tvm.target import Target
import numpy as np
import tempfile

from mlc_dac.layers import WNConv1d

# conv1d = WNConv1d(128, 128, 7, dilation=1, padding=3)

conv1d = nn.Conv1D(128, 128, 7, dilation=1, padding=3)
mod, params = conv1d.export_tvm(
    {"forward": {"x": nn.spec.Tensor((2, 128, 220500), "float32")}}
)

trials = 2000
target = Target.from_device("metal")

with target:
    seq = tvm.transform.Sequential(
        [
            relax.get_pipeline("zero"),
            dl.ApplyDefaultSchedule(
                dl.gpu.Matmul(),
                dl.gpu.GEMV(),
                dl.gpu.Reduction(),
                dl.gpu.GeneralReduction(),
                dl.gpu.Fallback(),
            ),
        ]
    )

    mod2 = seq(mod)

ex2 = relax.build(mod2, target)
device = tvm.metal()

np.random.seed(0)

N = 2
H = 220500
C = 128

data = np.random.uniform(0, 0.5, (N, C, H)).astype("float32")
tvm_data = tvm.nd.array(data, device=device)
tvm_params = [np.random.randn(*param.shape).astype("float32") for _, param in params]
tvm_params = [tvm.nd.array(param, device=device) for param in tvm_params]

# output_tvm = vm["forward"](tvm_data, *tvm_params)
# output_tvm = output_tvm.asnumpy()

# output_tvm

vm_eval_2 = relax.VirtualMachine(ex2, device)
timing_res = vm_eval_2.time_evaluator("forward", device, number=10, repeat=10, min_repeat_ms=100)(tvm_data, *tvm_params)
print(timing_res)

Execution time summary:
 mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)  
  25.6094      25.5991      26.0009      25.1240       0.3021                  


In [None]:
import tvm

from tvm import relax
from tvm.relax.frontend import nn

from typing import Optional
from tvm import te
from tvm import dlight as dl
from tvm.target import Target
import numpy as np
import tempfile

import timeit

from mlc_dac.layers import CachedWNConv1d

conv1d = nn.Conv1D(512, 512, 7, stride=1, dilation=3, padding=0)
mod, params = conv1d.export_tvm(
    {"forward": {"x": nn.spec.Tensor((1, 512, 62), "float32")}},
    debug=True
)

trials = 3000
target = Target.from_device("metal")

with target, tempfile.TemporaryDirectory() as tmp_dir:
    seq = tvm.transform.Sequential(
        [
            relax.get_pipeline("zero"),
            relax.transform.MetaScheduleTuneTIR(work_dir=tmp_dir, max_trials_global=trials),
            relax.transform.MetaScheduleApplyDatabase(work_dir=tmp_dir),
        ]
    )

    mod = seq(mod)

mod.show()