In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
from torch.nn.utils import weight_norm

import cached_conv as cc
import torch.utils.benchmark as benchmark

cc.use_cached_conv(True)


def WNConv1d(*args, **kwargs):
    return weight_norm(nn.Conv1d(*args, **kwargs))

def WNConv1dCached(*args, **kwargs):
    return weight_norm(cc.Conv1d(*args, **kwargs))

wn_conv1d = WNConv1d(5, 10, 7, padding=3)
wn_conv1d_cached = WNConv1dCached(5, 10, 7, padding=3)

torch.manual_seed(0)

weight_g_data = torch.randn(10, 1, 1)
weight_v_data = torch.randn(10, 5, 7)
bias_data = torch.randn(10)
wn_conv1d.weight_g.data = weight_g_data
wn_conv1d.weight_v.data = weight_v_data
wn_conv1d.bias.data = bias_data

data = torch.randn(20, 5, 1000)

t = benchmark.Timer(
    stmt='wn_conv1d(data)',
    globals={'wn_conv1d': wn_conv1d, 'data': data},
    num_threads=1,
)
res = t.timeit(100)
print("Non stream:", res)
out_nostream = wn_conv1d(data)

wn_conv1d_cached.weight_g.data = weight_g_data
wn_conv1d_cached.weight_v.data = weight_v_data
wn_conv1d_cached.bias.data = bias_data

t = benchmark.Timer(
    stmt='wn_conv1d_cached(data)',
    globals={'wn_conv1d_cached': wn_conv1d_cached, 'data': data},
    num_threads=1,
)
res = t.timeit(100)
print("Cached:", res)
out_cached = wn_conv1d_cached(data)

print(torch.allclose(out_nostream[..., :-wn_conv1d_cached.cumulative_delay], out_cached[..., wn_conv1d_cached.cumulative_delay:], atol=1e-6))
print(out_nostream[..., :-wn_conv1d_cached.cumulative_delay])
print(out_cached[..., wn_conv1d_cached.cumulative_delay:])

Non stream: <torch.utils.benchmark.utils.common.Measurement object at 0x177ccb010>
wn_conv1d(data)
  4.72 ms
  1 measurement, 100 runs , 1 thread
Cached: <torch.utils.benchmark.utils.common.Measurement object at 0x2883e9190>
wn_conv1d_cached(data)
  4.75 ms
  1 measurement, 100 runs , 1 thread
False
tensor([[[ 0.2391,  0.2759,  0.8915,  ...,  2.2475,  1.3730,  2.1704],
         [ 0.5690,  0.3283,  0.4935,  ...,  0.8048,  0.7730,  0.4228],
         [-2.4467,  0.4928, -3.4065,  ..., -1.7856, -1.5958, -0.1326],
         ...,
         [-2.7717, -0.6683, -1.0477,  ..., -0.8070, -1.1878, -0.1473],
         [ 0.9553,  0.9808,  0.7493,  ...,  0.8611,  1.7939,  1.8238],
         [ 0.8837,  0.9023,  1.0396,  ...,  1.5872,  0.4577,  1.1960]],

        [[ 0.7317,  2.8882,  1.5236,  ...,  3.4878,  4.8952,  0.2520],
         [ 0.4875,  0.8073,  0.5325,  ...,  0.0925, -0.2831,  0.4675],
         [-1.6555, -2.3700, -6.8651,  ..., -0.8240, -1.1611,  0.9850],
         ...,
         [-2.3175,  0.3974, -1

In [12]:
cc.use_cached_conv(True)

wn_conv1d = WNConv1d(1, 1, 3, padding=1)
wn_conv1d_cached = WNConv1dCached(1, 1, 3, padding=cc.get_padding(3))

torch.manual_seed(0)

data = torch.randn(1, 1, 6)

weight_g_data = torch.randn(1, 1, 1)
weight_v_data = torch.randn(1, 1, 3)
bias_data = torch.randn(1)
wn_conv1d.weight_g.data = weight_g_data
wn_conv1d.weight_v.data = weight_v_data
wn_conv1d.bias.data = bias_data

wn_conv1d_cached.weight_g.data = weight_g_data
wn_conv1d_cached.weight_v.data = weight_v_data
wn_conv1d_cached.bias.data = bias_data

chunk_size = 3
res = []
for i in range(0, data.shape[-1], chunk_size):
    # Slice along the sequence dimension and apply conv1d to each chunk
    tok = wn_conv1d_cached(data[..., i:i + chunk_size])
    res.append(tok)

# Concatenate all the chunks along the sequence dimension
chunked_output = torch.cat(res, dim=-1)

# Direct (non-chunked) computation for comparison
non_chunked_output = wn_conv1d(data)

print(chunked_output.shape, non_chunked_output.shape)
print(chunked_output)
print(non_chunked_output)

wn_conv1d_cached.cumulative_delay

torch.Size([1, 1, 6]) torch.Size([1, 1, 6])
tensor([[[-0.8099, -0.9363,  0.2203, -0.2220, -1.2132,  0.0279]]],
       grad_fn=<CatBackward0>)
tensor([[[-0.9363,  0.2203, -0.2220, -1.2132,  0.0279, -0.5633]]],
       grad_fn=<ConvolutionBackward0>)


  WeightNorm.apply(module, name, dim)


1

In [1]:
import dac

# Monkey patching the DAC class to use cc.Conv1d instead of nn.Conv1d

# Download a model
model_path = dac.utils.download(model_type="44khz")

import numpy as np
import torch
import time

dac.DAC.enable_streaming(True)
model = dac.DAC.load(model_path).to("cpu")
delay = model.encoder_cumulative_delay

torch.set_printoptions(precision=5, sci_mode=False)

# set numpy random seed
np.random.seed(0)

# Load audio signal file
silence = np.random.randn(*(1, 1, 512000)).astype(np.float32)
data = torch.tensor(silence).to("cpu")

res = []

with torch.no_grad():
    start = time.time()
    for i in range(0, data.shape[-1], 512):
        # Slice along the sequence dimension and apply conv1d to each chunk
        tok = model.encode(data[..., i:i + 512])[1]
        res.append(tok)
    end = time.time()

# Concatenate all the chunks along the sequence dimension
out = torch.cat(res, dim=-1)
print("Duration with streaming: ", end - start)

print(model.encoder_cumulative_delay)
out = out[..., delay:]
print(out.shape)

print("=============================")
print("=============================")


dac.DAC.enable_streaming(False)
model = dac.DAC.load(model_path).to("cpu")

# #print all model parameters
# for name, param in model.named_parameters():
#     print(name, param.shape)

# set numpy random seed
np.random.seed(0)

# Load audio signal file
silence = np.random.randn(*(1, 1, 512000)).astype(np.float32)
data = torch.tensor(silence).to("cpu")

start = time.time()
out_nostream = model.encode(data)[1]
end = time.time()

print("Duration without streaming: ", end - start)
out_nostream = out_nostream[..., :-delay]
print(out_nostream.shape)

print(torch.allclose(out[..., delay:-delay], out_nostream[..., delay:-delay], atol=1e-6))

  model_dict = torch.load(location, "cpu")
  WeightNorm.apply(module, name, dim)


Duration with streaming:  11.978892087936401
8
torch.Size([1, 9, 992])


  model_dict = torch.load(location, "cpu")
  WeightNorm.apply(module, name, dim)


Duration without streaming:  4.433102130889893
torch.Size([1, 9, 992])
True
