## 11 Vocos Architecture Search

Search over Compute Time vs. Architecture Size; aiming to find a nice trade off between the two. 

NOTE: All on the CPU

In [4]:
import numpy as np 
import torch 
import pandas as pd 
import sys 
import os 
import yaml 
import time

sys.path.append('../../')
from src.spectral_ops import ISTFT
from src.models import Vocos

In [5]:
# Load in Base Config
yaml_name = 'midi_vocos_1st.yaml'
with open('../../yamls/' + yaml_name, "r") as stream:
    try:
        config = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        print(exc)

device = 'cpu' # Running all Tests on the CPU 
vocos_config = config['vocos_config']   

source_model = Vocos(vocos_config).to(device)

source_model

Vocos(
  (backbone): VocosBackbone(
    (embed): Conv1d(1, 256, kernel_size=(7,), stride=(1,), padding=(3,))
    (norm): LayerNorm((256,), eps=1e-06, elementwise_affine=True)
    (convnext): ModuleList(
      (0-7): 8 x ConvNeXtBlock(
        (dwconv): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,), groups=256)
        (norm): LayerNorm((256,), eps=1e-06, elementwise_affine=True)
        (pwconv1): Linear(in_features=256, out_features=512, bias=True)
        (act): GELU(approximate='none')
        (pwconv2): Linear(in_features=512, out_features=256, bias=True)
      )
    )
    (final_layer_norm): LayerNorm((256,), eps=1e-06, elementwise_affine=True)
  )
  (head): ISTFTHead(
    (out): Linear(in_features=256, out_features=1026, bias=True)
    (istft): ISTFT()
  )
)

In [6]:
# Init Stats
dim = config['vocos_config']['backbone']['dim']
intermediate_dim = config['vocos_config']['backbone']['intermediate_dim']
print(f'Starting Dim: {dim}')
print(f'Intermediate Dim: {intermediate_dim}')

features = torch.rand((32, 1, 18)).float().to(device) # Generate Fake Features
start = time.time()
x_hat = source_model(features)
end = time.time()

print(f'Computed Audio (CPU) in {(end-start) * 1000} ms')
print(f'Output Audio Shape: {x_hat.shape}')

Starting Dim: 256
Intermediate Dim: 512
Computed Audio (CPU) in 29.001235961914062 ms
Output Audio Shape: torch.Size([32, 1080])


In [7]:
# Define Params to Search Over
dims = [128, 256, 400]
intermediate_dims = [256, 512, 1024]
n_layers = [2, 4, 6, 8]
for idx, dim in enumerate(dims):
    for jdx, inter_dim in enumerate(intermediate_dims): 
        for kdx, n_layer in enumerate(n_layers):
            config['vocos_config']['backbone']['dim'] = dim
            config['vocos_config']['backbone']['num_layers'] = n_layer
            config['vocos_config']['head']['dim'] = dim

            config['vocos_config']['backbone']['intermediate_dim'] = inter_dim 

            source_model = Vocos(vocos_config).to(device)
            
            start = time.time()
            x_hat = source_model(features)
            end = time.time()

            print(f'(Dim {dim}, InterDim {inter_dim}, n_layer {n_layer}) {(end-start) * 1000} ms')


(Dim 128, InterDim 256, n_layer 2) 15.497446060180664 ms
(Dim 128, InterDim 256, n_layer 4) 17.052412033081055 ms
(Dim 128, InterDim 256, n_layer 6) 16.621828079223633 ms
(Dim 128, InterDim 256, n_layer 8) 16.495466232299805 ms
(Dim 128, InterDim 512, n_layer 2) 8.722305297851562 ms
(Dim 128, InterDim 512, n_layer 4) 16.12091064453125 ms
(Dim 128, InterDim 512, n_layer 6) 13.908147811889648 ms
(Dim 128, InterDim 512, n_layer 8) 20.47896385192871 ms
(Dim 128, InterDim 1024, n_layer 2) 11.663198471069336 ms
(Dim 128, InterDim 1024, n_layer 4) 12.153148651123047 ms
(Dim 128, InterDim 1024, n_layer 6) 15.850305557250977 ms
(Dim 128, InterDim 1024, n_layer 8) 24.410009384155273 ms
(Dim 256, InterDim 256, n_layer 2) 9.270429611206055 ms
(Dim 256, InterDim 256, n_layer 4) 11.172771453857422 ms
(Dim 256, InterDim 256, n_layer 6) 15.182733535766602 ms
(Dim 256, InterDim 256, n_layer 8) 49.239397048950195 ms
(Dim 256, InterDim 512, n_layer 2) 10.637760162353516 ms
(Dim 256, InterDim 512, n_layer

## Takeaways

Seems like InterDim between 256 and 512 doesn't change things too much, going to stick with Dim = 256, InterDim = 512

Lowest latency seems like 18ms on Cluster with n_layer = 8; goiung to choose 4 for 13.25 ms latency