## Load Model

In [1]:
from wavenet_model import *
from audio_data import WavenetDataset

dtype = torch.FloatTensor
ltype = torch.LongTensor

use_cuda = torch.cuda.is_available()
if use_cuda:
    print('use gpu')
    dtype = torch.cuda.FloatTensor
    ltype = torch.cuda.LongTensor

In [2]:
model = WaveNetModel(layers=10,
                     blocks=3,
                     dilation_channels=32,
                     residual_channels=32,
                     skip_channels=1024,
                     end_channels=512, 
                     output_length=16,
                     dtype=dtype, 
                     bias=True)
model = load_latest_model_from('snapshots', use_cuda=use_cuda)

model.dtype = dtype
if use_cuda:
    model.cuda()
else:
    model.cpu()
    
print('model: ', model)
print('receptive field: ', model.receptive_field)
print('parameter count: ', model.parameter_count())

load model snapshots/chaconne_model_2024-07-19_04-53-38
model:  WaveNetModel(
  (filter_convs): ModuleList(
    (0-29): 30 x Conv1d(32, 32, kernel_size=(2,), stride=(1,))
  )
  (gate_convs): ModuleList(
    (0-29): 30 x Conv1d(32, 32, kernel_size=(2,), stride=(1,))
  )
  (residual_convs): ModuleList(
    (0-29): 30 x Conv1d(32, 32, kernel_size=(1,), stride=(1,))
  )
  (skip_convs): ModuleList(
    (0-29): 30 x Conv1d(32, 1024, kernel_size=(1,), stride=(1,))
  )
  (start_conv): Conv1d(256, 32, kernel_size=(1,), stride=(1,))
  (end_conv_1): Conv1d(1024, 512, kernel_size=(1,), stride=(1,))
  (end_conv_2): Conv1d(512, 256, kernel_size=(1,), stride=(1,))
)
receptive field:  3070
parameter count:  1834592


In [3]:
data = WavenetDataset(dataset_file='train_samples/bach_chaconne/dataset.npz',
                      item_length=model.receptive_field + model.output_length - 1,
                      target_length=model.output_length,
                      file_location='train_samples/bach_chaconne',
                      test_stride=500)
print('the dataset has ' + str(len(data)) + ' items')

one hot input
the dataset has 598277 items


## Extract Feature

In [4]:
# The input data is based on the sampled audio from data set
input_length = model.receptive_field + model.output_length - 1
sample_num = 10

start_data = torch.zeros((model.classes, input_length))
selected_indices = np.random.choice(len(data), sample_num)

slices = np.linspace(start=0, stop=input_length, num=sample_num + 1).astype(int)

for i, index in enumerate(selected_indices):
    start_index = slices[i]
    end_index = slices[i + 1]
    start_data[:, start_index:end_index] = data[index][0][:, start_index:end_index]

start_data = torch.max(start_data, 0)[1] # convert one hot vectors to integers

## Deployment

In [5]:
import torch
import torch.nn as nn
import os
import onnxruntime
import numpy as np
import onnx
import shutil
from timeit import default_timer as timer
import vai_q_onnx

In [6]:
# Specify the path to the quantized ONNZ Model
model_path = r'./models/wavenet.onnx'
onnx_model = onnx.load(model_path)

# We want to make sure we compile everytime, otherwise the tools will use the cached version
# Get the current working directory
current_directory = os.getcwd()
directory_path = os.path.join(current_directory,  r'cache\wavenet_cache')
cache_directory = os.path.join(current_directory,  r'cache')

# Check if the directory exists and delete it if it does.
if os.path.exists(directory_path):
    shutil.rmtree(directory_path)
    print(f"Directory deleted successfully. Starting Fresh.")
else:
    print(f"Directory '{directory_path}' does not exist.")

# Point to the config file path used for the VitisAI Execution Provider
config_file_path = "vaip_config.json"

aie_options = onnxruntime.SessionOptions()

aie_session = onnxruntime.InferenceSession(
    onnx_model.SerializeToString(),
    providers=['VitisAIExecutionProvider'],
    sess_options=aie_options,
    provider_options = [{'config_file': config_file_path,
                         'cacheDir': cache_directory,
                         'cacheKey': 'wavenet_cache'}]
)

Directory deleted successfully. Starting Fresh.


In [7]:
def generate(model,
             num_samples,
             first_samples=None,
             temperature=1.,
             session=None):
    model.eval()
    if first_samples is None:
        first_samples = model.dtype(1).zero_()
    generated = Variable(first_samples, volatile=True)

    num_pad = model.receptive_field - generated.size(0)
    if num_pad > 0:
        generated = constant_pad_1d(generated, model.scope, pad_start=True)
        print("pad zero")

    for i in range(num_samples):
        input = Variable(torch.FloatTensor(1, model.classes, model.receptive_field).zero_())
        input = input.scatter_(1, generated[-model.receptive_field:].view(1, -1, model.receptive_field), 1.)

        x = torch.tensor(session.run(None, {"input": input.numpy()})[0])[:, :, -1].squeeze()

        if temperature > 0:
            x /= temperature
            prob = F.softmax(x, dim=0)
            prob = prob.cpu()
            np_prob = prob.data.numpy()
            x = np.random.choice(model.classes, p=np_prob)
            x = Variable(torch.LongTensor([x]))#np.array([x])
        else:
            x = torch.max(x, 0)[1].float()

        generated = torch.cat((generated, x), 0)

    generated = (generated / model.classes) * 2. - 1
    mu_gen = mu_law_expansion(generated, model.classes)

    model.train()
    return mu_gen

generated = generate(model=model,
                     num_samples=160000,
                     first_samples=start_data,
                     temperature=1.0,
                     session=aie_session)

  generated = Variable(first_samples, volatile=True)



## Generate Audio

In [8]:
import IPython.display as ipd

ipd.Audio(generated, rate=16000)

In [9]:
import soundfile as sf

sf.write('wav/generated_clip2.wav', generated, 16000)