In [1]:
# only z is quantized
# for f0 and ld, different heads in the RNN are used.

In [2]:
import os
# os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import tensorflow as tf
import tensorflow_probability as tfp
import numpy as np
import IPython.display as ipd
from IPython.display import Audio
import time
import matplotlib.pyplot as plt
from utils.util_funcs import generate_audio, generate_sample_test

# Short alias for the Keras layers namespace.
tfkl = tf.keras.layers

# Load the saved latent space: the arrays stored as train_code_inds,
# train_codes and codebook under the chosen latent-space folder.
batch_size = 32

latent_files_path = 'vq-50k-latent_space'

training_code_inds, training_codes, codebook = (
    np.load('saved_latent_spaces/{}/{}.npy'.format(latent_files_path, array_name))
    for array_name in ('train_code_inds', 'train_codes', 'codebook')
)

# Pair each index sequence with its code sequence and cache the pairs in
# memory so that reading them again is faster.
code_data = tf.data.Dataset.from_tensor_slices(
    (training_code_inds, training_codes)
).cache()
# Shuffle with a 50000-element buffer; drop_remainder=True guarantees that
# every batch holds exactly `batch_size` examples.
code_data_ready = code_data.shuffle(50000).batch(batch_size, drop_remainder=True)#.repeat()

# Length of each training sequence in time steps
# (assumed to match the saved arrays — TODO confirm).
seqlen = 1000
# Width of one input frame: the codebook vector size plus two extra features
# (presumably the f0 and loudness values — TODO confirm).
dim_code = codebook.shape[-1] + 1 + 1

# Number of classes predicted by each output head.
NUM_F0_CLASSES = 1998
NUM_LD_CLASSES = 121
NUM_Z_CLASSES = codebook.shape[0]  # one class per codebook entry

# Stateful LSTMs need a fixed batch size, hence `batch_shape`; the time axis
# is left as None so sequences of any length can be fed.
inputs = tfkl.Input(batch_shape=(batch_size, None, dim_code))

# Three stacked LSTM layers, each returning its full output sequence.
x = inputs
for _lstm_index in range(3):
    x = tfkl.LSTM(512, return_sequences=True, stateful=True)(x)

# Separate linear heads on top of the shared recurrent trunk; each emits
# unnormalised logits per time step.
f0_output = tfkl.Dense(NUM_F0_CLASSES)(x)
ld_output = tfkl.Dense(NUM_LD_CLASSES)(x)
z_output = tfkl.Dense(NUM_Z_CLASSES)(x)

model_rnn = tf.keras.Model(
    inputs=inputs,
    outputs=[f0_output, ld_output, z_output],
    name='Functional-api-RNN',
)

# Training length: passes over the dataset and the decay horizon of the
# learning-rate schedule (in optimizer steps).
EPOCHS = 20
train_steps = 200000

# Sparse categorical cross-entropy on raw logits (no softmax is applied
# by the output layers).
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

# Learning rate decays polynomially (default power) from 1e-6 down to 1e-9
# over `train_steps` steps.
lr = tf.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=1e-6,
    decay_steps=train_steps,
    end_learning_rate=1e-9,
)
opt = tf.optimizers.Adam(learning_rate=lr)

model_rnn.summary()

Model: "Functional-api-RNN"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(32, None, 18)]     0                                            
__________________________________________________________________________________________________
lstm (LSTM)                     (32, None, 512)      1087488     input_1[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   (32, None, 512)      2099200     lstm[0][0]                       
__________________________________________________________________________________________________
lstm_2 (LSTM)                   (32, None, 512)      2099200     lstm_1[0][0]                     
_________________________________________________________________________________

In [3]:
checkpoint_path = "saved_models/lstm"

# Track the model and the optimizer in a single checkpoint object.
ckpt = tf.train.Checkpoint(model=model_rnn, optimizer=opt)

# The manager keeps at most the five newest checkpoints in `checkpoint_path`.
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# Resume from the most recent checkpoint when one is available.
latest_checkpoint = ckpt_manager.latest_checkpoint
if latest_checkpoint:
    ckpt.restore(latest_checkpoint)
    print('Latest checkpoint restored!!')

In [4]:
# @tf.function
def train(ind_batch, code_batch):
    """Run one optimisation step of `model_rnn` on a single batch.

    The model receives every frame except the last one and is trained to
    predict the class indices of the following frame (targets are the index
    sequence shifted by one step).

    Args:
        ind_batch: class indices with the time axis in position 1. Column 0
            of the last axis is the f0 target, column 1 (sign-flipped) the ld
            target and the remainder the z target. Assumed to be
            (batch, time, 3) — TODO confirm against the saved arrays.
        code_batch: model inputs with the time axis in position 1.

    Returns:
        A tuple `(xent, out)`: the mean of the three cross-entropy losses and
        the list of logits `[f0, ld, z]` returned by the model.
    """
    # Take the number of target steps from the batch itself instead of the
    # global `seqlen`, so the step keeps working for other sequence lengths.
    n_targets = tf.shape(ind_batch)[1] - 1

    targets_f0 = tf.reshape(ind_batch[:, 1:, 0:1], [-1, n_targets])
    # The ld indices are negated before use as class ids; presumably they are
    # stored as non-positive values — TODO confirm.
    targets_ld = tf.reshape(ind_batch[:, 1:, 1:2], [-1, n_targets]) * (-1)
    targets_z = tf.reshape(ind_batch[:, 1:, 2:], [-1, n_targets])
    # Drop the last frame: it has no successor to predict.
    inp = code_batch[:, :-1]

    with tf.GradientTape() as tape:
        # Flag the forward pass as a training pass, as is conventional in a
        # custom training loop.
        out = model_rnn(inp, training=True)
        xent_f0 = loss(targets_f0, out[0])
        xent_ld = loss(targets_ld, out[1])
        xent_z = loss(targets_z, out[2])
        # Equal-weight average of the three heads.
        xent = (xent_f0 + xent_ld + xent_z) / 3.
    grads = tape.gradient(xent, model_rnn.trainable_variables)
    opt.apply_gradients(zip(grads, model_rnn.trainable_variables))

    return xent, out

# Main training loop: one pass over `code_data_ready` per epoch, a checkpoint
# every fifth epoch, and audio previews at the end of every epoch.
#
# NOTE(review): the recorded run of this cell stopped with
# "NameError: name 'tfd' is not defined" right after the first epoch summary
# was printed, i.e. inside one of the preview helpers called below
# (generate_sample_test / generate_audio from utils.util_funcs). The missing
# name has to be fixed in that module — this cell never references `tfd`.
losses = []  # per-step training loss, stored as plain Python floats
for epoch in range(EPOCHS):
    start = time.time()

    for batch, (inds, codes) in enumerate(code_data_ready):
        # The LSTMs are stateful: clear the carried-over state before each
        # batch so that one batch does not leak into the next.
        model_rnn.reset_states()
        xent, out = train(inds, codes)
        # Store a float rather than the tensor itself so the history does not
        # hold on to one tensor per step and can be plotted directly.
        losses.append(float(xent))
        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(
                epoch + 1, batch, xent))

    if (epoch + 1) % 5 == 0:
        ckpt_save_path = ckpt_manager.save()
        print('Saving checkpoint for epoch {} at {}'.format(epoch + 1,
                                                            ckpt_save_path))

    # The loss reported here is that of the last batch of the epoch.
    print('Epoch {} Loss {:.4f}'.format(epoch + 1, xent))
    # Previews: one generated from the last training batch, one from the
    # model alone.
    gen = generate_sample_test(model_rnn, codes, chunk_len=1, seqlen=1)
    gen_random = generate_audio(model_rnn)
    ipd.display(Audio(gen_random[0], rate=16000))
    ipd.display(Audio(gen[0], rate=16000))
    print('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 5.5185
Epoch 1 Batch 100 Loss 5.5115
Epoch 1 Batch 200 Loss 5.5038
Epoch 1 Batch 300 Loss 5.4904
Epoch 1 Batch 400 Loss 5.4726
Epoch 1 Batch 500 Loss 5.4364
Epoch 1 Batch 600 Loss 5.3217
Epoch 1 Batch 700 Loss 5.1298
Epoch 1 Batch 800 Loss 5.1427
Epoch 1 Batch 900 Loss 4.9406
Epoch 1 Batch 1000 Loss 4.8899
Epoch 1 Batch 1100 Loss 4.9016
Epoch 1 Batch 1200 Loss 4.8412
Epoch 1 Loss 4.8863


NameError: name 'tfd' is not defined

In [None]:
# from tensorflow.python.client import device_lib
# print(device_lib.list_local_devices())