In [1]:
from model.wavenet import WaveNet
from model.conv_encoder import ConvEncoder, ConvDecoder
from model.vq_vae import VQ_VAE

from vq_vae_trainer import VQ_VAE_Trainer

from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession

config = ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)

In [2]:
############################################################
# Current VQ-VAE implementation here
# - Input audio
# ---> Conv Encoder (Downsample)
# ---> VQ (Latent space)
# ---> Conv Decoder (Upsample)
# ---> WaveNet (Generate)
# - Output audio
############################################################
#wavenet = WaveNet()
#conv_encoder = ConvEncoder()
vq = VQ_VAE()

Model: "Convolutional-Encoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 16000, 1)]        0         
_________________________________________________________________
conv1d (Conv1D)              (None, 8000, 32)          160       
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 4000, 32)          4128      
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 2000, 32)          4128      
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 1000, 32)          4128      
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 500, 32)           4128      
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 250, 32) 

The following Variables were used a Lambda layer's call (sample_from_codebook), but
are not present in its tracked objects:
  <tf.Variable 'vector_quantizer/Codebook:0' shape=(64, 32) dtype=float32>
It is possible that this is intended behavior, but it is more likely
an omission. This is a strong indication that this layer should be
formulated as a subclassed Layer rather than a Lambda layer.
Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, 16000, 1)]   0                                            
__________________________________________________________________________________________________
conv_encoder (ConvEncoder)      (None, 250, 1)       20833       input_4[0][0]                    
_______________________________________________________________________________________

In [3]:
from scipy.io.wavfile import read
import os
DATA_PATH = r"/home/rithomas/data/IDMT-SMT-GUITAR_V2/dataset1/Fender_Strat_Clean_Neck_SC/audio"
p = os.path.join(DATA_PATH, 'G53-40100-1111-00001.wav')
print(read(p))

(44100, array([6, 5, 4, ..., 0, 0, 0], dtype=int16))


In [None]:
import tensorflow as tf

## Dummy dataset to test architecture
def generator():
    for path in os.listdir(DATA_PATH):
        if path.endswith('.wav'):
            yield tf.expand_dims(read(os.path.join(DATA_PATH, path))[1][:16000], axis=-1)
    
ds = tf.data.Dataset.from_generator(generator, output_types=tf.int32)
ds = ds.batch(4)
print(ds)

TRAINING_CONFIGS = {
    'model_path': '/home/rithomas/cache/test_model',
    'learning_rate': 0.0001,
    'num_epochs': 100,
    'print_every': 100
}
trainer = VQ_VAE_Trainer(vq, TRAINING_CONFIGS)
trainer.train(ds)

<BatchDataset shapes: <unknown>, types: tf.int32>
Initializing from scratch.
tf.Tensor([    4 16000     1], shape=(3,), dtype=int32)
tf.Tensor(192.38463, shape=(), dtype=float32)
tf.Tensor([    4 16000     1], shape=(3,), dtype=int32)
tf.Tensor(22.902443, shape=(), dtype=float32)
tf.Tensor([    4 16000     1], shape=(3,), dtype=int32)
tf.Tensor(6.694022, shape=(), dtype=float32)
tf.Tensor([    4 16000     1], shape=(3,), dtype=int32)
tf.Tensor(6.3926806, shape=(), dtype=float32)
tf.Tensor([    4 16000     1], shape=(3,), dtype=int32)
tf.Tensor(6.126466, shape=(), dtype=float32)
tf.Tensor([    4 16000     1], shape=(3,), dtype=int32)
tf.Tensor(6.099171, shape=(), dtype=float32)
tf.Tensor([    4 16000     1], shape=(3,), dtype=int32)
tf.Tensor(6.08706, shape=(), dtype=float32)
tf.Tensor([    4 16000     1], shape=(3,), dtype=int32)
tf.Tensor(6.0740623, shape=(), dtype=float32)
tf.Tensor([    4 16000     1], shape=(3,), dtype=int32)
tf.Tensor(6.057068, shape=(), dtype=float32)
tf.Tensor([

tf.Tensor(5.0852027, shape=(), dtype=float32)
tf.Tensor([    4 16000     1], shape=(3,), dtype=int32)
tf.Tensor(5.104956, shape=(), dtype=float32)
tf.Tensor([    4 16000     1], shape=(3,), dtype=int32)
tf.Tensor(5.0416327, shape=(), dtype=float32)
tf.Tensor([    4 16000     1], shape=(3,), dtype=int32)
tf.Tensor(5.075542, shape=(), dtype=float32)
tf.Tensor([    4 16000     1], shape=(3,), dtype=int32)
tf.Tensor(5.053556, shape=(), dtype=float32)
tf.Tensor([    4 16000     1], shape=(3,), dtype=int32)
tf.Tensor(5.0617714, shape=(), dtype=float32)
tf.Tensor([    4 16000     1], shape=(3,), dtype=int32)
tf.Tensor(5.0565095, shape=(), dtype=float32)
tf.Tensor([    4 16000     1], shape=(3,), dtype=int32)
tf.Tensor(5.082287, shape=(), dtype=float32)
tf.Tensor([    4 16000     1], shape=(3,), dtype=int32)
tf.Tensor(5.0506473, shape=(), dtype=float32)
tf.Tensor([    4 16000     1], shape=(3,), dtype=int32)
tf.Tensor(5.0933075, shape=(), dtype=float32)
tf.Tensor([    4 16000     1], shape=(3,