In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
import numpy as np
import IPython
import librosa
from model.conv_ae import ConvolutionalEncoder, ConvolutionalDecoder, ConvolutionalAutoencoder
from model.sequence_encoder import SequenceEncoder
from model.sing import SINGModel

## Dataset-related functions

In [2]:
# Prepare NSynth TFRecordDataset to pass to model
def prepare_nsynth_dataset(dataset, time_embeddings = True, batch_size = 128):
    """Turn a dataset of serialized NSynth examples into shuffled, batched model inputs.

    Args:
        dataset: a `tf.data` dataset whose elements are serialized examples
            containing the NSynth features listed in `parse_nsynth`.
        time_embeddings: when True, the per-note conditioning vector is
            repeated once per timestep, so each example's 'inputs' has shape
            (250, 1262). When False, 'inputs' is the flat (1262,) vector.
        batch_size: number of examples per batch. Defaults to 128, the value
            that used to be hard-coded.

    Returns:
        A dataset that is shuffled with a 10000-element buffer, repeated
        indefinitely, parsed and batched. Each element is a dict with
        'inputs' (the one-hot conditioning) and 'outputs' (the 64000-sample
        audio).
    """
    # One-hot widths of the three note attributes; 1006 + 128 + 128 = 1262.
    instr_dim_size = 1006
    pitch_dim_size = 128
    vel_dim_size = 128
    timesteps = 250
    shuffle_buffer_size = 10000

    def process_parsed_features(point):
        """Build the {'inputs', 'outputs'} pair for one parsed example."""
        instrument = tf.one_hot(point['instrument'], instr_dim_size)
        pitch = tf.one_hot(point['pitch'], pitch_dim_size)
        velocity = tf.one_hot(point['velocity'], vel_dim_size)
        audio = point['audio']

        inputs = tf.concat([instrument, pitch, velocity], axis = 0)
        if time_embeddings:
            # Repeat the conditioning vector along a leading time axis so that
            # every row is the complete 1262-wide vector. The earlier version
            # stacked along the last axis and then reshaped (1262, 250) to
            # (250, 1262), which reinterprets memory instead of transposing
            # and therefore scrambled every row.
            #
            # NOTE(review): the earlier loop also built a one-hot of the
            # timestep index but never appended it, so rows carry no time
            # information. That is kept as-is because appending it would widen
            # each row from 1262 to 1512 - TODO confirm which width the
            # downstream model expects before enabling it.
            inputs = tf.tile(tf.reshape(inputs, [1, -1]), [timesteps, 1])
        return {'inputs': inputs, 'outputs': audio}

    def parse_nsynth(example_proto):
        """Parse one serialized example and convert it to model features."""
        features = {
            "audio": tf.io.FixedLenFeature((4 * 16000), tf.float32),
            "note": tf.io.FixedLenFeature((), dtype = tf.int64),
            "note_str": tf.io.FixedLenFeature((), dtype = tf.string),
            "instrument": tf.io.FixedLenFeature((), dtype = tf.int64),
            "instrument_str": tf.io.FixedLenFeature((), dtype = tf.string),
            "instrument_source": tf.io.FixedLenFeature((), dtype = tf.int64),
            "instrument_source_str": tf.io.FixedLenFeature((), dtype = tf.string),
            "instrument_family_str": tf.io.FixedLenFeature((), dtype = tf.string),
            "sample_rate": tf.io.FixedLenFeature((), dtype = tf.int64),
            "velocity": tf.io.FixedLenFeature((), dtype = tf.int64),
            "pitch": tf.io.FixedLenFeature((), dtype = tf.int64),
        }
        parsed_features = tf.io.parse_single_example(example_proto, features)
        return process_parsed_features(parsed_features)

    def tfr_dataset_eager(data, batch_size):
        """Shuffle, repeat forever, parse and batch the raw records."""
        # shuffle().repeat() and map().batch() are the documented replacements
        # for the deprecated tf.data.experimental.shuffle_and_repeat and
        # map_and_batch transformations.
        data = data.shuffle(shuffle_buffer_size).repeat()
        data = data.map(parse_nsynth).batch(batch_size)
        return data

    return tfr_dataset_eager(dataset, batch_size)

## Data processing utilities

In [3]:
def get_spectrogram_for_audio(raw_wav):
    """Placeholder for spectrogram extraction; not implemented yet.

    The argument is ignored and the function always returns None.
    """
    return None

# Pass in a waveform (e.g. a tensor of shape (64000, )) to print it and show an audio player
def generate_audio_sample(raw_wav, sample_rate = 16000):
    """Print a waveform and render it as an inline audio player.

    Args:
        raw_wav: the waveform samples. Either an object exposing `.numpy()`
            (such as an eager tensor) or anything `np.asarray` accepts.
        sample_rate: playback rate in Hz passed to the audio widget.
            Defaults to 16000, the value that used to be hard-coded.

    Returns:
        None. The samples are printed and the player is displayed as a side
        effect.
    """
    # Previously only objects with .numpy() worked; plain arrays and lists
    # raised AttributeError.
    if hasattr(raw_wav, 'numpy'):
        np_wav = raw_wav.numpy()
    else:
        np_wav = np.asarray(raw_wav)
    print(np_wav)
    IPython.display.display(IPython.display.Audio(np_wav, rate = sample_rate, autoplay = False))
    print('DOne!')

## Create various models that we want to work with

In [4]:
# Instantiate the convolutional autoencoder. Constructing it prints the
# "Building Encoder .." banner and the layer summary recorded as this cell's output.
conv_autoencoder = ConvolutionalAutoencoder()

Building Encoder ..
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d (Conv1D)              (None, 250, 4096)         4198400   
_________________________________________________________________
activation (Activation)      (None, 250, 4096)         0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 250, 4096)         16781312  
_________________________________________________________________
activation_1 (Activation)    (None, 250, 4096)         0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 250, 4096)         16781312  
_________________________________________________________________
activation_2 (Activation)    (None, 250, 4096)         0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 

## Get and parse NSynth dataset

In [5]:
# Open the NSynth training TFRecord file and pass it straight through the
# parse/shuffle/batch pipeline; `dataset` is the batched result.
dataset = prepare_nsynth_dataset(
    tf.data.TFRecordDataset('/data/NSynth/nsynth-train.tfrecord')
)

## Train the models 

In [6]:
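# NOTE: standalone autoencoder training is disabled. The SING training cell
# below logs its own "pretraining of Convolutional Autoencoder" phase.
# print('Starting to train the Convolutional Autoencoder ...')
# train_summary_writer = tf.summary.create_file_writer('model_logs/conv_ae_train')
# with train_summary_writer.as_default():
#     conv_autoencoder.train(dataset)
# print('Convolutional Autoencoder Done!')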

In [7]:
# Build the SING model. In the recorded output the model summaries appear
# before the banner below, i.e. they are printed during construction.
sing = SINGModel()
print('Starting to train the SING Model  ...')
# Long-running: the recorded run was stopped by hand (it ends in KeyboardInterrupt),
# so the final print below was never reached in that run.
sing.train(dataset)
print('SING Model Done!')

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_7 (Conv1D)            (None, 250, 4096)         4198400   
_________________________________________________________________
activation_6 (Activation)    (None, 250, 4096)         0         
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 250, 4096)         16781312  
_________________________________________________________________
activation_7 (Activation)    (None, 250, 4096)         0         
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 250, 4096)         16781312  
_________________________________________________________________
activation_8 (Activation)    (None, 250, 4096)         0         
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 250, 128)         

W0730 09:23:40.491105 139957570455296 tf_logging.py:161] <tensorflow.python.keras.layers.recurrent.UnifiedLSTM object at 0x7f496078ab38>: Note that this layer is not optimized for performance. Please use tf.keras.layers.CuDNNLSTM for better performance on GPU.


Model: "ConvolutionalDecoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
sequential_3 (Sequential)    (None, 64000)             42479617  
Total params: 42,479,617
Trainable params: 42,479,617
Non-trainable params: 0
_________________________________________________________________
None


W0730 09:23:41.742495 139957570455296 tf_logging.py:161] <tensorflow.python.keras.layers.recurrent.UnifiedLSTM object at 0x7f496072bc50>: Note that this layer is not optimized for performance. Please use tf.keras.layers.CuDNNLSTM for better performance on GPU.
W0730 09:23:42.872797 139957570455296 tf_logging.py:161] <tensorflow.python.keras.layers.recurrent.UnifiedLSTM object at 0x7f4960677f98>: Note that this layer is not optimized for performance. Please use tf.keras.layers.CuDNNLSTM for better performance on GPU.


Model: "sequence_encoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
model (Model)                (128, 250, 128)           26284160  
Total params: 26,284,160
Trainable params: 26,284,160
Non-trainable params: 0
_________________________________________________________________
None
Starting to train the SING Model  ...
Restored from model_logs_conv_ae/ckpt-6
Initializing pretraining of Convolutional Autoencoder ...
---------- EPOCH 0 --------------
tf.Tensor(8.947485, shape=(), dtype=float32)
tf.Tensor(8.769326, shape=(), dtype=float32)
tf.Tensor(9.049629, shape=(), dtype=float32)
tf.Tensor(4.1887064, shape=(), dtype=float32)
tf.Tensor(3.3746812, shape=(), dtype=float32)
tf.Tensor(2.6890635, shape=(), dtype=float32)
tf.Tensor(1.5240571, shape=(), dtype=float32)
tf.Tensor(2.007308, shape=(), dtype=float32)
tf.Tensor(1.8965354, shape=(), dtype=float32)
Generating audio ...
----- Original waveform --

----- Ouput waveform -----


----- Original waveform -----


----- Ouput waveform -----


Saved checkpoint for step 7000: model_logs_conv_ae/ckpt-7
tf.Tensor(4.057577, shape=(), dtype=float32)
tf.Tensor(3.1503472, shape=(), dtype=float32)
tf.Tensor(4.9345684, shape=(), dtype=float32)
tf.Tensor(2.632113, shape=(), dtype=float32)
tf.Tensor(6.2508984, shape=(), dtype=float32)
tf.Tensor(8.322128, shape=(), dtype=float32)
tf.Tensor(5.1185036, shape=(), dtype=float32)
tf.Tensor(4.487918, shape=(), dtype=float32)
tf.Tensor(3.9776363, shape=(), dtype=float32)
tf.Tensor(6.543234, shape=(), dtype=float32)
Generating audio ...
----- Original waveform -----


----- Ouput waveform -----


----- Original waveform -----


----- Ouput waveform -----


Saved checkpoint for step 8000: model_logs_conv_ae/ckpt-8
tf.Tensor(9.300043, shape=(), dtype=float32)
tf.Tensor(4.841147, shape=(), dtype=float32)
tf.Tensor(7.0228066, shape=(), dtype=float32)
tf.Tensor(6.4391794, shape=(), dtype=float32)
tf.Tensor(10.605047, shape=(), dtype=float32)
tf.Tensor(8.182633, shape=(), dtype=float32)
tf.Tensor(5.9947643, shape=(), dtype=float32)
tf.Tensor(4.926874, shape=(), dtype=float32)
tf.Tensor(2.5030348, shape=(), dtype=float32)
tf.Tensor(2.1808267, shape=(), dtype=float32)
Generating audio ...
----- Original waveform -----


----- Ouput waveform -----


----- Original waveform -----


----- Ouput waveform -----


Saved checkpoint for step 9000: model_logs_conv_ae/ckpt-9
tf.Tensor(1.3017652, shape=(), dtype=float32)
tf.Tensor(1.7236198, shape=(), dtype=float32)
tf.Tensor(2.576961, shape=(), dtype=float32)
tf.Tensor(3.5788596, shape=(), dtype=float32)
tf.Tensor(2.9651184, shape=(), dtype=float32)
tf.Tensor(3.1890473, shape=(), dtype=float32)
tf.Tensor(6.1260767, shape=(), dtype=float32)
tf.Tensor(6.1194754, shape=(), dtype=float32)
tf.Tensor(5.7715025, shape=(), dtype=float32)
tf.Tensor(4.206377, shape=(), dtype=float32)
Generating audio ...
----- Original waveform -----


----- Ouput waveform -----


----- Original waveform -----


----- Ouput waveform -----


Saved checkpoint for step 10000: model_logs_conv_ae/ckpt-10
tf.Tensor(5.426952, shape=(), dtype=float32)
tf.Tensor(3.4805856, shape=(), dtype=float32)
tf.Tensor(8.400748, shape=(), dtype=float32)
tf.Tensor(6.8993015, shape=(), dtype=float32)
tf.Tensor(5.8277063, shape=(), dtype=float32)
tf.Tensor(6.529763, shape=(), dtype=float32)
tf.Tensor(6.5670986, shape=(), dtype=float32)
tf.Tensor(10.418541, shape=(), dtype=float32)
tf.Tensor(9.037015, shape=(), dtype=float32)
tf.Tensor(3.68977, shape=(), dtype=float32)
Generating audio ...
----- Original waveform -----


----- Ouput waveform -----


----- Original waveform -----


----- Ouput waveform -----


Saved checkpoint for step 11000: model_logs_conv_ae/ckpt-11
tf.Tensor(3.0869694, shape=(), dtype=float32)
tf.Tensor(2.9658835, shape=(), dtype=float32)
tf.Tensor(1.5003936, shape=(), dtype=float32)
tf.Tensor(1.387525, shape=(), dtype=float32)
tf.Tensor(2.0813947, shape=(), dtype=float32)
tf.Tensor(2.3780437, shape=(), dtype=float32)
tf.Tensor(3.982792, shape=(), dtype=float32)
tf.Tensor(4.1800065, shape=(), dtype=float32)
tf.Tensor(2.8373337, shape=(), dtype=float32)
tf.Tensor(6.1414056, shape=(), dtype=float32)
Generating audio ...
----- Original waveform -----


----- Ouput waveform -----


----- Original waveform -----


----- Ouput waveform -----


Saved checkpoint for step 12000: model_logs_conv_ae/ckpt-12
tf.Tensor(10.950701, shape=(), dtype=float32)
tf.Tensor(5.6930084, shape=(), dtype=float32)
tf.Tensor(3.4958715, shape=(), dtype=float32)
tf.Tensor(5.1487217, shape=(), dtype=float32)
tf.Tensor(5.8628426, shape=(), dtype=float32)
tf.Tensor(8.550547, shape=(), dtype=float32)
tf.Tensor(6.070922, shape=(), dtype=float32)
tf.Tensor(7.840426, shape=(), dtype=float32)
tf.Tensor(4.821252, shape=(), dtype=float32)
tf.Tensor(9.911927, shape=(), dtype=float32)
Generating audio ...
----- Original waveform -----


----- Ouput waveform -----


----- Original waveform -----


----- Ouput waveform -----


Saved checkpoint for step 13000: model_logs_conv_ae/ckpt-13
tf.Tensor(9.769505, shape=(), dtype=float32)
tf.Tensor(6.6178627, shape=(), dtype=float32)
tf.Tensor(3.4407344, shape=(), dtype=float32)
tf.Tensor(2.775579, shape=(), dtype=float32)
tf.Tensor(3.8312511, shape=(), dtype=float32)
tf.Tensor(1.2687595, shape=(), dtype=float32)
tf.Tensor(1.8584267, shape=(), dtype=float32)
tf.Tensor(2.9416547, shape=(), dtype=float32)
tf.Tensor(6.1483335, shape=(), dtype=float32)


KeyboardInterrupt: 