In [1]:
# Test loading files and stacking them into a batch

import tensorflow as tf
import numpy as np
import librosa
from timbre_recognition.modeling.inception_resnet_v2 import *
from timbre_recognition.utils.module_functions import *
from timbre_recognition.utils.io import *
  
y1, sr = load_wav_file_tf('timbre_recognition/datasets/ESC-50/audio/1-977-A-39.wav')
y2, sr = load_wav_file_librosa('timbre_recognition/datasets/ESC-50/audio/1-85168-A-39.wav')

print(y1.shape)
tensor_in = tf.concat(axis=0, values=[y1, y2])
print(tensor_in.get_shape())
print(tensor_in.dtype.name)

(1, 1, 220500, 1)
(2, 1, 220500, 1)
float32


In [2]:
# Test single pass through network

kernel_module = tf.Module()
embedding, endpoints = inception_resnet_v2(tensor_in, kernel_module, embed_dim=128)
#for name, cp in endpoints.items():
#  print(name)
#  print(' ', tf.reduce_max(cp))
#  print(' ', tf.reduce_min(cp))

print("Number of Variables:", len(kernel_module.variables))
#print_module_tree(kernel_module)
#for v in kernel_module.variables:
#  print(v)
print(tensor_in.shape)
for x in endpoints.values():
  print(x.shape)

Number of Variables: 109
(2, 1, 220500, 1)
(2, 1, 55121, 32)
(2, 1, 55102, 32)
(2, 1, 55102, 64)
(2, 1, 13771, 160)
(2, 1, 3438, 192)
(2, 1, 855, 384)
(2, 1, 855, 384)
(2, 1, 855, 384)
(2, 1, 855, 384)
(2, 1, 855, 384)
(2, 1, 855, 384)
(2, 1, 209, 1152)
(2, 1, 209, 1152)
(2, 1, 209, 1152)
(2, 1, 209, 1152)
(2, 1, 209, 1152)
(2, 1, 209, 1152)
(2, 1, 209, 1152)
(2, 1, 209, 1152)
(2, 1, 209, 1152)
(2, 1, 209, 1152)
(2, 1, 209, 1152)
(2, 1, 48, 2144)
(2, 1, 48, 2144)
(2, 1, 48, 2144)
(2, 1, 48, 2144)
(2, 1, 1, 2144)
(2, 128)
(2, 128)


In [3]:
# Test accessing kernels and computing params

import matplotlib.pyplot as plt
import librosa
import librosa.display

shape = kernel_module.Inception_Resnet_A1_Block.Branch_2.Conv2d_0b_20.shape
print(shape)
data = kernel_module.Inception_Resnet_A1_Block.Branch_2.Conv2d_0b_20.numpy()
print(data[0,:,0,0])
print(compute_num_params(kernel_module).numpy())

(1, 20, 32, 48)
[-0.02117196  0.01057281  0.01032447 -0.0133171   0.02197524  0.03374906
  0.04677583  0.06712596  0.02091689 -0.03943025 -0.03660249  0.01657343
  0.02763744 -0.03944011  0.0175179   0.07570648 -0.00486042  0.04881059
 -0.01953663  0.06294896]
45326976


In [5]:
# Test loading dataset

from timbre_recognition.utils.io import *
from timbre_recognition.ops.triplet_loss import *
from timbre_recognition.modeling.inception_resnet_v2 import *
from timbre_recognition.modeling.inception_v4_train import *
import tensorflow as tf
import numpy as np
import librosa

BATCH_SIZE = 12
EMBED_DIM = 128
LOSS_MARGIN = 0.01

data_directory = 'timbre_recognition/datasets/ESC-50/audio/'
d, l = load_esc50_dataset(data_directory)
print(d.shape)
print(l.shape)
print(tf.strings.to_number(l[:, 3], tf.dtypes.int32))

(2000, 1, 220500, 1)
(2000, 4)
tf.Tensor([ 0 14 36 ... 25  8  0], shape=(2000,), dtype=int32)


In [5]:
# Test loading dataset, batching, single pass through network, and loss

from timbre_recognition.utils.io import *
from timbre_recognition.ops.triplet_loss import *
from timbre_recognition.modeling.inception_resnet_v2 import *
from timbre_recognition.modeling.inception_resnet_v2_train import *
from timbre_recognition.utils.module_functions import print_module_tree
import tensorflow as tf

BATCH_SIZE = 12
EMBED_DIM = 128
LOSS_MARGIN = 0.01
BUFFER_SIZE = 2000

data_directory = 'timbre_recognition/datasets/ESC-50/audio/'

"""
with tf.Graph().as_default():
  data_placeholder = tf.placeholder(tf.float32)
  labels_placeholder = tf.placeholder(tf.string)
  dataset = tf.data.Dataset.from_tensor_slices((data_placeholder, labels_placeholder))
  dataset = dataset.batch(BATCH_SIZE)
  #iterator = dataset.make_initializable_iterator()
  batch_data, batch_labels = next(iter(dataset)) #iterator.get_next()
  batch_data = tf.expand_dims(batch_data, 1)
  batch_data = tf.expand_dims(batch_data, 3)
  embeddings, endpoints = inception_v4(batch_data, embed_dim=EMBED_DIM, reuse=tf.AUTO_REUSE)
  triplet_loss = batch_hard_triplet_loss(batch_labels[:, 3], embeddings, LOSS_MARGIN, True) 
  
  with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    data, labels = load_esc50_dataset(data_directory, sess)
    sess.run(iterator.initializer, {data_placeholder : data,
                                   labels_placeholder : labels})
    print('embedding shape:',sess.run(embeddings).shape)
    print('loss:', sess.run(triplet_loss))
    l = sess.run(batch_labels)
"""

data, labels = load_esc50_dataset(data_directory)
dataset = tf.data.Dataset.from_tensor_slices((data, labels)).shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
batch_data, batch_labels = next(iter(dataset))
kernel_module = tf.Module()
embeddings, endpoints = inception_resnet_v2(batch_data, kernel_module, embed_dim=EMBED_DIM)
triplet_loss = batch_hard_triplet_loss(batch_labels[:, 3], embeddings, LOSS_MARGIN, True, 'mahalanobis', kernel_module)
print(triplet_loss)



tf.Tensor(0.010543306, shape=(), dtype=float32)


In [1]:
# Test running multiple epochs with checkpoints

from timbre_recognition.modeling.inception_resnet_v2_train import *
from timbre_recognition.utils.io import *
from timbre_recognition.configs.config import *
import tensorflow as tf

config_file = 'timbre_recognition/configs/test.yaml'
data_directory = 'timbre_recognition/datasets/ESC-50/audio/'

def main():
  print('Loading configuration file...')
  merge_cfg_from_file(config_file)
  print('Configurations loaded')
  
  data, labels = load_esc50_dataset(data_directory)
  kernel_module = tf.Module()
  ckpt = tf.train.Checkpoint(kernel_module=kernel_module)
  manager = tf.train.CheckpointManager(ckpt, 'timbre_recognition/models/test/ckpts', max_to_keep=None)
  
  for epoch in range(cfg.TRAIN.NUM_EPOCHS):
    dataset = tf.data.Dataset.from_tensor_slices((data, labels)).shuffle(cfg.DATASET.BUFFER_SIZE).batch(cfg.TRAIN.BATCH_SIZE)
    batch_data, batch_labels = next(iter(dataset), (None, None))
    while batch_data is not None:
      inception_resnet_v2_train(batch_data, batch_labels, kernel_module, epoch)
      batch_data, batch_labels = next(iter(dataset), (None, None))
      # These are here only because waiting for an epoch is too long
      ckpt = tf.train.Checkpoint(kernel_module=kernel_module)
      manager.save()
    # This is where they would actually go
    # ckpt = tf.train.Checkpoint(kernel_module=kernel_module)
    #  manager.save()

main()

Loading configuration file...
Configurations loaded


W1017 19:25:34.163916 4435789248 deprecation.py:323] From /Users/reed/HonorsThesis/timbre_recognition/lib/python3.7/site-packages/tensorflow/python/ops/math_grad.py:1220: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Loss: tf.Tensor(0.099705815, shape=(), dtype=float32)
Loss: tf.Tensor(5.5740185, shape=(), dtype=float32)
Loss: tf.Tensor(0.23420058, shape=(), dtype=float32)
Loss: tf.Tensor(0.15302522, shape=(), dtype=float32)


KeyboardInterrupt: 

In [2]:
# Test restoring a model

y1, sr = load_wav_file_tf('timbre_recognition/datasets/ESC-50/audio/1-977-A-39.wav')
kernel_module = init_inception_resnet_kernels(y1.shape, cfg.MODEL.EMBED_DIM)
ckpt = tf.train.Checkpoint(kernel_module=kernel_module)
manager = tf.train.CheckpointManager(ckpt, 'timbre_recognition/modeling/test/ckpts', max_to_keep=None)
print(manager.checkpoints)
print(manager.latest_checkpoint)
ckpt.restore(manager.checkpoints[0])
print(kernel_module.Stem.Conv2d_1a_20.numpy()[0,0,0,0])
ckpt.restore(manager.checkpoints[1])
print(kernel_module.Stem.Conv2d_1a_20.numpy()[0,0,0,0])
ckpt.restore(manager.checkpoints[2])
print(kernel_module.Stem.Conv2d_1a_20.numpy()[0,0,0,0])

['timbre_recognition/ckpts/ckpt-4', 'timbre_recognition/ckpts/ckpt-1', 'timbre_recognition/ckpts/ckpt-2', 'timbre_recognition/ckpts/ckpt-3']
timbre_recognition/ckpts/ckpt-3
-0.16371967
0.16113403
0.16136993
