In [2]:
# Test loading files and stacking them into a batch

import tensorflow as tf
import numpy as np
import librosa
from timbre_recognition.modeling.inception_resnet_v2 import *
from timbre_recognition.utils.module_functions import *
from timbre_recognition.utils.io import *
  
# Decode one clip with each loader. Both calls assign `sr`, so after this
# cell it holds the value returned by the librosa-based loader.
y1, sr = load_wav_file_tf(
    'timbre_recognition/datasets/ESC-50/audio/1-977-A-39.wav')
y2, sr = load_wav_file_librosa(
    'timbre_recognition/datasets/ESC-50/audio/1-85168-A-39.wav')

print(y1.shape)
# Join the two clips along axis 0 so they form one two-element batch
tensor_in = tf.concat([y1, y2], axis=0)
print(tensor_in.shape)
print(tensor_in.dtype.name)

(1, 1, 220500, 1)
(2, 1, 220500, 1)
float32


In [3]:
# Test single pass through network

# Start from an empty module and hand it to the network together with the
# stacked batch from the previous cell.
kernel_module = tf.Module()
embeddings, endpoints = inception_resnet_v2(tensor_in, kernel_module, embed_dim=128)

# Optional diagnostics, left disabled:
#for name, cp in endpoints.items():
#  print(name)
#  print(' ', tf.reduce_max(cp))
#  print(' ', tf.reduce_min(cp))
#print_module_tree(kernel_module)
#for v in kernel_module.variables:
#  print(v)

print("Number of Variables:", len(kernel_module.variables))

# Input shape first, then the shape of every recorded endpoint in order
print(tensor_in.shape)
for endpoint in endpoints.values():
  print(endpoint.shape)

Number of Variables: 109
(2, 1, 220500, 1)
(2, 1, 36747, 32)
(2, 1, 36728, 32)
(2, 1, 36728, 64)
(2, 1, 6119, 160)
(2, 1, 1017, 192)
(2, 1, 167, 384)
(2, 1, 167, 384)
(2, 1, 167, 384)
(2, 1, 167, 384)
(2, 1, 167, 384)
(2, 1, 167, 384)
(2, 1, 74, 1152)
(2, 1, 74, 1152)
(2, 1, 74, 1152)
(2, 1, 74, 1152)
(2, 1, 74, 1152)
(2, 1, 74, 1152)
(2, 1, 74, 1152)
(2, 1, 74, 1152)
(2, 1, 74, 1152)
(2, 1, 74, 1152)
(2, 1, 74, 1152)
(2, 1, 28, 2144)
(2, 1, 28, 2144)
(2, 1, 28, 2144)
(2, 1, 28, 2144)
(2, 1, 1, 2144)
(2, 128)
(2, 128)


In [6]:
# Sanity-check the quadratic form (e1 - e2) M (e1 - e2)^T on the first two
# embeddings of the batch.

# Take the matrix size from the embeddings instead of hard-coding 128
embed_dim = int(embeddings.shape[1])

# Turn each embedding into a row vector of shape [1, embed_dim]
e1 = tf.expand_dims(embeddings[0], 0)
e2 = tf.expand_dims(embeddings[1], 0)

# M is just a random matrix here (not symmetric / positive semi-definite),
# so the resulting value is not guaranteed to be non-negative.
M = tf.random.uniform([embed_dim, embed_dim])

# Compute the difference once and reuse it on both sides of M
diff = e1 - e2
tf.matmul(tf.matmul(diff, M), diff, transpose_b=True)


<tf.Tensor: id=5123, shape=(1, 1), dtype=float32, numpy=array([[-0.01500663]], dtype=float32)>

In [3]:
# Test accessing kernels and computing params

import matplotlib.pyplot as plt
import librosa
import librosa.display

# Pick one convolution kernel out of the module tree and inspect it
conv_kernel = kernel_module.Inception_Resnet_A1_Block.Branch_2.Conv2d_0b_20
shape = conv_kernel.shape
print(shape)

data = conv_kernel.numpy()
# All values along axis 1, taken at index 0 of the three other axes
print(data[0, :, 0, 0])

# Parameter count reported by the project helper for the whole module
print(compute_num_params(kernel_module).numpy())

(1, 20, 32, 48)
[-0.02117196  0.01057281  0.01032447 -0.0133171   0.02197524  0.03374906
  0.04677583  0.06712596  0.02091689 -0.03943025 -0.03660249  0.01657343
  0.02763744 -0.03944011  0.0175179   0.07570648 -0.00486042  0.04881059
 -0.01953663  0.06294896]
45326976


In [5]:
# Test loading dataset

from timbre_recognition.utils.io import *
from timbre_recognition.ops.triplet_loss import *
from timbre_recognition.modeling.inception_resnet_v2 import *
from timbre_recognition.modeling.inception_v4_train import *
import tensorflow as tf
import numpy as np
import librosa

# Settings for this experiment (not all are used by the load check below)
BATCH_SIZE, EMBED_DIM, LOSS_MARGIN = 12, 128, 0.01

data_directory = 'timbre_recognition/datasets/ESC-50/audio/'

# d: audio tensor, l: label table as returned by the project loader
d, l = load_esc50_dataset(data_directory)
print(d.shape)
print(l.shape)

# Column 3 of the label table is parsed from strings into int32 values
class_ids = tf.strings.to_number(l[:, 3], out_type=tf.int32)
print(class_ids)

(2000, 1, 220500, 1)
(2000, 4)
tf.Tensor([ 0 14 36 ... 25  8  0], shape=(2000,), dtype=int32)


In [1]:
# Test loading dataset, batching, single pass through network, and loss

from timbre_recognition.utils.io import *
from timbre_recognition.ops.triplet_loss import *
from timbre_recognition.modeling.inception_resnet_v2 import *
from timbre_recognition.modeling.inception_resnet_v2_train import *
from timbre_recognition.utils.module_functions import print_module_tree
import tensorflow as tf

BATCH_SIZE, EMBED_DIM = 12, 128
LOSS_MARGIN = 0.01
BUFFER_SIZE = 2000

data_directory = 'timbre_recognition/datasets/ESC-50/audio/'

data, labels = load_esc50_dataset(data_directory)

# Build the input pipeline one stage at a time: slice -> shuffle -> batch
dataset = tf.data.Dataset.from_tensor_slices((data, labels))
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE)

# Only the first batch is needed for this smoke test
batch_data, batch_labels = next(iter(dataset))

kernel_module = tf.Module()
embeddings, endpoints = inception_resnet_v2(batch_data, kernel_module, embed_dim=EMBED_DIM)

# Column 3 of the batch labels is what the loss receives as class labels
triplet_loss = batch_triplet_semihard_loss(batch_labels[:, 3], embeddings, LOSS_MARGIN, kernel_module)
print(triplet_loss)



tf.Tensor(0.008171577, shape=(), dtype=float32)


In [1]:
# Test running multiple epochs with checkpoints

from timbre_recognition.modeling.inception_resnet_v2_train import *
from timbre_recognition.utils.io import *
from timbre_recognition.configs.config import *
import tensorflow as tf

config_file = 'timbre_recognition/configs/esc-50.yaml'
data_directory = 'timbre_recognition/datasets/ESC-50/audio/'

def main():
  """Run the training loop for cfg.TRAIN.NUM_EPOCHS epochs with checkpoints.

  Merges `config_file` into the project configuration, loads the dataset
  from `data_directory`, and passes every batch of every epoch to
  inception_resnet_v2_train. Checkpoints of the kernel module are written to
  'timbre_recognition/models/test/ckpts'; max_to_keep=None keeps all of them.
  """
  print('Loading configuration file...')
  merge_cfg_from_file(config_file)
  print('Configurations loaded')

  data, labels = load_esc50_dataset(data_directory)
  kernel_module = tf.Module()
  # The manager keeps a reference to this checkpoint object, so it only has
  # to be created once; re-creating it inside the loop had no effect.
  ckpt = tf.train.Checkpoint(kernel_module=kernel_module)
  manager = tf.train.CheckpointManager(ckpt, 'timbre_recognition/models/test/ckpts', max_to_keep=None)

  for epoch in range(cfg.TRAIN.NUM_EPOCHS):
    dataset = tf.data.Dataset.from_tensor_slices((data, labels)).shuffle(cfg.DATASET.BUFFER_SIZE).batch(cfg.TRAIN.BATCH_SIZE)
    # Iterate the dataset directly so one iterator advances through the
    # whole epoch. Calling next(iter(dataset)) per step builds a new iterator
    # each time, which always yields a first batch and never terminates.
    for batch_data, batch_labels in dataset:
      inception_resnet_v2_train(batch_data, batch_labels, kernel_module, epoch)
      # Saved after every batch only because waiting for an epoch is too long
      manager.save()
    # This is where the save would actually go
    # manager.save()

main()

Loading configuration file...
Configurations loaded


W1104 00:57:55.254482 4579198400 deprecation.py:323] From /Users/reed/HonorsThesis/timbre_recognition/ops/triplet_loss.py:345: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Loss: 0.19843207
Loss: 3.0237114


KeyboardInterrupt: 

In [32]:
# Test restoring a model

from timbre_recognition.utils.io import *
from timbre_recognition.ops.triplet_loss import *
from timbre_recognition.modeling.inception_resnet_v2 import *
from timbre_recognition.configs.config import *

y1, sr = load_wav_file_tf('timbre_recognition/datasets/ESC-50/audio/1-977-A-39.wav')

# Use the clip's shape as the kernel-initialisation input shape, with the
# leading dimension forced to 1.
s = y1.shape.as_list()
s[0] = 1
print(s)

kernel_module = init_inception_resnet_kernels(s, cfg.MODEL.EMBED_DIM)
ckpt = tf.train.Checkpoint(kernel_module=kernel_module)
manager = tf.train.CheckpointManager(ckpt, 'timbre_recognition/models/test/ckpts', max_to_keep=None)
print(manager.checkpoints)
print(manager.latest_checkpoint)


def _restore_and_show(checkpoint_path):
  """Restore `ckpt` from `checkpoint_path` and print one weight of the stem kernel."""
  ckpt.restore(checkpoint_path)
  print(kernel_module.Stem.Conv2d_1a_20.numpy()[0,0,0,0])


# Restore three different checkpoints in turn; the printed weight lets the
# restored states be compared by eye.
_restore_and_show(manager.checkpoints[0])
_restore_and_show(manager.checkpoints[1])
_restore_and_show('timbre_recognition/models/test/ckpts/ckpt-3')

[1, 1, 220500, 1]
0.022859508


In [20]:
import tensorflow as tf
from timbre_recognition.utils.evaluate import *
from timbre_recognition.ops.triplet_loss import *

# Random boolean "ground truth": an entry is True when the sampled integer
# (0 or 1) is 0.
truth_mask = tf.random.uniform([32,32], maxval=2, dtype=tf.dtypes.int32) < 1

inverted_truth = tf.logical_not(truth_mask)
# a: equals truth_mask on and below the diagonal, True everywhere above it
a = tf.logical_not(tf.linalg.band_part(inverted_truth, -1, 0))
# b: equals NOT truth_mask on and above the diagonal, True everywhere below it
b = tf.logical_not(tf.linalg.band_part(truth_mask, 0, -1))
congruent_mask = tf.logical_and(a, b)

# An entry counts as correct when the two masks agree: NOT (congruent XOR truth).
# By construction they agree below the diagonal and disagree above it, so the
# accuracy printed at the end is 0.5 whatever the random draw was.
agreement = tf.math.logical_not(tf.math.logical_xor(congruent_mask, truth_mask))
correct = tf.dtypes.cast(agreement, tf.dtypes.int32)
# Remove the diagonal since it is trivial (any sample will have distance 0 from itself)
# The lower triangle may not be symmetric since matrix multiplication is not commutative
correct = correct - tf.linalg.band_part(correct, 0, 0)

# Number of off-diagonal entries; 2 * sum(range(n)) equals n * (n - 1)
num_off_diagonal = 2 * sum(range(correct.shape[0]))
accuracy = tf.reduce_sum(correct) / num_off_diagonal
print(accuracy)

tf.Tensor(0.5, shape=(), dtype=float64)


In [22]:
data, labels = load_esc50_test_set('timbre_recognition/datasets/ESC-50/audio/')
# Keep only column 3 of the label table, parsed from strings into int32
labels = tf.strings.to_number(labels[:, 3], tf.dtypes.int32)

# NOTE(review): `cfg` is not imported in this cell; it has to come from an
# earlier cell's `from timbre_recognition.configs.config import *`.
dataset = tf.data.Dataset.from_tensor_slices((data, labels)).shuffle(cfg.DATASET.BUFFER_SIZE).batch(cfg.TRAIN.BATCH_SIZE)
dataset_iter = iter(dataset)
batch_data, batch_labels = next(dataset_iter, (None, None))

# Pairwise label equality via broadcasting: entry [i, j] is True when samples
# i and j carry the same label. A column view [n, 1] is compared against a row
# view [1, n], giving the same [n, n] mask as comparing the batch against each
# label in a Python loop, but with a single op.
truth_mask = tf.math.equal(tf.expand_dims(batch_labels, 1), tf.expand_dims(batch_labels, 0))

In [29]:
# Count the True entries of truth_mask: cast the booleans to int32 and sum them
correct = tf.cast(truth_mask, tf.int32)
print(tf.math.reduce_sum(correct))

tf.Tensor(134, shape=(), dtype=int32)


In [191]:
# Test evaluating nearest-class-center accuracy on the test set (after one forward pass on a training batch)

from timbre_recognition.utils.io import *
from timbre_recognition.ops.triplet_loss import *
from timbre_recognition.modeling.inception_resnet_v2 import *
from timbre_recognition.modeling.inception_resnet_v2_train import *
from timbre_recognition.utils.module_functions import print_module_tree
import tensorflow as tf

BATCH_SIZE = 32
EMBED_DIM = 128
LOSS_MARGIN = 0.01
BUFFER_SIZE = 2000
NUM_CLASSES = 50
# The evaluation loop used to stop unconditionally after the first batch.
# That remains the default; set to False to evaluate every test batch.
EVAL_FIRST_BATCH_ONLY = True

data_directory = 'timbre_recognition/datasets/ESC-50/audio/'

# One forward pass on a shuffled training batch with a fresh module
data, labels = load_esc50_dataset(data_directory)
dataset = tf.data.Dataset.from_tensor_slices((data, labels)).shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
batch_data, batch_labels = next(iter(dataset))
kernel_module = tf.Module()
embeddings, endpoints = inception_resnet_v2(batch_data, kernel_module, embed_dim=EMBED_DIM)

#==========================================

data, labels = load_esc50_test_set(data_directory)
# Placeholder class centers drawn at random, one row per class.
# NOTE(review): these are int32 while the embeddings printed earlier in the
# notebook are float32 - confirm pairwise_mahalanobis_distances accepts that.
class_centers = tf.random.uniform([NUM_CLASSES, EMBED_DIM], maxval=49, dtype=tf.dtypes.int32)
labels = tf.strings.to_number(labels[:, 3], tf.dtypes.int32)

dataset = tf.data.Dataset.from_tensor_slices((data, labels)).batch(BATCH_SIZE)
dataset_iter = iter(dataset)
batch_data, batch_labels = next(dataset_iter, (None, None))

accuracies = []

# next(..., (None, None)) yields the sentinel once the iterator is exhausted
while batch_data is not None:
  embeddings, endpoints = inception_resnet_v2(batch_data, kernel_module, embed_dim=EMBED_DIM)
  distances = pairwise_mahalanobis_distances(embeddings, class_centers, kernel_module)
  # shape [batch_size]
  predictions = tf.argsort(distances, 1)[:,0] # 0 index holds class center closest to embedding, taken along entire batch
  results = tf.math.equal(batch_labels, predictions)
  num_correct = tf.math.count_nonzero(results)
  batch_accuracy = num_correct / len(results)
  accuracies.append(batch_accuracy)
  if EVAL_FIRST_BATCH_ONLY:
    break

  batch_data, batch_labels = next(dataset_iter, (None, None))
accuracy = tf.reduce_mean(accuracies)
# str has no `.accuracy` attribute; the value has to go through format()
print("Accuracy: {}".format(accuracy))