In [7]:
import tensorflow as tf
from glob import glob
import h5py
import tensorflow.keras.backend as k

import matplotlib.pyplot as plt
from random import seed, shuffle
import numpy as np

In [8]:
# Load the PCA parameters published with VGGish, to undo their postprocessing.
# The .npz archive is opened in a context manager so its file handle is closed
# as soon as the two arrays have been read into memory (`params` itself is a
# closed archive after this block).
with np.load('vggish_pca_params.npz') as params:
    pca_matrix = params['pca_eigen_vectors']
    pca_means = params['pca_means']

# Inverse of the PCA projection matrix; used to map embeddings back out of
# PCA space in `un_post_process`.
pca_matrix_inv = np.linalg.inv(pca_matrix)

In [9]:
def un_post_process(x):
    """
    Approximately undo the VGGish post-processing of a batch of embeddings.

    VGGish postprocesses the features by:
        1. Applying the PCA matrix
        2. Clipping the resulting embeddings (this is lossy, no way to undo)
        3. Scaling to be between 0 and 255

    To undo these operations we need to know the following parameters:
        * QUANTIZE_MIN_VAL & QUANTIZE_MAX_VAL (from vggish params)
            = -2.0, +2.0
        * pca_matrix & pca_means (from downloadable npz file)

    We then apply the following operations to reverse the post-processing:
        1. Scale from (0, 255) to (QUANTIZE_MIN_VAL, QUANTIZE_MAX_VAL)
        2. Invert the PCA application:
            x = dot(x, pca_matrix_inv.T) + pca_means
           For a 2D `x` this equals
            transpose(dot(pca_matrix_inv, transpose(x))) + pca_means
           but it only touches the last axis, so it also works when `x`
           carries extra leading axes (e.g. batch and frame axes).

    Args:
        x: tensor (or array) of quantized embeddings whose LAST axis is the
           embedding dimension matching `pca_matrix_inv`.

    Returns:
        A new tensor with the same shape as `x`; the argument is not modified.
    """
    # Work on a float tensor copy instead of mutating the caller's object.
    x = k.cast(x, k.floatx())

    # Wrap the NumPy parameters as backend constants with a matching dtype.
    inv_transposed = k.constant(pca_matrix_inv.T, dtype=k.floatx())
    # Flatten the means so they broadcast along the last (feature) axis.
    # NOTE(review): assumes pca_means holds one value per embedding
    # dimension (stored as a vector or a column) -- confirm against the npz.
    means = k.constant(np.reshape(pca_means, (-1,)), dtype=k.floatx())

    # (0, 255) -> (-2.0, +2.0)
    x = x * (4.0 / 255.0) - 2.0

    # Right-multiplying by the transposed inverse avoids transposing `x`,
    # which would reverse *every* axis of a higher-rank input.
    return k.dot(x, inv_transposed) + means

In [10]:
# Packed training features; opened read-only and kept open because the
# datasets below are read from it later on.
TRAIN_FEATURES_PATH = './packed_features/unbal_train.h5'
f = h5py.File(TRAIN_FEATURES_PATH, 'r')

In [11]:
# Handles to the feature ('x') and label ('y') datasets inside the open file.
all_x, all_y = f['x'], f['y']

In [12]:
# One index per labelled example: 0, 1, ..., N - 1.
N = len(all_y)
full_indices = list(range(N))

In [13]:
RANDOM_SEED = 999

# Seed both TensorFlow and Python's `random` module with the same value.
tf.set_random_seed(RANDOM_SEED)
seed(RANDOM_SEED)

# Shuffle the example indices in place; the train/test split below is taken
# from this shuffled order.
shuffle(full_indices)

In [14]:
# Number of shuffled examples held out at the end of `full_indices` for testing.
test_number = 200_000

# Everything except the last `test_number` shuffled indices is used for training.
indices = full_indices[:-test_number]

In [15]:
# Column zero of the label matrix is the speech index.
speech_labels = all_y[:, 0]
# Pick the training rows, in shuffled order.
y = speech_labels[indices]

In [16]:
def normalize_x_inplace(x):
    """
    Rescale raw feature values in place.

    Computes ``(x - 128) / 255 * 2`` element-wise, which maps the raw range
    0..255 to roughly -1.0..1.0.

    Args:
        x: float array of raw feature values; it is modified in place.

    Returns:
        None.
    """
    x -= 128.0
    x /= 255.0
    x *= 2.0 # Original note: this is because vggish has QUANTIZE_<MIN/MAX>_VALUE = <-2, 2.0>


# `Dataset.value` is deprecated in h5py (and removed in h5py 3.x); indexing
# with `[()]` reads the whole dataset into a NumPy array in the same way.
# NOTE(review): this still materialises the full dataset, then a row-selected
# copy, then a float32 copy. On machines where that does not fit in memory
# the features need to be streamed in batches instead.
x = all_x[()][indices].astype('float32')
normalize_x_inplace(x)



MemoryError: Unable to allocate array with shape (2041789, 10, 128) and data type uint8

In [None]:
# Shape of a single example, i.e. everything after the leading batch axis.
input_shape = x.shape[1:]

features_input = tf.keras.layers.Input(input_shape)

# TODO: undoing the VGGish post-processing as a first layer is currently
# disabled; it is not yet confirmed whether that step preserves the shape.
# hidden = tf.keras.layers.Lambda(un_post_process)(features_input)

# Small fully-connected binary classifier on the flattened features.
hidden = tf.keras.layers.Flatten()(features_input)
hidden = tf.keras.layers.Dense(64, activation='relu')(hidden)
hidden = tf.keras.layers.Dense(32, activation='relu')(hidden)
# Single sigmoid unit: one probability-like score per example.
output = tf.keras.layers.Dense(1, activation='sigmoid')(hidden)

model = tf.keras.Model(features_input, output, name='AudioClassifier')

In [None]:
# Adam optimizer with binary cross-entropy, matching the single sigmoid output.
model.compile(optimizer='adam', loss='binary_crossentropy')

In [None]:
# Print the layer-by-layer summary of the model built above.
model.summary()

In [None]:
# Early stopping with a patience of 5 epochs; the best weights seen are
# restored when training stops.
early_stopping = tf.keras.callbacks.EarlyStopping(
    patience=5,
    restore_best_weights=True,
)

# Hold out the last 10% of the training data for validation.
history = model.fit(
    x,
    y,
    batch_size=128,
    epochs=100,
    validation_split=0.1,
    callbacks=[early_stopping],
)

In [None]:
# Plot training & validation loss values.
# `val_loss` comes from the validation split passed to `model.fit`, not from
# the held-out test set, so the legend labels it "Validation".
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

In [None]:
# The last `test_number` shuffled indices were excluded from training.
test_indices = full_indices[-test_number:]

# `Dataset.value` is deprecated in h5py (and removed in h5py 3.x); `[()]`
# reads the whole dataset into a NumPy array in the same way.
test_x = all_x[()][test_indices].astype('float32')
test_y = all_y[:, 0][test_indices]

# The model was trained on normalised features, so the test features must go
# through the same normalisation before prediction.
normalize_x_inplace(test_x)

In [None]:
# Scores from the sigmoid output for every test example.
pred_y = model.predict(test_x)

# Scores strictly above the threshold count as a positive prediction.
threshold = 0.5
pred_boolean = pred_y > threshold

In [None]:
# Spread of the raw prediction scores.
print(pred_y.mean())
print(pred_y.std())

# Element-wise agreement between thresholded predictions and the labels.
pred_correctness = pred_boolean.ravel() == test_y
n_total = len(pred_correctness)
n_correct = pred_correctness.sum()
print(f"accuracy: {n_correct/n_total}")

In [None]:
# model.save_weights('classifier_weights')