In [None]:
import librosa
import librosa.display
import numpy as np
import utils.data_util as data_util
import matplotlib.pyplot as plt
from tensorflow.keras.applications import Xception
from tensorflow.keras.layers import Dense, Flatten, Concatenate
from tensorflow.keras import Model
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import load_model
import tensorflow.keras as keras

In [None]:
# Turn on memory growth for every physical GPU TensorFlow can see, then report
# how many physical and logical GPUs are available.
gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    # Currently, memory growth needs to be the same across GPUs
    for device in gpus:
        print(device)
        tf.config.experimental.set_memory_growth(device, True)
    # Nothing is queried or printed when no GPU was found.
    if gpus:
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
except RuntimeError as err:
    # Memory growth must be set before GPUs have been initialized
    print(err)


# Load Data

In [None]:
# X_orig_audio, Y_orig_audio = data_util.load_audio_data(preprocess = True)

In [None]:
# X_orig_face, Y_orig_face = data_util.load_facial_data(preprocess = True)

In [None]:
# Load inputs and labels for both modalities through the *_filenames loaders.
X_orig_audio, Y_orig_audio = data_util.load_audio_filenames()
X_orig_face, Y_orig_face = data_util.load_facial_filenames()

# Disabled alternative kept for reference:
# X_orig_audio, Y_orig_audio = data_util.load_audio_data()
# X_orig_face, Y_orig_face = data_util.load_facial_data()

# Audio shape on the first line, face shape on the second.
print(X_orig_audio.shape, X_orig_face.shape, sep="\n")

# Split Data

In [None]:
# split data
print(X_orig_face.shape)
print(X_orig_audio.shape)

# Fixed seed so that every "Restart & Run All" produces the same train/test
# partition. Without it, a model saved in one session and reloaded in another
# could be evaluated on samples it was trained on.
SPLIT_SEED = 42


def _report_label_mismatches(labels_a, labels_b, message):
    """Print ``message`` once for every row in which two label arrays differ.

    Args:
        labels_a: Row-indexable label array of the first modality.
        labels_b: Row-indexable label array of the second modality.
        message: Text printed for each mismatching row.

    Returns:
        Number of mismatching rows (0 when all compared rows agree).
    """
    mismatched_rows = 0
    for row_a, row_b in zip(labels_a, labels_b):
        if not np.array_equal(row_a, row_b):
            print(message)
            mismatched_rows += 1
    return mismatched_rows


# Sanity check of the loaded data: face and audio labels must agree row by row.
_report_label_mismatches(Y_orig_face, Y_orig_audio, "invalid1")

# A single train_test_split call keeps the four arrays aligned: each of them is
# partitioned with the same shuffled indices.
train_test_distribution = train_test_split(X_orig_face, Y_orig_face, X_orig_audio, Y_orig_audio,
                                           test_size = 0.2, shuffle = True,
                                           random_state = SPLIT_SEED)
(X_train_face, X_test_face,
 Y_train_face, Y_test_face,
 X_train_audio, X_test_audio,
 Y_train_audio, Y_test_audio) = train_test_distribution

# Sanity check after splitting: the labels must still agree in both partitions.
_report_label_mismatches(Y_test_face, Y_test_audio, "invalid")
_report_label_mismatches(Y_train_face, Y_train_audio, "invalid")

# Y_train_face and Y_train_audio are equivalent,
# and Y_test_face and Y_test_audio are also equivalent
Y_train = Y_train_face
Y_test = Y_test_face

In [None]:
# Wrap the face train and test splits in data_util.DataGenerator objects.
# NOTE(review): the third argument (8) is presumably the batch size — confirm
# against data_util.DataGenerator.
X_train_face_gen = data_util.DataGenerator(X_train_face, Y_train_face, 8)
X_val_face_gen = data_util.DataGenerator(X_test_face, Y_test_face, 8)

In [None]:
# librosa.display.specshow(X_orig_audio[0,:,:,0])
# fig = plt.figure(frameon=False)
# ax = plt.Axes(fig, [0., 0., 1., 1.])
# ax.set_axis_off()
# fig.add_axes(ax)

# ax.pcolormesh(X_orig_audio[0,:,:,0])
# fig.savefig('./temp/fig.jpg', dpi = 320, optimize = True)

## Use Xception for training facial data

In [None]:
# Build the Xception network with its default constructor arguments; it is the
# backbone of the face classifier.
model = Xception()

In [None]:
# Attach a new 8-way softmax layer ('fc3') to the output of the backbone's
# second-to-last layer, rebuild the model around it and compile it.
fc3_head = Dense(units=8, activation='softmax', name='fc3')
output_layer = fc3_head(model.layers[-2].output)
model = Model(inputs=model.input, outputs=output_layer)
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)
model.summary()

In [None]:
# Train the face classifier from the training generator. The held-out generator
# is passed as validation data so that `history` also records validation
# metrics for every epoch; previously it was built but never used.
history = model.fit(X_train_face_gen, validation_data = X_val_face_gen, epochs = 200)

In [None]:
# model.save('./temp/xception_trained_face.h5')

In [None]:
# plt.plot(history.history['accuracy'])

In [None]:
# Replace the in-memory `model` with the one stored on disk.
# NOTE(review): the cell that would write this file (model.save) is commented
# out in this notebook, so the file must already exist from an earlier session.
model = load_model('./temp/xception_trained_face.h5')

In [None]:
# Evaluate through the held-out generator (built from X_test_face / Y_test_face)
# so that evaluation uses the same input pipeline as training, which feeds the
# model from a DataGenerator rather than from the raw split arrays.
model.evaluate(X_val_face_gen)

## Use Xception for audio data

In [None]:
# Build a second Xception network, again with default constructor arguments,
# as the backbone of the audio classifier.
model_audio = Xception()

In [None]:
# Attach a new 8-way softmax layer ('f3') to the output of the audio backbone's
# second-to-last layer, rebuild the model around it and compile it.
f3_head = Dense(units=8, activation='softmax', name='f3')
output_layer = f3_head(model_audio.layers[-2].output)
model_audio = Model(inputs=model_audio.input, outputs=output_layer)
model_audio.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)

In [None]:
# Train the audio classifier. validation_data is passed as an (x_val, y_val)
# tuple, the form documented for Model.fit; a list is not reliably unpacked
# into inputs and targets across TensorFlow versions.
# NOTE(review): X_train_audio / X_test_audio come from
# data_util.load_audio_filenames(); confirm they hold arrays fit() can consume
# directly rather than file names.
history = model_audio.fit(X_train_audio, Y_train_audio, validation_data = (X_test_audio, Y_test_audio), batch_size = 16, epochs = 500)

In [None]:
# Print the object currently bound to `history` (the return value of the most
# recent fit call).
print(history)

In [None]:
# Plot the 'loss' series recorded in history.history.
plt.plot(history.history['loss'])

In [None]:
# Plot the 'categorical_accuracy' series recorded in history.history (the
# metric the models are compiled with).
plt.plot(history.history['categorical_accuracy'])

In [None]:
# Evaluate the audio model on the test split of the audio data.
model_audio.evaluate(X_test_audio, Y_test_audio)

In [None]:
# Write the audio model to ./temp/audio_model.h5.
model_audio.save('./temp/audio_model.h5')

In [None]:
# Persist the recorded training metrics next to the notebook.
# NOTE(review): history.history is indexed by string keys above, so it is
# presumably a dict; NumPy stores non-array objects via pickle, which means
# np.load will need allow_pickle=True to read this file back — confirm.
np.save('audio_history.npy',history.history)

In [None]:
# model = Model(input_layer, layer)
# model.compile(optimizer = 'rmsprop', loss = 'categorical_crossentropy', metrics = ['categorical_accuracy'])
# model.summary()

In [None]:
def _suffixed_xception(suffix):
    """Build an Xception model and append ``suffix`` to the name of every layer.

    The rename goes through each layer's private ``_name`` attribute;
    presumably it keeps layer names unique when two Xception instances end up
    inside one Model.
    """
    backbone = Xception()
    for sub_layer in backbone.layers:
        sub_layer._name = sub_layer.name + suffix
    return backbone


# One backbone per modality; each branch contributes the output of its
# second-to-last layer.
model_face = _suffixed_xception('_face')
output_layer_face = model_face.layers[-2].output

model_audio = _suffixed_xception('_audio')
output_layer_audio = model_audio.layers[-2].output

# Fusion head: concatenate both branch outputs, pass them through a
# 2048-unit Dense layer and finish with an 8-way softmax.
# NOTE(review): Dense(2048) is created without an activation argument —
# confirm that this is intended.
merged = Concatenate()([output_layer_face, output_layer_audio])
hidden = Dense(2048)(merged)
layer = Dense(8, activation = 'softmax')(hidden)

model = Model([model_face.input, model_audio.input], layer)
model.compile(
    optimizer = 'rmsprop',
    loss = 'categorical_crossentropy',
    metrics = ['categorical_accuracy'],
)


In [None]:
# Print the layer-by-layer summary of the current `model`.
model.summary()

In [None]:
# with tf.device('/CPU:0'):
# NOTE(review): `model` was last built with two inputs
# ([model_face.input, model_audio.input]), while X_train_face_gen was
# constructed from X_train_face / Y_train_face only — confirm that the
# generator supplies both inputs before running this cell.
history = model.fit(X_train_face_gen, epochs = 500)