In [None]:
import os

import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
from sklearn.model_selection import train_test_split
from tensorflow.keras import Model
from tensorflow.keras.applications import Xception, VGG16
from tensorflow.keras.layers import Dense, Flatten, Concatenate
from tensorflow.keras.models import load_model

import utils.data_util as data_util

In [None]:
# Ask TensorFlow to allocate GPU memory on demand rather than reserving it all.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # The memory-growth setting has to be identical on every GPU,
        # so apply it to each device in turn.
        for gpu in gpus:
            print(gpu)
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as err:
        # Raised when the GPUs were already initialized: memory growth
        # can only be set beforehand. Report it and carry on.
        print(err)

# Load Data

In [None]:
# Load the inputs (X) and labels (Y) for each modality via the project helper.
X_orig_audio, Y_orig_audio = data_util.load_audio_filenames()
X_orig_face, Y_orig_face = data_util.load_facial_filenames()

# Show the shape of each modality's input array, audio first.
for _loaded_inputs in (X_orig_audio, X_orig_face):
    print(_loaded_inputs.shape)

# Split Data

In [None]:
# Split the face and audio data into matching train/test partitions.
print(X_orig_face.shape)
print(X_orig_audio.shape)

# Sanity check of the loaded data: both modalities must carry identical labels,
# row for row. np.array_equal also returns False on a shape mismatch, and
# failing here stops the notebook instead of training on misaligned labels.
if not np.array_equal(Y_orig_face, Y_orig_audio):
    raise ValueError("invalid1: facial and audio labels differ before splitting")

# Fixed seed so the train/test partition is reproducible across kernel restarts.
SPLIT_SEED = 42

# Passing all four arrays in a single call makes train_test_split apply the
# same row permutation to each of them. The result is a flat list of
# (train, test) pairs in argument order.
train_test_distribution = train_test_split(X_orig_face, Y_orig_face, X_orig_audio, Y_orig_audio,
                                           test_size=0.2, shuffle=True,
                                           random_state=SPLIT_SEED)
(X_train_face, X_test_face,
 Y_train_face, Y_test_face,
 X_train_audio, X_test_audio,
 Y_train_audio, Y_test_audio) = train_test_distribution

# Sanity check after splitting: labels must still agree within each partition.
if not np.array_equal(Y_test_face, Y_test_audio):
    raise ValueError("invalid: facial and audio test labels differ after splitting")
if not np.array_equal(Y_train_face, Y_train_audio):
    raise ValueError("invalid: facial and audio train labels differ after splitting")

# Y_train_face and Y_train_audio are equivalent (verified above), and so are
# Y_test_face and Y_test_audio, so a single label array per partition suffices.
Y_train = Y_train_face
Y_test = Y_test_face

In [None]:
# Positional settings shared by both face generators.
# NOTE(review): presumably batch size 4 and a 224x224 input size -- confirm
# against data_util.FaceDataGenerator.
_face_gen_settings = (4, 224, 224)

# The held-out test partition doubles as the validation set.
X_train_face_gen = data_util.FaceDataGenerator(X_train_face, Y_train_face, *_face_gen_settings)
X_val_face_gen = data_util.FaceDataGenerator(X_test_face, Y_test_face, *_face_gen_settings)

In [None]:
# Positional settings shared by both audio generators.
# NOTE(review): presumably batch size 4 and a 224x224 input size -- confirm
# against data_util.AudioDataGenerator.
_audio_gen_settings = (4, 224, 224)

# The held-out test partition doubles as the validation set.
X_train_audio_gen = data_util.AudioDataGenerator(X_train_audio, Y_train_audio, *_audio_gen_settings)
X_val_audio_gen = data_util.AudioDataGenerator(X_test_audio, Y_test_audio, *_audio_gen_settings)

# Pretrain the Model

## Define function to pretrain the model

In [None]:
def pretrain(train_data_gen, val_data_gen, epochs, name):
    """Train a VGG16-based 8-class classifier and save the model and its history.

    The network is VGG16 (built with Keras defaults) whose last layer is
    replaced by a new 8-unit softmax layer fed from the second-to-last layer.
    The model is compiled with RMSprop and categorical cross-entropy, trained,
    its training-accuracy curve is plotted, and both the model and the
    per-epoch metrics are written under ``./temp``.

    Args:
        train_data_gen: Training data handed straight to ``model.fit``.
        val_data_gen: Validation data handed to ``model.fit`` as
            ``validation_data``.
        epochs: Number of training epochs.
        name: Suffix appended to the model name, and therefore to the saved
            file names (e.g. ``'face'``).

    Returns:
        None. Results are persisted to disk:
        ``./temp/saved_models/pretrained/<model name>.h5`` and
        ``./temp/history/pretrained/<model name>_history.npy``.
    """
    model_save_dir_path = './temp/saved_models/pretrained'
    model_history_path = './temp/history/pretrained'
    # Create the output directories before training so that a filesystem
    # problem surfaces immediately instead of after the whole training run.
    os.makedirs(model_save_dir_path, exist_ok=True)
    os.makedirs(model_history_path, exist_ok=True)

    base_model = VGG16()
    # Drop VGG16's own final layer: the new 8-way softmax head is attached to
    # the output of the second-to-last layer.
    output_layer = Dense(8, activation='softmax')(base_model.layers[-2].output)
    model = Model(base_model.input, output_layer,
                  name='pretrained_' + base_model.name + '_' + name)
    model.compile(optimizer='rmsprop', loss='categorical_crossentropy',
                  metrics=['categorical_accuracy'])

    history = model.fit(train_data_gen, validation_data=val_data_gen, epochs=epochs)

    # Training accuracy per epoch.
    plt.plot(history.history['categorical_accuracy'])
    plt.title(model.name)
    plt.xlabel('epoch')
    plt.ylabel('categorical_accuracy')

    model.save(model_save_dir_path + '/' + model.name + '.h5')

    # history.history is a dict, so NumPy stores it as a pickled object array;
    # reload it with np.load(path, allow_pickle=True).item().
    np.save(model_history_path + '/' + model.name + '_history.npy', history.history)

### Pretrain the Audio and Facial Networks

In [None]:
# Pretrain the facial network for 200 epochs; outputs are tagged 'face'.
pretrain(train_data_gen=X_train_face_gen, val_data_gen=X_val_face_gen,
         epochs=200, name='face')

In [None]:
# Pretrain the audio network for 200 epochs; outputs are tagged 'audio'.
pretrain(train_data_gen=X_train_audio_gen, val_data_gen=X_val_audio_gen,
         epochs=200, name='audio')