In [2]:
import tensorflow as tf 
from tensorflow import keras 
import tensorflow_addons as tfa 
import pandas as pd
import numpy as np 
from models import load_scene_model, load_face_model, load_audio_model, load_text_glove_model

### Load models

In [3]:
# Build the four unimodal networks via the project loaders (presumably
# pretrained; the loaders live in `models` and are not shown here).
# The construction order (scene, face, audio, text) is kept unchanged because
# Keras assigns auto-generated layer names in creation order.
scene_model, face_model, audio_model, text_model = (
    load_scene_model(),
    load_face_model(),
    load_audio_model(),
    load_text_glove_model(),
)

### Freeze layers

In [4]:
# Freeze every layer of the scene branch and tag its layer names with a
# modality prefix (presumably so names stay unique once the branches are
# combined into a single model).
SCENE_PREFIX = 'Scene_'
for layer in scene_model.layers:
    layer.trainable = False
    # Guard makes the cell idempotent: re-running it must not yield
    # 'Scene_Scene_...'. Assumes no original layer name starts with the prefix.
    if not layer.name.startswith(SCENE_PREFIX):
        # `_name` is a private Keras attribute; `name` itself is read-only.
        layer._name = SCENE_PREFIX + layer.name
scene_model.summary()

Model: "scene_model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Scene_Input (InputLayer)        [(None, 10, 224, 224 0                                            
__________________________________________________________________________________________________
Scene_Rescaling (TimeDistribute (None, 10, 224, 224, 0           Scene_Input[0][0]                
__________________________________________________________________________________________________
Scene_time_distributed (TimeDis (None, 10, 224, 224, 0           Scene_Input[0][0]                
__________________________________________________________________________________________________
Scene_vgg16 (TimeDistributed)   (None, 10, 512)      14714688    Scene_Rescaling[0][0]            
________________________________________________________________________________________

In [5]:
# Freeze every layer of the face branch and tag its layer names with a
# modality prefix (presumably so names stay unique once the branches are
# combined into a single model).
FACE_PREFIX = 'Face_'
for layer in face_model.layers:
    layer.trainable = False
    # Guard makes the cell idempotent: re-running it must not yield
    # 'Face_Face_...'. Assumes no original layer name starts with the prefix.
    if not layer.name.startswith(FACE_PREFIX):
        # `_name` is a private Keras attribute; `name` itself is read-only.
        layer._name = FACE_PREFIX + layer.name
face_model.summary()

Model: "face_model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Face_Input (InputLayer)         [(None, 10, 224, 224 0                                            
__________________________________________________________________________________________________
Face_Rescaling (TimeDistributed (None, 10, 224, 224, 0           Face_Input[0][0]                 
__________________________________________________________________________________________________
Face_time_distributed_2 (TimeDi (None, 10, 224, 224, 0           Face_Input[0][0]                 
__________________________________________________________________________________________________
Face_vgg16 (TimeDistributed)    (None, 10, 512)      14714688    Face_Rescaling[0][0]             
_________________________________________________________________________________________

In [6]:
# Freeze every layer of the audio branch and tag its layer names with a
# modality prefix (presumably so names stay unique once the branches are
# combined into a single model).
AUDIO_PREFIX = 'Audio_'
for layer in audio_model.layers:
    layer.trainable = False
    # Guard makes the cell idempotent: re-running it must not yield
    # 'Audio_Audio_...'. Assumes no original layer name starts with the prefix.
    if not layer.name.startswith(AUDIO_PREFIX):
        # `_name` is a private Keras attribute; `name` itself is read-only.
        layer._name = AUDIO_PREFIX + layer.name
audio_model.summary()

Model: "audio_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Audio_input_2 (InputLayer)   [(None, 15, 128)]         0         
_________________________________________________________________
Audio_conv1d (Conv1D)        (None, 14, 32)            8224      
_________________________________________________________________
Audio_dropout_6 (Dropout)    (None, 14, 32)            0         
_________________________________________________________________
Audio_conv1d_1 (Conv1D)      (None, 13, 64)            4160      
_________________________________________________________________
Audio_dropout_7 (Dropout)    (None, 13, 64)            0         
_________________________________________________________________
Audio_lstm_8 (LSTM)          (None, 13, 512)           1181696   
_________________________________________________________________
Audio_lstm_9 (LSTM)          (None, 256)               

In [7]:
# Freeze every layer of the text branch and tag its layer names with a
# modality prefix (presumably so names stay unique once the branches are
# combined into a single model).
TEXT_PREFIX = 'Text_'
for layer in text_model.layers:
    layer.trainable = False
    # Guard makes the cell idempotent: re-running it must not yield
    # 'Text_Text_...'. Assumes no original layer name starts with the prefix.
    if not layer.name.startswith(TEXT_PREFIX):
        # `_name` is a private Keras attribute; `name` itself is read-only.
        layer._name = TEXT_PREFIX + layer.name
text_model.summary()

Model: "text_model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Text_input_3 (InputLayer)       [(None, 50)]         0                                            
__________________________________________________________________________________________________
Text_embedding (Embedding)      (None, 50, 100)      1105200     Text_input_3[0][0]               
__________________________________________________________________________________________________
Text_conv1d_2 (Conv1D)          (None, 48, 16)       4816        Text_embedding[0][0]             
__________________________________________________________________________________________________
Text_conv1d_4 (Conv1D)          (None, 48, 32)       9632        Text_embedding[0][0]             
_________________________________________________________________________________________

### Build model

In [31]:
# Cross-modal attention head built on intermediate activations of the four
# frozen unimodal models. The final model is wired to the sub-models' own
# inputs, so no separate Input placeholders are created here.
# NOTE(review): the negative layer indices are tied to the exact layer order of
# the loaded models -- re-check them against the summaries if a loader changes.
scene_feat = scene_model.layers[-13].output
face_feat = face_model.layers[-13].output

# Visual stream: element-wise average of scene and face features, then a
# 64-unit projection.
video_feat = keras.layers.Average()([scene_feat, face_feat])
video_feat = keras.layers.Dense(64, activation='relu')(video_feat)

audio_feat = audio_model.layers[-6].output
text_feat = text_model.layers[-8].output
text_feat = keras.layers.Dense(64, activation='relu')(text_feat)

# Pairwise cross-attention. MultiHeadAttention is called as (query, value), so
# "video->audio" means video queries attending over audio.
a1 = keras.layers.MultiHeadAttention(num_heads=2, key_dim=64)(video_feat, audio_feat)  # video->audio
a2 = keras.layers.MultiHeadAttention(num_heads=2, key_dim=64)(audio_feat, video_feat)  # audio->video

a3 = keras.layers.MultiHeadAttention(num_heads=2, key_dim=64)(video_feat, text_feat)   # video->text
a4 = keras.layers.MultiHeadAttention(num_heads=2, key_dim=64)(text_feat, video_feat)   # text->video

a5 = keras.layers.MultiHeadAttention(num_heads=2, key_dim=64)(audio_feat, text_feat)   # audio->text
a6 = keras.layers.MultiHeadAttention(num_heads=2, key_dim=64)(text_feat, audio_feat)   # text->audio

# Stack the six attended sequences along axis 1, then average over that axis.
fused = keras.layers.Concatenate(axis=1)([a1, a2, a3, a4, a5, a6])
fused = keras.layers.GlobalAveragePooling1D()(fused)

# Regression head: 5 sigmoid outputs.
fused = keras.layers.Dense(64, activation='relu')(fused)
predictions = keras.layers.Dense(5, activation='sigmoid')(fused)

atten_model = keras.models.Model(
    inputs=[scene_model.input, face_model.input, audio_model.input, text_model.input],
    outputs=predictions,
)

atten_model.compile(loss='mse', optimizer=tfa.optimizers.RectifiedAdam(), metrics=['mae'])




In [32]:
import datetime

# Month-day_hour-minute-second stamp; gives every run its own checkpoint folder.
t = datetime.datetime.now().strftime("%m%d_%H%M%S")
checkpoint_path = './weights/cross_atten/' + t + '/attention.t5'

# Stops after 10 epochs without improvement; `monitor` is left at the Keras
# default, while the checkpoint below tracks 'val_mae'.
early_stopping = keras.callbacks.EarlyStopping(patience=10, verbose=0)

# Keeps only the weights of the epoch with the lowest validation MAE.
check_point = keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    monitor='val_mae',
    mode='min',
    save_best_only=True,
    save_weights_only=True,
    verbose=0,
)

### Load data

In [33]:
AUTOTUNE = tf.data.AUTOTUNE

# Every modality is stored as its own (features, labels) dataset. The feature
# streams are zipped positionally and the labels are taken from the scene
# dataset only, so all four datasets must list samples in the same order.
# NOTE(review): only the text dataset is batched here; scene/face/audio are
# presumably stored already batched with the same batch size (32) -- confirm,
# since `zip` pairs elements one-to-one.

# Train
scene_train_ds = tf.data.experimental.load('./data/fullscene/train_ds/')
face_train_ds  = tf.data.experimental.load('./data/faces/train_ds/')
audio_train_ds = tf.data.experimental.load('./data/audio/train_ds/')
text_train_ds  = tf.data.experimental.load('./data/text/train_ds/').batch(batch_size=32)

scene_xtrain = scene_train_ds.map(lambda x, y: x)
face_xtrain  = face_train_ds.map(lambda x, y: x)
audio_xtrain = audio_train_ds.map(lambda x, y: x)
text_xtrain  = text_train_ds.map(lambda x, y: x)
y_train      = scene_train_ds.map(lambda x, y: y)

# Shuffling happens after zipping, so it reorders whole zipped elements and
# keeps the modalities and labels aligned.
train_ds = (
    tf.data.Dataset.zip(((scene_xtrain, face_xtrain, audio_xtrain, text_xtrain), y_train))
    .shuffle(buffer_size=1000)
    .prefetch(buffer_size=AUTOTUNE)
)


# Valid
scene_valid_ds = tf.data.experimental.load('./data/fullscene/valid_ds/')
face_valid_ds  = tf.data.experimental.load('./data/faces/valid_ds/')
audio_valid_ds = tf.data.experimental.load('./data/audio/valid_ds')
text_valid_ds  = tf.data.experimental.load('./data/text/valid_ds/').batch(batch_size=32)

scene_xvalid = scene_valid_ds.map(lambda x, y: x)
face_xvalid  = face_valid_ds.map(lambda x, y: x)
audio_xvalid = audio_valid_ds.map(lambda x, y: x)
text_xvalid  = text_valid_ds.map(lambda x, y: x)
y_valid      = scene_valid_ds.map(lambda x, y: y)

# No shuffle for validation: element order does not affect the aggregated
# validation metrics, and a shuffle buffer of video batches only costs memory.
valid_ds = (
    tf.data.Dataset.zip(((scene_xvalid, face_xvalid, audio_xvalid, text_xvalid), y_valid))
    .prefetch(buffer_size=AUTOTUNE)
)

train_ds, valid_ds

(<PrefetchDataset shapes: (((None, 10, 224, 224, 3), (None, 10, 224, 224, 3), (None, 15, 128), (None, 50)), (None, 5)), types: ((tf.float32, tf.float32, tf.float32, tf.int32), tf.float32)>,
 <PrefetchDataset shapes: (((None, 10, 224, 224, 3), (None, 10, 224, 224, 3), (None, 15, 128), (None, 50)), (None, 5)), types: ((tf.float32, tf.float32, tf.float32, tf.int32), tf.float32)>)

### Train

In [34]:
# `train_ds` / `valid_ds` already yield batches, so `batch_size` is not passed:
# Keras documents that it must not be specified for dataset inputs.
history = atten_model.fit(
    train_ds,
    validation_data=valid_ds,
    epochs=100,
    callbacks=[early_stopping, check_point],
    verbose=1,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100


### Load weights

In [11]:
# Restore checkpointed weights before evaluating.
# NOTE(review): the run folder is hard-coded and this cell's execution count is
# out of sequence with the training cell -- confirm this checkpoint is the one
# the reported metrics should come from.
BEST_WEIGHTS_PATH = './weights/cross_atten/0302_223958/attention.t5'
atten_model.load_weights(BEST_WEIGHTS_PATH)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x21df0766dc0>

## Evaluation

### Validation data

In [35]:
AUTOTUNE = tf.data.AUTOTUNE

# Rebuild the validation pipeline without shuffling, so that two separate
# passes over it (one for labels, one for predictions) see the same order.
scene_valid_ds = tf.data.experimental.load('./data/fullscene/valid_ds/')
face_valid_ds = tf.data.experimental.load('./data/faces/valid_ds/')
audio_valid_ds = tf.data.experimental.load('./data/audio/valid_ds')
text_valid_ds = tf.data.experimental.load('./data/text/valid_ds/').batch(batch_size=32)

# Split each (features, labels) dataset; labels come from the scene dataset.
scene_xvalid = scene_valid_ds.map(lambda features, labels: features)
face_xvalid = face_valid_ds.map(lambda features, labels: features)
audio_xvalid = audio_valid_ds.map(lambda features, labels: features)
text_xvalid = text_valid_ds.map(lambda features, labels: features)
y_valid = scene_valid_ds.map(lambda features, labels: labels)

valid_inputs = (scene_xvalid, face_xvalid, audio_xvalid, text_xvalid)
valid_ds = tf.data.Dataset.zip((valid_inputs, y_valid)).prefetch(AUTOTUNE)


In [36]:
from sklearn.metrics import mean_absolute_error

# Ground truth: gather the label batches from one pass over the dataset.
valid_label_batches = [labels for _, labels in valid_ds]
y_true = np.concatenate(valid_label_batches, axis=0)

# Predictions: a second pass over the same (unshuffled) dataset.
y_pred = atten_model.predict(valid_ds)

# One MAE per output column; reported as (1 - MAE) * 100, per column and
# averaged over columns.
mae = mean_absolute_error(y_true, y_pred, multioutput='raw_values')
(1 - mae) * 100, (1 - np.mean(mae)) * 100

(array([91.574974, 92.207054, 91.53847 , 91.64992 , 91.32228 ],
       dtype=float32),
 91.65853783488274)

### Test data

In [37]:
# Test split, assembled like the validation pipeline (no shuffling).
# NOTE(review): these paths point to the parent directory ('../...'), unlike
# the './data/...' paths used for train/valid -- confirm this is intended.
scene_test_ds = tf.data.experimental.load('../fullscene/test_ds/')
face_test_ds = tf.data.experimental.load('../faces/test_ds/')
audio_test_ds = tf.data.experimental.load('../audio/test_ds')
text_test_ds = tf.data.experimental.load('../text/test_ds/').batch(batch_size=32)

# Split each (features, labels) dataset; labels come from the scene dataset.
scene_xtest = scene_test_ds.map(lambda features, labels: features)
face_xtest = face_test_ds.map(lambda features, labels: features)
audio_xtest = audio_test_ds.map(lambda features, labels: features)
text_xtest = text_test_ds.map(lambda features, labels: features)
y_test = scene_test_ds.map(lambda features, labels: labels)

# AUTOTUNE is defined in an earlier cell.
test_inputs = (scene_xtest, face_xtest, audio_xtest, text_xtest)
test_ds = tf.data.Dataset.zip((test_inputs, y_test)).prefetch(AUTOTUNE)

test_ds

<PrefetchDataset shapes: (((None, 10, 224, 224, 3), (None, 10, 224, 224, 3), (None, 15, 128), (None, 50)), (None, 5)), types: ((tf.float32, tf.float32, tf.float32, tf.int32), tf.float32)>

In [38]:
# Ground truth: gather the label batches from one pass over the test dataset.
test_label_batches = [labels for _, labels in test_ds]
y_true = np.concatenate(test_label_batches, axis=0)

# Predictions: a second pass over the same (unshuffled) dataset.
y_pred = atten_model.predict(test_ds)

# One MAE per output column; reported as (1 - MAE) * 100, per column and
# averaged over columns. `mean_absolute_error` is imported in an earlier cell.
mae = mean_absolute_error(y_true, y_pred, multioutput='raw_values')
(1 - mae) * 100, (1 - np.mean(mae)) * 100

(array([91.1931 , 92.16037, 91.49895, 91.25758, 91.33639], dtype=float32),
 91.48927703499794)

In [None]:
import os
import pickle

# Persist the per-epoch metrics dict returned by `fit`.
# `open(..., 'wb')` does not create missing parent folders, so create the
# output directory first rather than failing after a long training run.
os.makedirs('./histories', exist_ok=True)
with open('./histories/attention_cross.pkl', 'wb') as f:
    pickle.dump(history.history, f)