In [1]:
import tensorflow as tf 
from tensorflow import keras 
import tensorflow_addons as tfa 
import pandas as pd
import numpy as np 
from models import load_scene_model, load_face_model, load_audio_model, load_text_glove_model

 The versions of TensorFlow you are currently using is 2.6.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


### Load models

In [2]:
# Build the four single-modality sub-models (scene, face, audio, text) with the
# loader helpers imported from the local `models` module. The loaders are called
# in the same left-to-right order as before.
scene_model, face_model, audio_model, text_model = (
    load_scene_model(),
    load_face_model(),
    load_audio_model(),
    load_text_glove_model(),
)

### Freeze layers

In [3]:
# Freeze every layer of the scene sub-model and tag each layer name with a
# 'Scene_' prefix, so the layer's origin is visible in model summaries.
prefix = 'Scene_'
for layer in scene_model.layers:
    layer.trainable = False
    # `layer.name` has no public setter here, hence the write to the private `_name`.
    # The guard keeps the cell idempotent: re-running it would otherwise stack
    # prefixes ('Scene_Scene_...').
    if not layer.name.startswith(prefix):
        layer._name = prefix + layer.name
scene_model.summary()

Model: "scene_model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Scene_Input (InputLayer)        [(None, 10, 224, 224 0                                            
__________________________________________________________________________________________________
Scene_Rescaling (TimeDistribute (None, 10, 224, 224, 0           Scene_Input[0][0]                
__________________________________________________________________________________________________
Scene_time_distributed (TimeDis (None, 10, 224, 224, 0           Scene_Input[0][0]                
__________________________________________________________________________________________________
Scene_vgg16 (TimeDistributed)   (None, 10, 512)      14714688    Scene_Rescaling[0][0]            
________________________________________________________________________________________

In [4]:
# Freeze every layer of the face sub-model and tag each layer name with a
# 'Face_' prefix, so the layer's origin is visible in model summaries.
prefix = 'Face_'
for layer in face_model.layers:
    layer.trainable = False
    # `layer.name` has no public setter here, hence the write to the private `_name`.
    # The guard keeps the cell idempotent: re-running it would otherwise stack
    # prefixes ('Face_Face_...').
    if not layer.name.startswith(prefix):
        layer._name = prefix + layer.name
face_model.summary()

Model: "face_model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Face_Input (InputLayer)         [(None, 10, 224, 224 0                                            
__________________________________________________________________________________________________
Face_Rescaling (TimeDistributed (None, 10, 224, 224, 0           Face_Input[0][0]                 
__________________________________________________________________________________________________
Face_time_distributed_2 (TimeDi (None, 10, 224, 224, 0           Face_Input[0][0]                 
__________________________________________________________________________________________________
Face_vgg16 (TimeDistributed)    (None, 10, 512)      14714688    Face_Rescaling[0][0]             
_________________________________________________________________________________________

In [5]:
# Freeze every layer of the audio sub-model and tag each layer name with an
# 'Audio_' prefix, so the layer's origin is visible in model summaries.
prefix = 'Audio_'
for layer in audio_model.layers:
    layer.trainable = False
    # `layer.name` has no public setter here, hence the write to the private `_name`.
    # The guard keeps the cell idempotent: re-running it would otherwise stack
    # prefixes ('Audio_Audio_...').
    if not layer.name.startswith(prefix):
        layer._name = prefix + layer.name
audio_model.summary()

Model: "audio_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Audio_input_2 (InputLayer)   [(None, 15, 128)]         0         
_________________________________________________________________
Audio_conv1d (Conv1D)        (None, 14, 32)            8224      
_________________________________________________________________
Audio_dropout_6 (Dropout)    (None, 14, 32)            0         
_________________________________________________________________
Audio_conv1d_1 (Conv1D)      (None, 13, 64)            4160      
_________________________________________________________________
Audio_dropout_7 (Dropout)    (None, 13, 64)            0         
_________________________________________________________________
Audio_lstm_8 (LSTM)          (None, 13, 512)           1181696   
_________________________________________________________________
Audio_lstm_9 (LSTM)          (None, 256)               

In [6]:
# Freeze every layer of the text sub-model and tag each layer name with a
# 'Text_' prefix, so the layer's origin is visible in model summaries.
prefix = 'Text_'
for layer in text_model.layers:
    layer.trainable = False
    # `layer.name` has no public setter here, hence the write to the private `_name`.
    # The guard keeps the cell idempotent: re-running it would otherwise stack
    # prefixes ('Text_Text_...').
    if not layer.name.startswith(prefix):
        layer._name = prefix + layer.name
text_model.summary()

Model: "text_model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Text_input_3 (InputLayer)       [(None, 50)]         0                                            
__________________________________________________________________________________________________
Text_embedding (Embedding)      (None, 50, 100)      1105200     Text_input_3[0][0]               
__________________________________________________________________________________________________
Text_conv1d_2 (Conv1D)          (None, 48, 16)       4816        Text_embedding[0][0]             
__________________________________________________________________________________________________
Text_conv1d_4 (Conv1D)          (None, 48, 32)       9632        Text_embedding[0][0]             
_________________________________________________________________________________________

### Build model

In [7]:
# Fused self-attention model.
#
# For each modality an intermediate sequence output of the sub-model is taken,
# projected to 64 features where a Dense layer is applied, and passed through its
# own self-attention block. The four sequences are then concatenated along the
# sequence axis, averaged over that axis and mapped to 5 outputs.
#
# No new Input layers are created: the fused model is wired directly to the
# sub-models' own inputs (see `inputs=` below). The previously declared
# `keras.layers.Input` placeholders were never connected to the graph.
#
# NOTE(review): the negative layer indices pick an intermediate layer of each
# sub-model and depend on the exact architectures returned by the loaders in
# `models` -- re-check them whenever those architectures change.

# Scene branch
x = scene_model.layers[-13].output
x = keras.layers.Dense(64, activation='relu')(x)
x = keras.layers.MultiHeadAttention(num_heads=2, key_dim=64)(x, x)  # query == value: self-attention
print('Scene shape:', x.shape)

# Face branch
y = face_model.layers[-13].output
y = keras.layers.Dense(64, activation='relu')(y)
y = keras.layers.MultiHeadAttention(num_heads=2, key_dim=64)(y, y)
print('Face shape:', y.shape)

# Audio branch: no Dense projection is applied before attention here.
z = audio_model.layers[-6].output
z = keras.layers.MultiHeadAttention(num_heads=2, key_dim=64)(z, z)
print('Audio shape:', z.shape)

# Text branch
w = text_model.layers[-8].output
w = keras.layers.Dense(64, activation='relu')(w)
w = keras.layers.MultiHeadAttention(num_heads=2, key_dim=64)(w, w)
print('Text shape:', w.shape)

# Join the four sequences along axis 1 (the sequence axis); this requires all
# branches to share the same feature size on the last axis.
o = keras.layers.Concatenate(axis=1)([x, y, z, w])

# Collapse the sequence axis into a single feature vector per sample.
o = keras.layers.GlobalAveragePooling1D()(o)

# Regression head with 5 outputs.
o = keras.layers.Dense(128, activation='relu')(o)
o = keras.layers.Dropout(0.2)(o)
# Sigmoid bounds every prediction to (0, 1).
# NOTE(review): presumably the labels are scaled to that range -- confirm.
o = keras.layers.Dense(5, activation='sigmoid')(o)

atten_model = keras.models.Model(
    inputs=[scene_model.input, face_model.input, audio_model.input, text_model.input],
    outputs=o,
)
atten_model.compile(loss='mse', optimizer=tfa.optimizers.RectifiedAdam(), metrics=['mae'])

atten_model.summary()

Scene shape: (None, 10, 64)
Face shape: (None, 10, 64)
Audio shape: (None, 13, 64)
Text shape: (None, 46, 64)
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Scene_Input (InputLayer)        [(None, 10, 224, 224 0                                            
__________________________________________________________________________________________________
Face_Input (InputLayer)         [(None, 10, 224, 224 0                                            
__________________________________________________________________________________________________
Audio_input_2 (InputLayer)      [(None, 15, 128)]    0                                            
__________________________________________________________________________________________________
Text_input_3 (InputLayer)       [(None, 50)]         0                             

In [8]:
import datetime

# Timestamp (month-day_hour-minute-second) that gives every run its own weights folder.
t = datetime.datetime.now().strftime("%m%d_%H%M%S")
# NOTE(review): '.t5' is not the Keras HDF5 extension ('.h5'), so the weights are
# presumably written in TensorFlow checkpoint format -- confirm this is intended.
checkpoint_path = f'./weights/self_attention/{t}/attention.t5'

# Stop training once the validation loss has not improved for 10 epochs
# ('val_loss' is the EarlyStopping default, spelled out here).
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, verbose=0)

# Keep only the weights of the epoch with the lowest validation MAE.
# Note that this monitors a different quantity than early stopping does.
check_point = keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    monitor='val_mae',
    mode='min',
    save_best_only=True,
    save_weights_only=True,
    verbose=0,
)

### Load data

In [9]:
AUTOTUNE = tf.data.AUTOTUNE

# Each modality is stored as its own saved tf.data dataset of (features, labels).
# Only the text dataset is batched here.
# NOTE(review): the other three are presumably saved already batched with the
# same batch size (32) -- zipping only lines up if that holds; confirm.
# Labels are taken from the scene dataset.
# NOTE(review): assumes every modality dataset stores samples in the same order.

# Train
scene_train_ds = tf.data.experimental.load('./data/fullscene/train_ds/')
face_train_ds  = tf.data.experimental.load('./data/faces/train_ds/')
audio_train_ds = tf.data.experimental.load('./data/audio/train_ds/')
text_train_ds  = tf.data.experimental.load('./data/text/train_ds/').batch(batch_size=32)

scene_xtrain = scene_train_ds.map(lambda x,y: x)
face_xtrain  = face_train_ds.map(lambda x,y: x)
audio_xtrain = audio_train_ds.map(lambda x,y: x)
text_xtrain  = text_train_ds.map(lambda x,y: x)
y_train      = scene_train_ds.map(lambda x,y: y)

# Shuffle AFTER zipping so features and labels stay aligned. Because the
# elements are batches, this reorders whole batches, not individual samples.
train_ds = (
    tf.data.Dataset.zip(((scene_xtrain, face_xtrain, audio_xtrain, text_xtrain), y_train))
    .shuffle(buffer_size=1000)
    .prefetch(buffer_size=AUTOTUNE)
)


# Valid
scene_valid_ds = tf.data.experimental.load('./data/fullscene/valid_ds/')
face_valid_ds  = tf.data.experimental.load('./data/faces/valid_ds/')
audio_valid_ds = tf.data.experimental.load('./data/audio/valid_ds')
text_valid_ds  = tf.data.experimental.load('./data/text/valid_ds/').batch(batch_size=32)

scene_xvalid = scene_valid_ds.map(lambda x,y: x)
face_xvalid  = face_valid_ds.map(lambda x,y: x)
audio_xvalid = audio_valid_ds.map(lambda x,y: x)
text_xvalid  = text_valid_ds.map(lambda x,y: x)
y_valid      = scene_valid_ds.map(lambda x,y: y)

# Validation data is NOT shuffled: the metrics are aggregated over the whole
# split, so order does not matter, and skipping the shuffle avoids buffering
# large video batches and keeps the iteration order deterministic.
valid_ds = (
    tf.data.Dataset.zip(((scene_xvalid, face_xvalid, audio_xvalid, text_xvalid), y_valid))
    .prefetch(buffer_size=AUTOTUNE)
)

train_ds, valid_ds

(<PrefetchDataset shapes: (((None, 10, 224, 224, 3), (None, 10, 224, 224, 3), (None, 15, 128), (None, 50)), (None, 5)), types: ((tf.float32, tf.float32, tf.float32, tf.int32), tf.float32)>,
 <PrefetchDataset shapes: (((None, 10, 224, 224, 3), (None, 10, 224, 224, 3), (None, 15, 128), (None, 50)), (None, 5)), types: ((tf.float32, tf.float32, tf.float32, tf.int32), tf.float32)>)

### Train model

In [10]:
# `train_ds` and `valid_ds` are tf.data datasets that already yield batches, so
# no `batch_size` is passed: Keras documents that it must not be specified for
# dataset inputs, and the previous `batch_size=4` did not reflect the real batch size.
history = atten_model.fit(
    train_ds,
    validation_data=valid_ds,
    epochs=100,
    callbacks=[early_stopping, check_point],
    verbose=1,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100


### Load weights

In [11]:
# Restore weights from the run folder '0301_223332'.
# NOTE(review): the path is hard-coded to one specific run; a fresh training run
# writes to a new timestamped folder, so update this path to evaluate it.
weights_path = './weights/self_attention/0301_223332/attention.t5'
atten_model.load_weights(weights_path)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x21df0766dc0>

## Evaluation

### Validation data

In [12]:
AUTOTUNE = tf.data.AUTOTUNE

# Validation split, one saved dataset per modality. Only the text dataset is
# batched here (32). No shuffle step is applied in this pipeline.
scene_valid_ds = tf.data.experimental.load('./data/fullscene/valid_ds/')
face_valid_ds = tf.data.experimental.load('./data/faces/valid_ds/')
audio_valid_ds = tf.data.experimental.load('./data/audio/valid_ds')
text_valid_ds = tf.data.experimental.load('./data/text/valid_ds/').batch(32)

# Keep only the features of each (features, labels) element ...
scene_xvalid = scene_valid_ds.map(lambda features, labels: features)
face_xvalid = face_valid_ds.map(lambda features, labels: features)
audio_xvalid = audio_valid_ds.map(lambda features, labels: features)
text_xvalid = text_valid_ds.map(lambda features, labels: features)
# ... and take the labels from the scene dataset.
y_valid = scene_valid_ds.map(lambda features, labels: labels)

# Element structure: ((scene, face, audio, text), labels)
valid_inputs = (scene_xvalid, face_xvalid, audio_xvalid, text_xvalid)
valid_ds = tf.data.Dataset.zip((valid_inputs, y_valid)).prefetch(AUTOTUNE)


In [13]:
from sklearn.metrics import mean_absolute_error

# Two passes over `valid_ds`: one collects the labels, one produces predictions.
# Rows only line up if both passes see the batches in the same order.
y_true = np.concatenate([labels for inputs, labels in valid_ds], axis=0)
y_pred = atten_model.predict(valid_ds)

# One MAE value per output column.
mae = mean_absolute_error(y_true, y_pred, multioutput='raw_values')

# Report (1 - MAE) as a percentage: per output column, and for the mean MAE.
score_per_output = (1 - mae) * 100
score_overall = (1 - np.mean(mae)) * 100
score_per_output, score_overall

(array([91.53201, 92.27915, 91.48858, 91.7785 , 91.33944], dtype=float32),
 91.68353825807571)

### Test data

In [14]:
# Test split, one saved dataset per modality. Only the text dataset is batched
# here (32). No shuffle step is applied in this pipeline.
scene_test_ds = tf.data.experimental.load('./data/fullscene/test_ds/')
face_test_ds = tf.data.experimental.load('./data/faces/test_ds/')
audio_test_ds = tf.data.experimental.load('./data/audio/test_ds')
text_test_ds = tf.data.experimental.load('./data/text/test_ds/').batch(32)

# Features of each modality; labels are taken from the scene dataset.
scene_xtest = scene_test_ds.map(lambda sample, target: sample)
face_xtest = face_test_ds.map(lambda sample, target: sample)
audio_xtest = audio_test_ds.map(lambda sample, target: sample)
text_xtest = text_test_ds.map(lambda sample, target: sample)
y_test = scene_test_ds.map(lambda sample, target: target)

# Element structure: ((scene, face, audio, text), labels)
test_inputs = (scene_xtest, face_xtest, audio_xtest, text_xtest)
test_ds = tf.data.Dataset.zip((test_inputs, y_test)).prefetch(AUTOTUNE)

test_ds

<PrefetchDataset shapes: (((None, 10, 224, 224, 3), (None, 10, 224, 224, 3), (None, 15, 128), (None, 50)), (None, 5)), types: ((tf.float32, tf.float32, tf.float32, tf.int32), tf.float32)>

In [15]:
# Same evaluation as for the validation split, on `test_ds`.
# `mean_absolute_error` must already be imported in this kernel session.
label_batches = [targets for features, targets in test_ds]
y_true = np.concatenate(label_batches, axis=0)
y_pred = atten_model.predict(test_ds)

# One MAE value per output column.
mae = mean_absolute_error(y_true, y_pred, multioutput='raw_values')

# Report (1 - MAE) as a percentage: per output column, and for the mean MAE.
test_score_per_output = (1 - mae) * 100
test_score_overall = (1 - np.mean(mae)) * 100
test_score_per_output, test_score_overall

(array([91.289116, 92.245865, 91.66372 , 91.486595, 91.36632 ],
       dtype=float32),
 91.61032140254974)

In [None]:
import os
import pickle

# Save the metrics dictionary of the `history` object returned by `fit`.
# `history` only exists if the training cell was run in this kernel session.
history_path = './histories/attention_conc.pkl'
# `open(..., 'wb')` does not create missing parent folders, so make sure the
# target directory exists first.
os.makedirs(os.path.dirname(history_path), exist_ok=True)
with open(history_path, 'wb') as f:
    pickle.dump(history.history, f)