In [1]:
import datetime

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_addons as tfa
from sklearn.metrics import mean_absolute_error
from tensorflow import keras

from models import load_ef_model

 The versions of TensorFlow you are currently using is 2.6.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


### Load EarlyFusion model

In [2]:
# Obtain the early-fusion model from the project's `models` module and print
# its layer summary to check the expected inputs before any data is loaded.
ef_model = load_ef_model()
ef_model.summary()

Model: "ef_model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Scene_input (InputLayer)        [(None, 10, 224, 224 0                                            
__________________________________________________________________________________________________
Face_input (InputLayer)         [(None, 10, 224, 224 0                                            
__________________________________________________________________________________________________
Audio_input (InputLayer)        [(None, 15, 128)]    0                                            
__________________________________________________________________________________________________
Text_input (InputLayer)         [(None, 50)]         0                                            
___________________________________________________________________________________________

### Load data

In [3]:
AUTOTUNE = tf.data.AUTOTUNE
TEXT_BATCH_SIZE = 32        # only the text datasets are batched at load time
SHUFFLE_BUFFER_SIZE = 1000  # counted in elements of the zipped dataset


def load_fused_split(split, shuffle=False):
    """Build the four-input dataset for one data split.

    Loads the scene, face, audio and text datasets saved under
    ``./data/<modality>/<split>_ds/``, keeps the inputs of each one, takes the
    targets from the scene dataset and zips everything into elements of the
    form ``((scene, face, audio, text), y)``.

    Args:
        split: Prefix of the saved dataset directories, e.g. ``'train'`` or
            ``'valid'``.
        shuffle: When True, shuffle the zipped elements with a buffer of
            ``SHUFFLE_BUFFER_SIZE`` elements.

    Returns:
        A prefetched ``tf.data.Dataset``.
    """
    def load(modality):
        return tf.data.experimental.load(f'./data/{modality}/{split}_ds/')

    scene_ds = load('fullscene')
    face_ds = load('faces')
    audio_ds = load('audio')
    # NOTE(review): only the text dataset is batched here; this presumably
    # mirrors the batch size the other three were saved with -- confirm.
    text_ds = load('text').batch(batch_size=TEXT_BATCH_SIZE)

    # Every saved element is an (x, y) pair. zip() pairs the modalities purely
    # by position, so all four datasets must share the same element order.
    inputs = tuple(ds.map(lambda x, y: x) for ds in (scene_ds, face_ds, audio_ds, text_ds))
    targets = scene_ds.map(lambda x, y: y)

    fused_ds = tf.data.Dataset.zip((inputs, targets))
    if shuffle:
        # Shuffling after zip keeps inputs and targets together; it reorders
        # whole zipped elements, not the individual samples inside them.
        fused_ds = fused_ds.shuffle(buffer_size=SHUFFLE_BUFFER_SIZE)
    return fused_ds.prefetch(buffer_size=AUTOTUNE)


# Train: shuffled so each epoch sees the elements in a different order.
train_ds = load_fused_split('train', shuffle=True)

# Valid: left unshuffled. Order does not affect validation metrics, and a
# stable order keeps separately collected targets and predictions aligned.
valid_ds = load_fused_split('valid')

train_ds, valid_ds

(<PrefetchDataset shapes: (((None, 10, 224, 224, 3), (None, 10, 224, 224, 3), (None, 15, 128), (None, 50)), (None, 5)), types: ((tf.float32, tf.float32, tf.float32, tf.int32), tf.float32)>,
 <PrefetchDataset shapes: (((None, 10, 224, 224, 3), (None, 10, 224, 224, 3), (None, 15, 128), (None, 50)), (None, 5)), types: ((tf.float32, tf.float32, tf.float32, tf.int32), tf.float32)>)

### Compile and Train model

In [10]:
# Timestamp that gives every training run its own checkpoint directory.
run_timestamp = datetime.datetime.now().strftime("%m%d_%H%M%S")

# Stop once the validation loss has not improved for 10 consecutive epochs.
# 'val_loss' is the Keras default; it is spelled out because the checkpoint
# below deliberately tracks a different quantity (val_mae).
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, verbose=0)

# Keep only the weights of the epoch with the lowest validation MAE.
check_point = keras.callbacks.ModelCheckpoint(
    filepath='./weights/ef/' + run_timestamp + '/ef.t5',
    monitor='val_mae',
    mode='min',
    save_best_only=True,
    save_weights_only=True,
    verbose=0,
)

optimizer = tfa.optimizers.RectifiedAdam()
ef_model.compile(loss='mse', optimizer=optimizer, metrics=['mae'])

# train_ds / valid_ds are tf.data datasets that already yield batches, so no
# batch_size is passed to fit(): Keras does not accept it for dataset inputs.
history = ef_model.fit(
    train_ds,
    validation_data=valid_ds,
    epochs=100,
    callbacks=[early_stopping, check_point],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100


## Evaluation

### Validation data

In [6]:
AUTOTUNE = tf.data.AUTOTUNE

# Reload the validation split of every modality; only the text dataset is
# batched at load time.
scene_valid_ds, face_valid_ds, audio_valid_ds = (
    tf.data.experimental.load(path)
    for path in ('./data/fullscene/valid_ds/', './data/faces/valid_ds/', './data/audio/valid_ds')
)
text_valid_ds = tf.data.experimental.load('./data/text/valid_ds/').batch(batch_size=32)

# Each element is an (x, y) pair: keep x per modality, take y from the scene data.
scene_xvalid, face_xvalid, audio_xvalid, text_xvalid = (
    ds.map(lambda x, y: x)
    for ds in (scene_valid_ds, face_valid_ds, audio_valid_ds, text_valid_ds)
)
y_valid = scene_valid_ds.map(lambda x, y: y)

# Unshuffled on purpose: evaluation needs a stable element order.
valid_inputs = (scene_xvalid, face_xvalid, audio_xvalid, text_xvalid)
valid_ds = tf.data.Dataset.zip((valid_inputs, y_valid)).prefetch(buffer_size=AUTOTUNE)


### Load weights

In [9]:
# Restore the weights saved by a previous training run.
ef_model.load_weights('./weights/ef/0227_074102/ef.t5')

# evaluate() yields the loss followed by the compiled metric (MAE).
valid_scores = ef_model.evaluate(valid_ds)
loss, mae = valid_scores

# Report 1 - MAE as a percentage.
100 * (1 - mae)



91.74359366297722

### Validation data: per-output scores

In [10]:
# Collect targets and predictions in a single pass over valid_ds. Two separate
# passes (one for y, one for predict) only line up row by row if the dataset
# iterates in the same order both times, which a shuffled valid_ds does not,
# and they read the whole dataset twice.
valid_batches = [
    (batch_targets.numpy(), ef_model.predict_on_batch(batch_inputs))
    for batch_inputs, batch_targets in valid_ds
]
y_true = np.concatenate([targets for targets, _ in valid_batches], axis=0)
y_pred = np.concatenate([preds for _, preds in valid_batches], axis=0)

# multioutput='raw_values' keeps one MAE per output column instead of averaging.
mae = mean_absolute_error(y_true, y_pred, multioutput='raw_values')

# Per-output score and overall mean score, both as (1 - MAE) in percent.
(1-mae)*100, (1-np.mean(mae))*100

(array([91.630356, 92.24704 , 91.63761 , 91.83563 , 91.3673  ],
       dtype=float32),
 91.74358993768692)

### Test data

In [11]:
# Load the test split of every modality; only the text dataset is batched at
# load time.
scene_test_ds, face_test_ds, audio_test_ds = (
    tf.data.experimental.load(path)
    for path in ('./data/fullscene/test_ds/', './data/faces/test_ds/', './data/audio/test_ds')
)
text_test_ds = tf.data.experimental.load('./data/text/test_ds/').batch(batch_size=32)

# Each element is an (x, y) pair: keep x per modality, take y from the scene data.
scene_xtest, face_xtest, audio_xtest, text_xtest = (
    ds.map(lambda x, y: x)
    for ds in (scene_test_ds, face_test_ds, audio_test_ds, text_test_ds)
)
y_test = scene_test_ds.map(lambda x, y: y)

# Zip into ((scene, face, audio, text), y) elements, unshuffled.
test_inputs = (scene_xtest, face_xtest, audio_xtest, text_xtest)
test_ds = tf.data.Dataset.zip((test_inputs, y_test)).prefetch(buffer_size=AUTOTUNE)

test_ds

<PrefetchDataset shapes: (((None, 10, 224, 224, 3), (None, 10, 224, 224, 3), (None, 15, 128), (None, 50)), (None, 5)), types: ((tf.float32, tf.float32, tf.float32, tf.int32), tf.float32)>

In [12]:
# Run the test-set evaluation under an explicit device scope for the first GPU.
with tf.device('/gpu:0'):
    test_scores = ef_model.evaluate(test_ds)

# evaluate() yields the loss followed by the compiled metric (MAE).
loss, mae = test_scores

# Report 1 - MAE as a percentage.
100 * (1 - mae)



91.70725345611572

In [13]:
# Collect targets and predictions in a single pass over test_ds so both arrays
# are aligned row by row regardless of iteration order, and the dataset is
# read once instead of twice.
test_batches = [
    (batch_targets.numpy(), ef_model.predict_on_batch(batch_inputs))
    for batch_inputs, batch_targets in test_ds
]
y_true = np.concatenate([targets for targets, _ in test_batches], axis=0)
y_pred = np.concatenate([preds for _, preds in test_batches], axis=0)

# multioutput='raw_values' keeps one MAE per output column instead of averaging.
mae = mean_absolute_error(y_true, y_pred, multioutput='raw_values')

# Per-output score and overall mean score, both as (1 - MAE) in percent.
(1-mae)*100, (1-np.mean(mae))*100

(array([91.49651, 92.27196, 91.80983, 91.43613, 91.52185], dtype=float32),
 91.70725718140602)