The following code defines an LSTM model that is trained on both data modalities (audio and visual) combined.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from sklearn.preprocessing import StandardScaler

In [None]:
import sys
sys.path.insert(1, '../')

from data_preparation import prepare_x_data, get_Y_labels, reshape_Y, reshape_X, unscale_Y

In [None]:
# A single StandardScaler instance is handed to every prepare_x_data /
# get_Y_labels call below and later to unscale_Y.
# NOTE(review): if those helpers re-fit the scaler on each call, unscale_Y will
# use the statistics of whichever call ran last — confirm in data_preparation.
scaler = StandardScaler()

## Data preparation

Data preparation includes: <br>
<ul>
  <li>selecting necessary features from source files</li>
  <li>creating a combined dataset for the model training</li>
  <li>reshaping data for model training.</li>
</ul>

### X data - audio and visual features

In [None]:
# Load the training-partition features for each modality.
# Arguments: source directory, a separator string (presumably the CSV field
# delimiter — confirm), an integer whose meaning is defined by prepare_x_data
# (presumably the first feature column — confirm), and the shared scaler.
x_train_visual = prepare_x_data('../Data/LLDs_video_openface/train',',', 5, scaler)
x_train_audio = prepare_x_data('../Data/LLDs_audio_eGeMAPS/train',';', 2, scaler)

In [None]:
# Same loading step for the 'dev' directories, which this notebook uses as
# its test data (later split into a validation and a prediction subset).
x_test_visual = prepare_x_data('../Data/LLDs_video_openface/dev',',', 5, scaler)
x_test_audio = prepare_x_data('../Data/LLDs_audio_eGeMAPS/dev',';', 2, scaler)

### Y data - YMRS score

In [None]:
# Training labels read from the metadata file with the bounds 60 and 164
# (presumably a row range — confirm against get_Y_labels) and the shared scaler.
y_train = get_Y_labels('../Data/labels_metadata.csv', 60, 164, scaler)
# reshape_Y receives the number of training recordings followed by 1, 1
# (presumably the target shape, one label per recording — confirm).
y_train = reshape_Y(y_train,len(x_train_visual),1,1)

In [None]:
# Test labels read from the same metadata file with the bounds 0 and 60
# (presumably a row range — confirm against get_Y_labels).
y_test = get_Y_labels('../Data/labels_metadata.csv', 0, 60, scaler)
# Reshaped with the number of test recordings followed by 1, 1, mirroring y_train.
y_test = reshape_Y(y_test,len(x_test_visual),1,1)

## Model setup - LSTM

Proposed LSTM recurrent neural network architecture for the multi-modal dataset. Each modality is processed by its own stack of LSTM layers, and the two resulting feature vectors are concatenated before the final Dense layers.

In [None]:
# One functional-API input per modality. The leading None leaves the number of
# time steps open, so recordings of different length can be fed to the model;
# 465 and 23 are the feature counts per time step for visual and audio data.
visual_input = keras.Input(shape=(None,465), name="visual")
audio_input = keras.Input(shape=(None,23), name="audio")

In [None]:
# Visual branch: two stacked LSTM layers, each followed by a small dropout.
# The layers take their input shape from the tensor they are called on
# (`visual_input`), so no `input_shape` argument is passed; it is redundant in
# the functional API and newer Keras releases warn about it.
visual_features = layers.LSTM(units=207, return_sequences=True)(visual_input)
visual_features = layers.Dropout(0.004)(visual_features)
# return_sequences=False keeps only the output of the last time step,
# giving one fixed-size vector per input sequence.
visual_features = layers.LSTM(units=207, return_sequences=False)(visual_features)
visual_features = layers.Dropout(0.004)(visual_features)

In [None]:
# Audio branch: two stacked LSTM layers (12 then 11 units), each followed by a
# small dropout. The input shape is inferred from `audio_input`, so the
# redundant `input_shape` argument is not passed to the layers.
audio_features = layers.LSTM(units=12, return_sequences=True)(audio_input)
audio_features = layers.Dropout(0.004)(audio_features)
# return_sequences=False keeps only the output of the last time step,
# giving one fixed-size vector per input sequence.
audio_features = layers.LSTM(units=11, return_sequences=False)(audio_features)
audio_features = layers.Dropout(0.004)(audio_features)

In [None]:
# Fuse the two modalities by joining the final outputs of both LSTM branches
# (layers.concatenate joins along the last axis by default).
x = layers.concatenate([visual_features, audio_features])

In [None]:
# Reduce the fused features to a single value with a linear activation.
# NOTE(review): this one-unit layer feeds another one-unit Dense layer ("ymrs"),
# so the two compose into a single affine map — consider widening or removing one.
x = layers.Dense(1, activation='linear')(x)

In [None]:
# Output head: a single-unit Dense layer named "ymrs" that produces the
# model's prediction of the (scaled) YMRS score.
y_pred =layers.Dense(1, name="ymrs")(x)

In [None]:
# Assemble the functional model: two inputs (visual, audio), one output (ymrs).
# The order of `inputs` is the order in which arrays must be passed to
# fit / evaluate / predict later in the notebook.
model = keras.Model(
    inputs=[visual_input, audio_input],
    outputs=[y_pred]
)

In [None]:
# Print the layer-by-layer summary of the freshly assembled model.
model.summary()

In [None]:
# Configure training: Adam optimizer, mean squared error as the loss, and
# mean absolute error as an additional metric.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.004),
    loss='mse',
    metrics= [keras.metrics.MeanAbsoluteError()] # returned after the loss by evaluate(): [loss, mae]
)

In [None]:
# Print the summary once more, now that the model has been compiled.
model.summary()

In [None]:
# Render the model graph, annotated with layer names, tensor shapes and
# dtypes, and write it to a PNG file next to the notebook.
keras.utils.plot_model(
    model=model,
    to_file='LSTM_multimodal.png',
    show_shapes=True,
    show_dtype=True,
    show_layer_names=True,
)

### Model training

The training setup is iterative: the model is trained on one file at a time, and after each file the learned parameters are saved and then loaded again for the next iteration. This setup is necessary because the source files do not have a uniform size and differ in the number of frames.

In [None]:
# Train on one recording at a time: every recording has its own number of
# frames, so each one is fed to the model as a batch of size 1.
train_index = 0
val_index = 0

loss = {}        # Keras history dict of every fit() call, keyed by file index
train_eval = []  # raw [loss, mae] list returned by evaluate() for every file
train_loss = []
train_mae = []

for train_visual, train_audio in zip(x_train_visual, x_train_audio):
    # Add a leading batch axis of size 1; the rows (shape[0]) become the
    # time axis and the remaining values the feature axis.
    train_visual = np.array(train_visual).reshape((1, train_visual.shape[0], -1))
    train_audio = np.array(train_audio).reshape((1, train_audio.shape[0], -1))

    history = model.fit(
        [train_visual, train_audio],
        y_train[train_index],
        epochs=10,
    )

    loss.update({train_index: history.history})

    # Persist the model together with its optimizer state, then reload it
    # before moving on to the next recording.
    model.save(r'LSTM_train_multimodal', include_optimizer=True)
    model = keras.models.load_model(r'LSTM_train_multimodal')

    # Score the recording against its OWN label. The counters are advanced
    # only after this call: incrementing train_index earlier compared each
    # recording with the next recording's label and, if y_train holds one
    # entry per recording, indexed past its end on the last iteration.
    scores = model.evaluate([train_visual, train_audio], y_train[train_index], verbose=0)
    train_eval.append(scores)
    train_loss.append(scores[0])
    train_mae.append(scores[1])

    train_index += 1
    val_index += 1

##### Model evaluation - train set

In [None]:
# Arithmetic mean of the per-recording loss (MSE) and MAE collected by the
# training loop. Raises ZeroDivisionError if the loop processed no recordings.
avg_train_loss = sum(train_loss) / len(train_loss)
avg_train_mae = sum(train_mae) / len(train_mae)
print("Train loss (avg):", avg_train_loss, "Train MAE (avg):", avg_train_mae)

In [None]:
# Per-recording training scores in processing order: one point per recording.
plt.plot(train_loss, label="MSE")
plt.plot(train_mae, label="MAE")
plt.xlabel("Iterations")
plt.ylabel("Scaled YMRS value")
plt.legend()
plt.show()

### Model evaluation - validation set

A subset of the test data is used as the validation set.

In [None]:
# The first 30 test recordings (features of both modalities and their labels)
# form the validation subset.
x_val_visual = x_test_visual[:30]
x_val_audio = x_test_audio[:30]
y_val = y_test[:30]

In [None]:
# Evaluate the trained model on every validation recording separately and
# collect the per-recording loss (MSE) and MAE.
eval_loss, eval_mae = [], []
eval_index = 0

for input_visual, input_audio in zip(x_val_visual, x_val_audio):
    # reshape_X prepares a single recording for the model
    # (presumably by adding the batch axis — confirm in data_preparation).
    input_visual, input_audio = reshape_X(input_visual), reshape_X(input_audio)

    # evaluate() returns [loss, mae] for the compiled loss and metric.
    scores = model.evaluate([input_visual, input_audio], y_val[eval_index], verbose=0)

    eval_loss.append(scores[0])
    eval_mae.append(scores[1])

    eval_index += 1

In [None]:
# Arithmetic mean of the per-recording validation loss (MSE) and MAE.
# Raises ZeroDivisionError if the validation loop processed no recordings.
avg_eval_loss = sum(eval_loss) / len(eval_loss)
avg_eval_mae = sum(eval_mae) / len(eval_mae)
print("Validation loss (avg):", avg_eval_loss, "Validation MAE (avg):", avg_eval_mae)

In [None]:
# Per-recording validation scores in evaluation order: one point per recording.
plt.plot(eval_loss, label="MSE")
plt.plot(eval_mae, label="MAE")
plt.xlabel("Iterations")
plt.ylabel("Scaled YMRS value")
plt.legend()
plt.show()

## Prediction

Prediction is made on a different subset of the test dataset. The actual and predicted YMRS values are then compared.

In [None]:
# Test recordings from position 30 onward (everything not used for
# validation) form the prediction subset, together with their labels.
x_pred_visual = x_test_visual[30:]
x_pred_audio = x_test_audio[30:]
y_pred_actual = y_test[30:]

In [None]:
# Predict and score every recording of the prediction subset separately.
y_prediction = []  # raw model.predict() output, one entry per recording

pred_scores = []   # raw [loss, mae] list returned by evaluate() per recording
pred_loss = []
pred_mae = []

pred_index = 0

for test_visual, test_audio in zip(x_pred_visual, x_pred_audio):
    # Add a leading batch axis of size 1; the rows (shape[0]) become the
    # time axis and the remaining values the feature axis.
    test_visual = np.array(test_visual).reshape((1, test_visual.shape[0], -1))
    test_audio = np.array(test_audio).reshape((1, test_audio.shape[0], -1))

    # list.append() returns None, so its result is deliberately not bound
    # to a name; the prediction itself is stored in y_prediction.
    y_prediction.append(model.predict([test_visual, test_audio]))

    scores = model.evaluate([test_visual, test_audio], y_pred_actual[pred_index], verbose=0)
    pred_scores.append(scores)
    pred_loss.append(scores[0])
    pred_mae.append(scores[1])

    pred_index += 1

##### Model evaluation - prediction set

In [None]:
# Arithmetic mean of the per-recording loss (MSE) and MAE on the prediction
# subset. Raises ZeroDivisionError if the loop processed no recordings.
avg_pred_loss = sum(pred_loss) / len(pred_loss)
avg_pred_mae = sum(pred_mae) / len(pred_mae)
print("Prediction loss (avg):", avg_pred_loss, "Prediction MAE (avg):", avg_pred_mae)

In [None]:
# Per-recording scores on the prediction subset: one point per recording.
plt.plot(pred_loss, label="MSE")
plt.plot(pred_mae, label="MAE")
plt.xlabel("Iterations")
plt.ylabel("Scaled YMRS value")
plt.legend()
plt.show()

#### Actual vs predicted comparison

In [None]:
# Flatten predictions and labels into column vectors (one row per recording).
y_prediction = np.array(y_prediction).reshape(-1, 1)
y_pred_actual = np.array(y_pred_actual).reshape(-1, 1)

# Map both back to the original YMRS scale using the shared scaler.
# NOTE(review): this relies on `scaler` still holding the label statistics at
# this point — confirm how get_Y_labels / unscale_Y use it.
y_prediction = unscale_Y(y_prediction, scaler)
y_pred_actual = unscale_Y(y_pred_actual, scaler)

In [None]:
# Side-by-side table of actual and predicted YMRS values.
pred_df = pd.DataFrame(
    data=np.column_stack((y_pred_actual, y_prediction)),
    columns=['y_actual', 'y_pred'],
)
# The error is computed on the unrounded values, before the integer cast below.
pred_df['pred_error'] = pred_df['y_actual'] - pred_df['y_pred']
# Order by actual score; reset_index() keeps the original row position in an
# 'index' column.
pred_df = pred_df.sort_values(by=['y_actual']).reset_index()
# Round before casting: the values come back from inverse scaling as floats,
# and a plain integer cast truncates (e.g. 11.999999 would become 11).
pred_df['y_actual'] = pred_df['y_actual'].round().astype(np.int64)
pred_df

In [None]:
# Scatter plot of predicted against actual YMRS, one point per recording.
pred_df.plot(x='y_actual', y='y_pred', kind='scatter')
plt.xlabel("Actual YMRS")
plt.ylabel("Predicted YMRS")
plt.show()

In [None]:
# Scatter plot of the prediction error (actual minus predicted) against the
# actual YMRS value.
pred_df.plot(x='y_actual', y='pred_error', kind='scatter')
plt.xlabel("Actual YMRS")
plt.ylabel("Prediction error")
plt.show()

In [None]:
# Actual and predicted YMRS as two lines over the rows of pred_df.
# NOTE(review): pred_df was sorted by y_actual and re-indexed, so the x axis is
# the row position after sorting rather than a subject identifier.
plt.plot(pred_df['y_actual'], label="Actual")
plt.plot(pred_df['y_pred'], label="Predicted")
plt.xlabel("Test subject IDs")
plt.ylabel("Target value (YMRS)")
plt.legend()
plt.show()