In [1]:
!pip install tqdm
import argparse
import pickle
import sys

import numpy as np

from data_prep import batch_iter, createOneHotMosei3way, get_raw_data

# Single seed value shared by the NumPy and TensorFlow seeding calls below
# (and read as a global inside multimodal()).
seed = 1234

# Seed NumPy's global RNG; note this runs before TensorFlow is imported.
np.random.seed(seed)
import tensorflow as tf
from tqdm import tqdm

from model import LSTM_Model

from sklearn.metrics import f1_score

# Seed TensorFlow (TF 1.x API) for the current default graph.
tf.set_random_seed(seed)

# Container for precomputed per-modality features. multimodal() only reads
# it when called with use_raw=False; it is left empty in this notebook.
unimodal_activations = {}


[33mYou are using pip version 10.0.1, however version 19.2.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


Using TensorFlow backend.


In [2]:
def str2bool(v):
    """
    Convert a command-line style yes/no value into a bool.

    params:
    v --> a string such as 'yes', 'true', 't', 'y', '1' (-> True) or
          'no', 'false', 'f', 'n', '0' (-> False), matched case-insensitively.
          A value that is already a bool is returned unchanged.

    Returns the corresponding bool.
    Raises argparse.ArgumentTypeError when the string is not recognised.
    """
    # A bool has no .lower(); pass it through instead of failing with
    # AttributeError (e.g. when the function is called directly with a default).
    if isinstance(v, bool):
        return v

    lowered = v.lower()
    if lowered in ('yes', 'true', 't', 'y', '1'):
        return True
    if lowered in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')

In [9]:
def multimodal(unimodal_activations, data, classes, attn_fusion=True, enable_attn_2=False, use_raw=True,
               epochs=5, batch_size=20):
    """
    Train the multimodal (text + audio + video) LSTM_Model and evaluate it on the test split after every epoch.

    params:
    unimodal_activations --> dict of precomputed features, only read when use_raw is False. Keys used:
                             'text_train', 'audio_train', 'video_train', 'text_test', 'audio_test',
                             'video_test', 'train_mask', 'test_mask', 'train_label', 'test_label'
    data --> dataset identifier forwarded to get_raw_data (the notebook passes 'mosei')
    classes --> number of classes; forwarded to get_raw_data and to LSTM_Model as `emotions`
    attn_fusion --> forwarded to LSTM_Model; forced to False whenever use_raw is True
    enable_attn_2 --> forwarded to LSTM_Model
    use_raw --> when True, load all features/labels/masks through get_raw_data(data, classes)
    epochs --> number of training epochs (default 5)
    batch_size --> batch size handed to batch_iter (default 20)

    Returns None. Progress and metrics are printed, and the session is saved to "./dataset/network"
    once training ends. Note that the saved weights are those of the LAST epoch; the "best" epochs are
    only tracked for the printed summary, and they are selected on the test split.
    """
    if use_raw:
        # The raw-feature path never uses attention fusion, whatever the caller asked for.
        if attn_fusion:
            attn_fusion = False

        # The first two returned values (concatenated train/test data) are not needed here.
        (_train_data, _test_data, audio_train, audio_test, text_train, text_test,
         video_train, video_test, train_label, test_label,
         seqlen_train, seqlen_test, train_mask, test_mask) = get_raw_data(data, classes)

    else:
        print("starting multimodal")
        text_train = unimodal_activations['text_train']
        audio_train = unimodal_activations['audio_train']
        video_train = unimodal_activations['video_train']

        text_test = unimodal_activations['text_test']
        audio_test = unimodal_activations['audio_test']
        video_test = unimodal_activations['video_test']

        train_mask = unimodal_activations['train_mask']
        test_mask = unimodal_activations['test_mask']

        print('train_mask', train_mask.shape)

        train_label = unimodal_activations['train_label']
        print('train_label', train_label.shape)
        test_label = unimodal_activations['test_label']
        print('test_label', test_label.shape)

        # Sequence lengths are recovered by summing each mask over its last axis.
        seqlen_train = np.sum(train_mask, axis=-1)
        print('seqlen_train', seqlen_train.shape)
        seqlen_test = np.sum(test_mask, axis=-1)
        print('seqlen_test', seqlen_test.shape)

    # Per-modality feature sizes are the last dimension of each input array.
    a_dim = audio_train.shape[-1]
    v_dim = video_train.shape[-1]
    t_dim = text_train.shape[-1]
    if attn_fusion:
        print('With attention fusion')

    # Multimodal model
    session_conf = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=False,
        gpu_options=tf.GPUOptions(allow_growth=True))
    device_name = '/device:GPU:%d' % 0

    # Bookkeeping for the end-of-training summary.
    best_acc = 0
    best_loss_accuracy = 0
    best_loss = 10000000.0
    best_epoch = 0
    best_epoch_loss = 0

    # NOTE(review): tf.device is entered before the fresh Graph becomes the default graph;
    # confirm the device scope actually applies to the ops LSTM_Model builds in that graph.
    with tf.device(device_name):
        print('Using GPU - ', device_name)
        with tf.Graph().as_default():
            tf.set_random_seed(seed)
            # Using the session as a context manager installs it as the default session
            # and closes it when the block exits, releasing its resources.
            with tf.Session(config=session_conf) as sess:
                model = LSTM_Model(text_train.shape[1:], 0.0001, a_dim=a_dim, v_dim=v_dim, t_dim=t_dim,
                                   emotions=classes, attn_fusion=attn_fusion,
                                   unimodal=False, enable_attn_2=enable_attn_2,
                                   seed=seed)
                sess.run(tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()))

                # The whole test split is evaluated in a single run, with every dropout input fed 0.0.
                test_feed_dict = {
                    model.t_input: text_test,
                    model.a_input: audio_test,
                    model.v_input: video_test,
                    model.y: test_label,
                    model.seq_len: seqlen_test,
                    model.mask: test_mask,
                    model.lstm_dropout: 0.0,
                    model.lstm_inp_dropout: 0.0,
                    model.dropout: 0.0,
                    model.dropout_lstm_out: 0.0
                }

                # print('\n\nDataset: %s' % (data))
                print("\nEvaluation before training:")
                # Baseline evaluation with the freshly initialised weights
                step, loss, accuracy = sess.run(
                    [model.global_step, model.loss, model.accuracy],
                    test_feed_dict)
                print("EVAL: epoch {}: step {}, loss {:g}, acc {:g}".format(0, step, loss, accuracy))

                for epoch in range(1, epochs + 1):
                    batches = batch_iter(list(
                        zip(text_train, audio_train, video_train, train_mask, seqlen_train, train_label)),
                        batch_size)

                    # Training loop. For each batch...
                    print('\nTraining epoch {}'.format(epoch))
                    train_losses = []
                    train_accuracies = []
                    for batch in tqdm(batches):
                        b_text_train, b_audio_train, b_video_train, b_train_mask, b_seqlen_train, b_train_label = zip(
                            *batch)
                        feed_dict = {
                            model.t_input: b_text_train,
                            model.a_input: b_audio_train,
                            model.v_input: b_video_train,
                            model.y: b_train_label,
                            model.seq_len: b_seqlen_train,
                            model.mask: b_train_mask,
                            model.lstm_dropout: 0.4,
                            model.lstm_inp_dropout: 0.0,
                            model.dropout: 0.2,
                            model.dropout_lstm_out: 0.2
                        }

                        _, step, loss, accuracy = sess.run(
                            [model.train_op, model.global_step, model.loss, model.accuracy],
                            feed_dict)
                        train_losses.append(loss)
                        train_accuracies.append(accuracy)

                    print("\t \tEpoch {}:, loss {:g}, accuracy {:g}".format(
                        epoch, np.average(train_losses), np.average(train_accuracies)))

                    # Evaluation after epoch
                    step, loss, accuracy, preds, y, mask = sess.run(
                        [model.global_step, model.loss, model.accuracy, model.preds, model.y, model.mask],
                        test_feed_dict)

                    # sess.run already returned NumPy arrays, so reduce them with NumPy. Building
                    # tf.argmax / tf.cast ops here would add new nodes to the graph on every epoch.
                    y_true = np.argmax(y, axis=-1).ravel()
                    y_pred = np.argmax(preds, axis=-1).ravel()
                    # The flattened mask is used as per-position sample weights for the F1 score.
                    weights = np.asarray(mask).astype(np.int32).ravel()
                    f1 = f1_score(y_true, y_pred, sample_weight=weights, average="weighted")

                    # The printed loss is divided by the number of test samples; the best-loss
                    # comparison below uses the undivided value.
                    print("EVAL: After epoch {}: step {}, loss {:g}, acc {:g}, f1 {:g}".format(
                        epoch, step, loss / test_label.shape[0], accuracy, f1))

                    if accuracy > best_acc:
                        best_epoch = epoch
                        best_acc = accuracy
                    if loss < best_loss:
                        best_loss = loss
                        best_loss_accuracy = accuracy
                        best_epoch_loss = epoch

                print(
                    "\n\nBest epoch: {}\nBest test accuracy: {}\nBest epoch loss: {}\nBest test accuracy when loss is least: {}".format(
                        best_epoch, best_acc, best_epoch_loss, best_loss_accuracy))

                # Persist the final (last-epoch) variables and graph definition.
                saver = tf.train.Saver()
                saver.save(sess,"./dataset/network")

In [8]:
# Train on the raw MOSEI features with 3 classes. attn_fusion is passed as
# True, but multimodal() resets it to False because use_raw is True.
multimodal(
    unimodal_activations,
    data='mosei',
    classes=3,
    attn_fusion=True,
    enable_attn_2=True,
    use_raw=True,
)

Shape for audio test data (678, 98, 74)
Shape for text test data (678, 98, 300)
Shape for video test data (678, 98, 35)
audio train shape is (2250, 98, 74)
audio test shape is (678, 98, 74)
Trimodal Train data shape (2250, 98, 409)
Trimodal Test data shape (678, 98, 409)
Using GPU -  /device:GPU:0
Trainable parameters: 314403

Evaluation before training:


0it [00:00, ?it/s]

EVAL: epoch 0: step 0, loss 0.000379732, acc 1

Training epoch 1


113it [01:22,  2.56it/s]


	 	Epoch 1:, loss 0.000375799, accuracy 0.431751


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
0it [00:00, ?it/s]

EVAL: After epoch 1: step 113, loss 5.51662e-07, acc 0.715865, f1 0.834407

Training epoch 2


113it [00:51,  2.26it/s]


	 	Epoch 2:, loss 0.000382683, accuracy 0.420226


0it [00:00, ?it/s]

EVAL: After epoch 2: step 226, loss 5.88749e-07, acc 1, f1 1

Training epoch 3


113it [00:53,  2.20it/s]


	 	Epoch 3:, loss 0.000431739, accuracy 0.485287


0it [00:00, ?it/s]

EVAL: After epoch 3: step 339, loss 6.98223e-07, acc 1, f1 1

Training epoch 4


113it [00:51,  2.47it/s]


	 	Epoch 4:, loss 0.000529975, accuracy 0.510638


0it [00:00, ?it/s]

EVAL: After epoch 4: step 452, loss 8.7748e-07, acc 1, f1 1

Training epoch 5


113it [00:51,  2.51it/s]


	 	Epoch 5:, loss 0.000674671, accuracy 0.523098
EVAL: After epoch 5: step 565, loss 1.12529e-06, acc 1, f1 1


Best epoch: 2
Best test accuracy: 1.0
Best epoch loss: 1
Best test accuracy when loss is least: 0.7158647775650024


In [None]:
# Reload the model that multimodal() saved under "./dataset/network".
import tensorflow as tf

# The session is deliberately left open so it stays available to later cells.
sess =  tf.Session() 
# Rebuild the graph structure from the exported .meta file ...
saver = tf.train.import_meta_graph('./dataset/network.meta')
# ... then load the variable values from the most recent checkpoint in ./dataset.
saver.restore(sess,tf.train.latest_checkpoint('./dataset'))
# Handle on the default graph, which now contains the imported graph.
graph = tf.get_default_graph()