In [30]:
import math
import random
import skimage.io as io
import matplotlib.pyplot as plt
import keras
import pickle
import numpy as np
import cv2
import os
from os import listdir
from os.path import isfile, join
import sys
import dlib
# import skvideo.io
import json
from keras.preprocessing import image
import glob

# Prepare Data

## Phonemes

In [22]:
# Build two lookup structures from the CMUdict phone inventory file:
#   phoneme_list: ordered (symbol, property) pairs, e.g. ('AA', 'vowel')
#   phoneme_dict: symbol -> integer index (its position in phoneme_list)
phoneme_list = []
phoneme_dict = {}

with open("/n/fs/scratch/jiaqis/cmudict-master/cmudict.phones", 'r') as fp:
    for line in fp:
        fields = line.split()
        # Skip blank or truncated lines instead of failing with an IndexError.
        if len(fields) < 2:
            continue
        phoneme, phoneme_property = fields[0], fields[1]
        # The index is the position in phoneme_list, so skipped lines leave
        # no gaps in the numbering.
        phoneme_dict[phoneme] = len(phoneme_list)
        phoneme_list.append((phoneme, phoneme_property))

print(phoneme_list, phoneme_dict)

([('AA', 'vowel'), ('AE', 'vowel'), ('AH', 'vowel'), ('AO', 'vowel'), ('AW', 'vowel'), ('AY', 'vowel'), ('B', 'stop'), ('CH', 'affricate'), ('D', 'stop'), ('DH', 'fricative'), ('EH', 'vowel'), ('ER', 'vowel'), ('EY', 'vowel'), ('F', 'fricative'), ('G', 'stop'), ('HH', 'aspirate'), ('IH', 'vowel'), ('IY', 'vowel'), ('JH', 'affricate'), ('K', 'stop'), ('L', 'liquid'), ('M', 'nasal'), ('N', 'nasal'), ('NG', 'nasal'), ('OW', 'vowel'), ('OY', 'vowel'), ('P', 'stop'), ('R', 'liquid'), ('S', 'fricative'), ('SH', 'fricative'), ('T', 'stop'), ('TH', 'fricative'), ('UH', 'vowel'), ('UW', 'vowel'), ('V', 'fricative'), ('W', 'semivowel'), ('Y', 'semivowel'), ('Z', 'fricative'), ('ZH', 'fricative')], {'IY': 17, 'W': 35, 'DH': 9, 'Y': 36, 'HH': 15, 'CH': 7, 'JH': 18, 'ZH': 38, 'EH': 10, 'NG': 23, 'TH': 31, 'AA': 0, 'B': 6, 'AE': 1, 'D': 8, 'G': 14, 'F': 13, 'AH': 2, 'K': 19, 'M': 21, 'L': 20, 'AO': 3, 'N': 22, 'IH': 16, 'S': 28, 'R': 27, 'EY': 12, 'T': 30, 'AW': 4, 'V': 34, 'AY': 5, 'Z': 37, 'ER': 1

In [23]:
# Imported here so this cell runs on a fresh kernel; the notebook otherwise
# only imports cmudict in a later cell.
from nltk.corpus import cmudict

# NLTK's CMU Pronouncing Dictionary as a plain dict: lowercase word -> list of
# pronunciations, each a list of phoneme symbols with stress digits.
pron_dict = cmudict.dict()

In [24]:
import re
from collections import defaultdict

import nltk
from nltk.corpus import cmudict

def clean_pron(pron):
    """Return ``pron`` with every digit (the stress markers) deleted."""
    stress_digits = re.compile(r"\d")
    return stress_digits.sub("", pron)

def make_triphones(pron):
    """Build ``(context, phoneme)`` triphone pairs for one pronunciation.

    Every interior pair has the form ``((p[i-2], p[i-1]), p[i])``. Two
    word-boundary pairs using the '#' marker are appended after them:
    ``(('#', '#'), p[0])`` and ``((p[-2], p[-1]), '#')``. A pronunciation
    with fewer than three phonemes yields an empty list.
    """
    if len(pron) < 3:
        return []

    # Slide a window of three over the pronunciation.
    interior = []
    for first, second, third in zip(pron, pron[1:], pron[2:]):
        interior.append(((first, second), third))

    # Boundary entries so the start and end of the word are modelled too.
    word_start = (('#', '#'), pron[0])
    word_end = ((pron[-2], pron[-1]), '#')
    return interior + [word_start, word_end]
                                                
def triphone_probs(prons):
    """Estimate P(phoneme | two-phoneme context) from pronunciations.

    Returns a nested ``defaultdict``: ``result[context][phoneme]`` is the
    relative frequency of ``phoneme`` among all triphones that share
    ``context``, as produced by ``make_triphones``.
    """
    context_counts = defaultdict(lambda: defaultdict(int))

    # Pass 1: raw counts of each outcome per context.
    for pron in prons:
        for triphone in make_triphones(pron):
            context, phoneme = triphone
            context_counts[context][phoneme] += 1

    # Pass 2: turn the counts into relative frequencies in place.
    for outcomes in context_counts.values():
        total_outcomes = sum(outcomes.values())
        for outcome in list(outcomes):
            outcomes[outcome] = float(outcomes[outcome]) / total_outcomes

    return context_counts

## Video Volume and Facial Features

In [31]:
# Hard-coded absolute dataset locations.
# NOTE(review): DATA_DIR is not referenced anywhere else in the visible
# notebook; presumably it is the raw LRS3-TED download - confirm.
DATA_DIR = "/n/fs/scratch/jiaqis/LRS3-TED/"
# Root directory that get_dataset_list() is pointed at to enumerate samples.
SAVE_DIR = "/n/fs/scratch/jiaqis/LRS3-TED-Extracted/"

In [41]:
def get_dataset_list(dataDir, setName):
    """List the (url, ID) pairs available for one dataset split.

    Scans ``<dataDir>/<setName>/<url>/*.txt`` and emits one tuple per
    matching file.

    Args:
        dataDir: Root directory that contains the split sub-directories.
        setName: Name of the split sub-directory (e.g. "test").

    Returns:
        A sorted list of ``(url, ID)`` tuples. ``url`` is the name of the
        sub-directory and ``ID`` is the file name *including* its ``.txt``
        extension. The list is empty if nothing matches.
    """
    data_list = []
    # The trailing "*/" restricts the match to directories.
    for urlDir in glob.glob(os.path.join(dataDir, setName, "*/")):
        # The match ends with a path separator; normalise it away before
        # taking the directory name.
        url = os.path.basename(os.path.normpath(urlDir))
        for idFilename in glob.glob(os.path.join(urlDir, '*.txt')):
            data_list.append((url, os.path.basename(idFilename)))
    # glob returns matches in arbitrary order; sort so the listing is
    # reproducible from run to run.
    data_list.sort()
    return data_list

In [42]:
# Enumerate the (url, ID) pairs of the "test" split found under SAVE_DIR.
test_ID_list = get_dataset_list(SAVE_DIR, "test")

In [44]:
# Sanity check: number of (url, ID) entries found for the "test" split.
print(len(test_ID_list))

1320


# Data Loader

In [45]:
FPS = 25
FRAME_ROWS = 120
FRAME_COLS = 120
NFRAMES = 5 # size of input volume of frames
# Floor division keeps MARGIN an integer (2) on Python 3 as well; plain "/"
# would yield 2.5 there.
MARGIN = NFRAMES // 2
COLORS = 1 # grayscale
CHANNELS = COLORS*NFRAMES
MAX_FRAMES_COUNT= 250 # corresponding to 10 seconds, 25Hz*10

EXAMPLE_FILEPATH = "/n/fs/scratch/jiaqis/"

In [47]:
def prepare_data(filepath, video_tensor_size, keypoint_size, label_seq_size):
    """Load one sample: frame volume, keypoint maps and phoneme labels.

    Reads ``filepath + ".json"`` (per-frame keypoints), the first line of
    ``filepath + ".txt"`` (transcript) and every image matching
    ``filepath + "_*.jpg"``.

    Args:
        filepath: Sample path prefix, without extension.
        video_tensor_size: Shape of the frame volume, indexed as
            (frames, rows, cols, channels).
        keypoint_size: Number of keypoints read for each frame.
        label_seq_size: Currently unused; labels are returned unpadded.

    Returns:
        ``(visual_cube, feature_cube, labels)`` where ``visual_cube`` is a
        float16 array of shape ``video_tensor_size``, ``feature_cube`` is a
        float16 array of shape (frames, rows, cols, keypoint_size) holding a
        1.0 at each keypoint location, and ``labels`` is a 1-D array of
        ``phoneme_dict`` indices for the transcript.

    Raises:
        KeyError: If a transcript word is missing from ``pron_dict`` or one
            of its phonemes is missing from ``phoneme_dict``.
    """
    num_frames = video_tensor_size[0]
    num_rows = video_tensor_size[1]
    num_cols = video_tensor_size[2]

    # images
    # frames x rows x cols x channels
    visual_cube = np.zeros(video_tensor_size, dtype="float16")
    # keypoint features
    feature_cube = np.zeros((num_frames, num_rows, num_cols, keypoint_size), dtype="float16")
    with open(filepath + ".json", 'r') as fp:
        features = json.load(fp)

    # Target Text/phonemes
    with open(filepath + ".txt", 'r') as fp:
        text = fp.readline()
    # NOTE(review): every whitespace-separated token of the first line is
    # treated as a word; confirm the transcript has no prefix or punctuation.
    labels = []
    for word in text.strip().split():
        # cmudict keys are lowercase and each value is a list of alternative
        # pronunciations; the first one is used.
        word_phonemes = pron_dict[word.lower()][0]
        # Pronunciations carry stress digits ('AH0') that phoneme_dict keys
        # do not have, so strip them before the lookup.
        labels.extend([phoneme_dict[clean_pron(phon)] for phon in word_phonemes])
    labels = np.array(labels)

    def frame_number(path):
        # "<prefix>_<n>.jpg" -> n
        return int(path.split("_")[-1].split(".")[0])

    # Sort numerically: a plain string sort would put "_10" before "_2" when
    # the frame numbers are not zero-padded.
    frame_paths = sorted(glob.glob(filepath + "_*.jpg"), key=frame_number)

    acc = 0
    for imgFilename in frame_paths:
        if acc >= num_frames:
            # The output volume is full; remaining frames are dropped.
            break
        img = image.img_to_array(
              image.load_img(imgFilename, target_size=(num_rows, num_cols)))
        # NOTE(review): preprocess_input is not defined or imported anywhere
        # in the visible notebook; confirm which implementation is intended.
        # NOTE(review): load_img is called with its default colour mode;
        # confirm the loaded channel count matches video_tensor_size[3].
        img = preprocess_input(img)
        visual_cube[acc,:,:,:] = img

        # NOTE(review): assumes the JSON is indexable by integer frame number
        # and that each entry is a sequence of (x, y) pairs - confirm.
        f_feature = features[frame_number(imgFilename)]
        for ft_index in range(keypoint_size):
            # TODO: check range of outputs
            # int() is needed because math.floor returns a float on Python 2.
            keypoint_x = int(math.floor(f_feature[ft_index][0]))
            keypoint_y = int(math.floor(f_feature[ft_index][1]))
            # Ignore keypoints outside the frame: a negative index would
            # silently wrap around and a large one would raise IndexError.
            if 0 <= keypoint_y < num_rows and 0 <= keypoint_x < num_cols:
                feature_cube[acc, keypoint_y, keypoint_x, ft_index] = 1.0
        acc+=1
    return visual_cube, feature_cube, labels

In [None]:
# FIXME: prepare_data requires four positional arguments
# (filepath, video_tensor_size, keypoint_size, label_seq_size); this bare
# call raises TypeError as written.
prepare_data()

In [None]:
import numpy as np
import keras

class DataGenerator(keras.utils.Sequence):
    """Keras ``Sequence`` yielding ``([Q, I], A)`` batches.

    ``Q`` holds question word indices, ``I`` holds one feature vector per
    sample and ``A`` holds a soft answer distribution over ``n_classes``.

    Args:
        vqa: Object providing ``getQuesIds()``, ``qqa[quesID]['question']``
            and ``qa[quesID]`` (with 'image_id' and 'answers' entries).
        features: Mapping image id -> feature vector of length
            ``feature_size``.
        answers: Mapping answer string -> class index.
        vocabulary: Mapping preprocessed word -> integer index.
        batch_size: Number of samples per batch.
        n_classes: Width of the answer vector.
        shuffle: Whether to reshuffle the sample order after every epoch.
        max_seq_len: Number of word slots per question; longer questions are
            truncated.
        feature_size: Length of each feature vector.
    """
    def __init__(self, vqa, features, answers, vocabulary, batch_size=32,
                       n_classes=1000, shuffle=True,
                       max_seq_len=30, feature_size=2048):
        'Initialization'
        self.batch_size = batch_size
        self.feature_size = feature_size
        self.features = features
        self.answers = answers
        self.vocabulary = vocabulary
        self.max_seq_len = max_seq_len
        self.vqa = vqa
        self.list_IDs = vqa.getQuesIds()
        self.n_classes = n_classes
        self.shuffle = shuffle
        # Builds the initial (optionally shuffled) index order.
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        # Ceil so the final, possibly smaller, batch is included.
        return int(np.ceil(float(len(self.list_IDs)) / float(self.batch_size)))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # Find list of IDs
        list_IDs_temp = [self.list_IDs[k] for k in indexes]
        # Generate data
        X, Y = self.data_generation(list_IDs_temp)

        return X, Y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def data_generation(self, list_IDs_temp):
        """Build the arrays for one batch.

        Args:
            list_IDs_temp: Question ids that make up the batch.

        Returns:
            ``([Q, I], A)`` with shapes (n, max_seq_len), (n, feature_size)
            and (n, n_classes), where n is ``len(list_IDs_temp)``.
        """
        # Initialization
        n_samples = len(list_IDs_temp)
        Q = np.zeros((n_samples, self.max_seq_len))
        I = np.zeros((n_samples, self.feature_size))
        A = np.zeros((n_samples, self.n_classes))
        for it, quesID in enumerate(list_IDs_temp):
            # Add question as a index sequence
            question = self.vqa.qqa[quesID]['question']
            # Truncate so a question longer than max_seq_len cannot index
            # past the end of Q.
            words = question.split()[:self.max_seq_len]
            for i, word in enumerate(words):
                new_word = preprocess_word(word)
                # Out-of-vocabulary words keep index 0.
                if new_word in self.vocabulary:
                    Q[it, i] = self.vocabulary[new_word]
            # Add image feature
            ann = self.vqa.qa[quesID]
            imgId = ann['image_id']
            I[it, :] = self.features[imgId]
            # Soft vote: count each listed answer that maps to a known class.
            for ans in ann['answers']:
                if ans['answer'] in self.answers:
                    ans_index = self.answers[ans['answer']]
                    A[it, ans_index] = A[it, ans_index] + 1.0
            # NOTE(review): the fixed divisor presumably assumes ten answers
            # per question - confirm against the annotation format.
            A[it, :] = A[it, :] / 10.0

        return [Q, I], A

In [None]:
# Training batches: questions/features of the training split, reshuffled
# after every epoch.
train_generator = DataGenerator(
    train_vqa,
    train_features,
    answers_1000_dict,
    vocabulary_dict,
    batch_size=batch_size,
    n_classes=n_classes,
    shuffle=True,
    max_seq_len=max_seq_len,
    feature_size=feature_size,
)
# Validation batches: same configuration, validation split.
val_generator = DataGenerator(
    val_vqa,
    val_features,
    answers_1000_dict,
    vocabulary_dict,
    batch_size=batch_size,
    n_classes=n_classes,
    shuffle=True,
    max_seq_len=max_seq_len,
    feature_size=feature_size,
)

# Model

In [None]:
import keras
from keras.models import Model
from keras.layers import Input, Dense, Dropout, BatchNormalization,ZeroPadding2D, Embedding, LSTM, Bidirectional, Add, Multiply, Activation, Masking, Concatenate

In [None]:
##################
# Baseline Model #
##################
# Two inputs: a padded sequence of word indices (Q) and one feature vector (I).
input_Q_tensor = Input(shape=(max_seq_len,), name="Q")
input_I_tensor = Input(shape=(feature_size,), name="I")

# Question Branch
# NOTE(review): the generators above are built from `vocabulary_dict`;
# confirm that `vocabulary` is the intended name here.
q_embed_tensor = Embedding(len(vocabulary)+1, 300, mask_zero=True, input_length=None)(input_Q_tensor)
print(q_embed_tensor)
masked_q_embed_tensor = Masking(mask_value=0)(q_embed_tensor)
print(masked_q_embed_tensor)
q_embed_act = Activation("tanh")(masked_q_embed_tensor)
print(q_embed_act)

# The first LSTM must return the whole sequence: the second LSTM stacked on
# top of it expects a 3-D (batch, timesteps, features) input and would reject
# the 2-D last-state output.
q_tensor_seq = LSTM(lstm_hidden_units, dropout=0.2, recurrent_dropout=0.5,
                              return_sequences=True, return_state=False)(q_embed_act)
q_feature = LSTM(lstm_hidden_units, dropout=0.2, recurrent_dropout=0.5,
                              return_sequences=False, return_state=False)(q_tensor_seq)

# Image Branch
# Projected to lstm_hidden_units so it can be added to the question feature.
i_feature = Dense(lstm_hidden_units, activation='relu')(input_I_tensor)

# Merge
combo_feature = Add()([i_feature, q_feature])
scores = Dense(n_classes, activation="softmax")(combo_feature)

model = Model(inputs=[input_Q_tensor, input_I_tensor], outputs=scores)

# summary() prints the table itself and returns None, so wrapping it in
# print() would only add a stray "None" line.
model.summary()