In [None]:
import math
import random
import skimage.io as io
import matplotlib.pyplot as plt
import keras
import pickle
import numpy as np
import os
from os import listdir
from os.path import isfile, join
import sys
# import skvideo.io
import json
from keras.preprocessing import image
from keras.applications.vgg16 import preprocess_input
import glob
import re
from collections import defaultdict

import nltk
from nltk.corpus import cmudict
import tensorflow as tf
import wer

In [None]:
!nvidia-smi

In [None]:
# Expose only CUDA device index 3 to this process.
# NOTE(review): the index is hard-coded; adjust it for the machine being used.
os.environ['CUDA_VISIBLE_DEVICES']='3'

In [None]:
# Build the phoneme inventory from the CMUdict phone table.
#   phoneme_list: [(symbol, property), ...] in file order.
#   phoneme_dict: symbol -> integer id. Ids 0-2 are reserved for the
#                 START / END / UNK markers added below, so the first
#                 phoneme read from the file gets id 3.
phoneme_list = []
phoneme_dict = {}

with open("/n/fs/scratch/jiaqis/cmudict-master/cmudict.phones", 'r') as fp:
    for line in fp:
        fields = line.split()
        if len(fields) < 2:
            # Blank or truncated line: indexing fields[0]/fields[1] would
            # raise IndexError, so skip it without consuming an id.
            continue
        phoneme, phoneme_property = fields[0], fields[1]
        phoneme_list.append((phoneme, phoneme_property))
        # The n-th accepted phoneme (1-based) receives id n + 2.
        phoneme_dict[phoneme] = len(phoneme_list) + 2

phoneme_dict['START'] = 0
phoneme_dict["END"] = 1
phoneme_dict["UNK"] = 2
print(phoneme_list, phoneme_dict)

In [None]:
# CMUdict pronunciation lexicon loaded through NLTK. prepare_data() looks up
# pron_dict[word][0], i.e. the first listed pronunciation of a lower-cased word,
# and strips digits from each phoneme before mapping it to an id.
pron_dict = cmudict.dict()

In [None]:
# Dataset root. NOTE(review): DATA_DIR is not read by any cell shown here.
DATA_DIR = "/n/fs/scratch/jiaqis/LRS3-TED/"
# Root of the extracted clips; passed as `data_dir` to the train/eval calls and
# read directly by calculate_accuracy().
SAVE_DIR = "/n/fs/scratch/jiaqis/LRS3-TED-Extracted/"

In [None]:
# Clip identifier lists. Each entry is unpacked downstream as (v_url, v_index)
# and joined with the data directory and subset name to form a clip path.
# The files are opened with `with` so the handles are closed deterministically.
with open('test_ID_list.json', "r") as id_file:
    test_ID_list = json.load(id_file)
with open('trainval_ID_list.json', "r") as id_file:
    trainval_ID_list = json.load(id_file)

In [None]:
import logging

import numpy as np
import tensorflow as tf
from tensorflow.contrib import layers


# Reserved token ids. They carry the same values as phoneme_dict['START'],
# phoneme_dict['END'] and phoneme_dict['UNK'] set in the phoneme-table cell.
GO_TOKEN = 0   # start-of-sequence marker
END_TOKEN = 1  # end-of-sequence marker; also used to pad target sequences
UNK_TOKEN = 2  # fallback id for tokens missing from a vocabulary

In [None]:
def clean_pron(pron):
    """Return `pron` with every digit (the stress marker) deleted."""
    stress_digits = re.compile(r"\d")
    return stress_digits.sub("", pron)

def make_triphones(pron):
    """Return ((prev2, prev1), phoneme) pairs for one pronunciation.

    Pronunciations shorter than three phonemes yield an empty list. Otherwise
    the interior triphones come first, followed by two boundary entries that
    use '#' as the word-boundary symbol: one predicting the first phoneme from
    ('#', '#') and one predicting '#' from the last two phonemes.
    """
    if len(pron) < 3:
        return []

    triphones = []
    for first, second, third in zip(pron, pron[1:], pron[2:]):
        triphones.append(((first, second), third))

    # Word-boundary entries are appended after the interior triphones.
    triphones.append((('#', '#'), pron[0]))
    triphones.append(((pron[-2], pron[-1]), '#'))
    return triphones
                                                
def triphone_probs(prons):
    """Estimate P(phoneme | two-phoneme context) over a set of pronunciations.

    Returns a defaultdict mapping each context to a defaultdict that maps each
    following phoneme to its relative frequency within that context.
    """
    context_counts = defaultdict(lambda: defaultdict(int))

    # Pass 1: tally how often each phoneme follows each context.
    for pron in prons:
        for context, phoneme in make_triphones(pron):
            context_counts[context][phoneme] += 1

    # Pass 2: replace raw counts with relative frequencies, in place.
    for outcomes in context_counts.values():
        total_outcomes = sum(outcomes.values())
        for outcome in list(outcomes):
            outcomes[outcome] = float(outcomes[outcome]) / total_outcomes

    return context_counts

In [None]:
def prepare_data(filepath, img_size, keypoint_img_size, keypoint_size):
    """Load one clip: mouth-crop frames, keypoint masks and phoneme labels.

    Three kinds of files sharing the prefix `filepath` are read:
    `<filepath>.json` (per-frame features), `<filepath>.txt` (transcript on
    its first line) and `<filepath>_*_mouth.jpg` (one image per frame).

    Args:
        filepath: path prefix of the clip, without extension.
        img_size: passed to `image.load_img` as `target_size` for every frame.
        keypoint_img_size: (rows, cols) of each keypoint mask.
        keypoint_size: number of mouth keypoints rendered per frame.

    Returns:
        A tuple (visual_cube, feature_cube, labels):
          * visual_cube: array of frames (frames x rows x cols x channels),
            pixel values divided by 255, in sorted filename order.
          * feature_cube: array with one (rows, cols, keypoint_size) mask per
            frame; channel k is 1.0 at the k-th keypoint and 0 elsewhere.
          * labels: list of phoneme ids for the transcript.

    Raises:
        KeyError: if a frame number is missing from the feature file or a
            phoneme is missing from `phoneme_dict`.
    """
    # `with` closes both files; the original left the handles open.
    with open(filepath + ".json", 'r') as features_file:
        features = json.load(features_file)
    with open(filepath + ".txt", 'r') as text_file:
        text = text_file.readline()

    # Target phonemes. The first five characters of the transcript line are
    # skipped -- presumably a "Text:" prefix; TODO confirm against the data.
    labels = []
    for word in text[5:].lower().strip().split():
        pronunciations = pron_dict.get(word)
        if not pronunciations:
            # Out-of-lexicon word: emit one UNK id rather than raising
            # KeyError and aborting the whole data pipeline.
            labels.append(phoneme_dict["UNK"])
            continue
        # Only the first listed pronunciation is used.
        labels.extend(phoneme_dict[clean_pron(phon)] for phon in pronunciations[0])

    # Largest valid mask indices, derived from the requested mask size rather
    # than a hard-coded 223.
    max_row = keypoint_img_size[0] - 1
    max_col = keypoint_img_size[1] - 1

    visual_cube = []   # frames x rows x cols x channels
    feature_cube = []  # frames x rows x cols x keypoints
    for imgFilename in sorted(glob.glob(filepath + "_*_mouth.jpg")):
        frame = image.img_to_array(
            image.load_img(imgFilename, target_size=img_size)) / 255.0
        visual_cube.append(frame)

        mask = np.zeros((keypoint_img_size[0], keypoint_img_size[1], keypoint_size))
        # The frame number is the second-to-last "_"-separated field of the
        # file name; int() drops any zero padding before the JSON lookup.
        framenum = str(int(imgFilename.split("_")[-2].split(".")[0]))
        f_feature = features[framenum]['mouthCoords']
        for ft_index in range(keypoint_size):
            # Coordinates are shifted by one and clamped into the mask on both
            # sides; without the lower clamp a coordinate of 0 became index -1
            # and silently wrapped to the last row/column.
            keypoint_x = min(max(int(f_feature[ft_index][0]) - 1, 0), max_col)
            keypoint_y = min(max(int(f_feature[ft_index][1]) - 1, 0), max_row)
            mask[keypoint_y, keypoint_x, ft_index] = 1.0
        feature_cube.append(mask)

    return np.array(visual_cube), np.array(feature_cube), labels

In [None]:
def frame_wise_op(inputs, operation, **kwargs):
    """Apply `operation` to every timestep of `inputs` independently.

    `inputs` is laid out as batch x timesteps x ... (for example
    batch x timesteps x w x h x c, or batch x timesteps x c). The two leading
    axes are merged into one, `operation(merged, **kwargs)` is called once,
    and the result is reshaped back to batch x timesteps x <operation output>.
    """
    # Merge batch and time: new shape is [-1] + per-frame dimensions.
    leading_dim = tf.constant(-1, shape=(1,))
    per_frame_dims = tf.shape(inputs)[2:]
    inputs_flat = tf.reshape(inputs, tf.concat([leading_dim, per_frame_dims], axis=0))
    print(inputs, inputs_flat)

    outputs_flat = operation(inputs_flat, **kwargs)

    # Split the merged axis back into the original batch and time axes.
    batch_time_dims = tf.shape(inputs)[:2]
    result_dims = tf.shape(outputs_flat)[1:]
    return tf.reshape(outputs_flat, tf.concat([batch_time_dims, result_dims], 0))

In [None]:
def seq2seq(mode, features, labels, params):
    """Estimator model_fn: per-frame CNN + GRU encoder, GRU decoder over token ids.

    Args:
        mode: a `tf.estimator.ModeKeys` value supplied by the Estimator.
        features: dict with
            'input'  -- float frames, batch x frames x w x h x c;
            'output' -- int64 target ids, batch x seq, padded with END_TOKEN.
        labels: unused; the targets arrive through features['output'].
        params: dict providing 'vocab_size', 'embed_dim', 'num_units',
            'num_encoder_rnns', 'num_decoder_rnns' and 'output_max_length';
            'optimizer' and 'learning_rate' are optional.

    Returns:
        A `tf.estimator.EstimatorSpec` whose predictions are
        'train_pred' (ids from the teacher-forced decoder) and
        'predictions' (ids from the free-running sampling decoder), together
        with the sequence loss and the training op.
    """
    # Batch norm uses batch statistics only while training. It was previously
    # hard-wired to True, so predict-mode outputs depended on whichever other
    # examples shared the batch.
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    vocab_size = params['vocab_size']
    embed_dim = params['embed_dim']
    num_units = params['num_units']
    num_encoder_rnns = params['num_encoder_rnns']
    num_decoder_rnns = params['num_decoder_rnns']
    output_max_length = params['output_max_length']

    inp = features['input']
    output = features['output']
    batch_size = tf.shape(inp)[0]
    # GO_TOKEN is id 0, so a zero vector holds one start token per batch row.
    start_tokens = tf.zeros([batch_size], dtype=tf.int64)
    # Decoder inputs: the start token followed by the target sequence.
    train_output = tf.concat([tf.expand_dims(start_tokens, 1), output], 1)
    # Decoder steps per example = number of non-END ids in the decoder input.
    output_lengths = tf.reduce_sum(tf.to_int32(tf.not_equal(train_output, END_TOKEN)), 1)

    with tf.variable_scope('video_embed', reuse=tf.AUTO_REUSE):
        # Five identical stages: 3x3 conv -> batch norm -> ReLU -> 2x2 max pool.
        # Layers are created in the same order as the original unrolled code.
        net = inp
        for num_filters in (64, 128, 256, 256, 256):
            net = frame_wise_op(net,
                                operation=tf.layers.conv2d,
                                filters=num_filters,
                                kernel_size=[3, 3],
                                padding='same')
            net = tf.layers.batch_normalization(net,
                                                axis=-1,
                                                training=is_training,
                                                trainable=True)
            net = tf.nn.relu(net)
            net = frame_wise_op(net,
                                operation=tf.layers.max_pooling2d,
                                pool_size=[2, 2],
                                strides=2)

        # Flatten each frame's feature map and project it to 256 units.
        pooled_shape = net.shape.as_list()
        flattened = frame_wise_op(net,
                                  operation=tf.reshape,
                                  shape=(-1, pooled_shape[-1] * pooled_shape[-2] * pooled_shape[-3]))
        fc6 = frame_wise_op(flattened,
                            operation=tf.layers.dense,
                            units=256,
                            activation=tf.nn.relu)

    # Embed the decoder inputs and fetch the embedding matrix so that the
    # inference helper can embed the ids it samples.
    output_embed = layers.embed_sequence(
        train_output, vocab_size=vocab_size, embed_dim=embed_dim, scope='embed')
    with tf.variable_scope('embed', reuse=True):
        embeddings = tf.get_variable('embeddings', dtype=tf.float32)

    encoder_cells = [tf.contrib.rnn.GRUCell(num_units=num_units) for _ in range(num_encoder_rnns)]
    stacked_encoder_cell = tf.nn.rnn_cell.MultiRNNCell(encoder_cells)
    # No sequence_length is passed, so the final state is taken after the last
    # frame of the padded tensor.
    encoder_outputs, encoder_final_state = tf.nn.dynamic_rnn(stacked_encoder_cell, fc6, dtype=tf.float32)

    # Teacher forcing for the loss; sampling from the model's own output
    # distribution for free-running prediction.
    train_helper = tf.contrib.seq2seq.TrainingHelper(output_embed, output_lengths)
    pred_helper = tf.contrib.seq2seq.SampleEmbeddingHelper(
        embeddings,
        start_tokens=tf.to_int32(start_tokens), end_token=END_TOKEN,
    )

    def decode(helper, scope, reuse=None):
        """Run the stacked GRU decoder driven by `helper`; return its decoder output."""
        with tf.variable_scope(scope, reuse=reuse):
            decoder_cells = [tf.contrib.rnn.GRUCell(num_units=num_units) for _ in range(num_decoder_rnns)]
            stacked_decoder_cell = tf.nn.rnn_cell.MultiRNNCell(decoder_cells)
            out_cell = tf.contrib.rnn.OutputProjectionWrapper(
                stacked_decoder_cell, vocab_size, reuse=reuse
            )
            # The encoder's final state is copied into the decoder, so the
            # encoder and decoder stacks need matching depth and width.
            # The previous body ignored `helper` and built a BeamSearchDecoder
            # with a misspelled keyword argument; its output has no
            # `rnn_output` / `sample_id`, which the loss and predictions below
            # both require.
            decoder = tf.contrib.seq2seq.BasicDecoder(
                cell=out_cell, helper=helper,
                initial_state=encoder_final_state)
            outputs = tf.contrib.seq2seq.dynamic_decode(
                decoder=decoder, output_time_major=False,
                impute_finished=True, maximum_iterations=output_max_length
            )
            return outputs[0]

    train_outputs = decode(train_helper, 'decode')
    pred_outputs = decode(pred_helper, 'decode', reuse=True)

    # Named tensors fetched by the LoggingTensorHooks set up by the training code.
    tf.identity(train_outputs.sample_id[0], name='train_pred')
    # A step counts towards the loss when its decoder *input* is not END, so
    # the first END target of every sequence is still scored.
    weights = tf.to_float(tf.not_equal(train_output[:, :-1], END_TOKEN))
    loss = tf.contrib.seq2seq.sequence_loss(
        train_outputs.rnn_output, output, weights=weights)
    train_op = layers.optimize_loss(
        loss, tf.train.get_global_step(),
        optimizer=params.get('optimizer', 'Adam'),
        learning_rate=params.get('learning_rate', 0.001),
        # Continuous exponential decay: factor 0.9 per 1000 global steps.
        learning_rate_decay_fn=(lambda learning_rate, global_step: tf.train.exponential_decay(
            learning_rate,
            global_step,
            1000,
            0.9,
            staircase=False,
        )),
        summaries=['loss', 'learning_rate'])

    tf.identity(pred_outputs.sample_id[0], name='predictions')
    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions={'train_pred': train_outputs.sample_id, 'predictions': pred_outputs.sample_id},
        loss=loss,
        train_op=train_op
    )

In [None]:
def tokenize_and_map(line, vocab):
    """Split `line` on single spaces and map each token to its id in `vocab`.

    Tokens missing from `vocab` are mapped to UNK_TOKEN.
    """
    token_ids = []
    for token in line.split(' '):
        token_ids.append(vocab.get(token, UNK_TOKEN))
    return token_ids

def make_input_fn(
        data_dir, subset, list_IDs, prons, phonemes,
        input_max_length, output_max_length,
        img_size, keypoint_img_size,
        keypoint_size,
        batch_size=1,
        input_process=tokenize_and_map, 
        output_process=tokenize_and_map):
    """Build the (input_fn, feed_fn) pair used for training.

    `input_fn` only creates placeholders; the data is supplied by `feed_fn`
    (meant for a `tf.train.FeedFnHook`), which draws `batch_size` random clips
    per call, forever.

    Args:
        data_dir, subset: joined with each clip id to form the clip path.
        list_IDs: sequence of (v_url, v_index) clip identifiers.
        input_max_length: maximum number of frames kept per clip.
        output_max_length: maximum target length including the END token.
        img_size: (rows, cols, channels) of one frame.
        keypoint_img_size, keypoint_size: forwarded to prepare_data().
        batch_size: number of clips per feed.
        prons, phonemes, input_process, output_process: accepted but not used
            by this function.

    Returns:
        (input_fn, feed_fn).
    """

    def input_fn():
        inp = tf.placeholder(tf.float32, shape=[None, None, img_size[0], img_size[1], img_size[2]], name='input')
        output = tf.placeholder(tf.int64, shape=[None, None], name='output')
        # Named views of the first example, fetched by the logging hooks.
        tf.identity(inp[0], 'input_0')
        tf.identity(output[0], 'output_0')
        return {
            'input': inp,
            'output': output,
        }, None

    def sampler():
        # Endless stream of uniformly sampled clips.
        while True:
            rnd = random.randint(0, len(list_IDs)-1)
            v_url, v_index = list_IDs[rnd]
            filepath = os.path.join(data_dir, subset, v_url, v_index)
            v_V, v_F, v_T = prepare_data(filepath, img_size, keypoint_img_size,
                                         keypoint_size)
            yield {
                # Truncate to the frame budget; keypoint masks (v_F) are unused.
                'input': v_V[:input_max_length],
                # Leave room for the END token appended to every target.
                'output': v_T[:output_max_length - 1] + [END_TOKEN]
            }

    sample_me = sampler()

    def feed_fn():
        inputs, outputs = [], []
        input_length, output_length = 0, 0
        for _ in range(batch_size):
            # next() works on Python 2 and 3; the .next() method is Python 2 only.
            rec = next(sample_me)
            inputs.append(rec['input'])
            outputs.append(rec['output'])
            input_length = max(input_length, inputs[-1].shape[0])
            output_length = max(output_length, len(outputs[-1]))
        for i in range(batch_size):
            num_frames = inputs[i].shape[0]
            if input_length > num_frames:
                # Frames are padded at the FRONT with zeros, so the last time
                # step of every padded clip is its last real frame.
                inputs[i] = np.pad(inputs[i], [(input_length - num_frames, 0), (0,0), (0,0), (0,0)], 'constant')
            # Targets are padded at the end with the END token.
            outputs[i] += [END_TOKEN] * (output_length - len(outputs[i]))
        return {
            'input:0': inputs,
            'output:0': outputs
        }

    return input_fn, feed_fn


def get_rev_vocab(vocab):
    """Invert a token -> id mapping into an id -> token dict.

    Uses `.items()`, which exists on both Python 2 and 3 dicts (`.iteritems()`
    is Python 2 only). If several tokens share an id, the last one iterated
    is kept.
    """
    return {idx: key for key, idx in vocab.items()}

def get_formatter(keys, vocab):
    """Build a formatter that renders id sequences as token strings.

    The returned callable takes a dict of id sequences and returns one
    "<key> = <space-joined tokens>" line per entry of `keys`, joined by
    newlines. Ids missing from the vocabulary are rendered as "<UNK>".
    """
    rev_vocab = get_rev_vocab(vocab)

    def to_str(sequence):
        return ' '.join(rev_vocab.get(token_id, "<UNK>") for token_id in sequence)

    def format(values):
        lines = ["%s = %s" % (key, to_str(values[key])) for key in keys]
        return '\n'.join(lines)

    return format

def get_editdist_formatter(keys, vocab):
    """Build a formatter that runs `wer.wer` on a (reference, hypothesis) pair.

    `keys[0]` names the reference sequence and `keys[1]` the hypothesis. Both
    are cut at the first END_TOKEN and converted to token lists before being
    handed to `wer.wer`. The value returned by `wer.wer` is not used; the
    formatter returns only a header line naming `keys[1]`.
    """
    rev_vocab = get_rev_vocab(vocab)

    def tokens_before_end(sequence):
        # Collect ids up to (not including) the first END_TOKEN, then map
        # them to tokens; unknown ids become "<UNK>".
        kept_ids = []
        for token_id in sequence:
            if token_id == END_TOKEN:
                break
            kept_ids.append(token_id)
        return [rev_vocab.get(token_id, "<UNK>") for token_id in kept_ids]

    def format(values):
        reference_tokens = tokens_before_end(values[keys[0]])
        hypothesis_tokens = tokens_before_end(values[keys[1]])
        wer.wer(reference_tokens, hypothesis_tokens)
        return '\n====== WER '+ keys[1]+' ======\n'

    return format

In [None]:
def train_seq2seq(params, data_dir, subset, model_dir, list_IDs, prons, phonemes):
    """Train the seq2seq Estimator for 100000 steps on randomly sampled clips.

    Every 100 iterations the first target of the batch, the two prediction
    tensors and their `wer.wer` comparisons against the target are logged.
    Checkpoints are written to `model_dir`.
    """
    est = tf.estimator.Estimator(
        model_fn=seq2seq,
        model_dir=model_dir, params=params)

    input_fn, feed_fn = make_input_fn(
        data_dir, subset, list_IDs, prons, phonemes,
        params['input_max_length'], params['output_max_length'],
        params['img_size'], params['keypoint_img_size'],
        params['keypoint_size'],
        params['batch_size'])

    def logging_hook(tensor_names, formatter_factory):
        # One hook per group of named tensors, all on the same 100-step cadence.
        return tf.train.LoggingTensorHook(
            tensor_names, every_n_iter=100,
            formatter=formatter_factory(list(tensor_names), phonemes))

    # Hooks that print examples of inputs/predictions.
    logging_hooks = [
        logging_hook(['output_0'], get_formatter),
        logging_hook(['predictions', 'train_pred'], get_formatter),
        logging_hook(['output_0', 'train_pred'], get_editdist_formatter),
        logging_hook(['output_0', 'predictions'], get_editdist_formatter),
    ]

    est.train(
        input_fn=input_fn,
        hooks=[tf.train.FeedFnHook(feed_fn)] + logging_hooks,
        steps=100000)

In [None]:
def make_single_pass_input_fn(
        data_dir, subset, list_IDs, prons, phonemes,
        input_max_length, output_max_length,
        img_size, keypoint_img_size,
        keypoint_size,
        batch_size=1,
        input_process=tokenize_and_map, 
        output_process=tokenize_and_map):
    """Build an (input_fn, feed_fn) pair that walks `list_IDs` exactly once.

    `input_fn` only creates placeholders; `feed_fn` (meant for a
    `tf.train.FeedFnHook`) returns the next `batch_size` clips in list order.
    The last batch may be smaller. Once every clip has been served, `feed_fn`
    raises StopIteration.

    Args:
        data_dir, subset: joined with each clip id to form the clip path.
        list_IDs: sequence of (v_url, v_index) clip identifiers.
        input_max_length: maximum number of frames kept per clip.
        output_max_length: maximum target length including the END token.
        img_size: (rows, cols, channels) of one frame.
        keypoint_img_size, keypoint_size: forwarded to prepare_data().
        batch_size: maximum number of clips per feed.
        prons, phonemes, input_process, output_process: accepted but not used
            by this function.

    Returns:
        (input_fn, feed_fn).
    """

    def input_fn():
        inp = tf.placeholder(tf.float32, shape=[None, None, img_size[0], img_size[1], img_size[2]], name='input')
        output = tf.placeholder(tf.int64, shape=[None, None], name='output')
        # Named views of the first example, available to logging hooks.
        tf.identity(inp[0], 'input_0')
        tf.identity(output[0], 'output_0')
        return {
            'input': inp,
            'output': output,
        }, None

    def sampler():
        # One deterministic pass over the clips, in list order.
        for v_url, v_index in list_IDs:
            filepath = os.path.join(data_dir, subset, v_url, v_index)
            v_V, v_F, v_T = prepare_data(filepath, img_size, keypoint_img_size,
                                         keypoint_size)
            yield {
                # Truncate to the frame budget; keypoint masks (v_F) are unused.
                'input': v_V[:input_max_length],
                # Leave room for the END token appended to every target.
                'output': v_T[:output_max_length - 1] + [END_TOKEN]
            }

    sample_me = sampler()

    def feed_fn():
        inputs, outputs = [], []
        input_length, output_length = 0, 0
        for _ in range(batch_size):
            try:
                # next() works on Python 2 and 3; .next() is Python 2 only.
                rec = next(sample_me)
            except StopIteration:
                if not inputs:
                    # Nothing left at all: signal end of input to the caller.
                    raise
                # Data ran out mid-batch: serve the clips collected so far
                # instead of silently dropping them.
                break
            inputs.append(rec['input'])
            outputs.append(rec['output'])
            input_length = max(input_length, inputs[-1].shape[0])
            output_length = max(output_length, len(outputs[-1]))
        for i in range(len(inputs)):
            num_frames = inputs[i].shape[0]
            if input_length > num_frames:
                # Pad frames at the FRONT, exactly as make_input_fn does for
                # training. The encoder takes its final state from the last
                # time step, so trailing zero frames would be encoded after
                # the real ones and change that state.
                inputs[i] = np.pad(inputs[i], [(input_length - num_frames, 0), (0,0), (0,0), (0,0)], 'constant')
            # Targets are padded at the end with the END token.
            outputs[i] += [END_TOKEN] * (output_length - len(outputs[i]))
        return {
            'input:0': inputs,
            'output:0': outputs
        }

    return input_fn, feed_fn


In [None]:
def eval_seq2seq(params, data_dir, subset, model_dir, list_IDs, prons, phonemes):
    """Run the seq2seq Estimator in predict mode over `list_IDs`, once, in order.

    Args:
        params: hyper-parameter dict consumed by `seq2seq` and the input pipeline.
        data_dir, subset: joined with each clip id to form the clip path.
        model_dir: directory given to the Estimator as its `model_dir`.
        list_IDs: sequence of (v_url, v_index) clip identifiers.
        prons: forwarded to make_single_pass_input_fn.
        phonemes: forwarded to make_single_pass_input_fn.

    Returns:
        The result of `est.predict(...)`. calculate_accuracy() iterates it,
        treating each item as a dict with 'train_pred' and 'predictions'
        for one clip.
    """
    est = tf.estimator.Estimator(
        model_fn=seq2seq,
        model_dir=model_dir, params=params)

    input_fn, feed_fn = make_single_pass_input_fn(
        data_dir, subset, list_IDs, prons, phonemes,
        params['input_max_length'], params['output_max_length'],
        params['img_size'], params['keypoint_img_size'],
        params['keypoint_size'],
        params['batch_size'])

    # Only the feed hook is attached. The four logging hooks that used to be
    # constructed here were never passed to predict(), so they are gone.
    eval_predictions = est.predict(
        input_fn=input_fn,
        hooks=[tf.train.FeedFnHook(feed_fn)],
    )
    return eval_predictions

In [None]:
# Hyper-parameters shared by the model_fn (seq2seq) and the input pipelines.
params = {
    'vocab_size': 42,           # size of the decoder vocabulary / output projection
    'batch_size': 8,            # clips per feed
    'input_max_length': 100,    # maximum frames kept per clip
    'output_max_length': 100,   # maximum target length, END token included
    'embed_dim': 100,           # width of the target-token embedding
    'num_units': 256,           # GRU width for encoder and decoder
    'img_size': (120, 120, 3),  # (rows, cols, channels) of one frame
    'keypoint_img_size': (224, 224),  # (rows, cols) of each keypoint mask
    'keypoint_size': 20,        # mouth keypoints rendered per frame
    'n_classes':39,             # NOTE(review): not read by any cell shown here
    'num_tokens': 42,           # NOTE(review): not read by any cell shown here
    'num_encoder_rnns':2,       # stacked GRU layers in the encoder
    'num_decoder_rnns':2        # stacked GRU layers in the decoder
}

In [None]:
def get_editdist(labels, preds, vocab):
    """Return `wer.wer` of predicted tokens against reference tokens.

    Both id sequences are cut at their first END_TOKEN and mapped to tokens
    through the inverse of `vocab` (ids not found become "<UNK>") before
    being compared.
    """
    rev_vocab = get_rev_vocab(vocab)

    def tokens_before_end(sequence):
        # Keep ids up to (not including) the first END_TOKEN, then map them.
        kept_ids = []
        for token_id in sequence:
            if token_id == END_TOKEN:
                break
            kept_ids.append(token_id)
        return [rev_vocab.get(token_id, "<UNK>") for token_id in kept_ids]

    reference_tokens = tokens_before_end(labels)
    hypothesis_tokens = tokens_before_end(preds)
    return wer.wer(reference_tokens, hypothesis_tokens)

In [None]:
#     tf.logging.logger.setLevel(logging.INFO)
# Train on the 'trainval' subset under SAVE_DIR; the Estimator uses the model
# directory given here.
train_seq2seq(params, SAVE_DIR, 'trainval', './model/seq2seq_copiedstates_no_attentions_bn_2rnn', trainval_ID_list, pron_dict, phoneme_dict)

In [None]:
#     tf.logging.logger.setLevel(logging.INFO)
# Predict over the 'trainval' subset.
# NOTE(review): this model directory differs from the one the training cell
# writes to ('..._bn_2rnn') -- confirm which checkpoint is meant to be evaluated.
eval_predictions = eval_seq2seq(params, SAVE_DIR, 'trainval', './model/seq2seq_copiedstates_no_attentions', trainval_ID_list, pron_dict, phoneme_dict)

In [None]:
def calculate_accuracy(eval_predictions, subset, ID_list, phoneme_dict):
    """Score per-clip predictions against the ground-truth phoneme ids.

    Args:
        eval_predictions: iterable of per-clip dicts with 'train_pred' and
            'predictions' id sequences, in the same order as `ID_list`.
        subset: sub-directory of SAVE_DIR that holds the clips.
        ID_list: sequence of (v_url, v_index) clip identifiers.
        phoneme_dict: phoneme -> id mapping, used to render tokens for WER.

    Returns:
        (mean_edit_distance, mean_accuracy, per_predictions, per_gt), where
        mean_edit_distance averages get_editdist() of 'predictions' over clips,
        mean_accuracy averages the per-clip accuracy of 'train_pred', and
        per_predictions / per_gt are equal-length per-clip id sequences
        suitable for a confusion matrix.
    """
    # Pair every prediction with its clip id (predictions arrive in list order).
    my_results = {}
    for i, item in enumerate(eval_predictions):
        print(item, ID_list[i])
        item_id = ID_list[i][0] + " " + ID_list[i][1]
        my_results[item_id] = item

    acc_edit_distance = []
    per_accuracy = []
    per_predictions = []  # for confusion matrix
    per_gt = []

    for key in my_results:
        vs = key.split()
        v_url = vs[0]
        v_index = vs[1]
        train_pred = my_results[key]['train_pred']
        predictions = my_results[key]['predictions']
        filepath = os.path.join(SAVE_DIR, subset, v_url, v_index)
        # Only the ground-truth ids are needed; frames and masks are discarded.
        _, _, v_T = prepare_data(filepath, params['img_size'], params['keypoint_img_size'], params['keypoint_size'])

        acc_edit_distance.append(get_editdist(list(v_T), predictions, phoneme_dict))

        end_idx = len(v_T)
        # The first two positions are excluded from the comparison while the
        # denominator stays the full target length, as in the original code.
        pred_slice = train_pred[2:end_idx]
        gt_slice = v_T[2:end_idx]
        # The decoder output can be shorter than the untruncated transcript;
        # compare only the overlapping prefix so the element-wise comparison
        # (and the confusion matrix built later) see equal lengths.
        common_len = min(len(pred_slice), len(gt_slice))
        pred_slice = pred_slice[:common_len]
        gt_slice = gt_slice[:common_len]

        num_correct = np.sum(np.equal(pred_slice, np.array(gt_slice)))
        per_accuracy.append(float(num_correct) / end_idx)

        per_predictions.append(pred_slice)
        per_gt.append(gt_slice)

    # Average over ALL clips. The previous code averaged `accuracy`, the
    # scalar of the last clip only, which is unbound when there are no clips.
    return np.mean(acc_edit_distance), \
            np.mean(per_accuracy), per_predictions, per_gt


In [None]:
# Predict over the 'test' subset with the model stored in the directory below.
eval_test_predictions = eval_seq2seq(params, SAVE_DIR, 'test', './model/seq2seq_copiedstates_no_attentions', test_ID_list, pron_dict, phoneme_dict)

In [None]:
# Consume the prediction iterable and score it: mean edit-distance score, mean
# accuracy, and the per-clip prediction / ground-truth id sequences.
test_ed, test_accuracies, test_predictions, test_gt = calculate_accuracy(eval_test_predictions,'test', test_ID_list, phoneme_dict)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """Print and plot the confusion matrix of `y_pred` against `y_true`.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels, same length as `y_true`.
        classes: tick labels. Either one entry per label that occurs in the
            data (in sorted label order), or a full list indexed by integer
            label id, from which the entries for the occurring labels are
            selected.
        normalize: if True, each row is divided by its sum.
        title: plot title; a default is chosen from `normalize` when empty.
        cmap: matplotlib colormap for the image.

    Returns:
        The matplotlib Axes holding the plot.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix. With no explicit `labels` it has one row and
    # column per label that occurs in y_true or y_pred.
    cm = confusion_matrix(y_true, y_pred)

    # Only use the labels that appear in the data: when `classes` is longer
    # than the matrix it is taken to be indexed by label id, and narrowed to
    # the occurring labels so the ticks and tick labels line up.
    if len(classes) != cm.shape[0]:
        present_labels = unique_labels(y_true, y_pred)
        classes = [classes[int(label)] for label in present_labels]

    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A label that occurs only in y_pred has an all-zero row; divide by 1
        # there so the row stays 0 instead of becoming NaN.
        cm = cm.astype('float') / np.where(row_totals == 0, 1, row_totals)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)
    fig, ax = plt.subplots(figsize=(20, 20))
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    fig.tight_layout()
    return ax

In [None]:
import sklearn
# Concatenate the per-clip sequences into two flat, aligned label lists
# (ground truth and predictions) for the confusion matrix.
flattened_test_gt = [label for clip_labels in test_gt for label in clip_labels]

flattened_test_predictions = [label for clip_preds in test_predictions for label in clip_preds]

In [None]:
# Raw (unnormalised) confusion matrix over the labels that occur in the data.
# NOTE(review): cf_mat is not read by any later cell shown here.
cf_mat=sklearn.metrics.confusion_matrix(flattened_test_gt, flattened_test_predictions, labels=None)

In [None]:
# phoneme_dict entries ordered by id; ids run 0..N-1 without gaps, so list
# position i describes label id i. Built here so that this cell works on a
# fresh kernel: it used to rely on a variable defined only in a later cell.
phoneme_re_list = sorted(phoneme_dict.items(), key=lambda x: x[1])
plot_confusion_matrix(flattened_test_gt, flattened_test_predictions, phoneme_re_list,
                          normalize=True,
                          title=None,
                          cmap=plt.cm.Blues)

In [None]:
# (phoneme, id) pairs ordered by id. `.items()` is used because `.iteritems()`
# exists only on Python 2 dicts.
phoneme_re_list = sorted(phoneme_dict.items(), key=lambda x: x[1])