In [None]:
import math
import random
import skimage.io as io
import matplotlib.pyplot as plt
import keras
import pickle
import numpy as np
import os
from os import listdir
from os.path import isfile, join
import sys
# import skvideo.io
import json
from keras.preprocessing import image
from keras.applications.vgg16 import preprocess_input
import glob
import re
from collections import defaultdict

import nltk
from nltk.corpus import cmudict
import tensorflow as tf
import wer

In [None]:
os.environ['CUDA_VISIBLE_DEVICES']='1'

In [None]:
phoneme_list = [] 
phoneme_dict = {}

with open("/n/fs/scratch/jiaqis/cmudict-master/cmudict.phones", 'r') as fp:
    i = 0
    line = fp.readline()
    while line:
        phoneme = line.split()[0].strip()
        phoneme_property = line.split()[1].strip()
        phoneme_list.append((phoneme, phoneme_property))
        phoneme_dict[phoneme] = i+3
        line = fp.readline()
        i=i+1
        
phoneme_dict['START'] = 0
phoneme_dict["END"] = 1
phoneme_dict["UNK"] = 2
print(phoneme_list, phoneme_dict)

In [None]:
pron_dict = cmudict.dict()

In [None]:
DATA_DIR = "/n/fs/scratch/jiaqis/LRS3-TED/"
SAVE_DIR = "/n/fs/scratch/jiaqis/LRS3-TED-Extracted/"

In [None]:
test_ID_list = json.load(open('test_ID_list.json', "r"))
trainval_ID_list = json.load(open('trainval_ID_list.json', "r"))

In [None]:
import logging

import numpy as np
import tensorflow as tf
from tensorflow.contrib import layers


GO_TOKEN = 0
END_TOKEN = 1
UNK_TOKEN = 2

In [None]:
def clean_pron(pron):
    """Remove stress from pronunciations."""
    return re.sub(r"\d", "", pron)

def make_triphones(pron):
    """Output triphones from a word's pronunciation."""
    if len(pron) < 3:
        return []
    # Junk on end is to make word boundaries work
    return ([((pron[idx - 2], pron[idx - 1]), pron[idx])
             for idx in range(2, len(pron))] + [(('#', '#'), pron[0])] +
            [((pron[-2], pron[-1]), '#')])
                                                
def triphone_probs(prons):
    """Calculate triphone probabilities for pronunciations."""
    context_counts = defaultdict(lambda: defaultdict(int))
    for pron in prons:
        for (context, phoneme) in make_triphones(pron):
            context_counts[context][phoneme] += 1
            
    for (context, outcomes) in context_counts.items():
        total_outcomes = sum(outcomes.values())
        for outcome, count in outcomes.items():
            context_counts[context][outcome] = float(count) / total_outcomes
        
    return context_counts

In [None]:
def prepare_data(filepath, img_size, keypoint_img_size, keypoint_size):
    # images
    # frames x rows x cols x channels
    visual_cube = []
    # keypoint features
    feature_cube = []
    features = json.load(open(filepath + ".json", 'r'))
    # Target Text/phonemes
    labels = []
    text = open(filepath+".txt", 'r').readline()
    words = text[5:].lower().strip().split()
    for word in words:
        word_phonemes = pron_dict[word][0]
        word_indices = [phoneme_dict[clean_pron(phon)] for phon in word_phonemes]
        labels.extend(word_indices)
        
    acc = 0
    for imgFilename in sorted(glob.glob(filepath + "_*_mouth.jpg")): # 
#         if 'mouth' in imgFilename: # 
#             continue               #
        x = image.img_to_array(
              image.load_img(imgFilename, target_size=img_size))/255.0
#         x = np.expand_dims(x, axis=0)
#         x = preprocess_input(x)
        visual_cube.append(x)
        
        mask = np.zeros((keypoint_img_size[0], keypoint_img_size[1], keypoint_size))
        framenum = str(int(imgFilename.split("_")[-2].split(".")[0])) # 
        f_feature = features[framenum]['mouthCoords']
        for ft_index in range(keypoint_size):
            # TODO: check range of outputs
            keypoint_x = f_feature[ft_index][0] - 1
            keypoint_y = f_feature[ft_index][1] - 1
            mask[keypoint_y, keypoint_x, ft_index] = 1.0
        feature_cube.append(mask)
        acc+=1
    return np.array(visual_cube), np.array(feature_cube), labels

In [None]:
def frame_wise_op(inputs, operation, **kwargs):
    # batch x timesteps x w x h x c
    # or
    # batch x timesteps x c
    inputs_flat = tf.reshape(inputs, tf.concat([tf.constant(-1, shape=(1,)), 
                                                tf.shape(inputs)[2:]], axis=0))
    print(inputs, inputs_flat)
    outputs_flat = operation(inputs_flat, **kwargs)

    output_shape = tf.concat([tf.shape(inputs)[:2], tf.shape(outputs_flat)[1:]], 0)
    outputs = tf.reshape(outputs_flat, output_shape)

    return outputs

In [None]:
def seq2seq(mode, features, labels, params):
    vocab_size = params['vocab_size']
    embed_dim = params['embed_dim']
    num_units = params['num_units']
    input_max_length = params['input_max_length']
    output_max_length = params['output_max_length']

    inp = features['input']
    output = features['output']
    batch_size = tf.shape(inp)[0]
    start_tokens = tf.zeros([batch_size], dtype=tf.int64)
    print(inp, start_tokens)
    train_output = tf.concat([tf.expand_dims(start_tokens, 1), output], 1)
    # b x f x w x h x c
    input_lengths = tf.reduce_sum(tf.to_int32(tf.reduce_sum(tf.to_int32(tf.not_equal(inp, 0.0)), 
                                                            axis=(2,3,4))>0), 1)
    # b x seq
    output_lengths = tf.reduce_sum(tf.to_int32(tf.not_equal(train_output, END_TOKEN)), 1)
    print(train_output, input_lengths, output_lengths, output)

    with tf.variable_scope('video_embed', reuse=tf.AUTO_REUSE):
        # 224 x 224 x 64
        conv1 = frame_wise_op(inp,
                     operation=tf.layers.conv2d,
                     filters=64,
                     kernel_size=[3, 3],
                     padding='same',
                     activation='relu')
        # 112 x 112 x 64
        down1 = frame_wise_op(conv1,
                            operation=tf.layers.max_pooling2d,
                            pool_size=[2, 2],
                            strides=2)
        # 112 x 112 x 128
        conv2 = frame_wise_op(down1,
                     operation=tf.layers.conv2d,
                     filters=128,
                     kernel_size=[3, 3],
                     padding='same',
                     activation='relu')
        # 56 x 56 x 128
        down2 = frame_wise_op(conv2,
                            operation=tf.layers.max_pooling2d,
                            pool_size=[2, 2],
                            strides=2)
        # 56 x 56 x 256
        conv3 = frame_wise_op(down2,
                     operation=tf.layers.conv2d,
                     filters=256,
                     kernel_size=[3, 3],
                     padding='same',
                     activation='relu')
        # 28 x 28 x 256
        down3 = frame_wise_op(conv3,
                            operation=tf.layers.max_pooling2d,
                            pool_size=[2, 2],
                            strides=2)
        # 28 x 28 x 256
        conv4 = frame_wise_op(down3,
                     operation=tf.layers.conv2d,
                     filters=256,
                     kernel_size=[3, 3],
                     padding='same',
                     activation='relu')
        # 14 x 14 x 256
        down4 = frame_wise_op(conv4,
                            operation=tf.layers.max_pooling2d,
                            pool_size=[2, 2],
                            strides=2)
        # 14 x 14 x 256
        conv5 = frame_wise_op(down4,
                     operation=tf.layers.conv2d,
                     filters=256,
                     kernel_size=[3, 3],
                     padding='same',
                     activation='relu')
        # 7 x 7 x 256
        down5 = frame_wise_op(conv5,
                            operation=tf.layers.max_pooling2d,
                            pool_size=[2, 2],
                            strides=2)
        # 256
        down5_shape =down5.shape.as_list()
        flattened = frame_wise_op(down5, 
                           operation=tf.reshape,
                           shape=(-1, down5_shape[-1]*down5_shape[-2]*down5_shape[-3]))
        fc6 = frame_wise_op(flattened, 
                           operation=tf.layers.dense,
                           units=256,
                           activation='relu')
    
    output_embed = layers.embed_sequence(
        train_output, vocab_size=vocab_size, embed_dim=embed_dim, scope='embed')
    with tf.variable_scope('embed', reuse=True):
        embeddings = tf.get_variable('embeddings')

    cell = tf.contrib.rnn.GRUCell(num_units=num_units)
    encoder_outputs, encoder_final_state = tf.nn.dynamic_rnn(cell, fc6, dtype=tf.float32)
    print(encoder_outputs)

    beam_width = 10

    tiled_encoder_outputs = tf.contrib.seq2seq.tile_batch(
        encoder_outputs, multiplier=beam_width)
    tiled_encoder_final_state = tf.contrib.seq2seq.tile_batch(
        encoder_final_state, multiplier=beam_width)
    tiled_sequence_length = tf.contrib.seq2seq.tile_batch(
        input_lengths, multiplier=beam_width)
    
    train_helper = tf.contrib.seq2seq.TrainingHelper(output_embed, output_lengths)
    # train_helper = tf.contrib.seq2seq.ScheduledEmbeddingTrainingHelper(
    #     output_embed, output_lengths, embeddings, 0.3
    # )
#     pred_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
#         embeddings, start_tokens=tf.to_int32(start_tokens), end_token=1)

    pred_helper = tf.contrib.seq2seq.SampleEmbeddingHelper(
        embeddings,
        start_tokens=tf.to_int32(start_tokens), end_token=1,
    )
    def decode(helper, scope, reuse=None, beam=True):
        with tf.variable_scope(scope, reuse=reuse):
            if beam:
                attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                    num_units=num_units, memory=tiled_encoder_outputs,
                    memory_sequence_length=tiled_sequence_length)
            else:
                attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                    num_units=num_units, memory=encoder_outputs,
                    memory_sequence_length=input_lengths)
            
            cell = tf.contrib.rnn.GRUCell(num_units=num_units)
            attn_cell = tf.contrib.seq2seq.AttentionWrapper(
                cell, attention_mechanism, attention_layer_size=num_units / 2)
            out_cell = tf.contrib.rnn.OutputProjectionWrapper(
                attn_cell, vocab_size, reuse=reuse
            )
            
            if beam:
                decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                            out_cell,
                            embeddings,
                            start_tokens = tf.to_int32(start_tokens),
                            end_token = 1,
                            initial_state = out_cell.zero_state(
                                    dtype=tf.float32, batch_size=batch_size * beam_width),
                            beam_width = beam_width,
                            length_penalty_weight = 0.2,
#                             coverage_penalty_weight = 0.2
                        )
            else:
                decoder = tf.contrib.seq2seq.BasicDecoder(
                    cell=out_cell, helper=helper,
                    initial_state=out_cell.zero_state(
                                    dtype=tf.float32, batch_size=batch_size))

            outputs = tf.contrib.seq2seq.dynamic_decode(
                decoder=decoder, output_time_major=False,
                impute_finished=(not beam), maximum_iterations=output_max_length
            )
            return outputs[0]
    train_outputs = decode(train_helper, 'decode', beam=False)
#     print(train_outputs, train_outputs.rnn_output)
    pred_outputs = decode(pred_helper, 'decode', reuse=True)

    tf.identity(train_outputs.sample_id[0], name='train_pred')
    weights = tf.to_float(tf.not_equal(train_output[:, :-1], 1))
    loss = tf.contrib.seq2seq.sequence_loss(
        train_outputs.rnn_output, output, weights=weights)
    train_op = layers.optimize_loss(
        loss, tf.train.get_global_step(),
        optimizer=params.get('optimizer', 'Adam'),
        learning_rate=params.get('learning_rate', 0.001),
        learning_rate_decay_fn = (lambda p1, p2: tf.train.exponential_decay(
                                                        p1,
                                                        p2,
                                                        1000,
                                                        0.9,
                                                        staircase=False,
                                                    )),
        summaries=['loss', 'learning_rate'])

    tf.identity(pred_outputs.predicted_ids[0, :, 0], name='predictions')
    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions={'train_pred':train_outputs.sample_id, 'predictions':pred_outputs.predicted_ids[:, :, 0]},
        loss=loss,
        train_op=train_op
    )

In [None]:
def tokenize_and_map(line, vocab):
    return [vocab.get(token, UNK_TOKEN) for token in line.split(' ')]

def make_input_fn(
        data_dir, subset, list_IDs, prons, phonemes,
        input_max_length, output_max_length,
        img_size, keypoint_img_size,
        keypoint_size,
        batch_size=1,
        input_process=tokenize_and_map, 
        output_process=tokenize_and_map):

    def input_fn():
        inp = tf.placeholder(tf.float32, shape=[None, None, img_size[0], img_size[1], img_size[2]], name='input')
        output = tf.placeholder(tf.int64, shape=[None, None], name='output')
        tf.identity(inp[0], 'input_0')
        tf.identity(output[0], 'output_0')
        return {
            'input': inp,
            'output': output,
        }, None

    def sampler():
        while True:
            rnd = random.randint(0, len(list_IDs)-1)
            v_ID = list_IDs[rnd]
            v_url, v_index = v_ID
            filepath = os.path.join(data_dir, subset, v_url, v_index)
            v_V, v_F, v_T = prepare_data(filepath, img_size, keypoint_img_size,
                                           keypoint_size)
            num_frames = v_V.shape[0]
            yield {
                    'input':  v_V[:min(num_frames,input_max_length), :, :, :], # v_F[:input_max_length]),
                    'output': v_T[:output_max_length - 1] + [END_TOKEN]
                }

    sample_me = sampler()

    def feed_fn():
        inputs, outputs = [], []
        input_length, output_length = 0, 0
        for i in range(batch_size):
            rec = sample_me.next()
            inputs.append(rec['input'])
            outputs.append(rec['output'])
            num_frames = inputs[-1].shape[0] #
#             print(inputs[-1].shape)
            input_length = max(input_length, num_frames)
            output_length = max(output_length, len(outputs[-1]))
        # Pad me right with </S> token.
        for i in range(batch_size):
            num_frames = inputs[i].shape[0]
            if input_length>num_frames:
                new_input_V = np.pad(inputs[i], [(0, input_length - num_frames), (0,0), (0,0), (0,0)], 'constant')
                # new_input_F = np.pad(inputs[i][1], [output_length - num_frames, 0, 0, 0], 'constant')
                inputs[i] = new_input_V
            outputs[i] += [END_TOKEN] * (output_length - len(outputs[i]))
        return {
            'input:0': inputs,
            'output:0': outputs
        }

    return input_fn, feed_fn


def get_rev_vocab(vocab):
    return {idx: key for key, idx in vocab.iteritems()}

def get_formatter(keys, vocab):
    rev_vocab = get_rev_vocab(vocab)
    def to_str(sequence):
        tokens = [
            rev_vocab.get(x, "<UNK>") for x in sequence]
        return ' '.join(tokens)

    def format(values):
        res = []
        for key in keys:
            res.append("%s = %s" % (key, to_str(values[key])))
        return '\n'.join(res)
    return format

def get_editdist_formatter(keys, vocab):
    rev_vocab = get_rev_vocab(vocab)
    
    def to_str(sequence):
        tokens = [
            rev_vocab.get(x, "<UNK>") for x in sequence]
        return tokens

    def format(values):
        labels = values[keys[0]]  
        clean_labels = []
        for label in labels:
            if label == END_TOKEN:
                break
            else:
                clean_labels.append(label)
        clean_labels = to_str(clean_labels)
        
        preds =  values[keys[1]]
        clean_preds = []
        for pred in preds:
            if pred == END_TOKEN:
                break
            else:
                clean_preds.append(pred)
        clean_preds = to_str(clean_preds)
        wer.wer(clean_labels, clean_preds)
        return '\n====== WER '+ keys[1]+' ======\n'
    
    return format

In [None]:
def train_seq2seq(data_dir, subset, model_dir, list_IDs, prons, phonemes):
    params = {
        'vocab_size': 42,
        'batch_size': 8,
        'input_max_length': 100,
        'output_max_length': 100,
        'embed_dim': 100,
        'num_units': 256,
        'img_size': (120, 120, 3), 
        'keypoint_img_size': (224, 224),
        'keypoint_size': 20,
        'n_classes':39,
        'num_tokens': 42
    }
    est = tf.estimator.Estimator(
        model_fn=seq2seq,
        model_dir=model_dir, params=params)

    input_fn, feed_fn = make_input_fn(
                            data_dir, subset, list_IDs, prons, phonemes,
                            params['input_max_length'], params['output_max_length'],
                            params['img_size'], params['keypoint_img_size'],
                            params['keypoint_size'],
                            params['batch_size'])

    # Make hooks to print examples of inputs/predictions.
    print_inputs = tf.train.LoggingTensorHook(
        ['output_0'], every_n_iter=100,
        formatter=get_formatter(['output_0'], phonemes))
    print_predictions = tf.train.LoggingTensorHook(
        ['predictions', 'train_pred'], every_n_iter=100,
        formatter=get_formatter(['predictions', 'train_pred'], phonemes))
    print_train_edit_distance = tf.train.LoggingTensorHook(
        ['output_0', 'train_pred'], every_n_iter=100,
        formatter=get_editdist_formatter(['output_0', 'train_pred']))
    print_pred_edit_distance = tf.train.LoggingTensorHook(
        ['output_0', 'predictions'], every_n_iter=100,
        formatter=get_editdist_formatter(['output_0', 'predictions']))

    est.train(
        input_fn=input_fn,
        hooks=[tf.train.FeedFnHook(feed_fn), print_inputs, print_predictions, 
                       print_train_edit_distance, print_pred_edit_distance],
        steps=50000)

In [None]:
#     tf.logging.logger.setLevel(logging.INFO)
train_seq2seq(SAVE_DIR, 'trainval', './model/seq2seq', trainval_ID_list, pron_dict, phoneme_dict)

## Inference

In [20]:
def make_single_pass_input_fn(
        data_dir, subset, list_IDs, prons, phonemes,
        input_max_length, output_max_length,
        img_size, keypoint_img_size,
        keypoint_size,
        batch_size=1,
        input_process=tokenize_and_map, 
        output_process=tokenize_and_map):

    def input_fn():
        inp = tf.placeholder(tf.float32, shape=[None, None, img_size[0], img_size[1], img_size[2]], name='input')
        output = tf.placeholder(tf.int64, shape=[None, None], name='output')
        tf.identity(inp[0], 'input_0')
        tf.identity(output[0], 'output_0')
        return {
            'input': inp,
            'output': output,
        }, None

    def sampler():
        for rnd in range(len(list_IDs)):
            v_ID = list_IDs[rnd]
            v_url, v_index = v_ID
            filepath = os.path.join(data_dir, subset, v_url, v_index)
            v_V, v_F, v_T = prepare_data(filepath, img_size, keypoint_img_size,
                                           keypoint_size)
            num_frames = v_V.shape[0]
            yield {
                    'input':  v_V[:min(num_frames,input_max_length), :, :, :], # v_F[:input_max_length]),
                    'output': v_T[:output_max_length - 1] + [END_TOKEN]
                }
        return
#         raise tf.errors.OutOfRangeError

    sample_me = sampler()

    def feed_fn():
        inputs, outputs = [], []
        input_length, output_length = 0, 0
        for i in range(batch_size):
            rec = sample_me.next()
            inputs.append(rec['input'])
            outputs.append(rec['output'])
            num_frames = inputs[-1].shape[0] #
#             print(inputs[-1].shape)
            input_length = max(input_length, num_frames)
            output_length = max(output_length, len(outputs[-1]))
        # Pad me right with </S> token.
        for i in range(batch_size):
            num_frames = inputs[i].shape[0]
            if input_length>num_frames:
                new_input_V = np.pad(inputs[i], [(0, input_length - num_frames), (0,0), (0,0), (0,0)], 'constant')
                # new_input_F = np.pad(inputs[i][1], [output_length - num_frames, 0, 0, 0], 'constant')
                inputs[i] = new_input_V
            outputs[i] += [END_TOKEN] * (output_length - len(outputs[i]))
        return {
            'input:0': inputs,
            'output:0': outputs
        }

    return input_fn, feed_fn


In [21]:
def eval_seq2seq(params, data_dir, subset, model_dir, list_IDs, prons, phonemes):
    est = tf.estimator.Estimator(
        model_fn=seq2seq,
        model_dir=model_dir, params=params)

    input_fn, feed_fn = make_single_pass_input_fn(
                            data_dir, subset, list_IDs, prons, phonemes,
                            params['input_max_length'], params['output_max_length'],
                            params['img_size'], params['keypoint_img_size'],
                            params['keypoint_size'],
                            params['batch_size'])

    # Make hooks to print examples of inputs/predictions.
    print_inputs = tf.train.LoggingTensorHook(
        ['output_0'], every_n_iter=100,
        formatter=get_formatter(['output_0'], phonemes))
    print_predictions = tf.train.LoggingTensorHook(
        ['predictions', 'train_pred'], every_n_iter=100,
        formatter=get_formatter(['predictions', 'train_pred'], phonemes))
    print_train_edit_distance = tf.train.LoggingTensorHook(
        ['output_0', 'train_pred'], every_n_iter=100,
        formatter=get_editdist_formatter(['output_0', 'train_pred'], phonemes))
    print_pred_edit_distance = tf.train.LoggingTensorHook(
        ['output_0', 'predictions'], every_n_iter=100,
        formatter=get_editdist_formatter(['output_0', 'predictions'], phonemes))
    
    eval_predictions = est.predict(
        input_fn = input_fn,
        hooks=[tf.train.FeedFnHook(feed_fn)], 
#                print_inputs, print_predictions, 
#                        print_train_edit_distance, print_pred_edit_distance],
    )
    return eval_predictions

In [22]:
params = {
    'vocab_size': 42,
    'batch_size': 8,
    'input_max_length': 100,
    'output_max_length': 100,
    'embed_dim': 100,
    'num_units': 256,
    'img_size': (120, 120, 3), 
    'keypoint_img_size': (224, 224),
    'keypoint_size': 20,
    'n_classes':39,
    'num_tokens': 42,
    'num_encoder_rnns':2,
    'num_decoder_rnns':2
}

In [23]:
def get_editdist(labels, preds, vocab):
    rev_vocab = get_rev_vocab(vocab)
    
    def to_str(sequence):
        tokens = [
            rev_vocab.get(x, "<UNK>") for x in sequence]
        return tokens

    clean_labels = []
    for label in labels:
        if label == END_TOKEN:
            break
        else:
            clean_labels.append(label)
    str_clean_labels = to_str(clean_labels)

    clean_preds = []
    for pred in preds:
        if pred == END_TOKEN:
            break
        else:
            clean_preds.append(pred)
    str_clean_preds = to_str(clean_preds)        
    score = wer.wer(str_clean_labels, str_clean_preds)
    
    return score

In [24]:
def calculate_accuracy(eval_predictions, subset, ID_list, phoneme_dict):
    my_results = {}
    i=0
    for item in eval_predictions:
        print(item, ID_list[i])
        item_id =  ID_list[i][0]+" "+ ID_list[i][1]
        my_results[item_id] = item
        i+=1
        break
    
    acc_edit_distance = []
    per_accuracy = []
    per_predictions = [] # for confusion matrix
    per_gt = []
    
    for key in my_results:
        vs = key.split()
        v_url = vs[0]
        v_index = vs[1]
        train_pred = my_results[key]['train_pred']
        predictions = my_results[key]['predictions']
        filepath = os.path.join(SAVE_DIR, subset, v_url, v_index)
        v_V, v_F, v_T = prepare_data(filepath,  params['img_size'], params['keypoint_img_size'], params['keypoint_size'])
    #     print(v_T, my_results[key])
        ed = get_editdist(list(v_T), predictions, phoneme_dict)
        acc_edit_distance.append(ed)
        
        end_idx = len(v_T)
        
        accuracy = float(np.sum(np.equal(train_pred[2:end_idx], np.array(v_T[2:]))))/end_idx
        per_accuracy.append(accuracy)
        
        per_predictions.append(train_pred[2:end_idx])
        per_gt.append(v_T[2:end_idx])
    return np.mean(acc_edit_distance), \
            np.mean(accuracy), per_predictions, per_gt

In [36]:
def calculate_accuracy(eval_predictions, subset, ID_list, phoneme_dict):
    my_results = {}
    i=0
    for item in eval_predictions:
        print(item, ID_list[i])
        item_id =  ID_list[i][0]+" "+ ID_list[i][1]
        my_results[item_id] = item
        i+=1
    
    acc_edit_distance = []
    per_accuracy = []
    per_predictions = [] # for confusion matrix
    per_gt = []
    
    for key in my_results:
        vs = key.split()
        print(vs)
        v_url = vs[0]
        v_index = vs[1]
        train_pred = my_results[key]['train_pred']
        predictions = my_results[key]['predictions']
        filepath = os.path.join(SAVE_DIR, subset, v_url, v_index)
        v_V, v_F, v_T = prepare_data(filepath,  params['img_size'], params['keypoint_img_size'], params['keypoint_size'])
    #     print(v_T, my_results[key])
        ed = get_editdist(list(v_T), predictions, phoneme_dict)
        acc_edit_distance.append(ed)
        
        end_idx = len(v_T)
        
        accuracy = float(np.sum(np.equal(train_pred[2:end_idx], np.array(v_T[2:]))))/end_idx
        per_accuracy.append(accuracy)
        
        per_predictions.append(train_pred[2:end_idx])
        per_gt.append(v_T[2:end_idx])
    return np.mean(acc_edit_distance), \
            np.mean(accuracy), per_predictions, per_gt


In [37]:
#     tf.logging.logger.setLevel(logging.INFO)
eval_predictions = eval_seq2seq(params, SAVE_DIR, 'trainval', './model/seq2seq', trainval_ID_list[:500], pron_dict, phoneme_dict)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_task_type': 'worker', '_global_id_in_cluster': 0, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fe2c1360290>, '_evaluation_master': '', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_num_ps_replicas': 0, '_tf_random_seed': None, '_master': '', '_device_fn': None, '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_model_dir': './model/seq2seq', '_train_distribute': None, '_save_summary_steps': 100}


In [None]:
tv_ed, tv_accuracies, tv_predictions, tv_gt = calculate_accuracy(eval_predictions,'trainval', trainval_ID_list[:500], phoneme_dict)

In [None]:
print(tv_ed, tv_accuracies, tv_predictions, tv_gt)

In [38]:
#     tf.logging.logger.setLevel(logging.INFO)
eval_test_predictions = eval_seq2seq(params, SAVE_DIR, 'test', './model/seq2seq', test_ID_list, pron_dict, phoneme_dict)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_task_type': 'worker', '_global_id_in_cluster': 0, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fe2c43b2d50>, '_evaluation_master': '', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_num_ps_replicas': 0, '_tf_random_seed': None, '_master': '', '_device_fn': None, '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_model_dir': './model/seq2seq', '_train_distribute': None, '_save_summary_steps': 100}


In [39]:
test_ed, test_accuracies, test_predictions, test_gt = calculate_accuracy(eval_test_predictions,'test', test_ID_list, phoneme_dict)

INFO:tensorflow:Calling model_fn.
(<tf.Tensor 'input:0' shape=(?, ?, 120, 120, 3) dtype=float32>, <tf.Tensor 'zeros:0' shape=(?,) dtype=int64>)
(<tf.Tensor 'concat:0' shape=(?, ?) dtype=int64>, <tf.Tensor 'Sum_1:0' shape=(?,) dtype=int32>, <tf.Tensor 'Sum_2:0' shape=(?,) dtype=int32>, <tf.Tensor 'output:0' shape=(?, ?) dtype=int64>)
(<tf.Tensor 'input:0' shape=(?, ?, 120, 120, 3) dtype=float32>, <tf.Tensor 'video_embed/Reshape:0' shape=(?, 120, 120, 3) dtype=float32>)
(<tf.Tensor 'video_embed/Reshape_1:0' shape=(?, ?, 120, 120, 64) dtype=float32>, <tf.Tensor 'video_embed/Reshape_2:0' shape=(?, 120, 120, 64) dtype=float32>)
(<tf.Tensor 'video_embed/Reshape_3:0' shape=(?, ?, 60, 60, 64) dtype=float32>, <tf.Tensor 'video_embed/Reshape_4:0' shape=(?, 60, 60, 64) dtype=float32>)
(<tf.Tensor 'video_embed/Reshape_5:0' shape=(?, ?, 60, 60, 128) dtype=float32>, <tf.Tensor 'video_embed/Reshape_6:0' shape=(?, 60, 60, 128) dtype=float32>)
(<tf.Tensor 'video_embed/Reshape_7:0' shape=(?, ?, 30, 30, 

({'predictions': array([34,  4, 26, 22, 39, 36, 37, 13, 30, 20, 24,  5, 10,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([38, 11, 27, 25, 33, 25, 27, 19, 12, 19, 33, 12, 36, 25,  0,  0,  0,
        0,  0,  0,  0], dtype=int32)}, [u'HSercnrqeN4', u'00002'])
({'predictions': array([34,  4, 26, 22, 39, 36, 37, 13, 30, 20, 24,  5, 10,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([38, 25, 11,  8, 13, 31, 38,  5, 33,  1,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0], dtype=int32)}, [u'HSercnrqeN4', u'00004'])
({'predictions': array([ 5, 25, 11, 18, 20, 31, 13, 11, 38, 13, 23, 39, 36, 16,  7, 25, 11,
       12, 13, 24,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([12,  5, 33, 12, 20, 25, 23, 25, 27, 30, 39,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0], dtype=int32)}, [u'wmMrZyVANUo', u'00002'])
({'predictions': array([ 5, 25, 11

({'predictions': array([ 5, 25, 11,  8, 38,  3, 40, 23,  8, 22, 38, 13, 23, 25, 27,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([12, 27, 12, 30, 30, 20,  5, 23,  5, 19,  1,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0], dtype=int32)}, [u'66koScWSHBU', u'00005'])
({'predictions': array([34,  4, 26, 22, 39, 36, 37, 13, 30, 20, 24,  5, 10,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([38,  4, 18, 14, 33, 24, 13, 33, 30, 24, 40,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0], dtype=int32)}, [u'66koScWSHBU', u'00003'])
({'predictions': array([19, 33, 38,  3, 40,  5, 25, 33,  6, 23, 38, 15, 40, 23,  8, 22, 12,
       19, 31,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([38,  7, 11, 35, 11, 12, 25, 35, 33, 14,  4, 24,  5, 36, 25, 11,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0], dtype=int32)}, [u'shG0ezBeeJc

({'predictions': array([19, 33, 38,  3, 40,  5, 25, 33,  6, 23, 38, 15, 40, 23,  8, 22, 12,
       19, 31,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([38,  7, 11,  6, 30, 12,  9, 22,  5, 25,  8, 23,  5,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0], dtype=int32)}, [u'2UStOghblfE', u'00004'])
({'predictions': array([18,  7, 11, 36, 38, 20, 31, 29, 20, 22,  5, 29,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([38,  7, 11, 36, 38, 15, 11, 30, 30, 29, 33, 14,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0], dtype=int32)}, [u'2UStOghblfE', u'00003'])
({'predictions': array([12,  4, 33, 31,  5, 17, 30, 15, 33, 29, 23, 15, 31, 33, 36,  9, 20,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([12, 36, 31,  8, 33, 22, 20, 14, 33, 33, 24,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0], dtype=int32)}, [u'3OgsRa7VDtI

({'predictions': array([18,  7, 11, 36, 38, 20, 31, 29, 20, 22,  5, 29,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([38, 33, 31,  4, 40,  5,  3,  1,  5, 25,  1,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
      dtype=int32)}, [u'TScetimMBL8', u'00004'])
({'predictions': array([ 5, 25, 11, 31, 27, 12, 15, 11, 27, 25, 33, 11, 36, 13, 25, 20, 34,
       19, 26,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([12,  4, 33, 31, 38,  3, 24, 19, 26, 40, 29, 28, 25,  5, 23, 19,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
      dtype=int32)}, [u'TScetimMBL8', u'00003'])
({'predictions': array([ 5, 25, 11,  8, 38,  3, 40, 23,  8, 22, 38, 13, 23, 25, 27,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([12, 24, 27, 25, 33, 25, 20, 25, 12,  8, 16,  4, 26,  1, 24, 14, 13,
       13

({'predictions': array([ 5, 25, 11,  8, 38,  3, 40, 23,  8, 22, 38, 13, 23, 25, 27,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([12,  4, 16, 19, 23, 19, 19, 33,  6, 16, 13,  5, 17, 20, 13,  1, 25,
       13, 13, 18,  5,  0,  0,  0,  0,  0], dtype=int32)}, [u'6wNif5SlN08', u'00001'])
({'predictions': array([19, 33, 38,  3, 40,  5, 25, 33,  6, 23, 38, 15, 40, 23,  8, 22, 12,
       19, 31,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([38, 27, 39, 36, 23, 20, 23, 13, 24,  5, 14,  1, 19, 25,  1, 11,  1,
       13, 40,  5,  0,  0,  0,  0,  0,  0], dtype=int32)}, [u'U6PoUg7jXsA', u'00002'])
({'predictions': array([ 5, 25, 11, 12,  4, 33, 31, 38,  5, 33, 12, 13, 30,  4, 22, 10,  5,
       38,  5, 23, 20, 11, 36, 19, 26,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([12, 15, 31, 19, 40, 38,  4, 33, 33, 13, 22,  5, 32, 38,  8, 22, 31,
        4, 33, 29,  7, 25, 23, 23, 40,  0], dtype=int32)}, [u

({'predictions': array([18,  7, 11, 36, 38, 20, 31, 29, 20, 22,  5, 29,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([38,  4, 18, 15, 11, 33,  4, 24, 13, 25, 14, 19,  0,  0,  0,  0,  0,
        0], dtype=int32)}, [u'SWiwSBLXS3k', u'00002'])
({'predictions': array([ 5, 25, 11, 31, 27, 12, 15, 11, 27, 25, 33, 11, 36, 13, 25, 20, 34,
       19, 26,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([12, 27, 12,  5, 33, 11,  5, 11, 12, 15, 11, 36,  1,  0,  0,  0,  0,
        0], dtype=int32)}, [u'SWiwSBLXS3k', u'00008'])
({'predictions': array([18,  7, 11, 36, 38, 20, 31,  5, 29,  6, 30, 33,  7, 14, 16, 30, 13,
       25, 11, 40,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([38, 25, 11, 12,  7, 33, 31,  5, 31, 33,  5,  5, 33,  1,  0,  0,  0,
        0], dtype=int32)}, [u'SWiwSBLXS3k', u'00001'])
({'predictions': array([ 5, 25, 11,  8, 38,  3, 40, 23,  8, 22, 38, 13, 23, 25, 27,  1,  1,
  

({'predictions': array([ 5, 25, 11,  8, 38,  3, 40, 23,  8, 22, 38, 13, 23, 25, 27,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([12, 24, 27, 39, 36, 25,  9, 27, 19, 31, 33,  4, 25, 19, 39,  4, 33,
        1,  0,  0,  0], dtype=int32)}, [u'SN7wO06Yz1E', u'00009'])
({'predictions': array([ 5, 25, 11,  8, 38,  3, 40, 23,  8, 22, 38, 13, 23, 25, 27,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([12, 20, 33, 19, 40, 12,  5, 31, 25, 14,  5, 12,  4, 12,  1,  5, 25,
        1,  0,  0,  0], dtype=int32)}, [u'SN7wO06Yz1E', u'00003'])
({'predictions': array([ 5, 25, 11, 12,  4, 33, 31, 38,  5, 33, 38,  3, 40, 11, 30,  8, 37,
       19, 26, 19, 33,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([12, 19, 40,  5,  4, 33, 15, 30, 14,  5, 15, 15, 22,  5, 24,  1,  6,
       11, 25, 14, 40], dtype=int32)}, [u'8nbeD1mwCdo', u'00001'])
({'predictions': array([ 5, 25, 11

({'predictions': array([ 5, 25, 11, 12,  4, 33, 31, 38, 13, 25, 23, 14, 25, 19, 26, 30, 19,
       23, 20,  9, 19, 17,  4, 25, 16,  6, 30, 24, 20,  1], dtype=int32), 'train_pred': array([12, 13, 22,  8,  8, 19, 20, 40,  5, 11, 19, 11, 20, 23,  4, 11, 33,
       36,  9, 20, 31,  5, 23, 19, 30,  1,  0,  0,  0,  0,  0],
      dtype=int32)}, [u'Us70DN2XSfM', u'00001'])
({'predictions': array([18,  7, 11, 36, 38, 20, 31, 29, 20, 22,  5, 29,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([38, 25, 11, 12, 33, 31, 40, 25, 33, 29, 30, 30, 33,  5, 11, 33,  1,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
      dtype=int32)}, [u'uzKBGtf0i0M', u'00002'])
({'predictions': array([ 5, 25, 11, 31, 27, 12, 15, 11, 27, 25, 33, 11, 36, 13, 25, 20, 34,
       19, 26,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([38, 38, 35, 22, 12, 33, 18, 36, 23, 19, 22, 39, 31, 39, 31,  0,  0,
        0

({'predictions': array([12,  4, 33, 31,  5, 17, 30, 15, 33, 29, 23, 15, 31, 33, 36,  9, 20,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([12, 20, 18, 25, 17, 20, 37, 19, 25, 33, 30, 19, 25, 19, 25, 20, 24,
        6, 30,  1,  0,  0,  0,  0,  0,  0], dtype=int32)}, [u'0Fi83BHQsMA', u'00002'])
({'predictions': array([ 5, 25, 11, 12,  4, 33, 31, 38,  5, 33, 12, 13, 30,  4, 22, 10,  5,
       38,  5, 23, 20, 11, 36, 19, 26,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([12, 22, 23,  5, 37,  5,  7, 25, 33,  5, 38, 15, 27, 19, 26, 33, 36,
       11, 27, 33, 12, 33,  3,  0,  0,  0], dtype=int32)}, [u'0Fi83BHQsMA', u'00004'])
({'predictions': array([ 5, 25, 11, 12,  4, 33, 31, 38, 13, 25, 23, 14, 25, 19, 26, 30, 19,
       23, 20,  9, 19, 17,  4, 25, 16,  6, 30, 24, 20,  1], dtype=int32), 'train_pred': array([12,  5, 33,  8,  5, 22, 38, 13, 31, 10,  5, 25, 32, 40, 32,  4, 37,
        1, 22,  1,  4, 22, 33, 25,  1,  0], dtype=int32)}, [u

({'predictions': array([34,  4, 26, 22, 39, 36, 37, 13, 30, 20, 24,  5, 10,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([38, 33, 31, 30, 40, 31,  8, 30,  1,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0], dtype=int32)}, [u'eVFYhbHpfqU', u'00011'])
({'predictions': array([38,  5, 33, 11, 36,  8, 24, 20, 25,  9,  8, 12,  4, 33,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([38, 27,  8,  3, 33, 33,  4, 40,  5, 37,  1,  0,  0,  0,  0,  0,  0,
        0,  0,  0], dtype=int32)}, [u'eVFYhbHpfqU', u'00008'])
({'predictions': array([18,  7, 11, 36, 38, 20, 31,  5, 29,  6, 30, 33,  7, 14, 16, 30, 13,
       25, 11, 40,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([38, 33, 31,  5,  8, 30, 23,  9, 38, 16, 25,  4,  0,  0,  0,  0,  0,
        0,  0,  0], dtype=int32)}, [u'eVFYhbHpfqU', u'00009'])
({'predictions': array([ 5, 25, 11,  8, 38,  3, 40, 23,  8, 22, 38, 13

({'predictions': array([34,  4, 26, 22, 39, 36, 37, 13, 30, 20, 24,  5, 10,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([38,  5, 23, 12,  3, 33, 22, 36, 25, 19,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0], dtype=int32)}, [u'kwYxHPXIaao', u'00001'])
({'predictions': array([ 5, 25, 11, 12,  4, 33, 31, 38, 13, 25, 23, 14, 25, 19, 26, 30, 19,
       23, 20,  9, 19, 17,  4, 25, 16,  6, 30, 24, 20,  1], dtype=int32), 'train_pred': array([12, 33, 31, 25,  3, 10, 14, 19, 25, 20,  6, 24, 31, 31, 31, 36, 37,
       38,  8, 24, 33,  5,  1,  0], dtype=int32)}, [u'7gXq1ml6B1E', u'00001'])
({'predictions': array([ 5, 25, 11, 12,  4, 33, 31, 38,  5, 33, 38,  3, 40, 11, 30,  8, 37,
       19, 26, 19, 33,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([12, 27, 15, 23, 20, 40,  5, 30, 12, 37, 15, 40, 19, 26,  1,  0,  0,
        0,  0,  0,  0,  0,  0,  0], dtype=int32)}, [u'7kkRkhAXZGg', u'00006']

({'predictions': array([34,  4, 26, 22, 39, 36, 37, 13, 30, 20, 24,  5, 10,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([38, 25, 11,  8,  4, 25,  8, 13, 33, 38,  5, 33,  1,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0], dtype=int32)}, [u'qsJYrwzfd6w', u'00005'])
({'predictions': array([ 5, 25, 11,  8, 38,  3, 40, 23,  8, 22, 38, 13, 23, 25, 27,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([12, 15, 33, 31, 38,  3, 33, 12, 30, 23, 20, 37, 33,  1,  7, 23, 29,
       23, 31,  5, 33,  5, 23,  1], dtype=int32)}, [u'tzJYY2p0QIc', u'00003'])
({'predictions': array([34,  4, 26, 22, 39, 36, 37, 13, 30, 20, 24,  5, 10,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([38, 15, 22, 19, 20, 24,  5, 26, 38,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0], dtype=int32)}, [u'6OaIdwUdSxE', u'00003']

({'predictions': array([34,  4, 26, 22, 39, 36, 37, 13, 30, 20, 24,  5, 10,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([38, 25, 11,  8,  4, 33, 31, 19, 30, 33,  8,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0], dtype=int32)}, [u'AMVgX8cXsHA', u'00002'])
({'predictions': array([18,  7, 11, 36, 38, 20, 11, 36, 12,  4, 33,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([38,  4, 13, 33, 22,  4, 33, 25,  5, 25,  5, 33,  1,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0], dtype=int32)}, [u'JISHzvXk5bk', u'00001'])
({'predictions': array([ 5, 25, 11,  8, 38,  3, 40, 23,  8, 22, 38, 13, 23, 25, 27,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([12,  7, 11,  6, 30, 12,  9, 14, 34, 19, 40, 12,  4, 31, 15, 29, 33,
       26,  5,  4, 24,  1,  0

({'predictions': array([ 5, 25, 11,  8, 38,  3, 40, 23,  8, 22, 38, 13, 23, 25, 27,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([12, 25, 11, 12, 20, 33, 38,  3, 40, 17,  5, 29, 36, 24, 23, 23,  1,
       36, 24, 12,  4, 33,  1,  0,  0,  0], dtype=int32)}, [u'NA7krbsdXFA', u'00003'])
({'predictions': array([18,  7, 11, 36, 38, 20, 31,  5, 29,  6, 30, 33,  7, 14, 16, 30, 13,
       25, 11, 40,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([38, 20, 18, 14, 11, 17,  5, 31, 33, 25, 27, 33, 19, 27, 31, 25, 19,
       11, 31,  0,  0,  0,  0,  0,  0,  0], dtype=int32)}, [u'npNYP2vzaPo', u'00002'])
({'predictions': array([12,  4, 33, 31,  5, 17, 30, 15, 33, 29, 23, 15, 31, 33, 36,  9, 20,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([12, 24, 25, 19, 25, 20,  6, 37, 39, 24, 22, 20,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0], dtype=int32)}, [u

({'predictions': array([ 5, 25, 11, 12,  4, 33, 31, 38,  5, 33, 12, 13, 30,  4, 22, 10,  5,
       38,  5, 23, 20, 11, 36, 19, 26,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([12, 36, 30, 24, 19, 19, 33, 19, 25, 12,  5, 19, 11, 20,  9,  5, 20,
       11, 25,  4,  7, 30, 33, 19, 25, 33, 19,  3, 22,  1], dtype=int32)}, [u'fC9da6eqaqg', u'00001'])
({'predictions': array([ 5, 25, 11,  8, 38,  3, 40, 23,  8, 22, 38, 13, 23, 25, 27,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([12, 25, 11, 12, 20, 11, 40, 24,  4, 33, 29,  4, 30,  5,  9, 23, 23,
        1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0], dtype=int32)}, [u'u4L130DkdOw', u'00001'])
({'predictions': array([34,  4, 26, 22, 39, 36, 37, 13, 30, 20, 24,  5, 10,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([38, 15, 22, 19, 33, 39, 30, 19, 10, 15, 24,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0

({'predictions': array([38,  5, 33, 11, 36,  8, 24, 20, 25,  9,  8, 12,  4, 33,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([38,  4, 26, 22, 39, 36, 37, 13, 30, 20, 24,  5, 10,  1,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0], dtype=int32)}, [u'VfAiNX7jn9A', u'00012'])
({'predictions': array([18,  7, 11, 36, 38, 20, 31, 29, 20, 22,  5, 29,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([38, 25, 11, 12,  4, 33, 31, 25,  5, 33,  8,  3, 18, 19, 11, 14,  0,
        0,  0,  0,  0,  0,  0,  0,  0], dtype=int32)}, [u'VfAiNX7jn9A', u'00004'])
({'predictions': array([ 5, 25, 11, 18, 20, 31, 13, 11, 38, 13, 23, 39, 36, 16,  7, 25, 11,
       12, 13, 24,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([12, 36, 26, 29, 20, 29,  5, 23, 23, 31,  5, 13, 30,  1,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0], dtype=int32)}, [u'VfAiNX7jn9A

({'predictions': array([ 5, 25, 11, 12,  4, 33, 31, 38, 13, 25, 23, 14, 25, 19, 26, 30, 19,
       23, 20,  9, 19, 17,  4, 25, 16,  6, 30, 24, 20,  1], dtype=int32), 'train_pred': array([12, 27,  8, 24,  3, 25, 33, 33, 36, 31,  3, 33, 18, 25,  5, 16,  1,
       31, 33, 10,  5, 24,  5, 11,  1,  0], dtype=int32)}, [u'DsrxbqFo41k', u'00002'])
({'predictions': array([ 5, 25, 11, 12,  4, 33, 31, 38, 13, 25, 23, 14, 25, 19, 26, 30, 19,
       23, 20,  9, 19, 17,  4, 25, 16,  6, 30, 24, 20,  1], dtype=int32), 'train_pred': array([12, 38,  3, 25, 12,  9, 33,  9, 19, 30, 20, 17,  3, 11, 20, 22, 25,
        8, 23, 19, 22,  5, 30, 26, 22,  1], dtype=int32)}, [u'0QVXdEOiCw8', u'00001'])
({'predictions': array([ 5, 25, 11, 12,  4, 33, 31, 38,  5, 33, 38,  3, 40, 11, 30,  8, 37,
       19, 26, 19, 33,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([12, 27,  8,  5, 40, 24,  8, 24, 14, 23, 36, 24, 11, 33, 24,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0], dtype=int32)}, [u

({'predictions': array([ 5, 25, 11,  8, 38,  3, 40, 23,  8, 22, 38, 13, 23, 25, 27,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([12, 25, 11, 12,  3, 38, 16, 35, 37, 12,  8, 16, 33, 40,  5, 21,  1,
       40,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
      dtype=int32)}, [u'iPE2SiCCo0w', u'00021'])
({'predictions': array([ 5, 25, 11, 31, 27, 12, 15, 11, 27, 25, 33, 11, 36, 13, 25, 20, 34,
       19, 26,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([38, 25, 11, 12, 38, 19, 33, 31, 15, 31,  1, 19, 11,  9, 20,  1, 30,
       36, 29,  1, 20, 25,  1,  0,  0,  0,  0,  0,  0,  0,  0],
      dtype=int32)}, [u'iPE2SiCCo0w', u'00009'])
({'predictions': array([ 5, 25, 11, 12,  4, 33, 31, 38, 13, 25, 23, 14, 25, 19, 26, 30, 19,
       23, 20,  9, 19, 17,  4, 25, 16,  6, 30, 24, 20,  1], dtype=int32), 'train_pred': array([12, 25, 11,  8,  4, 31, 19, 19, 33, 19, 40, 40,  3, 23, 33,  5, 25,
       12

({'predictions': array([19, 33, 38,  3, 40,  5, 25, 33,  6, 23, 38, 15, 40, 20, 40, 20,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([12, 20, 23, 38, 36, 30, 25, 13, 31, 31, 33, 13, 37, 19, 31,  6, 13,
        4, 25, 31,  1, 13,  4, 33, 20], dtype=int32)}, [u'VAJS5J21Rd0', u'00004'])
({'predictions': array([18,  7, 11, 36, 38, 20, 31, 29, 20, 22,  5, 29,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([38, 30, 39, 13,  9, 29, 30,  5, 25, 33,  5, 22,  1,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0], dtype=int32)}, [u'VAJS5J21Rd0', u'00001'])
({'predictions': array([ 5, 25, 11,  8, 38,  3, 40, 23,  8, 22, 38, 13, 23, 25, 27,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([12, 30, 39, 13,  9,  9, 23, 17,  5, 30, 20,  5, 25, 23,  1, 25, 14,
        0,  0,  0,  0,  0,  0,  0,  0], dtype=int32)}, [u'VAJS5J21Rd0

({'predictions': array([ 5, 25, 11, 12,  4, 33, 31, 38,  5, 33, 12, 13, 30,  4, 22, 10,  5,
       38,  5, 23, 20, 11, 36, 19, 26,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([12, 15, 33, 31,  3, 40, 12, 25, 33, 19, 30, 19, 33,  5, 29,  5, 14,
        5,  0], dtype=int32)}, [u'UMhLBPPtlrY', u'00005'])
({'predictions': array([ 5, 25, 11, 12,  4, 33, 31, 38, 13, 25, 23, 14, 25, 19, 26, 30, 19,
       23, 20,  9, 19, 17,  4, 25, 16,  6, 30, 24, 20,  1], dtype=int32), 'train_pred': array([12, 20, 33,  3, 33,  5, 29, 33, 16, 23, 30, 33, 12,  3, 31, 19, 14,
       30, 31], dtype=int32)}, [u'SS8TSf2srSw', u'00006'])
({'predictions': array([ 5, 25, 11,  8, 38,  3, 40, 23,  8, 22, 38, 13, 23, 25, 27,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([12, 20, 38, 35, 25, 19, 10, 23,  7, 33, 24, 19, 24,  8, 33,  1,  1,
        0,  0], dtype=int32)}, [u'SS8TSf2srSw', u'00007'])
({'predictions': array([38,  5, 33, 11, 36,  8, 24, 20, 25

({'predictions': array([18,  7, 11, 36, 38, 20, 31, 29, 20, 22,  5, 29,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([38, 25, 11, 12,  4, 25,  8, 20, 33, 24, 13, 40,  5, 25, 40,  1,  0,
        0,  0,  0,  0,  0,  0,  0,  0], dtype=int32)}, [u'VIgzTLDyObo', u'00004'])
({'predictions': array([ 5, 25, 11,  8, 38,  3, 40, 23,  8, 22, 38, 13, 23, 25, 27,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([12, 27, 12,  8, 33, 38, 21, 20,  5, 23,  5, 40, 39, 27, 11,  4,  0,
        0,  0,  0,  0,  0,  0,  0,  0], dtype=int32)}, [u'VIgzTLDyObo', u'00001'])
({'predictions': array([ 5, 25, 11, 12,  4, 33, 31, 38,  5, 33, 12, 13, 30,  4, 22, 10,  5,
       38,  5, 23, 20, 11, 36, 19, 26,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([12, 20, 11, 36, 38, 25, 13, 33, 12,  6, 36, 12, 13,  9,  3, 12, 11,
        3, 37, 33,  1, 20, 30, 33,  9], dtype=int32)}, [u'VIgzTLDyObo

({'predictions': array([38,  5, 33, 11, 36,  8, 24, 20, 25,  9,  8, 12,  4, 33,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([38,  5, 25, 23, 25,  3, 25, 33, 23, 36, 29,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0], dtype=int32)}, [u'w1R4F9sSoow', u'00006'])
({'predictions': array([19, 33, 38,  3, 40,  5, 25, 33,  6, 23, 38, 15, 40, 23,  8, 22, 12,
       19, 31,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([38, 20, 11, 27, 11, 18,  9, 33, 18, 36, 31,  5, 31, 33, 16, 20, 23,
       12, 19,  1,  5, 33,  1], dtype=int32)}, [u'LEx6K4P4GJc', u'00010'])
({'predictions': array([ 5, 25, 11, 31, 27, 12, 15, 11, 27, 25, 33, 11, 36, 13, 25, 20, 34,
       19, 26,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([12, 25, 19, 16,  5, 25, 28, 37, 20, 14,  4,  8,  7,  6, 12, 19,  8,
        0,  0,  0,  0,  0,  0], dtype=int32)}, [u'LEx6K4P4GJc', u'00005'])
({'predict

({'predictions': array([ 5, 25, 11, 31, 27, 12, 15, 11, 27, 25, 33, 11, 36, 13, 25, 20, 34,
       19, 26,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([12, 27, 12, 33, 31,  5, 25, 23, 36, 39,  9, 30,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0], dtype=int32)}, [u'O6Vayv9FCLM', u'00002'])
({'predictions': array([ 5, 25, 11,  8, 38,  3, 40, 23,  8, 22, 38, 13, 23, 25, 27,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([12, 15, 31, 33, 11, 17, 31, 14, 40,  5,  5, 30, 38,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0], dtype=int32)}, [u'JcJ3MfJNm6g', u'00001'])
({'predictions': array([38,  5, 33, 11, 36,  8, 24, 20, 25,  9,  8, 12,  4, 33,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([38,  7, 11, 40, 12,  4, 33, 29,  3, 31,  5,  9,  5, 23,  1,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0], dtype

({'predictions': array([ 5, 25, 11,  8, 38,  3, 40, 23,  8, 22, 38, 13, 23, 25, 27,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([12, 20, 33, 19, 40, 12, 33, 12,  4, 33, 24, 27, 33, 39, 31, 33, 33,
        1,  0,  0,  0,  0,  0,  0,  0,  0], dtype=int32)}, [u'TGZMSmcuiXM', u'00008'])
({'predictions': array([12,  4, 33, 31,  5, 17, 30, 15, 33, 29, 23, 15, 31, 33, 36,  9, 20,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([12, 33, 31,  5,  3, 33,  5,  5, 25,  8, 22,  4, 11, 16, 25, 23, 13,
       23, 22, 23, 30, 11,  1,  0,  0,  0], dtype=int32)}, [u'TGZMSmcuiXM', u'00004'])
({'predictions': array([18,  7, 11, 36, 38, 20, 11, 36, 12,  4, 33,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1], dtype=int32), 'train_pred': array([38, 20, 25,  4, 37, 33, 36, 17, 25, 19, 14, 12,  5, 33,  1,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0], dtype=int32)}, [u'TGZMSmcuiXM

REF:        DH AE T   W AA Z AH N  T B  IY IH NG AE S  K T            
HYP: AH N D DH AE T S W AH T DH EH R AE K  CH AH W  AH L IY D UW IH NG
EVA: I  I I         I   S  S S  S  S S  S  S  S  S  S  S S  I I  I  I 
WER: 123.53%
[u'UMhLBPPtlrY', u'00002']
REF: SH IY G AA T AH N L AH K IY  
HYP: Y  UW N OW W AH   T AY M IY N
EVA: S  S  S S  S    D S S  S    I
WER: 90.91%
[u's0YjL9rZyR0', u'00008']
REF: B AH T  IH T  S D  IH F  ER AH N T
HYP:   Y  UW N  OW W AH T  AY M  IY N  
EVA: D S  S  S  S  S S  S  S  S  S    D
WER: 92.31%
[u's0YjL9rZyR0', u'00005']
REF: IH T  IH Z IH R  AH S  P AA N S AH B  AH L            
HYP:    AH N  D S  OW DH EY D OW N T D  UW EH N IY TH IH NG
EVA: D  S  S  S S  S  S  S  S S    S S  S  S  S I  I  I  I 
WER: 118.75%
[u's0YjL9rZyR0', u'00004']
REF: IH T  IH Z  R AO NG          
HYP: Y  UW N  OW W AH T  AY M IY N
EVA: S  S  S  S  S S  S  I  I I  I
WER: 157.14%
[u's0YjL9rZyR0', u'00007']
REF: AA R  W IY B    R EY V IH N AH F T UW W EH L K AH M IH T 
HYP: DH AE T S  

REF: W IY K R IY EY T AH D AH D  AY AH G  N AA S  T IH K T  EH S  T                 
HYP:   AH N D DH AE T    S W  EH N  L  ER N IH NG R IH L IY B  IH G AE N F AO R M IY
EVA: D S  S S S  S    D  S S  S  S  S  S    S  S  S    S S  S  S  S I  I I I  I I I 
WER: 116.67%
[u'AO4In7d6XSc', u'00003']
REF: AY L AH V B IH L D  IH NG R  OW B AA T S   
HYP:      AH N D AY W AA Z  L  AY K  W EH L N OW
EVA: D  D    S S S  S S  S  S  S  S  S S  S S I 
WER: 100.00%
[u'SN7wO06Yz1E', u'00003']
REF: W AH T IH Z DH AH AE B S  T R AE K SH AH   N   
HYP:   AH        N  D  AY W AA Z L AY K W  EH L N OW
EVA: D    D D  D S  S  S  S S  S S S    S  S  I   I 
WER: 94.12%
[u'SN7wO06Yz1E', u'00004']
REF: IH T  S S  OW AA B V  IY AH S
HYP: HH AW D UW W  IY D UW DH AE T
EVA: S  S  S S  S  S  S S  S  S  S
WER: 100.00%
[u'SN7wO06Yz1E', u'00005']
REF: Y  UW D OW N T K  W EH S CH AH N DH AE T                   
HYP: HH AW D        UW W IY S    AH P AO R  T AW ER F R EH N D Z
EVA: S  S    D  D D S    S    D     S S  S   

REF: DH EY S AH P  AO R T AH D M  IY AO L  DH    AH W EY TH R    UW IH T 
HYP: AH N  D DH AE T  S W AH T DH EH R  AE K  CH AH W AH L  IY D UW IH NG
EVA: S  S  S S  S  S  S S    S S  S  S  S  S  I       S  S  S  I       S 
WER: 86.96%
[u'iPE2SiCCo0w', u'00013']
REF: AH N D DH IH S R UW T IY N W EH N T AA N F  AO R S  AH M  T AY               M   
HYP: AH N D DH        AE T    S W EH N L ER N IH NG R IH L  IY B IH G AE N F AO R M IY
EVA:           D  D D S    D  S        S S    S  S    S  S  S  S S  I I  I I I  I   I 
WER: 84.62%
[u'QoT0S2vu9m4', u'00001']
REF: DH AE T S AH T EH R AH B AH L W EY S T AH V T  AY M 
HYP: DH AE T S AH   G  R EY T P  L   EY S T      UW B  IY
EVA:              D S    S  S S    D        D  D S  S  S 
WER: 52.38%
[u'BHY0FxzoKZE', u'00002']
REF: AY           T R AY D  EH V     R                   IY TH    IH NG
HYP: AH N D DH AE T S W  AH T  DH EH R AE K CH AH W AH L IY D  UW IH NG
EVA: S  I I I  I    S S  S  S  S  I    I  I I  I  I I  I    S  I       
WER: 166.6

REF: DH IH S IH Z Y  AO R  CH AE N S
HYP: Y  UW N OW W AH T  AY M  IY N  
EVA: S  S  S S  S S  S  S  S  S    D
WER: 91.67%
[u'zMGE3mbS9NY', u'00001']
REF: AE T  DH AH S       EY M      T AY M                  
HYP:    AH N  D  S OW DH EY D OW N T D  UW EH N IY TH IH NG
EVA: D  S  S  S    I  I     S I  I   S  S  I  I I  I  I  I 
WER: 170.00%
[u'uL5XUwfkxZw', u'00003']
REF: DH AE T S Y AO R  G  R AE N   D F  AA DH ER         
HYP: AH N  D S   OW DH EY D OW N T D UW EH N  IY TH IH NG
EVA: S  S  S   D S  S  S  S S    I   S  S  S  S  I  I  I 
WER: 106.25%
[u'uL5XUwfkxZw', u'00002']
REF: AH N D    W ER K L AY K AH D  AO G   
HYP: AH N D AY W AA Z L AY K W  EH L  N OW
EVA:        I    S  S        S  S  S  S I 
WER: 61.54%
[u'uL5XUwfkxZw', u'00007']
REF: DH EY      W ER AO L  W  AY T
HYP: HH AW D UW W IY D  UW DH AE T
EVA: S  S  I I    S  S  S  S  S   
WER: 100.00%
[u'uL5XUwfkxZw', u'00006']
REF: AH N  D  DH AE T  S AO L           
HYP: TH AE NG K  Y  UW V EH R IY M AH CH
EVA: S  S  S  S  S  S

REF: K  UH D  AY G OW DH EH R           
HYP: TH AE NG K  Y UW V  EH R IY M AH CH
EVA: S  S  S  S  S S  S       I  I I  I 
WER: 122.22%
[u'c0KYU2j0TM4', u'00001']
REF: DH AE T  IH Z G       R EY T      
HYP: TH AE NG K  Y UW V EH R IY M AH CH
EVA: S     S  S  S S  I I    S  S I  I 
WER: 122.22%
[u'2SlBiFZ85d0', u'00008']
REF: S OW DH AE T S W      AH   T AY M D UW IH NG     
HYP:         IH T   W AA Z AH N T AO L W EY Z  IY Z IY
EVA: D D  D  S    D   I  I    I   S  S S S  S  S  I I 
WER: 106.67%
[u'XYNUpQrZISc', u'00001']
REF: W  AY IH Z    DH AE T HH AA R D                    
HYP: AH N  D  S OW DH EY D OW N  T D UW EH N IY TH IH NG
EVA: S  S  S  S I     S  S S  S  S   I  I  I I  I  I  I 
WER: 154.55%
[u'y1KIVZw7Jxk', u'00005']
REF: Y UW N  OW AH B  AW T  N AH M  B ER Z
HYP:      HH AW D  UW W  IY S P  IY K AH P
EVA: D D  S  S  S  S  S  S  S S  S  S S  S
WER: 100.00%
[u's4smjT1qwZU', u'00002']
REF: DH EH R Z  N OW S T R AH K CH ER
HYP: HH AW D UW W IY S   P IY K AH P 
EVA: S  S  S S  

REF: D UW DH EY HH AE V EY D Z   
HYP: Y UW N  OW W  AH T AY M IY N
EVA: S    S  S  S  S  S S  S S  I
WER: 100.00%
[u'Sew0bn8mGAA', u'00007']
REF: D  UW DH EY HH AE V AH K AA M AH N K OW L D 
HYP: DH AE T  S  AH G  R EY T P  L EY S T UW B IY
EVA: S  S  S  S  S  S  S S  S S  S S  S S S  S S 
WER: 100.00%
[u'Sbm98rrVZzE', u'00001']
REF: B AH T AY N  IY D Y  AO R HH EH L  P
HYP: W AH T D  UW AY M IY N  B AY DH AE T
EVA: S      S  S  S  S S  S  S S  S  S  S
WER: 85.71%
[u'QKHkSUiQboA', u'00004']
REF: N OW B  AA D IY K  AH M  Z T UW G EH T F EY M AH S 
HYP:      AH N  D    AY W  AA Z      L AY K W EH L N  OW
EVA: D D  S  S    D  S  S  S    D D  S S  S S S  S S  S 
WER: 90.00%
[u'RD07GkmM2fc', u'00004']
REF: AH N D W IY F AY N AH L IY F IH G Y ER D AW T HH AW T UW D  UW IH T 
HYP: AH N D        AY                       W AA Z L  AY K W  EH L  N  OW
EVA:        D D  D    D D  D D  D D  D D D  S S  S S  S  S S  S  S  S  S 
WER: 85.19%
[u'QKHkSUiQboA', u'00007']
REF: HH AW AA R  W IY G OW IH NG

REF: AY W  AA Z L  AY K  W AH        T                   
HYP:    HH AW D UW W  IY S AH P AO R T AW ER F R EH N D Z
EVA: D  S  S  S S  S  S  S    I I  I   I  I  I I I  I I I
WER: 190.00%
[u'hiIcwt88o94', u'00006']
REF: AH N D AY        W AA Z L AY K OW DH AH AE M AH Z AA      N            
HYP: AH N D DH AE T S W EH N L ER N IH NG R  IH L IY B IH G AE N F AO R M IY
EVA:        S  I  I I   S  S   S  S S  S  S  S  S S  S S  I I    I I  I I I 
WER: 121.05%
[u'hiIcwt88o94', u'00001']
REF: AY JH AH       S T  D IH S  AH G R  IY W  IH DH DH IH S
HYP: AH N  D  HH IY S EH D W  EH L  Y UW F  AW N  D  DH EH M
EVA: S  S  S  I  I    S    S  S  S  S S  S  S  S  S     S  S
WER: 94.44%
[u'8VwTZFYSfvw', u'00002']
REF: IH T W AA Z  DH AH K OW L D AH S T  AY V EH V ER B IH      N            
HYP: AH N D DH AE T  S  W EH N L ER N IH NG R IH L IY B IH G AE N F AO R M IY
EVA: S  S S S  S  S  S  S S  S S S  S S  S  S S  S S       I I    I I  I I I 
WER: 118.18%
[u'hiIcwt88o94', u'00008']
REF: DH EY G       

REF: DH EY G  EH T S K  EH R  D      T UW W  AO K ER AW N  D 
HYP:       AH N  D S OW DH EY D OW N T D  UW EH N IY TH IH NG
EVA: D  D  S  S  S   S  S  S    I  I   S  S  S  S S  S  S  S 
WER: 94.74%
[u'J8FyHI00ELY', u'00006']
REF: AY M T R  AY IH NG T  UW B  IH K  AH M AH B  EH T ER P ER S  AH N 
HYP:        AH N  D  S  OW DH EY D  OW N  T D  UW EH      N IY TH IH NG
EVA: D  D D S  S  S  S  S  S  S  S  S  S  S S  S     D D  S S  S  S  S 
WER: 95.83%
[u'J8FyHI00ELY', u'00001']
REF: DH AE T S  W AH T  HH AE P AH N  D W IH DH M AY D  AE D  
HYP: HH AW D UW W    IY S  AH P    AO R T AW ER F R  EH N  D Z
EVA: S  S  S S    D  S  S  S    D  S  S S S  S  S S  S  S    I
WER: 90.48%
[u'J8FyHI00ELY', u'00002']
REF: M  AY D AE D  HH AE Z AE L Z HH AY M ER Z D IH Z IY Z 
HYP: AH N  D    AY W  AA Z    L      AY      K W EH L N  OW
EVA: S  S    D  S  S  S    D    D D     D D  S S S  S S  S 
WER: 80.95%
[u'nCg2NcGYu34', u'00012']
REF: HH IY SH UH D  N AA T HH AE V B  IH N S  ER P R  AY Z D 
HYP:       

REF: IH T S AH B AW T DH AH D EY T  AH     
HYP: AH N D AY W AA Z L  AY K W  EH L  N OW
EVA: S  S S S  S S  S S  S  S S  S  S  I I 
WER: 115.38%
[u'7gkdzkVbuVA', u'00003']
REF: HH AW S AE D AA R Y UW R AY T  N AW     
HYP: IH T  W AA Z AH N T AO L W  EY Z IY Z IY
EVA: S  S  S S  S S  S S S  S S  S  S S  I I 
WER: 114.29%
[u'SS8TSf2srSw', u'00006']
REF: SH IY G AA T  AH S AH P  AO              R T                 N EH T  W ER K 
HYP: AH N  D DH AE T  S W  EH N  L ER N IH NG R IH L IY B IH G AE N F  AO R M  IY
EVA: S  S  S S  S  S    S  S  S  I I  I I  I    S  I I  I I  I I    S  S  S S  S 
WER: 144.44%
[u'wG4RnDNWtJo', u'00003']
REF: W IY L AO S  T V  AE L Y  AH B AH L D EY T AH
HYP:   AH N D  AY W AA Z  L AY K  W EH L      N OW
EVA: D S  S S  S  S S  S    S  S  S S    D D  S S 
WER: 88.89%
[u'4AYE3w5TWHs', u'00006']
REF: Y  UW    K AE N  T    R AY IH T    
HYP: TH AE NG K Y  UW V EH R IY M  AH CH
EVA: S  S  I    S  S  S I    S  S  S  I 
WER: 110.00%
[u'd3syTrElgcg', u'00001']
REF: L UH

REF: AH N D B  OY DH EY W        ER              L AY K W  AH T               
HYP: AH N D DH AE T  S  W EH N L ER N IH NG R IH L IY B IH G  AE N F AO R M IY
EVA:        S  S  S  S    I  I I    I I  I  I I    S  S S  S  S  I I I  I I I 
WER: 153.33%
[u'BImnT7lcLDE', u'00006']
REF: B AH T   DH AE T D AH Z AH N T M IY N DH AE T W    IY K      AE N T          
HYP:   AH N D DH AE T   S  W EH N   L ER N IH NG R IH L IY B IH G AE N F AO R M IY
EVA: D    S I         D S  S S    D S S    S  S  S S  I    S I  I      S I  I I I 
WER: 95.83%
[u'PbgB2TaYhio', u'00007']
REF: IH T   S JH AH S  T N  AA T P AA S  AH B  AH L    
HYP: AH N D S OW DH EY D OW N  T D UW EH N  IY TH IH NG
EVA: S  S I   S  S  S  S S  S    S S  S  S  S  S  S  I 
WER: 100.00%
[u'0Fi83BHQsMA', u'00002']
REF: W IY R  IH L IY D OW N  T W AO K EH N IY M  AO R 
HYP:   DH AE T  S AH G R  EY T   P  L EY S T  UW B  IY
EVA: D S  S  S  S S  S S  S    D S  S S  S S  S  S  S 
WER: 94.74%
[u'0Fi83BHQsMA', u'00004']
REF:           AE T S A

REF: S OW AY L UH K  T F AO R  F   ER N IH CH ER                                
HYP:   AH N  D DH AE T S W  EH N L ER N IH NG R  IH L IY B IH G AE N F AO R M IY
EVA: D S  S  S S  S    S S  S  S I         S  S  I  I I  I I  I I  I I I  I I I 
WER: 162.50%
[u'jPf0LjZAS14', u'00004']
REF: HH AW SH UH D  W IY L AY V     
HYP:    Y  UW N  OW W AH T AY M IY N
EVA: D  S  S  S  S    S  S    S I  I
WER: 100.00%
[u'fIICVeGW4RY', u'00003']
REF: B AH T IH V EH N  CH AH W AH L IY DH EY D  IH D K AH M ER AW N  D
HYP:   AH   N  D DH AE T  S  W AH      T  W  AA Z  D R AY V IH NG IH T
EVA: D    D S  S S  S  S  S       D D  S  S  S  S    S S  S S  S  S  S
WER: 84.00%
[u'7uN4I1wEOXE', u'00001']
REF: AY HH AE V T  UW           T EY K  AH Y IH R  AO F 
HYP: AH N  D  S OW DH EY D OW N T D  UW EH N IY TH IH NG
EVA: S  S  S  S S  S  I  I I  I   S  S  S  S S  S  S  S 
WER: 120.00%
[u'qWNae7vYK6s', u'00004']
REF: AY W  AA Z P  EH T  R AH F  AY D
HYP:    HH AW D UW W  IY D UW DH AE T
EVA: D  S  S  S S  S  S  S 

REF: AY AE K CH AH W AH L IY HH AE V N  OW AY D  IY AH W  AY
HYP:    IH T W  AA Z AH N T  AO L  W EY Z  L  AY K  DH IH S 
EVA: D  S  S S  S  S    S S  S  S  S S  S  S  S  S  S  S  S 
WER: 95.00%
[u'7OMLSs8t1ng', u'00014']
REF: N OW W AH N  Z EH V ER S EH D DH AE T B  IH F AO R         
HYP:        HH AW D UW W IY S AH P AO R  T AW ER F    R EH N D Z
EVA: D D  D S  S  S S  S S    S  S S  S    S  S    D    I  I I I
WER: 100.00%
[u'3uSQlcGCHUU', u'00015']
REF: S OW AY M IH N DH AH M IY D IY AH  
HYP:   HH AW D UW W IY S  P IY   K  AH P
EVA: D S  S  S S  S S  S  S    D S     I
WER: 92.31%
[u'9CqVYUOjHLw', u'00002']
REF: IH T S DH AH W  AO R AA N T      EH R ER         
HYP: AH N D S  OW DH EY D OW N T D UW EH N IY TH IH NG
EVA: S  S S S  S  S  S  S S      I I     S S  I  I  I 
WER: 114.29%
[u'M7KSnq0xkWY', u'00001']
REF:            W IY HH AE V T  UW TH R  OW IH T AH W EY  
HYP: HH AW D UW W IY S  AH P AO R  T  AW ER F  R EH N D  Z
EVA: I  I  I I       S  S  S S  S  S  S  S  S  S S  S S  I

REF: IH T  IH Z  IH N  DH AH W  AO T  ER
HYP: HH AW D  UW W  IY S  P  IY K  AH P 
EVA: S  S  S  S  S  S  S  S  S  S  S  S 
WER: 100.00%
[u'OYfoGTIG7pY', u'00007']
REF: R  IH M EH M  B ER W AH T AY S  EH D  M  AY    L AH V                      
HYP: AH N  D DH AE T S  W EH N L  ER N  IH NG R  IH L IY B IH G AE N F AO R M IY
EVA: S  S  S S  S  S S    S  S S  S  S  S  S  S  I    S  S I  I I  I I I  I I I 
WER: 142.11%
[u'SS8TSf2srSw', u'00003']
REF:        DH AE T   W AA Z HH ER F AY T                           
HYP: AH N D DH AE T S W AH T DH EH R AE K CH AH W AH L IY D UW IH NG
EVA: I  I I         I   S  S S  S  S S  S I  I  I I  I I  I I  I  I 
WER: 190.91%
[u'iPE2SiCCo0w', u'00002']
REF: S OW AY TH AO T W AY N AA    T P AH N IH SH M AY S EH L  F   
HYP:         DH AE T S AH G R  EY T P            L EY S T  UW B IY
EVA: D D  D  S  S    S S  S S  I      D  D D  D  S S    S  S  S I 
WER: 90.91%
[u'UZmXwOgNq7c', u'00005']
REF: T AO K T  UW F  AA R  M ER Z
HYP: Y UW N OW W  AH T  AY M IY N

REF: AH N D DH IH S L  IY D  Z T UW AA D AH T IY Z 
HYP: AH N D         AY W  AA Z L AY K  W EH L N  OW
EVA:        D  D  D S  S  S    S S  S  S S  S S  S 
WER: 77.78%
[u'rGfhahVBIQw', u'00002']
REF: B AH T DH IH S IH Z DH AH S AO R  S AH V  IH N AH V EY SH AH N 
HYP:   AH N D  AY W AA Z         L  AY K W  EH L  N               OW
EVA: D    S S  S  S S    D  D  D S  S  S S  S  S    D  D D  D  D  S 
WER: 87.50%
[u'vXPJVwwEmiM', u'00006']
REF: M OW S T IH N  D IH P EH N D AH N T  M  IY D    IY AH W  AA Z  SH AH T  D AW N 
HYP:   AH N D DH AE T S  W EH N L ER N IH NG R  IH L IY B  IH G  AE N  F  AO R M  IY
EVA: D S  S S S  S  S S  S      S S    S  S  S  S  I    S  S  S  S  S  S  S  S S  S 
WER: 89.66%
[u'eVFYhbHpfqU', u'00001']
REF: P  L IY Z  S AY N HH IY R R AY T N AW
HYP: AH N D  AY W AA Z L  AY K W EH L N OW
EVA: S  S S  S  S S  S S  S  S S S  S   S 
WER: 93.33%
[u'3vfuCPFb8wk', u'00001']
REF: B AH T IH T S M AO R  K AA M P L AH K EY T  AH D   
HYP:   AH          N D  AY W AA   Z L AY

REF: SH IY      W ER K           T IH N  AH B AE NG K  
HYP: HH AW D UW W IY S AH P AO R T AW ER F  R EH N  D Z
EVA: S  S  I I    S  S I  I I  I   S  S  S  S S  S  S I
WER: 138.46%
[u'qsJYrwzfd6w', u'00007']
REF: S OW W  AY D OW N T  W IY L EH V  ER IH JH DH AE T 
HYP:      AH N  D      AY W AA Z L  AY K  W  EH L  N  OW
EVA: D D  S  S    D  D S    S  S S  S  S  S  S  S  S  S 
WER: 89.47%
[u'k2hQL9Zrokk', u'00006']
REF: AY HH AE V  T UW R IY      P AO R T IH T              
HYP:    HH    AW D UW W IY S AH P AO R T AW ER F R EH N D Z
EVA: D     D  S  S    S    I I           S  S  I I I  I I I
WER: 107.14%
[u'k2hQL9Zrokk', u'00001']
REF:      T EY K     M AY CH   AY L  D   
HYP: W AH T D  UW AY M IY N  B AY DH AE T
EVA: I I    S  S  I    S  S  I    S  S  I
WER: 122.22%
[u'k2hQL9Zrokk', u'00003']
REF: DH EY D OW N  T  G EH T DH EH R M AH      N IY AW T   
HYP: AH N  D    HH IY S EH D W  EH L Y UW F AW N D  DH EH M
EVA: S  S    D  S  S  S    S S     S S S  I I    S  S  S  I
WER: 94.44%
[u'H

REF:        S OW W  AH        T D IH D  DH EY D  UW   
HYP: AH N D S OW DH EY D OW N T D UW EH N  IY TH IH NG
EVA: I  I I      S  S  I I  I     S  S  S  S  S  S  I 
WER: 125.00%
[u'GGScU5ESei4', u'00005']
REF: DH AH OY L W IH L    N EH V  ER R AH N AW T   
HYP:       IH T W AA Z AH N T  AO L  W EY Z IY Z IY
EVA: D  D  S  S   S  S I    S  S  S  S S  S S  S I 
WER: 100.00%
[u'Sa27SUR0Mlo', u'00008']
REF:      D EY    T AH D R  AY V  Z  B EH T ER D  IH S  IH ZH AH N  Z    
HYP: AH N D DH AE T S  W AH T  DH EH R AE K CH AH W  AH L  IY D  UW IH NG
EVA: I  I   S  I    S  S S  S  S  S  S S  S S  S  S  S  S  S  S  S  S  I 
WER: 109.52%
[u'RD07GkmM2fc', u'00001']
REF: IH N F  AE K T IH T S M  AO R L AY K F AY V P ER    S EH N  T   
HYP:      DH AE        T S AH G  R          EY T P L  EY S T  UW B IY
EVA: D  D S     D D D      S  S    D D  D D S  S   S  I    S  S  S I 
WER: 83.33%
[u'0gks6ceq4eQ', u'00013']
REF: AY    N OW        AY HH AE V
HYP: Y  UW N OW W AH T AY M  IY N
EVA: S  I       I I 

REF: K  L AH S  T  ER B AA M  Z L EH F T               B IH HH AY N D          
HYP: AH N D  DH AE T  S W  EH N L ER N IH NG R IH L IY B IH G  AE N F AO R M IY
EVA: S  S S  S  S  S  S S  S  S   S  S S  I  I I  I I       S  S    S I  I I I 
WER: 125.00%
[u'tiSiSM3pk5M', u'00003']
REF: W AH T K AE      N AY D  UW AH B  AW T  IH T 
HYP:   IH T W AA Z AH N T  AO L  W  EY Z  IY Z  IY
EVA: D S    S S  I I    S  S  S  S  S  S  S  S  S 
WER: 100.00%
[u'SWvJxasiSZ8', u'00009']
REF: AH N D DH AE T S  AE K CH AH W AH L IY W AH T HH AE P AH N D 
HYP: AH N D    AY W AA Z  L AY K  W EH L                      N OW
EVA:        D  S  S S  S  S S  S    S    D  D D  D D  D  D D    S 
WER: 75.00%
[u'aSavBsX60Ss', u'00001']
REF: AH N  D  IH T D  AH Z  AH N  T      
HYP: TH AE NG K  Y UW V  EH R  IY M AH CH
EVA: S  S  S  S  S S  S  S  S  S  S I  I 
WER: 118.18%
[u'SWvJxasiSZ8', u'00004']
REF: B AH T P IY P  AH L D IH D N AA T R IH L EY T  T UW DH IH S
HYP:   AH N D DH AE T  S W AH T W AA     Z  D R  AY V IH

REF:            W      AH T W  UH D HH AE P AH    N    
HYP: HH AW D UW W IY S AH P AO R  T AW ER F R  EH N D Z
EVA: I  I  I I    I  I    S S  S  S S  S  S S  I    I I
WER: 154.55%
[u'5b5BDoddOLA', u'00002']
REF: W IY W ER AH N T K  AH N S IH D      ER D DH AH R IY L TH   IH NG                 
HYP:           AH N D DH AE T S W  EH N L ER N IH NG R IH L IY B IH G  AE N F AO R M IY
EVA: D D  D D       S S  S  S   S  S  I I    S S  S    S    S  I    S  I  I I I  I I I 
WER: 113.04%
[u'5b5BDoddOLA', u'00001']
REF: S OW DH AE T W AA Z    IY Z IY           
HYP:      W  AH T D UW AY M IY N B  AY DH AE T
EVA: D D  S  S    S S  S  I    S S  I  I  I  I
WER: 127.27%
[u'0iIh5YYDR2o', u'00001']
REF: AY M  N AA T  AH S OW SH AH L P     R AA B L  AH M G  AY              
HYP:    AH N D  DH AE T S  W  AH T DH EH R AE K CH AH W AH L  IY D UW IH NG
EVA: D  S    S  S  S  S S  S     S S  I    S  S S     S S  S  I  I I  I  I 
WER: 110.00%
[u'VAJS5J21Rd0', u'00003']
REF: AA R DH EY AH P  AH T EH N SH AH L

REF:                S OW      W AA Z M AH DH ER T ER IY S  AH
HYP: AH N D DH AE T S W  AH T W AA Z   D  R  AY V IH NG IH T 
EVA: I  I I I  I  I   S  I  I        D S  S  S  S S  S  S  S 
WER: 128.57%
[u'j7fRIGphgtk', u'00004']
REF: AY HH AE D  T UW HH AE V  F   EY TH               
HYP: IH T  W  AA Z AH N  T  AO L W EY Z  L AY K DH IH S
EVA: S  S  S  S  S S  S  S  S  S I    S  I I  I I  I  I
WER: 150.00%
[u'eVFYhbHpfqU', u'00011']
REF: IH T  G  OW Z         R AO NG      
HYP: TH AE NG K  Y UW V EH R IY M  AH CH
EVA: S  S  S  S  S I  I I    S  S  I  I 
WER: 150.00%
[u'OzA6jRYjVQs', u'00009']
REF: W  IY JH AH S  T N IY D  T UW AE K T AA N          IH T
HYP: AH N  D  DH AE T S W  AH T W  AA Z D R  AY V IH NG IH T
EVA: S  S  S  S  S    S S  S    S  S  S S S  S  I I  I      
WER: 94.44%
[u'Gmai4zkKNcM', u'00003']
REF: AH N D W IY K  UH D AH N T JH AH S T T  ER N IH T  AO F 
HYP: AH N D S OW DH EY D OW N T         D UW EH N IY TH IH NG
EVA:        S S  S  S    S      D  D  D S S  S    S  S  S

In [31]:
print(test_ed, test_accuracies, test_predictions, test_gt)

(106.80143173420696, 0.5833333333333334, [array([ 3, 11, 23,  5, 37, 33, 36, 18, 20, 30,  5,  6,  5, 24, 13, 36],
      dtype=int32), array([33, 31,  3, 40, 12, 25, 33, 19, 30, 19, 33,  5, 29,  5, 14],
      dtype=int32), array([31,  3, 33, 16,  9, 19,  8, 37, 33], dtype=int32), array([33, 38, 33, 31,  5, 19, 16, 14,  5, 25, 33], dtype=int32), array([31, 40, 25, 22, 24, 16,  5, 30, 30, 31,  5,  9, 23, 23],
      dtype=int32), array([31, 40, 25, 19, 16], dtype=int32), array([39, 20,  9, 19, 27, 22, 11, 26, 19, 29, 15,  5, 39, 14, 25,  1,  1,
       24, 16, 26], dtype=int32), array([31, 19, 40,  5,  3, 33,  5, 31, 14, 30, 14, 15, 25,  1, 37,  1, 25,
       20, 24, 19, 25], dtype=int32), array([18, 25,  6, 22, 19, 26,  4,  6, 30,  5,  5, 25, 12, 19, 26,  1,  4,
       30], dtype=int32), array([33,  8,  5, 33, 31, 20, 23, 20, 12, 22, 31, 22, 33,  6, 36, 20, 23,
       25, 20, 38,  6, 30], dtype=int32), array([22,  3, 11, 25, 20,  5, 25,  5, 40, 26, 22, 23, 25,  5,  5,  1,  6,
       13, 24

## Play