# Seq2Seq - Keras - Tensorflow

Dataset: [CMU Dict](http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b)
An introductory Seq2Seq model that learns to spell English words from their phoneme sequences (phonemes in, letters out).

In [1]:
%matplotlib inline
import os
import re
import warnings
warnings.filterwarnings(action='ignore')

import numpy as np
import tensorflow as tf
import keras, keras.backend as K
from keras.layers import *
from keras.models import Model
from keras.optimizers import Adam

from sklearn.model_selection import train_test_split
from keras_tqdm import TQDMNotebookCallback

# Relative directory prefix that is prepended to the dataset file name when
# the CMU dictionary is opened.
Path = 'data/'

Using TensorFlow backend.


In [2]:
# Let TensorFlow grow its GPU memory allocation on demand (allow_growth)
# instead of reserving the memory up front; note this is not a hard cap.
# The configured session is then installed as the one Keras uses.
cfg = K.tf.ConfigProto()
cfg.gpu_options.allow_growth = True
K.set_session(K.tf.Session(config=cfg))

In [3]:
# Keyword arguments forwarded to model.fit: Keras' own console logging is
# switched off and progress is reported through the tqdm notebook callback.
parms = dict(
    verbose=0,
    callbacks=[TQDMNotebookCallback(leave_inner=True)],
)
# Empty dict of LSTM keyword arguments.
lstm_params = dict()

## Preprocessing

In [4]:
# Read the CMU dictionary, keeping only the lines that start with A-Z.
# Each kept line is "<WORD><two spaces><phonemes>"; splitting at most once
# guarantees exactly two fields even if a double space appears later on.
# The `with` block closes the file handle as soon as parsing is done.
with open(Path+'cmudict-0.7b', encoding='latin1') as cmudict_file:
    lines = [l.strip().split("  ", 1) for l in cmudict_file
             if re.match('^[A-Z]', l)]
# Turn each [word, "P1 P2 ..."] pair into (word, [P1, P2, ...]).
lines = [(w, ps.split()) for w, ps in lines]
lines[0]

('A', ['AH0'])

In [5]:
# Sorted vocabulary of the distinct phoneme symbols found in `lines`.
# "_" is placed at index 0 because index 0 is what padding maps to once the
# sequences are tokenised.
phonemes = ["_", *sorted({p for _, ps in lines for p in ps})]
len(phonemes)

70

In [6]:
# Symbol -> integer id lookup tables for phonemes and for letters.
# In both vocabularies "_" sits at position 0.
p2i = {phoneme: idx for idx, phoneme in enumerate(phonemes)}
letters = "_abcdefghijklmnopqrstuvwxyz*"
l2i = {letter: idx for idx, letter in enumerate(letters)}

In [7]:
maxlen = 15
# Lower-cased word -> list of phoneme ids.
# Constraint: only words of 5..maxlen characters consisting purely of A-Z
# are kept.
pronounce_dict = dict(
    (w.lower(), [p2i[p] for p in ps])
    for w, ps in lines
    if (5 <= len(w) <= maxlen) and re.match("^[A-Z]+$", w)
)
len(pronounce_dict)

108006

In [8]:
# Length of the longest phoneme sequence among the kept words.
maxlen_p = max(len(ids) for ids in pronounce_dict.values())
maxlen_p

16

In [9]:
# `words` holds the words of the filtered dictionary in shuffled order.
# A dedicated fixed-seed generator makes the shuffle repeatable across kernel
# restarts; sorting the keys first removes any dependence on dict ordering.
shuffle_rng = np.random.RandomState(0)
words = shuffle_rng.permutation(sorted(pronounce_dict))
n = len(words)

# Zero-initialised, so every position that is not overwritten below keeps
# id 0, which is the "_" padding entry in both vocabularies.
input_ = np.zeros((n, maxlen_p), np.int32)
labels_ = np.zeros((n, maxlen), np.int32)

# Row i: phoneme ids of word i (model input) and its letter ids (target),
# each written left-aligned with one slice assignment.
for i, k in enumerate(words):
    phoneme_ids = pronounce_dict[k]
    input_[i, :len(phoneme_ids)] = phoneme_ids
    labels_[i, :len(k)] = [l2i[ch] for ch in k]
        

In [10]:
# Hold out 10% of the examples. random_state pins the split, so for the same
# input arrays the same rows are held out on every run.
(input_train, input_test, labels_train, labels_test) = train_test_split(
    input_, labels_, test_size=0.1, random_state=0)

In [11]:
# Sizes of the phoneme (input) and letter (output) vocabularies.
input_vocab_size = len(phonemes)
output_vocab_size = len(letters)
input_vocab_size, output_vocab_size

(70, 28)

## Model

In [13]:
# Number of units passed to every LSTM layer created by get_rnn.
dim = 240

In [14]:
def get_rnn(return_sequences=True):
    """Create a new LSTM layer with `dim` units and 0.1 input/recurrent dropout.

    return_sequences: forwarded unchanged to the LSTM constructor.
    Returns the freshly constructed layer (not yet applied to any tensor).
    """
    lstm_kwargs = {
        'dropout': 0.1,
        'recurrent_dropout': 0.1,
        'implementation': 2,
        'return_sequences': return_sequences,
    }
    return LSTM(dim, **lstm_kwargs)

In [None]:
# Input: padded phoneme-id sequences of length maxlen_p.
# After the embedding the tensor is (samples, maxlen_p, 120).
inp = Input((maxlen_p,))
x = Embedding(input_vocab_size, 120)(inp)

# Encoding stage - form a representation of all the information in the
# phoneme sequence as a whole.
# Bidirectional creates a second RNN that reads the reversed input; its output
# is concatenated with the output of the forward RNN.
x = Bidirectional(get_rnn())(x)
# return_sequences=False: emit one vector per example instead of one per timestep.
x = get_rnn(False)(x)

# Decoding stage
# RepeatVector is used because an RNN in Keras expects a sequence of inputs
# rather than a single hidden-state representation. Repeating also presents
# every decoder timestep with the same vector, so the RNN does not have to
# keep track of the whole word that needs to be decoded on its own.
x = RepeatVector(maxlen)(x)
x = get_rnn()(x)
x = get_rnn()(x)
# Per-timestep softmax over the letter vocabulary.
x = TimeDistributed(Dense(output_vocab_size, activation='softmax'))(x)

In [16]:
# Tie the phoneme input to the per-timestep letter distribution.
model = Model(inputs=inp, outputs=x)

In [17]:
# Print each layer's output shape and parameter count.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 16)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 16, 120)           8400      
_________________________________________________________________
bidirectional_1 (Bidirection (None, 16, 480)           693120    
_________________________________________________________________
lstm_2 (LSTM)                (None, 240)               692160    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 15, 240)           0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 15, 240)           461760    
_________________________________________________________________
lstm_4 (LSTM)                (None, 15, 240)           461760    
__________

In [None]:
# Targets are integer letter ids (not one-hot), hence the sparse variant of
# categorical cross-entropy; Adam runs with its default settings.
model.compile(optimizer=Adam(),
              loss='sparse_categorical_crossentropy',
              metrics=['acc'])

## Train and Evaluation

In [None]:
# Train for 3 epochs with batches of 128. np.expand_dims adds a trailing axis
# of size 1 to the integer targets before they are handed to the loss.
# validation_data is passed as a tuple (x_val, y_val), which is the form the
# Keras documentation specifies for Model.fit.
hist = model.fit(
    input_train, np.expand_dims(labels_train, -1),
    validation_data=(input_test, np.expand_dims(labels_test, -1)),
    batch_size=128, epochs=3, **parms)

In [18]:
# Reuse previously saved weights when the file exists; otherwise persist the
# weights of the model trained above so that later runs can load them.
# This replaces a commented-out save followed by an unconditional load, which
# fails whenever the weights file is not already on disk.
weights_path = 'models/seq2seq_rnn.h5'
if os.path.exists(weights_path):
    model.load_weights(weights_path)
else:
    os.makedirs(os.path.dirname(weights_path), exist_ok=True)
    model.save_weights(weights_path)

In [19]:
# Model outputs for the held-out phoneme sequences.
preds = model.predict(x=input_test, batch_size=128)

In [20]:
# Index of the highest-scoring entry along axis 2 for every output position.
predict = preds.argmax(axis=2)

In [21]:
# Whole-word accuracy: a row counts as correct only when every position
# (padding included) equals the label.
print('Accuracy', np.mean((labels_test == predict).all(axis=1)))

Accuracy 0.29932413665401353


In [23]:
def evaluate(predict, n=20):
    """Print phonemes, predicted spelling and true spelling side by side.

    predict: per-example sequences of predicted letter ids, aligned row by
        row with the module-level `input_test` and `labels_test`.
    n: maximum number of rows to print. Defaults to 20, the count that used
        to be hard-coded. Fewer rows are printed when fewer examples are
        available, instead of raising IndexError.
    """
    print('Phonemes_________________________________predictions______label')
    n_rows = min(n, len(predict), len(input_test), len(labels_test))
    for index in range(n_rows):
        phoneme = '-'.join(phonemes[p] for p in input_test[index])
        prediction = ''.join(letters[l] for l in predict[index])
        real = ''.join(letters[l] for l in labels_test[index])
        # Strip the "_" padding symbols (and the "-" separators joined onto
        # them) from the ends before aligning the columns.
        print(phoneme.strip('-_').ljust(40), prediction.strip('_').ljust(14),
              real.strip('_'))

In [24]:
# Show sample predictions next to the true spellings.
evaluate(predict)

Phonemes_________________________________predictions______label
P-EH0-K-AO1-R-AH0                        pecora         pecora
N-AH1-JH-IH0-NG                          nudging        nudging
HH-AO1-R-W-IH0-T-S                       horwitz        horwitz
K-Y-UW1-K-ER0                            kuker          kuker
L-IH1-V-R-IY0-D                          livride        liveried
D-IH0-V-IY1-T-AH0                        divita         devita
W-IY1-V-AH0-L                            weevll         weavil
F-L-AH1-K-CH-AH0-W-EY2-T-IH0-D           fluctuated     fluctuated
B-IH1-S-IH0-T                            bisset         bissett
S-W-EH1-T-S-UW2-T                        swettsut       sweatsuit
K-R-AY1-T-S                              crites         crites
F-EH1-N-D-R-IH0-K                        fendrick       fendrick
B-EY1-ER0                                bayer          bayar
T-OW2-T-AE2-L-AH0-T-EH1-R-IY0-AH0-N-IH2-Z-AH0-M totilatatiiism totalitarianism
T-R-UW1-AH0-N-T           