# Seq2Seq - Keras - Tensorflow

Dataset: [CMU Dict](http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b)

An introductory Seq2Seq model that takes a word's phoneme sequence as input and predicts its spelling as output.

In [1]:
%matplotlib inline
import warnings
warnings.filterwarnings(action='ignore')

import numpy as np
import tensorflow as tf
import keras, keras.backend as K
from keras.layers import *
from keras.models import Model
from keras.optimizers import Adam


import re
from sklearn.model_selection import train_test_split
from keras_tqdm import TQDMNotebookCallback

# Directory prefix (including the trailing slash) that is string-concatenated
# in front of data file names when they are opened below.
Path = 'data/'

Using TensorFlow backend.


In [2]:
## Let TensorFlow grow its GPU memory allocation on demand instead of
## reserving it all up front. Note that allow_growth does not impose an
## upper limit on usage.
cfg = K.tf.ConfigProto()
cfg.gpu_options.allow_growth = True
# Register a session built from this config as the one the Keras backend uses.
K.set_session(K.tf.Session(config=cfg))

In [3]:
# Extra keyword arguments unpacked into model.fit() below: Keras' own console
# logging is switched off (verbose=0) and a TQDM notebook callback is attached.
parms = {'verbose': 0, 'callbacks': [TQDMNotebookCallback(leave_inner=True)]}
# NOTE(review): lstm_params is not referenced in the visible part of the notebook.
lstm_params = {}

## Preprocessing

In [4]:
# Read the CMU dictionary, keeping only the lines that start with A-Z, and split
# each kept line on the double space into [word, phoneme string].
# The context manager closes the file handle as soon as the list is built; the
# previous bare open() call left it open until garbage collection.
with open(Path+'cmudict-0.7b', encoding='latin1') as dict_file:
    lines = [l.strip().split("  ") for l in dict_file
             if re.match('^[A-Z]', l)]
# Split every entry into (word, [phoneme, phoneme, ...])
lines = [(w, ps.split()) for w, ps in lines]
lines[0]

('A', ['AH0'])

In [5]:
# Phoneme vocabulary: every distinct phoneme found in `lines`, in sorted order.
# "_" is placed at index 0 so that the zero used for padding decodes to "_".
unique_phonemes = {phoneme for _, entry_phonemes in lines for phoneme in entry_phonemes}
phonemes = ["_"] + sorted(unique_phonemes)
len(phonemes)

70

In [6]:
# Lookup tables from symbol to integer index, one for phonemes and one for letters.
p2i = {phoneme: idx for idx, phoneme in enumerate(phonemes)}
letters = "_abcdefghijklmnopqrstuvwxyz*"
l2i = {letter: idx for idx, letter in enumerate(letters)}

In [7]:
maxlen = 15
# Map each lower-cased word to its list of phoneme indices.
# Constraint: only words of 5..maxlen characters made up purely of A-Z are kept.
letters_only = re.compile("^[A-Z]+$")
pronounce_dict = {}
for w, ps in lines:
    if 5 <= len(w) <= maxlen and letters_only.match(w):
        pronounce_dict[w.lower()] = [p2i[p] for p in ps]
len(pronounce_dict)

108006

In [8]:
# Longest phoneme sequence among the kept words; used as the padded input length.
maxlen_p = max(len(phoneme_ids) for phoneme_ids in pronounce_dict.values())
maxlen_p

16

In [15]:
# NOTE(review): `words` is only assigned in a later cell of this notebook, so on
# a fresh top-to-bottom run this cell raises NameError. It should be moved below
# the cell that defines `words`.
words

array(['hawkins', 'sleeping', 'melchert', ..., 'dedham', 'sleuth', 'herron'],
      dtype='<U15')

In [9]:
# `words` holds the filtered dictionary's words in a random order;
# `n` is the number of words.
# NOTE(review): no random seed is set, so the order differs on every run.
words = np.random.permutation(list(pronounce_dict.keys()))
n = len(words)

In [None]:


# Both arrays start as all zeros, so every position that is not written below
# stays 0, which is the padding index.
input_ = np.zeros((n, maxlen_p), dtype=np.int32)
labels_ = np.zeros((n, maxlen), dtype=np.int32)

# Row `row` receives the phoneme ids of the word (input) and the letter ids of
# its spelling (label), both left-aligned.
for row, word in enumerate(words):
    for col, phoneme_id in enumerate(pronounce_dict[word]):
        input_[row, col] = phoneme_id
    for col, letter in enumerate(word):
        labels_[row, col] = l2i[letter]
        

In [22]:
# Peek at the last ten words of the shuffled list.
words[-10:]

array(['proglacial', 'caylor', 'culhane', 'bloodstains', 'conversation',
       'racal', 'twined', 'dedham', 'sleuth', 'herron'],
      dtype='<U15')

## BPE

In [23]:
# Dump the shuffled words, one per line, to test2.txt.
# The context manager flushes and closes the file when the block ends; the
# previous bare open() never closed it, so buffered data could still be
# unwritten when another program read the file.
with open('test2.txt', 'w') as thefile:
    thefile.write("\n".join(words))

935323

In [24]:
# Read the BPE-segmented words, one entry per line. The context manager closes
# the file handle, which the previous bare open() call leaked.
with open(Path+'bpe.txt', encoding='latin1') as bpe_file:
    lines = [l.strip().split("  ") for l in bpe_file]

# Drop the spaces inside each BPE line so that '@@' is the only separator left.
clines = [sentence[0].replace(" ", "") for sentence in lines]

# One list of subword tokens per word.
sep = [sentence.split('@@') for sentence in clines]

flat_list = [item for sublist in sep for item in sublist]

# Output vocabulary: all distinct subword tokens, with '_' (padding) at index 0.
# The tokens are sorted because the iteration order of a set of strings changes
# from one interpreter run to the next (string hash randomisation). With an
# unsorted vocabulary the token ids, and therefore any saved weights, would not
# match across runs.
letters = sorted(set(flat_list))
letters.insert(0,'_')
n = len(sep); n

l2i = dict((v, k) for k, v in enumerate(letters))

# Zero-initialised, so unwritten positions keep the padding index 0.
input_ = np.zeros((n, maxlen_p), dtype=np.int32)
labels_ = np.zeros((n, maxlen), dtype=np.int32)

# The word is recovered by joining its subword tokens; its phoneme ids go into
# the input row and its token ids into the label row.
for row, pieces in enumerate(sep):
    for col, phoneme_id in enumerate(pronounce_dict[''.join(pieces)]):
        input_[row, col] = phoneme_id
    for col, piece in enumerate(pieces):
        labels_[row, col] = l2i[piece]
    

# The three bare expressions below are sanity checks (array shape, first label
# row decoded to tokens, first input row decoded to phonemes). They are not the
# last expression of this cell, so by default the notebook does not display them;
# the same checks are repeated, with output, in the cells that follow.
labels_.shape

[letters[l] for l in labels_[0]]

[phonemes[p] for p in input_[0]]

#Create train, validation sets (10% of the rows are held out)
# NOTE(review): no random_state is passed, so the split changes on every run.
(input_train, input_test, labels_train, labels_test) = train_test_split(input_, labels_, test_size=0.1)

# Vocabulary sizes: number of phoneme symbols (input) and of output tokens.
input_vocab_size, output_vocab_size = len(phonemes), len(letters);input_vocab_size, output_vocab_size

In [25]:
# Remove the spaces inside each BPE line so that '@@' is the only separator left.
clines = [entry[0].replace(" ", "") for entry in lines]

In [26]:
# Split every word into its list of subword tokens.
sep = [segmented.split('@@') for segmented in clines]

In [27]:
# All subword tokens of all words in one flat list (duplicates included).
flat_list = [token for pieces in sep for token in pieces]

In [28]:
# Output vocabulary: distinct subword tokens with '_' (padding) at index 0.
# Sorted on purpose: the iteration order of a set of strings differs between
# interpreter runs (string hash randomisation), which would assign different
# ids to the same token on every run and invalidate previously saved weights.
letters = sorted(set(flat_list))
letters.insert(0,'_')
n = len(sep); n

108006

In [29]:
# Token -> integer index lookup for the output vocabulary.
l2i = {token: idx for idx, token in enumerate(letters)}

In [30]:
# Zero-filled int32 arrays: one row per word, 0 is the padding index.
input_ = np.zeros(shape=(n, maxlen_p), dtype=np.int32)
labels_ = np.zeros(shape=(n, maxlen), dtype=np.int32)

In [31]:
# For each word: phoneme ids go into the input row, subword-token ids into the
# label row. The word itself is recovered by joining its tokens.
for row, pieces in enumerate(sep):
    for col, phoneme_id in enumerate(pronounce_dict[''.join(pieces)]):
        input_[row, col] = phoneme_id
    for col, piece in enumerate(pieces):
        labels_[row, col] = l2i[piece]
    

In [33]:
# Sanity check: (number of words, maxlen).
labels_.shape

(108006, 15)

In [34]:
# Decode the first label row back into tokens ('_' marks padding).
[letters[token_id] for token_id in labels_[0]]

['ha', 'w', 'kins', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_']

In [35]:
# Decode the first input row back into phoneme symbols ('_' marks padding).
[phonemes[phoneme_id] for phoneme_id in input_[0]]

['HH',
 'AO1',
 'K',
 'IH0',
 'N',
 'Z',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_']

In [36]:
#Create train, validation sets
(input_train, input_test, labels_train, labels_test) = train_test_split(input_, labels_, test_size=0.1)

In [37]:
# Sizes of the phoneme (input) and token (output) vocabularies, padding included.
input_vocab_size = len(phonemes)
output_vocab_size = len(letters)
input_vocab_size, output_vocab_size

(70, 881)

## Model

In [45]:
# Number of words in the data set.
n

108006

In [38]:
# Number of units in every LSTM layer created by get_rnn().
dim = 512

In [39]:
def get_rnn(return_sequences=True):
    """Create a new LSTM layer with `dim` units and the notebook's shared settings.

    return_sequences is forwarded to the layer: True returns the output of
    every timestep, False only the last one.
    """
    lstm_kwargs = {
        'dropout': 0.1,
        'recurrent_dropout': 0.1,
        'implementation': 2,
        'return_sequences': return_sequences,
    }
    return LSTM(dim, **lstm_kwargs)

In [40]:
#Input: one row of maxlen_p phoneme ids per sample.
#After the embedding the tensor has shape (samples, maxlen_p, 120).
inp = Input((maxlen_p,))
x = Embedding(input_vocab_size, 120)(inp)

#Encoding stage - build a single representation of the whole phoneme sequence.
#Bidirectional runs a second RNN over the reversed input and concatenates its
#output with that of the forward RNN (hence 2*dim features per timestep).
#The second LSTM returns only its last output: one dim-sized vector per sample.
x = Bidirectional(get_rnn())(x)
x = get_rnn(False)(x)

#Decoding stage
#RepeatVector is used because an RNN in Keras expects a sequence of inputs rather
#than a single hidden-state vector. Repeating also makes it easier for the RNN to
#keep track of the whole word it has to decode, because every timestep is
#presented with the same encoded vector.
x = RepeatVector(maxlen)(x)
x = get_rnn()(x)
x = get_rnn()(x)
#Per-timestep softmax over the output vocabulary.
x = TimeDistributed(Dense(output_vocab_size, activation='softmax'))(x)

Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [41]:
# Wrap the layer graph (phoneme ids in, per-timestep token probabilities out) as a Keras model.
model = Model(inp, x)

In [42]:
# Print the layers with their output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 16)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 16, 120)           8400      
_________________________________________________________________
bidirectional_1 (Bidirection (None, 16, 1024)          2592768   
_________________________________________________________________
lstm_2 (LSTM)                (None, 512)               3147776   
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 15, 512)           0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 15, 512)           2099200   
_________________________________________________________________
lstm_4 (LSTM)                (None, 15, 512)           2099200   
__________

In [43]:
# Adam with default settings. The sparse variant of categorical cross-entropy is
# used because the labels are integer token ids, not one-hot vectors.
# 'acc' reports accuracy during training.
model.compile(Adam(), 'sparse_categorical_crossentropy', metrics=['acc'])

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead


## Train and Evaluation

In [44]:
# Train for 3 epochs, validating on the held-out split after each epoch.
# The integer labels get a trailing axis of size 1 via np.expand_dims.
hist = model.fit(
    input_train,
    np.expand_dims(labels_train, -1),
    batch_size=128,
    epochs=3,
    validation_data=[input_test, np.expand_dims(labels_test, -1)],
    **parms
)




In [None]:
# Un-comment the next line to save the current weights instead of loading them.
#model.save_weights('models/seq2seq_rnn.h5')
# NOTE(review): loading replaces whatever weights the model currently has,
# including those from the training cell above. The checkpoint is only
# meaningful if it was saved with the same token -> index mapping (`letters`)
# as the current run - confirm before trusting the evaluation below.
model.load_weights('models/seq2seq_rnn.h5')

In [46]:
# Model output for every held-out sample; the last layer is a per-timestep softmax.
preds = model.predict(input_test, batch_size=128)

In [47]:
# Pick the highest-scoring token id at every timestep (axis 2 is the vocabulary axis).
predict = preds.argmax(axis=2)

In [48]:
# Whole-word accuracy: a prediction counts only if every position, padding
# included, equals the label.
exact_match = (labels_test == predict).all(axis=1)
print ('Accuracy', exact_match.mean())

Accuracy 9.25840199981e-05


In [49]:
#Visualise predictions
def evaluate(predict):
    print ('Phonemes_________________________________predictions______label')
    for index in range(20):
        phoneme = '-'.join([phonemes[p] for p in input_test[index]])
        prediction = [letters[l] for l in predict[index]]
        real = [letters[l] for l in labels_test[index]]
        print (phoneme.strip('-_').ljust(40), ''.join(prediction).strip('_').ljust(14), 
               ''.join(real).strip('_'))

In [50]:
# Show a sample of held-out words: phonemes, predicted spelling, true spelling.
evaluate(predict) 

Phonemes_________________________________predictions______label
R-IH0-V-IY1-L-Z                          rem            reveals
M-AH0-G-IY1-AH0-N                        mcd            mcgeean
HH-AA0-Y-AA1-R                           wd             hajjar
M-AA0-T-AA0-L-AO1-N                      mcch           matalon
K-IH1-N-CH-IH0-L-OW0                     conn           kincheloe
L-IH1-Z-AH0-B-EH0-TH                     deut           lizabeth
T-R-IH1-G                                fr             trygg
B-IH1-D-IH0-NG                           bd             bidding
T-R-EH1-M-B-AH0-L-IH0-NG                 pt             trembling
S-AW2-Y-UW1-M-AH0                        mct            saouma
AH0-B-W-EH1-L-AH0                        mct            abuellah
HH-AH1-D-AH0-L-S-T-AH0-N                 hoden          huddleston
M-EH1-N-S-ER0                            gn             mencer
S-AY1-D-SH-OW2-Z                         mcit           sideshows
B-L-AH1-D-S-T-R-IY2-M             