## Neural Machine Translation (German to English)
**LSTM Encoder-Decoder Architecture**

A typical seq2seq model has 2 major components:
* a) an encoder
* b) a decoder

Use cases of Sequence-to-Sequence: 
* Speech Recognition
* Named Entity/Subject Extraction to identify the main subject from a body of text
* Relation Classification to tag relationships between various entities tagged in the above step
* Chatbot skills to have conversational ability and engage with customers
* Text Summarization to generate a concise summary of a large amount of text
* Question Answering systems

![Seq2seq encoder-decoder architecture](https://miro.medium.com/max/2400/1*sWc8g2yiQrOzntbVeGzbEQ.png)

In [42]:
import pandas as pd
import numpy as np
import os 
import string
import re
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Dense, Dropout, RepeatVector, Input, Embedding
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import RMSprop, Adam
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import sequence
import matplotlib.pyplot as plt

# Notebook-wide pandas display settings: show up to 200 characters per cell
# (sentences are long) and up to 2000 rows before pandas truncates the view.
pd.set_option('display.max_colwidth',200)
pd.set_option('display.max_rows',2000)

In [2]:
# Show the working directory; the corpus file 'deu.txt' is opened relative to it.
# NOTE(review): the saved output of this cell exposes an absolute local path —
# clear it before sharing the notebook.
os.getcwd()

'/Users/manprmas/Oracle Content - Accounts/Oracle Content/Personal/Advanced NLP'

In [4]:
# Read the whole corpus file into memory as one string.
# The context manager closes the handle even if read() raises, which the
# manual open()/close() pair did not guarantee.
with open('deu.txt', mode='rt', encoding='utf-8') as file:
    data = file.read()

In [5]:
# One corpus line per list entry (surrounding whitespace trimmed first).
data = data.strip().split('\n')
# Every line is tab-delimited; keep all of its fields at this stage.
data_sents = [record.split('\t') for record in data]

In [6]:
# Turn the split lines into a 2-D string array and keep only the first two
# fields of each line: column 0 is the English sentence, column 1 the German one.
deu_eng = np.array(data_sents)[:, 0:2]

In [7]:
# Peek at the first ten sentence pairs (column 0: English, column 1: German).
deu_eng[:10,:]

array([['Go.', 'Geh.'],
       ['Hi.', 'Hallo!'],
       ['Hi.', 'Grüß Gott!'],
       ['Run!', 'Lauf!'],
       ['Run.', 'Lauf!'],
       ['Wow!', 'Potzdonner!'],
       ['Wow!', 'Donnerwetter!'],
       ['Duck!', 'Kopf runter!'],
       ['Fire!', 'Feuer!'],
       ['Help!', 'Hilfe!']], dtype='<U537')

In [8]:
import gensim

In [9]:
# Remove punctuation from both language columns.
# The translation table is loop-invariant, so build it once instead of once
# per sentence. string.punctuation only contains ASCII punctuation, so any
# non-ASCII punctuation marks are left untouched.
punct_table = str.maketrans('', '', string.punctuation)
deu_eng[:,0] = [s.translate(punct_table) for s in deu_eng[:,0]]
deu_eng[:,1] = [s.translate(punct_table) for s in deu_eng[:,1]]

In [10]:
# The same ten pairs after punctuation removal.
deu_eng[:10,:]

array([['Go', 'Geh'],
       ['Hi', 'Hallo'],
       ['Hi', 'Grüß Gott'],
       ['Run', 'Lauf'],
       ['Run', 'Lauf'],
       ['Wow', 'Potzdonner'],
       ['Wow', 'Donnerwetter'],
       ['Duck', 'Kopf runter'],
       ['Fire', 'Feuer'],
       ['Help', 'Hilfe']], dtype='<U537')

In [12]:
# Draw a random subset of sentence pairs (at most 100000) without replacement.
# A dedicated, seeded generator makes the subset - and therefore everything
# computed from it below - reproducible across kernel restarts; the seed
# matches the random_state used for the train/test split.
# Capping the size at the number of available rows avoids the ValueError that
# choice(..., replace=False) raises when asked for more rows than exist.
sample_size = min(100000, deu_eng.shape[0])
rng = np.random.RandomState(12)
random_indices = rng.choice(deu_eng.shape[0], size=sample_size, replace=False)
deu_eng = deu_eng[random_indices, :]


In [13]:
# (number of sampled pairs, 2 language columns)
deu_eng.shape

(100000, 2)

In [15]:
# Lower-case every sentence in place, one language column at a time.
deu_eng[:, 0] = [sentence.lower() for sentence in deu_eng[:, 0]]
deu_eng[:, 1] = [sentence.lower() for sentence in deu_eng[:, 1]]
    

In [16]:
# Ten rows of the randomly sampled, lower-cased data.
deu_eng[:10,:]

array([['i have a lot of questions', 'ich habe zahlreiche fragen'],
       ['tom and mary are house hunting',
        'tom und mary sind auf der suche nach einem haus'],
       ['theyre engaged', 'sie sind verlobt'],
       ['tom is intolerant', 'tom ist intolerant'],
       ['as soon as he went to bed he fell asleep',
        'sobald er ins bett ging schlief er ein'],
       ['you have a very good voice', 'du hast eine sehr gute stimme'],
       ['i want to leave too', 'ich möchte auch gehen'],
       ['tom and mary were held as hostages for three months',
        'tom und mary waren drei monate in geiselhaft'],
       ['did you enjoy your swim', 'hat dir das schwimmen spaß gemacht'],
       ['knock before entering', 'klopfe an bevor du hereinkommst']],
      dtype='<U537')

In [17]:
# Number of whitespace-separated words in each sentence, per language
# (column 0: English, column 1: German).
eng_l = [len(sentence.split()) for sentence in deu_eng[:, 0]]
deu_l = [len(sentence.split()) for sentence in deu_eng[:, 1]]

In [18]:
def tokenization(lines):
    """Create a Keras ``Tokenizer`` and fit it on ``lines``.

    Args:
        lines: the texts handed to ``Tokenizer.fit_on_texts``.

    Returns:
        The fitted ``Tokenizer`` instance (constructed with default settings).
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

In [19]:
# Fixed number of token ids every English sequence is padded/truncated to.
eng_length = 8
# Fit the English vocabulary on column 0 of the sentence-pair array.
eng_tokenizer = tokenization(deu_eng[:, 0])
# +1 because word ids start at 1; id 0 is used as the padding value.
eng_vocab_size = 1 + len(eng_tokenizer.word_index)
print('English Vocabulary Size: %d' % eng_vocab_size)

English Vocabulary Size: 12006


In [69]:
# Word -> integer id mapping learned by the English tokenizer.
# NOTE(review): this dumps the whole mapping (about 12k entries per the
# vocabulary size printed above); consider displaying only a slice.
eng_tokenizer.word_index

{'tom': 1,
 'to': 2,
 'you': 3,
 'the': 4,
 'i': 5,
 'a': 6,
 'is': 7,
 'that': 8,
 'in': 9,
 'do': 10,
 'he': 11,
 'of': 12,
 'it': 13,
 'was': 14,
 'this': 15,
 'have': 16,
 'me': 17,
 'dont': 18,
 'for': 19,
 'my': 20,
 'what': 21,
 'are': 22,
 'mary': 23,
 'we': 24,
 'your': 25,
 'his': 26,
 'be': 27,
 'im': 28,
 'and': 29,
 'on': 30,
 'with': 31,
 'know': 32,
 'want': 33,
 'like': 34,
 'not': 35,
 'she': 36,
 'at': 37,
 'did': 38,
 'has': 39,
 'can': 40,
 'how': 41,
 'very': 42,
 'were': 43,
 'here': 44,
 'go': 45,
 'didnt': 46,
 'think': 47,
 'as': 48,
 'its': 49,
 'about': 50,
 'there': 51,
 'him': 52,
 'cant': 53,
 'will': 54,
 'time': 55,
 'all': 56,
 'up': 57,
 'why': 58,
 'youre': 59,
 'had': 60,
 'if': 61,
 'get': 62,
 'going': 63,
 'they': 64,
 'her': 65,
 'good': 66,
 'one': 67,
 'isnt': 68,
 'out': 69,
 'no': 70,
 'really': 71,
 'when': 72,
 'from': 73,
 'doesnt': 74,
 'would': 75,
 'an': 76,
 'ill': 77,
 'now': 78,
 'by': 79,
 'been': 80,
 'help': 81,
 'please': 82,
 'c

In [20]:
# Fixed number of token ids every German sequence is padded/truncated to.
deu_length = 8
# Fit the German vocabulary on column 1 of the sentence-pair array.
deu_tokenizer = tokenization(deu_eng[:, 1])
# +1 because word ids start at 1; id 0 is used as the padding value.
deu_vocab_size = 1 + len(deu_tokenizer.word_index)
print('Deutch Vocabulary Size: %d' % deu_vocab_size)

Deutch Vocabulary Size: 22582


In [21]:
def encode_sequences(tokenizer, length, lines):
    """Convert texts to integer id sequences of a fixed length.

    Args:
        tokenizer: fitted tokenizer providing ``texts_to_sequences``.
        length: target sequence length, passed to ``pad_sequences`` as ``maxlen``.
        lines: the texts to encode.

    Returns:
        The result of ``sequence.pad_sequences``: the id sequences padded
        with 0 at the end (``padding='post'``) up to ``length``.
    """
    token_ids = tokenizer.texts_to_sequences(lines)
    return sequence.pad_sequences(token_ids, maxlen=length, padding='post')

In [22]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the sentence pairs for evaluation; the fixed random_state
# keeps the split itself repeatable.
train, test = train_test_split(
    deu_eng,
    test_size=0.3,
    random_state=12,
)

In [80]:
# English side (column 0) of the first training pair.
train[0][0]

'may i put it here'

In [23]:
# Model input: German sentences (column 1) as fixed-length id sequences.
trainX = encode_sequences(tokenizer=deu_tokenizer, length=deu_length, lines=train[:, 1])
# Model target: the matching English sentences (column 0), encoded the same way.
trainY = encode_sequences(tokenizer=eng_tokenizer, length=eng_length, lines=train[:, 0])

In [79]:
# Integer encoding of the first English training sentence, zero-padded at the end.
trainY[0]

array([182,   5, 171,  13,  44,   0,   0,   0], dtype=int32)

In [24]:
# (number of training sentences, deu_length)
trainX.shape


(70000, 8)

In [25]:
# Held-out German inputs (column 1) as fixed-length id sequences.
testX = encode_sequences(tokenizer=deu_tokenizer, length=deu_length, lines=test[:, 1])
# Held-out English targets (column 0), encoded the same way.
testY = encode_sequences(tokenizer=eng_tokenizer, length=eng_length, lines=test[:, 0])

### RepeatVector vs return_sequences=True

![RepeatVector vs return_sequences=True](https://i.stack.imgur.com/LNXjF.jpg)

In [26]:
def build_model(in_vocab, out_vocab, in_timesteps, out_timesteps, units):
    """Assemble the (uncompiled) LSTM encoder-decoder as a Sequential model.

    Args:
        in_vocab: size of the source vocabulary (rows of the Embedding).
        out_vocab: size of the target vocabulary (width of the softmax).
        in_timesteps: length of each input id sequence.
        out_timesteps: number of output positions to predict.
        units: embedding dimension and LSTM width (the same value for both).

    Returns:
        A ``tf.keras.Sequential`` whose output has one softmax distribution
        over ``out_vocab`` entries per output timestep.
    """
    # Encoder: embed the source ids (id 0 is masked) and reduce the sequence
    # to a single vector, since this LSTM does not return sequences.
    encoder = [
        Embedding(in_vocab, units, input_length=in_timesteps, mask_zero=True),
        LSTM(units),
    ]
    # Bridge: repeat that vector once per output timestep.
    bridge = [RepeatVector(out_timesteps)]
    # Decoder: one hidden state per timestep, each mapped to word probabilities.
    decoder = [
        LSTM(units, return_sequences=True),
        Dense(out_vocab, activation='softmax'),
    ]
    return tf.keras.Sequential(encoder + bridge + decoder)

In [27]:
# German -> English model with 32-dimensional embeddings / LSTM states.
model = build_model(deu_vocab_size, eng_vocab_size, deu_length, eng_length, 32)
# `learning_rate` is the supported argument name; `lr` is a deprecated alias
# that newer Keras releases no longer accept.
rms = RMSprop(learning_rate=0.01)
# Sparse loss: the targets are integer word ids, not one-hot vectors.
model.compile(optimizer=rms, loss='sparse_categorical_crossentropy')

In [28]:
# Layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 8, 32)             722624    
_________________________________________________________________
lstm (LSTM)                  (None, 32)                8320      
_________________________________________________________________
repeat_vector (RepeatVector) (None, 8, 32)             0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 8, 32)             8320      
_________________________________________________________________
dense (Dense)                (None, 8, 12006)          396198    
Total params: 1,135,462
Trainable params: 1,135,462
Non-trainable params: 0
_________________________________________________________________


In [29]:
# Path the best model is saved to.
filename = 'model_nlp_machine_translation_lstm'
# Save only when the validation loss reaches a new minimum.
checkpoint = ModelCheckpoint(
    filename,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)

In [113]:
# Train for one epoch and checkpoint on validation loss.
# The targets get a trailing axis, i.e. shape (samples, timesteps, 1).
# validation_data is passed as a tuple (x_val, y_val), the form documented by
# Keras; a list can be interpreted as a multi-input `x` by some TF 2.x
# releases.
# NOTE(review): the recorded log ("improved from 3.32064") shows this cell was
# run repeatedly on an already-trained model; a fresh Restart & Run All trains
# for a single epoch only.
history = model.fit(
    trainX,
    trainY.reshape(trainY.shape[0], trainY.shape[1], 1),
    epochs=1,
    batch_size=1024,
    validation_data=(testX, testY.reshape(testY.shape[0], testY.shape[1], 1)),
    callbacks=[checkpoint],
    verbose=1,
)

Train on 70000 samples, validate on 30000 samples
Epoch 00001: val_loss improved from 3.32064 to 3.30594, saving model to model_nlp_machine_translation_lstm
INFO:tensorflow:Assets written to: model_nlp_machine_translation_lstm/assets


In [114]:
# plt.plot(history.history['loss'])
# plt.plot(history.history['val_loss'])
# plt.legend(['train','validation'])
# plt.show()

In [115]:
# Most likely English word id at every output position of every test sentence.
# Sequential.predict_classes() was removed from tf.keras; for a softmax output
# it was defined as the argmax over the last (vocabulary) axis of predict().
# testX is already 2-D (samples, timesteps), so the earlier reshape to its own
# shape was a no-op and is dropped.
preds = np.argmax(model.predict(testX), axis=-1)

In [116]:
# Check the container type returned by the prediction step.
type(preds)

numpy.ndarray

In [117]:
# First ten predicted id sequences (one row per test sentence).
preds[0:10]

array([[  1,   7,   7,   2,   2,   2, 105, 105],
       [  5,   6,  48,  48,  48,  48,   6,  48],
       [ 20, 115,  39, 134, 134,   0,   0,   0],
       [ 77,  63,   2,   3,   3,   0,   0,   0],
       [  1,   7,  20,  26,  11,  11,  11,   0],
       [  5,  53,  35,  48,  48,  48,   0,   0],
       [  1, 149, 619,   0,   0,   0,   0,   0],
       [  1,  14, 224,   0,   0,   0,   0,   0],
       [  3, 174, 174,   3,   3,   3,   0,   0],
       [  4,   4,   4, 576,   4,   4,   4, 818]])

In [118]:
def get_word(n, tokenizer):
    """Look up the word whose id is ``n`` in ``tokenizer.word_index``.

    Args:
        n: integer word id to look up.
        tokenizer: object exposing a ``word_index`` mapping of word -> id.

    Returns:
        The first word mapped to ``n``, or ``None`` when no word has that id.
    """
    matches = (word for word, index in tokenizer.word_index.items() if index == n)
    return next(matches, None)

In [119]:
# Turn every predicted id sequence back into a sentence string.
# The vocabulary is inverted once so each id -> word lookup is a dict access,
# instead of two linear scans over the whole vocabulary per token.
index_to_word = {index: word for word, index in eng_tokenizer.word_index.items()}

preds_text = []
for token_ids in preds:
    words = []
    previous_word = None  # word of the preceding position (None before the first)
    for token_id in token_ids:
        word = index_to_word.get(token_id)  # None for ids without a word, e.g. padding id 0
        # Ids without a word and immediate repetitions of the previous word
        # become empty slots, so the joined string matches the original
        # loop's output (including the extra spaces).
        if word is None or word == previous_word:
            words.append('')
        else:
            words.append(word)
        previous_word = word

    preds_text.append(' '.join(words))

In [120]:
# Reference English sentences next to the decoded model output.
pred_df = pd.DataFrame({
    'actual': test[:, 0],
    'predicted': preds_text,
})

In [121]:
# Inspect a single test example (row 45): reference English vs. model output.
pred_df.iloc[45:46,:]

Unnamed: 0,actual,predicted
45,im quite hungry,im very hungry


In [100]:
# German source sentence (column 1) of the same test row.
test[45:46,1]

array(['ich bin ziemlich hungrig'], dtype='<U537')

### Extract Embeddings from Embedding Layer

In [123]:
# Weight matrix of the first layer, the Embedding. The model was built with the
# German vocabulary size as its input vocabulary, so row k of this matrix
# belongs to the German token whose id is k.
embeddings = model.layers[0].get_weights()[0]


In [124]:
words_embeddings = {w:embeddings[idx] for w, idx in eng_tokenizer.word_index.items()}

In [125]:
words_embeddings['quite']

array([-1.6456634e-01, -5.1916063e-01, -7.9730794e-02,  1.6115907e-01,
       -3.3530605e-01, -3.3116352e-03, -7.4509019e-01,  4.9231246e-02,
       -1.9988891e-01,  2.4994239e-01, -2.9379165e-01, -1.7670871e-01,
       -5.9266701e-02,  6.8998128e-02, -1.5970820e-01,  1.5919159e-01,
       -9.8319747e-02, -2.4200144e-01, -1.7721292e-02,  1.5975747e-01,
        6.3015006e-02,  1.6616705e-01,  4.0547571e-01, -2.6260704e-01,
        1.0263384e-04, -2.8314281e-01, -2.3600174e-02, -4.0434178e-02,
        3.0470502e-01, -1.3632643e-01, -8.1682831e-02, -6.7003801e-02],
      dtype=float32)