In [1]:
import numpy as np
import keras.backend as K
from scipy.spatial import distance
import nltk as NLTK
from keras.preprocessing.sequence import pad_sequences
from keras.layers import *
from keras.models import *
from keras.utils import to_categorical
from keras.optimizers import *
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
from keras.callbacks import Callback
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from pathlib import Path

Using TensorFlow backend.


In [2]:
# inputs (english)
# Word-vector file; init() passes it to load_vectors_en().
en_vec_file = './en.vec'
# English sentence file, one sentence per line; passed to make_xy() as input_en.
en_sent_file =  './demo.500.en.txt' ## use './sent.en.txt' instead to test on 100 samples
# Filled by load_vectors_en(): word list ('<p>' first) and word -> vector components.
vocab_en = []
word_vec_en = {}

# outputs (german)
# German vocabulary file, one entry per line; init() passes it to load_vocab_de().
de_vocab_file = './all_vocab_vectors.de.txt'  ## './de.vocab.txt' (de 100 vocab)
# German sentence file, one sentence per line; passed to make_xy() as output_de.
de_sent_file = './demo.500.de.txt'      ## './sent.de.txt'
# Filled by load_vocab_de(): vocabulary list with '<p>' first.
vocab_de = []

# Length of the '<p>' padding vector and output_dim of the Embedding layer.
word_vec_dim = 100
# Number of units in each LSTM layer built by init_net().
hidden_unit = 256
# Passed to model.fit() as batch_size and epochs respectively.
batch_size = 8
epoch = 500

In [3]:
def load_vectors_en(vec_file):
    """Load English word vectors from ``vec_file`` into the module-level tables.

    Each non-blank line is stripped, lower-cased and split on single spaces:
    the first field is the word, the remaining fields are its vector
    components (stored as floats). ``vocab_en`` receives ``'<p>'`` followed by
    every word in file order (duplicates after lower-casing are kept, so the
    positions match the file); ``word_vec_en`` maps each word to its vector,
    with a later duplicate overwriting an earlier one.

    Both tables are reset first, so re-running the cell does not append the
    whole vocabulary a second time.

    Args:
        vec_file: path of a UTF-8 text file with one ``word v1 v2 ...`` entry
            per line.

    Raises:
        ValueError: if a vector component cannot be parsed as a float.
    """
    del vocab_en[:]
    word_vec_en.clear()

    ## Padding since all sentences are not of equal length
    vocab_en.append('<p>')
    pad_vec = [0.0001] * word_vec_dim

    with open(vec_file, 'r', encoding='utf-8') as f:
        for line in f:
            arr = line.strip().lower().split(' ')
            if not arr[0]:
                # Blank line: it would otherwise register '' with an empty vector.
                continue
            word_vec_en[arr[0]] = [float(component) for component in arr[1:]]
            vocab_en.append(arr[0])

    word_vec_en['<p>'] = pad_vec
    print(len(word_vec_en), 'vectors loaded ...')


def load_vocab_de(vocab_file):
    """Load the German vocabulary from ``vocab_file`` into ``vocab_de``.

    ``vocab_de`` is reset, then receives ``'<p>'`` followed by every line of
    the file, stripped and lower-cased, in file order. Blank lines are kept as
    empty entries so that positions still correspond to file lines.

    Args:
        vocab_file: path of a UTF-8 text file with one vocabulary entry per line.
    """
    # Reset so that re-running the cell does not append the vocabulary twice.
    del vocab_de[:]
    vocab_de.append('<p>')
    # ``with`` closes the handle even if reading fails part-way through.
    with open(vocab_file, 'r', encoding='utf-8') as fin:
        for line in fin:
            vocab_de.append(line.strip().lower())

def init():
    """Fill the English vector tables and the German vocabulary from the configured files."""
    load_vectors_en(vec_file=en_vec_file)
    load_vocab_de(vocab_file=de_vocab_file)

def make_one_hot(idx, y_list):
    """Return a ``(1, len(y_list))`` float array holding 1 at flat position ``idx`` and 0 elsewhere."""
    width = len(y_list)
    encoded = np.zeros((1, width))
    # Flat assignment, matching what np.put does on the flattened array.
    encoded.flat[idx] = 1
    return encoded

In [4]:
def _read_token_rows(path):
    """Return one token list per line of ``path`` (stripped, lower-cased, split on single spaces)."""
    with open(path, 'r', encoding='utf-8') as fin:
        return [line.strip().lower().split(' ') for line in fin]


def _first_index_lookup(vocab):
    """Map each vocabulary entry to the index of its first occurrence (what ``list.index`` returns)."""
    lookup = {}
    for position, word in enumerate(vocab):
        lookup.setdefault(word, position)
    return lookup


def make_xy(input_en, output_de):
    """Build the training arrays from parallel English / German sentence files.

    English tokens found in ``vocab_en`` become their vocabulary index; German
    tokens found in ``vocab_de`` become one-hot rows. Tokens missing from the
    vocabulary are dropped. Every row is padded with ``'<p>'`` up to
    ``x_max_len``, the largest token count of any English line.

    The German side is padded to the *English* length, and German rows with
    more than ``x_max_len`` in-vocabulary tokens are truncated to it instead of
    producing a ragged list that cannot be reshaped.

    Args:
        input_en: path of the English sentence file (UTF-8, one sentence per line).
        output_de: path of the German sentence file (UTF-8, one sentence per line).

    Returns:
        ``(x, x_max_len, y, x_max_len)`` where ``x`` is an int32 array of shape
        ``(n_en_lines, x_max_len)`` and ``y`` is a float array of shape
        ``(n_de_lines, x_max_len, len(vocab_de))``. The fourth element is
        ``x_max_len`` (not the German maximum), mirroring the target length.
    """
    # x - english
    en_rows = _read_token_rows(input_en)
    x_max_len = max((len(tokens) for tokens in en_rows), default=0)
    print('x_max_len', x_max_len)

    # Dictionary lookups replace the per-token linear scans of the vocabulary list.
    en_lookup = _first_index_lookup(vocab_en)
    x = []
    for tokens in en_rows:
        ids = [en_lookup[w] for w in tokens if w in en_lookup]
        missing = x_max_len - len(ids)
        if missing > 0:
            ids.extend([vocab_en.index('<p>')] * missing)
        x.append(ids)

    x = pad_sequences(x, maxlen=x_max_len, dtype='int32', padding='post', truncating='pre', value=0.0)
    print('x-shape', x.shape)

    # y - output
    de_rows = _read_token_rows(output_de)
    y_max_len = max((len(tokens) for tokens in de_rows), default=0)
    print('y_max_len', y_max_len)

    de_lookup = _first_index_lookup(vocab_de)
    # Pre-allocate the one-hot tensor instead of stacking per-token arrays.
    y = np.zeros((len(de_rows), x_max_len, len(vocab_de)))
    for row, tokens in enumerate(de_rows):
        ids = [de_lookup[w] for w in tokens if w in de_lookup][:x_max_len]
        if ids:
            y[row, np.arange(len(ids)), ids] = 1
        if len(ids) < x_max_len:
            # Remaining time steps are padding.
            y[row, len(ids):, vocab_de.index('<p>')] = 1
    print('y-shape', y.shape)

    return x, x_max_len, y, x_max_len


In [5]:
def _softmax_over_time(scores):
    """Softmax along axis 1 (time steps) of a ``(batch, steps, 1)`` score tensor."""
    # Subtracting the per-sequence maximum keeps exp() numerically stable.
    shifted = scores - K.max(scores, axis=1, keepdims=True)
    exp_scores = K.exp(shifted)
    return exp_scores / K.sum(exp_scores, axis=1, keepdims=True)


def create_zhou16_atten_vec(inputs):
    """Pool a sequence of hidden states into one attention vector.

    Computes ``M = tanh(H)``, one score per time step with a ``Dense(1)``
    layer, ``alpha = softmax`` of those scores over the time axis,
    ``r = sum_t alpha_t * H_t`` and finally ``h* = tanh(r)``.

    The softmax is taken over the time axis. Applying it through
    ``Dense(1, activation='softmax')`` would normalise over the last axis of
    size 1 and make every weight exactly 1.0.

    Args:
        inputs: 3-D Keras tensor ``(batch, steps, features)``.

    Returns:
        2-D Keras tensor ``(batch, features)``.
    """
    input_last_dim = K.int_shape(inputs)[2]
    m = Activation('tanh')(inputs)
    print('M', K.int_shape(m))

    # One unnormalised score per time step, then normalise across time steps.
    scores = Dense(1)(m)
    alpha = Lambda(_softmax_over_time)(scores)
    print('alpha', K.int_shape(alpha))

    # Weighted sum over time: contract axis 1 of alpha (b, T, 1) with axis 1 of
    # inputs (b, T, d) to get (b, 1, d). Replaces the deprecated merge(mode='dot').
    r_nc = dot([alpha, inputs], axes=1)
    print('r', K.int_shape(r_nc))

    h_star = Activation('tanh')(r_nc)
    print('h_star', K.int_shape(h_star))

    attention = Reshape((input_last_dim,))(h_star)
    print('attention', K.int_shape(attention))
    return attention


In [None]:
def init_net(x_len, y_len):
    """Build and compile the English -> German sequence model.

    Architecture: embedding (initialised from ``word_vec_en``, trainable) ->
    dropout -> bidirectional LSTM (averaged directions) -> attention vector
    repeated along time and concatenated with the BiLSTM states -> LSTM ->
    per-time-step softmax over ``vocab_de``.

    Args:
        x_len: number of time steps of the (padded) input sequences.
        y_len: accepted for interface compatibility but not used; the output
            has the same number of time steps as the input.

    Returns:
        The compiled Keras ``Model``.
    """
    input_en_word_ids = Input(shape=(x_len,), dtype='int32', name='input_en_idx')

    # One row per vocab_en entry, in vocabulary order, so ids index rows directly.
    embedding_rows = [np.asarray(word_vec_en.get(word), dtype='float32') for word in vocab_en]
    embedding_matrix = np.reshape(embedding_rows, (len(embedding_rows), word_vec_dim))

    embedded_english_sent = Embedding(input_dim=len(embedding_matrix), output_dim=word_vec_dim, input_length=x_len,
                                      weights=[embedding_matrix], trainable=True)

    x = embedded_english_sent(input_en_word_ids)

    x_dropped = Dropout(0.5)(x)
    # No input_shape arguments below: in the functional API the shapes come from
    # the incoming tensors. Passing sliced tensors such as x[2] as a "dimension"
    # only leaves non-serialisable values in the layer configuration.
    brnn_output = Bidirectional(LSTM(hidden_unit, return_sequences=True), merge_mode='ave')(x_dropped)
    attention_vec = create_zhou16_atten_vec(brnn_output)
    repeat_brnn = RepeatVector(x_len)(attention_vec)
    x = concatenate([brnn_output, repeat_brnn])

    lstm_out = LSTM(hidden_unit, return_sequences=True)(x)
    output_de_ids = TimeDistributed(Dense(len(vocab_de), activation='softmax'), name='output_output_idx')(lstm_out)

    opts = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model = Model(inputs=[input_en_word_ids], outputs=[output_de_ids])
    model.compile(optimizer=opts, loss='categorical_crossentropy', metrics=['acc'])

    print('model-input ', model.input_shape)
    print('model-params ', model.count_params())
    print('model-output ', model.output_shape)
    print('model-summary ')
    model.summary()

    return model


In [None]:
def translate(model, x_max_len, sents):
    """Translate English sentences with ``model`` and print one result per line.

    Sentences are prepared the same way as the training data: stripped,
    lower-cased, split on single spaces, tokens missing from ``vocab_en``
    dropped, then padded with ``'<p>'`` to ``x_max_len``. Sentences with more
    than ``x_max_len`` known tokens keep only their first ``x_max_len``.

    Args:
        model: a trained model whose input has ``x_max_len`` time steps.
        x_max_len: input sequence length the model was built with.
        sents: iterable of English sentence strings.
    """
    sents = list(sents)
    if not sents:
        return

    pad_id = vocab_en.index('<p>')
    # First-occurrence lookup, equivalent to vocab_en.index() without the linear scan.
    en_lookup = {}
    for position, word in enumerate(vocab_en):
        en_lookup.setdefault(word, position)

    x = []
    for s in sents:
        # Skip unknown words instead of raising ValueError on them.
        ids = [en_lookup[w] for w in s.strip().lower().split(' ') if w in en_lookup]
        ids = ids[:x_max_len]
        ids.extend([pad_id] * (x_max_len - len(ids)))
        x.append(ids)
    # predict() expects one array for the single input, not a nested Python list.
    x = np.asarray(x, dtype='int32')

    output_de_sents = model.predict(x=x, batch_size=4, verbose=1)

    for i, de_s in enumerate(output_de_sents):
        words = [vocab_de[np.argmax(step, axis=0)] for step in de_s]
        strs = ' '.join(w for w in words if w != '<p>').strip()
        print(i, strs)

if __name__ == '__main__':
    model_file = Path('en-de-translation.h5')
    # Keras / h5py are given a plain string everywhere; the name is defined once.
    weights_path = str(model_file)

    # training
    init()
    x, x_l, y, y_l = make_xy(input_en=en_sent_file, output_de=de_sent_file)

    ### comment out the training part below (down to save_weights) if the model
    ### has already been trained once - to avoid repeated trainings
    model = init_net(x_l, y_l)

    # Resume from earlier weights when a previous run left them on disk.
    if model_file.is_file():
        model.load_weights(weights_path)

    model.fit(x=x, y=y, batch_size=batch_size, epochs=epoch, validation_split=0.1)
    model.save_weights(weights_path)

    # testing
    # The model is rebuilt and reloaded so this part also works when the
    # training part above is commented out.
    model = init_net(x_len=x_l, y_len=y_l)
    model.load_weights(weights_path)
    sent_arr = ['i declare resumed the session of the european parliament ad@@ jour@@ ned on friday 17 december 1999 , and i would like once again to wish you a happy new year in the hope that you enjoyed a pleasant fes@@ tive period .',
                'i should now like to comment on the issue itself .',
                'i should now like to comment on the issue itself .',
                'i should now like to comment on the issue itself .']

    translate(model, x_max_len=x_l, sents=sent_arr)


29229 vectors loaded ...
x_max_len 86
x-shape (500, 86)
y_max_len 88
y-shape (500, 86, 29426)
M (None, 86, 256)
alpha (None, 86, 1)
r (None, 1, 256)
h_star (None, 1, 256)
attention (None, 256)


  name=name)


model-input  (None, 86)
model-params  12031131
model-output  (None, 86, 29426)
model-summary 
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_en_idx (InputLayer)        (None, 86)            0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 86, 100)       2949800                                      
____________________________________________________________________________________________________
dropout_1 (Dropout)              (None, 86, 100)       0                                            
____________________________________________________________________________________________________
bidirectional_1 (Bidirectional)  (None, 86, 256)       731136                                     