In [83]:
#importing necessary packages along the way
#loading and cleaning the data
import re
import string
from unicodedata import normalize
import numpy as np

#NN Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint

In [84]:
#loading data file
def load_file(filename):
    """Read a tab-separated parallel corpus and return its sentence pairs.

    Every usable line must contain at least two tab-separated fields. Only
    the first two fields are kept; any further fields on the line are ignored.

    Args:
        filename: path of a UTF-8 encoded text file.

    Returns:
        A list of two-item lists [first_field, second_field], one per
        usable line, in file order.
    """
    pairs=[]
    #the context manager closes the file even when reading fails part-way
    with open(filename, mode='rt', encoding='utf-8') as file:
        for line in file:
            #drop the line terminator so it cannot end up inside the last field
            fields=line.rstrip('\r\n').split("\t")
            #blank or malformed lines have no second field: skip them instead of raising IndexError
            if len(fields)<2:
                continue
            pairs.append(fields[:2])
    return pairs

In [85]:
#cleaning the pairs of sentences from punctuation
#normalizing the pairs of sentences to lowercase
def clean_pairs(lines):
    """Lowercase every sentence and remove ASCII punctuation from it.

    Only the characters listed in string.punctuation are deleted, so
    non-ASCII letters such as å, ä and ö pass through unchanged.

    Args:
        lines: iterable of sentence pairs, each pair an iterable of strings.

    Returns:
        A numpy array with one row per pair holding the cleaned sentences.
    """
    #translation table for removing punctuation
    table=str.maketrans('', '', string.punctuation)
    cleaned=[]
    for pair in lines:
        clean_pair=[]
        for line in pair:
            #tokenize on white space, lowercase and strip punctuation from each token
            words=[word.lower().translate(table) for word in line.split()]
            #tokens made only of punctuation are empty now; drop them so no double spaces appear
            words=[word for word in words if word]
            clean_pair.append(' '.join(words))
        cleaned.append(clean_pair)
    return np.array(cleaned)

In [86]:
#each pair holds the English sentence first and the Swedish sentence second;
#the model below reads Swedish (source) and is trained to output English (target)
swedish_english=load_file("swe.txt")
#clean the freshly loaded pairs
cleaned_swedish=clean_pairs(swedish_english)

In [87]:
#checking the dimension of data
#print(cleaned_swedish)
#print(len(cleaned_swedish))

In [88]:
#splitting the cleaned data into train and test
#a fixed seed keeps the split identical across runs, so a reloaded checkpoint is never evaluated on rows it was trained on
SPLIT_SEED=42
shuffled_rows=np.random.RandomState(SPLIT_SEED).permutation(len(cleaned_swedish))
#indexing with the permutation builds a shuffled copy; cleaned_swedish keeps its original order
data=cleaned_swedish[shuffled_rows]
#as training a neural network requires an enormous amount of data, we designate 90% of the data to training phase
split_at=int(0.9*len(data))
train, test=data[:split_at], data[split_at:]

In [89]:
#print(train)
#print(test)

In [90]:
#fitting a tokenizer
def tokenize(lines):
    """Create a Tokenizer, fit it on the given sentences and return it."""
    fitted=Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

In [91]:
#maximum sentence length
def max_length(lines):
    """Return the largest whitespace-separated word count among the sentences in lines."""
    word_counts=[len(sentence.split()) for sentence in lines]
    return max(word_counts)

In [92]:
#English side (column 0): tokenizer, vocabulary size and longest sentence
eng_sentences=data[:, 0]
eng_tokens=tokenize(eng_sentences)
#one more than the number of distinct words in the tokenizer's word index
eng_words=len(eng_tokens.word_index)+1
eng_length=max_length(eng_sentences)

In [93]:
#Swedish side (column 1): tokenizer, vocabulary size and longest sentence
swe_sentences=data[:, 1]
swe_tokens=tokenize(swe_sentences)
#one more than the number of distinct words in the tokenizer's word index
swe_words=len(swe_tokens.word_index)+1
swe_length=max_length(swe_sentences)

In [94]:
#integer-encode sentences and pad them to a common length
def encode_sequences(tokenizer, length, lines):
    """Turn sentences into integer sequences padded at the end to `length`.

    Args:
        tokenizer: fitted tokenizer providing texts_to_sequences.
        length: value passed to pad_sequences as maxlen.
        lines: the sentences to encode.

    Returns:
        The result of pad_sequences with padding='post'.
    """
    encoded=tokenizer.texts_to_sequences(lines)
    return pad_sequences(encoded, maxlen=length, padding='post')

In [95]:
#one-hot encode the target sentences
def encode_output(sequences, vocab_size):
    """One-hot encode every integer in a 2-D array of padded sequences.

    Args:
        sequences: 2-D array of integer-encoded sentences.
        vocab_size: number of classes used for the one-hot vectors.

    Returns:
        Array reshaped to (sequences.shape[0], sequences.shape[1], vocab_size).
    """
    one_hot=np.array([to_categorical(row, num_classes=vocab_size) for row in sequences])
    n_samples, n_steps=sequences.shape[0], sequences.shape[1]
    return one_hot.reshape(n_samples, n_steps, vocab_size)

In [96]:
#Swedish sentences (column 1) are the model input, one-hot encoded English sentences (column 0) the expected output
x_train=encode_sequences(swe_tokens, swe_length, train[:, 1])
y_train=encode_output(encode_sequences(eng_tokens, eng_length, train[:, 0]), eng_words)

#the held-out pairs are encoded the same way
x_test=encode_sequences(swe_tokens, swe_length, test[:, 1])
y_test=encode_output(encode_sequences(eng_tokens, eng_length, test[:, 0]), eng_words)

In [97]:
def NN_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    """Assemble the (uncompiled) sequence-to-sequence network.

    Args:
        src_vocab: input dimension of the Embedding layer.
        tar_vocab: number of units of the softmax output layer.
        src_timesteps: input_length of the Embedding layer.
        tar_timesteps: number of repetitions made by RepeatVector.
        n_units: embedding size and number of units of both LSTM layers.

    Returns:
        A Sequential model made of Embedding, LSTM, RepeatVector, LSTM and
        a TimeDistributed softmax Dense layer, in that order.
    """
    layers=[
        Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True),
        LSTM(n_units),
        RepeatVector(tar_timesteps),
        LSTM(n_units, return_sequences=True),
        TimeDistributed(Dense(tar_vocab, activation='softmax')),
    ]
    network=Sequential()
    for layer in layers:
        network.add(layer)
    return network
 
#Swedish is the source side and English the target side; 256 units per layer
model=NN_model(swe_words, eng_words, swe_length, eng_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
#summary() prints the table itself and returns None, so wrapping it in print() only adds a stray "None" line
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 29, 256)           1942016   
_________________________________________________________________
lstm_7 (LSTM)                (None, 256)               525312    
_________________________________________________________________
repeat_vector_4 (RepeatVecto (None, 32, 256)           0         
_________________________________________________________________
lstm_8 (LSTM)                (None, 32, 256)           525312    
_________________________________________________________________
time_distributed_4 (TimeDist (None, 32, 5273)          1355161   
Total params: 4,347,801
Trainable params: 4,347,801
Non-trainable params: 0
_________________________________________________________________
None


In [98]:
import os
#NOTE(review): presumably a workaround for a duplicate OpenMP runtime error - confirm it is still needed
os.environ.update({'KMP_DUPLICATE_LIB_OK': 'True'})

In [101]:
#file the checkpoint callback writes the model to
filename='model1.h5'
#save only when the validation loss improves on the best value seen so far
checkpoint=ModelCheckpoint(
    filename,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)
#20% of the training data is held back for validation
model.fit(
    x_train,
    y_train,
    epochs=30,
    batch_size=64,
    validation_split=0.2,
    callbacks=[checkpoint],
    verbose=2,
)

Train on 13624 samples, validate on 3406 samples
Epoch 1/30
 - 486s - loss: 0.8507 - accuracy: 0.8622 - val_loss: 0.9293 - val_accuracy: 0.8577

Epoch 00001: val_loss improved from inf to 0.92930, saving model to model1.h5
Epoch 2/30
 - 465s - loss: 0.8223 - accuracy: 0.8653 - val_loss: 0.9112 - val_accuracy: 0.8601

Epoch 00002: val_loss improved from 0.92930 to 0.91121, saving model to model1.h5
Epoch 3/30
 - 467s - loss: 0.7903 - accuracy: 0.8695 - val_loss: 0.8909 - val_accuracy: 0.8651

Epoch 00003: val_loss improved from 0.91121 to 0.89092, saving model to model1.h5
Epoch 4/30
 - 547s - loss: 0.7584 - accuracy: 0.8737 - val_loss: 0.8647 - val_accuracy: 0.8688

Epoch 00004: val_loss improved from 0.89092 to 0.86471, saving model to model1.h5
Epoch 5/30
 - 484s - loss: 0.7251 - accuracy: 0.8777 - val_loss: 0.8471 - val_accuracy: 0.8711

Epoch 00005: val_loss improved from 0.86471 to 0.84712, saving model to model1.h5
Epoch 6/30
 - 466s - loss: 0.6931 - accuracy: 0.8812 - val_loss: 

<keras.callbacks.callbacks.History at 0x7ff07de48128>

In [102]:
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu

#map encodings to words
def word_for_id(integer, tokenizer):
    """Return the first word in tokenizer.word_index whose index equals integer, or None if there is none."""
    matches=(word for word, index in tokenizer.word_index.items() if index==integer)
    return next(matches, None)
 
#generate target sequence given source sequence
def predict_sequence(model, tokenizer, source):
    """Predict a sentence for one encoded source sequence.

    Args:
        model: object whose predict(source, verbose=0) output is indexed at [0].
        tokenizer: tokenizer handed to word_for_id to decode each position.
        source: the input passed to model.predict.

    Returns:
        The decoded words joined by single spaces. Decoding stops at the
        first position whose most likely index has no word.
    """
    step_scores=model.predict(source, verbose=0)[0]
    words=[]
    for scores in step_scores:
        #most likely vocabulary index at this position
        token=word_for_id(np.argmax(scores), tokenizer)
        if token is None:
            break
        words.append(token)
    return ' '.join(words)
 
#evaluate the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
    """Translate every encoded source sentence and report corpus BLEU scores.

    Args:
        model: trained model handed to predict_sequence.
        tokenizer: target-language tokenizer used to decode the predictions.
        sources: array of encoded source sentences, one row per sentence.
        raw_dataset: indexable of (target_text, source_text) pairs aligned
            with sources.

    Prints the first ten translations followed by the BLEU-1 to BLEU-4 scores.
    """
    actual, predicted=[], []
    for i, source in enumerate(sources):
        #reshape the single sequence to (1, length) before predicting
        source=source.reshape((1, source.shape[0]))
        #decode with the tokenizer that was passed in rather than a module-level global
        translation=predict_sequence(model, tokenizer, source)
        raw_target, raw_src=raw_dataset[i]
        if i<10:
            print('Swedish: %s, English: %s, NN: %s' % (raw_src, raw_target, translation))
            print()
        #corpus_bleu expects a list of reference token lists for every hypothesis
        actual.append([raw_target.split()])
        predicted.append(translation.split())
    #calculate BLEU score
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    #the n-gram weights must sum to 1; (0.3, 0.3, 0.3) sums to 0.9 and inflates the score
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(1.0/3, 1.0/3, 1.0/3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))
 

#reload the model stored in the checkpoint file
model=load_model('model1.h5')
#translate the held-out Swedish sentences and score them against the English references
evaluate_model(model=model, tokenizer=eng_tokens, sources=x_test, raw_dataset=test)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Swedish: jag har ett barn, English: i have a kid, NN: i have a book

Swedish: mary satte några blommor i vasen och ställde sedan vasen på bordet, English: mary put some flowers in the vase and then put the vase on the table, NN: he put some some the to to to the the

Swedish: är det blått, English: is it blue, NN: is it

Swedish: hade tom sett något skulle han ha sagt det till oss, English: if tom had seen anything he wouldve told us, NN: has tom tom tom seen tom tom tom to

Swedish: är ni två fulla, English: are you both drunk, NN: are you are tonight

Swedish: tom köpte en lädersoffa, English: tom bought a leather couch, NN: tom bought a speeding

Swedish: vill du ha fisk, English: do you want fish, NN: do you want food

Swedish: som jag sa jag var upptagen, English: like i said i was busy, NN: i i think i was was

Swedish: köp lite godis åt tom bara, English: just buy tom some candy, NN: dont give tom some dog

Swedish: har tom kabeltv, English: does tom have cable, NN: has tom been