In [None]:
import string
import re
from pickle import dump
from unicodedata import normalize
from numpy import array

# load doc into memory

def load_doc(filename):
	# open the file as read only
	file = open(filename, mode='rt', encoding='utf-8')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text

# split a loaded document into sentences
def to_pairs(doc):
	lines = doc.strip().split('\n')
	pairs = [line.split('\t') for line in  lines]
	return pairs

# clean a list of lines
def clean_pairs(lines):
	cleaned = list()
	# prepare regex for char filtering

	# prepare translation table for removing punctuation
	table = str.maketrans('', '', string.punctuation)
	for pair in lines:
		clean_pair = list()
		for line in pair:

			# tokenize on white space
			line = line.split()

			# remove punctuation from each token
			line = [word.translate(table) for word in line]

			# store as string
			clean_pair.append(' '.join(line))
		cleaned.append(clean_pair)
	return array(cleaned)

# save a list of clean sentences to file
def save_clean_data(sentences, filename):
	dump(sentences, open(filename, 'wb'))
	print('Saved: %s' % filename)

# load dataset
filename = '/content/kab.txt'
doc = load_doc(filename)
# split into english-german pairs
pairs = to_pairs(doc)
# clean sentences
clean_pairs = clean_pairs(pairs)
# save clean pairs to file
save_clean_data(clean_pairs, 'english-kab.pkl')
# spot check
for i in range(100):
	print('[%s] => [%s]' % (clean_pairs[i,0], clean_pairs[i,1]))

Saved: english-kab.pkl
[Go] => [Ddu]
[Go] => [Ddut]
[Go] => [Ddumt]
[Go] => [Ruḥ]
[Go] => [Ruḥet]
[Go] => [Ruḥemt]
[Hi] => [Azul ]
[Hi] => [Azul]
[Run] => [Azzel]
[Run] => [Azzlet]
[Run] => [Azzlemt]
[Run] => [Azzel]
[Run] => [Azzelet]
[Run] => [Azzelemt]
[Who] => [Anwa]
[Who] => [Anta]
[Wow] => [Waw]
[Wow] => [Muqel kan]
[Wow] => [Ẓeṛ kan]
[Wow] => [Ihuh]
[Fire] => [Times]
[Fire] => [Jbed]
[Fire] => [Qres]
[Help] => [Annaɣ ]
[Help] => [Abbuh]
[Hide] => [Ffer]
[Hide] => [Ffret]
[Hide] => [Ffremt]
[Jump] => [Neggez]
[Jump] => [Nteg]
[Jump] => [Neggez]
[Jump] => [Nṭeg]
[Jump] => [Nṭew]
[Jump] => [Nteg]
[Jump] => [Ǧellbet]
[Jump] => [Ǧellbemt]
[Jump] => [Ǧelleb]
[Stay] => [Qqimemt]
[Stay] => [Qqim]
[Stay] => [Qqimet]
[Stop] => [Ḥbes]
[Stop] => [Bed]
[Wait] => [Rǧu]
[Wait] => [Rǧut]
[Wait] => [Rǧumt]
[Wait] => [Rju]
[Go on] => [Kemmel]
[Go on] => [Kemmlemt]
[Hello] => [Azul]
[Hello] => [Azul]
[Hello] => [Ɛelxiṛ]
[Hello] => [Axir]
[Hello] => [Saḥit]
[Hurry] => [Ɣiwel]
[I ran] => [Ttazzaleɣ]

In [None]:
model.summary()

print(X.shape)

In [None]:
from pickle import load
from pickle import dump
from numpy.random import rand
from numpy.random import shuffle

# load a clean dataset
def load_clean_sentences(filename):
	return load(open(filename, 'rb'))

# save a list of clean sentences to file
def save_clean_data(sentences, filename):
	dump(sentences, open(filename, 'wb'))
	print('Saved: %s' % filename)

# load dataset
raw_dataset = load_clean_sentences('english-kab.pkl')

# reduce dataset size
n_sentences = 20000
dataset = raw_dataset[:n_sentences, :]
# random shuffle
shuffle(dataset)
# split into train/test
train, test = dataset[:18000], dataset[2000:]
# save
save_clean_data(dataset, 'english-kab-both.pkl')
save_clean_data(train, 'english-kab-train.pkl')
save_clean_data(test, 'english-kab-test.pkl')

Saved: english-kab-both.pkl
Saved: english-kab-train.pkl
Saved: english-kab-test.pkl


In [None]:
from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint

# load a clean dataset
def load_clean_sentences(filename):
	return load(open(filename, 'rb'))

# fit a tokenizer
def create_tokenizer(lines):
	tokenizer = Tokenizer()
	tokenizer.fit_on_texts(lines)
	return tokenizer

# max sentence length
def max_length(lines):
	return max(len(line.split()) for line in lines)

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
	# integer encode sequences
	X = tokenizer.texts_to_sequences(lines)
	# pad sequences with 0 values
	X = pad_sequences(X, maxlen=length, padding='post')
	return X

# one hot encode target sequence
def encode_output(sequences, vocab_size):
	ylist = list()
	for sequence in sequences:
		encoded = to_categorical(sequence, num_classes=vocab_size)
		ylist.append(encoded)
	y = array(ylist)
	y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
	return y

# define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
	model = Sequential()
	model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
	model.add(LSTM(n_units))
	model.add(RepeatVector(tar_timesteps))
	model.add(LSTM(n_units, return_sequences=True))
	model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
	return model

# load datasets
dataset = load_clean_sentences('english-kab-both.pkl')
train = load_clean_sentences('english-kab-train.pkl')
test = load_clean_sentences('english-kab-test.pkl')

# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:,0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))
# prepare german tokenizer
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
print('Kabyle Vocabulary Size: %d' % ger_vocab_size)
print('Kabyle Max Length: %d' % (ger_length))

# prepare training data
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
trainY = encode_output(trainY, eng_vocab_size)
# prepare validation data
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
testY = encode_output(testY, eng_vocab_size)

# define model
model = define_model(ger_vocab_size, eng_vocab_size, ger_length, eng_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')
# summarize defined model
print(model.summary())
plot_model(model, to_file='model.png', show_shapes=True)
# fit model
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
model.fit(trainX, trainY, epochs=100, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint], verbose=2)

English Vocabulary Size: 2970
English Max Length: 8
Kabyle Vocabulary Size: 12230
Kabyle Max Length: 11
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 11, 256)           3130880   
_________________________________________________________________
lstm (LSTM)                  (None, 256)               525312    
_________________________________________________________________
repeat_vector (RepeatVector) (None, 8, 256)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 8, 256)            525312    
_________________________________________________________________
time_distributed (TimeDistri (None, 8, 2970)           763290    
Total params: 4,944,794
Trainable params: 4,944,794
Non-trainable params: 0
_________________________________________________________________
None
Epo

<tensorflow.python.keras.callbacks.History at 0x7f32126ca990>

In [None]:
from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint

# load a clean dataset
def load_clean_sentences(filename):
	return load(open(filename, 'rb'))

# fit a tokenizer
def create_tokenizer(lines):
	tokenizer = Tokenizer()
	tokenizer.fit_on_texts(lines)
	return tokenizer

# max sentence length
def max_length(lines):
	return max(len(line.split()) for line in lines)

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
	# integer encode sequences
	X = tokenizer.texts_to_sequences(lines)
	# pad sequences with 0 values
	X = pad_sequences(X, maxlen=length, padding='post')
	return X

# one hot encode target sequence
def encode_output(sequences, vocab_size):
	ylist = list()
	for sequence in sequences:
		encoded = to_categorical(sequence, num_classes=vocab_size)
		ylist.append(encoded)
	y = array(ylist)
	y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
	return y

# define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
	model = Sequential()
	model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
	model.add(LSTM(n_units))
	model.add(RepeatVector(tar_timesteps))
	model.add(LSTM(n_units, return_sequences=True))
	model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
	return model

# load datasets
dataset = load_clean_sentences('english-kab-both.pkl')
train = load_clean_sentences('english-kab-train.pkl')
test = load_clean_sentences('english-kab-test.pkl')

# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:,0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))
# prepare german tokenizer
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
print('Kabyle Vocabulary Size: %d' % ger_vocab_size)
print('Kabyle Max Length: %d' % (ger_length))

# prepare training data
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
trainY = encode_output(trainY, eng_vocab_size)
# prepare validation data
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
testY = encode_output(testY, eng_vocab_size)

# define model
model = define_model(ger_vocab_size, eng_vocab_size, ger_length, eng_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')
# summarize defined model
print(model.summary())
plot_model(model, to_file='model.png', show_shapes=True)
# fit model
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
model.fit(trainX, trainY, epochs=200, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint], verbose=2)

English Vocabulary Size: 2970
English Max Length: 8
Kabyle Vocabulary Size: 12230
Kabyle Max Length: 11
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 11, 256)           3130880   
_________________________________________________________________
lstm (LSTM)                  (None, 256)               525312    
_________________________________________________________________
repeat_vector (RepeatVector) (None, 8, 256)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 8, 256)            525312    
_________________________________________________________________
time_distributed (TimeDistri (None, 8, 2970)           763290    
Total params: 4,944,794
Trainable params: 4,944,794
Non-trainable params: 0
_________________________________________________________________
None
Epo

<tensorflow.python.keras.callbacks.History at 0x7f0754f727d0>

In [None]:
from pickle import load
from numpy import array
from numpy import argmax
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu

# load a clean dataset
def load_clean_sentences(filename):
	return load(open(filename, 'rb'))

# fit a tokenizer
def create_tokenizer(lines):
	tokenizer = Tokenizer()
	tokenizer.fit_on_texts(lines)
	return tokenizer

# max sentence length
def max_length(lines):
	return max(len(line.split()) for line in lines)

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
	# integer encode sequences
	X = tokenizer.texts_to_sequences(lines)
	# pad sequences with 0 values
	X = pad_sequences(X, maxlen=length, padding='post')
	return X

# map an integer to a word
def word_for_id(integer, tokenizer):
	for word, index in tokenizer.word_index.items():
		if index == integer:
			return word
	return None

# generate target given source sequence
def predict_sequence(model, tokenizer, source):
	prediction = model.predict(source, verbose=0)[0]
	integers = [argmax(vector) for vector in prediction]
	target = list()
	for i in integers:
		word = word_for_id(i, tokenizer)
		if word is None:
			break
		target.append(word)
	return ' '.join(target)

# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
	actual, predicted = list(), list()
	for i, source in enumerate(sources):
		# translate encoded source text
		source = source.reshape((1, source.shape[0]))
		translation = predict_sequence(model, eng_tokenizer, source)
		raw_target, raw_src,test = raw_dataset[i]
		if i < 10:
			print('Source=[%s], Target=[%s], Predicted=[%s]' % (raw_src, raw_target, translation))
		actual.append([raw_target.split()])
		predicted.append(translation.split())
	# calculate BLEU score
	print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
	print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
	print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
	print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

# load datasets
dataset = load_clean_sentences('english-kab-both.pkl')
train = load_clean_sentences('english-kab-train.pkl')
test = load_clean_sentences('english-kab-test.pkl')
# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
# prepare german tokenizer
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
# prepare data
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])

# load model
model = load_model('model.h5')
# test on some training sequences
print('Train')
evaluate_model(model, eng_tokenizer, trainX, train)
# test on some test sequences
print('Test')
evaluate_model(model, eng_tokenizer, testX, test)

Train
Source=[Tella kra n yiwet i sisawlen i Tom], Target=[Someone called Tom], Predicted=[someone called tom]
Source=[Tom iṛuḥ d Mary], Target=[Tom went with Mary], Predicted=[tom went with mary]
Source=[Ddmemt wigi], Target=[Take these], Predicted=[take these]
Source=[Kcemd di leɛnayak], Target=[Please get in], Predicted=[please get in]
Source=[Yella win i yellan deg uxxam], Target=[Anyone home], Predicted=[anyone home]
Source=[Yečča Tom s usixef], Target=[Tom ate quickly], Predicted=[tom ate quickly]
Source=[Ḥbesit], Target=[Stop it], Predicted=[make it stop]
Source=[Ssenqes tazzla], Target=[Slow down], Predicted=[slow down]
Source=[D lɛib fellak], Target=[Shame on you], Predicted=[shame on you]
Source=[Wali axxaminna], Target=[Look at that house], Predicted=[look at that house]
BLEU-1: 0.610482
BLEU-2: 0.509154
BLEU-3: 0.392476
BLEU-4: 0.168266
Test
Source=[Fakkikent imenɣi], Target=[Stop arguing], Predicted=[stop arguing]
Source=[Ttmeslayent tafṛansist], Target=[They spoke French]

In [None]:
from pickle import load
from numpy import array
from numpy import argmax
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu

# load a clean dataset
def load_clean_sentences(filename):
	return load(open(filename, 'rb'))

# fit a tokenizer
def create_tokenizer(lines):
	tokenizer = Tokenizer()
	tokenizer.fit_on_texts(lines)
	return tokenizer

# max sentence length
def max_length(lines):
	return max(len(line.split()) for line in lines)

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
	# integer encode sequences
	X = tokenizer.texts_to_sequences(lines)
	# pad sequences with 0 values
	X = pad_sequences(X, maxlen=length, padding='post')
	return X

# map an integer to a word
def word_for_id(integer, tokenizer):
	for word, index in tokenizer.word_index.items():
		if index == integer:
			return word
	return None

# generate target given source sequence
def predict_sequence(model, tokenizer, source):
	prediction = model.predict(source, verbose=0)[0]
	integers = [argmax(vector) for vector in prediction]
	target = list()
	for i in integers:
		word = word_for_id(i, tokenizer)
		if word is None:
			break
		target.append(word)
	return ' '.join(target)

# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
	actual, predicted = list(), list()
	for i, source in enumerate(sources):
		# translate encoded source text
		source = source.reshape((1, source.shape[0]))
		translation = predict_sequence(model, eng_tokenizer, source)
		raw_target, raw_src,test = raw_dataset[i]
		if i < 10:
			print('Source=[%s], Target=[%s], Predicted=[%s]' % (raw_src, raw_target, translation))
		actual.append([raw_target.split()])
		predicted.append(translation.split())
	# calculate BLEU score
	print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
	print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
	print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
	print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

# load datasets
dataset = load_clean_sentences('english-kab-both.pkl')
train = load_clean_sentences('english-kab-train.pkl')
test = load_clean_sentences('english-kab-test.pkl')
# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
# prepare german tokenizer
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
# prepare data
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])

# load model
model = load_model('model.h5')
# test on some training sequences
print('Train')
evaluate_model(model, eng_tokenizer, trainX, train)
# test on some test sequences
print('Test')
evaluate_model(model, eng_tokenizer, testX, test)

Train
Source=[Meslayem s leɛqel], Target=[Talk slower], Predicted=[talk slower]
Source=[N Japu wina], Target=[Is he Japanese], Predicted=[is he japanese]
Source=[Ɣef lǧalim i nxeddem], Target=[Were working for you], Predicted=[were working for you]
Source=[Sew yidi kra], Target=[Have a drink with me], Predicted=[have a drink with me]
Source=[Tixeṛas], Target=[Just leave it], Predicted=[just leave it]
Source=[Ḥemmleɣ tikli], Target=[I like walking], Predicted=[i like walking]
Source=[Aqlaɣ da merra], Target=[Were all here], Predicted=[were all here]
Source=[Ttɛasan meṛṛa], Target=[Everybody waited], Predicted=[everybody waited]
Source=[Ḥwaǧeɣ ad ččeɣ tura], Target=[I need to eat now], Predicted=[i need to eat now]
Source=[Tḥeṣlemt], Target=[Are you stuck], Predicted=[are you stuck]
BLEU-1: 0.655715
BLEU-2: 0.580471
BLEU-3: 0.512210
BLEU-4: 0.339974
Test
Source=[Sirdemt udmawennkent], Target=[Wash your face], Predicted=[wash your face]
Source=[Tom yebɣa mraw d snat n tmellalin], Target=[

In [None]:
from pickle import load
from numpy import array
from numpy import argmax
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu

# load a clean dataset
def load_clean_sentences(filename):
	return load(open(filename, 'rb'))

# fit a tokenizer
def create_tokenizer(lines):
	tokenizer = Tokenizer()
	tokenizer.fit_on_texts(lines)
	return tokenizer

# max sentence length
def max_length(lines):
	return max(len(line.split()) for line in lines)

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
	# integer encode sequences
	X = tokenizer.texts_to_sequences(lines)
	# pad sequences with 0 values
	X = pad_sequences(X, maxlen=length, padding='post')
	return X

# map an integer to a word
def word_for_id(integer, tokenizer):
	for word, index in tokenizer.word_index.items():
		if index == integer:
			return word
	return None

# generate target given source sequence
def predict_sequence(model, tokenizer, source):
	prediction = model.predict(source, verbose=0)[0]
	integers = [argmax(vector) for vector in prediction]
	target = list()
	for i in integers:
		word = word_for_id(i, tokenizer)
		if word is None:
			break
		target.append(word)
	return ' '.join(target)

# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
	actual, predicted = list(), list()
	for i, source in enumerate(sources):
		# translate encoded source text
		source = source.reshape((1, source.shape[0]))
		translation = predict_sequence(model, eng_tokenizer, source)
		raw_target, raw_src,test = raw_dataset[i]
		if i < 10:
			print('Source=[%s], Target=[%s], Predicted=[%s]' % (raw_src, raw_target, translation))
		actual.append([raw_target.split()])
		predicted.append(translation.split())
	# calculate BLEU score
	print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
	print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
	print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
	print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

# load datasets
dataset = load_clean_sentences('english-kab-both.pkl')
train = load_clean_sentences('english-kab-train.pkl')
test = load_clean_sentences('english-kab-test.pkl')
# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
# prepare german tokenizer
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
# prepare data
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])

# load model
model = load_model('model.h5')
# test on some training sequences
print('Train')
evaluate_model(model, eng_tokenizer, trainX, train)
# test on some test sequences
print('Test')
evaluate_model(model, eng_tokenizer, testX, test)

Train
Source=[Ddeqs aya ur ttruɣ ara], Target=[I havent cried in a while], Predicted=[i havent cried in a while]
Source=[Tεummeḍ aṭas], Target=[Did you swim much], Predicted=[did you swim much]
Source=[Aya mačči d aylak], Target=[This isnt for you], Predicted=[this isnt for you]
Source=[Teɛǧebas Boston i Tom], Target=[Tom loved Boston], Predicted=[tom enjoyed boston]
Source=[I yelha wassa], Target=[What a lovely day], Predicted=[what a lovely day]
Source=[Ur ugadeɣ ara maḍi], Target=[Im not afraid at all], Predicted=[im not afraid at all]
Source=[Qebleɣ assumernwen], Target=[I accept your offer], Predicted=[i accept your offer]
Source=[Tella tin i yellan deg uxxam], Target=[Anyone home], Predicted=[somebody home in the]
Source=[Yexṣeṛ lḥal], Target=[Its cloudy], Predicted=[the weather is terrible]
Source=[Ayɣer i tebɣamt akk annecta], Target=[Why do you want all that], Predicted=[why do you want all these]
BLEU-1: 0.686607
BLEU-2: 0.623809
BLEU-3: 0.576840
BLEU-4: 0.437631
Test
Source=

In [None]:
from pickle import load
from numpy import array
from numpy import argmax
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu

# load a clean dataset
def load_clean_sentences(filename):
	return load(open(filename, 'rb'))

# fit a tokenizer
def create_tokenizer(lines):
	tokenizer = Tokenizer()
	tokenizer.fit_on_texts(lines)
	return tokenizer

# max sentence length
def max_length(lines):
	return max(len(line.split()) for line in lines)

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
	# integer encode sequences
	X = tokenizer.texts_to_sequences(lines)
	# pad sequences with 0 values
	X = pad_sequences(X, maxlen=length, padding='post')
	return X

# map an integer to a word
def word_for_id(integer, tokenizer):
	for word, index in tokenizer.word_index.items():
		if index == integer:
			return word
	return None

# generate target given source sequence
def predict_sequence(model, tokenizer, source):
	prediction = model.predict(source, verbose=0)[0]
	integers = [argmax(vector) for vector in prediction]
	target = list()
	for i in integers:
		word = word_for_id(i, tokenizer)
		if word is None:
			break
		target.append(word)
	return ' '.join(target)

# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
	actual, predicted = list(), list()
	for i, source in enumerate(sources):
		# translate encoded source text
		source = source.reshape((1, source.shape[0]))
		translation = predict_sequence(model, eng_tokenizer, source)
		raw_target, raw_src,test = raw_dataset[i]
		if i < 100:
			print('Source=[%s], Target=[%s], Predicted=[%s]' % (raw_src, raw_target, translation))
		actual.append([raw_target.split()])
		predicted.append(translation.split())
	# calculate BLEU score
	print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
	print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
	print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
	print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

# load datasets
dataset = load_clean_sentences('english-kab-both.pkl')
train = load_clean_sentences('english-kab-train.pkl')
test = load_clean_sentences('english-kab-test.pkl')
# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
# prepare german tokenizer
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
# prepare data
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])

# load model
model = load_model('model.h5')
# test on some training sequences
print('Train')
evaluate_model(model, eng_tokenizer, trainX, train)
# test on some test sequences
print('Test')
evaluate_model(model, eng_tokenizer, testX, test)

Train
Source=[Ddeqs aya ur ttruɣ ara], Target=[I havent cried in a while], Predicted=[i havent cried in a while]
Source=[Tεummeḍ aṭas], Target=[Did you swim much], Predicted=[did you swim much]
Source=[Aya mačči d aylak], Target=[This isnt for you], Predicted=[this isnt for you]
Source=[Teɛǧebas Boston i Tom], Target=[Tom loved Boston], Predicted=[tom enjoyed boston]
Source=[I yelha wassa], Target=[What a lovely day], Predicted=[what a lovely day]
Source=[Ur ugadeɣ ara maḍi], Target=[Im not afraid at all], Predicted=[im not afraid at all]
Source=[Qebleɣ assumernwen], Target=[I accept your offer], Predicted=[i accept your offer]
Source=[Tella tin i yellan deg uxxam], Target=[Anyone home], Predicted=[somebody home in the]
Source=[Yexṣeṛ lḥal], Target=[Its cloudy], Predicted=[the weather is terrible]
Source=[Ayɣer i tebɣamt akk annecta], Target=[Why do you want all that], Predicted=[why do you want all these]
Source=[D tiktim tinna], Target=[Is that your own idea], Predicted=[is that your

In [None]:
from pickle import load
from numpy import array
from numpy import argmax
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu

# load a clean dataset
def load_clean_sentences(filename):
	return load(open(filename, 'rb'))

# fit a tokenizer
def create_tokenizer(lines):
	tokenizer = Tokenizer()
	tokenizer.fit_on_texts(lines)
	return tokenizer

# max sentence length
def max_length(lines):
	return max(len(line.split()) for line in lines)

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
	# integer encode sequences
	X = tokenizer.texts_to_sequences(lines)
	# pad sequences with 0 values
	X = pad_sequences(X, maxlen=length, padding='post')
	return X

# map an integer to a word
def word_for_id(integer, tokenizer):
	for word, index in tokenizer.word_index.items():
		if index == integer:
			return word
	return None

# generate target given source sequence
def predict_sequence(model, tokenizer, source):
	prediction = model.predict(source, verbose=0)[0]
	integers = [argmax(vector) for vector in prediction]
	target = list()
	for i in integers:
		word = word_for_id(i, tokenizer)
		if word is None:
			break
		target.append(word)
	return ' '.join(target)

# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
	actual, predicted = list(), list()
	for i, source in enumerate(sources):
		# translate encoded source text
		source = source.reshape((1, source.shape[0]))
		translation = predict_sequence(model, eng_tokenizer, source)
		raw_target, raw_src,test = raw_dataset[i]
		if i < 10:
			print('Source=[%s], Target=[%s], Predicted=[%s]' % (raw_src, raw_target, translation))
		actual.append([raw_target.split()])
		predicted.append(translation.split())
	# calculate BLEU score
	print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
	print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
	print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
	print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

# load datasets
dataset = load_clean_sentences('english-kab-both.pkl')
train = load_clean_sentences('english-kab-train.pkl')
test = load_clean_sentences('english-kab-test.pkl')
# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
# prepare german tokenizer
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
# prepare data
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])

# load model
model = load_model('model.h5')
# test on some training sequences
print('Train')
evaluate_model(model, eng_tokenizer, trainX, train)
# test on some test sequences
print('Test')
evaluate_model(model, eng_tokenizer, testX, test)

Train
Source=[Tibḥirtiw d tamecṭuḥt], Target=[My garden is small], Predicted=[my garden is small]
Source=[Ṛeqqeɛ waki], Target=[Fix this], Predicted=[fix this]
Source=[Ttuɣ kullec ɣef waya], Target=[I forgot all about that], Predicted=[i forgot all about that]
Source=[Tameṭṭut n Tom d mmtismin], Target=[Tom has a jealous wife], Predicted=[tom has a jealous wife]
Source=[Ad mdyecnu Tom], Target=[Tom will sing for you], Predicted=[tom will sing for you]
Source=[Andat ttbut], Target=[Wheres the proof], Predicted=[wheres the proof]
Source=[D acu i nezmer ad kenttid nexdem], Target=[What can we do for you], Predicted=[what can we do for you]
Source=[Amek i sqqaṛen i babam], Target=[Whats your dads name], Predicted=[whats your dads name]
Source=[Ur ttɛaqabet ara Tom ɣef ayen], Target=[Dont punish Tom for that], Predicted=[dont punish tom for that]
Source=[Xedmeɣam kullec], Target=[I did everything for you], Predicted=[i did everything for you]
BLEU-1: 0.686006
BLEU-2: 0.623655
BLEU-3: 0.5774