In [None]:
import requests
# scr_file = 'https://gist.githubusercontent.com/michelkana/b9ab95e1e44ee297e51ef163dd7302ca/raw/021fc91ceb22ee154ea2b8cb98eb3303c0e22184/yemba_english_sentences.txt'
# scr_file = 'https://raw.githubusercontent.com/puddinator/SearchLah-/main/dataset.txt'

# Download the English side of the parallel corpus and keep the first 1350 lines.
scr_file_en = 'https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/train.en'
# A timeout stops the cell from hanging forever on a stalled connection.
req_en = requests.get(scr_file_en, timeout=60)
# Fail loudly on HTTP errors instead of treating an error page as corpus text.
req_en.raise_for_status()
# Decode explicitly as UTF-8 so the text does not depend on the encoding
# requests guesses from the response headers.
req_en.encoding = 'utf-8'
text_en = req_en.text
# convert to list
lines_en = text_en.strip().split('\n')[:1350]

# Download the Vietnamese side the same way; line i pairs with line i above.
scr_file_vi = 'https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/train.vi'
req_vi = requests.get(scr_file_vi, timeout=60)
req_vi.raise_for_status()
req_vi.encoding = 'utf-8'
text_vi = req_vi.text
# convert to list
lines_vi = text_vi.strip().split('\n')[:1350]

# Each pair is [english_sentence, vietnamese_sentence]: column 0 is English,
# column 1 is Vietnamese.
pairs = [list(x) for x in zip(lines_en, lines_vi)]
# Show one example pair as the cell output.
pairs[1]

['In 4 minutes , atmospheric chemist Rachel Pike provides a glimpse of the massive scientific effort behind the bold headlines on climate change , with her team -- one of thousands who contributed -- taking a risky flight over the rainforest in pursuit of data on a key molecule .',
 'Trong 4 phút , chuyên gia hoá học khí quyển Rachel Pike giới thiệu sơ lược về những nỗ lực khoa học miệt mài đằng sau những tiêu đề táo bạo về biến đổi khí hậu , cùng với đoàn nghiên cứu của mình -- hàng ngàn người đã cống hiến cho dự án này -- một chuyến bay mạo hiểm qua rừng già để tìm kiếm thông tin về một phân tử then chốt .']

In [None]:
from keras.preprocessing.text import Tokenizer
import numpy as np

# Convert the list of two-element [english, vietnamese] pairs into a 2-D array
# so a whole language column can be selected with pairs[:, 0] / pairs[:, 1].
pairs = np.array(pairs)

# fit a tokenizer
def create_tokenizer(lines):
    """Return a Keras ``Tokenizer`` fitted on the given sentences.

    Args:
        lines: iterable of sentence strings used to build the vocabulary.

    Returns:
        The ``Tokenizer`` instance after ``fit_on_texts(lines)`` has been called.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

# max sentence length
def max_length(lines):
    """Return the largest whitespace-separated token count among ``lines``.

    Args:
        lines: iterable of sentence strings.

    Returns:
        The maximum of ``len(line.split())`` over all lines, or 0 when
        ``lines`` is empty (a bare ``max()`` would raise ``ValueError``).
    """
    return max((len(line.split()) for line in lines), default=0)


# NOTE: the variable prefixes below are historical and do not match the data.
#   en_* is built from pairs[:, 1], the Vietnamese column (model input).
#   yb_* is built from pairs[:, 0], the English column (model output).
# The names are kept because later cells reference them; only the printed
# labels are corrected to describe what is actually measured.
en_tokenizer = create_tokenizer(pairs[:, 1])
# +1 because tokenizer indices start at 1 and 0 is used for padding.
en_vocab_size = len(en_tokenizer.word_index) + 1
en_length = max_length(pairs[:, 1])
print('Source (Vietnamese) Vocabulary Size: %d' % en_vocab_size)
print('Source (Vietnamese) Max Length: %d' % (en_length))

yb_tokenizer = create_tokenizer(pairs[:, 0])
yb_vocab_size = len(yb_tokenizer.word_index) + 1
yb_length = max_length(pairs[:, 0])
print('Target (English) Vocabulary Size: %d' % yb_vocab_size)
print('Target (English) Max Length: %d' % (yb_length))

English Vocabulary Size: 2144
English Max Length: 153
Yemba Vocabulary Size: 3348
Yemba Max Length: 112


In [None]:
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    """Turn sentences into fixed-width integer sequences.

    Args:
        tokenizer: fitted tokenizer providing ``texts_to_sequences``.
        length: width passed to ``pad_sequences`` as ``maxlen``.
        lines: iterable of sentence strings to encode.

    Returns:
        The result of ``pad_sequences`` on the integer-encoded sentences,
        zero-padded at the end (``padding='post'``).
    """
    token_ids = tokenizer.texts_to_sequences(lines)
    return pad_sequences(token_ids, maxlen=length, padding='post')

# one hot encode target sequence
def encode_output(sequences, vocab_size):
    """One-hot encode every token id of every target sequence.

    Args:
        sequences: 2-D array of integer token ids (rows are sentences).
        vocab_size: number of classes for the one-hot encoding.

    Returns:
        Array of shape ``(sequences.shape[0], sequences.shape[1], vocab_size)``.
    """
    one_hot_rows = [
        to_categorical(row, num_classes=vocab_size) for row in sequences
    ]
    n_rows, n_steps = sequences.shape[0], sequences.shape[1]
    return np.array(one_hot_rows).reshape(n_rows, n_steps, vocab_size)

# shuffle data
# Fixed seed so the shuffle, and with it the train/test split and every
# metric reported below, is the same on each Restart & Run All.
SPLIT_SEED = 42
# Number of shuffled pairs used for training; the remainder is held out.
TRAIN_SIZE = 1300

# np.array() copies, so shuffling `dataset` in place leaves `pairs` untouched.
dataset = np.array(pairs)
# A local Generator avoids mutating NumPy's global random state; shuffle()
# reorders whole rows, so each sentence stays with its translation.
np.random.default_rng(SPLIT_SEED).shuffle(dataset)
train, test = dataset[:TRAIN_SIZE, :], dataset[TRAIN_SIZE:, :]

# prepare training data
# Column 1 is the model input, column 0 the model output (see tokenizer setup).
trainX = encode_sequences(en_tokenizer, en_length, train[:, 1])
trainY = encode_sequences(yb_tokenizer, yb_length, train[:, 0])
trainY = encode_output(trainY, yb_vocab_size)

# prepare validation data
testX = encode_sequences(en_tokenizer, en_length, test[:, 1])
testY = encode_sequences(yb_tokenizer, yb_length, test[:, 0])
testY = encode_output(testY, yb_vocab_size)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import RepeatVector
from tensorflow.keras.layers import TimeDistributed

# define seq2seq model
def define_model(src_vocab, tar_vocab, source_steps, target_steps,
                 embedding_dim):
    """Build, compile and summarise the encoder-decoder LSTM network.

    Args:
        src_vocab: input vocabulary size for the embedding layer.
        tar_vocab: output vocabulary size (width of the softmax layer).
        source_steps: input sequence length given to the embedding layer.
        target_steps: number of times the encoder state is repeated, i.e.
            the length of the generated sequence.
        embedding_dim: embedding width, also used as the unit count of
            both LSTM layers.

    Returns:
        The compiled ``Sequential`` model (its summary is printed first).
    """
    # Encoder: embed the tokens, compress the sentence into a single vector,
    # then repeat that vector once per output time step.
    encoder_layers = [
        Embedding(src_vocab, embedding_dim,
                  input_length=source_steps, mask_zero=True),
        LSTM(embedding_dim),
        RepeatVector(target_steps),
    ]
    # Decoder: emit one softmax distribution over the target vocabulary
    # per time step.
    decoder_layers = [
        LSTM(embedding_dim, return_sequences=True),
        TimeDistributed(Dense(tar_vocab, activation='softmax')),
    ]
    seq2seq = Sequential(encoder_layers + decoder_layers)
    seq2seq.compile(optimizer='adam', loss='categorical_crossentropy')
    seq2seq.summary()
    return seq2seq

# Instantiate the network: the en_* values describe the model input side and
# the yb_* values the output side; 256 is the embedding / LSTM width.
model = define_model(
    src_vocab=en_vocab_size,
    tar_vocab=yb_vocab_size,
    source_steps=en_length,
    target_steps=yb_length,
    embedding_dim=256,
)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 153, 256)          548864    
                                                                 
 lstm (LSTM)                 (None, 256)               525312    
                                                                 
 repeat_vector (RepeatVector  (None, 112, 256)         0         
 )                                                               
                                                                 
 lstm_1 (LSTM)               (None, 112, 256)          525312    
                                                                 
 time_distributed (TimeDistr  (None, 112, 3348)        860436    
 ibuted)                                                         
                                                                 
Total params: 2,459,924
Trainable params: 2,459,924
Non-

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint

# Write the model to disk only when the validation loss reaches a new minimum.
checkpoint = ModelCheckpoint(
    'model_en_yb.h5',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
# Train for 100 epochs, using the held-out split as validation data.
history = model.fit(
    trainX,
    trainY,
    validation_data=(testX, testY),
    epochs=100,
    batch_size=16,
    callbacks=[checkpoint],
    verbose=2,
)

Epoch 1/100

Epoch 1: val_loss improved from inf to 1.08380, saving model to model_en_yb.h5
82/82 - 15s - loss: 2.1498 - val_loss: 1.0838 - 15s/epoch - 179ms/step
Epoch 2/100

Epoch 2: val_loss did not improve from 1.08380
82/82 - 3s - loss: 1.1995 - val_loss: 1.2142 - 3s/epoch - 34ms/step
Epoch 3/100

Epoch 3: val_loss did not improve from 1.08380
82/82 - 3s - loss: 1.1455 - val_loss: 1.1637 - 3s/epoch - 34ms/step
Epoch 4/100

Epoch 4: val_loss did not improve from 1.08380
82/82 - 3s - loss: 1.1079 - val_loss: 1.1417 - 3s/epoch - 34ms/step
Epoch 5/100

Epoch 5: val_loss improved from 1.08380 to 0.98008, saving model to model_en_yb.h5
82/82 - 3s - loss: 1.2289 - val_loss: 0.9801 - 3s/epoch - 35ms/step
Epoch 6/100

Epoch 6: val_loss improved from 0.98008 to 0.95047, saving model to model_en_yb.h5
82/82 - 3s - loss: 1.0568 - val_loss: 0.9505 - 3s/epoch - 35ms/step
Epoch 7/100

Epoch 7: val_loss did not improve from 0.95047
82/82 - 3s - loss: 1.0312 - val_loss: 1.1798 - 3s/epoch - 34ms/st

In [None]:
from numpy import argmax
from nltk.translate.bleu_score import corpus_bleu

# map an integer to a word
def word_for_id(integer, tokenizer):
    """Map a vocabulary index back to its word.

    Args:
        integer: token id to look up.
        tokenizer: object exposing a ``word_index`` dict (word -> id) and,
            optionally, an ``index_word`` dict (id -> word).

    Returns:
        The matching word, or ``None`` when the id is not in the vocabulary
        (for example the padding id 0).
    """
    # Prefer the tokenizer's reverse map: a single dict lookup instead of a
    # scan over the whole vocabulary for every decoded token.
    index_word = getattr(tokenizer, 'index_word', None)
    if index_word:
        return index_word.get(integer)
    # Fallback for tokenizer-like objects that only expose word_index.
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None
  
# generate target given source sequence
def predict_sequence(model, tokenizer, source):
    """Greedy-decode the model output for one encoded source sentence.

    Args:
        model: object whose ``predict(source, verbose=0)`` returns a batch
            of per-step score vectors; only the first item is used.
        tokenizer: tokenizer used to turn predicted ids back into words.
        source: encoded input passed straight to ``model.predict``.

    Returns:
        The decoded words joined by single spaces. Decoding stops at the
        first id that has no word in the tokenizer.
    """
    step_scores = model.predict(source, verbose=0)[0]
    words = []
    # Take the highest-scoring id at each time step.
    for token_id in map(argmax, step_scores):
        decoded = word_for_id(token_id, tokenizer)
        if decoded is None:
            break
        words.append(decoded)
    return ' '.join(words)

In [None]:
# evaluate the model
def evaluate_model(model, sources, raw_dataset, tokenizer=None):
    """Translate every encoded source sentence and print corpus BLEU scores.

    Args:
        model: trained model handed to ``predict_sequence``.
        sources: 2-D array of encoded source sentences, one per row.
        raw_dataset: sequence of ``(raw_target, raw_source)`` string pairs
            aligned row-for-row with ``sources``.
        tokenizer: tokenizer used to decode predictions; defaults to the
            notebook-level ``yb_tokenizer`` (the previous hard-coded choice).

    Prints the first 10 translations followed by BLEU-1, BLEU-2 and BLEU-3.
    """
    if tokenizer is None:
        tokenizer = yb_tokenizer
    actual, predicted = list(), list()
    for i, source in enumerate(sources):
        # translate encoded source text (model expects a batch dimension)
        source = source.reshape((1, source.shape[0]))
        translation = predict_sequence(model, tokenizer, source)
        raw_target, raw_src = raw_dataset[i]
        if i < 10:
            print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
        # corpus_bleu expects, for each hypothesis, a LIST of reference
        # translations (each a token list). Appending the bare token list
        # made NLTK treat every word as a separate reference.
        actual.append([raw_target.split()])
        predicted.append(translation.split())
    # calculate BLEU score
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    # Uniform 1..3-gram weights must sum to 1 (0.3 * 3 only summed to 0.9).
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(1 / 3, 1 / 3, 1 / 3, 0)))
    # print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))
	
# Score the trained model on the held-out split.
evaluate_model(model=model, sources=testX, raw_dataset=test)

src=[Nhưng dù sao , đây là -- thực phẩm , tại trung tâm của cuộc sống , trung tâm của cuộc sống gia đình , đang được bày ra , thưởng thức , mọi người dành thời gian cho chúng .], target=[But anyway , this is -- it &apos;s food at the center of life , at the center of family life , being celebrated , being enjoyed , people taking time for it .], predicted=[and the the the the the the the the the the the the the the the the the the the the]
src=[Các nhà khoa học phải được thắt chặt hoàn toàn vào ghế để có thể thực hiện đo đạc trên máy bay .], target=[And the scientists have to be completely harnessed in in order to make measurements while they &apos;re on board .], predicted=[and is the fraction the unfortunately older older most recycled and members grandparents]
src=[Nó chỉ cần tuân thủ những quy tắc an toàn .], target=[It just has to follow safety guidelines .], predicted=[it it s emotion emotion]
src=[Bao nhiêu người trong số bạn đã giành chiến thắng ?], target=[How many of you won j

KeyError: ignored