In [54]:
import collections

import numpy as np
from keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional
from keras.layers import Embedding
from keras.losses import sparse_categorical_crossentropy
from keras.models import Model, Sequential
from keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

import helper

In [56]:
# Mount Google Drive so the competition data is reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
# Switch the working directory to the competition folder; later cells open
# the training files with paths relative to it.
%cd /content/drive/MyDrive/SLU Semesters/SLU 3rd Semester/NLP/Fifth Competition/

# List the folder contents as a quick check that the data directories are visible.
%ls

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/SLU Semesters/SLU 3rd Semester/NLP/Fifth Competition
'Neural Machine Translation.ipynb'   [0m[01;34mtest-05[0m/   [01;34mtrain-05[0m/


In [57]:
# source: https://towardsdatascience.com/neural-machine-translation-with-python-c2f0a34f7dd

def read_data(file_name):
  """Read a text file line by line and group the lines into sentences.

  Every line is stripped of surrounding whitespace. A line equal to '</s>'
  closes the current sentence and is kept as that sentence's last element.
  Lines that follow the final '</s>' (or all lines, when the file contains
  no '</s>' at all) are returned as one trailing group.

  Args:
    file_name: Path of a UTF-8 encoded text file.

  Returns:
    A list of sentences, each a list of stripped line strings. An empty
    file yields an empty list.
  """
  with open(file_name, encoding='utf8') as f:
    data = [line.strip() for line in f]

  size = len(data)
  if size == 0:
    return []

  # Exclusive end index of each sentence: one past every '</s>' marker.
  boundaries = [idx + 1 for idx, val in enumerate(data) if val == '</s>']
  # Close the last group at end-of-file when it is not terminated by '</s>'.
  # The emptiness check avoids indexing [-1] on a file without any marker.
  if not boundaries or boundaries[-1] != size:
    boundaries.append(size)

  return [data[start:end] for start, end in zip([0] + boundaries, boundaries)]

In [60]:
# List the working directory again before loading the data files.
%ls

'Neural Machine Translation.ipynb'   [0m[01;34mtest-05[0m/   [01;34mtrain-05[0m/


In [None]:
# Load both sides of the training corpus; each entry is one sentence as
# grouped by read_data.
source = read_data('train-05/train-source.txt')
target = read_data('train-05/train-target.txt')
# Preview only the first few sentences; echoing the whole list floods the
# notebook output.
source[:5]

In [63]:
# source: https://towardsdatascience.com/neural-machine-translation-with-python-c2f0a34f7dd

def tokenize(x):
    """Fit a word-level Keras ``Tokenizer`` on ``x`` and encode ``x`` with it.

    Args:
        x: The texts to fit on and encode.

    Returns:
        A ``(sequences, tokenizer)`` tuple: the integer-encoded texts and the
        fitted tokenizer that produced them.
    """
    tokenizer = Tokenizer(char_level=False)
    tokenizer.fit_on_texts(x)
    sequences = tokenizer.texts_to_sequences(x)
    return sequences, tokenizer

In [64]:
# source: https://towardsdatascience.com/neural-machine-translation-with-python-c2f0a34f7dd

def pad(x, length=None):
    """Post-pad every sequence in ``x`` to a common length.

    Args:
        x: Sequences of token ids.
        length: Length to pad to. When omitted, the length of the longest
            sequence in ``x`` is used.

    Returns:
        The result of ``pad_sequences`` with ``padding='post'``.
    """
    target_len = length if length is not None else max(len(seq) for seq in x)
    return pad_sequences(x, maxlen=target_len, padding='post')

In [65]:
# source: https://towardsdatascience.com/neural-machine-translation-with-python-c2f0a34f7dd

def preprocess(x, y):
    """Tokenize and pad the input texts ``x`` and the label texts ``y``.

    Args:
        x: Feature (input) texts.
        y: Label (output) texts.

    Returns:
        ``(padded_x, padded_y, x_tokenizer, y_tokenizer)``. ``padded_y`` gets
        one extra trailing axis of size 1.
    """
    x_ids, x_tk = tokenize(x)
    y_ids, y_tk = tokenize(y)

    x_padded = pad(x_ids)
    y_padded = pad(y_ids)

    # sparse_categorical_crossentropy wants the labels with a trailing
    # singleton axis, so append one to the padded label array.
    y_padded = y_padded.reshape(y_padded.shape + (1,))

    return x_padded, y_padded, x_tk, y_tk

In [None]:
# source: https://towardsdatascience.com/neural-machine-translation-with-python-c2f0a34f7dd

preproc_source, preproc_target, source_tokenizer, target_tokenizer =\
    preprocess(source, target)

max_source_length = preproc_source.shape[1]
max_target_length = preproc_target.shape[1]
# Keras Tokenizer assigns word ids starting at 1 and the padded arrays use 0
# as filler, so valid ids span 0..len(word_index). The +1 makes the sizes
# cover the largest id; without it the output layer is one class too small
# for the labels.
source_vocab_size = len(source_tokenizer.word_index) + 1
target_vocab_size = len(target_tokenizer.word_index) + 1

print('Data Preprocessed')
print("Max source sentence length:", max_source_length)
print("Max target sentence length:", max_target_length)
print("Source vocabulary size:", source_vocab_size)
print("Target vocabulary size:", target_vocab_size)

In [66]:
# source: https://towardsdatascience.com/neural-machine-translation-with-python-c2f0a34f7dd

def logits_to_text(logits, tokenizer):
    """Decode per-timestep scores into a space-joined string of words.

    Args:
        logits: 2-D array of scores, one row per timestep.
        tokenizer: Object exposing a ``word_index`` mapping of word -> id.

    Returns:
        The highest-scoring word of every row joined by single spaces; id 0
        decodes to the empty string.
    """
    lookup = dict((idx, word) for word, idx in tokenizer.word_index.items())
    lookup[0] = ''

    best_ids = np.argmax(logits, axis=1)
    words = [lookup[best] for best in best_ids]
    return ' '.join(words)

In [67]:
# source: https://towardsdatascience.com/neural-machine-translation-with-python-c2f0a34f7dd

def simple_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """Build and compile a single-GRU sequence model.

    Args:
        input_shape: Shape of the input array; the leading (batch) dimension
            is dropped when declaring the model input.
        output_sequence_length: Unused by this model.
        english_vocab_size: Unused by this model.
        french_vocab_size: Width of the per-timestep output layer.

    Returns:
        A compiled Keras ``Model`` (sparse categorical cross-entropy loss,
        Adam at 1e-3, accuracy metric).
    """
    inputs = Input(input_shape[1:])
    hidden = GRU(64, return_sequences=True)(inputs)
    scores = TimeDistributed(Dense(french_vocab_size))(hidden)
    probabilities = Activation('softmax')(scores)

    net = Model(inputs, probabilities)
    net.compile(
        loss=sparse_categorical_crossentropy,
        optimizer=Adam(1e-3),
        metrics=['accuracy'],
    )
    return net

In [None]:
# source: https://towardsdatascience.com/neural-machine-translation-with-python-c2f0a34f7dd

# Train RNN
# Re-pad the source ids to the target sequence length so inputs and labels
# have the same number of timesteps.
# NOTE(review): if a source sentence is longer than max_target_length,
# pad_sequences will presumably truncate it — confirm this is acceptable.
tmp_x = pad(preproc_source, max_target_length)
# Add a trailing axis of size 1: simple_model feeds each timestep to the GRU
# as a single raw token id.
tmp_x = tmp_x.reshape((-1, preproc_target.shape[-2], 1))

# Train the neural network
# NOTE(review): target_vocab_size sets the output layer width and must exceed
# the largest label id (padding uses id 0) — confirm it accounts for that.
simple_rnn_model = simple_model(
    tmp_x.shape,
    max_target_length,
    source_vocab_size,
    target_vocab_size)
simple_rnn_model.fit(tmp_x, preproc_target, batch_size=32, epochs=10, validation_split=0.2)

# Print prediction(s)
# Decode the model's output for the first training sample as a sanity check.
print(logits_to_text(simple_rnn_model.predict(tmp_x[:1])[0], target_tokenizer))

Epoch 1/10
 242/1130 [=====>........................] - ETA: 1:09:58 - loss: 2.4616 - accuracy: 0.9128

In [None]:
# source: https://towardsdatascience.com/neural-machine-translation-with-python-c2f0a34f7dd

def embed_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """Build and compile an Embedding -> GRU -> per-timestep softmax model.

    Args:
        input_shape: Shape of the input array; ``input_shape[1]`` is used as
            the embedding's ``input_length``.
        output_sequence_length: Unused by this model.
        english_vocab_size: Number of distinct input (source) token ids; sizes
            the embedding table.
        french_vocab_size: Number of output (target) classes; sizes the final
            softmax layer.

    Returns:
        A compiled Keras ``Sequential`` model (sparse categorical
        cross-entropy loss, Adam at 1e-3, accuracy metric).
    """
    learning_rate = 1e-3

    model = Sequential()
    # The embedding looks up *input* token ids, so its table must be sized by
    # the source vocabulary. Sizing it by the target vocabulary breaks as soon
    # as a source id is >= french_vocab_size.
    # Embedding has to be the first layer of the model.
    model.add(Embedding(english_vocab_size, 64, input_length=input_shape[1]))
    model.add(GRU(64, return_sequences=True, activation="tanh"))
    model.add(TimeDistributed(Dense(french_vocab_size, activation="softmax")))
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])

    return model

In [None]:
# source: https://towardsdatascience.com/neural-machine-translation-with-python-c2f0a34f7dd

def bd_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """Build and compile a bidirectional-GRU sequence model.

    Args:
        input_shape: Shape of the input array; the leading (batch) dimension
            is dropped when declaring the first layer's input shape.
        output_sequence_length: Unused by this model.
        english_vocab_size: Unused by this model.
        french_vocab_size: Width of the per-timestep softmax layer.

    Returns:
        A compiled Keras ``Sequential`` model (sparse categorical
        cross-entropy loss, Adam at 1e-3, accuracy metric).
    """
    net = Sequential()

    recurrent = Bidirectional(
        GRU(128, return_sequences=True, dropout=0.1),
        input_shape=input_shape[1:],
    )
    net.add(recurrent)

    classifier = TimeDistributed(Dense(french_vocab_size, activation='softmax'))
    net.add(classifier)

    net.compile(
        loss=sparse_categorical_crossentropy,
        optimizer=Adam(1e-3),
        metrics=['accuracy'],
    )
    return net

In [None]:
# source: https://towardsdatascience.com/neural-machine-translation-with-python-c2f0a34f7dd

# Re-pad the source ids to the target sequence length and add a trailing
# axis of size 1, giving an input of shape (samples, target_length, 1).
tmp_x = pad(preproc_source, preproc_target.shape[1])
tmp_x = tmp_x.reshape((-1, preproc_target.shape[-2], 1))
# Vocabulary sizes are len(word_index)+1 because Tokenizer ids start at 1
# and id 0 is the padding value.
bidi_model = bd_model(
    tmp_x.shape,
    preproc_target.shape[1],
    len(source_tokenizer.word_index)+1,
    len(target_tokenizer.word_index)+1)
bidi_model.fit(tmp_x, preproc_target, batch_size=1024, epochs=20, validation_split=0.2)
# Print prediction(s)
print(logits_to_text(bidi_model.predict(tmp_x[:1])[0], target_tokenizer))

In [None]:
# source: https://towardsdatascience.com/neural-machine-translation-with-python-c2f0a34f7dd

def encdec_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """Build and compile a GRU encoder / GRU decoder model.

    The first GRU returns only its final output, which ``RepeatVector``
    repeats ``output_sequence_length`` times as input to the second GRU.

    Args:
        input_shape: Shape of the input array; the leading (batch) dimension
            is dropped when declaring the first layer's input shape.
        output_sequence_length: Number of timesteps the decoder emits.
        english_vocab_size: Unused by this model.
        french_vocab_size: Width of the per-timestep softmax layer.

    Returns:
        A compiled Keras ``Sequential`` model (sparse categorical
        cross-entropy loss, Adam at 1e-3, accuracy metric).
    """
    net = Sequential()

    # Encoder: collapse the whole input sequence into a single vector.
    net.add(GRU(128, input_shape=input_shape[1:], return_sequences=False))
    # Decoder: feed that vector at every output timestep.
    net.add(RepeatVector(output_sequence_length))
    net.add(GRU(128, return_sequences=True))
    net.add(TimeDistributed(Dense(french_vocab_size, activation='softmax')))

    net.compile(
        loss=sparse_categorical_crossentropy,
        optimizer=Adam(1e-3),
        metrics=['accuracy'],
    )
    return net

In [None]:

# source: https://towardsdatascience.com/neural-machine-translation-with-python-c2f0a34f7dd

def model_final(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """Build and compile the embedding + bidirectional encoder/decoder model.

    Args:
        input_shape: Shape of the input array; ``input_shape[1]`` is used as
            the embedding's ``input_length``.
        output_sequence_length: Number of timesteps the decoder emits.
        english_vocab_size: ``input_dim`` of the embedding layer.
        french_vocab_size: Width of the per-timestep softmax layer.

    Returns:
        A compiled Keras ``Sequential`` model (sparse categorical
        cross-entropy loss, Adam at 0.005, accuracy metric).
    """
    learning_rate = 0.005

    net = Sequential()
    net.add(Embedding(input_dim=english_vocab_size,
                      output_dim=128,
                      input_length=input_shape[1]))
    # Encoder returns only its final output, repeated once per output timestep.
    net.add(Bidirectional(GRU(256, return_sequences=False)))
    net.add(RepeatVector(output_sequence_length))
    # Decoder emits one vector per timestep for the softmax classifier.
    net.add(Bidirectional(GRU(256, return_sequences=True)))
    net.add(TimeDistributed(Dense(french_vocab_size, activation='softmax')))

    net.compile(
        loss=sparse_categorical_crossentropy,
        optimizer=Adam(learning_rate),
        metrics=['accuracy'],
    )
    return net