In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# import key libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
import nltk
import re
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
import plotly.express as px
import string

import tensorflow as tf
from tensorflow.keras.preprocessing.text import one_hot,Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, RepeatVector
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical

In [None]:
from google.colab import files
import io

uploaded = files.upload()

for fn in uploaded.keys():
  
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))
data_path = 'machine_translation.txt'

Saving machine_translation.txt to machine_translation.txt
User uploaded file "machine_translation.txt" with length 34494242 bytes


In [None]:
def read_file(data_path):
  with open(data_path, 'rt') as file:
    text = file.read()
    file.close()
    return text

In [None]:
def remove_punctuation(sentence):
  remove_punc = [char for char in sentence if char not in string.punctuation]
  combine_removed_punc = ''.join(remove_punc)
  return combine_removed_punc

In [None]:
def to_lines(text):
  sents = text.strip().split('\n')
  sents = [i.split('\t') for i in sents]
  return sents

In [None]:
data = read_file('machine_translation.txt')
french_n_eng = to_lines(data)
french_n_eng = np.array(french_n_eng)

In [None]:
french_n_eng = french_n_eng[:25000,:2]

In [None]:
french_n_eng[:,0] = [s.translate(str.maketrans('','',string.punctuation)) for s in french_n_eng[:,0]]
french_n_eng[:,1] = [s.translate(str.maketrans('','',string.punctuation)) for s in french_n_eng[:,1]]

french_n_eng[:,0] = [s.lower() for s in french_n_eng[:,0]]
french_n_eng[:,1] = [s.lower() for s in french_n_eng[:,1]]

In [None]:
french_n_eng

array([['go', 'geh'],
       ['hi', 'hallo'],
       ['hi', 'grüß gott'],
       ...,
       ['we cant go there', 'wir können da nicht hingehen'],
       ['we cant help tom', 'wir können tom nicht helfen'],
       ['we cant help you', 'wir können dir nicht helfen']], dtype='<U537')

In [None]:
def max_length(lines):
  return max(len(line.split()) for line in lines)

In [None]:
def tokenize_text(text):
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(text)
  return tokenizer

In [None]:
eng_tokenizer = tokenize_text(french_n_eng[:,0])
french_tokenizer = tokenize_text(french_n_eng[:,1])

english_vocab_length = len(eng_tokenizer.word_index) + 1
french_vocab_length = len(french_tokenizer.word_index) + 1

english_max_length = max_length(french_n_eng[:,0])
french_max_length = max_length(french_n_eng[:,1])

print(f'The total number of unique English words is: {english_vocab_length}')
print(f'The total number of unique French words is: {french_vocab_length}')

print(f'The maximum length of English sentences are: {english_max_length}')
print(f'The maximum length of French sentences are: {french_max_length}')

The total number of unique English words is: 4247
The total number of unique French words is: 6674
The maximum length of English sentences are: 5
The maximum length of French sentences are: 10


In [None]:
def sequence_encoding(tokenizer, length, sentences):
  seq = tokenizer.texts_to_sequences(sentences)
  seq = pad_sequences(seq, maxlen=length, padding='post')
  return seq

In [None]:
def encode_output(sequences, vocab_size):
  ylist = []
  for sequence in sequences:
    encoded = to_categorical(sequence, num_classes=vocab_size)
    ylist.append(encoded)

  y = np.array(ylist)
  y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
  return y

In [None]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(french_n_eng, test_size=0.25, random_state = 42)

In [None]:
X_train = sequence_encoding(french_tokenizer,french_max_length,train[:,1])
y_train = sequence_encoding(eng_tokenizer,english_max_length,train[:,0])
y_train = encode_output(y_train, english_vocab_length)

X_test = sequence_encoding(french_tokenizer,french_max_length,test[:,1])
y_test = sequence_encoding(eng_tokenizer,english_max_length,test[:,0])
y_test = encode_output(y_test, english_vocab_length)

In [None]:
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint
model = Sequential()
model.add(Embedding(french_vocab_length, 512, input_length=french_max_length,mask_zero=True))
model.add(LSTM(512))
model.add(RepeatVector(english_max_length))
model.add(LSTM(512, return_sequences=True))
model.add(TimeDistributed(Dense(english_vocab_length, activation='softmax')))
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 10, 512)           3417088   
_________________________________________________________________
lstm (LSTM)                  (None, 512)               2099200   
_________________________________________________________________
repeat_vector (RepeatVector) (None, 5, 512)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 5, 512)            2099200   
_________________________________________________________________
time_distributed (TimeDistri (None, 5, 4247)           2178711   
Total params: 9,794,199
Trainable params: 9,794,199
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.compile(optimizer='adam', loss='categorical_crossentropy')
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
model.fit(X_train, y_train, epochs=30, batch_size=64, validation_data=(X_test, y_test), callbacks=[checkpoint], verbose=1)

Epoch 1/30
Epoch 00001: val_loss improved from inf to 3.65273, saving model to model.h5
Epoch 2/30
Epoch 00002: val_loss improved from 3.65273 to 3.25967, saving model to model.h5
Epoch 3/30
Epoch 00003: val_loss improved from 3.25967 to 2.92614, saving model to model.h5
Epoch 4/30
Epoch 00004: val_loss improved from 2.92614 to 2.63449, saving model to model.h5
Epoch 5/30
Epoch 00005: val_loss improved from 2.63449 to 2.40862, saving model to model.h5
Epoch 6/30
Epoch 00006: val_loss improved from 2.40862 to 2.23845, saving model to model.h5
Epoch 7/30
Epoch 00007: val_loss improved from 2.23845 to 2.11281, saving model to model.h5
Epoch 8/30
Epoch 00008: val_loss improved from 2.11281 to 2.01739, saving model to model.h5
Epoch 9/30
Epoch 00009: val_loss improved from 2.01739 to 1.95054, saving model to model.h5
Epoch 10/30
Epoch 00010: val_loss improved from 1.95054 to 1.89030, saving model to model.h5
Epoch 11/30
Epoch 00011: val_loss improved from 1.89030 to 1.85850, saving model to

<tensorflow.python.keras.callbacks.History at 0x7f9c446eb6d8>

In [None]:
from keras.models import load_model
model = load_model('model.h5')

In [None]:
# map an integer to a word
def word_for_id(integer, tokenizer):
	for word, index in tokenizer.word_index.items():
		if index == integer:
			return word
	return None

In [None]:
# generate target given source sequence
def predict_sequence(model, tokenizer, source):
	prediction = model.predict(source, verbose=0)[0]
	integers = [np.argmax(vector) for vector in prediction]
	target = list()
	for i in integers:
		word = word_for_id(i, tokenizer)
		if word is None:
			break
		target.append(word)
	return ' '.join(target)

In [None]:
# evaluate the skill of the model
from nltk.translate.bleu_score import corpus_bleu
def evaluate_model(model, tokenizer, sources, raw_dataset):
	actual, predicted = list(), list()
	for i, source in enumerate(sources):
		# translate encoded source text
		source = source.reshape((1, source.shape[0]))
		translation = predict_sequence(model, eng_tokenizer, source)
		raw_target, raw_src = raw_dataset[i]
		if i < 10:
			print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
		actual.append([raw_target.split()])
		predicted.append(translation.split())
	# calculate BLEU score
	print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
	print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
	print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
	print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

In [26]:

print('train')
evaluate_model(model, eng_tokenizer, X_train, train)

train
src=[tom wird verlieren], target=[tom will lose], predicted=[tom lose]
src=[sie lachen], target=[theyre laughing], predicted=[theyre laughing]
src=[wir lächelten], target=[we smiled], predicted=[we smiled]
src=[tom grimassierte], target=[tom grimaced], predicted=[tom grimaced]
src=[ich mag pferde], target=[i like horses], predicted=[i like horses]
src=[wir nennen ihn tom], target=[we call him tom], predicted=[we call him tom]
src=[kinder sind grausam], target=[kids are cruel], predicted=[kids are cruel]
src=[ich verehre sie], target=[i adore you], predicted=[i adore you]
src=[heute ist sonnabend], target=[today is saturday], predicted=[today is saturday]
src=[ihr macht wohl witze], target=[youre joking], predicted=[youre joking]
BLEU-1: 0.906205
BLEU-2: 0.864896
BLEU-3: 0.815807
BLEU-4: 0.601791


In [28]:
print('test')
evaluate_model(model, eng_tokenizer, X_test, test)

test
src=[tom war gelangweilt], target=[tom was bored], predicted=[tom was bored]
src=[tom ist schwer verletzt], target=[tom is badly hurt], predicted=[tom died hurt]
src=[tom hat dienst], target=[tom is on duty], predicted=[tom have kids]
src=[tom ist vorbereitet], target=[tom is prepared], predicted=[tom is stunned]
src=[tom will geld], target=[tom wants money], predicted=[tom wants money]
src=[kühe fressen gras], target=[cows eat grass], predicted=[how a your numb]
src=[ich werde tom ausrufen lassen], target=[ill page tom], predicted=[ill go tom tom]
src=[zeigen sie uns wo es langgeht], target=[show us the way], predicted=[lets us the]
src=[komm sofort hierher], target=[come at once], predicted=[come back once]
src=[tom ist eingeschritten], target=[tom intervened], predicted=[tom is fanatical]
BLEU-1: 0.597706
BLEU-2: 0.477394
BLEU-3: 0.408898
BLEU-4: 0.246018


In [30]:
preds = model.predict_classes(X_test.reshape((X_test.shape[0],X_test.shape[1])))

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


In [32]:
preds_text = []
for i in preds:
       temp = []
       for j in range(len(i)):
            t = word_for_id(i[j], eng_tokenizer)
            if j > 0:
                if (t == word_for_id(i[j-1], eng_tokenizer)) or (t == None):
                     temp.append('')
                else:
                     temp.append(t)
            else:
                   if(t == None):
                          temp.append('')
                   else:
                          temp.append(t) 

       preds_text.append(' '.join(temp))

In [35]:
pred_df = pd.DataFrame({'actual' : test[:,0], 'predicted' : preds_text})
pred_df.sample(15)

Unnamed: 0,actual,predicted
1432,beat it,go away
1684,can you show me,can you see me
2280,were you jealous,were you jealous
2404,this is a pun,thats a fir tree
678,its secret,this is secret
5369,im not certain,i not sure
5288,tom is a player,tom is a bad
2683,its about time,its so wrong
4761,i need support,i need internet
675,tom wasnt crazy,tom wasnt crazy
