In [None]:
import tensorflow
from tensorflow.keras.utils import get_file


In [None]:
! wget http://www.manythings.org/anki/deu-eng.zip

--2021-10-26 08:22:54--  http://www.manythings.org/anki/deu-eng.zip
Resolving www.manythings.org (www.manythings.org)... 104.21.92.44, 172.67.186.54, 2606:4700:3033::ac43:ba36, ...
Connecting to www.manythings.org (www.manythings.org)|104.21.92.44|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9079830 (8.7M) [application/zip]
Saving to: ‘deu-eng.zip’


2021-10-26 08:22:55 (12.8 MB/s) - ‘deu-eng.zip’ saved [9079830/9079830]



In [None]:
!unzip deu-eng.zip

Archive:  deu-eng.zip
  inflating: deu.txt                 
  inflating: _about.txt              


## **Clean Text**

In [None]:
# load doc into memory
def load_doc(filename):
  """Read a UTF-8 text file and return its whole content as a single string.

  Args:
    filename: path of the text file to read.

  Returns:
    The complete file content as one str.
  """
  # the context manager closes the file even if read() raises
  with open(filename,mode='rt',encoding='utf-8') as file:
    return file.read()
  

In [None]:
# the unzip cell extracts deu.txt into the current working directory, so a relative
# path works wherever the notebook is run instead of only under /content
filename='deu.txt'
doc=load_doc(filename)


In [None]:
# preview only the first few lines instead of dumping the whole corpus into the output
doc.strip().split('\n')[:10]

['Go.\tGeh.\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8597805 (Roujin)',
 'Hi.\tHallo!\tCC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #380701 (cburgmer)',
 'Hi.\tGrüß Gott!\tCC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #659813 (Esperantostern)',
 'Run!\tLauf!\tCC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #941078 (Fingerhut)',
 'Run.\tLauf!\tCC-BY 2.0 (France) Attribution: tatoeba.org #4008918 (JSakuragi) & #941078 (Fingerhut)',
 'Wow!\tPotzdonner!\tCC-BY 2.0 (France) Attribution: tatoeba.org #52027 (Zifre) & #2122382 (Pfirsichbaeumchen)',
 'Wow!\tDonnerwetter!\tCC-BY 2.0 (France) Attribution: tatoeba.org #52027 (Zifre) & #2122391 (Pfirsichbaeumchen)',
 'Duck!\tKopf runter!\tCC-BY 2.0 (France) Attribution: tatoeba.org #280158 (CM) & #9968521 (wolfgangth)',
 'Fire!\tFeuer!\tCC-BY 2.0 (France) Attribution: tatoeba.org #1829639 (Spamster) & #1958697 (Tamy)',
 'Help!\tHilfe!\tCC-BY 2.0 (France) Attribution: tatoeba.org #4350

Each line contains a single pair of phrases, first English and then German, separated by a tab character (a third tab-separated field holds the attribution). We must split the loaded text by line and then by phrase. The function below will split the loaded text.

In [None]:
#Split loaded documents into sentences
def to_pairs(doc):
  """Turn the raw document text into a list of rows.

  Leading/trailing whitespace is stripped from the document, which is then
  split on newlines; every resulting line is split on tab characters.

  Returns:
    A list with one list of tab-separated fields per line.
  """
  return [row.split('\t') for row in doc.strip().split('\n')]

In [None]:
pairs=to_pairs(doc)
# show a short sample only; displaying the full list floods the notebook output
pairs[:10]

[['Go.',
  'Geh.',
  'CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8597805 (Roujin)'],
 ['Hi.',
  'Hallo!',
  'CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #380701 (cburgmer)'],
 ['Hi.',
  'Grüß Gott!',
  'CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #659813 (Esperantostern)'],
 ['Run!',
  'Lauf!',
  'CC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #941078 (Fingerhut)'],
 ['Run.',
  'Lauf!',
  'CC-BY 2.0 (France) Attribution: tatoeba.org #4008918 (JSakuragi) & #941078 (Fingerhut)'],
 ['Wow!',
  'Potzdonner!',
  'CC-BY 2.0 (France) Attribution: tatoeba.org #52027 (Zifre) & #2122382 (Pfirsichbaeumchen)'],
 ['Wow!',
  'Donnerwetter!',
  'CC-BY 2.0 (France) Attribution: tatoeba.org #52027 (Zifre) & #2122391 (Pfirsichbaeumchen)'],
 ['Duck!',
  'Kopf runter!',
  'CC-BY 2.0 (France) Attribution: tatoeba.org #280158 (CM) & #9968521 (wolfgangth)'],
 ['Fire!',
  'Feuer!',
  'CC-BY 2.0 (France) Attribution: tatoeba.org #1829639 (Spamste

# **Text Cleaning Steps:**

1. Remove all non-printable characters.

2. Remove all punctuation characters.

3. Normalize all Unicode characters to ASCII (e.g. Latin characters).

4. Normalize the case to lowercase.

5. Remove any remaining tokens that are not alphabetic.

In [None]:
import pickle
from unicodedata import normalize
import string
import re
import numpy as np


In [None]:
def clean_pairs(lines):
  """Normalise every text field of every row and return the result as a NumPy array.

  Each field is NFD-normalised and reduced to ASCII (characters that cannot be
  encoded are dropped), split on whitespace, lower-cased, stripped of
  punctuation and non-printable characters, and finally filtered so that only
  purely alphabetic tokens remain. The surviving tokens are re-joined with
  single spaces.

  Args:
    lines: iterable of rows, each row an iterable of strings.

  Returns:
    np.array built from the list of cleaned rows.
  """
  # character-class patterns: any punctuation / anything outside string.printable
  punct_pattern=re.compile('[%s]'%re.escape(string.punctuation))
  nonprint_pattern=re.compile('[^%s]'%re.escape(string.printable))

  def _clean(text):
    # decompose accented characters, then drop whatever is not ASCII
    ascii_text=normalize('NFD',text).encode('ascii','ignore').decode('UTF-8')
    kept=[]
    for token in ascii_text.split():
      token=token.lower()
      token=punct_pattern.sub('',token)
      token=nonprint_pattern.sub('',token)
      # discard tokens that are empty or contain digits after cleaning
      if token.isalpha():
        kept.append(token)
    return ' '.join(kept)

  return np.array([[_clean(text) for text in pair] for pair in lines])

In [None]:
# NOTE(review): this rebinds the name `clean_pairs` from the function to the array it
# returns, so running this cell a second time fails (the array is not callable) until
# the cell defining the function is executed again.
clean_pairs=clean_pairs(pairs)
clean_pairs

array([['go', 'geh', 'ccby france attribution tatoebaorg cm roujin'],
       ['hi', 'hallo', 'ccby france attribution tatoebaorg cm cburgmer'],
       ['hi', 'gru gott',
        'ccby france attribution tatoebaorg cm esperantostern'],
       ...,
       ['it may be impossible to get a completely errorfree corpus due to the nature of this kind of collaborative effort however if we encourage members to contribute sentences in their own languages rather than experiment in languages they are learning we might be able to minimize errors',
        'es ist wohl unmoglich einen vollkommen fehlerfreien korpus zu erreichen das liegt in der natur eines solchen gemeinschaftsprojekts doch wenn wir unsere mitglieder dazu bringen konnen nicht mit sprachen herumzuexperimentieren die sie gerade lernen sondern satze in ihrer eigenen muttersprache beizutragen dann gelingt es uns vielleicht die zahl der fehler klein zu halten',
        'ccby france attribution tatoebaorg ck pfirsichbaeumchen'],
       ['i

# **Split Text**

The clean data contains a little over 150,000 phrase pairs and some of the pairs toward the end of the file are very long. This is a good number of examples for developing a small translation model.

We will simplify the problem by reducing the dataset to the first 10,000 examples in the file; these will be the shortest phrases in the dataset. Further, we will then take the first 
9,000 of those as examples for training and the remaining 1,000 examples to test the fit model.



The steps below load the clean data, split it, and save the split portions of data to new files.

In [None]:

#load  clean dataset
def load_clean_data(filename):
  """Load and return the object stored in the pickle file `filename`.

  NOTE: unpickling can execute arbitrary code, so only load files that this
  notebook (or another trusted source) produced.
  """
  # the context manager closes the handle, which the bare open() call never did
  with open(filename,'rb') as file:
    return pickle.load(file)

# save a list of clean sentences to file
def save_clean_dataset(sentences,filename):
  """Pickle `sentences` to `filename`, overwriting any existing file.

  Returns:
    None (the return value of pickle.dump).
  """
  # closing the handle guarantees the data is flushed to disk before the next cell reads it
  with open(filename,'wb') as file:
    return pickle.dump(sentences,file)

In [None]:
# persist the complete cleaned array (every row, attribution column included) in the working directory
save_clean_dataset(clean_pairs,'english-german.pkl')

In [None]:
# spot check: print the three cleaned fields of the first 20 rows
template='[%s]=> [%s],[%s]'
for row in range(20):
  english,german,attribution=clean_pairs[row,0],clean_pairs[row,1],clean_pairs[row,2]
  print(template%(english,german,attribution))

[go]=> [geh],[ccby france attribution tatoebaorg cm roujin]
[hi]=> [hallo],[ccby france attribution tatoebaorg cm cburgmer]
[hi]=> [gru gott],[ccby france attribution tatoebaorg cm esperantostern]
[run]=> [lauf],[ccby france attribution tatoebaorg papabear fingerhut]
[run]=> [lauf],[ccby france attribution tatoebaorg jsakuragi fingerhut]
[wow]=> [potzdonner],[ccby france attribution tatoebaorg zifre pfirsichbaeumchen]
[wow]=> [donnerwetter],[ccby france attribution tatoebaorg zifre pfirsichbaeumchen]
[duck]=> [kopf runter],[ccby france attribution tatoebaorg cm wolfgangth]
[fire]=> [feuer],[ccby france attribution tatoebaorg spamster tamy]
[help]=> [hilfe],[ccby france attribution tatoebaorg lukaszpp muiriel]
[help]=> [zu hulf],[ccby france attribution tatoebaorg lukaszpp pfirsichbaeumchen]
[stay]=> [bleib],[ccby france attribution tatoebaorg ck wochenweise]
[stop]=> [stopp],[ccby france attribution tatoebaorg cm jakov]
[stop]=> [anhalten],[ccby france attribution tatoebaorg cm yorwba]

In [None]:
#load dataset
# read from the same relative path the save cell wrote to, not a hard-coded /content/ location
raw_dataset=load_clean_data('english-german.pkl')

In [None]:
n_sentences=10000 #reduce dataset

# .copy(): a plain slice is a view, so shuffling it in place would also reorder raw_dataset
dataset=raw_dataset[:n_sentences,:].copy()

np.random.seed(42) # fixed seed so the shuffle, and therefore the train/test split, is reproducible
np.random.shuffle(dataset) #random shuffle

In [None]:

#split into train and test
# single named boundary instead of repeating the literal on both sides of the split
n_train=9000
train,test=dataset[:n_train],dataset[n_train:]

In [None]:
# NOTE(review): 'english-german.pkl' was written earlier with the full cleaned corpus; this
# overwrites it with the reduced, shuffled subset, so re-running the earlier load cell
# afterwards no longer sees the full data.
save_clean_dataset(dataset,'english-german.pkl')
save_clean_dataset(train,'english-german-train.pkl')
save_clean_dataset(test,'english-german-test.pkl')

In [None]:
#load datasets
# read the reduced dataset and its train/test portions back from disk
dataset,train,test=[
  load_clean_data(path)
  for path in ('english-german.pkl','english-german-train.pkl','english-german-test.pkl')
]

## **Fit a tokenizer on the clean text data.**


We will use the combination of the train and test datasets to define the maximum length and vocabulary of the problem. This is for simplicity. Alternatively, we could define these properties from the training dataset alone and truncate examples in the test set that are too long or have words that are out of the vocabulary. We can use the Keras Tokenizer class to map words to integers, as needed for modeling. We will use separate tokenizers for the English sequences and the German sequences. The function below, named create_tokenizer(), trains a tokenizer on a list of phrases.

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [None]:
def create_tokenizer(dataset):
  """Create a Keras Tokenizer with default settings and fit it on `dataset`.

  Args:
    dataset: the texts handed to Tokenizer.fit_on_texts.

  Returns:
    The fitted Tokenizer.
  """
  fitted=Tokenizer()
  fitted.fit_on_texts(dataset)
  return fitted

## **Calculate the maximum sequence length**

In [None]:
def max_length(dataset):
  """Return the largest number of whitespace-separated tokens found in any phrase of `dataset`."""
  token_counts=[len(phrase.split()) for phrase in dataset]
  return max(token_counts)
  

In [None]:
#prepare english tokenizer
# column 0 of the dataset holds the English phrases
eng_tokenizer=create_tokenizer(dataset[:,0])
eng_length=max_length(dataset[:,0])
# one more than the number of distinct words; presumably index 0 is left free for padding (TODO confirm)
eng_vocab_size=len(eng_tokenizer.word_index)+1

In [None]:
#prepare german tokenizer
# column 1 of the dataset holds the German phrases
ger_tokenizer=create_tokenizer(dataset[:,1])
ger_length=max_length(dataset[:,1])
# one more than the number of distinct words; presumably index 0 is left free for padding (TODO confirm)
ger_vocab_size=len(ger_tokenizer.word_index)+1

# **Function to encode and pad sequences**
We are now ready to prepare the training dataset. Each input and output sequence must be encoded to integers and padded to the maximum phrase length. This is because we will use a word embedding for the input sequences and one-hot encode the output sequences.

In [None]:
def encode_sequences(tokenizer,length,data):
  """Convert texts to integer sequences and pad them to a common length.

  Args:
    tokenizer: fitted tokenizer providing texts_to_sequences.
    length: target length passed to pad_sequences as maxlen.
    data: the texts to encode.

  Returns:
    The result of pad_sequences, padded at the end of each sequence (padding='post').
  """
  encoded=tokenizer.texts_to_sequences(data)
  return pad_sequences(encoded,maxlen=length,padding='post')

# **One hot encode output sequences**

The output sequence needs to be one hot encoded. This is because the model will predict the probability of each word in the vocabulary as output.

In [None]:
def encode_output(sequences,vocab_size):
  """One-hot encode every integer sequence in `sequences`.

  Args:
    sequences: 2-D array of integer word indices (its two shape entries are used for the reshape).
    vocab_size: number of classes handed to to_categorical.

  Returns:
    Array of shape (sequences.shape[0], sequences.shape[1], vocab_size).
  """
  one_hot=np.array([to_categorical(row,num_classes=vocab_size) for row in sequences])
  return one_hot.reshape(sequences.shape[0],sequences.shape[1],vocab_size)

## **Prepare training and test data for modeling.**

In [None]:
#Prepare training data
# German phrases (column 1) are the model input, English phrases (column 0) the target
trainX=encode_sequences(ger_tokenizer,ger_length,train[:,1])
trainY=encode_sequences(eng_tokenizer,eng_length,train[:,0])

In [None]:
# NOTE(review): rebinds trainY from integer sequences to their one-hot form, so this cell is
# not idempotent; running it twice feeds an already one-hot array back into encode_output.
trainY=encode_output(trainY,eng_vocab_size)

In [None]:
trainY.shape

(9000, 5, 2185)

In [None]:
trainY.shape

(9000, 5, 2185)

## **Define and summarize the model.**

We will use an encoder-decoder LSTM model on this problem. In this architecture, the input sequence is encoded by a front-end model called the encoder, then decoded word by word by a back-end model called the decoder. The define_model() function below defines the model and takes a number of arguments used to configure it, such as the size of the input and output vocabularies, the maximum length of input and output phrases, and the number of memory units used to configure the model. 



The model is trained using the efficient Adam approach to stochastic gradient descent and minimizes the categorical cross-entropy loss function because we have framed the prediction problem as multiclass classification.

In [None]:
from tensorflow.keras.models import Sequential,load_model
from tensorflow.keras.layers import LSTM,TimeDistributed,Embedding,Dense,RepeatVector
from nltk.translate.bleu_score import corpus_bleu

In [None]:
def define_model(ger_vocab,eng_vocab,ger_length,eng_length,n_units):
  """Build, compile and summarise the encoder-decoder translation model.

  Args:
    ger_vocab: input vocabulary size (Embedding input dimension).
    eng_vocab: output vocabulary size (width of the softmax layer).
    ger_length: length of the input sequences.
    eng_length: number of output steps the encoded vector is repeated for.
    n_units: embedding size and number of units in both LSTM layers.

  Returns:
    The compiled Sequential model (its summary is printed as a side effect).
  """
  layers=[
    # encoder: embed the source tokens (index 0 is masked) and compress them into one vector
    Embedding(ger_vocab,n_units,input_length=ger_length,mask_zero=True),
    LSTM(n_units),
    # decoder: repeat that vector once per output step and emit a distribution per step
    RepeatVector(eng_length),
    LSTM(n_units,return_sequences=True),
    TimeDistributed(Dense(eng_vocab,activation='softmax')),
  ]
  model=Sequential(layers)
  model.compile(optimizer='adam',loss='categorical_crossentropy')
  model.summary()
  return model

In [None]:
# encode the test pairs with the same tokenizers and lengths as the training data:
# German (column 1) as input, English (column 0) as target
testX=encode_sequences(ger_tokenizer,ger_length,test[:,1])
testY=encode_sequences(eng_tokenizer,eng_length,test[:,0])

In [None]:
# NOTE(review): rebinds testY from integer sequences to their one-hot form, so this cell is
# not idempotent; running it twice feeds an already one-hot array back into encode_output.
testY=encode_output(testY,eng_vocab_size)

In [None]:
# 256 is n_units: define_model uses it as the embedding size and as the unit count of both LSTM layers
model=define_model(ger_vocab_size,eng_vocab_size,ger_length,eng_length,256)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 9, 256)            903424    
_________________________________________________________________
lstm (LSTM)                  (None, 256)               525312    
_________________________________________________________________
repeat_vector (RepeatVector) (None, 5, 256)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 5, 256)            525312    
_________________________________________________________________
time_distributed (TimeDistri (None, 5, 2185)           561545    
Total params: 2,515,593
Trainable params: 2,515,593
Non-trainable params: 0
_________________________________________________________________


In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint

In [None]:
# write the model to model.h5 only on epochs where val_loss reaches a new minimum
checkpoint=ModelCheckpoint('model.h5',monitor='val_loss',verbose=1,save_best_only=True,mode='min')


In [None]:
# NOTE(review): the test split doubles as validation data here, so the checkpoint is selected
# on the same examples that are later used for the final evaluation.
# NOTE(review): the in-memory `model` keeps the last-epoch weights; the visible cells never
# reload the best checkpoint from model.h5 before evaluating — confirm that is intended.
model.fit(trainX,trainY,epochs=30,validation_data=(testX,testY),callbacks=[checkpoint],verbose=2)

Epoch 1/30
282/282 - 10s - loss: 3.7356 - val_loss: 3.2311

Epoch 00001: val_loss improved from inf to 3.23114, saving model to model.h5
Epoch 2/30
282/282 - 2s - loss: 3.0716 - val_loss: 3.0286

Epoch 00002: val_loss improved from 3.23114 to 3.02864, saving model to model.h5
Epoch 3/30
282/282 - 2s - loss: 2.8222 - val_loss: 2.8551

Epoch 00003: val_loss improved from 3.02864 to 2.85506, saving model to model.h5
Epoch 4/30
282/282 - 2s - loss: 2.6011 - val_loss: 2.7229

Epoch 00004: val_loss improved from 2.85506 to 2.72294, saving model to model.h5
Epoch 5/30
282/282 - 2s - loss: 2.3850 - val_loss: 2.5409

Epoch 00005: val_loss improved from 2.72294 to 2.54091, saving model to model.h5
Epoch 6/30
282/282 - 2s - loss: 2.1594 - val_loss: 2.4054

Epoch 00006: val_loss improved from 2.54091 to 2.40541, saving model to model.h5
Epoch 7/30
282/282 - 2s - loss: 1.9547 - val_loss: 2.2964

Epoch 00007: val_loss improved from 2.40541 to 2.29637, saving model to model.h5
Epoch 8/30
282/282 - 2s

<keras.callbacks.History at 0x7fc31f1a8a90>

## **Map a predicted word index to the word in the vocabulary**

In [None]:
def word_for_id(interger,tokenizer):
  """Reverse lookup in tokenizer.word_index.

  Returns the first word whose index equals `interger`, or None when no
  word carries that index.
  """
  matches=(word for word,index in tokenizer.word_index.items() if index==interger)
  return next(matches,None)

## ***Predict and interpret the target sequence.***

In [None]:
def predict_sequence(model,eng_tokenzier,source):
  """Translate one encoded source sequence into a space-joined target phrase.

  Args:
    model: fitted model; only the first element of model.predict(source) is used.
    eng_tokenzier: target-language tokenizer used to map predicted indices back to
      words (the misspelled parameter name is kept so existing callers keep working).
    source: encoded input batch handed to model.predict.

  Returns:
    The predicted words joined by single spaces; decoding stops at the first
    predicted index that has no word in the tokenizer.
  """
  prediction=model.predict(source,verbose=0)[0]
  # most probable word index at every output step
  integers=[np.argmax(vector) for vector in prediction]

  target=list()

  for i in integers:
    # look the word up in the tokenizer that was passed in; the original body ignored
    # the argument and silently read the notebook-global eng_tokenizer instead
    word=word_for_id(i,eng_tokenzier)
    if word is None:
      # no word for this index (presumably the padding index 0): treat it as end of phrase
      break

    target.append(word)
  return ' '.join(target)

## **Function to evaluate a fit model**

In [None]:
def evaluate_model(model,sources,raw_dataset):
  """Translate every encoded source, print the first ten examples and the BLEU-1..4 scores.

  Args:
    model: fitted translation model.
    sources: 2-D array of encoded source sequences, one row per example.
    raw_dataset: cleaned text rows aligned with `sources`; field 0 is the
      reference target phrase, field 1 the source phrase.

  Returns:
    None. All results are printed.

  Note:
    Relies on the notebook-global `eng_tokenizer` to decode the predictions.
  """
  actual,predict=list(),list()

  for i, source in enumerate(sources):
    # add a leading batch axis: (length,) -> (1, length)
    source=source.reshape((1,source.shape[0]))

    translation=predict_sequence(model,eng_tokenizer,source)
    raw_target,raw_src=raw_dataset[i][0],raw_dataset[i][1]
    if i<10:

      print('src=[%s],target=[%s],predict=[%s]'%(raw_src,raw_target,translation))
    # corpus_bleu expects a list of reference token lists per hypothesis
    actual.append([raw_target.split()])
    predict.append(translation.split())

  # calculate BLEU score
  print('BLEU-1: %f' % corpus_bleu(actual, predict, weights=(1.0, 0, 0, 0)))
  print('BLEU-2: %f' % corpus_bleu(actual, predict, weights=(0.5, 0.5, 0, 0)))
  # uniform weights must sum to 1; the former (0.3, 0.3, 0.3, 0) summed to 0.9 and inflated the score
  print('BLEU-3: %f' % corpus_bleu(actual, predict, weights=(1/3, 1/3, 1/3, 0)))
  print('BLEU-4: %f' % corpus_bleu(actual, predict, weights=(0.25, 0.25, 0.25, 0.25)))

In [None]:
evaluate_model(model, trainX, train)

src=[ich verkaufe obst],target=[i sell fruit],predict=[i sell fruit]
src=[ich habe ein taxi gerufen],target=[i hailed a cab],predict=[i hailed a cab]
src=[ich brach mir den arm],target=[i broke my arm],predict=[i broke my arm]
src=[tom war besturzt],target=[tom was upset],predict=[tom was upset]
src=[ich bin depressiv],target=[im depressed],predict=[im depressed]
src=[ich bin wirklich alt],target=[im really old],predict=[im really old]
src=[wir mussen gehorchen],target=[we must obey],predict=[we must obey]
src=[wie beruhrend],target=[how touching],predict=[how touching]
src=[ich mag beide],target=[i like both],predict=[i like both]
src=[ich war beleidigt],target=[i was offended],predict=[i was offended]
BLEU-1: 0.943288
BLEU-2: 0.921405
BLEU-3: 0.827803
BLEU-4: 0.494887


In [None]:
evaluate_model(model, testX, test)

src=[nimm tom],target=[take tom],predict=[trust tom]
src=[ich hatte spa],target=[i had some fun],predict=[i had fun]
src=[mir wird ubel],target=[i feel dizzy],predict=[i fear nothing]
src=[tom hat unrecht],target=[toms wrong],predict=[tom is wrong]
src=[sie ist krankenschwester],target=[she is a nurse],predict=[shes a]
src=[ich hab dafur gesorgt dass er geht],target=[i made him go],predict=[i told a lip]
src=[sie mogeln],target=[they cheat],predict=[they cheat]
src=[er ist nicht bereit],target=[hes not ready],predict=[hes not ready]
src=[wir summen gerade],target=[were humming],predict=[were fasting]
src=[das gefiele mir],target=[id like that],predict=[that like me]
BLEU-1: 0.570396
BLEU-2: 0.455612
BLEU-3: 0.373780
BLEU-4: 0.158286
