In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/spa.txt


In [2]:
import string
from string import digits
import matplotlib.pyplot as plt
%matplotlib inline
import tensorflow as tf
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split
import re
import os
import io
import time

# Reading the text file of Spanish-English pairs:-

In [3]:
lines=pd.read_table("/kaggle/input/spa.txt",names=['input','target','comments'])
lines.head(10)

Unnamed: 0,input,target,comments
0,Go.,Ve.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
1,Go.,Vete.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
2,Go.,Vaya.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
3,Go.,Váyase.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
4,Hi.,Hola.,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
5,Run!,¡Corre!,CC-BY 2.0 (France) Attribution: tatoeba.org #9...
6,Run!,¡Corran!,CC-BY 2.0 (France) Attribution: tatoeba.org #9...
7,Run!,¡Corra!,CC-BY 2.0 (France) Attribution: tatoeba.org #9...
8,Run!,¡Corred!,CC-BY 2.0 (France) Attribution: tatoeba.org #9...
9,Run.,Corred.,CC-BY 2.0 (France) Attribution: tatoeba.org #4...


In [4]:
lines=lines[['input','target']]
lines

Unnamed: 0,input,target
0,Go.,Ve.
1,Go.,Vete.
2,Go.,Vaya.
3,Go.,Váyase.
4,Hi.,Hola.
...,...,...
124320,There are four main causes of alcohol-related ...,Hay cuatro causas principales de muertes relac...
124321,There are mothers and fathers who will lie awa...,Hay madres y padres que se quedan despiertos d...
124322,A carbon footprint is the amount of carbon dio...,Una huella de carbono es la cantidad de contam...
124323,Since there are usually multiple websites on a...,Como suele haber varias páginas web sobre cual...


# We have separated the English and Spanish Sentences..

In [5]:
lines.sample(5)

Unnamed: 0,input,target
49381,What position do you hold?,¿Qué postura sostienes?
95846,The trains in Serbia are terribly slow.,Los trenes en Serbia van excesivamente lentos.
50341,He became a nice young man.,Él se convirtió en un excelente joven.
109643,Why in the world would I want to be a teacher?,¿Por qué diablos yo querría ser profesora?
17575,I talk in my sleep.,Hablo mientras duermo.


In [6]:
lines=lines.sample(70000)

In [7]:
len(lines)

70000

# Now preprocessing the statements:-

In [8]:
def preprocess_sentence(sentence):
    
    num_digits= str.maketrans('','', digits)
    
    sentence= sentence.lower()
    sentence= re.sub(" +", " ", sentence)
    sentence= re.sub("'", '', sentence)
    sentence= sentence.translate(num_digits)
    sentence= re.sub(r"([?.!,¿])", r" \1 ", sentence)
    sentence = sentence.rstrip().strip()
    sentence=  'start_ ' + sentence + ' _end'
    
    return sentence

In [9]:
lines['input']=lines['input'].apply(preprocess_sentence)

In [10]:
lines['input']

30735                   start_ the suspect confessed . _end
56014             start_ is the beam solid or hollow ? _end
44856                start_ we ought to be back soon . _end
113311    start_ venice ,  italy is one of the wonders o...
100949    start_ what do you like most about working her...
                                ...                        
118517    start_ tom couldnt find a job in boston ,  so ...
84562       start_ the problem is youre not canadian . _end
34034                   start_ ill tell you a secret . _end
42877                 start_ im going to take a bath . _end
329                                start_ hi ,  guys . _end
Name: input, Length: 70000, dtype: object

In [11]:
lines['target']=lines['target'].apply(preprocess_sentence)

In [12]:
lines['target']

30735                   start_ el sospechoso confesó . _end
56014           start_ ¿ está la viga sólida o hueca ? _end
44856                start_ deberíamos volver pronto . _end
113311    start_ venecia en italia es una de las maravil...
100949    start_ ¿ qué es lo que más te gusta de trabaja...
                                ...                        
118517    start_ tom no pudo encontrar trabajo en boston...
84562     start_ el problema es que tú no eres canadiens...
34034                   start_ te contaré un secreto . _end
42877                           start_ voy a bañarme . _end
329                     start_ ¿ qué pasa ,  troncos ? _end
Name: target, Length: 70000, dtype: object

In [13]:
len(lines)

70000

In [14]:
rows = lines.to_numpy().tolist()

In [15]:
rows = np.array(rows)
rows

array([['start_ the suspect confessed . _end',
        'start_ el sospechoso confesó . _end'],
       ['start_ is the beam solid or hollow ? _end',
        'start_ ¿ está la viga sólida o hueca ? _end'],
       ['start_ we ought to be back soon . _end',
        'start_ deberíamos volver pronto . _end'],
       ...,
       ['start_ ill tell you a secret . _end',
        'start_ te contaré un secreto . _end'],
       ['start_ im going to take a bath . _end',
        'start_ voy a bañarme . _end'],
       ['start_ hi ,  guys . _end',
        'start_ ¿ qué pasa ,  troncos ? _end']], dtype='<U279')

# Made English Spanish Pairs..

In [16]:
english=[]
for i in lines['input']:
    english.append(i)

In [17]:
len(english)

70000

In [18]:
spanish=[]
for i in lines['target']:
    spanish.append(i)

In [19]:
len(spanish)

70000

In [20]:
english[0]

'start_ the suspect confessed . _end'

# Creating the input and target tokens:-

In [21]:
input_sentence_tokenizer= tf.keras.preprocessing.text.Tokenizer(filters='')
input_sentence_tokenizer.fit_on_texts(english)
input_array = input_sentence_tokenizer.texts_to_sequences(english)
input_array= tf.keras.preprocessing.sequence.pad_sequences(input_array,padding='post')

In [22]:
input_array

array([[   1,    4, 2115, ...,    0,    0,    0],
       [   1,   11,    4, ...,    0,    0,    0],
       [   1,   31,  990, ...,    0,    0,    0],
       ...,
       [   1,   84,   93, ...,    0,    0,    0],
       [   1,   35,   72, ...,    0,    0,    0],
       [   1, 2227,   17, ...,    0,    0,    0]], dtype=int32)

In [23]:
target_sentence_tokenizer= tf.keras.preprocessing.text.Tokenizer(filters='')
target_sentence_tokenizer.fit_on_texts(spanish)
target_array = target_sentence_tokenizer.texts_to_sequences(spanish)
target_array= tf.keras.preprocessing.sequence.pad_sequences(target_array,padding='post',maxlen=30)

In [24]:
target_array

array([[ 1,  3,  2, ...,  0,  0,  0],
       [ 1, 10,  2, ...,  0,  0,  0],
       [ 1,  3,  2, ...,  0,  0,  0],
       ...,
       [ 1,  7,  3, ...,  0,  0,  0],
       [ 1,  7,  3, ...,  0,  0,  0],
       [ 1, 18,  3, ...,  0,  0,  0]], dtype=int32)

In [25]:
print(len(target_array[0]))

30


In [26]:
max_target_length= max(len(t) for t in  target_array)
print(max_target_length)
max_source_length= max(len(t) for t in  input_array)
print(max_source_length)

30
51


# Padding the sentences to a certain length::-

In [27]:
input_sentence_tokenizer= tf.keras.preprocessing.text.Tokenizer(filters='')
input_sentence_tokenizer.fit_on_texts(english)
input_array = input_sentence_tokenizer.texts_to_sequences(english)
input_array= tf.keras.preprocessing.sequence.pad_sequences(input_array,padding='post',maxlen=20)

In [28]:
target_sentence_tokenizer= tf.keras.preprocessing.text.Tokenizer(filters='')
target_sentence_tokenizer.fit_on_texts(spanish)
target_array = target_sentence_tokenizer.texts_to_sequences(spanish)
target_array= tf.keras.preprocessing.sequence.pad_sequences(target_array,padding='post',maxlen=30)

In [29]:
input_sentence_tokenizer

<keras_preprocessing.text.Tokenizer at 0x7fed21ebda50>

In [30]:
target_sentence_tokenizer

<keras_preprocessing.text.Tokenizer at 0x7fed21cc10d0>

# Train Test Split:-

In [31]:
input_train, input_val, target_train, target_val = train_test_split(input_array, target_array, test_size=0.2)

print(len(input_train), len(target_train), len(input_val), len(target_val))

56000 56000 14000 14000


# 80-20 Split

In [32]:
print(len(input_sentence_tokenizer.word_index)+1)

11128


In [33]:
print(len(target_sentence_tokenizer.word_index)+1)

20980


When the dataset is big, we want to create the dataset in memory to be efficient. We will use tf.data.Dataset.from_tensor_slices() method to get slices of the array in the form of an object.

In [34]:
buffer_size = len(input_train)
batch_size = 64
steps_per_epoch = len(input_train)//batch_size
embedding_dim = 256
units = 1024
vocab_input_size = len(input_sentence_tokenizer.word_index)+1
vocab_target_size = len(target_sentence_tokenizer.word_index)+1

dataset = tf.data.Dataset.from_tensor_slices((input_train, target_train)).shuffle(buffer_size)
dataset = dataset.batch(batch_size, drop_remainder=True)
# print(type(dataset))

# Encoder-Decoder Architecture:-

In [36]:
class Encoder(tf.keras.Model):
    def __init__(self,vocab_size,embedding_size,units,batchsize):
        super(Encoder, self).__init__()
        self.batchsize=batchsize
        self.units=units
        self.embedding=tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru=tf.keras.layers.GRU(self.units,return_sequences=True,return_state=True,recurrent_initializer='glorot_uniform')# entire sequence of outputs will be returned from all the units.
        #To return the internal state of GRU, we set the return_state to True
        # this returns 3 parameters in LSTM with return states and sequences true but in GRU it returns only 2 parameters..
    
    def call(self,y,hidden):
        y=self.embedding(y)
        output,state=self.gru(y,initial_state=hidden)
        return output,state

    def initialize_hidden_state(self):
        return tf.zeros((self.batchsize, self.units))

In [37]:
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 20]), TensorShape([64, 30]))

In [38]:
encoder = Encoder(vocab_input_size, embedding_dim, units,batch_size )

sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

Encoder output shape: (batch size, sequence length, units) (64, 20, 1024)
Encoder Hidden state shape: (batch size, units) (64, 1024)


# Creating a Bahdanau Attention Layer:-

Attention layer consists:-

Alignment Score

Attention weights

Context vector

In [39]:
class Attention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(Attention,self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)
    
    # The encoder hiiden states are taken as input to the attention layer which are of shape (batch size, units)
    # and the encoder output of each timestep is of shape (batch size, sequence length, units).
    # so for adding we have to expand dimensions
    def call(self,encoder_out,encoder_hid):
        
       hidden1=tf.expand_dims(encoder_hid,1)
        
        
       # score shape == (batch_size, max_length, 1)
       # we get 1 at the last axis because we are applying score to self.V
       # the shape of the array before applying self.V is (batch_size, max_length, units)
       score = self.V(tf.nn.tanh(
          self.W1(encoder_out) + self.W2(hidden1)))

       # attention_weights shape == (batch_size, max_length, 1)
       attention_weights = tf.nn.softmax(score, axis=1) ## the alignment scores for each encoder hidden state
        #are combined and represented in a single vector and subsequently softmaxed

       # context_vector shape after sum == (batch_size, hidden_size)
       context_vector = attention_weights * encoder_out ## attention weights multiplied with the encoder output states are used to calculate the context vactor
        
       context_vector = tf.reduce_sum(context_vector, axis=1)

       return context_vector, attention_weights ## returning the context vector and the attention_weights

In [40]:
attention_layer = Attention(10)# 10 for units of attention
attention_result, attention_weights = attention_layer(sample_output,sample_hidden)

print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

Attention result shape: (batch size, units) (64, 1024)
Attention weights shape: (batch_size, sequence_length, 1) (64, 20, 1)


# The context vector should be of the shape of (batch size, units) as it be combined with the decoder previous embeddings..

# Decoder Class:-

In [41]:
class Decoder(tf.keras.Model):
    def __init__(self,vocab_size,embedding_size,units,batchsize):
        super(Decoder, self).__init__()
        self.batchsize=batchsize
        self.units=units
        self.embedding=tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru=tf.keras.layers.GRU(self.units,return_sequences=True,return_state=True,recurrent_initializer='glorot_uniform')# entire sequence of outputs will be returned from all the units.
        #To return the internal state of GRU, we set the return_state to True
        
        #fully connected layer for the decoder outputs
        self.fc = tf.keras.layers.Dense(vocab_size)
        
        # attention layer
        self.attention=Attention(self.units)
        
    def call(self, x, enc_output,hidden):
        # enc_output shape == (batch_size, max_length, hidden_size)
        context_vector, attention_weights = self.attention(enc_output,hidden)
#         print(context_vector.shape)

        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)
#         print(x.shape)
        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)## context vaector is added with the previous decoder hidden state.
       
        # passing the concatenated vector to the GRU
        output, state = self.gru(x)

        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))

        # output shape == (batch_size, vocab)
        x = self.fc(output)

        return x, state, attention_weights       

In [42]:
decoder = Decoder(vocab_target_size, embedding_dim, units, batch_size)

sample_decoder_output, _, _ = decoder(tf.random.uniform((batch_size, 1)),
                                      sample_output, sample_hidden)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

Decoder output shape: (batch_size, vocab size) (64, 20980)


# Defining the Optimiser and Loss Function:-

In [43]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.reduce_mean(loss_)

In [45]:
target_sentence_tokenizer.word_index['start_']

1

In [46]:
def train_step(inp, target, encoder_hidden):
  loss = 0

  with tf.GradientTape() as tape:
    encoder_output, encoder_hidden = encoder(inp, enc_hidden)

    decoder_hidden = encoder_hidden

    decoder_input = tf.expand_dims([target_sentence_tokenizer.word_index['start_']] * batch_size, 1)

    # Teacher forcing 
    for t in range(1, target.shape[1]):
      # passing enc_output to the decoder
      predictions, decoder_hidden, _ = decoder(decoder_input, encoder_output, decoder_hidden)

      loss += loss_function(target[:, t], predictions)

      # using teacher forcing
      dec_input = tf.expand_dims(target[:, t], 1)

  batch_loss = (loss / int(target.shape[1]))

  variables = encoder.trainable_variables + decoder.trainable_variables

  gradients = tape.gradient(loss, variables)

  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

In [47]:
steps_per_epoch

875

In [48]:
epochs = 5

for epoch in range(epochs):
  start = time.time()

  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0

  for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss
    if batch % 100 == 0:
      print('EPOCH:- {} BATCH:- {} LOSS:- {}'.format(epoch + 1,batch, batch_loss.numpy()))
   

  print('EPOCH:- {} Loss:- {:.5f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
  print('Time taken for this Epoch {} sec\n'.format(time.time() - start))

EPOCH:- 1 BATCH:- 0 LOSS:- 2.6692914962768555
EPOCH:- 1 BATCH:- 100 LOSS:- 1.554011583328247
EPOCH:- 1 BATCH:- 200 LOSS:- 1.5967878103256226
EPOCH:- 1 BATCH:- 300 LOSS:- 1.7050445079803467
EPOCH:- 1 BATCH:- 400 LOSS:- 1.5926105976104736
EPOCH:- 1 BATCH:- 500 LOSS:- 1.5504059791564941
EPOCH:- 1 BATCH:- 600 LOSS:- 1.6022757291793823
EPOCH:- 1 BATCH:- 700 LOSS:- 1.656678557395935
EPOCH:- 1 BATCH:- 800 LOSS:- 1.3925235271453857
EPOCH:- 1 Loss:- 1.57393
Time taken for this Epoch 466.7751874923706 sec

EPOCH:- 2 BATCH:- 0 LOSS:- 1.414547324180603
EPOCH:- 2 BATCH:- 100 LOSS:- 1.3436685800552368
EPOCH:- 2 BATCH:- 200 LOSS:- 1.378778338432312
EPOCH:- 2 BATCH:- 300 LOSS:- 1.3461782932281494
EPOCH:- 2 BATCH:- 400 LOSS:- 1.3486477136611938
EPOCH:- 2 BATCH:- 500 LOSS:- 1.2581250667572021
EPOCH:- 2 BATCH:- 600 LOSS:- 1.426490068435669
EPOCH:- 2 BATCH:- 700 LOSS:- 1.2014590501785278
EPOCH:- 2 BATCH:- 800 LOSS:- 1.3172775506973267
EPOCH:- 2 Loss:- 1.31184
Time taken for this Epoch 464.2180075645447 se

In [49]:
def evaluate(sentence):
  attention_plot = np.zeros((max_target_length, max_source_length))

  sentence = preprocess_sentence(sentence)

  inputs = [input_sentence_tokenizer.word_index[i] for i in sentence.split(' ')]
  inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                         maxlen=max_source_length,
                                                         padding='post')
  inputs = tf.convert_to_tensor(inputs)

  result = ''

  hidden = [tf.zeros((1, units))]
  enc_out, enc_hidden = encoder(inputs, hidden)

  dec_hidden = enc_hidden
  dec_input = tf.expand_dims([target_sentence_tokenizer.word_index['start_']], 0)

  for t in range(max_target_length):
    predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                         enc_out,
                                                         dec_hidden
                                                         )

    attention_weights = tf.reshape(attention_weights, (-1, ))


    predicted_id = tf.argmax(predictions[0]).numpy()

    result += target_sentence_tokenizer.index_word[predicted_id] + ' '

    if target_sentence_tokenizer.index_word[predicted_id] == '_end':
      return result, sentence

    dec_input = tf.expand_dims([predicted_id], 0)

  return result, sentence

In [51]:
def translate(sentence):
  result, sentence = evaluate(sentence)
  
  print('Input: %s' % (sentence))
  print('Predicted translation: {}'.format(result))

In [56]:
translate('Hello.')

Input: start_ hello . _end
Predicted translation: hola . _end 


In [53]:
translate('Hello everyone.')

Input: start_ hello everyone . _end
Predicted translation: hola todos . _end 


In [54]:
translate('How are you?')

Input: start_ how are you ? _end
Predicted translation: ¿ cómo estás _end 


In [55]:
translate('I am working from home.')

Input: start_ i am working from home . _end
Predicted translation: estoy trabajando . _end 


In [62]:
translate('Learning Spanish.')

Input: start_ learning spanish . _end
Predicted translation: aprender un _end 


In [63]:
translate('Lets Hope for the best.')

Input: start_ lets hope for the best . _end
Predicted translation: esperemos por . _end 
