In [1]:
import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

In [2]:
import unicodedata
import re
import numpy as np
import os
import io
import time

In [3]:
path_to_zip = tf.keras.utils.get_file(
    'spa-eng.zip',
    origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
    extract=True,
    cache_dir='.',
)

Downloading data from http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip


In [4]:
path_to_zip

'./datasets/spa-eng.zip'

In [5]:
path_to_file = os.path.dirname(path_to_zip) + '/spa-eng/spa.txt'

In [6]:
path_to_file

'./datasets/spa-eng/spa.txt'

In [7]:
path_to_file = os.path.join(
    os.path.dirname(path_to_zip),
    'spa-eng',
    'spa.txt'
)

In [8]:
path_to_file

'./datasets/spa-eng/spa.txt'

In [9]:
# Unicode to ascii

In [10]:
? -> \xfa
? -> \00f1

?this is my ?

Object ` -> \xfa` not found.
Object ` -> \00f1` not found.
Object `this is my ` not found.


In [11]:
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn')


In [12]:
unicode_to_ascii('Á')

'A'

In [13]:
def preprocess_sentence(w):
    w = unicode_to_ascii(w.lower().strip())

    # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ."
    # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
    w = re.sub(r"([?.!,¿])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)

    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)

    w = w.strip()

    # adding a start and an end token to the sentence
    # so that the model know when to start and stop predicting.
    w = '<start> ' + w + ' <end>'
    return w

In [14]:
en_sentence = u"May I borrow this book?"
sp_sentence = u"¿Puedo tomar prestado este libro?"
print(preprocess_sentence(en_sentence))
print(preprocess_sentence(sp_sentence).encode('utf-8'))

<start> may i borrow this book ? <end>
b'<start> \xc2\xbf puedo tomar prestado este libro ? <end>'


In [15]:
preprocess_sentence(sp_sentence)

'<start> ¿ puedo tomar prestado este libro ? <end>'

In [16]:
type('abc'.encode('utf-8'))

bytes

In [17]:
def create_dataset(path, num_examples):
    lines = io.open(path, encoding='UTF-8').read().strip().split('\n')

    word_pairs = [[preprocess_sentence(w) for w in l.split('\t')]  for l in lines[:num_examples]]

    return zip(*word_pairs)


In [18]:
lines = io.open(path_to_file, encoding='UTF-8').read().strip().split('\n')

In [19]:
lines[0:10]

['Go.\tVe.',
 'Go.\tVete.',
 'Go.\tVaya.',
 'Go.\tVáyase.',
 'Hi.\tHola.',
 'Run!\t¡Corre!',
 'Run.\tCorred.',
 'Who?\t¿Quién?',
 'Fire!\t¡Fuego!',
 'Fire!\t¡Incendio!']

In [20]:
word_pairs = [[preprocess_sentence(w) for w in l.split('\t')]  for l in lines[:10]]


In [21]:
word_pairs[:10]

[['<start> go . <end>', '<start> ve . <end>'],
 ['<start> go . <end>', '<start> vete . <end>'],
 ['<start> go . <end>', '<start> vaya . <end>'],
 ['<start> go . <end>', '<start> vayase . <end>'],
 ['<start> hi . <end>', '<start> hola . <end>'],
 ['<start> run ! <end>', '<start> corre ! <end>'],
 ['<start> run . <end>', '<start> corred . <end>'],
 ['<start> who ? <end>', '<start> ¿ quien ? <end>'],
 ['<start> fire ! <end>', '<start> fuego ! <end>'],
 ['<start> fire ! <end>', '<start> incendio ! <end>']]

In [22]:
samples = [[1, 2], [3, 4], [5, 6]]

In [23]:

list(zip(*samples))

[(1, 3, 5), (2, 4, 6)]

In [24]:

list(zip(*samples))

[(1, 3, 5), (2, 4, 6)]

In [25]:
list(zip(*word_pairs))

[('<start> go . <end>',
  '<start> go . <end>',
  '<start> go . <end>',
  '<start> go . <end>',
  '<start> hi . <end>',
  '<start> run ! <end>',
  '<start> run . <end>',
  '<start> who ? <end>',
  '<start> fire ! <end>',
  '<start> fire ! <end>'),
 ('<start> ve . <end>',
  '<start> vete . <end>',
  '<start> vaya . <end>',
  '<start> vayase . <end>',
  '<start> hola . <end>',
  '<start> corre ! <end>',
  '<start> corred . <end>',
  '<start> ¿ quien ? <end>',
  '<start> fuego ! <end>',
  '<start> incendio ! <end>')]

In [26]:
for a, b in zip([1,2,3], [4,5,6]):
    print(a, b)

1 4
2 5
3 6


In [27]:
english, spanish = create_dataset(path_to_file, None)

In [28]:
len(english)

118964

In [29]:
len(spanish)

118964

In [30]:
def max_length(tensor):
    return max(len(t) for t in tensor)


In [31]:
def tokenize(lang):
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(
        filters=''
    )
    lang_tokenizer.fit_on_texts(lang)
    tensor = lang_tokenizer.texts_to_sequences(lang)
    tensor = tf.keras.preprocessing.sequence.pad_sequences(
        tensor,
        padding='post'
    )
    
    return tensor, lang_tokenizer

In [32]:
lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(
    filters=''
)

In [33]:
lang_tokenizer.fit_on_texts(english)

In [34]:
english[10:20]

('<start> fire ! <end>',
 '<start> help ! <end>',
 '<start> help ! <end>',
 '<start> help ! <end>',
 '<start> jump ! <end>',
 '<start> jump . <end>',
 '<start> stop ! <end>',
 '<start> stop ! <end>',
 '<start> stop ! <end>',
 '<start> wait ! <end>')

In [35]:
lang_tokenizer.texts_to_sequences(english[10:20])

[[1, 426, 119, 2],
 [1, 94, 119, 2],
 [1, 94, 119, 2],
 [1, 94, 119, 2],
 [1, 1861, 119, 2],
 [1, 1861, 3, 2],
 [1, 213, 119, 2],
 [1, 213, 119, 2],
 [1, 213, 119, 2],
 [1, 266, 119, 2]]

In [36]:
lang_tokenizer.index_word[2]

'<end>'

In [37]:
lang_tokenizer.word_counts

OrderedDict([('<start>', 118964),
             ('go', 2861),
             ('.', 103454),
             ('<end>', 118964),
             ('hi', 20),
             ('run', 200),
             ('!', 921),
             ('who', 1592),
             ('?', 15998),
             ('fire', 228),
             ('help', 1275),
             ('jump', 27),
             ('stop', 503),
             ('wait', 394),
             ('on', 4161),
             ('hello', 30),
             ('i', 34330),
             ('ran', 226),
             ('try', 404),
             ('won', 688),
             ('oh', 25),
             ('no', 1690),
             ('relax', 46),
             ('smile', 84),
             ('attack', 49),
             ('get', 1909),
             ('up', 2108),
             ('now', 1237),
             ('got', 1178),
             ('it', 9779),
             ('he', 11460),
             ('hop', 2),
             ('in', 8977),
             ('hug', 19),
             ('me', 6472),
             ('fell', 187),
        

In [38]:
lang_tokenizer.texts_to_sequences(english[100:110])

[[1, 49, 223, 119, 2],
 [1, 49, 223, 119, 2],
 [1, 49, 223, 119, 2],
 [1, 49, 223, 119, 2],
 [1, 49, 223, 119, 2],
 [1, 49, 223, 3, 2],
 [1, 49, 223, 3, 2],
 [1, 49, 223, 3, 2],
 [1, 49, 223, 3, 2],
 [1, 49, 223, 3, 2]]

In [39]:
lang_tokenizer.word_index['<start>']

1

In [40]:
english[:2]

('<start> go . <end>', '<start> go . <end>')

In [41]:
sample_tensor, english_tokenize = tokenize(english)

In [42]:
type(sample_tensor)

numpy.ndarray

In [43]:
sample_tensor.shape

(118964, 51)

In [44]:
english[-1]

'<start> if you want to sound like a native speaker , you must be willing to practice saying the same sentence over and over in the same way that banjo players practice the same phrase over and over until they can play it correctly and at the desired tempo . <end>'

In [45]:
sample_tensor[-1:]

array([[    1,    72,     7,    39,     6,   718,    43,     9,   995,
         1277,    20,     7,   129,    37,  1426,     6,  1950,   586,
            5,   268,  1049,   181,    42,   181,    16,     5,   268,
          170,    17,  5242,  2246,  1950,     5,   268,  4623,   181,
           42,   181,   346,    48,    33,   200,    14,  3437,    42,
           44,     5, 12932, 12933,     3,     2]], dtype=int32)

In [46]:
def load_dataset(path, num_examples=None):
  # creating cleaned input, output pairs
    targ_lang, inp_lang = create_dataset(path, num_examples)

    input_tensor, inp_lang_tokenizer = tokenize(inp_lang)
    
    target_tensor, targ_lang_tokenizer = tokenize(targ_lang)

    return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

In [47]:
num_examples = 30000
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(path_to_file, num_examples)

# Calculate max_length of the target tensors
max_length_targ, max_length_inp = max_length(target_tensor), max_length(input_tensor)


In [48]:
max_length_inp

16

In [49]:
max_length_targ

11

In [50]:
# X_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [51]:
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)

In [52]:

print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))


24000 24000 6000 6000


In [53]:
def convert(lang, tensor):
    for t in tensor:
        if t!=0:
            print ("%d ----> %s" % (t, lang.index_word[t]))


In [54]:
print ("Input Language; index to word mapping")
convert(inp_lang, input_tensor_train[0])
print ()
print ("Target Language; index to word mapping")
convert(targ_lang, target_tensor_train[0])


Input Language; index to word mapping
1 ----> <start>
12 ----> me
142 ----> siento
136 ----> mal
22 ----> por
9 ----> el
3 ----> .
2 ----> <end>

Target Language; index to word mapping
1 ----> <start>
4 ----> i
135 ----> feel
141 ----> bad
54 ----> for
41 ----> him
3 ----> .
2 ----> <end>


In [55]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
embedding_dim = 256
units = 1024
vocab_inp_size = len(inp_lang.word_index)+1
vocab_tar_size = len(targ_lang.word_index)+1

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [56]:
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 16]), TensorShape([64, 11]))

In [57]:
example_input_batch

<tf.Tensor: id=18, shape=(64, 16), dtype=int32, numpy=
array([[   1,   53,   61, ...,    0,    0,    0],
       [   1,    9,  106, ...,    0,    0,    0],
       [   1,    6,  132, ...,    0,    0,    0],
       ...,
       [   1, 2158,  772, ...,    0,    0,    0],
       [   1,   25,   12, ...,    0,    0,    0],
       [   1, 2268,   15, ...,    0,    0,    0]], dtype=int32)>

In [58]:
import pickle

In [59]:
with open('eng-tokenizer.pickle', 'wb') as handle:
    pickle.dump(inp_lang,handle, protocol=pickle.HIGHEST_PROTOCOL)

In [60]:
with open('eng-tokenizer.pickle', 'rb') as handle:
    eng_tokenizer = pickle.load(handle)

In [61]:
eng_tokenizer

<keras_preprocessing.text.Tokenizer at 0x7f5caca7d898>

In [62]:
inp_lang.to_json()

'{"class_name": "Tokenizer", "config": {"num_words": null, "filters": "", "lower": true, "split": " ", "char_level": false, "oov_token": null, "document_count": 30000, "word_counts": "{\\"<start>\\": 30000, \\"ve\\": 96, \\".\\": 25250, \\"<end>\\": 30000, \\"vete\\": 43, \\"vaya\\": 21, \\"vayase\\": 7, \\"hola\\": 16, \\"corre\\": 14, \\"!\\": 724, \\"corred\\": 1, \\"\\\\u00bf\\": 4063, \\"quien\\": 370, \\"?\\": 4066, \\"fuego\\": 15, \\"incendio\\": 2, \\"disparad\\": 2, \\"ayuda\\": 106, \\"socorro\\": 1, \\"auxilio\\": 2, \\"salta\\": 2, \\"salte\\": 3, \\"parad\\": 2, \\"para\\": 234, \\"pare\\": 6, \\"espera\\": 58, \\"esperen\\": 7, \\"continua\\": 13, \\"continue\\": 4, \\"corri\\": 7, \\"corria\\": 2, \\"lo\\": 1250, \\"intento\\": 17, \\"he\\": 259, \\"ganado\\": 13, \\"oh\\": 7, \\",\\": 584, \\"no\\": 3177, \\"tomatelo\\": 2, \\"con\\": 399, \\"soda\\": 2, \\"sonrie\\": 3, \\"al\\": 237, \\"ataque\\": 2, \\"atacad\\": 1, \\"levanta\\": 15, \\"ahora\\": 310, \\"mismo\\": 

# Encoder Model

In [63]:
# tf.keras.Model
# tf.keras.layers.Layer

In [93]:
import keras
from keras.layers import Dense, Conv2D

In [94]:
model = Sequential()
model.add(Conv2D)
model.add(Activation('relu'))
model.add(Dense())
model.add(MyLayer(4))
          

TypeError: The added layer must be an instance of class Layer. Found: <class 'keras.layers.convolutional.Conv2D'>

In [87]:
Dense(4)

<tensorflow.python.keras.layers.core.Dense at 0x7f5cae6841d0>

In [66]:
class MyLayer(tf.keras.layers.Layer):
    def __init__(self, units, input_size=10):
        super(MyLayer, self).__init__()
        self.units = units
        self.W = tf.Variable(np.random.random((input_size, units)))
#         self.b = tf.Variable(np.zeros(units))
        
    def call(self, x):
        return tf.matmul(self.W, x)

In [67]:
# Conv block
# Convolution
# Activation
# Pooling

In [71]:
class ConvBlock(tf.keras.layers.Layer):
    def __init__(self):
        super(ConvBlock, self).__init__()
        self.conv = Conv2D()
        self.activation = Activation('relu')
        self.pool = MaxPool2D()
#         self.b = tf.Variable(np.zeros(units))
        
    def call(self, x):
        x = self.conv(x)
        x = self.activation(x)
        x = self.pool(x)
        return x

In [None]:
ConvBlock
ConvBlock
ConvBlock

In [72]:
class MyCustomModel(tf.keras.Model):
    def __init__(self):
        super(MyCustomModel, self).__init__()
#         self.conv = Conv2D()
#         self.activation = Activation('relu')
#         self.pool = MaxPool2D()
        self.conv1 = ConvBlock()
#         self.conv = Conv2D()
#         self.activation = Activation('relu')
#         self.pool = MaxPool2D()    
    
        self.conv2 = ConvBlock()
#         self.conv = Conv2D()
#         self.activation = Activation('relu')
#         self.pool = MaxPool2D()
        self.conv3 = ConvBlock()
        #
        
        
    def call(self, x):
        
        x1 = self.conv1(x)
        x2 = self.conv2(tf.concat(x1, x2))
        x3 = self.conv3(x1)
        return x

In [None]:
model.fit()
model.predict()
model.evaluate()

In [None]:
x = Input()(shape=(224, 224, 3))
x = ConvBlock((3,3), f, s, t)(x)
x = Conv2D((3,3), f, s, t)(x)
x = Conv2D((3,3), f, s, t)(x)

In [None]:
optimizer = Adam
loss = CE

model.compile(optimizer=optimizer, loss=loss)
model.fit(X, y, epochs=10, callbacks=[])
model.evaluate()
model.predict(x)

In [None]:
# with tf.GradientTape() as tape:
    # gardients

In [73]:
# Encoder

In [75]:
from tensorflow.keras.layers import Embedding, LSTM, GRU

In [102]:
class Encoder(tf.keras.Model):
    def __init__(self, enc_size, vocab_size, embedding_dim, batch_size, *args, **kwargs):
        super(Encoder, self).__init__(*args, **kwargs)
        self.batch_size = batch_size
        self.enc_size = enc_size
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.gru1 = GRU(
            self.enc_size,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform'
        )
#         self.gru2 = GRU(
#             self.enc_size,
#             return_sequences=True,
#             return_state=True,
#             recurrent_initializer='glorot_uniform'
#         )
        
        
        
    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru1(x, initial_state=hidden)
#         output, state = self.gru2(output, initial_state=hidden)
        return output, state

    def intialize_hidden_state(self):
        # h0 initialize
        return tf.zeros((self.batch_size, self.enc_size))

In [98]:
class MyImprovedEncoder(Encoder):
    def __init__(self, a, *args, **kwargs):
        super(MyImprovedEncoder, self).__init__(*args, **kwargs)
        self.a = a
        # my code

In [99]:
MyImprovedEncoder(a=1, enc_size=units, vocab_size=vocab_inp_size, embedding_dim=embedding_dim, batch_size=BATCH_SIZE)

<__main__.MyImprovedEncoder at 0x7ffa44da4048>

In [None]:
# h0 = tf.zero(BATCH_SIZE, enc_size)

In [77]:
vocab_inp_size

9414

In [78]:
embedding_dim

256

In [79]:
BATCH_SIZE

64

In [80]:
units

1024

In [103]:
encoder = Encoder(units, vocab_inp_size, embedding_dim,BATCH_SIZE)

In [105]:
sample_hidden = encoder.intialize_hidden_state()

In [None]:
# __call__

In [115]:
sample_enc_output, sample_enc_hidden = encoder(example_input_batch, sample_hidden)

In [108]:
sample_output.shape

TensorShape([64, 16, 1024])

In [107]:
sample_hidden.shape

TensorShape([64, 1024])

In [109]:
encoder.trainable_variables

[<tf.Variable 'encoder_1/embedding_5/embeddings:0' shape=(9414, 256) dtype=float32, numpy=
 array([[-0.02405313,  0.01118705, -0.00900221, ..., -0.02745303,
         -0.01768808, -0.02039759],
        [-0.02430129, -0.04065233,  0.02536715, ..., -0.04108682,
         -0.01033243,  0.04633598],
        [-0.00243936, -0.01556148,  0.0426462 , ..., -0.00887741,
          0.03793115, -0.03844951],
        ...,
        [ 0.04485312,  0.0453629 , -0.04329422, ..., -0.03758893,
          0.01958226,  0.02176924],
        [-0.02578396, -0.01560203, -0.03896049, ...,  0.03056771,
         -0.00318732,  0.04133384],
        [-0.03276111,  0.00088693,  0.02171394, ..., -0.0446475 ,
          0.01616229,  0.02238699]], dtype=float32)>,
 <tf.Variable 'encoder_1/gru_3/kernel:0' shape=(256, 3072) dtype=float32, numpy=
 array([[ 0.01239154, -0.00067599, -0.01560164, ..., -0.04024666,
          0.02721113, -0.03719517],
        [ 0.00335   ,  0.02379695, -0.00996122, ..., -0.01288218,
          0.00441

In [127]:
# Decoder

class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_size):
        super(Decoder, self).__init__()
        self.batch_size = batch_size
        self.dec_units = dec_units
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.gru = GRU(
            self.dec_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform'
        )
        self.fc = tf.keras.layers.Dense(vocab_size)
        
    
    def call(self, x, hidden, enc_output):
        x = self.embedding(x)
        output, state = self.gru(x)
        print(output.shape)
        output = tf.reshape(output, (-1, output.shape[2]))
        print(output.shape)
        x = self.fc(output)
        return x

In [128]:
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

In [129]:
example_input_batch

<tf.Tensor: id=18, shape=(64, 16), dtype=int32, numpy=
array([[   1,    4,  879, ...,    0,    0,    0],
       [   1,   65,   68, ...,    0,    0,    0],
       [   1,    6,   60, ...,    0,    0,    0],
       ...,
       [   1,   48, 2994, ...,    0,    0,    0],
       [   1,  352,   20, ...,    0,    0,    0],
       [   1,  406, 5367, ...,    0,    0,    0]], dtype=int32)>

In [131]:
sample_decoder_output = decoder(
    tf.random.uniform((BATCH_SIZE, 11)),
    sample_enc_hidden,
    sample_enc_output
)

(64, 11, 1024)
(704, 1024)


In [None]:
# Attention

In [132]:
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
    super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        # query hidden state shape == (batch_size, hidden size)
        # query_with_time_axis shape == (batch_size, 1, hidden size)
        # values shape == (batch_size, max_len, hidden size)
        # we are doing this to broadcast addition along the time axis to calculate the score
        query_with_time_axis = tf.expand_dims(query, 1)

        # score shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying score to self.V
        # the shape of the tensor before applying self.V is (batch_size, max_length, units)
        # Additive attenion
        score = self.V(tf.nn.tanh(
            self.W1(query_with_time_axis) + self.W2(values)))

        # attention_weights shape == (batch_size, max_length, 1)
        # Softmax(score)
        attention_weights = tf.nn.softmax(score, axis=1)

        # context_vector shape after sum == (batch_size, hidden_size)
        # alpha * ht
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

IndentationError: expected an indented block (<ipython-input-132-fca44f856e8b>, line 3)

In [None]:
# Decoder

class AttentionDecoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_size):
        super(Decoder, self).__init__()
        self.batch_size = batch_size
        self.dec_units = dec_units
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.gru = GRU(
            self.dec_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform'
        )
        self.fc = tf.keras.layers.Dense(vocab_size)
        self.attention = Ba
    
    def call(self, x, hidden, enc_output):
        x = self.embedding(x)
        output, state = self.gru(x)
        print(output.shape)
        output = tf.reshape(output, (-1, output.shape[2]))
        print(output.shape)
        x = self.fc(output)
        return x
