In [1]:
import tensorflow as tf
print(tf.__version__)

print('GPU name: {}'.format(tf.test.gpu_device_name()))

2.5.0
GPU name: /device:GPU:0


In [2]:
!nvidia-smi

Tue Jun 15 16:58:24 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce GTX 165...  Off  | 00000000:01:00.0  On |                  N/A |
| N/A   45C    P0    14W /  N/A |    297MiB /  3903MiB |      8%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [3]:
# folder path if running in Google Colab
#googleDrivePathPrefix = 'drive/My Drive/Colab Notebooks'

# folder path for running locally
googleDrivePathPrefix = '.'

In [4]:
import pandas as pd
import numpy as np
from os import path
from math import ceil
import json
import time
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model,load_model
from tensorflow.keras.layers import Layer,Input,Masking,LSTM,Embedding,Dense
from tensorflow.keras.callbacks import ModelCheckpoint,ReduceLROnPlateau,EarlyStopping

In [5]:
df = pd.read_json(path.join(googleDrivePathPrefix,'data/cmn-processed-tokenized.json'))
df.head()

Unnamed: 0,english,chinese,english_split,chinese_split,chinese_tokenized
0,Hi .,嗨 。,"[Hi, .]","[<start>, 嗨, 。, <end>]","[1, 1924, 3, 2]"
1,Hi .,你好 。,"[Hi, .]","[<start>, 你, 好, 。, <end>]","[1, 7, 33, 3, 2]"
2,Run .,你用跑的 。,"[Run, .]","[<start>, 你, 用, 跑, 的, 。, <end>]","[1, 7, 95, 397, 5, 3, 2]"
3,Wait !,等等 ！,"[Wait, !]","[<start>, 等, 等, ！, <end>]","[1, 208, 208, 160, 2]"
4,Wait !,等一下 ！,"[Wait, !]","[<start>, 等, 一, 下, ！, <end>]","[1, 208, 12, 46, 160, 2]"


Apply padding to the tokenized german sequences.

In [6]:
tokenizer_seq = df['chinese_tokenized']
max_len_in_chinese_tokenized = max([len(item) for item in tokenizer_seq])
print(f'Max length in tokenized chinese sequence: {max_len_in_chinese_tokenized}')

Max length in tokenized chinese sequence: 46


In [7]:
chinese_padded_seq = pad_sequences(tokenizer_seq,maxlen = None,padding = "post")
print(f'Shape of padded sequences: {chinese_padded_seq.shape}')

Shape of padded sequences: (24089, 46)


In [8]:
print(f"0-th chinese sentence: {df['chinese_split'].loc[0]}\n")
print(f"0-th tokenized sequence: {df['chinese_tokenized'].loc[0]}\n")
print(f"0-th padded sequence: {chinese_padded_seq[0]}\n")


0-th chinese sentence: ['<start>', '嗨', '。', '<end>']

0-th tokenized sequence: [1, 1924, 3, 2]

0-th padded sequence: [   1 1924    3    2    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0]



Load a pre-trained english embedding layer from Tensorflow Hub

``` python
# Load embedding module from Tensorflow Hub
embedding_layer = hub.KerasLayer("https://tfhub.dev/google/tf2-preview/nnlm-en-dim128/1",
output_shape=[128], input_shape=[], dtype=tf.string)
```

We will use the pre-downloaded model file instead, as follows:

In [9]:
embedding_layer = load_model(path.join(googleDrivePathPrefix,'models/tf2-preview_nnlm-en-dim128_1'))



In [10]:
# Test the layer
embedding_layer(tf.constant(["these", "aren't", "the", "droids", "you're","looking", "for"])).shape

TensorShape([7, 128])

Train test split using `sklearn` library.

In [11]:
x_train,x_test,y_train,y_test = train_test_split(df['english'].to_list(),chinese_padded_seq,train_size=0.8,shuffle=True)

In [12]:
print(f'x_train[0]:\n{x_train[0]}\n')
print(f'y_train[0]:\n{y_train[0]}\n')

x_train[0]:
I know I owe you money . 

y_train[0]:
[   1    4   49   47    4 1175    7  238    3    2    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0]



In [13]:
print(f'x_train length: {len(x_train)}\n')
print(f'y_train length: {len(y_train)}\n')
print(f'x_test length: {len(x_test)}\n')
print(f'y_test length: {len(y_test)}\n')

x_train length: 19271

y_train length: 19271

x_test length: 4818

y_test length: 4818



Create `tf.data.Dataset` object

In [14]:
training_dataset = tf.data.Dataset.from_tensor_slices((x_train,y_train))
validation_dataset = tf.data.Dataset.from_tensor_slices((x_test,y_test))

Mapping to split english text with spaces.

In [15]:
def map_splitting(english,chinese):
    return (tf.strings.split(english,' '),chinese)

training_dataset_split=training_dataset.map(map_splitting)
validation_dataset_split=validation_dataset.map(map_splitting)

In [16]:
training_dataset_split.element_spec

(TensorSpec(shape=(None,), dtype=tf.string, name=None),
 TensorSpec(shape=(46,), dtype=tf.int32, name=None))

Inspect first element after map_splitting.

In [17]:
element = next(iter(training_dataset_split.take(1)))
print(f'x_train[0]:\n{element[0]}\n')
print(f'x_train[0] shape:\n{element[0].shape}\n')
print(f'y_train[0]:\n{element[1]}\n')

x_train[0]:
[b'I' b'know' b'I' b'owe' b'you' b'money' b'.' b'']

x_train[0] shape:
(8,)

y_train[0]:
[   1    4   49   47    4 1175    7  238    3    2    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0]



Filter dataset examples where the English sentence is more than 46.

In [18]:
# max_len_in_chinese_tokenized = 46

def filter_less_eq_max_token_len(english,chinese):
    return tf.less_equal(tf.shape(english)[0],tf.constant(max_len_in_chinese_tokenized))

training_dataset_filter = training_dataset_split.filter(filter_less_eq_max_token_len)
validation_dataset_filter = validation_dataset_split.filter(filter_less_eq_max_token_len)

In [19]:
count_train = training_dataset_filter.reduce(0, lambda x, _: x + 1)
count_val = validation_dataset_filter.reduce(0, lambda x, _: x + 1)

In [20]:
print(f'{count_train}')
print(f'{count_val}')

19271
4818


Inspect 5 random samples from the filtered training dataset.

In [21]:
elements = iter(training_dataset_filter.shuffle(buffer_size=100).take(5))
i=0
for english,german in elements:
    i+=1
    print(f'{i}-th random sample shape:\n{english.shape}\n')

1-th random sample shape:
(7,)

2-th random sample shape:
(8,)

3-th random sample shape:
(11,)

4-th random sample shape:
(14,)

5-th random sample shape:
(8,)



Mapping to apply the pre-trained word embedding to english texts.

In [22]:
def map_embedding(english,chinese):
    return (embedding_layer(english),chinese)

training_dataset_embed=training_dataset_filter.map(map_embedding)
validation_dataset_embed=validation_dataset_filter.map(map_embedding)

In [23]:
training_dataset_embed.element_spec

(TensorSpec(shape=(None, 128), dtype=tf.float32, name=None),
 TensorSpec(shape=(46,), dtype=tf.int32, name=None))

Inspect first element after map_embedding.

In [24]:
element = next(iter(training_dataset_embed.take(1)))
print(f'Embedding mapping:\n')
print(f'x_train[0] shape:\n{element[0].shape}\n')
print(f'y_train[0]:\n{element[1]}\n')

Embedding mapping:

x_train[0] shape:
(8, 128)

y_train[0]:
[   1    4   49   47    4 1175    7  238    3    2    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0]



Padding english sentence embeddings to length of 46 (padding to prior).

In [25]:
def map_english_padding(english,chinese):
    english_length = tf.shape(english)[0]
    paddings = [[max_len_in_chinese_tokenized-english_length,0],
                [0,0]
               ]
    return (tf.pad(english,paddings=paddings),chinese)

training_dataset_english_padded=training_dataset_embed.map(map_english_padding)
validation_dataset_english_padded=training_dataset_embed.map(map_english_padding)

In [26]:
element = next(iter(training_dataset_english_padded.take(1)))
print(f'x_train[0] shape:\n{element[0].shape}\n')
print(f'y_train[0]:\n{element[1]}\n')

x_train[0] shape:
(46, 128)

y_train[0]:
[   1    4   49   47    4 1175    7  238    3    2    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0]



Batching to batch_size of 16.

In [27]:
batch_size = 16
training_dataset_final = training_dataset_english_padded.\
                         repeat().\
                         batch(batch_size=batch_size)
validation_dataset_final = validation_dataset_english_padded.\
                         repeat().\
                         batch(batch_size=batch_size)

Inspecting element_spec of final training and validation datasets.

In [28]:
training_dataset_final.element_spec

(TensorSpec(shape=(None, None, 128), dtype=tf.float32, name=None),
 TensorSpec(shape=(None, 46), dtype=tf.int32, name=None))

In [29]:
validation_dataset_final.element_spec

(TensorSpec(shape=(None, None, 128), dtype=tf.float32, name=None),
 TensorSpec(shape=(None, 46), dtype=tf.int32, name=None))

Inspecting first batch of both training and validation datasets.

In [30]:
element = next(iter(training_dataset_final.take(1)))
print(f'x_train batch shape:\n{element[0].shape}\n')
print(f'y_train batch shape:\n{element[1].shape}\n')

x_train batch shape:
(16, 46, 128)

y_train batch shape:
(16, 46)



In [31]:
element = next(iter(validation_dataset_final.take(1)))
print(f'x_validation batch shape:\n{element[0].shape}\n')
print(f'y_validation batch shape:\n{element[1].shape}\n')

x_validation batch shape:
(16, 46, 128)

y_validation batch shape:
(16, 46)



### Create encoder model

In [32]:
class EndTokenEmbedLayer(Layer):
    def __init__(self):
        super(EndTokenEmbedLayer, self).__init__()

    def build(self, input_shape):
        self.embedding_size = input_shape[-1]
        self.embedding = self.add_weight(shape=(self.embedding_size,),
                                         initializer='random_normal',
                                         name='end_token_embedding')
  
    def call(self, inputs):
        one_row = tf.reshape(self.embedding,(-1,1,self.embedding_size))
        end_token_output = tf.tile(one_row,[tf.shape(inputs)[0],1,1])
        return tf.concat((inputs,end_token_output),axis=1)

In [33]:
endTokenlayer = EndTokenEmbedLayer()
for english,german in iter(training_dataset_final.take(1)):
    endTokenAdded = endTokenlayer(english)
    print(f'English sentences shape: {english.shape}\n')
    print(f'English sentences (end token appended) shape: {endTokenAdded.shape}\n')

English sentences shape: (16, 46, 128)

English sentences (end token appended) shape: (16, 47, 128)



In [34]:
def Encoder(input_shape):
    inputs = Input(input_shape)
    h = EndTokenEmbedLayer()(inputs)
    h = Masking(mask_value=0.)(h)
    lstm , hidden_state, cell_state = LSTM(512,return_sequences=True,return_state=True)(h)
    model = Model(inputs=inputs, outputs=[hidden_state, cell_state])
    return model

In [35]:
# max_len_in_chinese_tokenized =46

encoder = Encoder(input_shape=(max_len_in_chinese_tokenized,128))
encoder.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 46, 128)]         0         
_________________________________________________________________
end_token_embed_layer_1 (End (None, 47, 128)           128       
_________________________________________________________________
masking (Masking)            (None, 47, 128)           0         
_________________________________________________________________
lstm (LSTM)                  [(None, 47, 512), (None,  1312768   
Total params: 1,312,896
Trainable params: 1,312,896
Non-trainable params: 0
_________________________________________________________________


In [36]:
for english,chinese in iter(training_dataset_final.take(1)):
    hidden_state, cell_state = encoder(english)
    print(f'hidden_state shape: {hidden_state.shape}\n')
    print(f'cell_state shape: {cell_state.shape}\n')

hidden_state shape: (16, 512)

cell_state shape: (16, 512)



### Build the decoder network

In [37]:
tokenizer=[]
with open(path.join(googleDrivePathPrefix,'data/tokenizer.json')) as f:
    data = json.load(f)
    tokenizer = tokenizer_from_json(data)

tokenizer_config = tokenizer.get_config()

In [38]:
word_index = json.loads(tokenizer_config['word_index'])
max_word_index = max(word_index.values())
max_word_index

3438

In [39]:
class Decoder(Model):
    def __init__(self,input_embedding_dim):
        super(Decoder, self).__init__()
        self.embedding = Embedding(input_dim = input_embedding_dim[0],
                                   output_dim = input_embedding_dim[1],
                                   mask_zero = True, trainable=True)
        self.lstm = LSTM(units=512, return_sequences=True, return_state=True, trainable=True)
        self.dense = Dense(units=max_word_index + 1, trainable=True)

    def call(self,inputs,hidden_state = None,cell_state = None):
        h = self.embedding(inputs)
        if hidden_state != None and cell_state != None:
            lstm,hidden,cell = self.lstm(h,initial_state =[hidden_state,cell_state])
        else:
            lstm,hidden,cell = self.lstm(h)
        h = self.dense(lstm)
        return h,hidden,cell

Remark: Notice that input_dim is set to maximum word index + 1 because the embedding layer maps vocabulary word index into 0-indexed array. Thus the max word index, 3438 needs to be
mapped into an array of 3439 elements, i.e. with the range of [0,3438].

In [40]:
decoder = Decoder(input_embedding_dim=(max_word_index + 1, 128))

In [41]:
for english,chinese in iter(training_dataset_final.take(1)):
    print(chinese.shape)
    lstm_out, hidden_state, cell_state = decoder(chinese)
    print(f'lstm_out shape: {lstm_out.shape}\n')
    print(f'hidden_state shape: {hidden_state.shape}\n')
    print(f'cell_state shape: {cell_state.shape}\n')

(16, 46)
lstm_out shape: (16, 46, 3439)

hidden_state shape: (16, 512)

cell_state shape: (16, 512)



In [42]:
decoder.summary()

Model: "decoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  440192    
_________________________________________________________________
lstm_1 (LSTM)                multiple                  1312768   
_________________________________________________________________
dense (Dense)                multiple                  1764207   
Total params: 3,517,167
Trainable params: 3,517,167
Non-trainable params: 0
_________________________________________________________________


### Make a custom training loop

In [43]:
# define loss objective
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True)

In [44]:
# define optimizer
optimizer = tf.keras.optimizers.SGD(learning_rate=0.01,momentum=0.9,nesterov=True)

In [45]:
# define custom model via model subclassing 
class NeuralTranslationModel(Model):
    def __init__(self,encoder_input_shape,decoder_input_shape):
        super(NeuralTranslationModel, self).__init__()
        self.encoder = Encoder(input_shape=encoder_input_shape)
        self.decoder = Decoder(input_embedding_dim=decoder_input_shape)
        self.model_trainable_variables = self.encoder.trainable_variables + \
                                         self.decoder.trainable_variables    
  
    def chinese_data_io(self,chinese_data):
        input_data = chinese_data[:,0:tf.shape(chinese_data)[1]-1]
        output_data = chinese_data[:,1:tf.shape(chinese_data)[1]]
        return(input_data,output_data)

    def call(self,inputs):
        (encoder_in, decoder_in)=inputs
        hidden_state ,cell_state = self.encoder(encoder_in)
        dense_output, _, _ = self.decoder(decoder_in, hidden_state, cell_state)
        return dense_output

    @tf.function
    def train_step(self,data):        
        (english,chinese) = data
        chinese_input, chinese_output = self.chinese_data_io(chinese)  
        with tf.GradientTape() as tape:        
            hidden_state ,cell_state = self.encoder(english)
            dense_output, _, _ = self.decoder(chinese_input, hidden_state, cell_state)
            loss = tf.math.reduce_mean(self.compiled_loss(chinese_output,dense_output))
            grads = tape.gradient(loss, self.model_trainable_variables)
            self.optimizer.apply_gradients(zip(grads,
                                               self.model_trainable_variables))
            self.compiled_metrics.update_state(chinese_output,dense_output)
        return {m.name:m.result() for m in self.metrics}

    @tf.function
    def test_step(self, data):
        (english,chinese) = data
        chinese_input, chinese_output = self.chinese_data_io(chinese) 
        hidden_state ,cell_state = self.encoder(english)
        dense_output, _, _ = self.decoder(chinese_input, hidden_state, cell_state)
        loss = tf.math.reduce_mean(self.compiled_loss(chinese_output,dense_output))
        self.compiled_metrics.update_state(chinese_output,dense_output)
        return {m.name:m.result() for m in self.metrics}

In [46]:
encoder_in=tf.zeros([1,max_len_in_chinese_tokenized,128])
encoder_in.shape

TensorShape([1, 46, 128])

In [47]:
decoder_in=tf.Variable([[1]])
decoder_in.shape

TensorShape([1, 1])

In [48]:
# instantiate and compile the custom model
translation_model = NeuralTranslationModel(encoder_input_shape=(max_len_in_chinese_tokenized,128),
                                           decoder_input_shape=(max_word_index + 1, 128))

# build the model by calling it
translation_model((encoder_in,decoder_in))

translation_model.compile(optimizer = optimizer,
                          loss = loss_object,
                          metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])

In [49]:
# define callbacks
checkpoint_epoch = ModelCheckpoint(filepath='models/eng-to-ch/checkpoint_epoch/checkpoint_{epoch}',
                                   save_weights_only=True,
                                   save_freq='epoch',
                                   verbose=1)
checkpoint_best = ModelCheckpoint(filepath='models/eng-to-ch/checkpoint_best/checkpoint',
                                   save_weights_only=True,
                                   save_freq='epoch',
                                   save_best_only=True,
                                   monitor='val_sparse_categorical_accuracy',
                                   verbose=1)
lr_reduce_plateau = ReduceLROnPlateau(monitor='val_loss', 
                                      factor=0.05, 
                                      patience=5, 
                                      verbose=1, 
                                      mode='min')
early_stopping = EarlyStopping(monitor='val_loss', 
                               min_delta=0, 
                               patience=10, 
                               verbose=1,
                               mode='min')

callbacks=[checkpoint_best,lr_reduce_plateau,early_stopping]

# fit the model
steps_per_epoch = ceil(len(x_train)/int(batch_size))
validation_steps = ceil(len(x_test)/int(batch_size))
translation_model.fit(training_dataset_final,
                      epochs=2,
                      steps_per_epoch=steps_per_epoch,
                      validation_data=validation_dataset_final,
                      validation_steps=validation_steps,
                      callbacks=callbacks
                     )

Epoch 1/2

Epoch 00001: val_sparse_categorical_accuracy improved from -inf to 0.04537, saving model to models/eng-to-ch/checkpoint_best/checkpoint
Epoch 2/2

Epoch 00002: val_sparse_categorical_accuracy did not improve from 0.04537


<tensorflow.python.keras.callbacks.History at 0x7f194eb1e070>

The encoder and decoder models can be accessed through the custom model as followed:

In [50]:
translation_model.encoder

<tensorflow.python.keras.engine.functional.Functional at 0x7f191340cc10>

In [51]:
translation_model.decoder

<__main__.Decoder at 0x7f1913200250>

### Making translation with the custom model

In [52]:
start_token = word_index['<start>']
end_token = word_index['<end>']
inv_chinese_index = {value:key for key,value in tokenizer.word_index.items()}

In [53]:
def translate(english_split_in):
    eng_embedding = embedding_layer(english_split_in)
    eng_padded = tf.pad(eng_embedding, 
                        [[max_len_in_chinese_tokenized-len(eng_embedding), 0], 
                         [0, 0]], 
                        constant_values = 0)
    english_expand = tf.expand_dims(eng_padded, 0)
    hidden_state, cell_state = translation_model.encoder(english_expand)

    current_translation = []
    current_token = tf.Variable([[start_token]])

    while (len(current_translation) <= max_len_in_chinese_tokenized):
        out1, hidden_state, cell_state = translation_model.decoder(current_token,hidden_state,cell_state)
        out2 = tf.argmax(out1, axis=2).numpy()[0,0]
        current_token = tf.Variable([[out2]])
        if out2 == end_token:
            break
        else:
            current_translation.append(out2)
    inv_tokenized = [inv_chinese_index[w] for w in current_translation]
    inv_tokenized_string = ' '.join(inv_tokenized)
    return inv_tokenized_string

In [54]:
df['english_split'][100]

['Hurry', 'up', '.']

In [55]:
translate(df['english_split'][100])

'我 我 我 我 我 我 我 我 我 我 我 坏 鬆 鬆 赖 赖 赖 棒 辣 辣 裕 匈 辣 守 匈 憶 憶 砸 挣 壽 猛 榜 滋 绳 貓 遮 遮 跃 墳 敗 尤 洛 洛 薪 薪 助 苗'

Up to this point, the model has not been fully trained. However, we have verify the functionality of the entire pipeline.

Up next, we will train the model in the next notebook.