In [1]:
import tensorflow as tf
print(tf.__version__)
print('GPU name: {}'.format(tf.test.gpu_device_name()))

2.5.0
GPU name: /device:GPU:0


In [2]:
!nvidia-smi

Sat Jun 12 15:40:44 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce GTX 165...  Off  | 00000000:01:00.0  On |                  N/A |
| N/A   40C    P0    13W /  N/A |    277MiB /  3903MiB |      3%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [3]:
import numpy as np
import pandas as pd
import json

from tensorflow.keras.layers import Embedding,LSTM,Dense
from tensorflow.keras.models import Model,load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import tokenizer_from_json

### Load models

In [4]:
embedding_layer = load_model('models/tf2-preview_nnlm-en-dim128_1')



In [5]:
encoder=load_model('models/eng-to-chn/encoder')



Additional steps are required for loading the decoder model, due to the fact that it's a subclass `Model` object.

Loaded model cannot be consumed directly. As a workaround, we need to redefine the `Decoder` class in this notebook, and load the saved weights. 

In [6]:
class Decoder(Model):
    def __init__(self):
        super(Decoder, self).__init__()
        self.embedding = Embedding(input_dim = max_word_index + 1,output_dim =128,mask_zero = True)
        self.lstm = LSTM(units=512, return_sequences=True, return_state=True)
        self.dense = Dense(units=max_word_index + 1)

    def call(self,inputs,hidden_state = None,cell_state = None):
        h = self.embedding(inputs)
        if hidden_state != None and cell_state != None:
            lstm,hidden,cell = self.lstm(h,initial_state =[hidden_state,cell_state])
        else:
            lstm,hidden,cell = self.lstm(h)
        h = self.dense(lstm)
        return h,hidden,cell

In [7]:
decoder=load_model('models/eng-to-chn/decoder')



In [8]:
decoder.save_weights('models/eng-to-chn/decoder-weights/decoder-weights')

Load tokenizer to get the `max_word_index` required to generate the decoder model.

In [9]:
tokenizer=[]
with open('data/tokenizer.json') as f:
    data = json.load(f)
    tokenizer = tokenizer_from_json(data)

tokenizer_config = tokenizer.get_config()

In [10]:
word_index = json.loads(tokenizer_config['word_index'])
max_word_index = max(word_index.values())
print(f'Max word_index: {max_word_index}')

Max word_index: 3438


In [11]:
decoder=Decoder()
decoder.load_weights('models/eng-to-chn/decoder-weights/decoder-weights')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fd781c21bb0>

### Load dataset

In [12]:
df = pd.read_json('data/cmn-processed-tokenized.json')

In [13]:
df.head()

Unnamed: 0,english,chinese,english_split,chinese_split,chinese_tokenized
0,Hi .,嗨 。,"[Hi, .]","[<start>, 嗨, 。, <end>]","[1, 1924, 3, 2]"
1,Hi .,你好 。,"[Hi, .]","[<start>, 你, 好, 。, <end>]","[1, 7, 33, 3, 2]"
2,Run .,你用跑的 。,"[Run, .]","[<start>, 你, 用, 跑, 的, 。, <end>]","[1, 7, 95, 397, 5, 3, 2]"
3,Wait !,等等 ！,"[Wait, !]","[<start>, 等, 等, ！, <end>]","[1, 208, 208, 160, 2]"
4,Wait !,等一下 ！,"[Wait, !]","[<start>, 等, 一, 下, ！, <end>]","[1, 208, 12, 46, 160, 2]"


In [14]:
tokenizer_seq = df['chinese_tokenized']
max_len_in_chinese_tokenized = max([len(item) for item in tokenizer_seq])

print(f'Max length in tokenized chinese sequence: {max_len_in_chinese_tokenized}')

Max length in tokenized chinese sequence: 46


In [15]:
english = df['english']
chinese = df['chinese']
english_split = df['english_split']
english.head()

0      Hi . 
1      Hi . 
2     Run . 
3    Wait ! 
4    Wait ! 
Name: english, dtype: object

### Translation function development

Develop the translation function with a sample English sentence at index `500`.

In [16]:
test_index = 500
print(f'Sampled English sentence: "{english_split[test_index]}"')

Sampled English sentence: "['Whose', 'is', 'it', '?']"


Embed the input English sentence into the pre-trained 128 length embedding.

In [17]:
eng_embedding = embedding_layer(english_split[test_index])
print(f'Input shape: {len(english_split[test_index])}')
print(f'Embedding layer output shape: {eng_embedding.shape}')

Input shape: 4
Embedding layer output shape: (4, 128)


Apply `prior` zero padding to the embedding to match the encoder model trained, which expect `46` English words input.

In [18]:
eng_padded = tf.pad(eng_embedding, 
                    [[max_len_in_chinese_tokenized-len(eng_embedding), 0], 
                     [0, 0]], 
                    constant_values = 0)

In [19]:
print(f'Padding layer output shape: {eng_padded.shape}')

Padding layer output shape: (46, 128)


In [20]:
for i in range(5):
    print(f'First 4 elements of {i+1}-th embedding: {eng_padded[i,0:4]}')

First 4 elements of 1-th embedding: [0. 0. 0. 0.]
First 4 elements of 2-th embedding: [0. 0. 0. 0.]
First 4 elements of 3-th embedding: [0. 0. 0. 0.]
First 4 elements of 4-th embedding: [0. 0. 0. 0.]
First 4 elements of 5-th embedding: [0. 0. 0. 0.]


In [21]:
end_index=max_len_in_chinese_tokenized
for i in np.arange(end_index-5,end_index,1):
    print(f'First 4 elements of {i+1}-th embedding: {eng_padded[i,0:4]}')

First 4 elements of 42-th embedding: [0. 0. 0. 0.]
First 4 elements of 43-th embedding: [ 0.02321582 -0.00299811  0.01814764  0.12830451]
First 4 elements of 44-th embedding: [ 0.22104432 -0.01606884  0.00432623  0.04148778]
First 4 elements of 45-th embedding: [ 0.03716571 -0.02912278  0.12921344  0.06043958]
First 4 elements of 46-th embedding: [-0.01335301  0.11507112  0.12568313  0.08377809]


Expand the dimension of the input at `axis=0` to represent the batch axis.

In [22]:
english_expand = tf.expand_dims(eng_padded, 0)
print(f'Expand dim english shape: {english_expand.shape}')

Expand dim english shape: (1, 46, 128)


Feed the input into the encoder model.

In [23]:
hidden_state, cell_state = encoder(english_expand)

In [24]:
print(f'Encoder hidden state shape: {hidden_state.shape}')
print(f'Encoder cell state shape: {cell_state.shape}')

Encoder hidden state shape: (1, 512)
Encoder cell state shape: (1, 512)


Extract the `<start>` and `<end>` tokens' `word_index`.

In [25]:
start_token = word_index['<start>']
end_token = word_index['<end>']

print(f'<start> token word index: {start_token}')
print(f'<end> token word index: {end_token}')

<start> token word index: 1
<end> token word index: 2


- A `<start>` token is passed in as the first input, which is embedded using a learned Chinese word embedding.
- The decoder RNN then makes a prediction for the next Chinese word, which during inference is then passed in as the following input, and this process is repeated until the special `<end>` token is emitted from the decoder.

In [26]:
current_translation = []
current_token = tf.Variable([[start_token]])
print(f'current_token shape: {current_token.shape}')

while (len(current_translation) <= max_len_in_chinese_tokenized):
    out1, hidden_state, cell_state = decoder(current_token,hidden_state,cell_state)
    out2 = tf.argmax(out1, axis=2).numpy()[0,0]
    current_token = tf.Variable([[out2]])
    if out2 == end_token:
        break
    else:
        current_translation.append(out2)

print(f'Translation: {current_translation}')

current_token shape: (1, 1)
Translation: [7, 5, 11, 35, 48, 9]


Getting inverse token index dictionary.

In [27]:
inv_chinese_index = {value:key for key,value in tokenizer.word_index.items()}

Mapping the tokenized translation output into Chinese characters.

In [28]:
inv_tokenized = [inv_chinese_index[w] for w in current_translation]
print(f'{inv_tokenized}')

['你', '的', '是', '什', '麼', '？']


Getting Chinese sentence string.

In [29]:
print(f"{' '.join(inv_tokenized)}")

你 的 是 什 麼 ？


### Generalize into a single `translate` function

In [30]:
def translate(english_split_in):
    eng_embedding = embedding_layer(english_split_in)
    eng_padded = tf.pad(eng_embedding, 
                        [[max_len_in_chinese_tokenized-len(eng_embedding), 0], 
                         [0, 0]], 
                        constant_values = 0)
    english_expand = tf.expand_dims(eng_padded, 0)
    hidden_state, cell_state = encoder(english_expand)

    current_translation = []
    current_token = tf.Variable([[start_token]])

    while (len(current_translation) <= max_len_in_chinese_tokenized):
        out1, hidden_state, cell_state = decoder(current_token,hidden_state,cell_state)
        out2 = tf.argmax(out1, axis=2).numpy()[0,0]
        current_token = tf.Variable([[out2]])
        if out2 == end_token:
            break
        else:
            current_translation.append(out2)
    inv_tokenized = [inv_chinese_index[w] for w in current_translation]
    inv_tokenized_string = ' '.join(inv_tokenized)
    return inv_tokenized_string

In [31]:
{'English':english[test_index], 
                                 'Chinese':translate(english_split[test_index])}

{'English': 'Whose is it ? ', 'Chinese': '你 的 是 什 麼 ？'}

In [32]:
df_sample_result = pd.DataFrame(data={'English':english[test_index], 
                                      'Chinese-Reference':chinese[test_index],
                                      'Chinese-Translated':translate(english_split[test_index])}, 
                                index=[0])
df_sample_result

Unnamed: 0,English,Chinese-Reference,Chinese-Translated
0,Whose is it ?,这是谁的 ？,你 的 是 什 麼 ？


### Full translation

In [42]:
full_test_count = 10

In [43]:
test_indices = np.random.choice(english_split.index,full_test_count)
test_indices

array([14777, 21788, 18257, 22888,   194,  3120,  3085,  9726, 20885,
       18153])

In [44]:
full_translations=[]
for idx in test_indices:
    tmp_result = translate(english_split[idx])
    full_translations.append(tmp_result)

In [45]:
df_full_results = pd.DataFrame(data={'English':english.loc[test_indices].reset_index(drop=True), 
                                     'Chinese-Reference':chinese.loc[test_indices].reset_index(drop=True),
                                     'Chinese-Translated':full_translations}, 
                               index=range(len(test_indices)))
df_full_results

Unnamed: 0,English,Chinese-Reference,Chinese-Translated
0,She greeted him waving her hand .,她揮著手向他打招呼 。,她 把 他 的 手 脸 在 看 下 。
1,Tom and Mary have a very good relationship .,汤姆和玛丽关系很好 。,汤 姆 和 玛 丽 一 个 很 好 的 想 忙 。
2,What have you been doing this week ?,你這個星期一直在做什麼 ？,你 今 天 做 做 什 么 ？
3,It is necessary for you to go there immediatel...,你必需马上去那儿 。,你 必 須 做 到 它 。
4,Lie still .,躺着不动 。,我 们 发 着 。
5,This is impossible .,这不可能 。,这 是 不 要 的 。
6,The lid won't open .,這蓋子打不開 。,它 不 開 了 。
7,"Could you take this , please ?",請你拿這個好嗎 ？,你 能 幫 你 嗎 ？
8,Have you ever seen the man in this photo ?,你见过这张照片上的男人吗 ？,你 看 见 看 看 你 的 照 照 照 嗎 ？
9,Thousands of people died of hunger .,數千人死於飢餓 。,人 們 有 一 個 人 都 死 的 。
