In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import json

from tensorflow.keras.preprocessing.text import tokenizer_from_json
from tensorflow.keras.models import load_model

from src.model import NeuralTranslationModel

### Load models

Load `NeuralTranslationModel`

In [2]:
# Size constants shared by the cells below:
#   max_len_in_chinese_tokenized - sequence length used for the encoder input
#       shape, the padding target and the decoding step cap
#   max_word_index - largest tokenizer index; `+ 1` is used for the decoder
#       input shape (presumably to leave room for index 0 - TODO confirm)
max_len_in_chinese_tokenized, max_word_index = 46, 3438

In [3]:
# Dummy inputs used only to build the model: one all-zero "sentence" of
# 128-dim embeddings and a single decoder token id.
encoder_in = tf.zeros((1, max_len_in_chinese_tokenized, 128))
decoder_in = tf.Variable([[1]])

translation_model = NeuralTranslationModel(
    encoder_input_shape=(max_len_in_chinese_tokenized, 128),
    decoder_input_shape=(max_word_index + 1, 128),
)

print(f'encoder_in shape: {encoder_in.shape}')
print(f'decoder_in shape: {decoder_in.shape}')

# Build the model by calling it once; the output itself is not needed.
_ = translation_model((encoder_in, decoder_in))

encoder_in shape: (1, 46, 128)
decoder_in shape: (1, 1)


In [4]:
# Restore the weights stored under `checkpoint_best` into the model built above.
# The returned load-status object is displayed as the cell output.
translation_model.load_weights('models/eng-to-ch/checkpoint_best/checkpoint')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f89157af0d0>

Load the pre-trained English embedding model

In [5]:
# Saved embedding model; `translate` calls it on a tokenized English sentence.
# NOTE(review): the directory name suggests the 128-dim NNLM English
# embedding - confirm the source/version of this artifact.
embedding_layer = load_model('models/tf2-preview_nnlm-en-dim128_1')



---

### Load dataset

In [6]:
# Pre-processed sentence pairs: raw text, split tokens and tokenized Chinese ids
# (see the column preview below).
df = pd.read_json('data/cmn-processed-tokenized.json')
df.head()

Unnamed: 0,english,chinese,english_split,chinese_split,chinese_tokenized
0,Hi .,嗨 。,"[Hi, .]","[<start>, 嗨, 。, <end>]","[1, 1924, 3, 2]"
1,Hi .,你好 。,"[Hi, .]","[<start>, 你, 好, 。, <end>]","[1, 7, 33, 3, 2]"
2,Run .,你用跑的 。,"[Run, .]","[<start>, 你, 用, 跑, 的, 。, <end>]","[1, 7, 95, 397, 5, 3, 2]"
3,Wait !,等等 ！,"[Wait, !]","[<start>, 等, 等, ！, <end>]","[1, 208, 208, 160, 2]"
4,Wait !,等一下 ！,"[Wait, !]","[<start>, 等, 一, 下, ！, <end>]","[1, 208, 12, 46, 160, 2]"


In [7]:
# Column views used by the sampling, translation and results cells below.
chinese, english, english_split = df['chinese'], df['english'], df['english_split']

---

### Load tokenizer

In [8]:
# Rebuild the Chinese tokenizer from its saved JSON description.
# The encoding is given explicitly so reading does not depend on the platform's
# default locale encoding.
with open('data/tokenizer.json', encoding='utf-8') as f:
    # The parsed file content is handed to `tokenizer_from_json` as-is
    # (presumably a JSON-encoded string of the tokenizer config - TODO confirm).
    data = json.load(f)
    tokenizer = tokenizer_from_json(data)

# `word_index` is stored as a JSON string inside the config, hence the decode.
tokenizer_config = tokenizer.get_config()
word_index = json.loads(tokenizer_config['word_index'])

In [9]:
# Ids of the sentence delimiters, looked up in the decoded word index.
start_token, end_token = (word_index[marker] for marker in ('<start>', '<end>'))

# Reverse lookup (token id -> token text) for turning predictions into text.
inv_chinese_index = dict((token_id, token) for token, token_id in tokenizer.word_index.items())

---

### Making translations

In [10]:
def translate(english_split_in):
    """Greedily translate one tokenized English sentence into Chinese.

    The tokens are embedded with `embedding_layer`, zero-padded at the front to
    `max_len_in_chinese_tokenized` rows and encoded; the decoder is then run one
    token at a time, always taking the arg-max prediction, until `end_token` is
    produced or the step cap is reached.

    Args:
        english_split_in: sequence of English tokens (e.g. one entry of the
            `english_split` column).

    Returns:
        The predicted Chinese tokens joined by single spaces; an empty string
        for an empty input.
    """
    # Nothing to translate; also avoids calling the embedding on an empty batch.
    if len(english_split_in) == 0:
        return ''

    eng_embedding = embedding_layer(english_split_in)
    # The encoder input has a fixed length. Keep only the leading tokens of an
    # over-long sentence, otherwise the pad amount below would be negative and
    # tf.pad would fail.
    eng_embedding = eng_embedding[:max_len_in_chinese_tokenized]
    pad_rows = max_len_in_chinese_tokenized - len(eng_embedding)
    # Zero-pad in front of the sentence (first axis only).
    eng_padded = tf.pad(eng_embedding,
                        [[pad_rows, 0], [0, 0]],
                        constant_values=0)
    # Add the batch dimension expected by the encoder.
    english_expand = tf.expand_dims(eng_padded, 0)
    hidden_state, cell_state = translation_model.encoder(english_expand)

    current_translation = []
    # Plain constant tensors are enough as decoder input; no need to allocate a
    # new tf.Variable on every step.
    current_token = tf.constant([[start_token]])

    # Same cap as before: at most max_len_in_chinese_tokenized + 1 decode steps.
    for _ in range(max_len_in_chinese_tokenized + 1):
        decoder_out, hidden_state, cell_state = translation_model.decoder(
            current_token, hidden_state, cell_state)
        next_id = int(tf.argmax(decoder_out, axis=2).numpy()[0, 0])
        if next_id == end_token:
            break
        current_translation.append(next_id)
        current_token = tf.constant([[next_id]])

    # Skip ids that have no entry in the inverse index (e.g. 0, presumably the
    # padding id) instead of raising KeyError.
    inv_tokenized = [inv_chinese_index[token_id]
                     for token_id in current_translation
                     if token_id in inv_chinese_index]
    return ' '.join(inv_tokenized)

In [11]:
# Number of dataset rows to sample for the translation spot check.
full_test_count = 10

In [12]:
# Draw distinct row labels (replace=False avoids evaluating the same sentence
# twice) from a locally seeded generator so the sample is reproducible on
# Restart & Run All. Change the seed to inspect a different sample.
test_indices = np.random.RandomState(42).choice(english_split.index,
                                                full_test_count,
                                                replace=False)
test_indices

array([ 9823, 21897, 15761,  5972,  7261,  6796,  9512, 18266, 10582,
        5166])

In [13]:
# Translate every sampled sentence. `.loc` makes the label-based lookup explicit,
# matching how the results table selects the reference rows.
full_translations = [translate(english_split.loc[idx]) for idx in test_indices]

In [14]:
# Side-by-side table: source sentence, reference translation, model output.
# The dataset columns are re-indexed from 0 so they line up with the list of
# model translations.
df_full_results = pd.DataFrame(
    {
        'English-Dataset': english.loc[test_indices].reset_index(drop=True),
        'Chinese-Dataset': chinese.loc[test_indices].reset_index(drop=True),
        'Chinese-Translated': full_translations,
    },
    index=pd.RangeIndex(len(test_indices)),
)
df_full_results

Unnamed: 0,English-Dataset,Chinese-Dataset,Chinese-Translated
0,He is still very much alive .,他依旧充满活力 。,他 一 直 非 常 。
1,He kept his promise and helped his brothers .,他履行了他的承诺 ， 并且帮助了他的兄弟 。,他 把 他 的 父 親 的 幫 助 他 們 不 想 。
2,Is there a post office near here ?,這附近有郵局嗎 ？,有 附 近 有 房 间 吗 ？
3,Wait for me in the car .,在车里等一下 。,在 我 站 要 一 起 。
4,I don't like eating meat .,我不喜歡吃肉了 。,我 不 喜 歡 吃 食 。
5,Tom looked at his notes .,汤姆看了看笔记 。,汤 姆 看 了 我 的 房 子 。
6,Tom passed away last night .,汤姆在昨晚去世了 。,汤 姆 昨 天 早 上 。
7,What would you think if I did that ?,如果我那么做你会怎么想 ？,你 怎 麼 想 我 想 做 什 麼 ？
8,We're all praying for Japan .,我們全體為日本祈禱 。,我 們 在 一 個 棒 間 。
9,This is too expensive !,这太贵了 。,这 是 真 的 。
