In [18]:
# Colab line magic requesting the TensorFlow 2.x runtime; kept ahead of the import below.
%tensorflow_version 2.x
import tensorflow as tf
# Print the TensorFlow version that was actually loaded (the recorded run shows 2.5.0).
print(tf.__version__)

2.5.0


In [19]:
# Mount Google Drive at /content/drive so files stored on it can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
# Absolute path of the project folder on the mounted Drive. The Drive is mounted at
# /content/drive, so anchoring the prefix there keeps every path built from it valid
# regardless of the kernel's current working directory.
googleDrivePathPrefix = '/content/drive/My Drive/Colab Notebooks'

In [21]:
import pandas as pd
import numpy as np
from os import path
import json

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [22]:
# Load the pre-processed English/Chinese sentence pairs from Drive.
processed_json_path = path.join(googleDrivePathPrefix, 'data/cmn-processed.json')
df = pd.read_json(processed_json_path)

In [23]:
# Preview the first rows to check the loaded columns.
df.head()

Unnamed: 0,english,chinese,english_split,chinese_split
0,Hi .,嗨 。,"[Hi, .]","[<start>, 嗨, 。, <end>]"
1,Hi .,你好 。,"[Hi, .]","[<start>, 你, 好, 。, <end>]"
2,Run .,你用跑的 。,"[Run, .]","[<start>, 你, 用, 跑, 的, 。, <end>]"
3,Wait !,等等 ！,"[Wait, !]","[<start>, 等, 等, ！, <end>]"
4,Wait !,等一下 ！,"[Wait, !]","[<start>, 等, 一, 下, ！, <end>]"


In [24]:
# Pull out the pre-split token lists for both languages in one step.
english, chinese = df['english_split'], df['chinese_split']
# Series.count() reports the number of non-null entries.
print(f'english-chinese sample count: {english.count()}')

english-chinese sample count: 24089


In [25]:
# Keras tokenizer configured to leave the input untouched: no vocabulary cap,
# no character filtering, no lower-casing and no character-level splitting.
tokenizer = Tokenizer(
    num_words=None,
    filters='',
    lower=False,
    char_level=False,
)

In [26]:
# Snapshot the tokenizer configuration before fitting; the count/index entries are still empty.
tokenizer_config = tokenizer.get_config()
tokenizer_config

{'char_level': False,
 'document_count': 0,
 'filters': '',
 'index_docs': '{}',
 'index_word': '{}',
 'lower': False,
 'num_words': None,
 'oov_token': None,
 'split': ' ',
 'word_counts': '{}',
 'word_docs': '{}',
 'word_index': '{}'}

In [27]:
# Build the vocabulary from the Chinese token lists, then encode every sentence
# as a list of integer ids. Order matters: fit first, encode second.
tokenizer.fit_on_texts(chinese)
tokenizer_seq = tokenizer.texts_to_sequences(chinese)

In [28]:
# Refresh the configuration snapshot now that the tokenizer has been fitted.
tokenizer_config = tokenizer.get_config()

In [29]:
# 'word_counts' is held in the config as a JSON string, so decode it first.
word_count_dict = json.loads(tokenizer_config['word_counts'])
# One row per token together with its occurrence count.
df_word_count = pd.DataFrame({
    'char': list(word_count_dict.keys()),
    'count': list(word_count_dict.values()),
})
print('Word count for top 10 highest frequency characters:')
df_word_count.sort_values(by='count', ascending=False).head(10)

Word count for top 10 highest frequency characters:


Unnamed: 0,char,count
0,<start>,24089
3,<end>,24089
2,。,20327
15,我,10810
8,的,7128
17,了,5208
4,你,5043
26,他,4254
25,？,3548
18,不,3365


In [30]:
print('Word count for 10 random samples:')
# Fixed seed so the same rows are shown on every Restart & Run All.
df_word_count.sample(10, random_state=42)

Word count for 10 random samples:


Unnamed: 0,char,count
931,尼,10
2910,潑,1
413,鱼,39
2009,佛,9
1240,拾,3
956,择,12
1936,策,7
984,袋,22
2737,磺,1
225,男,202


In [31]:
# Attach the encoded sequences as a new column on a new DataFrame.
# assign() returns a copy, so `df` itself is left unmodified (a plain `= df` would only
# alias the same object and the column would silently appear on `df` as well).
df_chinese_tokenized = df.assign(chinese_tokenized=tokenizer_seq)

In [32]:
# Preview the frame to check the new chinese_tokenized column.
df_chinese_tokenized.head()

Unnamed: 0,english,chinese,english_split,chinese_split,chinese_tokenized
0,Hi .,嗨 。,"[Hi, .]","[<start>, 嗨, 。, <end>]","[1, 1924, 3, 2]"
1,Hi .,你好 。,"[Hi, .]","[<start>, 你, 好, 。, <end>]","[1, 7, 33, 3, 2]"
2,Run .,你用跑的 。,"[Run, .]","[<start>, 你, 用, 跑, 的, 。, <end>]","[1, 7, 95, 397, 5, 3, 2]"
3,Wait !,等等 ！,"[Wait, !]","[<start>, 等, 等, ！, <end>]","[1, 208, 208, 160, 2]"
4,Wait !,等一下 ！,"[Wait, !]","[<start>, 等, 一, 下, ！, <end>]","[1, 208, 12, 46, 160, 2]"


Save the tokenized dataset.

In [33]:
# Write the frame, including the token-id column, to a JSON file on Drive.
tokenized_json_path = path.join(googleDrivePathPrefix, 'data/cmn-processed-tokenized.json')
df_chinese_tokenized.to_json(tokenized_json_path)

In [34]:
# Read the saved file back and preview it to confirm the data survives the round trip.
tokenized_json_path = path.join(googleDrivePathPrefix, 'data/cmn-processed-tokenized.json')
df_reproduced = pd.read_json(tokenized_json_path)
df_reproduced.head()

Unnamed: 0,english,chinese,english_split,chinese_split,chinese_tokenized
0,Hi .,嗨 。,"[Hi, .]","[<start>, 嗨, 。, <end>]","[1, 1924, 3, 2]"
1,Hi .,你好 。,"[Hi, .]","[<start>, 你, 好, 。, <end>]","[1, 7, 33, 3, 2]"
2,Run .,你用跑的 。,"[Run, .]","[<start>, 你, 用, 跑, 的, 。, <end>]","[1, 7, 95, 397, 5, 3, 2]"
3,Wait !,等等 ！,"[Wait, !]","[<start>, 等, 等, ！, <end>]","[1, 208, 208, 160, 2]"
4,Wait !,等一下 ！,"[Wait, !]","[<start>, 等, 一, 下, ！, <end>]","[1, 208, 12, 46, 160, 2]"


Save the fitted tokenizer.

In [37]:
# Tokenizer.to_json() already returns a JSON string; json.dump() then stores that string
# as a single JSON string literal, so the configuration is encoded twice on disk.
# Reading the file back with json.load() yields the original JSON string.
data = tokenizer.to_json()
tokenizer_json_path = path.join(googleDrivePathPrefix, 'data/tokenizer.json')
with open(tokenizer_json_path, 'w') as f:
    json.dump(data, f)