# Chinese Word Segmentation
Using tokenizer, padding and binary classification

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
# import keras_tuner as kt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras. preprocessing.sequence import pad_sequences

In [14]:
train = pd.read_csv("./datasets/chinese_word_segmentation/as_training.utf8", names=['data'])
print(train.head())

                    data
0                   時間　：
1  三月　十日　（　星期四　）　上午　十時　。
2                   地點　：
3      學術　活動　中心　一樓　簡報室　。
4                   主講　：


In [15]:
train = train[:int(len(train)*0.1)]

## Create data and labels

In [16]:
y = train['data'].copy()
X = train['data'].str.replace('\u3000', '')
print(X.head())
print(type(X))
print(y.head())

0               時間：
1    三月十日（星期四）上午十時。
2               地點：
3      學術活動中心一樓簡報室。
4               主講：
Name: data, dtype: object
<class 'pandas.core.series.Series'>
0                     時間　：
1    三月　十日　（　星期四　）　上午　十時　。
2                     地點　：
3        學術　活動　中心　一樓　簡報室　。
4                     主講　：
Name: data, dtype: object


In [17]:
def create_labels(data):
    label = []
    index = 0

    while index < len(data) - 1:
        if data[index + 1] == '\u3000':
            label.append(1)
            index += 2
        else:
            label.append(0)
            index += 1
            
    if index == len(data) - 1:
        label.append(1)
    return label

y = y.apply(create_labels)

In [18]:
print(y)
print(type(y))

0                                                [0, 1, 1]
1               [0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1]
2                                                [0, 1, 1]
3                     [0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1]
4                                                [0, 1, 1]
                               ...                        
70890                                      [0, 1, 0, 1, 1]
70891    [0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, ...
70892    [1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, ...
70893        [0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1]
70894           [1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1]
Name: data, Length: 70895, dtype: object
<class 'pandas.core.series.Series'>


## Tokenization and Padding

In [19]:
tokenizer = Tokenizer(oov_token='<OOV>', split='\u3000', char_level=True)
tokenizer.fit_on_texts(train['data'])
char_index = tokenizer.word_index
total_chars = len(json.loads(tokenizer.get_config()['word_counts']))

In [20]:
print(list(char_index.keys())[:100])
print(len(char_index))

['<OOV>', '\u3000', '，', '的', '。', '是', '一', '人', '不', '有', '我', '在', '、', '這', '了', '「', '」', '個', '他', '以', '生', '為', '要', '來', '們', '會', '就', '中', '自', '之', '大', '時', '到', '：', '而', '所', '能', '也', '心', '上', '學', '可', '說', '你', '？', '對', '如', '子', '得', '出', '成', '與', '作', '家', '麼', '現', '年', '好', '道', '過', '多', '於', '都', '然', '和', '後', '事', '很', '那', '去', '國', '因', '己', '下', '發', '但', '地', '文', '理', '著', '意', '想', '無', '看', '天', '面', '實', '種', '沒', '方', '當', '經', '同', '只', '用', '力', '十', '此', '本', '定']
4396


In [21]:
max_length = X.str.len().max() # 188
# trunc_type = 'post'
padding = 'pre'

X_sequences = tokenizer.texts_to_sequences(X)
padded_X = pad_sequences(X_sequences, maxlen=max_length, padding=padding)
padded_y = pad_sequences(y, maxlen=max_length, padding=padding)
print(padded_X[1])
print(padded_X.shape)

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0  115  259   97  143
  153  695  256  248  152   40 1031   97   32    5]
(70895, 164)


In [22]:
embedding_dim = 64

xIn = Input(shape=(max_length,))
x = Embedding(total_chars, embedding_dim, mask_zero=True, input_length=max_length)(xIn) # mask_zero will ignore timestamps with 0 (aka ignoring the padding)
x = Bidirectional(LSTM(128, return_sequences=True))(x)
x = Bidirectional(LSTM(128, return_sequences=True))(x)
# x = Bidirectional(LSTM(128, return_sequences=True))(x)

# x = Flatten()(x) don't need to flatten, just put output layer as 2 neurons
# x = Dense(64, activation='swish')(x)
# x = Dense(64, activation='swish')(x)
x = Dense(64, activation='swish')(x)
# x = Dense(64, activation='swish')(x)
xOut = Dense(2, activation='linear')(x) # softmax is computed by loss function, so don't use activation="softmax" here

model = Model(inputs=xIn, outputs=xOut)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 164)]             0         
                                                                 
 embedding (Embedding)       (None, 164, 64)           281280    
                                                                 
 bidirectional (Bidirectiona  (None, 164, 256)         197632    
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 164, 256)         394240    
 nal)                                                            
                                                                 
 dense (Dense)               (None, 164, 64)           16448     
                                                                 
 dense_1 (Dense)             (None, 164, 2)            130   

In [23]:
class MaskedSequenceLoss(tf.keras.losses.Loss):
    def __init__(
        self,
        average_across_timesteps=False,
        average_across_batch=False,
        sum_over_timesteps=True,
        sum_over_batch=True,
        softmax_loss_function=None,
        name=None,
        reduction=None, # dummy arg so it can be used as custom object when loading saved model
    ):
        super().__init__()
        self.opts = {
            "average_across_timesteps": average_across_timesteps,
            "average_across_batch": average_across_batch,
            "sum_over_timesteps": sum_over_timesteps,
            "sum_over_batch": sum_over_batch,
            "softmax_loss_function": softmax_loss_function,
            "name": name,
        }
    
    def call(self, y_true, y_pred):
        return tfa.seq2seq.sequence_loss(y_pred, y_true,
                                         weights=tf.cast(y_pred._keras_mask, tf.float32) if hasattr(y_pred, "_keras_mask") else tf.ones(y_true.shape),
                                         **self.opts)

def binary_crossentropy_arg_names_changed(labels, logits):
#     print(labels.numpy(), logits.numpy())
    output = tf.nn.sigmoid_cross_entropy_with_logits(tf.cast(labels, tf.float32)[..., tf.newaxis], logits)
    print(output)
    return output

model.compile(optimizer='adam', loss=MaskedSequenceLoss(), metrics=['acc'])

In [24]:
epochs = 1

callbacks = [
    # tf.keras.callbacks.ModelCheckpoint('./8_best_model', monitor='acc', save_best_only=True),
    tf.keras.callbacks.EarlyStopping(monitor='acc', patience=5, restore_best_weights=True),
    tf.keras.callbacks.ReduceLROnPlateau(monitor='acc', factor=0.1, patience=3, verbose=1)
]

# padded_X = tf.convert_to_tensor(padded_X)
# padded_y = tf.convert_to_tensor(padded_y)
print(type(padded_y))
history = model.fit(padded_X, padded_y, batch_size=570, epochs=epochs, validation_split=0.2, callbacks=callbacks)
model.save("demo_model.h5")

<class 'numpy.ndarray'>

InvalidArgumentError: Graph execution error:

Detected at node 'model/embedding/embedding_lookup' defined at (most recent call last):
    File "C:\coding\python\cpython\python-3.10\lib\runpy.py", line 196, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "C:\coding\python\cpython\python-3.10\lib\runpy.py", line 86, in _run_code
      exec(code, run_globals)
    File "D:\coding\ml\irs_ml\irs_ml_venv\lib\site-packages\ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "D:\coding\ml\irs_ml\irs_ml_venv\lib\site-packages\traitlets\config\application.py", line 1043, in launch_instance
      app.start()
    File "D:\coding\ml\irs_ml\irs_ml_venv\lib\site-packages\ipykernel\kernelapp.py", line 728, in start
      self.io_loop.start()
    File "D:\coding\ml\irs_ml\irs_ml_venv\lib\site-packages\tornado\platform\asyncio.py", line 215, in start
      self.asyncio_loop.run_forever()
    File "C:\coding\python\cpython\python-3.10\lib\asyncio\base_events.py", line 603, in run_forever
      self._run_once()
    File "C:\coding\python\cpython\python-3.10\lib\asyncio\base_events.py", line 1899, in _run_once
      handle._run()
    File "C:\coding\python\cpython\python-3.10\lib\asyncio\events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "D:\coding\ml\irs_ml\irs_ml_venv\lib\site-packages\ipykernel\kernelbase.py", line 513, in dispatch_queue
      await self.process_one()
    File "D:\coding\ml\irs_ml\irs_ml_venv\lib\site-packages\ipykernel\kernelbase.py", line 502, in process_one
      await dispatch(*args)
    File "D:\coding\ml\irs_ml\irs_ml_venv\lib\site-packages\ipykernel\kernelbase.py", line 409, in dispatch_shell
      await result
    File "D:\coding\ml\irs_ml\irs_ml_venv\lib\site-packages\ipykernel\kernelbase.py", line 729, in execute_request
      reply_content = await reply_content
    File "D:\coding\ml\irs_ml\irs_ml_venv\lib\site-packages\ipykernel\ipkernel.py", line 423, in do_execute
      res = shell.run_cell(
    File "D:\coding\ml\irs_ml\irs_ml_venv\lib\site-packages\ipykernel\zmqshell.py", line 540, in run_cell
      return super().run_cell(*args, **kwargs)
    File "D:\coding\ml\irs_ml\irs_ml_venv\lib\site-packages\IPython\core\interactiveshell.py", line 2945, in run_cell
      result = self._run_cell(
    File "D:\coding\ml\irs_ml\irs_ml_venv\lib\site-packages\IPython\core\interactiveshell.py", line 3000, in _run_cell
      return runner(coro)
    File "D:\coding\ml\irs_ml\irs_ml_venv\lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "D:\coding\ml\irs_ml\irs_ml_venv\lib\site-packages\IPython\core\interactiveshell.py", line 3203, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "D:\coding\ml\irs_ml\irs_ml_venv\lib\site-packages\IPython\core\interactiveshell.py", line 3382, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "D:\coding\ml\irs_ml\irs_ml_venv\lib\site-packages\IPython\core\interactiveshell.py", line 3442, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\lzh75\AppData\Local\Temp\ipykernel_18136\1591191128.py", line 12, in <module>
      history = model.fit(padded_X, padded_y, batch_size=570, epochs=epochs, validation_split=0.2, callbacks=callbacks)
    File "D:\coding\ml\irs_ml\irs_ml_venv\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "D:\coding\ml\irs_ml\irs_ml_venv\lib\site-packages\keras\engine\training.py", line 1694, in fit
      val_logs = self.evaluate(
    File "D:\coding\ml\irs_ml\irs_ml_venv\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "D:\coding\ml\irs_ml\irs_ml_venv\lib\site-packages\keras\engine\training.py", line 2040, in evaluate
      tmp_logs = self.test_function(iterator)
    File "D:\coding\ml\irs_ml\irs_ml_venv\lib\site-packages\keras\engine\training.py", line 1820, in test_function
      return step_function(self, iterator)
    File "D:\coding\ml\irs_ml\irs_ml_venv\lib\site-packages\keras\engine\training.py", line 1804, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "D:\coding\ml\irs_ml\irs_ml_venv\lib\site-packages\keras\engine\training.py", line 1792, in run_step
      outputs = model.test_step(data)
    File "D:\coding\ml\irs_ml\irs_ml_venv\lib\site-packages\keras\engine\training.py", line 1756, in test_step
      y_pred = self(x, training=False)
    File "D:\coding\ml\irs_ml\irs_ml_venv\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "D:\coding\ml\irs_ml\irs_ml_venv\lib\site-packages\keras\engine\training.py", line 561, in __call__
      return super().__call__(*args, **kwargs)
    File "D:\coding\ml\irs_ml\irs_ml_venv\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "D:\coding\ml\irs_ml\irs_ml_venv\lib\site-packages\keras\engine\base_layer.py", line 1132, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "D:\coding\ml\irs_ml\irs_ml_venv\lib\site-packages\keras\utils\traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "D:\coding\ml\irs_ml\irs_ml_venv\lib\site-packages\keras\engine\functional.py", line 511, in call
      return self._run_internal_graph(inputs, training=training, mask=mask)
    File "D:\coding\ml\irs_ml\irs_ml_venv\lib\site-packages\keras\engine\functional.py", line 668, in _run_internal_graph
      outputs = node.layer(*args, **kwargs)
    File "D:\coding\ml\irs_ml\irs_ml_venv\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "D:\coding\ml\irs_ml\irs_ml_venv\lib\site-packages\keras\engine\base_layer.py", line 1132, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "D:\coding\ml\irs_ml\irs_ml_venv\lib\site-packages\keras\utils\traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "D:\coding\ml\irs_ml\irs_ml_venv\lib\site-packages\keras\layers\core\embedding.py", line 208, in call
      out = tf.nn.embedding_lookup(self.embeddings, inputs)
Node: 'model/embedding/embedding_lookup'
indices[275,160] = 4396 is not in [0, 4395)
	 [[{{node model/embedding/embedding_lookup}}]] [Op:__inference_test_function_36774]

## Evaluate model

In [None]:
model = tf.keras.models.load_model("8_best_model_weights")
model.summary()

In [None]:
# custom_objects = { "MaskedSequenceLoss": MaskedSequenceLoss }
# with tf.keras.utils.custom_object_scope(custom_objects):
#     model = tf.keras.models.load_model("saved-models/bidirectional-lstm/epoch8_valloss0.0042")

In [None]:
# show predicted results in sentences
def segment_sentence(sentence, skip_array):
    # assert len(sentence) == len(skip_array)
    segmented_sentence = ""
    for i in range(len(sentence)):
        segmented_sentence += sentence[i]
        if skip_array[i] == 1:
            segmented_sentence += " "
    return segmented_sentence

In [None]:
test_sentences = [
    "生日快樂",
    "我的名字是",
    "今天的天氣很不可思議",
    "中文單詞",
    "雪花飄飄北風蕭蕭",
    "Google 的免費服務可即時在英語和 100 多種其他語言之間翻譯單詞、短語和網頁。",
    "公教学生是个具有高尚情操、坚韧个性，同时热爱生活，热爱学习，并且愿为人群服务的领袖、双语学者、与彬彬君子。", # Fail
    "明天更有一場「希望大樹」締造最多雙胞胎集合挑戰金氏世界紀錄活動。", # OK
    "張玨的這番話讓目前還在台大唸博士班的郭淑珍及她的雙胞胎妹妹郭淑玲感受最深", # OK except that it splits 張玨
    "然而，就其思想倾向而言，它却是属于日本战后派的，是战后派文学的一个组成部分。", # Fail. Output: "然 而 ， 就 其 思 想 倾 向而 言 ， 它 却是 属 于 日 本 战 后 派 的， 是 战 后 派 文 学 的一 个 组成 部分 。 "
    "如果說電影《遠離賭城》是尼可拉斯凱吉藝術成就上的轉捩點", # OK except that it doesn't separate 如果說
    "吳宇森正計劃拍攝一部二次大戰的電影《Ｗｉｎｄｔａｌｋｅｒｓ》", # OK (二次大戰 should not be separated)
    "雄立獅島式是炎黃萬世其無疆",
    "你好我的名字是傑夫",
    "不過成員練唱時投入的程度可不輸給一般專業合唱團",
    "你他媽到底在說我什麼，你這個小婊子？我會讓你知道我畢業於海豹突擊隊班，我曾參與過無數次對基地組織的秘密突襲，並確認殺死了 300 多人。我接受過大猩猩戰爭的訓練，我是整個美國武裝部隊中的頂級狙擊手。你對我來說什麼都不是，只是另一個目標。我會用地球上從未見過的精確度把你他媽擦掉，記住我他媽的話。你認為你可以在互聯網上對我說那些狗屎嗎？再想想，混蛋。在我們說話的時候，我正在聯繫我在美國的秘密間諜網絡，你的 IP 正在被追踪，所以你最好為風暴做好準備，蛆蟲。這場風暴會摧毀你稱之為生命的可悲小東西。你他媽死定了，孩子。我可以在任何地方，任何時間，我可以用七百多種方式殺死你，而這只是我的徒手。我不僅在徒手格斗方面受過廣泛的訓練，而且我還可以使用美國海軍陸戰隊的整個武器庫，我會盡其所能地使用它來將你的悲慘屁股從大陸上抹去，你這個小混蛋。如果你能知道你那小小的“聰明”評論會給你帶來什麼樣的邪惡報應，也許你會忍住你的舌頭。但你不能，你沒有，現在你要付出代價，你這個該死的白痴。我會在你身上發火，你會淹死的。你他媽死定了，孩子。"[:187],
    "你瞅啥！瞅你咋地！再瞅一个试试！试试就试试！",
]

for test_sentence in test_sentences:
    test_sentence_sequence = tokenizer.texts_to_sequences([test_sentence])[0]
    test_sentence_sequence_padded = pad_sequences([test_sentence_sequence],
                                                                                  maxlen=max_length)[0]

    actual_pred_start_idx = max_length - len(test_sentence)
    test_preds = model.predict(test_sentence_sequence_padded[tf.newaxis, ...])[0, actual_pred_start_idx:]
    probabilities = tf.nn.softmax(test_preds)
    skip_array = tf.argmax(probabilities, axis=-1)

    segment_sentence(test_sentence, skip_array)

## Save Model

In [None]:
import datetime as dt
dtime = dt.time()
now = dt.datetime.now()
now.strftime("%Y-%m-%d %H-%M-%S")

model.save(f'8_Chinese_Word_Segmentation/8_saved_models/{now}.h5')