In [1]:
import numpy as np
import os
import time
import random

import gensim 
import re

# Khmer Word Segmentation based on a Bi-Directional Recurrent Neural Network

In [2]:
import tensorflow as tf

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, Activation
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard

from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer

## Loading the dataset

In [3]:
path_to_file='../dictionaries/km_spellchecker_OOo30_V_1.0.2.oxt/dictionaries/km_KH.dic'

# Read the raw bytes and decode them explicitly as UTF-8 (kept from the original
# "py2 compat" approach so the resulting character count is unchanged).
# The context manager guarantees the file handle is closed, even if decoding fails.
with open(path_to_file, 'rb') as dic_file:
    text = dic_file.read().decode(encoding='utf-8')

# length of text is the number of characters in it
print ('Length of text: {} characters'.format(len(text)))

Length of text: 542510 characters


In [4]:
def create_word_bag(text):
    """Split raw text into whitespace-separated tokens, dropping the first one.

    Args:
        text: The text to split (a ``str``).

    Returns:
        A list with every whitespace-separated token of ``text`` except the
        first. Empty or whitespace-only input yields an empty list.

    Side effects:
        Prints the number of tokens kept and the first ten of them.
    """
    # str.split() with no argument splits on any run of whitespace and never
    # yields empty strings. Slicing off index 0 discards the first token
    # (presumably the entry-count header of the .dic file -- verify) and,
    # unlike pop(0), does not raise when there are no tokens at all.
    wordList = text.split()[1:]

    print(f'legth of word in the list {len(wordList)}')
    print(wordList[:10])
    return wordList

## Process the text

### Vectorize the text

Before training, we need to map strings to a numerical representation. Create two lookup tables: one mapping characters to numbers, and another for numbers to characters.

In [5]:
# Tokenize the loaded dictionary text; create_word_bag also prints the token
# count and a ten-item preview.
bag_of_words = create_word_bag(text)

legth of word in the list 56839
['។ល។', '៛', 'ក', 'ក៏', 'កក', 'កក់', 'កកកុញ', 'កកកុះ', 'កក់ក្ដៅ', 'កក់ខែ']


In [6]:
# Build a bag-of-words vectorizer with default settings (only the encoding is
# stated explicitly) and learn its vocabulary from the word list, treating
# each list entry as one document.
# NOTE(review): the default token_pattern shown in the repr below,
# '(?u)\b\w\w+\b', only keeps tokens of two or more word characters, while
# bag_of_words contains one-character entries such as 'ក' -- confirm that
# dropping them is intended.
vectorizer = CountVectorizer(encoding='utf-8')

vectorizer.fit(bag_of_words)

# vectorizer.fit_transform(bag_of_words)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [7]:
# Callable that applies the vectorizer's preprocessing and tokenization to a
# single string; it is not used anywhere else in this part of the notebook.
analyze = vectorizer.build_analyzer()

# Document-term matrix: one row per entry of bag_of_words, one column per
# vocabulary term learned by the fitted vectorizer.
dtm = vectorizer.transform(bag_of_words)

## Building model

In [8]:
# Optimizer and loss passed to model.compile().
OPT='rmsprop'
LOSS='categorical_crossentropy'

# Activation applied after the final Dense layer.
ACT='softmax'

# Epoch count; also used to name the TensorBoard log directory.
EPOCHS=3

In [9]:
# Empty linear stack; layers are appended with model.add() in later cells.
model = Sequential()

In [10]:
# Forward-direction LSTM with 10 units; return_sequences=True emits an output
# for every timestep rather than only the last one.
forward_layer = LSTM(10, return_sequences=True)

In [11]:
# Backward-direction LSTM with 10 units; go_backwards=True makes it process
# the input sequence in reverse.
# NOTE(review): this layer sets activation='relu' while forward_layer keeps
# the LSTM default activation -- confirm the asymmetry is intended.
backward_layer = LSTM(10, activation='relu', return_sequences=True,
                      go_backwards=True)

In [12]:
# Wrap the two LSTMs in a Bidirectional layer and use it as the model's first layer.
# NOTE(review): input_shape excludes the batch axis, so a single sample is
# expected to have the shape of the whole document-term matrix (the summary
# below reports (None, 56839, 20), i.e. 56839 timesteps per sample) --
# confirm this is the intended sample layout.
model.add(Bidirectional(forward_layer, backward_layer=backward_layer,
                         input_shape=dtm.shape))

# Project every timestep onto 5 output units, then apply the ACT activation.
model.add(Dense(5))
model.add(Activation(ACT))

## Prepare the model

### Compile the model

In [13]:
# Configure training with the loss and optimizer chosen above, tracking accuracy.
model.compile(loss=LOSS, optimizer=OPT, metrics=['accuracy'])

### Summary

In [14]:
# Print each layer's output shape and parameter count.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional (Bidirectional (None, 56839, 20)         516240    
_________________________________________________________________
dense (Dense)                (None, 56839, 5)          105       
_________________________________________________________________
activation (Activation)      (None, 56839, 5)          0         
Total params: 516,345
Trainable params: 516,345
Non-trainable params: 0
_________________________________________________________________


### Configure checkpoints

In [15]:
# Directory where the checkpoints will be saved
checkpoint_dir = '../storage/training_checkpoints'

# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{EPOCHS}")

checkpoint_callback=ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

### Configure tensorboard

In [16]:
# TensorBoard log directory, named after the configured number of epochs.
# NOTE(review): this path is relative to the current working directory
# ('storage/...'), whereas the checkpoints go to '../storage/...' -- confirm
# the two are meant to live in different places.
log_dir=f'storage/logs/{EPOCHS}'

# histogram_freq=1 asks TensorBoard to record histograms every epoch.
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

## Train the model