In [1]:
import os
import numpy as np
from typing import Tuple, List, Dict

import tensorflow as tf
import tensorflow.keras as K
from tensorflow.keras.layers import *
#from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import load_model

import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.callbacks import TensorBoard
from tensorflow.python.eager import context
import json

  from ._conv import register_converters as _register_converters


# Data Preprocessing in Full

In [2]:
def prepare_dataset(original_file: str, input_path: str):
    """
    Split a space-segmented corpus into a character file and a label file.

    Every line of ``original_file`` is converted with ``get_chars_and_labels``.
    Non-blank results are then written, one per line, to
    ``input_path + "_input.utf8"`` and ``input_path + "_labels.utf8"``.

    :param original_file: Full path to the original dataset
    :param input_path: Path to the directory to store processed files, with the filename, but no extension
    """
    # Read and convert the whole corpus before any output file is opened.
    with open(original_file, "r", encoding="utf-8-sig") as source:
        pairs = [get_chars_and_labels(raw_line) for raw_line in source]

    outputs = (
        ("_input.utf8", [chars for chars, _ in pairs]),
        ("_labels.utf8", [codes for _, codes in pairs]),
    )
    for suffix, rows in outputs:
        with open(input_path + suffix, "w", encoding="utf-8-sig") as target:
            # Rows that are empty or whitespace-only are skipped.
            target.writelines(row + "\n" for row in rows if row.strip())



def get_chars_and_labels(line: str) -> Tuple[str, str]:
    """
    Convert one space-segmented line into its characters and segmentation tags.

    A one-character word is tagged "S"; a longer word is tagged "B", then one
    "I" per inner character, then "E". Empty tokens (produced by consecutive
    spaces) contribute nothing.

    :param line: A line from the dataset as str
    :return chars: The line with the separating spaces removed
    :return labels: Word-segment code for the line. Same len as chars
    """
    def tag_word(length: int) -> str:
        if length == 1:
            return "S"
        if length > 1:
            return "B" + "I" * (length - 2) + "E"
        return ""

    tokens = line.strip().split(" ")
    chars = "".join(tokens)
    labels = "".join(tag_word(len(token)) for token in tokens)
    return chars, labels

In [3]:
# Writes "msr_pku_training__input.utf8" and "msr_pku_training__labels.utf8"
# (the prefix already ends in "_", hence the double underscore).
prepare_dataset(
    original_file="msr_pku_training.utf8",
    input_path="msr_pku_training_",
)

In [4]:
def load_dataset(input_path: str, label_path: str) -> Tuple[List[str], List[str]]:
    """
    Read the sentence file and its label file into two lists of stripped lines.

    :param input_path: Path to the input dataset
    :param label_path: Path to the file containing the corresponding labels for the input dataset
    :return sentences: List of sentences in input_path
    :return labels: List of word segment codes in label_path, one entry per line
    """
    def read_stripped(path: str) -> List[str]:
        with open(path, "r", encoding="utf-8-sig") as handle:
            return [row.strip() for row in handle]

    sentences = read_stripped(input_path)
    labels = read_stripped(label_path)
    return sentences, labels


def get_unigrams(sentence: str) -> List[str]:
    """
    Split a sentence into its individual characters.

    :param sentence: A line from the dataset as str
    :return unigrams: List of unigrams (single characters) in the line, in
        order. An empty sentence yields an empty list.
    """
    # list() handles the empty string, which the previous index-based loop
    # did not (it indexed sentence[-1] and raised IndexError).
    return list(sentence)

def make_X_vocab(sentences: List[str]) -> Dict[str, int]:
    '''
    Build the unigram (single-character) vocabulary for the input sentences.

    Ids are assigned in order of first appearance, starting at 1; id 0 is
    reserved for the "UNK" entry.

    :param sentences: List of input sentences from the dataset
    :return vocab: Dictionary from unigram to int, including "UNK" -> 0
    '''
    # NOTE(review): id 0 is presumably also the padding value used when the
    # sequences are padded later -- confirm that sharing it with "UNK" is intended.
    vocab = {"UNK": 0}

    for sentence in sentences:
        # Iterating the string directly yields its unigrams and is a no-op
        # for empty sentences.
        for unigram in sentence:
            if unigram not in vocab:
                vocab[unigram] = len(vocab)

    return vocab


def make_Y_vocab(labels: List[str]) -> Dict[str, int]:
    """
    Build the label vocabulary, numbering tags in order of first appearance.

    :param labels: List of label codes, one string of tags per line
    :return labels_vocab: Dictionary from label code to int, starting at 0
    """
    labels_vocab = {}
    for tag_line in labels:
        for tag in tag_line:
            # The default is only inserted when the tag is new, so an
            # existing id is never overwritten.
            labels_vocab.setdefault(tag, len(labels_vocab))
    return labels_vocab

def make_X(sentences: List[str], vocab: Dict[str, int]) -> np.ndarray:
    """
    Encode every sentence as an array of vocabulary indices.

    :param sentences: List of sentences
    :param vocab: Unigram vocabulary mapping each character to its integer id
    :return X: One index array per sentence. A regular 2-D array when all
        sentences have the same length, otherwise a 1-D object array holding
        one 1-D index array per sentence.
    """
    # Characters missing from the vocabulary fall back to the "UNK" id
    # (0 when the vocabulary has no "UNK" entry) instead of raising KeyError.
    unk_id = vocab.get("UNK", 0)
    rows = [
        np.array([vocab.get(unigram, unk_id) for unigram in sentence])
        for sentence in sentences
    ]

    # Equal-length rows stack into a normal rectangular array.
    if len({len(row) for row in rows}) <= 1:
        return np.array(rows)

    # Ragged rows: build the object array explicitly. Recent NumPy versions
    # refuse to create it implicitly from np.array(list_of_ragged_arrays).
    X1 = np.empty(len(rows), dtype=object)
    for position, row in enumerate(rows):
        X1[position] = row
    return X1

def make_Y(labels: List[str], labels_vocab: Dict[str, int]) -> np.ndarray:
    """
    One-hot encode every label line.

    :param labels: List of word segment codes, line by line
    :param labels_vocab: Label codes vocab (tag -> integer id)
    :return y: One (line_length, num_classes) one-hot array per line. A
        regular 3-D array when all lines have the same length, otherwise a
        1-D object array holding one 2-D array per line.
    """
    # The table is sized from the vocabulary instead of being hard-coded. It
    # never has fewer than 4 columns, so the output width matches the original
    # 4-tag table even when a tag is absent from the vocabulary.
    num_classes = max(4, max(labels_vocab.values(), default=-1) + 1)
    one_hot = np.eye(num_classes, dtype=int)

    rows = []
    for label_line in labels:
        ids = np.array([labels_vocab[label] for label in label_line], dtype=np.intp)
        # Indexing with an id array selects one one-hot row per tag; an empty
        # line gives a (0, num_classes) array.
        rows.append(one_hot[ids])

    # Equal-length lines stack into a normal rectangular array.
    if len({len(row) for row in rows}) <= 1:
        return np.array(rows)

    # Ragged lines: build the object array explicitly. Recent NumPy versions
    # refuse to create it implicitly from np.array(list_of_ragged_arrays).
    y = np.empty(len(rows), dtype=object)
    for position, row in enumerate(rows):
        y[position] = row
    return y

In [5]:
# Load the processed character lines and their segmentation tags.
sentences, labels = load_dataset(
    input_path="msr_pku_training__input.utf8",
    label_path="msr_pku_training__labels.utf8",
)

In [6]:
# Number of sentences, then the length and text of the first one
print(len(sentences), len(sentences[0]), sentences[0], sep="\n")

105972
48
“人们常说生活是一部教科书，而血与火的战争更是不可多得的教科书，她确实是名副其实的‘我的大学’。


In [7]:
# Number of label lines, then the length and tags of the first one
print(len(labels), len(labels[0]), labels[0], sep="\n")

105972
48
SBESSBESSSBIESSSSSSBESSBIIESBIESSBESBIIESSSSBESS


In [8]:
# Character vocabulary built from all loaded sentences; print its size
vocab = make_X_vocab(sentences=sentences)
print(len(vocab))

5367


In [9]:
# Map each segmentation tag to an integer id and display the mapping
labels_vocab = make_Y_vocab(labels=labels)
labels_vocab

{'S': 0, 'B': 1, 'E': 2, 'I': 3}

In [10]:
# Encode sentences as index sequences and tags as one-hot sequences
X = make_X(sentences=sentences, vocab=vocab)
y = make_Y(labels=labels, labels_vocab=labels_vocab)

In [11]:
# Single source of truth for the padded sequence length, so inputs and labels
# cannot drift apart.
MAX_LEN = 50

# Sequences longer than MAX_LEN lose their leading items (truncating='pre');
# shorter ones are padded at the end (padding='post').
X_ = pad_sequences(X, truncating='pre', padding='post', maxlen=MAX_LEN)
y_ = pad_sequences(y, truncating='pre', padding='post', maxlen=MAX_LEN)

In [12]:
# Shapes of the padded inputs and padded one-hot labels
print(X_.shape, y_.shape, sep="\n")

(105972, 50)
(105972, 50, 4)


In [13]:
# Fixed seed: the trained model is reloaded from disk on later runs, so the dev
# split must be the same one that was held out during training. An unseeded
# split would put previously-trained-on examples into the dev set.
train_x2, dev_x2, train_y2, dev_y2 = train_test_split(X_, y_, test_size=.05, random_state=42)

In [14]:
# Shapes of the train inputs/labels followed by the dev inputs/labels
print(train_x2.shape, train_y2.shape, dev_x2.shape, dev_y2.shape, sep="\n")

(100673, 50)
(100673, 50, 4)
(5299, 50)
(5299, 50, 4)


In [15]:
# Persist both vocabularies as JSON so they can be reloaded later

with open('x_vocab_uni_only.utf8', 'w', encoding="utf-8-sig") as file:
    json.dump(vocab, file)

with open('y_vocab.utf8', 'w', encoding="utf-8-sig") as file:
    json.dump(labels_vocab, file)

# The model

In [16]:
# Number of entries in `vocab` (including "UNK"); passed to create_keras_model
# below, where it is used as the Embedding layer's vocabulary size.
vocab_size = len(vocab)

In [17]:
#This class helps with logging

class TrainValTensorBoard(TensorBoard):
    """
    TensorBoard callback that logs training and validation scalars separately.

    Training summaries go to ``<log_dir>/training`` through the parent
    callback; validation summaries (log keys containing ``'val_'``) go to
    ``<log_dir>/validation`` through a writer owned by this class.

    NOTE(review): relies on ``tf.contrib``, ``tf.Summary``,
    ``tf.summary.FileWriter`` and the parent's ``_write_custom_summaries``
    hook -- these look like TensorFlow 1.x-era APIs; confirm the installed
    version before upgrading.
    """

    def __init__(self, log_dir='./logs', **kwargs):
        """
        :param log_dir: Root directory; 'training' and 'validation'
            sub-directories are derived from it
        :param kwargs: Forwarded unchanged to the parent TensorBoard callback
        """
        self.val_log_dir = os.path.join(log_dir, 'validation')
        training_log_dir = os.path.join(log_dir, 'training')
        # The parent only ever sees the training sub-directory.
        super(TrainValTensorBoard, self).__init__(training_log_dir, **kwargs)

    def set_model(self, model):
        """Create the validation writer, then let the parent attach the model."""
        # The writer type depends on whether eager execution is active.
        if context.executing_eagerly():
            self.val_writer = tf.contrib.summary.create_file_writer(self.val_log_dir)
        else:
            self.val_writer = tf.summary.FileWriter(self.val_log_dir)
        super(TrainValTensorBoard, self).set_model(model)

    def _write_custom_summaries(self, step, logs=None):
        """
        Write validation metrics to the validation writer and pass the
        remaining metrics on to the parent implementation.

        :param step: Step value recorded with every scalar
        :param logs: Mapping of metric name to value; None is treated as empty
        """
        logs = logs or {}
        # Keys containing 'val_' are validation metrics; the marker is removed
        # so they share a tag name with the matching training metric.
        val_logs = {k.replace('val_', ''): v for k, v in logs.items() if 'val_' in k}
        if context.executing_eagerly():
            with self.val_writer.as_default(), tf.contrib.summary.always_record_summaries():
                for name, value in val_logs.items():
                    # assumes value exposes .item() (e.g. a NumPy scalar) -- TODO confirm
                    tf.contrib.summary.scalar(name, value.item(), step=step)
        else:
            for name, value in val_logs.items():
                # Build one scalar summary entry per validation metric.
                summary = tf.Summary()
                summary_value = summary.value.add()
                summary_value.simple_value = value.item()
                summary_value.tag = name
                self.val_writer.add_summary(summary, step)
        self.val_writer.flush()

        # Only the non-validation metrics are handed to the parent.
        logs = {k: v for k, v in logs.items() if not 'val_' in k}
        super(TrainValTensorBoard, self)._write_custom_summaries(step, logs)

    def on_train_end(self, logs=None):
        """Run the parent's end-of-training handling, then close the validation writer."""
        super(TrainValTensorBoard, self).on_train_end(logs)
        self.val_writer.close()

In [20]:
#Please take note that most of this part was extracted from class exercises, with some additions

def create_keras_model(vocab_size, embedding_size=64, hidden_size=256, num_classes=4, num_layers=3):
    """
    Build and compile a stacked bidirectional-LSTM sequence tagger.

    :param vocab_size: Number of rows in the embedding table (number of input ids)
    :param embedding_size: Dimension of each embedding vector
    :param hidden_size: Units per LSTM direction
    :param num_classes: Number of output tags per time step (default 4)
    :param num_layers: Number of stacked bidirectional LSTM layers (default 3)
    :return model: Compiled Keras Sequential model
    """
    print("Creating KERAS model")

    model = K.models.Sequential()
    # mask_zero=True makes input id 0 a masked value for the layers that follow.
    model.add(Embedding(vocab_size, embedding_size, mask_zero=True))

    # return_sequences=True keeps one output per time step, which per-character
    # tagging needs.
    for _ in range(num_layers):
        model.add(Bidirectional(
            LSTM(hidden_size, dropout=0.2, recurrent_dropout=0.2, return_sequences=True),
            merge_mode='concat'))

    model.add(Dense(num_classes, activation='softmax'))
    optimizer = K.optimizers.Adam()
    # The targets are one-hot vectors over mutually exclusive tags and the
    # output is a softmax, so the matching loss is categorical cross-entropy.
    # With 'binary_crossentropy' the 'acc' metric is also computed per output
    # unit, which overstates tagging accuracy.
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['acc'])

    return model
    

In [21]:
batch_size = 32
epochs = 30
model_name = "uni_only_50pad_3bilstm.hdf5"

#checks if the FINAL model was saved and loads it instead of creating a new one
if os.path.exists(model_name):
    model = load_model(model_name)
    print("Using a pre-saved model")
    model.summary()
else:
    model = create_keras_model(vocab_size)
    model.summary()
    print("Training a new model")
    
    filepath = "models/uni_only_50pad_3bilstm-model-{epoch:02d}-{val_acc:.2f}.hdf5"
    # Per-epoch checkpoints are written under this directory; create it up
    # front so saving does not fail on a fresh checkout.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    checkpoint = K.callbacks.ModelCheckpoint(filepath, monitor='val_loss', verbose=0, save_best_only=False, save_weights_only=False, mode='auto', period=1)
    callbacks_list = [TrainValTensorBoard(write_graph=False), checkpoint]
    
    print("\nStarting training...")
    model.fit(train_x2, train_y2, epochs=epochs, batch_size=batch_size,
              shuffle=True, validation_data=(dev_x2, dev_y2), callbacks=callbacks_list) 
    print("Training complete.\n")
    
    #Save the FINAL model for later reuse
    model.save(model_name)
    print("Trained model saved for later use")

    # Evaluation runs on the dev split (the same data used for validation above).
    print("\nEvaluating test...")
    loss_acc = model.evaluate(dev_x2, dev_y2, verbose=0)
    print("Test data: loss = %0.6f  accuracy = %0.2f%% " % (loss_acc[0], loss_acc[1]*100))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Using a pre-saved model
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 64)          343488    
_________________________________________________________________
bidirectional (Bidirectional (None, None, 512)         657408    
_________________________________________________________________
bidirectional_1 (Bidirection (None, None, 512)         1574912   
_________________________________________________________________
bidirectional_2 (Bidirection (None, None, 512)         1574912   
_________________________________________________________________
dense (Dense)                (None, None, 4)           2052      
Total params: 4,152,772
Trainable params: 4,152,772
Non-trainable params: 0
_________________________________________________________________


# Predictions on the dev data

In [22]:
# Per-character tag probabilities for the dev inputs; the prediction and
# target shapes are printed for comparison
pred_dev_y2 = model.predict(dev_x2)
print(pred_dev_y2.shape, dev_y2.shape, sep="\n")

(5299, 50, 4)
(5299, 50, 4)


In [26]:
#Preview predictions for the first five characters of the first line

# Inverse of labels_vocab (id -> tag). Defined here because this cell is the
# first to use it; without it the cell raises NameError on a fresh kernel.
id_to_label = {v: k for k, v in labels_vocab.items()}

# Predicted probabilities and the tag with the highest probability per character
print(pred_dev_y2[0,0:5])
print([id_to_label[k] for k in np.argmax(pred_dev_y2[0,0:5], 1)])

# Gold one-hot targets and their tags for the same characters
print(dev_y2[0,0:5])
print([id_to_label[k] for k in np.argmax(dev_y2[0,0:5], 1)])

[[9.9977511e-01 2.6404706e-07 2.2462867e-04 4.9865708e-08]
 [9.9999952e-01 1.9819115e-09 4.3202360e-07 8.9653760e-08]
 [9.9096340e-01 9.0339603e-03 2.0147495e-06 7.5938999e-07]
 [7.4362538e-06 9.9911708e-01 2.3175540e-05 8.5239398e-04]
 [2.3481037e-09 1.7470664e-06 1.1806457e-05 9.9998641e-01]]
['S', 'S', 'S', 'B', 'I']
[[1 0 0 0]
 [1 0 0 0]
 [1 0 0 0]
 [0 1 0 0]
 [0 0 0 1]]
['S', 'S', 'S', 'B', 'I']


In [27]:
# Inverse of labels_vocab: integer id -> tag
id_to_label = dict(zip(labels_vocab.values(), labels_vocab.keys()))

In [29]:
# Decode gold targets: take the argmax over the class axis for every position,
# then map each id back to its tag.
val_labels = [
    [id_to_label[k] for k in row]
    for row in np.argmax(dev_y2, axis=-1)
]

# Decode model predictions the same way.
pred_labels = [
    [id_to_label[k] for k in row]
    for row in np.argmax(pred_dev_y2, axis=-1)
]