Read embeddings matrix.

In [1]:
import os
import numpy as np
from gensim.models import KeyedVectors


# Create embeddings_index

kv = KeyedVectors.load_word2vec_format(
        os.path.join('data', 'GoogleNews-vectors-negative300.bin'), 
        binary = True
      )

embeddings_index = {}
for word, vector in zip(list(kv.index_to_key), kv.vectors):
    coefs = np.asarray(vector, dtype='float32')
    embeddings_index[word] = coefs



Read pure data and split it.

In [2]:
import tensorflow as tf
import string
import random

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import one_hot

tf.random.set_seed(1137)
random.seed(1137)

def preprocess(text):
    # Split the text by space
    
    translator = str.maketrans(string.punctuation, ' '*len(string.punctuation))
    text = text.translate(translator)
    
    text = text.split()
    return text

def to_embedding(tokens, embeddings_index):
    # Transform the tokens into embeddings
    
    embeddings = []
    for i in range(0, len(tokens)):
        try:
            embeddings.append(embeddings_index[tokens[i]])
        except:
            continue
    return embeddings

def add_padding(embeddings, padding_width = None):
    #
    
    emb_padded = pad_sequences(embeddings, maxlen=padding_width, padding='pre', dtype='float32')
    return emb_padded

def process_text(sentences, embeddings_index, padding = None):
    # Combine the processes
    
    result = [ preprocess(sentence) for sentence in sentences ]
    result = [ to_embedding(sentence, embeddings_index) for sentence in result ]
    result = add_padding(result, padding)
    return result

# Define senses
SENSE = {
    1: 'a social event at which a group of people meet to talk, eat, drink, dance, etc.', # 派對
    2: 'an organization of people with particular political beliefs', # 政黨
    3: 'a single entity which can be identified as one for the purposes of the law' # （法庭）當事人；⋯⋯方
}

# Read labeled data

with open(os.path.join('data', 'party.labeled.txt'), 'r', encoding="utf-8") as f:
    data = f.read().strip().split('\n')


pure_data = [[text, label] for sent_id, label, text in [line.split('\t', 2) for line in data]]

# Split the pure_data

middle = 50

pure_data_train = pure_data[:middle]
pure_data_test = pure_data[middle:]

Data augmentation.

Method 1: Random words replacing with similar word.

In [51]:
# Replace some words in the original sentence

def replace_random_word(pure_data_train):
    NUMBER_REPLACED = 6

    pdt_r = []

    for d in pure_data_train:

        text = d[0]
        label = d[1]

        # Clean the punctuation
        translator = str.maketrans(string.punctuation, ' '*len(string.punctuation))
        text = text.translate(translator)

        t = text.split()

        pdt_r.append([' '.join(t), label])

        # Generate the indices of the words that are about to being replaced
        replaced_index = []

        while(len(replaced_index) < NUMBER_REPLACED):
            r = random.randrange(len(t))
            if(t[r] == 'party' or r in replaced_index):
                continue
            replaced_index.append(r)

        for idx in replaced_index:

            temp_t = t.copy()
            # Replace the word with the most similar word
            try:
                temp_t[idx] = kv.most_similar(positive=[t[idx]], topn=1)[0][0]
                pdt_r.append([' '.join(temp_t), label])
            except:
                pass
    return pdt_r

pdt_r = replace_random_word(pure_data_train)

In [52]:
data_train = pdt_r

Train the model.

In [53]:
# Process input
X_train = process_text([d[0] for d in data_train], embeddings_index)
X_test = process_text([d[0] for d in pure_data_test], embeddings_index)

Y_train = [int(d[1])-1 for d in data_train]
Y_train = tf.one_hot(Y_train, 3, axis=1, dtype=tf.float32)
Y_test = [int(d[1])-1 for d in pure_data_test]
Y_test = tf.one_hot(Y_test, 3, axis=1, dtype=tf.float32)

print(f"[Train size: Test size]: [{X_train.shape[0]}: {X_test.shape[0]}]")

# Training

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

_, PADDING_WIDTH, EMBEDDING_DIM = X_train.shape
BATCH_SIZE = 2
EPOCHS = 5
OUTPUT_CATEGORY = len(SENSE)

model = Sequential()
model.add(tf.keras.layers.LSTM(4))
model.add(tf.keras.layers.Dense(64))
model.add(tf.keras.layers.Dense(OUTPUT_CATEGORY, activation='sigmoid'))

model.compile(optimizer='Adam',
                loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False),
                metrics=['accuracy'])

history = model.fit(
    X_train, Y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS
)


# Evaluate

results = model.evaluate(X_test, Y_test, batch_size=BATCH_SIZE)
print(f"Test loss: {results[0]}")
print(f"Test accuracy: {results[1]}")

[Train size: Test size]: [308: 657]
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test loss: 0.8439046740531921
Test accuracy: 0.7579908967018127
