In [56]:
import numpy as np
import collections
import tensorflow as tf
from nltk.tokenize import word_tokenize
from tensorflow.keras import Model, layers
import csv
import re
import pylab
import os
import pandas as pd

In [57]:
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

# Function for Char

In [58]:
# Read data with [character]
def vocabulary(strings):
    chars = sorted(list(set(list(''.join(strings)))))
    char_to_ix = { ch:i for i,ch in enumerate(chars) }
    vocab_size = len(chars)
    return vocab_size, char_to_ix

In [59]:
def preprocess_char(strings, char_to_ix, MAX_LENGTH):
    data_chars = [list(d.lower()) for _, d in enumerate(strings)]
    for i, d in enumerate(data_chars):
        if len(d)>MAX_LENGTH:
            d = d[:MAX_LENGTH]
        elif len(d) < MAX_LENGTH:
            d += [' '] * (MAX_LENGTH - len(d))
            
    data_ids = np.zeros([len(data_chars), MAX_LENGTH], dtype=np.int64)
    for i in range(len(data_chars)):
        for j in range(MAX_LENGTH):
            data_ids[i, j] = char_to_ix[data_chars[i][j]]
    return np.array(data_ids)

In [60]:
def read_data_chars():
    x_train, y_train, x_test, y_test = [], [], [], []
    cop = re.compile("[^a-z^A-Z^0-9^,^.^' ']")
    with open('./train_medium.csv', encoding='utf-8') as filex:
        reader = csv.reader(filex)
        for row in reader:
            data = cop.sub("", row[1])
            x_train.append(data)
            y_train.append(int(row[0]))

    with open('./test_medium.csv', encoding='utf-8') as filex:
        reader = csv.reader(filex)
        for row in reader:
            data = cop.sub("", row[1])
            x_test.append(data)
            y_test.append(int(row[0]))


    vocab_size, char_to_ix = vocabulary(x_train+x_test)
    x_train = preprocess_char(x_train, char_to_ix, MAX_DOCUMENT_LENGTH)
    y_train = np.array(y_train)
    x_test = preprocess_char(x_test, char_to_ix, MAX_DOCUMENT_LENGTH)
    y_test = np.array(y_test)

    x_train = tf.constant(x_train, dtype=tf.int64)
    y_train = tf.constant(y_train, dtype=tf.int64)
    x_test = tf.constant(x_test, dtype=tf.int64)
    y_test = tf.constant(y_test, dtype=tf.int64)

    return x_train, y_train, x_test, y_test, vocab_size

# Function for Word

In [61]:
def clean_str(text):
    text = re.sub(r"[^A-Za-z0-9(),!?\'\`\"]", " ", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = text.strip().lower()

    return text

In [62]:
def build_word_dict(contents):
    words = list()
    for content in contents:
        for word in word_tokenize(clean_str(content)):
            words.append(word)

    word_counter = collections.Counter(words).most_common()
    word_dict = dict()
    word_dict["<pad>"] = 0
    word_dict["<unk>"] = 1
    word_dict["<eos>"] = 2
    for word, _ in word_counter:
        word_dict[word] = len(word_dict)
    return word_dict

In [63]:
def preprocess_word(contents, word_dict, document_max_len):
    x = list(map(lambda d: word_tokenize(clean_str(d)), contents))
    x = list(map(lambda d: list(map(lambda w: word_dict.get(w, word_dict["<unk>"]), d)), x))
    x = list(map(lambda d: d + [word_dict["<eos>"]], x))
    x = list(map(lambda d: d[:document_max_len], x))
    x = list(map(lambda d: d + (document_max_len - len(d)) * [word_dict["<pad>"]], x))
    return x

In [64]:
def read_data_words():
    x_train, y_train, x_test, y_test = [], [], [], []
    cop = re.compile("[^a-z^A-Z^0-9^,^.^' ']")
    with open('./train_medium.csv', encoding='utf-8') as filex:
        reader = csv.reader(filex)
        for row in reader:
            data = cop.sub("", row[1])
            x_train.append(data)
            y_train.append(int(row[0]))

    with open('./test_medium.csv', encoding='utf-8') as filex:
        reader = csv.reader(filex)
        for row in reader:
            data = cop.sub("", row[1])
            x_test.append(data)
            y_test.append(int(row[0]))

    word_dict = build_word_dict(x_train+x_test)
    x_train = preprocess_word(x_train, word_dict, MAX_DOCUMENT_LENGTH)
    y_train = np.array(y_train)
    x_test = preprocess_word(x_test, word_dict, MAX_DOCUMENT_LENGTH)
    y_test = np.array(y_test)

    x_train = [x[:MAX_DOCUMENT_LENGTH] for x in x_train]
    x_test = [x[:MAX_DOCUMENT_LENGTH] for x in x_test]
    x_train = tf.constant(x_train, dtype=tf.int64)
    y_train = tf.constant(y_train, dtype=tf.int64)
    x_test = tf.constant(x_test, dtype=tf.int64)
    y_test = tf.constant(y_test, dtype=tf.int64)

    vocab_size = tf.get_static_value(tf.reduce_max(x_train))
    vocab_size = max(vocab_size, tf.get_static_value(tf.reduce_max(x_test))) + 1
    return x_train, y_train, x_test, y_test, vocab_size

# Models

In [65]:
FILTER_SHAPE2 = [20, 1]
N_FILTERS = 10
MAX_DOCUMENT_LENGTH = 100
one_hot_size = 256
HIDDEN_SIZE = 20
POOLING_WINDOW = 4
POOLING_STRIDE = 2
MAX_LABEL = 15
EMBEDDING_SIZE = 20

batch_size = 128
no_epochs = 2
lr = 0.01

seed = 10
tf.random.set_seed(seed)

In [66]:
# Build model
tf.keras.backend.set_floatx('float32')
class CharCNN(Model):
    def __init__(self, vocab_size=256):
        super(CharCNN, self).__init__()
        self.vocab_size = vocab_size
        
        # Weight variables and CNN cell
        self.conv1 = layers.Conv2D(N_FILTERS, [20,256], padding='VALID', activation='relu', use_bias=True)
        self.pool1 = layers.MaxPool2D(POOLING_WINDOW, POOLING_STRIDE, padding='SAME')
        self.conv2 = layers.Conv2D(N_FILTERS, FILTER_SHAPE2, padding='VALID', activation='relu', use_bias=True)
        self.pool2 = layers.MaxPool2D(POOLING_WINDOW, POOLING_STRIDE, padding='SAME')
        self.flatten = layers.Flatten()
        self.dense = layers.Dense(MAX_LABEL, activation='softmax')

    def call(self, x, drop_rate=0.5):
        # forward
        x = tf.one_hot(x, one_hot_size)
        x = x[..., tf.newaxis] 
        x = self.conv1(x)
        x = self.pool1(x)
        x = self.conv2(x)
        x = self.pool2(x)
        x = self.flatten(x)
        x = tf.nn.dropout(x, drop_rate)
        logits = self.dense(x)
        return logits

In [67]:
# Build model
tf.keras.backend.set_floatx('float32')
class WordCNN(Model):
    def __init__(self, vocab_size=20):
        super(WordCNN, self).__init__()
        self.vocab_size = vocab_size
        self.embedding = layers.Embedding(vocab_size, EMBEDDING_SIZE, input_length=MAX_DOCUMENT_LENGTH)
        
        # Weight variables and CNN cell
        self.conv1 = layers.Conv2D(N_FILTERS, [20,20], padding='VALID', activation='relu', use_bias=True)
        self.pool1 = layers.MaxPool2D(POOLING_WINDOW, POOLING_STRIDE, padding='SAME')
        self.conv2 = layers.Conv2D(N_FILTERS, FILTER_SHAPE2, padding='VALID', activation='relu', use_bias=True)
        self.pool2 = layers.MaxPool2D(POOLING_WINDOW, POOLING_STRIDE, padding='SAME')
        self.flatten = layers.Flatten()
        self.dense = layers.Dense(MAX_LABEL, activation='softmax')

    def call(self, x, drop_rate=0.5):
        # forward
        x = self.embedding(x)
        x = x[..., tf.newaxis] 
        x = self.conv1(x)
        x = self.pool1(x)
        x = self.conv2(x)
        x = self.pool2(x)
        x = self.flatten(x)
        x = tf.nn.dropout(x, drop_rate)
        logits = self.dense(x)
        return logits

In [68]:
# Build model
tf.keras.backend.set_floatx('float32')
class CharRNN(Model):
    def __init__(self, vocab_size=256, hidden_dim=20):
        super(CharRNN, self).__init__()
        # Hyperparameters
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        
        # Weight variables and RNN cell
        self.rnn = layers.RNN(
            tf.keras.layers.GRUCell(self.hidden_dim), unroll=True)
        self.dense = layers.Dense(MAX_LABEL, activation=None)

    def call(self, x, drop_rate):
        # forward logic
        x = tf.one_hot(x, one_hot_size)
        x = self.rnn(x)
        x = tf.nn.dropout(x, drop_rate)
        logits = self.dense(x)
    
        return logits

In [69]:
# Build model
tf.keras.backend.set_floatx('float32')
class WordRNN(Model):
    def __init__(self, vocab_size, hidden_dim=20):
        super(WordRNN, self).__init__()
        # Hyperparameters
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.embedding = layers.Embedding(vocab_size, 20, input_length=MAX_DOCUMENT_LENGTH)
        
        # Weight variables and RNN cell
        self.rnn = layers.RNN(
            tf.keras.layers.GRUCell(self.hidden_dim), unroll=True)
        self.dense = layers.Dense(MAX_LABEL, activation=None)

    def call(self, x, drop_rate):
        # forward logic
        embedding = self.embedding(x)
        encoding = self.rnn(embedding)

        encoding = tf.nn.dropout(encoding, drop_rate)
        logits = self.dense(encoding)
    
        return logits

# Training and Testing Function

In [70]:
# Training function
def train_step(model, x, label, drop_rate):
    with tf.GradientTape() as tape:
        out = model(x, drop_rate)
        loss = loss_object(label, out)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        
    train_loss(loss)
    train_accuracy(labels, out)

In [71]:
# Testing function
def test_step(model, x, label, drop_rate=0):
    out = model(x,drop_rate)
    t_loss = loss_object(label, out)
    test_loss(t_loss)
    test_accuracy(label, out)

# Load Data

In [72]:
fig_dir = "./all_figure"
if not os.path.exists(fig_dir):
    os.mkdir(fig_dir)

acc_file = "accuracy.xlsx"

question = [1,2,3,4,'5.1','5.2','5.3','5.4'] #Question

for model_ in question:
    if model_ == 1: #CharCNN drop
        x_train, y_train, x_test, y_test, _ = read_data_chars()
        model = CharCNN(256)
        drop = False
        model_name = "CharCNN_no_drop"
    elif model_ == 2:  #WordCNN drop
        x_train, y_train, x_test, y_test, vocab_size = read_data_words()
        model = WordCNN(vocab_size)
        drop = False
        model_name = "WordCNN_no_drop"
    elif model_ == 3:  #CharRNN drop
        x_train, y_train, x_test, y_test, _ = read_data_chars()
        model = CharRNN(256, HIDDEN_SIZE)
        drop = False
        model_name = "CharRNN_no_drop"
    elif model_ == 4:  #WordRNN drop
        x_train, y_train, x_test, y_test, vocab_size = read_data_words()
        model = WordRNN(vocab_size, HIDDEN_SIZE)
        drop = False
        model_name = "WordRNN_no_drop"
    elif model_ == '5.1':
        x_train, y_train, x_test, y_test, _ = read_data_chars()
        model = CharCNN(256)
        drop = True
        model_name = "CharCNN_w_drop"
    elif model_ == '5.2':  #WordCNN drop
        x_train, y_train, x_test, y_test, vocab_size = read_data_words()
        model = WordCNN(vocab_size)
        drop = True
        model_name = "WordCNN_w_drop"
    elif model_ == '5.3':  #CharRNN drop
        x_train, y_train, x_test, y_test, _ = read_data_chars()
        model = CharRNN(256, HIDDEN_SIZE)
        drop = True
        model_name = "CharRNN_w_drop"
    elif model_ == '5.4':  #WordRNN drop
        x_train, y_train, x_test, y_test, vocab_size = read_data_words()
        model = WordRNN(vocab_size, HIDDEN_SIZE)
        drop = True
        model_name = "WordRNN_w_drop"
    else:
        raise NotImplementedError(f'Error on Model If-Else')
    
    print(f"Question {model_} Model - {model_name}")
    
    train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train)).shuffle(10000).batch(batch_size)
    test_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(batch_size)

    # Choose optimizer and loss function for training
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    # Select metrics to measure the loss and the accuracy of the model. 
    # These metrics accumulate the values over epochs and then print the overall result.
    train_loss = tf.keras.metrics.Mean(name='train_loss')
    train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

    test_loss = tf.keras.metrics.Mean(name='test_loss')
    test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='test_accuracy')

    train_acc = []
    train_cost = []
    test_acc = []
    test_cost = []
    
    for epoch in range(no_epochs):
        # Reset the metrics at the start of the next epoch
        train_loss.reset_states()
        train_accuracy.reset_states()
        test_loss.reset_states()
        test_accuracy.reset_states()

        for images, labels in train_ds:
            if drop == True:
                train_step(model, images, labels, drop_rate=0.5)
            else:
                train_step(model, images, labels, drop_rate=0)

        for images, labels in test_ds:
            test_step(model, images, labels, drop_rate=0)

        train_acc.append(train_accuracy.result())
        train_cost.append(train_loss.result())
        test_acc.append(test_accuracy.result())
        test_cost.append(test_loss.result())
        template = 'Epoch {}, Loss: {}, Accuracy: {}, Test Loss: {}, Test Accuracy: {}'
        print (template.format(epoch+1,
                              train_loss.result(),
                              train_accuracy.result(),
                              test_loss.result(),
                              test_accuracy.result()))
    
    pylab.figure()
    pylab.plot(test_acc, label='Test')
    pylab.plot(train_acc, label='Train')
    pylab.xlabel('epochs')
    pylab.ylabel('{model_name} Accuracy')
    pylab.legend(loc='lower right')
    pylab.savefig(f"{fig_dir}/q{model_}_{model_name}_Accuracy.png")
    pylab.close()

    # Plot train error
    pylab.figure()
    pylab.plot(train_cost, label='Train')
    pylab.plot(test_cost, label='Test')
    pylab.xlabel('epochs')
    pylab.ylabel('{model_name} Loss')
    pylab.legend(loc='lower right')
    pylab.savefig(f"{fig_dir}/q{model_}_{model_name}_loss.png")
    pylab.close()

    if os.path.exists(acc_file):
        df = pd.read_excel(acc_file)
        df[model_name+"_test_acc"] = np.array(test_acc)
    else:
        df = pd.DataFrame(np.array(test_acc), columns=[model_name+"_test_acc"])
    df.to_excel(acc_file, index=False)

Question 1 Model - CharCNN_no_drop
Epoch 1, Loss: 2.6183876991271973, Accuracy: 0.18464285135269165, Test Loss: 2.5028140544891357, Test Accuracy: 0.3257142901420593
Epoch 2, Loss: 2.445434331893921, Accuracy: 0.3700000047683716, Test Loss: 2.365260362625122, Test Accuracy: 0.4628571569919586


KeyboardInterrupt: 