In [4]:
import numpy as np
import string
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import keras

from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from sklearn import random_projection
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords

from keras.layers import Input ,Dense, Dropout, Activation, LSTM, Embedding
from keras.layers import Conv1D, Conv2D, Convolution2D, MaxPool2D,MaxPooling2D, Flatten, Reshape, BatchNormalization, Concatenate
from keras.models import Sequential
from keras.layers.wrappers import TimeDistributed
from keras.layers.pooling import GlobalAveragePooling1D
from keras.optimizers import SGD
from keras.utils import np_utils
from keras.models import Model
from keras import metrics
from keras.regularizers import l1

stop_words = set(stopwords.words('english') + list(string.punctuation))

import warnings
warnings.filterwarnings('ignore')

stop_words = set(stopwords.words('english') + list(string.punctuation))


# -------------- Helper Functions --------------
def tokenize(text):
    '''
    :param text: a doc with multiple sentences, type: str
    return a word list, type: list
    https://textminingonline.com/dive-into-nltk-part-ii-sentence-tokenize-and-word-tokenize
    e.g.
    Input: 'It is a nice day. I am happy.'
    Output: ['it', 'is', 'a', 'nice', 'day', 'i', 'am', 'happy']
    '''
    tokens = []
    for word in nltk.word_tokenize(text):
        word = word.lower()
        if word not in stop_words and not word.isnumeric():
            tokens.append(word)
    return tokens


def get_sequence(data, seq_length, vocab_dict):
    '''
    :param data: a list of words, type: list
    :param seq_length: the length of sequences,, type: int
    :param vocab_dict: a dict from words to indices, type: dict
    return a dense sequence matrix whose elements are indices of words,
    '''
    data_matrix = np.zeros((len(data), seq_length), dtype=int)
    for i, doc in enumerate(data):
        for j, word in enumerate(doc):
            # YOUR CODE HERE
            if j == seq_length:
                break
            word_idx = vocab_dict.get(word, 1) # 1 means the unknown word
            data_matrix[i, j] = word_idx
    return data_matrix


def read_data(file_name, input_length, vocab=None):
    """
    https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html
    """
    df = pd.read_csv(file_name)
    df['words'] = df['text'].apply(tokenize)

    if vocab is None:
        vocab = set()
        for i in range(len(df)):
            for word in df.iloc[i]['words']:
                vocab.add(word)
    vocab_dict = dict()
    vocab_dict['<pad>'] = 0 # 0 means the padding signal
    vocab_dict['<unk>'] = 1 # 1 means the unknown word
    vocab_size = 2
    for v in vocab:
        vocab_dict[v] = vocab_size
        vocab_size += 1

    data_matrix = get_sequence(df['words'], input_length, vocab_dict)
    stars = df['stars'].apply(int) - 1
    return df['review_id'], stars, data_matrix, vocab
# ----------------- End of Helper Functions-----------------


def load_data(input_length):
     # Load training data and vocab
    train_id_list, train_data_label, train_data_matrix, vocab = read_data("./data/train.csv", input_length)
    K = max(train_data_label)+1  # labels begin with 0

    # Load testing data
    test_id_list, _, test_data_matrix, _ = read_data("./data/valid.csv", input_length, vocab=vocab)
    test_data_label = pd.read_csv("./data/valid.csv")['stars'] - 1
    
    print("Vocabulary Size:", len(vocab))
    print("Training Set Size:", len(train_id_list))
    print("Test Set Size:", len(test_id_list))
    print("Training Set Shape:", train_data_matrix.shape)
    print("Testing Set Shape:", test_data_matrix.shape)

    # Converts a class vector to binary class matrix.
    # https://keras.io/utils/#to_categorical
    train_data_label = keras.utils.to_categorical(train_data_label, num_classes=K)
    test_data_label = keras.utils.to_categorical(test_data_label, num_classes=K)
    return train_data_matrix, train_data_label, test_data_matrix, test_data_label, vocab

In [2]:
# Hyperparameters

input_length = 300
embedding_size = 300
hidden_size = 100
batch_size = 100
dropout_rate = 0.3
filters = 100
kernel_sizes = [3, 4, 5]
padding = 'valid'
activation = 'relu'
strides = 1
pool_size = 2
learning_rate = 0.1
total_epoch = 10

In [5]:
train_data_matrix, train_data_label, test_data_matrix, test_data_label, vocab = load_data(input_length)

Vocabulary Size: 114544
Training Set Size: 100000
Test Set Size: 10000
Training Set Shape: (100000, 300)
Testing Set Shape: (10000, 300)


In [6]:
train_data_matrix.shape

(100000, 300)

In [7]:
# Data shape
N = train_data_matrix.shape[0]
K = train_data_label.shape[1]

input_size = len(vocab) + 2
output_size = K

print(input_size)
print(output_size)

114546
5


## Glove

In [8]:
from gensim.models import KeyedVectors 

filename = './glove.840B.300d.txt.word2vec'
embeddings_index = KeyedVectors.load_word2vec_format(filename, binary=False)

In [9]:
vocab_dict = dict()
vocab_dict['<pad>'] = 0 # 0 means the padding signal
vocab_dict['<unk>'] = 1 # 1 means the unknown word
vocab_size = 2
for v in vocab:
    vocab_dict[v] = vocab_size
    vocab_size += 1
embedding_matrix = np.random.random((len(vocab) + 2, embedding_size))
for word, i in vocab_dict.items():
    if word in embeddings_index:
        embedding_matrix[i] = embeddings_index[word]
#     if i < len(vocab) + 2:
#         embedding_vector = embeddings_index[word]
#         if embedding_vector is not None:
#             embedding_matrix[i] = embedding_vector

In [10]:
embedding_layer = Embedding(len(vocab) + 2,
                            embedding_size,
                            weights=[embedding_matrix],
                            input_length=input_length,
                            trainable=False)

## LSTM + Attention

In [11]:
# Creating Callbacks
# ModelCheckpoints is used to save the model after every epoch
# EarlyStopping is used to stop training when the validation loss has not improved after 2 epochs
# Tensorboard is used tovisualize dynamic graphs of the training and test metrics

from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
cbks = [callbacks.ModelCheckpoint(filepath='./checkpoint_model.h5', monitor='val_loss', save_best_only=True),
        callbacks.EarlyStopping(monitor='val_loss', patience=2), callbacks.TensorBoard(log_dir='./logs', histogram_freq=0, write_graph=True, write_images=False)]

In [12]:
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints

class Attention(Layer):
    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight((input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        return None

    def call(self, x, mask=None):
        features_dim = self.features_dim
        step_dim = self.step_dim

        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        if mask is not None:
            a *= K.cast(mask, K.floatx())

        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0],  self.features_dim

In [14]:
from keras.models import Model
from keras.layers import Dense, Embedding, Input
from keras.layers import LSTM, Bidirectional, Dropout


inp = Input(shape=(input_length, ))

x = embedding_layer(inp)

x = Bidirectional(LSTM(300, return_sequences=True, dropout=dropout_rate,
                       recurrent_dropout=dropout_rate))(x)

x = Attention(input_length)(x)

x = Dense(256, activation="relu")(x)
x = Dropout(0.25)(x)
x = Dense(5, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)

optimizer = Adam()

# compile model
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 300)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 300, 300)          34363800  
_________________________________________________________________
bidirectional_2 (Bidirection (None, 300, 600)          1442400   
_________________________________________________________________
attention_2 (Attention)      (None, 600)               900       
_________________________________________________________________
dense_3 (Dense)              (None, 256)               153856    
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 5)                 1285      
Total para

In [15]:
# training
model.fit(train_data_matrix, train_data_label, epochs=10, batch_size=batch_size)

# (Error because of mistyping)
#from keras.models import load_model
#model = load_model('./checkpoint_model.h5')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


OSError: Unable to open file (Unable to open file: name = './checkpoint_model.h5', errno = 2, error message = 'no such file or directory', flags = 0, o_flags = 0)

In [16]:
# testing
train_score = model.evaluate(train_data_matrix, train_data_label, batch_size=batch_size)
test_score = model.evaluate(test_data_matrix, test_data_label, batch_size=batch_size)

print('Training Loss: {}\n Training Accuracy: {}\n'
      'Testng Loss: {}\n Testing accuracy: {}'.format(
          train_score[0], train_score[1],
          test_score[0], test_score[1]))

Training Loss: 0.5139199042320252
 Training Accuracy: 0.7809399987459182
Testng Loss: 0.7623750281333923
 Testing accuracy: 0.6919000029563904


## END

In [17]:
model.save('Bidirectional_Attention_100000_840B300d_10epochs.h5')
pre_id_list, _, pre_data_matrix, _ = read_data("data/test.csv", input_length, vocab=vocab)
# predicting
test_pre = model.predict(pre_data_matrix, batch_size=batch_size)
prediction = test_pre.argmax(axis=1)+1
sub_df = pd.DataFrame()
sub_df["review_id"] = pre_id_list
sub_df["pre"] = prediction
sub_df.to_csv("pre_Bidirectional_Attention_100000_840B300d_10epochs.csv", index=False)