# Sentence classification by MorphConv
Implementation of [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/abs/1408.5882) to classify the sentiment of movie reviews

### Explanation of this notebook
* Dataset : [Naver sentiment movie corpus v1.0](https://github.com/e9t/nsmc)
    + train, validation : splitting `ratings_train.txt` (150k reviews) for train (120k reviews) and validation (30k reviews)
    + test : `ratings_test.txt` (50k reviews)
* Preprocessing
    + Morphological analysis by Mecab wrapped by [konlpy](http://konlpy.org/en/latest/)
    + Using [FastText](https://arxiv.org/abs/1607.04606) embedding by [gluonnlp package](https://gluon-nlp.mxnet.io/)

### Setup

In [1]:
import os, sys
import konlpy
import gluonnlp as nlp
import numpy as np
import pandas as pd
import tensorflow as tf
import itertools
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from pprint import pprint

print(tf.__version__)

1.11.0


### Loading dataset

In [2]:
# Load the train/test corpora, keeping only the review text and its label.
ratings = pd.read_csv('./ratings_train.txt', sep = '\t')[['document', 'label']]
ratings_tst = pd.read_csv('./ratings_test.txt', sep = '\t')[['document', 'label']]

# The document column of both frames contains NaN values; replace them with empty strings.
print(sum(ratings.document.isna()), sum(ratings_tst.document.isna()))

# Assign the whole column instead of writing through chained indexing
# (`frame.document[mask] = ''`), which may modify a temporary copy.
ratings['document'] = ratings['document'].fillna('')
ratings_tst['document'] = ratings_tst['document'].fillna('')

print(sum(ratings.document.isna()), sum(ratings_tst.document.isna()))

5 3
0 0


In [3]:
# Hold out 20% of the training corpus for validation.
# A fixed seed keeps the split identical across kernel restarts.
SPLIT_SEED = 42
split_rng = np.random.RandomState(SPLIT_SEED)

n_ratings = ratings.shape[0]
val_indices = split_rng.choice(a = n_ratings, size = int(n_ratings * .2), replace = False)
# Every row position not drawn for validation, in ascending order.
tr_indices = np.setdiff1d(np.arange(n_ratings), val_indices)

ratings_tr = ratings.iloc[tr_indices,:]
ratings_val = ratings.iloc[val_indices,:]

print(ratings_tr.shape, ratings_val.shape, ratings_tst.shape)

(120000, 2) (30000, 2) (50000, 2)


### Preprocessing dataset

In [4]:
mecab = konlpy.tag.Mecab() # Any morphological analyzer would do; it does not have to be Mecab

In [5]:
%%time
# Split every review into morphemes with Mecab; labels become plain Python lists.

# train
X_tr = list(map(mecab.morphs, ratings_tr.document))
y_tr = ratings_tr.label.tolist()

# validation
X_val = list(map(mecab.morphs, ratings_val.document))
y_val = ratings_val.label.tolist()

# test
X_tst = list(map(mecab.morphs, ratings_tst.document))
y_tst = ratings_tst.label.tolist()

CPU times: user 14.5 s, sys: 124 ms, total: 14.6 s
Wall time: 14.6 s


#### Building vocabulary and connecting vocabulary with fasttext embedding

In [6]:
# Build the vocabulary from the training split only
counter = nlp.data.count_tokens(itertools.chain.from_iterable(X_tr))
vocab = nlp.Vocab(counter, bos_token = None, eos_token = None, min_freq = 15)

In [7]:
# Load the pre-trained fastText embedding (source 'wiki.ko')
fasttext_simple = nlp.embedding.create('fasttext', source='wiki.ko')

# Attach the embedding to the vocabulary
vocab.set_embedding(fasttext_simple)

In [8]:
%%time
# Final preprocessing: map tokens to vocabulary indices, then pad to a fixed length.

def _to_padded_indices(sentences, vocab, maxlen = 30):
    """Convert tokenized sentences into a fixed-width matrix of vocabulary indices.

    Args:
        sentences: iterable of token lists.
        vocab: vocabulary exposing `token_to_idx` and `padding_token`.
        maxlen: number of columns in the result; shorter sentences are padded
            at the end, longer ones are cut by `pad_sequences` (its default
            truncation side is kept).

    Returns:
        The array produced by `pad_sequences`, one row per sentence.
    """
    # Look the padding index up instead of hard-coding it.
    pad_idx = vocab.token_to_idx[vocab.padding_token]
    indexed = [[vocab.token_to_idx[token] for token in sen] for sen in sentences]
    return pad_sequences(sequences = indexed, maxlen = maxlen, padding = 'post', value = pad_idx)

X_tr = _to_padded_indices(X_tr, vocab)
X_val = _to_padded_indices(X_val, vocab)
X_tst = _to_padded_indices(X_tst, vocab)

CPU times: user 2.92 s, sys: 11.7 ms, total: 2.93 s
Wall time: 2.94 s


### Define MorphConv class

In [13]:
keras.layers.Conv1D?

In [14]:
keras.layers.Dense?

In [16]:
keras.layers.Dense?

In [None]:
class MorphConv:
    """Two-channel convolutional sentence classifier over morpheme indices.

    Follows the multichannel design of "Convolutional Neural Networks for
    Sentence Classification": one frozen and one fine-tuned copy of the same
    pre-trained embedding share convolution filters of width 3, 4 and 5,
    whose max-over-time features feed a softmax classifier.

    Attributes set by the constructor:
        _X, _y: the input and label tensors passed in.
        _is_training: boolean placeholder that switches dropout on/off;
            it must be fed on every `sess.run` that touches the model.
        _score: unnormalized class scores (logits).
        total_loss: cross-entropy plus the L2 penalty of the output layer.
        _prediction: argmax of `_score` over the class axis.
    """
    def __init__(self, X, y, n_of_classes, embedding, dropout_rate = .5):
        """Build the graph.

        Args:
            X: tensor of token indices (assumed (batch, sequence) -- confirm with caller).
            y: tensor of integer class labels.
            n_of_classes: number of output classes.
            embedding: array used as the initial value of both embedding tables.
            dropout_rate: fraction of pooled features dropped while training.
        """
        with tf.variable_scope('input_layer'):
            # Single-underscore names: double-underscore attributes are name-mangled
            # and would not be reachable as `_X` / `_y` below or from outside.
            self._X = X
            self._y = y
            self._is_training = tf.placeholder(dtype = tf.bool)

        with tf.variable_scope('embedding_layer'):
            # The static channel stays frozen; the non-static channel is fine-tuned.
            static_embed = tf.get_variable(name = 'static', initializer = embedding,
                                           trainable = False)
            non_static_embed = tf.get_variable(name = 'non_static', initializer = embedding,
                                               trainable = True)
            static_batch = tf.nn.embedding_lookup(params = static_embed, ids = self._X)
            non_static_batch = tf.nn.embedding_lookup(params = non_static_embed, ids = self._X)

        with tf.variable_scope('convolution_layer'):
            pooled = []
            for kernel_size in (3, 4, 5):
                # One layer object per n-gram width so both channels share its filters.
                n_gram = keras.layers.Conv1D(filters = 100, kernel_size = kernel_size,
                                             activation = keras.activations.relu,
                                             kernel_initializer = 'he_uniform', padding = 'valid')
                summed = n_gram(static_batch) + n_gram(non_static_batch)
                # Max over the sequence axis.
                pooled.append(tf.reduce_max(summed, axis = 1))

        with tf.variable_scope('output_layer'):
            flattened = tf.concat(pooled, axis = -1)
            # Dropout goes on the pooled features (penultimate layer), not on the logits.
            dropped = keras.layers.Dropout(rate = dropout_rate)(flattened,
                                                                training = self._is_training)
            dense = keras.layers.Dense(units = n_of_classes,
                                       kernel_regularizer = keras.regularizers.l2(.5))
            self._score = dense(dropped)

        with tf.variable_scope('loss'):
            ce_loss = tf.losses.sparse_softmax_cross_entropy(labels = self._y, logits = self._score)
            # Read the penalty from the layer itself rather than from the
            # REGULARIZATION_LOSSES graph collection, which tf.keras layers
            # are not guaranteed to populate.
            reg_term = tf.add_n(dense.losses)
            self.total_loss = ce_loss + reg_term

        with tf.variable_scope('prediction'):
            self._prediction = tf.argmax(self._score, axis = -1)

    # predict instance method for small dataset
    def predict(self, sess, x_data, is_training = False):
        """Return predicted class ids for `x_data`.

        `x_data` is fed in place of the model's input tensor, so it has to fit
        in a single `sess.run` call.
        """
        feed_prediction = {self._X : x_data, self._is_training : is_training}
        return sess.run(self._prediction, feed_dict = feed_prediction)

### Create a MorphConv model

In [None]:
# hyper-parameters
lr, epochs, batch_size = .003, 5, 100

# number of full mini-batches in one pass over the training set
total_step = X_tr.shape[0] // batch_size
print(total_step)

In [None]:
# Training pipeline: slice -> shuffle -> batch, consumed through a re-initializable iterator
tr_dataset = (tf.data.Dataset.from_tensor_slices((X_tr, y_tr))
              .shuffle(buffer_size = 1000000)
              .batch(batch_size = batch_size))
tr_iterator = tr_dataset.make_initializable_iterator()

In [None]:
# Validation pipeline: batched in order, no shuffling
val_dataset = (tf.data.Dataset.from_tensor_slices((X_val, y_val))
               .batch(batch_size = batch_size))
val_iterator = val_dataset.make_initializable_iterator()

In [None]:
# Feedable iterator: the string fed into `handle` selects which concrete
# iterator supplies the next batch.
handle = tf.placeholder(tf.string)
iterator = tf.data.Iterator.from_string_handle(handle,
                                               tr_iterator.output_types,
                                               tr_iterator.output_shapes)
x_data, y_data = iterator.get_next()

In [None]:
# Instantiate the classifier defined in this notebook (the class is named MorphConv).
word_conv = MorphConv(X = x_data,
                      y = y_data,
                      n_of_classes = 2,
                      embedding = vocab.embedding.idx_to_vec.asnumpy())

In [None]:
# Training op: Adam minimizing the model's total loss
opt = tf.train.AdamOptimizer(lr)
training_op = opt.minimize(word_conv.total_loss)

### Training

In [None]:
# Let the session grow GPU memory on demand
sess_config = tf.ConfigProto(gpu_options = tf.GPUOptions(allow_growth = True))
sess = tf.Session(config = sess_config)
sess.run(tf.global_variables_initializer())

# Resolve the string handles that are fed into `handle` to pick a data source
tr_handle, val_handle = sess.run([tr_iterator.string_handle(),
                                  val_iterator.string_handle()])

In [None]:
%%time

def _drain_iterator(fetches, handle_value, training):
    """Run `fetches` until the selected iterator is exhausted.

    The last element of `fetches` must be the loss tensor.
    Returns the summed loss and the number of batches processed.
    """
    loss_sum, n_batches = 0, 0
    feed = {handle : handle_value, word_conv._is_training : training}
    try:
        while True:
            loss_sum += sess.run(fetches = fetches, feed_dict = feed)[-1]
            n_batches += 1
    except tf.errors.OutOfRangeError:
        # raised once the iterator has no more batches
        pass
    return loss_sum, n_batches


tr_loss_hist, val_loss_hist = [], []

for epoch in range(epochs):

    # mini-batch training pass
    sess.run(tr_iterator.initializer)
    avg_tr_loss, tr_step = _drain_iterator([training_op, word_conv.total_loss], tr_handle, True)

    # validation pass
    sess.run(val_iterator.initializer)
    avg_val_loss, val_step = _drain_iterator([word_conv.total_loss], val_handle, False)

    avg_tr_loss /= tr_step
    avg_val_loss /= val_step
    tr_loss_hist.append(avg_tr_loss)
    val_loss_hist.append(avg_val_loss)

    print('epoch : {:3}, tr_loss : {:.3f}, val_loss : {:.3f}'.format(epoch + 1, avg_tr_loss, avg_val_loss))

### Test

In [None]:
# Test pipeline: batched in order so predictions line up with y_tst
tst_dataset = (tf.data.Dataset.from_tensor_slices((X_tst, y_tst))
               .batch(batch_size = batch_size))
tst_iterator = tst_dataset.make_initializable_iterator()

In [None]:
# String handle of the test iterator, to be fed into `handle`
tst_handle = sess.run(tst_iterator.string_handle())

In [None]:
# Predict the test set batch by batch; collect the pieces and join them once
# (np.append inside the loop would copy the whole array on every batch and
# turn the integer class ids into floats).
tst_batches = []
tst_feed = {handle : tst_handle, word_conv._is_training : False}

sess.run(tst_iterator.initializer)

try:
    while True:
        tst_batches.append(sess.run(word_conv._prediction, feed_dict = tst_feed))

except tf.errors.OutOfRangeError:
    # raised once the test iterator has no more batches
    pass

if tst_batches:
    y_tst_hat = np.concatenate(tst_batches)
else:
    # no batch was produced: keep an empty integer array rather than failing
    y_tst_hat = np.array([], dtype = np.int64)

In [None]:
# Share of test reviews whose predicted label equals the true label
tst_acc = np.mean(np.asarray(y_tst) == y_tst_hat)
print('test acc : {:.2%}'.format(tst_acc))