# Sentence classification by WordConv
Implementation of [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/abs/1408.5882) to classify sentiment of movie review

### Explanation of this notebook
* Dataset : [Naver sentiment movie corpus v1.0](https://github.com/e9t/nsmc)
* Preprocessing
    + Morphological analysis by Mecab wrapped by [konlpy](http://konlpy.org/en/latest/)
    + Using [FastText](https://arxiv.org/abs/1607.04606) embedding by [gluonnlp package](https://gluon-nlp.mxnet.io/)

### Setup

In [1]:
import os, sys
import konlpy
import gluonnlp as nlp
import numpy as np
import pandas as pd
import tensorflow as tf
import itertools
from tensorflow import keras
from pprint import pprint

slim = tf.contrib.slim

### Loading dataset

In [2]:
rating = pd.read_csv('./ratings.txt', sep = '\t')[['document', 'label']]

# document 내용이 없는 문서는 제거
rating = rating.loc[map(lambda elm : not elm, rating.document.isna()),:]
print(rating.shape)

(199992, 2)


### Preprocessing dataset

#### Spliting train, validation, test 

In [3]:
mecab = konlpy.tag.Mecab()

In [4]:
dataset = np.array([[doc, label] for doc, label in zip(rating.document, rating.label)])
pprint(dataset[:5])

array([['어릴때보고 지금다시봐도 재밌어요ㅋㅋ', '1'],
       ['디자인을 배우는 학생으로, 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산업이 부러웠는데. 사실 우리나라에서도 그 어려운시절에 끝까지 열정을 지킨 노라노 같은 전통이있어 저와 같은 사람들이 꿈을 꾸고 이뤄나갈 수 있다는 것에 감사합니다.',
        '1'],
       ['폴리스스토리 시리즈는 1부터 뉴까지 버릴께 하나도 없음.. 최고.', '1'],
       ['와.. 연기가 진짜 개쩔구나.. 지루할거라고 생각했는데 몰입해서 봤다.. 그래 이런게 진짜 영화지', '1'],
       ['안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화.', '1']], dtype='<U142')


In [5]:
# Splitting train, validation, test
tst_indices = np.random.choice(range(len(dataset)),
                               size = int(len(dataset) * .1),
                               replace = False)
tst_dataset = dataset[tst_indices]
training_dataset = np.delete(dataset, tst_indices, axis = 0)    

val_indices = np.random.choice(range(len(training_dataset)),
                               size = int(len(training_dataset) * .2),
                               replace = False)
val_dataset = training_dataset[val_indices]
tr_dataset = np.delete(training_dataset, val_indices, axis = 0)

tr_dataset = tr_dataset.tolist()
val_dataset = val_dataset.tolist()
tst_dataset = tst_dataset.tolist()
print(np.shape(tr_dataset), np.shape(val_dataset), np.shape(tst_dataset))

(143995, 2) (35998, 2) (19999, 2)


In [6]:
# train
X_tr = list(map(lambda sen : mecab.morphs(sen[0]), tr_dataset))
y_tr = list(map(lambda sen : int(sen[1]), tr_dataset))

# validation
X_val = list(map(lambda sen : mecab.morphs(sen[0]), val_dataset))
y_val = list(map(lambda sen : int(sen[1]), val_dataset))

# test
X_tst = list(map(lambda sen : mecab.morphs(sen[0]), tst_dataset))
y_tst = list(map(lambda sen : int(sen[1]), tst_dataset))

#### Building vocabulary and connecting vocab with fasttext

In [7]:
# X_tr에 대한 vocab 생성
counter = nlp.data.count_tokens(itertools.chain.from_iterable([c for c in X_tr]))
vocab = nlp.Vocab(counter,bos_token=None, eos_token=None, min_freq=15)

In [8]:
# Loading fasttext embedding 
fasttext_simple = nlp.embedding.create('fasttext', source='wiki.ko')

# vocab에 embedding 연결
vocab.set_embedding(fasttext_simple)

In [9]:
# final preprocessing
X_tr = list(map(lambda sen : [vocab.token_to_idx[token] for token in sen], X_tr))
X_tr = keras.preprocessing.sequence.pad_sequences(sequences = X_tr, maxlen = 30, padding = 'post',
                                                  value = 1.)

X_val = list(map(lambda sen : [vocab.token_to_idx[token] for token in sen], X_val))
X_val = keras.preprocessing.sequence.pad_sequences(sequences = X_val, maxlen = 30, padding = 'post',
                                                   value = 1.)

X_tst = list(map(lambda sen : [vocab.token_to_idx[token] for token in sen], X_tst))
X_tst = keras.preprocessing.sequence.pad_sequences(sequences = X_tst, maxlen = 30, padding = 'post',
                                                   value = 1.)

### Define WordConv class

In [10]:
class WordConv:
    def __init__(self, X, y, n_of_classes, embedding):
        
        with tf.variable_scope('input_layer'):
            self._X = X
            self._y = y
            self._is_training = tf.placeholder(dtype = tf.bool)
        
        with tf.variable_scope('embedding_layer'):
            static_embed = tf.get_variable(name = 'static', initializer = embedding,
                                     trainable = True)
            non_static_embed = tf.get_variable(name = 'non_static', initializer = embedding,
                                        trainable = False)
            static_batch = tf.nn.embedding_lookup(params = static_embed, ids = self._X)
            non_static_batch = tf.nn.embedding_lookup(params = non_static_embed, ids = self._X)
            
        with tf.variable_scope('convoluion_layer', reuse = tf.AUTO_REUSE):
            with slim.arg_scope([slim.conv1d], num_outputs = 100, padding = 'VALID'):
                
                static_3 = slim.conv1d(inputs = static_batch, kernel_size = 3, scope = '3_gram')
                non_static_3 = slim.conv1d(inputs = non_static_batch, kernel_size = 3, scope = '3_gram')

                static_4 = slim.conv1d(inputs = static_batch, kernel_size = 4, scope = '4_gram')
                non_static_4 = slim.conv1d(inputs = non_static_batch, kernel_size = 4, scope = '4_gram')

                static_5 = slim.conv1d(inputs = static_batch, kernel_size = 5, scope = '5_gram')
                non_static_5 = slim.conv1d(inputs = non_static_batch, kernel_size = 5, scope = '5_gram')

            fmap_3 = tf.reduce_max(static_3 + non_static_3, axis = 1)
            fmap_4 = tf.reduce_max(static_4 + non_static_4, axis = 1)
            fmap_5 = tf.reduce_max(static_5 + non_static_5, axis = 1)
            
        with tf.variable_scope('output_layer'):
            flattened = tf.concat([fmap_3, fmap_4, fmap_5], axis = -1)
            score = slim.fully_connected(inputs = flattened, num_outputs = n_of_classes,
                                         weights_regularizer = slim.l2_regularizer(.05),
                                         activation_fn = None)
            self._score = slim.dropout(inputs = score, is_training = self._is_training)
            
        with tf.variable_scope('loss'):
            ce_loss = tf.losses.sparse_softmax_cross_entropy(labels = self._y, logits = self._score)
            reg_term = tf.reduce_sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
            self.total_loss = ce_loss + reg_term
        
        with tf.variable_scope('prediction'):
            self._prediction = tf.argmax(self._score, axis = -1)
        
        def predict(self, sess, x_data, is_training = False):
            feed_prediction = {self._X : x_data, self._is_training : is_training}
            return sess.run(self._prediction, feed_dict = feed_prediction)

### Create a model of WordConv

In [11]:
# hyper-parameter
lr = .003
epochs = 5
batch_size = 128
total_step = int(X_tr.shape[0] / batch_size)
print(total_step)

1124


In [12]:
# train
tr_dataset = tf.data.Dataset.from_tensor_slices((X_tr, y_tr))
tr_dataset = tr_dataset.shuffle(buffer_size = 1000000)
tr_dataset = tr_dataset.batch(batch_size = batch_size)
tr_iterator = tr_dataset.make_initializable_iterator()

In [13]:
# val
val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))
val_dataset = val_dataset.batch(batch_size = batch_size)
val_iterator = val_dataset.make_initializable_iterator()

In [14]:
# anonymous iterator
handle = tf.placeholder(dtype = tf.string)
iterator = tf.data.Iterator.from_string_handle(string_handle = handle,
                                               output_types = tr_iterator.output_types,
                                               output_shapes = tr_iterator.output_shapes)
x_data, y_data = iterator.get_next()

In [15]:
word_conv = WordConv(X = x_data,
                     y = y_data,
                     n_of_classes=2,
                     embedding = vocab.embedding.idx_to_vec.asnumpy())

In [16]:
# create training op
opt = tf.train.AdamOptimizer(learning_rate = lr)
training_op = opt.minimize(loss = word_conv.total_loss)

### Training

In [17]:
sess_config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
sess = tf.Session(config = sess_config)
sess.run(tf.global_variables_initializer())
tr_handle, val_handle = sess.run(fetches = [tr_iterator.string_handle(), val_iterator.string_handle()])

In [18]:
%%time

tr_loss_hist = []
val_loss_hist = []

for epoch in range(epochs):

    avg_tr_loss = 0
    avg_val_loss = 0
    tr_step = 0
    val_step = 0

    # for mini-batch training
    sess.run(tr_iterator.initializer)    
    try:
        
        while True:
            _, tr_loss = sess.run(fetches = [training_op, word_conv.total_loss],
                                             feed_dict = {handle : tr_handle, word_conv._is_training : True})
            avg_tr_loss += tr_loss
            tr_step += 1

    except tf.errors.OutOfRangeError:
        pass

    # for validation
    sess.run(val_iterator.initializer)
    try:
        while True:
            val_loss = sess.run(fetches = word_conv.total_loss,
                                feed_dict = {handle : val_handle, word_conv._is_training : False})
            avg_val_loss += val_loss
            val_step += 1
    
    except tf.errors.OutOfRangeError:
        pass

    avg_tr_loss /= tr_step
    avg_val_loss /= val_step
    tr_loss_hist.append(avg_tr_loss)
    val_loss_hist.append(avg_val_loss)
    
    print('epoch : {:3}, tr_loss : {:.3f}, val_loss : {:.3f}'.format(epoch + 1, avg_tr_loss, avg_val_loss))

epoch :   1, tr_loss : 0.635, val_loss : 0.396
epoch :   2, tr_loss : 0.455, val_loss : 0.385
epoch :   3, tr_loss : 0.406, val_loss : 0.329
epoch :   4, tr_loss : 0.398, val_loss : 0.330
epoch :   5, tr_loss : 0.372, val_loss : 0.353
CPU times: user 1h 20min 11s, sys: 5min 2s, total: 1h 25min 13s
Wall time: 11min 50s


### Test

In [19]:
tst_dataset = tf.data.Dataset.from_tensor_slices((X_tst, y_tst))
tst_dataset = tst_dataset.batch(batch_size = batch_size)
tst_iterator = tst_dataset.make_initializable_iterator()

In [20]:
tst_handle = sess.run(tst_iterator.string_handle())

In [21]:
y_tst_hat = np.array([])

sess.run(tst_iterator.initializer)

try:
    while True:
        y_tst_tmp = sess.run(word_conv._prediction,
                            feed_dict = {handle : tst_handle,
                                         word_conv._is_training : False})
        y_tst_hat= np.append(y_tst_hat,y_tst_tmp)

except tf.errors.OutOfRangeError:
    pass

In [22]:
print('test acc : {:.2%}'.format(np.mean(y_tst_hat == np.array(y_tst))))

test acc : 84.87%
