#### REF
* https://github.com/dmlc/gluon-nlp/blob/master/docs/api/notes/data_api.rst

## Data Loading

In [1]:
import os
import pandas as pd
import numpy as np
import nltk
import collections
from sklearn.preprocessing import normalize
from sklearn.metrics import accuracy_score, auc
from mxnet import gluon


import time, re
import multiprocessing as mp
import itertools
from tqdm import tqdm, tqdm_notebook
import mxnet as mx
import spacy
os.environ['MXNET_ENGINE_TYPE'] = 'NaiveEngine'

### Another Data preparation
* Input data shape::$ (batch \times word \times vocab )$ 
* Split data: training & validation
* Create data iterator for training

In [2]:
MAX_SENTENCE_LENGTH = 20
MAX_VOCAB = 10000

In [3]:
nlp = spacy.load("en")

word_freq = collections.Counter()
max_len = 0
num_rec = 0
print('Count words and build vocab...')
with open('../data/umich-sentiment-train.txt', 'rb') as f:
    for line in f:
        _lab, _sen = line.decode('utf8').strip().split('\t')
        words = [token.lemma_ for token in nlp(_sen) if token.is_alpha] # Stop word제거 안한 상태 
        # 제거를 위해 [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]
        if len(words) > max_len:
            max_len = len(words)
        for word in words:
            word_freq[word] += 1
        num_rec += 1

# most_common output -> list
word2idx = {x[0]: i+2 for i, x in enumerate(word_freq.most_common(MAX_VOCAB - 2))}
word2idx ['PAD'] = 0
word2idx['UNK'] = 1

idx2word= {i:v for v, i in word2idx.items()}
vocab_size = len(word2idx)

print('Prepare data...')
y = []
x = []
origin_txt = []
with open('../data/umich-sentiment-train.txt', 'rb') as f:
    for line in f:
        _label, _sen = line.decode('utf8').strip().split('\t')
        origin_txt.append(_sen)
        y.append(int(_label))
        words = [token.lemma_ for token in nlp(_sen) if token.is_alpha] # Stop word제거 안한 상태
        words = [x for x in words if x != '-PRON-'] # '-PRON-' 제거
        _seq = []
        for word in words:
            if word in word2idx.keys():
                _seq.append(word2idx[word])
            else:
                _seq.append(word2idx['UNK'])
        if len(_seq) < MAX_SENTENCE_LENGTH:
            _seq.extend([0] * ((MAX_SENTENCE_LENGTH) - len(_seq)))
        else:
            _seq = _seq[:MAX_SENTENCE_LENGTH]
        x.append(_seq)

pd.DataFrame(y, columns = ['yn']).reset_index().groupby('yn').count().reset_index()

Count words and build vocab...
Prepare data...


Unnamed: 0,yn,index
0,0,3091
1,1,3995


In [4]:
tr_idx = np.random.choice(range(len(x)), int(len(x) * .8))
va_idx = [x for x in range(len(x)) if x not in tr_idx]

tr_x = [x[i] for i in tr_idx]
tr_y = [y[i] for i in tr_idx]
va_x = [x[i] for i in va_idx]
va_y = [y[i] for i in va_idx]

batch_size = 16

train_data = mx.io.NDArrayIter(data=[tr_x, tr_y], batch_size=batch_size, shuffle = False)
valid_data = mx.io.NDArrayIter(data=[va_x, va_y], batch_size=batch_size, shuffle = False)

In [5]:
from mxnet import gluon, autograd, nd
from mxnet.gluon import nn, rnn
import mxnet as mx
context = mx.gpu()

In [6]:
a = nd.random.normal(shape = (10, 1), ctx = context)

### Parameters

In [7]:
learning_rate = .0002
log_interval = 100
emb_dim = 100 # Emb dim
hidden_dim = 30 # Hidden dim for LSTM

In [14]:
class Sentence_Representation(nn.Block): ## Using LSTMCell : Only use the last time step
    def __init__(self, emb_dim, hidden_dim, vocab_size, dropout = .2, **kwargs):
        super(Sentence_Representation, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.emb_dim = emb_dim
        self.hidden_dim = hidden_dim
        with self.name_scope():
            self.f_hidden = []
            self.b_hidden = []
            self.embed = nn.Embedding(self.vocab_size, self.emb_dim)
            self.drop = nn.Dropout(.2)
            self.bi_rnn = rnn.BidirectionalCell(
                rnn.LSTMCell(hidden_size = self.hidden_dim // 2),
                rnn.LSTMCell(hidden_size = self.hidden_dim // 2)
            )
            
    def forward(self, x, _f_hidden):
        embeds = self.embed(x) # batch * time step * embedding
        _, h = self.bi_rnn.unroll(length = embeds.shape[1] \
                                , inputs = embeds \
                                , layout = 'NTC' \
                                , merge_outputs = True)
        #print('h shape = {}'.format(nd.concat(h[1], h[3], dim = 1).shape))
        return nd.concat(h[1], h[3], dim = 1)

    
    def begin_state(self, *args, **kwargs):
        return self.bi_rnn.begin_state(*args, **kwargs)

In [15]:
class SA_Classifier(nn.Block):
    def __init__(self, sen_rep, classifier, batch_size, context, **kwargs):
        super(SA_Classifier, self).__init__(**kwargs)
        self.batch_size = batch_size
        self.context = context
        with self.name_scope():
            self.sen_rep = sen_rep
            self.classifier = classifier
            
    def forward(self, x):
        hidden = self.sen_rep.begin_state(func = mx.nd.zeros \
                                               , batch_size = self.batch_size \
                                               , ctx = self.context)

        _x = self.sen_rep(x, hidden) # Use the last hidden step
        # Extract hidden state from _x
        _x = self.classifier(_x)
        return _x           

In [16]:
sen_rep = Sentence_Representation(emb_dim, hidden_dim, MAX_VOCAB)
classifier = nn.Sequential()
classifier.add(nn.Dense(16, activation = 'relu'))
classifier.add(nn.Dense(8, activation = 'relu'))
classifier.add(nn.Dense(1))
sa = SA_Classifier(sen_rep, classifier, 2, context)
sa.collect_params().initialize(mx.init.Xavier(), ctx = context)
trainer = gluon.Trainer(sa.collect_params(), 'adam', {'learning_rate': 1e-3})

In [17]:
emb_dim = 50 # Emb dim
hidden_dim = 30 # Hidden dim for LSTM
sa = SA_Classifier(sen_rep, classifier,  batch_size, context)
loss = gluon.loss.SigmoidBCELoss()
trainer = gluon.Trainer(sa.collect_params(), 'adam', {'learning_rate': 1e-3})

In [18]:
def evaluate(net, dataIterator, context):
    dataIterator.reset()
    loss = gluon.loss.SigmoidBCELoss()
    total_L = 0.0
    total_sample_num = 0
    total_correct_num = 0
    start_log_interval_time = time.time()
    for i, batch in enumerate(dataIterator):
        data =  batch.data[0].as_in_context(context)
        label = batch.data[1].as_in_context(context)
        output = net(data)
        L = loss(output, label)
        pred = (output > 0.5).reshape((-1,))
        #print('cor = {}'.format(pred == label))
        total_L += L.sum().asscalar()
        total_sample_num += len(label)
        total_correct_num += (pred == label).sum().asscalar()
        if (i + 1) % log_interval == 0:
            print('[Batch {}/{}] elapsed {:.2f} s'.format(
                i + 1, dataIterator.num_data//dataIterator.batch_size + 1,
                time.time() - start_log_interval_time))
            start_log_interval_time = time.time()
    avg_L = total_L / float(total_sample_num)
    acc = total_correct_num / float(total_sample_num)
    return avg_L, acc

In [19]:
n_epoch = 5
for epoch in tqdm_notebook(range(n_epoch), desc = 'epoch'):
    ## Training
    train_data.reset()
    # Epoch training stats
    start_epoch_time = time.time()
    epoch_L = 0.0
    epoch_sent_num = 0
    epoch_wc = 0
    # Log interval training stats
    start_log_interval_time = time.time()
    log_interval_wc = 0
    log_interval_sent_num = 0
    log_interval_L = 0.0
    
    for i, batch in enumerate(train_data):
        _data = batch.data[0].as_in_context(context)
        _label = batch.data[1].as_in_context(context)
        L = 0
        wc = len(_data)
        log_interval_wc += wc
        epoch_wc += wc
        log_interval_sent_num += _data.shape[1]
        epoch_sent_num += _data.shape[1]
        with autograd.record():
            _out = sa(_data)
            L = L + loss(_out, _label).mean().as_in_context(context)
        L.backward()
        trainer.step(_data.shape[0])
        log_interval_L += L.asscalar()
        epoch_L += L.asscalar()
        if (i + 1) % log_interval == 0:
            tqdm.write('[Epoch {} Batch {}/{}] elapsed {:.2f} s, \
                    avg loss {:.6f}, throughput {:.2f}K wps'.format(
                    epoch, i + 1, train_data.num_data//train_data.batch_size,
                    time.time() - start_log_interval_time,
                    log_interval_L / log_interval_sent_num,
                    log_interval_wc / 1000 / (time.time() - start_log_interval_time)))
            # Clear log interval training stats
            start_log_interval_time = time.time()
            log_interval_wc = 0
            log_interval_sent_num = 0
            log_interval_L = 0
    end_epoch_time = time.time()
    test_avg_L, test_acc = evaluate(sa, valid_data, context)
    tqdm.write('[Epoch {}] train avg loss {:.6f}, valid acc {:.2f}, \
        valid avg loss {:.6f}, throughput {:.2f}K wps'.format(
        epoch, epoch_L / epoch_sent_num,
        test_acc, test_avg_L, epoch_wc / 1000 /
        (end_epoch_time - start_epoch_time)))

HBox(children=(IntProgress(value=0, description='epoch', max=5), HTML(value='')))

[Epoch 0 Batch 100/354] elapsed 6.92 s,                     avg loss 0.022838, throughput 0.23K wps
[Epoch 0 Batch 200/354] elapsed 6.84 s,                     avg loss 0.002648, throughput 0.23K wps
[Epoch 0 Batch 300/354] elapsed 6.84 s,                     avg loss 0.001264, throughput 0.23K wps
[Batch 100/200] elapsed 3.75 s
[Batch 200/200] elapsed 3.75 s
[Epoch 0] train avg loss 0.007751, valid acc 0.99,         valid avg loss 0.032405, throughput 0.23K wps
[Epoch 1 Batch 100/354] elapsed 6.84 s,                     avg loss 0.000864, throughput 0.23K wps
[Epoch 1 Batch 200/354] elapsed 6.84 s,                     avg loss 0.000721, throughput 0.23K wps
[Epoch 1 Batch 300/354] elapsed 6.83 s,                     avg loss 0.000169, throughput 0.23K wps
[Batch 100/200] elapsed 3.74 s
[Batch 200/200] elapsed 3.75 s
[Epoch 1] train avg loss 0.000504, valid acc 0.99,         valid avg loss 0.027796, throughput 0.23K wps
[Epoch 2 Batch 100/354] elapsed 6.84 s,                     avg lo

### Prediction

In [20]:
def get_pred(net, iterator):
    pred_sa = []
    label_sa = []
    va_text = []
    iterator.reset()
    for i, batch in enumerate(iterator):
        if i % 100 == 0:
            print('i = {}'.format(i))
        data =  batch.data[0].as_in_context(context)
        label = batch.data[1].as_in_context(context)
        output = net(data)
        L = loss(output, label)
        pred = (nd.sigmoid(output) > 0.5).reshape((-1,))
        pred_sa.extend(pred.asnumpy())
        label_sa.extend(label.asnumpy())
        va_text.extend([' '.join([idx2word[np.int(x)] for x in y.asnumpy() if idx2word[np.int(x)] is not 'PAD']) for y in data])
    pred_sa_pd = pd.DataFrame(pred_sa, columns  = ['pred_sa'])
    label_pd = pd.DataFrame(label_sa, columns = ['label'])
    text_pd = pd.DataFrame(va_text, columns = ['text'])
    res = pd.concat([text_pd, pred_sa_pd, label_pd], axis = 1)
    return res

In [21]:
result = get_pred(sa, valid_data)

i = 0
i = 100


In [22]:
len(va_y)

3190

### Number of wrong classification

In [23]:
len(result[result['pred_sa'] != result['label']])

21

In [25]:
result[result['pred_sa'] != result['label']].iloc[2]['text']

'ok time to update wow have update for a long time ok so yeah watch over the hedge and mission'