#### REF
* https://github.com/dmlc/gluon-nlp/blob/master/docs/api/notes/data_api.rst

## Data Loading

In [1]:
import os
import pandas as pd
import numpy as np
import nltk
import collections
from sklearn.preprocessing import normalize
from sklearn.metrics import accuracy_score, auc
from mxnet import gluon


import time, re
import multiprocessing as mp
import itertools
from tqdm import tqdm, tqdm_notebook
import mxnet as mx
import spacy
os.environ['MXNET_ENGINE_TYPE'] = 'NaiveEngine'

### Another Data preparation
* Input data shape::$ (batch \times word \times vocab )$ 
* Split data: training & validation
* Create data iterator for training

In [2]:
MAX_SENTENCE_LENGTH = 20
MAX_VOCAB = 10000

In [None]:
nlp = spacy.load("en")

word_freq = collections.Counter()
max_len = 0
num_rec = 0
print('Count words and build vocab...')
with open('../data/umich-sentiment-train.txt', 'rb') as f:
    for line in f:
        _lab, _sen = line.decode('utf8').strip().split('\t')
        words = [token.lemma_ for token in nlp(_sen) if token.is_alpha] # Stop word제거 안한 상태 
        # 제거를 위해 [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]
        if len(words) > max_len:
            max_len = len(words)
        for word in words:
            word_freq[word] += 1
        num_rec += 1

# most_common output -> list
word2idx = {x[0]: i+2 for i, x in enumerate(word_freq.most_common(MAX_VOCAB - 2))}
word2idx ['PAD'] = 0
word2idx['UNK'] = 1

idx2word= {i:v for v, i in word2idx.items()}
vocab_size = len(word2idx)

print('Prepare data...')
y = []
x = []
origin_txt = []
with open('../data/umich-sentiment-train.txt', 'rb') as f:
    for line in f:
        _label, _sen = line.decode('utf8').strip().split('\t')
        origin_txt.append(_sen)
        y.append(int(_label))
        words = [token.lemma_ for token in nlp(_sen) if token.is_alpha] # Stop word제거 안한 상태
        words = [x for x in words if x != '-PRON-'] # '-PRON-' 제거
        _seq = []
        for word in words:
            if word in word2idx.keys():
                _seq.append(word2idx[word])
            else:
                _seq.append(word2idx['UNK'])
        if len(_seq) < MAX_SENTENCE_LENGTH:
            _seq.extend([0] * ((MAX_SENTENCE_LENGTH) - len(_seq)))
        else:
            _seq = _seq[:MAX_SENTENCE_LENGTH]
        x.append(_seq)

pd.DataFrame(y, columns = ['yn']).reset_index().groupby('yn').count().reset_index()

Count words and build vocab...
Prepare data...


In [4]:
## Data process - tr/va split and define iterator

tr_idx = np.random.choice(range(len(x)), int(len(x) * .8))
va_idx = [x for x in range(len(x)) if x not in tr_idx]

tr_x = [x[i] for i in tr_idx]
tr_y = [y[i] for i in tr_idx]
va_x = [x[i] for i in va_idx]
va_y = [y[i] for i in va_idx]

batch_size = 16

train_data = mx.io.NDArrayIter(data=[tr_x, tr_y], batch_size=batch_size, shuffle = False)
valid_data = mx.io.NDArrayIter(data=[va_x, va_y], batch_size=batch_size, shuffle = False)

In [5]:
from mxnet import gluon, autograd, nd
from mxnet.gluon import nn, rnn
import mxnet as mx
context = mx.cpu()

### Parameters

In [6]:
learning_rate = .0002
log_interval = 100
emb_dim = 100 # Emb dim
hidden_dim = 30 # Hidden dim for LSTM

In [7]:
class Sentence_Representation(nn.Block):
    def __init__(self, EMB_DIM, HIDDEN_DIM, VOCAB_SIZE, dropout = .2, **kwargs):
        super(Sentence_Representation, self).__init__(**kwargs)
        self.VOCAB_SIZE = VOCAB_SIZE
        self.EMB_DIM = EMB_DIM
        self.HIDDEN_DIM = HIDDEN_DIM
        with self.name_scope():
            self.hidden = []
            self.embed = nn.Embedding(VOCAB_SIZE, EMB_DIM)
            self.lstm = rnn.LSTM(HIDDEN_DIM // 2, num_layers= 2 \
                                 , dropout = dropout, input_size = EMB_DIM \
                                 , bidirectional=True)
            self.drop = nn.Dropout(.2)

    def forward(self, x, hidden):
        embeds = self.embed(x) # batch * time step * embedding: NTC
        lstm_out, self.hidden = self.lstm(nd.transpose(embeds, (1, 0, 2)), hidden) #TNC로 변환
        _hid = [nd.transpose(x, (1, 0, 2)) for x in self.hidden]
        # Concatenate depreciated. use concat. input list of tensors
        _hidden = nd.concat(*_hid)
        return lstm_out, self.hidden

    def begin_state(self, *args, **kwargs):
        return self.lstm.begin_state(*args, **kwargs)

In [13]:
class SA_Classifier(nn.Block):
    def __init__(self, sen_rep, classifier, batch_size, context, **kwargs):
        super(SA_Classifier, self).__init__(**kwargs)
        self.batch_size = batch_size
        self.context = context
        with self.name_scope():
            self.sen_rep = sen_rep
            self.classifier = classifier
            
    def forward(self, x):
        hidden = self.sen_rep.begin_state(func = mx.nd.zeros, batch_size = self.batch_size, ctx = self.context)
        #_x, _ = self.sen_rep(x, hidden)
        # Use the last cell state from both directions
        _, _x = self.sen_rep(x, hidden) 
        #print('x shape = {}'.format(_x[0].shape)) # Hidden state
        #print('x shape = {}'.format(_x[1].shape)) # Cell state
        # state = (2 * num_layers, batch_size, num_hidden): 2 for left and right LSTM
        # Select the last layer for both of left LSTM and right LSTM
        x = nd.concat(_x[1][1, :, :], _x[1][3, :, :], dim = -1)
        x = self.classifier(x)
        return x           

In [14]:
sen_rep = Sentence_Representation(emb_dim, hidden_dim, MAX_VOCAB)
#sen_rep.collect_params().initialize(mx.init.Xavier(), ctx = context)

classifier = nn.Sequential()
classifier.add(nn.Dense(16, activation = 'relu'))
classifier.add(nn.Dense(8, activation = 'relu'))
classifier.add(nn.Dense(1))

In [15]:
emb_dim = 50 # Emb dim
hidden_dim = 30 # Hidden dim for LSTM
sa = SA_Classifier(sen_rep, classifier,  batch_size, context)
sa.collect_params().initialize(mx.init.Xavier(), ctx = context)
loss = gluon.loss.SigmoidBCELoss()
trainer = gluon.Trainer(sa.collect_params(), 'adam', {'learning_rate': 1e-3})

In [16]:
def evaluate(net, dataIterator, context):
    dataIterator.reset()
    loss = gluon.loss.SigmoidBCELoss()
    total_L = 0.0
    total_sample_num = 0
    total_correct_num = 0
    start_log_interval_time = time.time()
    for i, batch in enumerate(dataIterator):
        data =  batch.data[0].as_in_context(context)
        label = batch.data[1].as_in_context(context)
        output = net(data)
        L = loss(output, label)
        pred = (output > 0.5).reshape((-1,))
        #print('cor = {}'.format(pred == label))
        total_L += L.sum().asscalar()
        total_sample_num += len(label)
        total_correct_num += (pred == label).sum().asscalar()
        #print('total_correct_num = {}, total_correct_num = {}'.format(total_correct_num, total_sample_num))
        if (i + 1) % log_interval == 0:
            print('[Batch {}/{}] elapsed {:.2f} s'.format(
                i + 1, dataIterator.num_data//dataIterator.batch_size + 1,
                time.time() - start_log_interval_time))
            start_log_interval_time = time.time()
    avg_L = total_L / float(total_sample_num)
    acc = total_correct_num / float(total_sample_num)
    return avg_L, acc

In [17]:
n_epoch = 5
for epoch in tqdm_notebook(range(n_epoch), desc = 'epoch'):
    ## Training
    train_data.reset()
    # Epoch training stats
    start_epoch_time = time.time()
    epoch_L = 0.0
    epoch_sent_num = 0
    epoch_wc = 0
    # Log interval training stats
    start_log_interval_time = time.time()
    log_interval_wc = 0
    log_interval_sent_num = 0
    log_interval_L = 0.0
    
    for i, batch in enumerate(train_data):
        _data = batch.data[0].as_in_context(context)
        _label = batch.data[1].as_in_context(context)
        L = 0
        wc = len(_data)
        log_interval_wc += wc
        epoch_wc += wc
        log_interval_sent_num += _data.shape[1]
        epoch_sent_num += _data.shape[1]
        with autograd.record():
            _out = sa(_data)
            L = L + loss(_out, _label).mean().as_in_context(context)
        L.backward()
        trainer.step(_data.shape[0])
        log_interval_L += L.asscalar()
        epoch_L += L.asscalar()
        if (i + 1) % log_interval == 0:
            tqdm.write('[Epoch {} Batch {}/{}] elapsed {:.2f} s, \
                    avg loss {:.6f}, throughput {:.2f}K wps'.format(
                    epoch, i + 1, train_data.num_data//train_data.batch_size,
                    time.time() - start_log_interval_time,
                    log_interval_L / log_interval_sent_num,
                    log_interval_wc / 1000 / (time.time() - start_log_interval_time)))
            # Clear log interval training stats
            start_log_interval_time = time.time()
            log_interval_wc = 0
            log_interval_sent_num = 0
            log_interval_L = 0
    end_epoch_time = time.time()
    test_avg_L, test_acc = evaluate(sa, valid_data, context)
    tqdm.write('[Epoch {}] train avg loss {:.6f}, valid acc {:.2f}, \
        valid avg loss {:.6f}, throughput {:.2f}K wps'.format(
        epoch, epoch_L / epoch_sent_num,
        test_acc, test_avg_L, epoch_wc / 1000 /
        (end_epoch_time - start_epoch_time)))

HBox(children=(IntProgress(value=0, description='epoch', max=5), HTML(value='')))

[Epoch 0 Batch 100/354] elapsed 1.67 s,                     avg loss 0.025181, throughput 0.96K wps
[Epoch 0 Batch 200/354] elapsed 1.62 s,                     avg loss 0.003241, throughput 0.99K wps
[Epoch 0 Batch 300/354] elapsed 1.61 s,                     avg loss 0.001014, throughput 0.99K wps
[Batch 100/199] elapsed 0.47 s
[Batch 200/199] elapsed 0.47 s
[Epoch 0] train avg loss 0.008539, valid acc 0.98,         valid avg loss 0.060673, throughput 0.98K wps
[Epoch 1 Batch 100/354] elapsed 1.61 s,                     avg loss 0.001011, throughput 0.99K wps
[Epoch 1 Batch 200/354] elapsed 1.62 s,                     avg loss 0.000398, throughput 0.99K wps
[Epoch 1 Batch 300/354] elapsed 1.69 s,                     avg loss 0.000343, throughput 0.94K wps
[Batch 100/199] elapsed 0.47 s
[Batch 200/199] elapsed 0.47 s
[Epoch 1] train avg loss 0.000509, valid acc 0.99,         valid avg loss 0.046512, throughput 0.98K wps
[Epoch 2 Batch 100/354] elapsed 1.62 s,                     avg lo

In [20]:
import numpy as np
# We need to specify batch_size explicitly becuase we need that in reshaping
idx = np.random.choice(len(va_idx), batch_size)
va_txt = [origin_txt[_idx] for _idx in va_idx]
va_txt = [va_txt[j] for j in idx]
va_txt = pd.DataFrame(va_txt, columns = ['txt'])
y_pred_sa = sa(nd.array([va_x[i] for i in idx], ctx = context))
pred_sa = [nd.round(val).asnumpy() for val in nd.sigmoid(y_pred_sa)] 
pred_sa_pd = pd.DataFrame(pred_sa, columns  = ['pred_sa'])
label_pd = pd.DataFrame([va_y[j] for j in idx], columns = ['label'])
result = pd.concat([va_txt, pred_sa_pd, label_pd], axis = 1)
result.head(10)

Unnamed: 0,txt,pred_sa,label
0,BROKEBACK MOUNTAIN STINKS..,1.0,0
1,"Da Vinci Code = Up, Up, Down, Down, Left, Righ...",0.0,0
2,Da Vinci Code sucks.,0.0,0
3,I love Brokeback Mountain.,1.0,1
4,dudeee i LOVED brokeback mountain!!!!,1.0,1
5,Brokeback Mountain was boring.,0.0,0
6,The Da Vinci Code is awesome..,1.0,1
7,I love The Da Vinci Code...,1.0,1
8,Brokeback Mountain was so awesome.,1.0,1
9,I am going to start reading the Harry Potter s...,1.0,1


In [21]:
result[result['pred_sa'] != result['label']].shape

(1, 3)

### Result

* Only  no comments are mis-classified