#### REF
* https://github.com/dmlc/gluon-nlp/blob/master/docs/api/notes/data_api.rst

## Data Loading

In [1]:
import os
import pandas as pd
import numpy as np
import collections
from sklearn.preprocessing import normalize
from sklearn.metrics import accuracy_score, auc
from mxnet import gluon


import time, re
import multiprocessing as mp
import itertools
from tqdm import tqdm, tqdm_notebook
import mxnet as mx
import spacy
os.environ['MXNET_ENGINE_TYPE'] = 'NaiveEngine'

In [2]:
MAX_SENTENCE_LENGTH = 20
MAX_VOCAB = 10000

### Preprocessing using Spacy

In [3]:
nlp = spacy.load("en")

word_freq = collections.Counter()
max_len = 0
num_rec = 0
print('Count words and build vocab...')
with open('../data/umich-sentiment-train.txt', 'rb') as f:
    for line in f:
        _lab, _sen = line.decode('utf8').strip().split('\t')
        words = [token.lemma_ for token in nlp(_sen) if token.is_alpha] # Stop word제거 안한 상태 
        # 제거를 위해 [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]
        if len(words) > max_len:
            max_len = len(words)
        for word in words:
            word_freq[word] += 1
        num_rec += 1

# most_common output -> list
word2idx = {x[0]: i+2 for i, x in enumerate(word_freq.most_common(MAX_VOCAB - 2))}
word2idx ['PAD'] = 0
word2idx['UNK'] = 1

idx2word= {i:v for v, i in word2idx.items()}
vocab_size = len(word2idx)

print('Prepare data...')
y = []
x = []
origin_txt = []
with open('../data/umich-sentiment-train.txt', 'rb') as f:
    for line in f:
        _label, _sen = line.decode('utf8').strip().split('\t')
        origin_txt.append(_sen)
        y.append(int(_label))
        words = [token.lemma_ for token in nlp(_sen) if token.is_alpha] # Stop word제거 안한 상태
        words = [x for x in words if x != '-PRON-'] # '-PRON-' 제거
        _seq = []
        for word in words:
            if word in word2idx.keys():
                _seq.append(word2idx[word])
            else:
                _seq.append(word2idx['UNK'])
        if len(_seq) < MAX_SENTENCE_LENGTH:
            _seq.extend([0] * ((MAX_SENTENCE_LENGTH) - len(_seq)))
        else:
            _seq = _seq[:MAX_SENTENCE_LENGTH]
        x.append(_seq)

pd.DataFrame(y, columns = ['yn']).reset_index().groupby('yn').count().reset_index()

Count words and build vocab...
Prepare data...


Unnamed: 0,yn,index
0,0,3091
1,1,3995


In [4]:
## Data process - tr/va split and define iterator

tr_idx = np.random.choice(range(len(x)), int(len(x) * .8))
va_idx = [x for x in range(len(x)) if x not in tr_idx]

tr_x = [x[i] for i in tr_idx]
tr_y = [y[i] for i in tr_idx]
tr_origin = [origin_txt[i] for i in tr_idx]

va_x = [x[i] for i in va_idx]
va_y = [y[i] for i in va_idx]
va_origin = [origin_txt[i] for i in va_idx]

batch_size = 16

learning_rate = .0002
log_interval = 100


train_data = mx.io.NDArrayIter(data=[tr_x, tr_y], batch_size=batch_size, shuffle = False)
valid_data = mx.io.NDArrayIter(data=[va_x, va_y], batch_size=batch_size, shuffle = False)

In [5]:
from mxnet import gluon, autograd, nd
from mxnet.gluon import nn, rnn
from mxnet.ndarray.linalg import gemm2
import mxnet as mx
context = mx.gpu()

#### Sentence Representation

In [6]:
class Sentence_Representation(nn.Block):
    def __init__(self, **kwargs):
        super(Sentence_Representation, self).__init__()
        for (k, v) in kwargs.items():
            setattr(self, k, v)
        with self.name_scope():
            self.embed = nn.Embedding(self.vocab_size, self.emb_dim)
            self.conv1 = nn.Conv2D(channels = 8, kernel_size = (3, self.emb_dim), activation = 'relu')
            self.maxpool1 = nn.MaxPool2D(pool_size = (self.max_sentence_length -3 + 1, 1), strides = (1, 1))
            self.conv2 = nn.Conv2D(channels = 8, kernel_size = (4, self.emb_dim), activation = 'relu')            
            self.maxpool2 = nn.MaxPool2D(pool_size = (self.max_sentence_length -4 + 1, 1), strides = (1, 1))
            self.conv3 = nn.Conv2D(channels = 8, kernel_size = (5, self.emb_dim), activation = 'relu')
            self.maxpool3 = nn.MaxPool2D(pool_size = (self.max_sentence_length -5 + 1, 1), strides = (1, 1))
            self.conv4 = nn.Conv2D(channels = 8, kernel_size = (6, self.emb_dim), activation = 'relu') 
            self.maxpool4 = nn.MaxPool2D(pool_size = (self.max_sentence_length -6 + 1, 1), strides = (1, 1))

    def forward(self, x):
        embeds = self.embed(x) # batch * time step * embedding
        embeds = embeds.expand_dims(axis = 1)
        _x1 = self.conv1(embeds)
        _x1 = self.maxpool1(_x1)
        _x1 = nd.reshape(_x1, shape = (-1, 8))
        
        _x2 = self.conv2(embeds)
        _x2 = self.maxpool2(_x2)
        _x2 = nd.reshape(_x2, shape = (-1, 8))
        
        _x3 = self.conv3(embeds)
        _x3 = self.maxpool3(_x3)
        _x3 = nd.reshape(_x3, shape = (-1, 8))
        
        _x4 = self.conv4(embeds)
        _x4 = self.maxpool4(_x4)
        _x4 = nd.reshape(_x4, shape = (-1, 8))

        _x = nd.concat(_x1, _x2, _x3, _x4)
        return _x

#### Classifier

In [7]:
classifier = nn.Sequential()
classifier.add(nn.Dense(16, activation = 'relu'))
classifier.add(nn.Dense(8, activation = 'relu'))
classifier.add(nn.Dense(1))
classifier.collect_params().initialize(mx.init.Xavier(), ctx = context)

#### Sentiment analysis classifier

In [8]:
class SA_CNN_Classifier(nn.Block):
    def __init__(self, sen_rep, classifier, context, **kwargs):
        super(SA_CNN_Classifier, self).__init__(**kwargs)
        self.context = context
        with self.name_scope():
            self.sen_rep = sen_rep
            self.classifier = classifier
            
    def forward(self, x):
        # sentence representation할 때 hidden의 context가 cpu여서 오류 발생. context를 gpu로 전환
        x = self.sen_rep(x)
        res = self.classifier(x)
        return res

#### Initiate sentiment classifier

In [9]:
emb_dim = 50 # Emb dim
param = {'emb_dim': emb_dim, 'vocab_size': vocab_size, 'max_sentence_length': MAX_SENTENCE_LENGTH, 'dropout': .2}
sen_rep = Sentence_Representation(**param)
sen_rep.collect_params().initialize(mx.init.Xavier(), ctx = context)


In [10]:
sa = SA_CNN_Classifier(sen_rep, classifier, context)
loss = gluon.loss.SigmoidBCELoss()
trainer = gluon.Trainer(sa.collect_params(), 'adam', {'learning_rate': 1e-3})

In [11]:
def evaluate(net, dataIterator, context):
    dataIterator.reset()
    loss = gluon.loss.SigmoidBCELoss()
    total_L = 0.0
    total_sample_num = 0
    total_correct_num = 0
    start_log_interval_time = time.time()
    for i, batch in enumerate(dataIterator):
        data =  batch.data[0].as_in_context(context)
        label = batch.data[1].as_in_context(context)
        output = net(data)
        L = loss(output, label)
        pred = (output > 0.5).reshape((-1,))
        total_L += L.sum().asscalar()
        total_sample_num += len(label)
        total_correct_num += (pred == label).sum().asscalar()
        if (i + 1) % log_interval == 0:
            print('[Batch {}/{}] elapsed {:.2f} s'.format(
                i + 1, dataIterator.num_data//dataIterator.batch_size,
                time.time() - start_log_interval_time))
            start_log_interval_time = time.time()
    avg_L = total_L / float(total_sample_num)
    acc = total_correct_num / float(total_sample_num)
    return avg_L, acc

In [12]:
n_epoch = 5
for epoch in tqdm_notebook(range(n_epoch), desc = 'epoch'):
    ## Training
    train_data.reset()
    # Epoch training stats
    start_epoch_time = time.time()
    epoch_L = 0.0
    epoch_sent_num = 0
    epoch_wc = 0
    # Log interval training stats
    start_log_interval_time = time.time()
    log_interval_wc = 0
    log_interval_sent_num = 0
    log_interval_L = 0.0
    
    for i, batch in enumerate(train_data):
        _data = batch.data[0].as_in_context(context)
        _label = batch.data[1].as_in_context(context)
        L = 0
        wc = len(_data)
        log_interval_wc += wc
        epoch_wc += wc
        log_interval_sent_num += _data.shape[1]
        epoch_sent_num += _data.shape[1]
        with autograd.record():
            _out = sa(_data)
            L = L + loss(_out, _label).mean().as_in_context(context)
        L.backward()
        trainer.step(_data.shape[0])
        log_interval_L += L.asscalar()
        epoch_L += L.asscalar()
        if (i + 1) % log_interval == 0:
            tqdm.write('[Epoch {} Batch {}/{}] elapsed {:.2f} s, \
                    avg loss {:.6f}, throughput {:.2f}K wps'.format(
                    epoch, i + 1, train_data.num_data//train_data.batch_size,
                    time.time() - start_log_interval_time,
                    log_interval_L / log_interval_sent_num,
                    log_interval_wc / 1000 / (time.time() - start_log_interval_time)))
            # Clear log interval training stats
            start_log_interval_time = time.time()
            log_interval_wc = 0
            log_interval_sent_num = 0
            log_interval_L = 0
    end_epoch_time = time.time()
    test_avg_L, test_acc = evaluate(sa, valid_data, context)
    tqdm.write('[Epoch {}] train avg loss {:.6f}, valid acc {:.2f}, \
        valid avg loss {:.6f}, throughput {:.2f}K wps'.format(
        epoch, epoch_L / epoch_sent_num,
        test_acc, test_avg_L, epoch_wc / 1000 /
        (end_epoch_time - start_epoch_time)))

HBox(children=(IntProgress(value=0, description='epoch', max=5), HTML(value='')))

[Epoch 0 Batch 100/354] elapsed 0.88 s,                     avg loss 0.027326, throughput 1.82K wps
[Epoch 0 Batch 200/354] elapsed 0.81 s,                     avg loss 0.003084, throughput 1.97K wps
[Epoch 0 Batch 300/354] elapsed 0.81 s,                     avg loss 0.001511, throughput 1.98K wps
[Batch 100/199] elapsed 0.37 s
[Epoch 0] train avg loss 0.009157, valid acc 0.99,         valid avg loss 0.016872, throughput 1.93K wps
[Epoch 1 Batch 100/354] elapsed 0.81 s,                     avg loss 0.000595, throughput 1.97K wps
[Epoch 1 Batch 200/354] elapsed 0.81 s,                     avg loss 0.000603, throughput 1.98K wps
[Epoch 1 Batch 300/354] elapsed 0.81 s,                     avg loss 0.000287, throughput 1.97K wps
[Batch 100/199] elapsed 0.37 s
[Epoch 1] train avg loss 0.000450, valid acc 0.99,         valid avg loss 0.014424, throughput 1.97K wps
[Epoch 2 Batch 100/354] elapsed 0.81 s,                     avg loss 0.000188, throughput 1.97K wps
[Epoch 2 Batch 200/354] elap

### Prediction

In [13]:
def get_pred(net, iterator):
    pred_sa = []
    label_sa = []
    va_text = []
    iterator.reset()
    for i, batch in enumerate(iterator):
        if i % 100 == 0:
            print('i = {}'.format(i))
        data =  batch.data[0].as_in_context(context)
        label = batch.data[1].as_in_context(context)
        output = net(data)
        L = loss(output, label)
        pred = (nd.sigmoid(output) > 0.5).reshape((-1,))
        pred_sa.extend(pred.asnumpy())
        label_sa.extend(label.asnumpy())
        va_text.extend([' '.join([idx2word[np.int(x)] for x in y.asnumpy() if idx2word[np.int(x)] is not 'PAD']) for y in data])
    pred_sa_pd = pd.DataFrame(pred_sa, columns  = ['pred_sa'])
    label_pd = pd.DataFrame(label_sa, columns = ['label'])
    text_pd = pd.DataFrame(va_text, columns = ['text'])
    res = pd.concat([text_pd, pred_sa_pd, label_pd], axis = 1)
    return res

## Classification results

In [14]:
result = get_pred(sa, valid_data)

i = 0
i = 100


In [15]:
result[result.pred_sa != result.label].shape

(19, 3)

In [16]:
result[result.pred_sa != result.label].head(10)

Unnamed: 0,text,pred_sa,label
1,that not even an exaggeration and at midnight ...,0.0,1.0
14,the da vinci code backtory on various religiou...,0.0,1.0
566,like mission impossible but hate tom cruise ge...,0.0,1.0
914,harry potter and the philosopher stone rowling...,0.0,1.0
921,be plan so no biggie be admire sister harry po...,0.0,1.0
924,this be undoubtedly a big deal as do not purch...,0.0,1.0
937,write a harry potter poem for a chance to win ...,0.0,1.0
948,enjoy discuss harry potter and know there be s...,0.0,1.0
978,harry potter and the veil of darkness by ocean...,0.0,1.0
981,enjoy take harry potter quiz,0.0,1.0


In [17]:
wrong = result[result.pred_sa != result.label]
for i in range(20):
    print('{} --- Label:{}'.format(wrong['text'].iloc[i], wrong['label'].iloc[i]))

that not even an exaggeration and at midnight go to wal mart to buy the da vinci code which be --- Label:1.0
the da vinci code backtory on various religious historical figure and such be interesting at time but more of scifi --- Label:1.0
like mission impossible but hate tom cruise get that straight update day in a row like magic and shit --- Label:1.0
harry potter and the philosopher stone rowling strangely a fan of hp fanfic but not of the book --- Label:1.0
be plan so no biggie be admire sister harry potter collection while wait --- Label:1.0
this be undoubtedly a big deal as do not purchase movie a la dvd the only one own be series --- Label:1.0
write a harry potter poem for a chance to win a fabulous harry potter prize --- Label:1.0
enjoy discuss harry potter and know there be still a lot for to learn about --- Label:1.0
harry potter and the veil of darkness by oceans phoenix word total and count year fic actually look fairly interesting --- Label:1.0
enjoy take harry potter quiz 

IndexError: single positional indexer is out-of-bounds