In [6]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/kionkim/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [7]:
!python -m spacy download en


[93m    Linking successful[0m
    /opt/venv/lib/python3.6/site-packages/en_core_web_sm -->
    /opt/venv/lib/python3.6/site-packages/spacy/data/en

    You can now load the model via spacy.load('en')



In [8]:
import os
import pandas as pd
import numpy as np
import nltk
import collections
from sklearn.preprocessing import normalize

In [9]:
word_freq = collections.Counter()
max_len = 0
num_rec = 0

with open('../data/umich-sentiment-train.txt', 'rb') as f:
    for line in f:
        label, sentence = line.decode('utf8').strip().split('\t')
        words = nltk.word_tokenize(sentence.lower())
        if len(words) > max_len:
            max_len = len(words)
        for word in words:
            word_freq[word] += 1
        num_rec += 1

In [10]:
MAX_FEATURES = 2000
MAX_SENTENCE_LENGTH = 40
# most_common output -> list
word2idx = {x[0]: i+2 for i, x in enumerate(word_freq.most_common(MAX_FEATURES - 2))}
word2idx ['PAD'] = 0
word2idx['UNK'] = 1

In [11]:
idx2word= {i:v for v, i in word2idx.items()}
vocab_size = len(word2idx)

In [12]:
y = []
x = []
origin_txt = []
with open('../data/umich-sentiment-train.txt', 'rb') as f:
    for line in f:
        _label, _sentence = line.decode('utf8').strip().split('\t')
        origin_txt.append(_sentence)
        y.append(int(_label))
        words = nltk.word_tokenize(_sentence.lower())
        _seq = []
        for word in words:
            if word in word2idx.keys():
                _seq.append(word2idx[word])
            else:
                _seq.append(word2idx['UNK'])
        if len(_seq) < MAX_SENTENCE_LENGTH:
            _seq.extend([0] * ((MAX_SENTENCE_LENGTH) - len(_seq)))
        else:
            _seq = _seq[:MAX_SENTENCE_LENGTH]
        x.append(_seq)

In [13]:
pd.DataFrame(y, columns = ['yn']).reset_index().groupby('yn').count().reset_index()

Unnamed: 0,yn,index
0,0,3091
1,1,3995


## Sentence representation: Average of BOW

In [14]:
def one_hot(x, vocab_size):
    res = np.zeros(shape = (vocab_size))
    res[x] = 1
    return res

In [15]:
x_1 = np.array([np.sum(np.array([one_hot(word, MAX_FEATURES) for word in example]), axis = 0) for example in x])

## Data process - tr/va split and define iterator

In [16]:
tr_idx = np.random.choice(range(x_1.shape[0]), int(x_1.shape[0] * .8))
va_idx = [x for x in range(x_1.shape[0]) if x not in tr_idx]

In [17]:
tr_x = x_1[tr_idx, :]
tr_y = [y[i] for i in tr_idx]
va_x = x_1[va_idx, :]
va_y = [y[i] for i in va_idx]

In [18]:
tr_x.shape

(5668, 2000)

## Classification

* If we transform sentence into machine-understandable form via average of BOW, we can separate representation and classification
* Here, we will apply various classifiers

### XGBoost

In [19]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [20]:
xgb = XGBClassifier()
xgb.fit(tr_x, tr_y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [22]:
y_pred_xgb = xgb.predict(va_x)
pred_xgb = [round(val) for val in y_pred_xgb]

# Check predictions
#pred_pd= pd.DataFrame(pred_xgb, columns = ['pred']).reset_index()
#pred_pd.groupby(['pred']).count()

  if diff:


In [23]:
accuracy_xgb = accuracy_score(va_y, pred_xgb)
print('Accuracy: %.2f%%'%(accuracy_xgb * 100.0))

Accuracy: 97.68%


### Random Forest

In [24]:
from sklearn.ensemble import RandomForestClassifier

In [25]:
rf = RandomForestClassifier()
rf.fit(tr_x, tr_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [26]:
y_pred_rf = rf.predict(va_x)
pred_rf = [round(val) for val in y_pred_rf]

In [27]:
accuracy_rf = accuracy_score(va_y, pred_rf)
print('Accuracy: %.2f%%'%(accuracy_rf * 100.0))

Accuracy: 97.87%


### SVM

In [28]:
from sklearn import svm

In [29]:
models = (svm.SVC(kernel = 'linear', C = 1.0), # C: SVM Regularization parameter
          svm.LinearSVC(C = 1.0),
          svm.SVC(kernel = 'rbf', gamma = .7, C = 1.0),
          svm.SVC(kernel = 'poly', degree = 3, C = 1.0)
)

models = (mdl.fit(tr_x, tr_y) for mdl in models)

In [30]:
y_pred_svm = (mdl.predict(va_x) for mdl in models)
pred_svm = [[round(val) for val in _pred] for _pred in y_pred_svm]

In [31]:
accuracy_svm = [accuracy_score(va_y, pred) for pred in pred_svm]
print('Accuracy: {}'.format(np.round(accuracy_svm, 4)*100))

Accuracy: [99.08 99.05 94.02 56.06]


## Check results

In [33]:
va_txt = pd.DataFrame(np.array([origin_txt[idx] for idx in va_idx]), columns = ['txt'])
pred_rf_pd = pd.DataFrame(pred_rf, columns  = ['pred_rf'])
pred_xgb_pd = pd.DataFrame(pred_xgb, columns  = ['pred_xgb'])
pred_svm_svc_pd = pd.DataFrame(pred_svm[2], columns  = ['pred_svm'])
label_pd = pd.DataFrame(va_y, columns = ['label'])
result = pd.concat([va_txt, pred_rf_pd, pred_xgb_pd, pred_svm_svc_pd, label_pd], axis = 1)

In [34]:
result.head()

Unnamed: 0,txt,pred_rf,pred_xgb,pred_svm,label
0,that's not even an exaggeration ) and at midni...,1,0,1,1
1,I thought the Da Vinci Code was a pretty good ...,1,1,1,1
2,The Da Vinci Code was REALLY good.,0,0,1,1
3,THE DA VINCI CODE is AN AWESOME BOOK....,1,1,1,1
4,"Thing is, I enjoyed The Da Vinci Code.",0,0,1,1


In [36]:
print('# of error case from RF : {}'.format(result[result['pred_rf'] != result['label']].shape[0]))
print('# of error case from XGB: {}'.format(result[result['pred_xgb'] != result['label']].shape[0]))
print('# of error case from SVM: {}'.format(result[result['pred_svm'] != result['label']].shape[0]))


# of error case from RF : 67
# of error case from XGB: 73
# of error case from SVM: 188


### DNN with embedding layer

In [37]:
import mxnet as mx
from mxnet import gluon, autograd, nd
from mxnet.gluon import nn
context = mx.gpu()

In [38]:
class MLP(nn.Block):
    def __init__(self, input_dim, emb_dim, **kwargs):
        super(MLP, self).__init__(**kwargs)
        with self.name_scope():
            self.embed = nn.Embedding(input_dim = input_dim, output_dim = emb_dim)
            self.dense1 = nn.Dense(64)
            #self.dense2 = nn.Dense(32, activation = 'relu')
            self.bn = nn.BatchNorm()
            self.dense2 = nn.Dense(2)
            
    def forward(self, x):
        x = self.embed(x)
        x = self.dense1(x)
        x = self.bn(x)
        x = nd.relu(x)
        x = self.dense2(x)
        return x

In [39]:
def acc_f(label, pred):
    pred = pred.ravel()
    label = label.ravel()
    #print('pred = {}'.format(pred))
    #print('label = {}'.format(label))
    corr = ((pred > 0.5) == label)*1.
    return (((pred > 0.5) == label)*1.).mean()
tr_metric = mx.metric.CustomMetric(acc_f)
va_metric = mx.metric.CustomMetric(acc_f)

In [40]:
n_epoch = 10
batch_size = 64
from tqdm import tqdm, tqdm_notebook
os.environ['MXNET_ENGINE_TYPE'] = 'NaiveEngine'

In [41]:
train_data = mx.io.NDArrayIter(data=[tr_x, tr_y], batch_size=batch_size, shuffle = False)
valid_data = mx.io.NDArrayIter(data=[va_x, va_y], batch_size=batch_size, shuffle = False)

In [42]:
mlp = MLP(input_dim = MAX_FEATURES, emb_dim = 50)
mlp.collect_params().initialize(mx.init.Xavier(), ctx = context)
loss = gluon.loss.SoftmaxCELoss()
trainer = gluon.Trainer(mlp.collect_params(), 'adam', {'learning_rate': 1e-3})

In [43]:
for epoch in tqdm_notebook(range(n_epoch), desc = 'epoch'):
    ## Training
    train_data.reset()
    n_obs = 0
    _total_los = 0
    pred = []
    label = []
    for i, batch in enumerate(train_data):
        _dat = batch.data[0].as_in_context(context)
        _label = batch.data[1].as_in_context(context)
        with autograd.record():
            _out = mlp(_dat)
            _los = nd.sum(loss(_out, _label)) # 배치의 크기만큼의 loss가 나옴
            _los.backward()
        trainer.step(_dat.shape[0])
        n_obs += _dat.shape[0]
        #print(n_obs)
        _total_los += nd.sum(_los).asnumpy()
        # Epoch loss를 구하기 위해서 결과물을 계속 쌓음
        pred.extend(nd.softmax(_out)[:,1].asnumpy()) # 두번째 컬럼의 확률이 예측 확률
        label.extend(_label.asnumpy())
    #print(pred)
    #print([round(p) for p in pred]) # 기본이 float임
    #print(label)
    #print('**** ' + str(n_obs))
    #print(label[:10])
    #print(pred[:10])
    #print([round(p) for p in pred][:10])
    tr_acc = accuracy_score(label, [round(p) for p in pred])
    tr_loss = _total_los/n_obs
    
    ### Evaluate training
    valid_data.reset()
    n_obs = 0
    _total_los = 0
    pred = []
    label = []
    for i, batch in enumerate(valid_data):
        _dat = batch.data[0].as_in_context(context)
        _label = batch.data[1].as_in_context(context)
        _out = mlp(_dat)
        _pred_score = nd.softmax(_out)
        n_obs += _dat.shape[0]
        _total_los += nd.sum(loss(_out, _label)).asnumpy()
        pred.extend(nd.softmax(_out)[:,1].asnumpy())
        label.extend(_label.asnumpy())
    va_acc = accuracy_score(label, [round(p) for p in pred])
    va_loss = _total_los/n_obs
    tqdm.write('Epoch {}: tr_loss = {}, tr_acc= {}, va_loss = {}, va_acc= {}'.format(epoch, tr_loss, tr_acc, va_loss, va_acc))

HBox(children=(IntProgress(value=0, description='epoch', max=10), HTML(value='')))

Epoch 0: tr_loss = [0.06689082], tr_acc= 0.9777036516853933, va_loss = [0.71115905], va_acc= 0.6271875
Epoch 1: tr_loss = [0.00672516], tr_acc= 0.9989466292134831, va_loss = [0.13715069], va_acc= 0.9653125
Epoch 2: tr_loss = [0.00187768], tr_acc= 0.9996488764044944, va_loss = [0.23311073], va_acc= 0.9325
Epoch 3: tr_loss = [0.00110336], tr_acc= 0.9998244382022472, va_loss = [0.03475095], va_acc= 0.9865625
Epoch 4: tr_loss = [0.00075083], tr_acc= 0.9998244382022472, va_loss = [0.03498295], va_acc= 0.9865625
Epoch 5: tr_loss = [0.00064765], tr_acc= 0.9996488764044944, va_loss = [0.03444985], va_acc= 0.9859375
Epoch 6: tr_loss = [0.0005337], tr_acc= 0.9998244382022472, va_loss = [0.03331061], va_acc= 0.9871875
Epoch 7: tr_loss = [0.00047138], tr_acc= 0.9998244382022472, va_loss = [0.03248437], va_acc= 0.9871875
Epoch 8: tr_loss = [0.00042152], tr_acc= 0.9998244382022472, va_loss = [0.03190098], va_acc= 0.9875
Epoch 9: tr_loss = [0.00037398], tr_acc= 0.9998244382022472, va_loss = [0.031760

In [44]:
y_pred_mlp = mlp(nd.array(va_x, ctx = context))
# softmax를 적용하고
# 두번째 열을 뽑아와서
# nd.round 함수를 적용해서 0/1 예측값을 얻고
# numpy array로 바꾸고
# 첫번째 원소를 뽑아서 예측 label로 사용
pred_mlp = [nd.round(val).asnumpy()[0] for val in nd.softmax(y_pred_mlp)[:, 1]] 

In [53]:
accuracy_mlp = accuracy_score(va_y, pred_mlp)
print('Accuracy: %.2f%%'%(accuracy_mlp * 100.0))

Accuracy: 98.76%


#### DNN without embedding

In [46]:
class MLP(nn.Block):
    def __init__(self, **kwargs):
        super(MLP, self).__init__(**kwargs)
        with self.name_scope():
            self.dense1 = nn.Dense(64)
            #self.dense2 = nn.Dense(32, activation = 'relu')
            self.bn = nn.BatchNorm()
            self.dense2 = nn.Dense(2)
            
    def forward(self, x):
        x = self.dense1(x)
        x = self.bn(x)
        x = nd.relu(x)
        x = self.dense2(x)
        return x

In [47]:
n_epoch = 10
batch_size = 64
from tqdm import tqdm, tqdm_notebook

In [48]:
mlp_no_embedding = MLP()
mlp_no_embedding.collect_params().initialize(mx.init.Xavier(), ctx = context)
loss = gluon.loss.SoftmaxCELoss()
trainer = gluon.Trainer(mlp_no_embedding.collect_params(), 'adam', {'learning_rate': 1e-3})

In [49]:
for epoch in tqdm_notebook(range(n_epoch), desc = 'epoch'):
    ## Training
    train_data.reset()
    n_obs = 0
    _total_los = 0
    pred = []
    label = []
    for i, batch in enumerate(train_data):
        _dat = batch.data[0].as_in_context(context)
        _label = batch.data[1].as_in_context(context)
        with autograd.record():
            _out = mlp_no_embedding(_dat)
            _los = nd.sum(loss(_out, _label)) # 배치의 크기만큼의 loss가 나옴
            _los.backward()
        trainer.step(_dat.shape[0])
        n_obs += _dat.shape[0]
        #print(n_obs)
        _total_los += nd.sum(_los).asnumpy()
        # Epoch loss를 구하기 위해서 결과물을 계속 쌓음
        pred.extend(nd.softmax(_out)[:,1].asnumpy()) # 두번째 컬럼의 확률이 예측 확률
        label.extend(_label.asnumpy())
    #print(pred)
    #print([round(p) for p in pred]) # 기본이 float임
    #print(label)
    #print('**** ' + str(n_obs))
    #print(label[:10])
    #print(pred[:10])
    #print([round(p) for p in pred][:10])
    tr_acc = accuracy_score(label, [round(p) for p in pred])
    tr_loss = _total_los/n_obs
    
    ### Evaluate training
    valid_data.reset()
    n_obs = 0
    _total_los = 0
    pred = []
    label = []
    for i, batch in enumerate(valid_data):
        _dat = batch.data[0].as_in_context(context)
        _label = batch.data[1].as_in_context(context)
        _out = mlp(_dat)
        _pred_score = nd.softmax(_out)
        n_obs += _dat.shape[0]
        _total_los += nd.sum(loss(_out, _label)).asnumpy()
        pred.extend(nd.softmax(_out)[:,1].asnumpy())
        label.extend(_label.asnumpy())
    va_acc = accuracy_score(label, [round(p) for p in pred])
    va_loss = _total_los/n_obs
    tqdm.write('Epoch {}: tr_loss = {}, tr_acc= {}, va_loss = {}, va_acc= {}'.format(epoch, tr_loss, tr_acc, va_loss, va_acc))

HBox(children=(IntProgress(value=0, description='epoch', max=10), HTML(value='')))

Epoch 0: tr_loss = [0.19050199], tr_acc= 0.9357443820224719, va_loss = [0.0317603], va_acc= 0.9878125
Epoch 1: tr_loss = [0.01419211], tr_acc= 0.9985955056179775, va_loss = [0.0317603], va_acc= 0.9878125
Epoch 2: tr_loss = [0.00433564], tr_acc= 0.9996488764044944, va_loss = [0.0317603], va_acc= 0.9878125
Epoch 3: tr_loss = [0.0024576], tr_acc= 0.9996488764044944, va_loss = [0.0317603], va_acc= 0.9878125
Epoch 4: tr_loss = [0.00149932], tr_acc= 1.0, va_loss = [0.0317603], va_acc= 0.9878125
Epoch 5: tr_loss = [0.00096115], tr_acc= 1.0, va_loss = [0.0317603], va_acc= 0.9878125
Epoch 6: tr_loss = [0.00070163], tr_acc= 1.0, va_loss = [0.0317603], va_acc= 0.9878125
Epoch 7: tr_loss = [0.00055028], tr_acc= 1.0, va_loss = [0.0317603], va_acc= 0.9878125
Epoch 8: tr_loss = [0.00044363], tr_acc= 1.0, va_loss = [0.0317603], va_acc= 0.9878125
Epoch 9: tr_loss = [0.00036642], tr_acc= 1.0, va_loss = [0.0317603], va_acc= 0.9878125



In [50]:
y_pred_mlp_no_embedding = mlp_no_embedding(nd.array(va_x, ctx = context))
# softmax를 적용하고
# 두번째 열을 뽑아와서
# nd.round 함수를 적용해서 0/1 예측값을 얻고
# numpy array로 바꾸고
# 첫번째 원소를 뽑아서 예측 label로 사용
pred_mlp_no_embedding = [nd.round(val).asnumpy()[0] for val in nd.softmax(y_pred_mlp)[:, 1]] 

In [52]:
accuracy_mlp_no_embedding = accuracy_score(va_y, pred_mlp_no_embedding)
print('Accuracy: %.2f%%'%(accuracy_mlp_no_embedding * 100.0))

Accuracy: 98.76%


## Errors

In [54]:
va_txt = pd.DataFrame(np.array([origin_txt[idx] for idx in va_idx]), columns = ['txt'])
pred_mlp_no_embedding_pd = pd.DataFrame(pred_mlp_no_embedding, columns  = ['pred_mlp_no_embedding'])
label_pd = pd.DataFrame(va_y, columns = ['label'])
result = pd.concat([va_txt, pred_mlp_no_embedding_pd, label_pd], axis = 1)

In [55]:
result[result['pred_mlp_no_embedding'] != result['label']].shape

(39, 3)

In [56]:
_pred_score[:, 0]


[9.9993134e-01 9.9995065e-01 9.9994564e-01 9.9993896e-01 9.9991822e-01
 9.9994397e-01 9.9994290e-01 9.9993896e-01 9.9993610e-01 1.7375102e-02
 2.7934110e-03 3.2467667e-02 1.5153956e-05 7.8517452e-02 7.9589250e-04
 3.2973409e-01 2.2070535e-07 9.3622890e-05 8.1146415e-04 1.4700463e-05
 1.6620104e-05 3.6849084e-05 4.6369656e-05 1.1681904e-04 1.2161185e-02
 2.9147562e-01 3.3054352e-02 4.5397133e-05 1.2546702e-04 6.2344661e-08
 3.3202698e-05 5.2164240e-05 9.2523689e-05 1.2638983e-04 7.7245044e-05
 1.0179579e-04 3.1921965e-05 2.9245422e-05 6.6700428e-05 2.1928401e-05
 1.0722188e-04 2.4290648e-05 1.1346096e-02 3.3071337e-05 9.2709051e-05
 6.6700428e-05 1.0328748e-01 4.9137998e-05 2.8014177e-04 9.0252848e-05
 7.4576818e-08 1.1681904e-04 9.5113588e-05 4.1982674e-04 2.0698872e-03
 3.0947605e-04 7.0564187e-04 1.7729572e-04 6.6714069e-06 1.9732476e-04
 6.1972554e-05 3.5129324e-05 1.0271466e-02 3.1249336e-04]
<NDArray 64 @gpu(0)>