In [38]:
# -*- coding: utf-8 -*-

import os
import sys
import numpy as np
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D, GlobalAveragePooling1D, LSTM
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model, load_model
from keras.models import Sequential
from keras.initializers import Constant
import json
import pandas as pd

In [39]:
BASE_DIR = ''
TEXT_DATA_DIR = os.path.join(BASE_DIR, 'data')
MAX_SEQUENCE_LENGTH = 800

In [3]:
from keras.datasets import imdb
#a = (x_train, y_train), b = (x_test, y_test)
a,b = imdb.load_data(path="imdb.npz",
                     num_words=None,
                     skip_top=10,
                     maxlen=MAX_SEQUENCE_LENGTH,
                     seed=113,
                     start_char=1,
                     oov_char=2,
                     index_from=3)

x_train = a[0]
y_train = a[1]
x_test = b[0]
y_test = b[1]

word_index = imdb.get_word_index()

In [4]:
y_test[:10]

array([1, 1, 0, 0, 1, 1, 1, 1, 1, 0], dtype=int64)

In [5]:
len(word_index)

88584

In [6]:
#plot the lengths distribution when the max len set as 1000

from matplotlib import pyplot as plt

p_lengths = []
for p in x_train:
    p_lengths.append(len(p))

plt.hist(p_lengths,bins=20)
plt.show()

<Figure size 640x480 with 1 Axes>

In [7]:
x_train = pad_sequences(x_train, maxlen=MAX_SEQUENCE_LENGTH)
x_test = pad_sequences(x_test, maxlen=MAX_SEQUENCE_LENGTH)
y_train = to_categorical(np.asarray(y_train))
y_test = to_categorical(np.asarray(y_test))

print('Shape of data tensor:', x_train.shape)
print('Shape of label tensor:', y_train.shape)

Shape of data tensor: (25000, 800)
Shape of label tensor: (25000, 2)


In [8]:
y_test[:10]

array([[0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.]], dtype=float32)

In [9]:
# input shape is the vocabulary count used for the movie reviews (10,000 words)
vocab_size = len(word_index) + 4

# build RNN model
model = Sequential()
model.add(Embedding(vocab_size, 16))
model.add(LSTM(50))
model.add(Dense(2, activation='softmax'))
model.summary()

model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 16)          1417408   
_________________________________________________________________
lstm_1 (LSTM)                (None, 50)                13400     
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 102       
Total params: 1,430,910
Trainable params: 1,430,910
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.fit(x_train, y_train,
          batch_size=128,
          epochs=10,
          validation_data=(x_test, y_test))

In [None]:
# We can add a dropout layer into this LSTM model later. It will help!

In [None]:
#model.save('EX2_RNN(LSTM)_trained_model')

In [10]:
model = load_model('EX2_RNN(LSTM)_trained_model')

In [11]:
model.evaluate(x_test,y_test)



[0.6308990193229622, 0.8532014792063658]

In [12]:
# second, prepare text samples and their labels
print('Processing text dataset')

transcripts_x = []
transcripts_y = []

file_list = os.listdir('data')
for file in file_list:
    with open('data/' + file, 'r') as f:
        transcripts = json.load(f)
        transcripts_x.extend(transcripts['text'].values())
        transcripts_y.extend(transcripts['sentiment'].values())
        
#build a pandas df and drop all neutral rows
d = {'review': transcripts_x, 'label': transcripts_y}
df_transcripts = pd.DataFrame(data=d)

#drop all neutral rows
df_transcripts = df_transcripts[df_transcripts['label'] != 'neutral']

#replace positive and negative to pos and neg
df_transcripts['label'] = df_transcripts['label'].str.replace('positive','1')
df_transcripts['label'] = df_transcripts['label'].str.replace('negative','0')

transcripts_x = df_transcripts['review'].tolist()
transcripts_y = df_transcripts['label'].astype('int32').tolist()

Processing text dataset


In [13]:
transcripts_x[2]

"I'll just remind you that the units – those do not count in our unit totals nor do the units from Whole Foods Market. So, yeah, I would say essentially with that backdrop, we're still very, very encouraged by the demand and the reception from customers on the consumer side. We have Amazon fulfilled units are still growing faster than paid units. 3P is now up to 53% of total paid units."

In [14]:
transcripts_y[:10]

[1, 1, 1, 1, 0, 1, 1, 1, 1, 1]

In [15]:
def my_tokenize(text):
    tokens = text_to_word_sequence(text, 
                                   filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', 
                                   lower=True, #all words in word_index are in lower case
                                   split=' ') 
    return tokens

In [16]:
transcripts_x_tokens = []
for t in transcripts_x:
    transcripts_x_tokens.append(my_tokenize(t))

In [17]:
transcripts_x_tokens[:50]

[['great',
  'thank',
  'you',
  'for',
  'taking',
  'my',
  'question',
  'i',
  'guess',
  'the',
  'big',
  'one',
  'is',
  'the',
  'deceleration',
  'in',
  'unit',
  'growth',
  'or',
  'online',
  'stores',
  'which',
  'are',
  'probably',
  'related',
  'to',
  'that',
  'i',
  'know',
  "it's",
  'a',
  'tough',
  '3q',
  'comp',
  'but',
  'could',
  'you',
  'comment',
  'a',
  'little',
  'bit',
  'about',
  'that',
  'and',
  'then',
  'kind',
  'of',
  'what',
  'initiatives',
  'could',
  'be',
  'most',
  'interesting',
  'to',
  'maybe',
  'reaccelerate',
  'that',
  'over',
  'the',
  'next',
  'couple',
  'of',
  'years',
  'what',
  'categories',
  'thank',
  'you'],
 ['thank',
  'you',
  'justin',
  'yeah',
  'let',
  'me',
  'just',
  'remind',
  'you',
  'a',
  'couple',
  'of',
  'things',
  'from',
  'last',
  'year',
  'we',
  'had',
  'two',
  'reactions',
  'on',
  'our',
  'super',
  'saver',
  'shipping',
  'threshold',
  'in',
  'the',
  'first',
  'ha

In [19]:
transcripts_x_index = []
for t_tokens in transcripts_x_tokens:
    t_index = []
    for t in t_tokens:
        #get index from word_index for current token
        try:
            i = word_index[t] 
        except KeyError:
            i = 0
        t_index.append(i)
    transcripts_x_index.append(t_index)

In [20]:
transcripts_x_index = pad_sequences(transcripts_x_index, maxlen=MAX_SEQUENCE_LENGTH)
transcripts_y = to_categorical(np.asarray(transcripts_y))

In [21]:
transcripts_y[:10]

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.]], dtype=float32)

In [22]:
print('Shape of data tensor:', transcripts_x_index.shape)
print('Shape of label tensor:', transcripts_y.shape)

Shape of data tensor: (379, 800)
Shape of label tensor: (379, 2)


In [23]:
transcripts_x_index[:50]

array([[    0,     0,     0, ...,  8309,  1291,    22],
       [    0,     0,     0, ...,    74,    63,  1060],
       [    0,     0,     0, ...,   961,  1536, 16718],
       ...,
       [    0,     0,     0, ...,  5375,  3188,  5235],
       [    0,     0,     0, ...,  1787,    52,   943],
       [    0,     0,     0, ...,    94,    50,   275]])

In [24]:
#continuous training
model.fit(transcripts_x_index, transcripts_y,
          batch_size=128,
          epochs=10,
          validation_split=0.2)

Train on 303 samples, validate on 76 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1625adb51d0>

In [25]:
# Predicting the Test set results
y_prob = model.predict(transcripts_x_index)
y_classes = y_prob.argmax(axis=-1)

In [26]:
y_prob[:50]

array([[0.16057965, 0.8394204 ],
       [0.00660884, 0.99339116],
       [0.07583424, 0.9241657 ],
       [0.19638601, 0.80361396],
       [0.6208372 , 0.3791628 ],
       [0.07667067, 0.92332935],
       [0.00131032, 0.9986897 ],
       [0.2934942 , 0.70650584],
       [0.00985349, 0.99014646],
       [0.00766159, 0.9923384 ],
       [0.0016339 , 0.99836606],
       [0.10313594, 0.89686406],
       [0.00362228, 0.99637777],
       [0.00104525, 0.9989548 ],
       [0.03943031, 0.9605697 ],
       [0.00825464, 0.9917454 ],
       [0.01259301, 0.987407  ],
       [0.00663724, 0.9933628 ],
       [0.9120454 , 0.08795453],
       [0.6583337 , 0.34166628],
       [0.39149797, 0.60850203],
       [0.03180831, 0.9681916 ],
       [0.0016345 , 0.99836546],
       [0.00498992, 0.99501014],
       [0.01467577, 0.9853242 ],
       [0.495158  , 0.50484204],
       [0.02261806, 0.9773819 ],
       [0.00814352, 0.99185646],
       [0.06101499, 0.93898505],
       [0.02037236, 0.9796276 ],
       [0.

In [27]:
y_classes[:10]

array([1, 1, 1, 1, 0, 1, 1, 1, 1, 1], dtype=int64)

In [None]:
#####y_classes2 = np.ones(len(y_classes),dtype=np.int64) ^ y_classes

In [28]:
labels_index_2 = {0:'negative',1:'positive',2:'neutral'} 
def pred_vec_to_lebal(vec,labels_index_2):
    indices = [np.where(r==1)[0][0] for r in vec]
    labels = [labels_index_2[i] for i in indices]
    return labels
y_val_labels = pred_vec_to_lebal(transcripts_y,labels_index_2)

In [29]:
def pred_vec_to_lebal2(vec,labels_index_2):
    labels = [labels_index_2[i] for i in vec]
    return labels
y_classes_labels = pred_vec_to_lebal2(y_classes,labels_index_2)

In [30]:
y_classes_labels[:50]

['positive',
 'positive',
 'positive',
 'positive',
 'negative',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'negative',
 'negative',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'negative',
 'negative',
 'positive',
 'positive',
 'negative']

In [31]:
from nltk.metrics import ConfusionMatrix
print(ConfusionMatrix(y_val_labels, y_classes_labels))

         |   n   p |
         |   e   o |
         |   g   s |
         |   a   i |
         |   t   t |
         |   i   i |
         |   v   v |
         |   e   e |
---------+---------+
negative | <26> 36 |
positive |  10<307>|
---------+---------+
(row = reference; col = test)



In [40]:
import sklearn.metrics
sklearn.metrics.confusion_matrix(y_val_labels, y_classes_labels, labels=["positive", "negative"])

array([[307,  10],
       [ 36,  26]], dtype=int64)

In [41]:
sklearn.metrics.precision_score(y_val_labels, y_classes_labels, average='weighted')

0.8667721391861181

In [42]:
sklearn.metrics.recall_score(y_val_labels, y_classes_labels, average='weighted')

0.8786279683377308

In [43]:
sklearn.metrics.accuracy_score(y_val_labels, y_classes_labels)

0.8786279683377308

In [44]:
#At this time, use imdb data evaluate the new model again.
model.evaluate(x_test,y_test)



[0.7497194261541393, 0.7305023476141522]

In [46]:
model.metrics_names 

['loss', 'acc']

In [47]:
model.save('EX2_RNN(LSTM)_transfer_learning_model')