In [9]:
# -*- coding: utf-8 -*-
import os
import sys
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, SimpleRNN, LSTM
from keras.models import Sequential
from keras.initializers import Constant
import json

BASE_DIR = ''
GLOVE_DIR = os.path.join(BASE_DIR, 'glove.6B')
TEXT_DATA_DIR = os.path.join(BASE_DIR, 'data')
MAX_SEQUENCE_LENGTH = 1000
MAX_NUM_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

In [2]:
# first, build index mapping words in the embeddings set
# to their embedding vector

print('Indexing word vectors.')

embeddings_index = {}
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'),'rb') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))



Indexing word vectors.
Found 400001 word vectors.


In [3]:
# second, prepare text samples and their labels
print('Processing text dataset')

texts = []  # list of text samples
labels_index = {'negative':0,'positive':1,'neutral':2}  # dictionary mapping label name to numeric id

labels = []  # list of label ids
file_list = os.listdir(TEXT_DATA_DIR)
for file in file_list:
    with open('data/' + file, 'r') as f:
        transcripts = json.load(f)
        texts.extend(transcripts['text'].values())
        labels.extend(transcripts['sentiment'].values())

for i in range(len(labels)):
    labels[i] = labels_index[labels[i]]


print('Found %s texts.' % len(texts))

Processing text dataset
Found 622 texts.


In [4]:
# finally, vectorize the text samples into a 2D integer tensor
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Found 3710 unique tokens.
Shape of data tensor: (622, 1000)
Shape of label tensor: (622, 3)


In [5]:
# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
x_val = data[-num_validation_samples:]
y_val = labels[-num_validation_samples:]

In [6]:
# prepare embedding matrix
num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i > MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [7]:
# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

In [10]:
# build RNN model
model = Sequential()
model.add(embedding_layer)
model.add(LSTM(100))
model.add(Dense(len(labels_index), activation='softmax'))
model.summary()

model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

model.fit(x_train, y_train,
          batch_size=128,
          epochs=10,
          validation_data=(x_val, y_val))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1000, 100)         371100    
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 303       
Total params: 451,803
Trainable params: 80,703
Non-trainable params: 371,100
_________________________________________________________________
Train on 498 samples, validate on 124 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2285076c710>

In [11]:
# Predicting the Test set results
y_prob = model.predict(x_val)
y_classes = y_prob.argmax(axis=-1)

In [15]:
y_prob

array([[0.10576867, 0.5260474 , 0.3681839 ],
       [0.10576867, 0.5260474 , 0.3681839 ],
       [0.10576867, 0.5260474 , 0.3681839 ],
       [0.10576867, 0.5260474 , 0.3681839 ],
       [0.10576867, 0.5260474 , 0.3681839 ],
       [0.10576867, 0.5260474 , 0.3681839 ],
       [0.10576867, 0.5260474 , 0.3681839 ],
       [0.10576867, 0.5260474 , 0.3681839 ],
       [0.10576867, 0.5260474 , 0.3681839 ],
       [0.10576867, 0.5260474 , 0.3681839 ],
       [0.10576867, 0.5260474 , 0.3681839 ],
       [0.10576867, 0.5260474 , 0.3681839 ],
       [0.10576867, 0.5260474 , 0.3681839 ],
       [0.10576867, 0.5260474 , 0.3681839 ],
       [0.10576867, 0.5260474 , 0.3681839 ],
       [0.10576867, 0.5260474 , 0.3681839 ],
       [0.10576867, 0.5260474 , 0.3681839 ],
       [0.10576867, 0.5260474 , 0.3681839 ],
       [0.10576867, 0.5260474 , 0.3681839 ],
       [0.10576867, 0.5260474 , 0.3681839 ],
       [0.10576867, 0.5260474 , 0.3681839 ],
       [0.10576867, 0.5260474 , 0.3681839 ],
       [0.

In [12]:
labels_index_2 = {0:'negative',1:'positive',2:'neutral'} 
def pred_vec_to_lebal(vecs,labels_index_2):
    indices = [np.where(r==1)[0][0] for r in vecs]
    labels = [labels_index_2[i] for i in indices]
    return labels

In [13]:
y_val_labels = pred_vec_to_lebal(y_val,labels_index_2)
y_classes_labels = pred_vec_to_lebal(y_classes,labels_index_2)

In [14]:
from nltk.metrics import ConfusionMatrix
print(ConfusionMatrix(y_val_labels, y_classes_labels))

         |  n     p |
         |  e  n  o |
         |  g  e  s |
         |  a  u  i |
         |  t  t  t |
         |  i  r  i |
         |  v  a  v |
         |  e  l  e |
---------+----------+
negative | <9> .  . |
 neutral | 57 <.> . |
positive | 58  . <.>|
---------+----------+
(row = reference; col = test)

