In [58]:
# -*- coding: utf-8 -*-

import os
import sys
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.initializers import Constant
import json

In [59]:
BASE_DIR = ''
GLOVE_DIR = os.path.join(BASE_DIR, 'glove.6B')
TEXT_DATA_DIR = os.path.join(BASE_DIR, 'data')
MAX_SEQUENCE_LENGTH = 100
MAX_NUM_WORDS = 50000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

In [2]:
# first, build index mapping words in the embeddings set
# to their embedding vector

print('Indexing word vectors.')

embeddings_index = {}
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'),'rb') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))



Indexing word vectors.
Found 400001 word vectors.


In [60]:
# second, prepare text samples and their labels
print('Processing text dataset')

texts = []  # list of text samples
labels_index = {'negative':0,'positive':1,'neutral':2}  # dictionary mapping label name to numeric id

labels = []  # list of label ids
file_list = os.listdir(TEXT_DATA_DIR)
for file in file_list:
    with open('data/' + file, 'r') as f:
        transcripts = json.load(f)
        texts.extend(transcripts['text'].values())
        labels.extend(transcripts['sentiment'].values())

for i in range(len(labels)):
    labels[i] = labels_index[labels[i]]


print('Found %s texts.' % len(texts))

Processing text dataset
Found 622 texts.


In [33]:
# finally, vectorize the text samples into a 2D integer tensor
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Found 3710 unique tokens.
Shape of data tensor: (622, 100)
Shape of label tensor: (622, 3)


In [34]:
# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
x_val = data[-num_validation_samples:]
y_val = labels[-num_validation_samples:]

In [35]:
# prepare embedding matrix
num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i > MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [36]:
# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

In [47]:
# train a 1D convnet with global maxpooling
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(128, 3, activation='relu')(embedded_sequences)
x = MaxPooling1D(3)(x)
x = Conv1D(128, 3, activation='relu')(x)
x = MaxPooling1D(3)(x)
x = Conv1D(128, 3, activation='relu')(x)
x = GlobalMaxPooling1D()(x)
x = Dense(64, activation='relu')(x)
preds = Dense(len(labels_index), activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='sgd',
              metrics=['acc'])

model.fit(x_train, y_train,
          batch_size=128,
          epochs=10,
          validation_data=(x_val, y_val))

Train on 498 samples, validate on 124 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1a580072be0>

In [48]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_8 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 100, 100)          371100    
_________________________________________________________________
conv1d_22 (Conv1D)           (None, 98, 128)           38528     
_________________________________________________________________
max_pooling1d_15 (MaxPooling (None, 32, 128)           0         
_________________________________________________________________
conv1d_23 (Conv1D)           (None, 30, 128)           49280     
_________________________________________________________________
max_pooling1d_16 (MaxPooling (None, 10, 128)           0         
_________________________________________________________________
conv1d_24 (Conv1D)           (None, 8, 128)            49280     
__________

In [49]:
x_val[:5]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,  120,  116,
        2084,    3, 1609, 1420,   33,   92,  933,    7,  515,   76,  342,
          12,  116,  103,  722, 3116, 3117, 3118,    2,  309, 3119, 3120,
        1863, 2093,    2, 3121,  958,   14,   59,   14,    8,   76,  312,
        1868,   12,    1,  558,  244, 2094,   49,   69,  545,  957,  839,
         587,   87,    9,   73,   80, 1004,    4, 1566,   17,  260,    1,
         479],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    

In [52]:
# Predicting the Test set results
y_prob = model.predict(x_val)

In [53]:
y_prob

array([[0.3050786 , 0.3562744 , 0.33864707],
       [0.3050786 , 0.3562744 , 0.33864707],
       [0.3050786 , 0.3562744 , 0.33864707],
       [0.3050786 , 0.3562744 , 0.33864707],
       [0.3050786 , 0.3562744 , 0.33864707],
       [0.3050786 , 0.3562744 , 0.33864707],
       [0.3050786 , 0.3562744 , 0.33864707],
       [0.3050786 , 0.3562744 , 0.33864707],
       [0.3050786 , 0.3562744 , 0.33864707],
       [0.3050786 , 0.3562744 , 0.33864707],
       [0.3050786 , 0.3562744 , 0.33864707],
       [0.3050786 , 0.3562744 , 0.33864707],
       [0.3050786 , 0.3562744 , 0.33864707],
       [0.3050786 , 0.3562744 , 0.33864707],
       [0.3050786 , 0.3562744 , 0.33864707],
       [0.3050786 , 0.3562744 , 0.33864707],
       [0.3050786 , 0.3562744 , 0.33864707],
       [0.3050786 , 0.3562744 , 0.33864707],
       [0.3050786 , 0.3562744 , 0.33864707],
       [0.3050786 , 0.3562744 , 0.33864707],
       [0.3050786 , 0.3562744 , 0.33864707],
       [0.3050786 , 0.3562744 , 0.33864707],
       [0.

In [54]:
y_classes = y_prob.argmax(axis=-1)

In [55]:
labels_index_2 = {0:'negative',1:'positive',2:'neutral'} 
def pred_vec_to_lebal(vecs,labels_index_2):
    indices = [np.where(r==1)[0][0] for r in vecs]
    labels = [labels_index_2[i] for i in indices]
    return labels

In [56]:
y_val_labels = pred_vec_to_lebal(y_val,labels_index_2)
y_classes_labels = pred_vec_to_lebal(y_classes,labels_index_2)

In [57]:
from nltk.metrics import ConfusionMatrix
print(ConfusionMatrix(y_val_labels, y_classes_labels))

         |  n     p |
         |  e  n  o |
         |  g  e  s |
         |  a  u  i |
         |  t  t  t |
         |  i  r  i |
         |  v  a  v |
         |  e  l  e |
---------+----------+
negative |<11> .  . |
 neutral | 53 <.> . |
positive | 60  . <.>|
---------+----------+
(row = reference; col = test)

