# Sentiment Analysis
Using the IMDB movie-review dataset: http://ai.stanford.edu/~amaas/data/sentiment/ 
25,000 labelled reviews for training and another 25,000 for testing

In [1]:
import importlib
import utils; importlib.reload(utils)
from utils import *
from __future__ import division, print_function
%matplotlib inline

Using cuDNN version 7003 on context None
Mapped name None to device cuda0: GeForce GTX 1080 Ti (0000:01:00.0)
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
import os, sys
# Anchor every path to the directory the notebook was started from.
LESSON_HOME_DIR = current_dir = os.getcwd()
DATA_HOME_DIR = '{}/data/imdb/'.format(current_dir)

In [3]:
%cd $DATA_HOME_DIR
path = DATA_HOME_DIR
model_path = path+'/models'

/home/hearth/ML/course/deeplearning1/nbs/data/imdb


Fetching and loading dataset

In [5]:
# Load the train/test reviews (sequences of word ids) and their labels.
# The context manager closes the file handle, which a bare open() left dangling.
# NOTE: pickle.load can execute arbitrary code - only use a trusted copy of this file.
dsPath = 'imdb_full.pkl'
with open(dsPath, 'rb') as f:
    (x_train, labels_train), (x_test, labels_test) = pickle.load(f)

In [6]:
import json

# Word -> id vocabulary for the dataset.
with open('imdb_word_index.json') as json_data:
    w2id = json.load(json_data)
# Reverse lookup (id -> word) for turning encoded reviews back into text.
id2w = dict((idx, word) for word, idx in w2id.items())

In [7]:
# Decode one encoded review back into text (13568 is the shortest training review).
' '.join(id2w[word_id] for word_id in x_train[13568])

'this movie is terrible but it has some good effects'

In [8]:
# Number of distinct word ids kept; ids at or above vocab_size-1 are merged into one bucket further down.
vocab_size = 5000

Many words (such as many nouns) appear only once in the whole dataset, so it's useful to cap the vocabulary at 5000. Because word ids are assigned in order of frequency, this is simple to do: every id beyond the cap is mapped to a single "rare word" id.

In [9]:
# Ids are frequency-ranked, so capping each id at vocab_size-1 folds every
# rarer word into one shared "rare word" id.
rare_id = vocab_size - 1
trn = [np.minimum(np.array(s), rare_id) for s in x_train]
test = [np.minimum(np.array(s), rare_id) for s in x_test]

Distribution of lengths of sentences

In [10]:
# Review lengths (in word ids) before padding: longest, shortest and mean.
lengths = list(map(len, trn))
n_reviews = float(len(lengths))
(max(lengths), min(lengths), sum(lengths) / n_reviews)

(2493, 10, 237.71364)

In [11]:
# Position of the shortest review (first occurrence of the minimum length).
lengths.index(min(lengths))

13568

Make all reviews the same length at 500 using padding (with zeros)

In [12]:
# Common length, in word ids, that every review is brought to before modelling.
seq_len = 500

In [13]:
# Bring both splits to a common length of seq_len ids, using 0 as the filler value.
trn, test = [
    sequence.pad_sequences(split, maxlen=seq_len, value=0)
    for split in (trn, test)
]

In [14]:
# Sanity check: expect (number of reviews, seq_len).
trn.shape

(25000, 500)

## Simple single layer NN Acc 87.5
An embedding layer is used in lieu of one-hot encoding the vocabulary.

In [16]:
# Baseline classifier: learned word embeddings feeding one hidden dense layer.
model = Sequential()
# 32 latent factors for each word
model.add(Embedding(vocab_size, 32, input_length=seq_len))
model.add(Flatten())
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.7))
# Single sigmoid unit producing the binary prediction
model.add(Dense(1, activation='sigmoid'))

In [17]:
# Binary cross-entropy pairs with the single sigmoid output; accuracy is reported while training.
model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
flatten_1 (Flatten)          (None, 16000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 100)               1600100   
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101       
Total params: 1,760,201
Trainable params: 1,760,201
Non-trainable params: 0
_________________________________________________________________


In [19]:
# Two passes over the training reviews; the test split is used as validation data.
model.fit(
    trn, labels_train,
    batch_size=64,
    epochs=2,
    validation_data=(test, labels_test),
)

Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fba8eb13320>

## Convolution network with maxpooling (single layer) Acc 88.75

In [41]:
# One convolutional layer with max-pooling on top of learned word embeddings.
conv1 = Sequential([
    # NOTE: the former `dropout=0.2` argument was removed from Embedding. It is a
    # Keras 1 option that Keras 2 discards with a deprecation warning (the model
    # summary shows no embedding dropout), so the built model is unchanged.
    Embedding(vocab_size, 32, input_length=seq_len),
    # Randomly zeroes 20% of the individual embedding activations
    Dropout(0.2),
    Conv1D(64, 5, padding='same', activation='relu'),
    Dropout(0.2),
    MaxPooling1D(),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.7),
    Dense(1, activation='sigmoid')
])

  


In [42]:
# Same loss/optimizer/metric setup as the baseline model.
conv1.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
conv1.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
dropout_19 (Dropout)         (None, 500, 32)           0         
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 500, 64)           10304     
_________________________________________________________________
dropout_20 (Dropout)         (None, 500, 64)           0         
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, 250, 64)           0         
_________________________________________________________________
flatten_7 (Flatten)          (None, 16000)             0         
_________________________________________________________________
dense_13 (Dense)             (None, 100)               1600100   
__________

In [44]:
# Two passes over the training reviews; the test split is used as validation data.
conv1.fit(
    trn, labels_train,
    batch_size=64,
    epochs=2,
    validation_data=(test, labels_test),
)

Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fba6c283d30>

## Using pretrained embedding: Glove Dataset 
Results are consistently better when starting from pretrained embeddings
Dataset: http://files.fast.ai/models/glove/6B.50d.tgz

In [15]:
def load_vectors(loc):
    """Load the three files that share the path prefix ``loc``.

    Returns a 3-tuple: the array read by ``load_array(loc + '.dat')``, the
    object unpickled from ``loc + '_words.pkl'`` and the object unpickled from
    ``loc + '_idx.pkl'`` (both decoded with ``encoding='latin1'``).
    """
    def _unpickle(suffix):
        # Read one pickled companion file of the vector table.
        with open(loc + suffix, 'rb') as fh:
            return pickle.load(fh, encoding='latin1')

    word_list = _unpickle('_words.pkl')
    word_lookup = _unpickle('_idx.pkl')
    return (load_array(loc + '.dat'), word_list, word_lookup)
        

Simple function to create an embedding matrix using the indexes from imdb, and the embeddings from glove (where they exist).

In [16]:
# Load the GloVe '6B.50d' files: the vector array plus the two pickled lookup objects.
vecs, words, wordidx = load_vectors('6B.50d')

In [17]:
def create_emb():
    """Build the initial embedding matrix for the IMDB vocabulary from GloVe.

    Reads the notebook globals ``vecs``, ``wordidx``, ``id2w`` and
    ``vocab_size``.

    Returns:
        A ``(vocab_size, vecs.shape[1])`` float array. Row 0 is left at zero
        by the loop, which starts at id 1. A row copies the matching row of
        ``vecs`` when its word consists only of letters, digits and hyphens
        and is present in ``wordidx``; every other row, and the last row
        (the shared rare-word id), is drawn from ``normal(scale=0.6)``.
        The whole matrix is divided by 3 before being returned.
    """
    factors = vecs.shape[1]
    emb = np.zeros((vocab_size, factors))

    for i in range(1, len(emb)):
        # .get(): an id missing from the vocabulary is handled like an unknown word
        word = id2w.get(i)
        # The membership test makes the fallback below reachable for well-formed
        # words that GloVe lacks; without it those raised KeyError.
        if word and re.match(r"^[a-zA-Z0-9\-]*$", word) and word in wordidx:
            emb[i] = vecs[wordidx[word]]
        else:
            # If the word doesn't exist in glove, randomly init
            emb[i] = normal(scale=0.6, size=(factors,))

    # For the rare words id - randomly init as well
    emb[-1] = normal(scale=0.6, size=(factors,))
    # Shrink every weight by a constant factor
    # NOTE(review): presumably to keep the initial activations small - confirm
    emb /= 3

    return emb

In [18]:
# Build the GloVe-initialised embedding matrix (rows drawn from normal() differ on every call).
emb = create_emb()

In [25]:
# Same architecture as conv1, but the embedding starts from the GloVe-derived matrix.
conv2 = Sequential([
    # Initial weights come from `emb` and stay trainable, so they are fine-tuned.
    # The width is read from `emb` itself instead of a hard-coded 50, and the
    # Keras 1 `dropout=0.2` argument is gone: Keras 2 discards it with a warning,
    # so the built model is unchanged.
    Embedding(vocab_size, emb.shape[1], input_length=seq_len, weights=[emb], trainable=True),
    Dropout(0.2),
    Conv1D(64, 5, padding='same', activation='relu'),
    Dropout(0.2),
    MaxPooling1D(),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.7),
    Dense(1, activation='sigmoid')
])

  


In [33]:
# Same loss/optimizer/metric setup as the earlier models.
conv2.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

In [34]:
# Two passes at the optimizer's default learning rate; the test split is used as validation data.
conv2.fit(
    trn, labels_train,
    batch_size=64,
    epochs=2,
    validation_data=(test, labels_test),
)

Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f21b0fb4710>

In [32]:
# Lower the learning rate to 1e-4 before training further. Use the backend
# setter: rebinding `optimizer.lr` to a Python float does not reach a training
# function that an earlier fit() already built, so the old rate would stay in use.
from keras import backend as K
K.set_value(conv2.optimizer.lr, 1e-4)

In [31]:
# Five further passes after the learning-rate change; the test split is used as validation data.
conv2.fit(
    trn, labels_train,
    batch_size=64,
    epochs=5,
    validation_data=(test, labels_test),
)

Train on 25000 samples, validate on 25000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f21b6522080>

## Convolution model using multiple convolution sets of varying filter size Acc 89.69

In [41]:
from keras.layers import Merge

In [42]:
# Multi-width convolution block: the same embedded sequence is convolved with
# filter sizes 3..7, each branch is max-pooled and flattened, and the branches
# are concatenated into a single feature vector.
# The block consumes the Embedding output (one 50-d vector per time step), so
# its input length is seq_len; it was mistakenly declared as vocab_size.
graph_in = Input((seq_len, 50))
convs = []
for conv_s in range(3, 8):
    # Conv1D / padding= is the Keras 2 spelling used by the other models in this
    # notebook; Convolution1D / border_mode= is the deprecated Keras 1 form.
    x = Conv1D(64, conv_s, padding='same', activation='relu')(graph_in)
    x = MaxPooling1D()(x)
    x = Flatten()(x)
    convs.append(x)
out = Merge(mode="concat")(convs)
graph = Model(graph_in, out)

  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  


In [43]:
# Re-create the embedding matrix for the next model (rows drawn from normal() are re-sampled).
emb = create_emb()

In [44]:
# GloVe-initialised embeddings feeding the multi-width convolution block `graph`.
conv3 = Sequential([
    # The Keras 1 `dropout=0.2` argument was removed from Embedding: Keras 2
    # discards it with a deprecation warning, so the built model is unchanged.
    Embedding(vocab_size, 50, input_length=seq_len, weights=[emb]),
    Dropout(0.2),
    graph,
    Dropout(0.5),
    Dense(100, activation='relu'),
    Dropout(0.7),
    Dense(1, activation='sigmoid')
])

  


In [45]:
# Same loss/optimizer/metric setup as the earlier models.
conv3.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

In [46]:
# Five passes over the training reviews; the test split is used as validation data.
conv3.fit(
    trn, labels_train,
    batch_size=64,
    epochs=5,
    validation_data=(test, labels_test),
)

Train on 25000 samples, validate on 25000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f21a27cc978>