**CNN/RNN - Sentiment Analysis**

Reference: https://github.com/rouseguy/DeepLearning-NLP/blob/master/notebooks/3.%20CNN%20-%20Text.ipynb

In [1]:
import numpy as np
from script import data_helpers
from script import w2v 
from script.w2v import train_word2vec
from keras.models import Sequential, Model
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Input, Merge, SpatialDropout1D, Convolution1D, MaxPooling1D, LSTM
from sklearn.cross_validation import train_test_split
# Seed NumPy's global random number generator.
# NOTE(review): this alone may not make Keras/TensorFlow training fully deterministic — confirm.
np.random.seed(2)

Using TensorFlow backend.


## Train a Word2Vec model to get the embedding vectors

### Read in training data

In [2]:
print("Loading data...")
# load_data() hands back the input matrix, the labels and two vocabulary objects.
# Per the original author's note, x is padded with zeros at the end of each sequence.
x, y, vocabulary, vocabulary_inv = data_helpers.load_data()
print("Loading finished...")

# Report the vocabulary size and the array shapes.
print('There is a total of {} words in vocabulary'.format(len(vocabulary)))
# X: one row per sequence, one column per word position.
print('The shape of X is: {}'.format(x.shape))
# Y: one two-column row per sequence; original note: [1,0] = positive, [0,1] = negative.
print('The shape of Y is: {}'.format(y.shape))

Loading data...
Loading finished...
There is a total of 18779 words in vocabulary
The shape of X is: (10662, 56)
The shape of Y is: (10662, 2)


### Apply the `train_word2vec` function

In [4]:
# --- Model hyperparameters ---
# Length of every input sequence fed to the Embedding layer.
sequence_length = 56
# Dimensionality of each word vector (Embedding output_dim and the Word2Vec feature size).
embedding_dim = 20
# Number of filters and kernel width of the 1-D convolution.
num_filters = 150
filter_size = 3
# Dropout rate applied after max-pooling in the CNN.
dropout_prob = 0.25

# --- Training parameters ---
batch_size = 32
# NOTE(review): the fit() calls in this notebook pass epochs=5 directly, so this
# value does not appear to be used by them — confirm before relying on it.
num_epochs = 2

# --- Word2Vec parameters (forwarded to train_word2vec) ---
# Minimum word count.
min_word_count = 1
# Context window size.
context = 10

In [5]:
# Obtain the Word2Vec embedding weights for the vocabulary; the result is later
# passed to the Embedding layers as their initial weights.
embedding_weights = train_word2vec(
    x,
    vocabulary_inv,
    embedding_dim,
    min_word_count,
    context,
)

Loading existing Word2Vec model '20features_1minwords_10context'


In [9]:
# Sanity check on the first (and, as used here, only inspected) weight array.
embedding_weights[0].shape # expected: (vocabulary size, embedding_dim)

(18779, 20)

## Generate train/test set

In [55]:
import random

# Seed Python's built-in RNG. The split below is pinned separately via random_state.
random.seed(1)

# Join inputs and labels column-wise so one shuffle keeps every row's label attached.
data = np.concatenate((x, y), axis=1)
train, test = train_test_split(data, test_size=0.15, random_state=0)

# The last two columns hold the label; everything before them is the input sequence.
X_train, Y_train = train[:, :-2], train[:, -2:]
X_test, Y_test = test[:, :-2], test[:, -2:]

In [56]:
X_train.shape # (number of examples, sequence length)

(9062, 56)

## Model training - 1: 1-D CNN

In [57]:
# 1-D CNN classifier: embedding -> convolution -> max-pooling -> dropout -> flatten -> softmax.
model = Sequential([
    # Token ids -> word vectors, initialised from the Word2Vec weights.
    Embedding(input_dim=len(vocabulary),
              output_dim=embedding_dim,
              input_length=sequence_length,
              weights=embedding_weights),
    # Slide num_filters kernels of width filter_size over the sequence without padding.
    Convolution1D(filters=num_filters,
                  kernel_size=filter_size,
                  strides=1,
                  padding='valid',
                  activation='relu'),
    # Halve the temporal dimension.
    MaxPooling1D(pool_size=2),
    Dropout(dropout_prob),
    # Collapse (time, filters) into one feature vector for the classifier.
    Flatten(),
    # Two output units, one per sentiment class.
    Dense(2, activation='softmax'),
])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 56, 20)            375580    
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 54, 150)           9150      
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, 27, 150)           0         
_________________________________________________________________
dropout_6 (Dropout)          (None, 27, 150)           0         
_________________________________________________________________
flatten_6 (Flatten)          (None, 4050)              0         
_________________________________________________________________
dense_6 (Dense)              (None, 2)                 8102      
Total params: 392,832
Trainable params: 392,832
Non-trainable params: 0
_________________________________________________________________


![image](https://cdn-images-1.medium.com/max/1200/1*h_L7fSoQhipTHFULgXmHyQ.png)

In [58]:
# The network ends in a 2-unit softmax and the targets are one-hot rows, so use the
# matching categorical cross-entropy loss; 'accuracy' then resolves to categorical accuracy.
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

# NOTE(review): the held-out test split is also used as validation data here, so the
# validation numbers are not independent of the final test evaluation.
# Keep the History object in a variable so the cell does not echo its repr.
cnn_history = model.fit(X_train, Y_train, batch_size=batch_size, validation_data=(X_test, Y_test), epochs=5)

Train on 9062 samples, validate on 1600 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1a2ac9bba8>

### Model prediction 

In [59]:
# Predicted class index for every test sequence.
preds = model.predict_classes(X_test)

# Decode the one-hot labels: first column equal to 1 -> class 0, otherwise class 1.
true_classes = np.where(Y_test[:, 0] == 1, 0, 1)

# Fraction of test sequences whose predicted class matches the true class.
acc = np.mean(preds == true_classes)
print('The Accuracy is:',acc)

The Accuracy is: 0.741875


In [60]:
# evaluate() returns [loss, accuracy] for a model compiled with metrics=['accuracy'].
# Capture it and report that accuracy, rather than re-printing a value computed in
# an earlier cell.
score = model.evaluate(X_test, Y_test, verbose=0)
print('The Accuracy is:', score[1])

The Accuracy is: 0.741875


## Model training - 2: LSTM

In [61]:
# LSTM classifier: masked embedding -> spatial dropout -> LSTM -> softmax.
model = Sequential([
    # Token ids -> word vectors, initialised from the Word2Vec weights.
    # mask_zero=True marks index 0 as padding (see https://keras.io/layers/embeddings/).
    Embedding(input_dim=len(vocabulary),
              output_dim=embedding_dim,
              input_length=sequence_length,
              mask_zero=True,
              weights=embedding_weights),
    SpatialDropout1D(0.5),
    # return_sequences=False keeps only the final output, shape (batch size, units);
    # with return_sequences=True it would be (batch size, time steps, units).
    LSTM(units=120,
         dropout=0.2,
         recurrent_dropout=0.2,
         return_sequences=False),
    # Two output units, one per sentiment class.
    Dense(2, activation='softmax'),
])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 56, 20)            375580    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 56, 20)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 120)               67680     
_________________________________________________________________
dense_7 (Dense)              (None, 2)                 242       
Total params: 443,502
Trainable params: 443,502
Non-trainable params: 0
_________________________________________________________________


![image](https://d3ansictanv2wj.cloudfront.net/SentimentAnalysis16-38b6f3cbb7bae622fe0ba114db188666.png)

In [62]:
# The network ends in a 2-unit softmax and the targets are one-hot rows, so use the
# matching categorical cross-entropy loss; 'accuracy' then resolves to categorical accuracy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# NOTE(review): the held-out test split is also used as validation data here, so the
# validation numbers are not independent of the final test evaluation.
lstm_history = model.fit(X_train, Y_train, batch_size=batch_size, validation_data=(X_test, Y_test), epochs=5)

# evaluate() returns [loss, accuracy] for the loss and metric compiled above.
score = model.evaluate(X_test, Y_test, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])

Train on 9062 samples, validate on 1600 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test score: 0.512823333144
Test accuracy: 0.76
