## Imports

In [8]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re

## Data
Loading data and droppping unnecessary columns. Limiting the data size for the sake of tutorial

In [4]:
LIMIT = 10000
data = pd.read_csv('data/imdb_master.csv',header=0,names=['id','split','review','sentiment','file'])
data = data[['review','sentiment']].sample(frac=1)[:LIMIT]
data['review'].size

10000

## Pre-Processing
Filtering the review so only valid texts and words remain.  Defineing the number of max features as 2000 and use Tokenizer to vectorize and convert text into Sequences so the Network can deal with it as input.

In [7]:
data['review'] = data['review'].apply(lambda x: x.lower())
data['review'] = data['review'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))

print(data[ data['sentiment'] == 'pos'].size)
print(data[ data['sentiment'] == 'neg'].size)
    
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['review'].values)
X = tokenizer.texts_to_sequences(data['review'].values)
X = pad_sequences(X)

10084
9916


## Model

Composing the LSTM Network. Note that **embed_dim**, **lstm_out**, **batch_size**, **droupout_x** variables are hyperparameters, their values are somehow intuitive, can be and must be played with in order to achieve good results. Please also note that I am using **sigmoid** as activation function. The reason is that our Network is using **binary crossentropy**, and sigmoid is just the right activation method for that.

In [10]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_fatures, embed_dim,input_length = X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2,activation='sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 1317, 128)         256000    
_________________________________________________________________
spatial_dropout1d_2 (Spatial (None, 1317, 128)         0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 394       
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________
None


## Training

Splitting the data into test and train sets

In [11]:
Y = pd.get_dummies(data['sentiment']).values
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.4, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(6000, 1317) (6000, 2)
(4000, 1317) (4000, 2)


Training the Network. Ideally the number of epochs should be much higher. We need to keep training untill the accuracy saturates. 

In [None]:
batch_size = 32
model.fit(X_train, Y_train, epochs = 7, batch_size=batch_size, verbose = 2)

In [None]:
model.save('models/model_sent_lstm.h5')

## Evaluation

Extracting a validation set, and measuring score and accuracy.

In [None]:
X_validate = X_test[2000:]
Y_validate = Y_test[2000:]
X_test = X_test[:2000]
Y_test = Y_test[:2000]
score, acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

Finally measuring the number of correct guesses(True Positive and True Negatives)

In [None]:
pos_cnt, neg_cnt, pos_correct, neg_correct = 0, 0, 0, 0
for x in range(len(X_validate)):
    
    result = model.predict(X_validate[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]
   
    if np.argmax(result) == np.argmax(Y_validate[x]):
        if np.argmax(Y_validate[x]) == 0:
            neg_correct += 1
        else:
            pos_correct += 1
       
    if np.argmax(Y_validate[x]) == 0:
        neg_cnt += 1
    else:
        pos_cnt += 1



print("pos_acc", pos_correct/pos_cnt*100, "%")
print("neg_acc", neg_correct/neg_cnt*100, "%")