In [1]:
from keras.layers.core import Activation, Dense, Dropout, SpatialDropout1D
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
import collections
import matplotlib.pyplot as plt
import nltk
import numpy as np
import os


Using TensorFlow backend.


In [35]:
# First pass over the training file: gather the corpus statistics that later
# cells use to size the vocabulary and the padded sequences.
#   maxlen     - token count of the longest sentence seen
#   word_freqs - token -> number of occurrences across all sentences
#   num_recs   - number of labelled records read
maxlen = 0
word_freqs = collections.Counter()
num_recs = 0

with open("training.txt", "rb") as ftrain:
    # Iterate the file object directly rather than materialising every line
    # with readlines() first.
    for raw_line in ftrain:
        # Non-ASCII bytes are dropped, then the text is trimmed and lowercased.
        line = raw_line.decode("ascii", "ignore").strip().lower()
        if not line:
            # A blank line holds no "label<TAB>sentence" record; unpacking it
            # below would raise ValueError.
            continue
        # maxsplit=1 keeps any further tab characters inside the sentence
        # instead of breaking the two-value unpack.
        label, sentence = line.split("\t", 1)
        words = nltk.word_tokenize(sentence)
        maxlen = max(maxlen, len(words))
        word_freqs.update(words)
        num_recs += 1
        

In [22]:
# Token count of the longest sentence found while scanning training.txt.
print(maxlen)

42


In [36]:
# Number of lines counted while scanning training.txt.
num_recs

7086

In [23]:
# Number of distinct tokens collected in the word_freqs counter.
len(word_freqs) 

2313

In [24]:
# Upper bound on how many of the most frequent tokens get their own index;
# every other token is mapped to the shared "UNK" index.
MAX_FEATURES = 2000
# Fixed sequence length handed to pad_sequences(maxlen=...) and to the
# Embedding layer's input_length.
# NOTE(review): the recorded maxlen output is 42, so the longest sentences do
# not fit in 40 tokens - confirm that cutting them is intended.
MAX_SENTENCE_LENGTH = 40

In [25]:
# Build the token <-> integer lookup tables.
# Index 0 is reserved for padding and index 1 for out-of-vocabulary tokens,
# so the most frequent real token starts at index 2.
vocab_size = 2 + min(MAX_FEATURES, len(word_freqs))
word2index = {
    token: rank
    for rank, (token, _count) in enumerate(
        word_freqs.most_common(MAX_FEATURES), start=2
    )
}
word2index["PAD"] = 0
word2index["UNK"] = 1
# Reverse mapping, used to turn index sequences back into readable text.
index2word = dict((idx, token) for token, idx in word2index.items())

In [38]:
# Second pass over the training file: turn every sentence into a list of
# vocabulary indices and collect its integer label.
#   X - 2-D array of index sequences padded/cut to MAX_SENTENCE_LENGTH
#   y - 1-D float array of labels, one per record
#
# The sequences and labels are accumulated in plain lists, so this cell no
# longer relies on `num_recs` (set by a different cell) matching the number of
# lines actually read here.
sequences = []
labels = []
unk_index = word2index["UNK"]

with open("training.txt", "rb") as ftrain:
    for raw_line in ftrain:
        # Same decoding/normalisation as the counting pass so the tokens
        # match the keys in word2index.
        line = raw_line.decode("ascii", "ignore").strip().lower()
        if not line:
            # A blank line holds no "label<TAB>sentence" record.
            continue
        # maxsplit=1 keeps any further tab characters inside the sentence.
        label, sentence = line.split("\t", 1)
        words = nltk.word_tokenize(sentence)
        # Tokens outside the vocabulary fall back to the shared UNK index.
        sequences.append([word2index.get(word, unk_index) for word in words])
        labels.append(int(label))

X = sequence.pad_sequences(sequences, maxlen=MAX_SENTENCE_LENGTH)
y = np.array(labels, dtype=np.float64)



In [39]:
# Hold out 20% of the records for evaluation; the fixed random_state makes
# the partition repeatable between runs.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [45]:
# Hyper-parameters for the network and for training.
EMBEDDING_SIZE = 128
HIDDEN_LAYER_SIZE = 64
BATCH_SIZE = 32
NUM_EPOCHS = 10

# Embedding -> spatial dropout -> single LSTM -> one sigmoid unit.
# The layers are listed in the order they are applied.
model = Sequential([
    Embedding(vocab_size, EMBEDDING_SIZE, input_length=MAX_SENTENCE_LENGTH),
    SpatialDropout1D(0.2),
    LSTM(HIDDEN_LAYER_SIZE, dropout=0.2, recurrent_dropout=0.2),
    Dense(1),
    Activation("sigmoid"),
])
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)


In [46]:
# Train the model and keep the value returned by fit() in `history`.
# NOTE(review): the held-out split is passed as validation data here and is
# also what the evaluation cell scores - confirm that reuse is intended.
history = model.fit(
    Xtrain,
    ytrain,
    batch_size=BATCH_SIZE,
    epochs=NUM_EPOCHS,
    validation_data=(Xtest, ytest),
)

Train on 5668 samples, validate on 1418 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [48]:
# Score the model on the held-out split and report loss and accuracy.
score, acc = model.evaluate(Xtest, ytest, batch_size=BATCH_SIZE)
print("Test score: %.3f, accuracy: %.3f" % (score, acc))

# Spot-check five randomly chosen held-out sentences: print the prediction,
# the true label and the sentence rebuilt from its indices.
for _sample in range(5):
    idx = np.random.randint(len(Xtest))
    # reshape(1, -1) adds the batch axis without hard-coding the sequence
    # length, so this keeps working if MAX_SENTENCE_LENGTH is changed.
    xtest = Xtest[idx].reshape(1, -1)
    ylabel = ytest[idx]
    ypred = model.predict(xtest)[0][0]
    # Index 0 is the padding slot; skip it when rebuilding the text.
    sent = " ".join(index2word[x] for x in xtest[0].tolist() if x != 0)
    print("predicted: %.0f  label:  %d   sent:   %s" % (ypred, ylabel, sent))

Test score: 0.037, accuracy: 0.992
predicted: 1  label:  1   sent:   i love harry potter..
predicted: 0  label:  0   sent:   my dad 's being stupid about brokeback mountain ...
predicted: 0  label:  0   sent:   oh , and brokeback mountain is a terrible movie ...
predicted: 1  label:  1   sent:   i love mission impossible 1 :
predicted: 1  label:  1   sent:   brokeback mountain was so awesome .
