# Section 3-3 - Recurrent Neural Network

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from time import time

np.random.seed(1337)

df = pd.read_csv('../data/rottentomatoes.csv')

In [4]:
count = CountVectorizer(analyzer='word')

df_train = df.iloc[:124800, :]

X_train = count.fit_transform(df_train['Phrase'])
y_train = df_train['Sentiment'].values
y_train_onehot = pd.get_dummies(df_train['Sentiment']).values

In [5]:
df_test = df.iloc[124800:, :]

X_test = count.transform(df_test['Phrase'])
y_test = df_test['Sentiment'].values

## Benchmark

In [7]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=0, verbose=3)
model = model.fit(X_train, y_train)

y_prediction = model.predict(X_test)
print "accuracy", np.sum(y_prediction == y_test) / float(len(y_test))

building tree 1 of 10
building tree 2 of 10
building tree 3 of 10
building tree 4 of 10
building tree 5 of 10
building tree 6 of 10
building tree 7 of 10
building tree 8 of 10
building tree 9 of 10
building tree 10 of 10


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  3.2min finished


accuracy 0.526967370441


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    1.7s finished


## Pre-processing

In [8]:
from collections import defaultdict

name_to_index = defaultdict(int)

for i, item in enumerate(count.get_feature_names()):
    name_to_index[item] = i+1

In [9]:
sequencer = count.build_analyzer()

In [10]:
X_train_seq = []

for item in df_train['Phrase']:
    seq = [name_to_index[word] for word in sequencer(item)]
    X_train_seq.append(seq)

In [11]:
X_test_seq = []

for item in df_test['Phrase']:
    seq = [name_to_index[word] for word in sequencer(item)]
    X_test_seq.append(seq)

In [12]:
from keras.preprocessing import sequence

X_train_pad = sequence.pad_sequences(X_train_seq, maxlen=48)
X_test_pad = sequence.pad_sequences(X_test_seq, maxlen=48)

Using TensorFlow backend.


## Long Short-Term Memory

In [17]:
# https://github.com/fchollet/keras/blob/master/examples/imdb_lstm.py

from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, LSTM

start = time()

model = Sequential()
model.add(Embedding(len(name_to_index)+1, 128, dropout=0.2))
model.add(LSTM(128, dropout_W=0.2, dropout_U=0.2))
model.add(Dense(5))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_train_pad, y_train_onehot, nb_epoch=2)

print '\ntime taken %s seconds' % str(time() - start)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/2
Epoch 2/2

time taken 2296.85464907 seconds


In [18]:
y_prediction = model.predict_classes(X_test_pad)
print "\naccuracy", np.sum(y_prediction == y_test) / float(len(y_test))


accuracy 0.601119641715
