In [1]:
import numpy as np
import pandas as pd
from time import time
from sklearn.feature_extraction.text import CountVectorizer

np.random.seed(1337)

df = pd.read_csv('rottentomatoes.csv')

In [2]:
from keras.datasets import imdb

(x_train, v_train), (x_test, v_test) = imdb.load_data(nb_words=20000)

Using TensorFlow backend.


In [3]:
x_train[0][:10]

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65]

In [30]:
df['Phrase'][0]

'A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .'

In [4]:
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [5]:
df.shape[0]*4/5

124848

In [6]:
df.shape[0] - df.shape[0]*4/5

31212

In [7]:
count = CountVectorizer(analyzer='word')

df_train = df.iloc[100000:124800, :]

X_train = count.fit_transform(df_train['Phrase'])
y_train = pd.get_dummies(df_train['Sentiment']).values

In [8]:
df_test = df.iloc[124800:, :]

X_test = count.transform(df_test['Phrase'])
y_test = pd.get_dummies(df_test['Sentiment']).values

In [9]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=0, verbose=3)
model = model.fit(X_train, df_train['Sentiment'].values)

y_prediction = model.predict(X_test)
print "accuracy", np.sum(y_prediction == df_test['Sentiment'].values) / float(len(y_test))

building tree 1 of 10
building tree 2 of 10
building tree 3 of 10
building tree 4 of 10
building tree 5 of 10
building tree 6 of 10
building tree 7 of 10
building tree 8 of 10
building tree 9 of 10
building tree 10 of 10


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   12.8s finished


accuracy 0.513755598209


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.7s finished


In [38]:
for i in xrange(50):
    print count.get_feature_names()[i]

10
100
101
104
11
12
13
14
140
15
170
18
19
1930s
1937
1940s
1950s
1952
20
2002
20th
22
26
30
300
48
51
53
5ths
70s
72
90
90s
91
99
abc
abel
abiding
ability
able
ably
aboriginal
about
above
abrupt
absence
absent
absolute
absolutely
absorbing


In [10]:
sequencer = count.build_analyzer()

In [11]:
from collections import defaultdict

name_to_index = defaultdict(int)

for i, item in enumerate(count.get_feature_names()):
    name_to_index[item] = i+1

In [12]:
X_train_seq = []

for item in df_train['Phrase']:
    seq = [name_to_index[word] for word in sequencer(item)]
    X_train_seq.append(seq)

In [13]:
X_test_seq = []

for item in df_test['Phrase']:
    seq = [name_to_index[word] for word in sequencer(item)]
    X_test_seq.append(seq)

In [14]:
len(X_train_seq)

24800

In [15]:
len(X_test_seq)

31260

In [16]:
np.array(X_train_seq).shape

(24800,)

In [17]:
np.max(map(len, X_train_seq))

46

In [18]:
from keras.preprocessing import sequence

X_train_pad = sequence.pad_sequences(X_train_seq, maxlen=48)
X_test_pad = sequence.pad_sequences(X_test_seq, maxlen=48)

In [34]:
df_train['Phrase'].values[0]

'nothing we Westerners have seen before'

In [35]:
sequencer(df_train['Phrase'].values[0])

[u'nothing', u'we', u'westerners', u'have', u'seen', u'before']

In [20]:
X_train_seq[0]

[3445, 5505, 5536, 2333, 4377, 446]

In [21]:
X_train_pad[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0, 3445, 5505,
       5536, 2333, 4377,  446], dtype=int32)

In [22]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Activation

In [23]:
len(name_to_index)

9356

In [24]:
y_train.shape

(24800, 5)

In [25]:
max_features=len(name_to_index)+1

In [26]:
model = Sequential()
model.add(Embedding(max_features, 128, dropout=0.2))
model.add(LSTM(128, dropout_W=0.2, dropout_U=0.2))
model.add(Dense(5))
model.add(Activation('sigmoid'))

In [27]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [28]:
model.fit(X_train_pad, y_train, nb_epoch=2)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x11ce04c10>

In [29]:
model.evaluate(X_test_pad, y_test)



[0.37664536931388132, 0.83314779283751761]