In [1]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Convolution2D, MaxPooling2D
from keras.optimizers import SGD, RMSprop
from keras.callbacks import ModelCheckpoint
from keras.datasets import mnist
from keras.utils import np_utils 
from gensim.models import Word2Vec
import numpy as np
import re
from gensim.parsing import strip_multiple_whitespaces

Using gpu device 0: Quadro K2000


In [2]:
#### return sentence matrix
def get_img(tokens, max_len=50, embeddings=None, dim=300):
    """Turn a tokenised sentence into a fixed-size matrix of word vectors.

    Each token that has an embedding contributes one row, in sentence order.
    Tokens without an embedding are skipped.  The result is truncated to the
    first ``max_len`` known tokens and zero-padded at the bottom when the
    sentence has fewer.

    Parameters
    ----------
    tokens : iterable of str
        Words of one sentence.
    max_len : int
        Number of rows in the returned matrix.
    embeddings : mapping, optional
        Anything supporting ``embeddings[word]`` that raises ``KeyError`` for
        unknown words.  When omitted, the notebook-level ``model`` (the loaded
        word2vec vectors) is used, which keeps existing calls working.
    dim : int
        Width of one word vector.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(max_len, dim)``; always float64, so that every
        sentence has the same dtype regardless of its length.
    """
    if embeddings is None:
        # Backward-compatible default: the global word2vec object.
        embeddings = model

    img = np.zeros((max_len, dim))
    filled = 0
    for w in tokens:
        if filled >= max_len:
            # Matrix is full; remaining tokens would be truncated anyway.
            break
        try:
            img[filled] = embeddings[w]
        except KeyError:
            # Out-of-vocabulary word: skip it.  Only KeyError is caught so
            # that real problems (e.g. `model` no longer being the word2vec
            # object) fail loudly instead of yielding an all-zero matrix.
            continue
        filled += 1
    return img

def preprocess(sentence):
    """Split a raw sentence into word tokens.

    Every run of non-word characters or underscores is replaced by a single
    space, then the text is split on whitespace.  Using ``split()`` without an
    argument drops the empty strings that leading/trailing separators (such as
    the newline at the end of each line read from file) would otherwise
    produce, so the result contains only real tokens.

    Returns a list of strings; an empty or punctuation-only sentence gives
    ``[]``.
    """
    return re.sub(r'[\W_]+', ' ', sentence).split()
def process_output(labels, nb_classes=2):
    """One-hot encode integer class labels via ``np_utils.to_categorical``.

    Parameters
    ----------
    labels : sequence of int
        Class index of each sample.
    nb_classes : int
        Number of columns in the encoded output.  Defaults to 2, the value
        this function previously hard-coded.

    Returns whatever ``np_utils.to_categorical`` returns for these arguments.
    """
    return np_utils.to_categorical(labels, nb_classes=nb_classes)

In [3]:
pos_file ="data/rt-polarity.pos"
neg_file = "data/rt-polarity.neg"
w2v_file = "data/GoogleNews-vectors-negative300.bin"
pos_sentences = open(pos_file).readlines()
neg_sentences = open(neg_file).readlines()
%time model = Word2Vec.load_word2vec_format(w2v_file,binary=True)

CPU times: user 1min 51s, sys: 5.89 s, total: 1min 57s
Wall time: 1min 57s


In [4]:
pos_tokens = [preprocess(s) for s in pos_sentences]
labels = [1]*len(pos_tokens)
neg_tokens = [preprocess(s) for s in neg_sentences]
labels += [0]*len(neg_sentences)
imgs= [get_img(t) for t in pos_tokens+neg_tokens]
imgs = np.array(imgs)
labels = np.array(labels)
print imgs.shape, labels.shape

(10662, 50, 300) (10662,)


In [5]:
from sklearn.cross_validation import train_test_split
#train_X,test_X,train_y,test_y = train_test_split(imgs.reshape((imgs.shape[0],-1)),labels,test_size=0.2)
train_X,test_X,train_y,test_y = train_test_split(imgs.reshape((imgs.shape[0],1,imgs.shape[1],imgs.shape[2])),labels,test_size=0.2)
train_y = process_output(train_y)
test_y = process_output(test_y)
print train_X.shape, train_y.shape, test_X.shape, test_y.shape

(8529, 1, 50, 300) (8529, 2) (2133, 1, 50, 300) (2133, 2)


In [17]:
model = Sequential()
model.add(Convolution2D(nb_filter = 32, stack_size = 1, nb_row = 3, nb_col = 3, 
                        border_mode="full", activation="relu"))
model.add(Convolution2D(nb_filter = 32, stack_size = 32, nb_row = 3, nb_col = 3, 
                        activation="relu"))
model.add(MaxPooling2D(poolsize=(2, 2)))
model.add(Dropout(.25))

model.add(Flatten()) ## flatten to vectors - from convolution layer to vector layer
## 28x28 image after (2, 2)-pooling becomes (14, 14)
model.add(Dense(120000, 128, activation="relu"))
model.add(Dropout(0.5))

model.add(Dense(128, 2, activation="softmax"))

## compile with loss function and optimizer
## benchmark the time to compile
%time model.compile(loss = "categorical_crossentropy", optimizer = "adadelta")

CPU times: user 9.41 s, sys: 503 ms, total: 9.91 s
Wall time: 9.92 s


In [18]:
## train the model
model.fit(train_X, train_y, batch_size=30, nb_epoch=10, 
          validation_split=0.2, show_accuracy=True, verbose=2)
print model.evaluate(test_X, test_y, show_accuracy=True)

Train on 6823 samples, validate on 1706 samples
Epoch 0
130s - loss: 0.6925 - acc: 0.5266 - val_loss: 0.6872 - val_acc: 0.5199
Epoch 1
131s - loss: 0.6673 - acc: 0.6110 - val_loss: 0.6505 - val_acc: 0.6213
Epoch 2
131s - loss: 0.6010 - acc: 0.6814 - val_loss: 0.5978 - val_acc: 0.6682
Epoch 3
132s - loss: 0.5530 - acc: 0.7201 - val_loss: 0.5539 - val_acc: 0.7157
Epoch 4
130s - loss: 0.5184 - acc: 0.7469 - val_loss: 0.5447 - val_acc: 0.7104
Epoch 5
130s - loss: 0.4808 - acc: 0.7689 - val_loss: 0.5635 - val_acc: 0.7034
Epoch 6
131s - loss: 0.4480 - acc: 0.7889 - val_loss: 0.7600 - val_acc: 0.6518
Epoch 7
130s - loss: 0.4039 - acc: 0.8216 - val_loss: 0.6137 - val_acc: 0.7128
Epoch 8
130s - loss: 0.3636 - acc: 0.8432 - val_loss: 0.6533 - val_acc: 0.7057
Epoch 9
130s - loss: 0.3088 - acc: 0.8681 - val_loss: 0.6725 - val_acc: 0.7192
[0.61854644848380469, 0.73886544772620721]
