In [3]:
import pickle
import numpy as np

from keras.layers.convolutional import Conv2D, MaxPooling2D
from keras.layers.core import Dense, Dropout, Activation, Flatten, Reshape
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import GRU
from keras.optimizers import Adam
from keras.constraints import unitnorm
from keras.regularizers import l2

from sklearn.metrics import roc_auc_score

In [9]:
def get_idx_from_sent(sent, word_idx_map, max_len=51):
    """
    Transforms a sentence into a fixed-length vector of word indices.

    Parameters
    ----------
    sent : iterable of tokens
        The tokens of one sentence/review, in order.
    word_idx_map : mapping
        Maps a token to its integer index. Tokens missing from the map are
        left as 0.
    max_len : int
        Maximum number of tokens kept. Longer sentences are truncated.

    Returns
    -------
    numpy.ndarray of shape (max_len + 1,), dtype int32
        Positions [0, max_len) hold the word indices, zero-padded on the
        right. The last position is reserved for the label, which the caller
        fills in; it is always 0 here.
    """
    x = np.zeros(shape=max_len + 1, dtype=np.int32)  # sentence + label

    # zip() with range(max_len) stops after max_len tokens, so an over-long
    # sentence can neither clobber the label slot nor run past the end of x.
    for i, word in zip(range(max_len), sent):
        if word in word_idx_map:
            x[i] = word_idx_map[word]

    return x

def make_idx_data(reviews, word_idx_map, max_len=51):
    """
    Transforms reviews into 2-d index matrices, one per data split.

    Parameters
    ----------
    reviews : iterable of dict
        Each item must provide 'review_text' (the tokens), 'label' (stored in
        the last column of the row) and 'split' (1 -> train, 0 -> validation,
        anything else -> test).
    word_idx_map : mapping
        Token -> integer index, forwarded to ``get_idx_from_sent``.
    max_len : int
        Number of token columns per row.

    Returns
    -------
    list of three numpy.ndarray: [train, val, test]
        Each has shape (n_rows, max_len + 1): max_len word indices followed
        by the label. A split with no reviews has shape (0, max_len + 1).
    """
    train, val, test = [], [], []
    for rev in reviews:
        sent = get_idx_from_sent(rev['review_text'], word_idx_map, max_len)
        sent[-1] = rev['label']
        if rev['split'] == 1:
            train.append(sent)
        elif rev['split'] == 0:
            val.append(sent)
        else:
            test.append(sent)

    # `np.int` was only an alias for the builtin `int` and was removed in
    # NumPy 1.24, so use `int` directly. The reshape is a no-op for non-empty
    # splits and keeps empty ones 2-d so callers can still read `.shape[1]`.
    width = max_len + 1
    return [np.array(rows, dtype=int).reshape(-1, width)
            for rows in (train, val, test)]


print('Loading data...')
# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# Only load this pickle if it comes from a trusted source.
with open('imdb_train_val_test_data.pkl', 'rb') as pkl_file:
    data = pickle.load(pkl_file)

# The `with` block has already closed the file; no explicit close() is needed.
reviews, W, word_idx_map, vocab = data[0], data[1], data[2], data[3]
print("data loaded!")
print(type(reviews))

# max_len=1443 is presumably the length of the longest review in this
# dataset -- TODO confirm against the preprocessing step.
datasets = make_idx_data(reviews, word_idx_map, max_len=1443)

Loading data...


data loaded!
<class 'list'>


In [12]:
# Train data
# Each row of datasets[0] is [word indices..., label]; split it into the model
# input (all columns but the last) and a two-column one-hot target.
num_train = datasets[0].shape[0]
conv_input_width = W.shape[1] # word2vec length
conv_input_height = int(datasets[0].shape[1]-1) # max sentence length

# Vectorised copy: a slice + astype replaces the per-element Python loops
# (num_train * conv_input_height iterations) and yields the same int32 matrix.
train_X = datasets[0][:, :conv_input_height].astype(np.int32)

# One-hot encode the label stored in the last column: row i gets a 1 in
# column label_i.
train_Y = np.zeros(shape=(num_train, 2), dtype=np.int32)
train_Y[np.arange(num_train), datasets[0][:, -1]] = 1

print('train_X.shape = {}'.format(train_X.shape))
print('train_Y.shape = {}'.format(train_Y.shape))

train_X.shape = (20027, 1443)
train_Y.shape = (20027, 2)


In [13]:
# Val data
# Same layout as the train split: the first conv_input_height columns are word
# indices, the last column is the label.
num_val = datasets[1].shape[0]

# Vectorised copy instead of per-element Python loops; result is the same
# int32 matrix.
val_X = datasets[1][:, :conv_input_height].astype(np.int32)

# One-hot encode the label stored in the last column.
val_Y = np.zeros(shape=(num_val, 2), dtype=np.int32)
val_Y[np.arange(num_val), datasets[1][:, -1]] = 1

print('val_X.shape = {}'.format(val_X.shape))
print('val_Y.shape = {}'.format(val_Y.shape))

val_X.shape = (4973, 1443)
val_Y.shape = (4973, 2)


In [23]:
# Bidirectional GRU classifier: embedding -> dropout -> (forward GRU, backward
# GRU) -> concatenate -> dropout -> 2-way softmax.
from keras.layers import Input, concatenate
from keras.models import Model

# NOTE(review): num_filters / kernel_size are not used by this model in the
# cells visible here -- possibly leftovers from a convolutional variant.
num_filters = 128
kernel_size = 3

# NOTE(review): `input` shadows the Python builtin; the name is kept so that
# any other cell referring to it keeps working.
input = Input(shape=(conv_input_height,), dtype='int32')

# Embedding initialised from the matrix W (one row per vocabulary entry).
embedded = Embedding(input_dim=W.shape[0], output_dim=W.shape[1], input_length=conv_input_height,
                     weights=[W])(input)
embedded = Dropout(0.5)(embedded)

# Two independent GRUs read the sequence in opposite directions; each returns
# only its final state (128 units).
forwards = GRU(units=128)(embedded)
backwards = GRU(units=128, go_backwards=True)(embedded)

# `merge(..., mode='concat', concat_axis=1)` is the Keras 1 API (deprecated in
# Keras 2.0 and later removed); `concatenate` is the Keras 2 equivalent.
output = concatenate([forwards, backwards], axis=1)
output = Dropout(0.5)(output)
output = Dense(2, kernel_regularizer=l2(0.001), bias_regularizer=l2(0.001))(output)
output = Activation('softmax')(output)

model = Model(input, output)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None".
model.summary()


____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_5 (InputLayer)             (None, 1443)          0                                            
____________________________________________________________________________________________________
embedding_5 (Embedding)          (None, 1443, 300)     47948700    input_5[0][0]                    
____________________________________________________________________________________________________
dropout_6 (Dropout)              (None, 1443, 300)     0           embedding_5[0][0]                
____________________________________________________________________________________________________
gru_7 (GRU)                      (None, 128)           164736      dropout_6[0][0]                  
___________________________________________________________________________________________

  
  name=name)


In [24]:
# State shared with the training loop: per-epoch validation metrics and the
# number of epochs completed so far.
val_acc, val_auc = [], []
epoch = 0

In [25]:
# Number of iterations of the training loop; each iteration fits the model for
# one epoch and then evaluates on the validation split.
num_epoch = 3

In [None]:
# Train one epoch at a time and evaluate on the validation split after each.
# `epoch`, `val_acc` and `val_auc` are defined in an earlier cell and
# accumulate across repeated runs of this cell.

# Loop-invariant pieces of the accuracy computation.
thresholds = np.arange(0.0, 1.0, 0.01)
val_is_positive = val_Y[:, 1] > 0.5

for _ in range(num_epoch):
    model.fit(train_X, train_Y, batch_size=256, epochs=1, verbose=1)
    output = model.predict(val_X, batch_size=256, verbose=1)

    # Best accuracy over all decision thresholds on the class-1 probability.
    # NOTE: the threshold is chosen on the validation data itself, so this is
    # an optimistic estimate.
    vacc = np.max([np.mean((output[:, 1] > t) == val_is_positive) for t in thresholds])
    vauc = roc_auc_score(val_Y, output)
    val_acc.append(vacc)
    val_auc.append(vauc)
    print('Epoch {}: validation accuracy = {:.3%}, validation AUC = {:.3%}'.format(epoch, vacc, vauc))
    epoch += 1

print('{} epochs passed'.format(epoch))
print('Accuracy on validation dataset:')
print(val_acc)
print('AUC on validation dataset:')
print(val_auc)  # previously printed val_acc a second time

# NOTE(review): the file name says "c_lstm" and "3epochs", but the model built
# above is a bidirectional GRU and `epoch` may exceed 3 after re-runs. The
# name is left unchanged in case other code loads this path.
model.save_weights('c_lstm_3epochs.model')

  


ValueError: Error when checking target: expected dense_1 to have shape (None, 1) but got array with shape (20027, 2)