https://gist.github.com/dirko/1d596ca757a541da96ac3caa6f291229

http://dirko.github.io/Bidirectional-LSTMs-with-Keras/

In [3]:
# author: Keras==1.0.6
# mine: Keras==1.2.1
from keras.models import Sequential
import numpy as np
from keras.layers.recurrent import LSTM
from keras.layers.core import TimeDistributedDense, Activation
from keras.preprocessing.sequence import pad_sequences
from keras.layers.embeddings import Embedding
from sklearn.cross_validation import train_test_split
from keras.layers import Merge
from keras.backend import tf
from lambdawithmask import Lambda as MaskLambda
from sklearn.metrics import confusion_matrix, accuracy_score

In [18]:
def encode(x, n):
    result = np.zeros(n)
    result[x] = 1
    return result

def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]
        
def reverse_func(x, mask=None):
    return tf.reverse(x, [False, True, False])

def score(yh, pr):
    coords = [np.where(yhh > 0)[0][0] for yhh in yh]
    yh = [yhh[co:] for yhh, co in zip(yh, coords)]
    ypr = [prr[co:] for prr, co in zip(pr, coords)]
    fyh = [c for row in yh for c in row]
    fpr = [c for row in ypr for c in row]
    return fyh, fpr

In [5]:
raw = open('train.csv', 'r').readlines()
all_x = []
point = []
for line in raw:
    stripped_line = line.strip().split(',')
    point.append(stripped_line)
    if line == '""\r\n':
#         print "newline"
        all_x.append(point[:-1])
        point = []
all_x = all_x[:-1]
lengths = [len(x) for x in all_x]
# short_x = [x for x in all_x if len(x) < 64]
short_x = []
for l in all_x:
    short_x.extend(chunks(l, 64))

In [6]:
len(short_x)

3428

In [7]:
X = [[c[0] for c in x] for x in short_x]
y = [[c[1] for c in y] for y in short_x]

In [8]:
all_text = [c for x in X for c in x]
words = list(set(all_text))
word2ind = {word: index for index, word in enumerate(words)}
ind2word = {index: word for index, word in enumerate(words)}
labels = list(set([c for x in y for c in x]))
label2ind = {label: (index + 1) for index, label in enumerate(labels)}
ind2label = {(index + 1): label for index, label in enumerate(labels)}
print 'Input sequence length range: ', max(lengths), min(lengths)

Input sequence length range:  4741 58


In [9]:
maxlen = max([len(x) for x in X])
print 'Maximum sequence length:', maxlen

Maximum sequence length: 64


In [10]:
X_enc = [[word2ind[c] for c in x] for x in X]
X_enc_reverse = [[c for c in reversed(x)] for x in X_enc]
max_label = max(label2ind.values()) + 1
y_enc = [[0] * (maxlen - len(ey)) + [label2ind[c] for c in ey] for ey in y]
y_enc = [[encode(c, max_label) for c in ey] for ey in y_enc]

In [11]:
X_enc_f = pad_sequences(X_enc, maxlen=maxlen)
X_enc_b = pad_sequences(X_enc_reverse, maxlen=maxlen)
y_enc = pad_sequences(y_enc, maxlen=maxlen)

In [12]:
(X_train_f, X_test_f, X_train_b,
 X_test_b, y_train, y_test) = train_test_split(X_enc_f, X_enc_b, y_enc,
                                               test_size=11*32, train_size=45*32, random_state=42)
print 'Training and testing tensor shapes:'
print X_train_f.shape, X_test_f.shape, X_train_b.shape, X_test_b.shape, y_train.shape, y_test.shape

Training and testing tensor shapes:
(1440, 64) (352, 64) (1440, 64) (352, 64) (1440, 64, 8) (352, 64, 8)


In [13]:
max_features = len(word2ind)
embedding_size = 128
hidden_size = 32
out_size = len(label2ind) + 1

In [14]:
model_forward = Sequential()
model_forward.add(Embedding(max_features, embedding_size, input_length=maxlen, mask_zero=True))
model_forward.add(LSTM(hidden_size, return_sequences=True))  

In [15]:
model_backward = Sequential()
model_backward.add(Embedding(max_features, embedding_size, input_length=maxlen, mask_zero=True))
model_backward.add(LSTM(hidden_size, return_sequences=True))
model_backward.add(MaskLambda(function=reverse_func, mask_function=reverse_func))

In [16]:
model = Sequential()

model.add(Merge([model_forward, model_backward], mode='concat'))
model.add(TimeDistributedDense(out_size))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam')



In [17]:
batch_size = 32
model.fit([X_train_f, X_train_b], y_train, batch_size=batch_size, nb_epoch=40,
          validation_data=([X_test_f, X_test_b], y_test))
score = model.evaluate([X_test_f, X_test_b], y_test, batch_size=batch_size)
print('Raw test score:', score)

Train on 1440 samples, validate on 352 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
('Raw test score:', 0.15355714004148135)


In [19]:
pr = model.predict_classes([X_train_f, X_train_b])
yh = y_train.argmax(2)
fyh, fpr = score(yh, pr)
print 'Training accuracy:', accuracy_score(fyh, fpr)
print 'Training confusion matrix:'
print confusion_matrix(fyh, fpr)

Training accuracy: 0.999954909763
Training confusion matrix:
[[  371     0     2     0     0     0     0]
 [    0    41     1     0     0     0     0]
 [    0     0 82151     0     0     0     0]
 [    0     0     1    98     0     0     0]
 [    0     0     0     0    14     0     0]
 [    0     0     0     0     0   266     0]
 [    0     0     0     0     0     0  5766]]


In [20]:
pr = model.predict_classes([X_test_f, X_test_b])
yh = y_test.argmax(2)
fyh, fpr = score(yh, pr)
print 'Testing accuracy:', accuracy_score(fyh, fpr)
print 'Testing confusion matrix:'
print confusion_matrix(fyh, fpr)

Testing accuracy: 0.972151311209
Testing confusion matrix:
[[    0     0     0     0     0     0     0     0]
 [    0    53     0    22     0     0     0     7]
 [    0     0     2    11     0     0     0     0]
 [   19    17     0 19737     5     0    13   149]
 [    2     0     0    17    15     0     0     1]
 [    0     0     0     0     0     3     0     0]
 [    0     0     0    26     0     0    12     0]
 [   15     0     0   296     0     0     0  1123]]
