This is based on this code: https://github.com/codekansas/keras-language-modeling/blob/master/keras_models.py

In [3]:
%load_ext autoreload
%autoreload 2

In [68]:
import numpy as np
from collections import defaultdict
from keras.preprocessing import sequence
from keras.optimizers import SGD, RMSprop, Adagrad
from keras.utils import np_utils
import keras.layers.convolutional
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, TimeDistributedDense
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU
from Metrics import rpf1
from load_data import load_process_essays

from window_based_tagger_config import get_config
from IdGenerator import IdGenerator as idGen

import Settings
import logging

import datetime

## Load and Pre-Process Essays

In [19]:
%%time
from BrattEssay import load_bratt_essays
from load_data import load_process_essays
from collections import defaultdict
from IterableFP import flatten
from Settings import Settings

settings = Settings()
root_folder = settings.data_directory + "CoralBleaching/Thesis_Dataset/"
training_folder = root_folder + "Training" + "/"
config = get_config(training_folder)

essays = load_process_essays(**config)

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/
902 files found
902 essays processed
CPU times: user 44.5 s, sys: 181 ms, total: 44.7 s
Wall time: 45.9 s


In [10]:
print("Started at: " + str(datetime.datetime.now()))
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
logger = logging.getLogger()

TEST_SPLIT          = 0.2

Started at: 2017-03-04 21:23:03.025964


In [84]:
from numpy.random import shuffle
shuffle(essays)

## Prepare Tags

In [22]:
tagged_essays = essays
tag_freq = defaultdict(int)
for essay in tagged_essays:
    for sentence in essay.sentences:
        for word, tags in sentence:
            for tag in tags:
                tag_freq[tag] += 1

In [48]:
EMPTY_TAG = "Empty"
regular_tags = list((t for t in tag_freq.keys() if t[0].isdigit()))
vtags = set(regular_tags)
vtags.add(EMPTY_TAG)
vtags

{'1',
 '11',
 '12',
 '13',
 '14',
 '2',
 '3',
 '4',
 '5',
 '50',
 '5b',
 '6',
 '7',
 'Empty'}

# Transform Essays into Training Data (Word Ids)

In [144]:
from collections import defaultdict
generator = idGen()
xs = []
ys = []

START = "<start>"
END   = "<end>"

ix2tag = {}
for ix, t in enumerate(vtags):
    ix2tag[ix] = t

# cut texts after this number of words (among top max_features most common words)
maxlen = 0
ys_bytag = defaultdict(list)
for essay in tagged_essays:
    for sentence in essay.sentences:
        row = []
        y_found = False
        y_seq = []
        for word, tags in [(START, set())] + sentence + [(END, set())]:
            id = generator.get_id(word) + 1 #starts at 0, but 0 used to pad sequences
            row.append(id)
            
            tags = vtags.intersection(tags)
            if word != START and word != END:
                for t in (vtags - set([EMPTY_TAG])):
                    if t in tags:
                        ys_bytag[t].append(1)
                    else:
                        ys_bytag[t].append(0)
            
            if len(tags) > 1:
                most_common = max(tags, key=lambda t: tag_freq[t])
                tags = set([most_common])
            if len(tags) == 0:
                tags.add(EMPTY_TAG)
            
            one_hot = []
            for t in vtags:
                if t in tags:
                    one_hot.append(1)
                else:
                    one_hot.append(0)
            y_seq.append(one_hot)
    
        ys.append(y_seq)
        xs.append(row)
        maxlen = max(len(row), maxlen)

max_features=generator.max_id() + 2

## Create Train - Test Split

In [171]:
maxlen = maxlen
num_training = int((1.0 - TEST_SPLIT) * len(xs))

X_train_orig, y_train_orig, X_test_orig, y_test_orig = xs[:num_training], ys[:num_training], xs[num_training:], ys[num_training:]

def get_num_words(y_train):
    total = 0
    for row in y_train:
        total += len(row)-2 # subtract the start and end symbol
    return total

def split_dict(dct, train_split):
    td, vd = {}, {}
    for key, lst in dct.items():
        td[key] = lst[:train_split]
        vd[key] = lst[train_split:]
    return td, vd

num_train_wds = get_num_words(y_train_orig)
num_wds       = get_num_words(ys)
train_ys_bytag, test_ys_by_tag = split_dict(ys_bytag, num_train_wds)

In [172]:
code = '50'
print(num_train_wds, num_wds - num_train_wds, num_wds)
len(train_ys_bytag[code]),len(test_ys_by_tag[code]),len(ys_bytag[code]),

109536 27630 137166


(109536, 55260, 164796)

## <span style="color:red">Need to get the shapes above to align</span>

In [146]:
X_train = sequence.pad_sequences(X_train_orig, maxlen=maxlen) #30 seems good
X_test  = sequence.pad_sequences(X_test_orig,  maxlen=maxlen)

y_train = sequence.pad_sequences(y_train_orig, maxlen=maxlen)
y_test  = sequence.pad_sequences(y_test_orig,  maxlen=maxlen)

print('X_train shape:', X_train.shape)
print('X_test shape: ',  X_test.shape)
print()
print('y_train shape:', y_train.shape)
print('y_test shape: ',  y_test.shape)

X_train shape: (6633, 93)
X_test shape:  (1659, 93)

y_train shape: (6633, 93, 14)
y_test shape:  (1659, 93, 14)


In [147]:
embedding_size = 64
hidden_size    = 64
out_size = len(vtags)

model = Sequential()
model.add(Embedding(max_features, embedding_size, input_length=maxlen, mask_zero=True))
model.add(LSTM(hidden_size, return_sequences=True))  
#merge
model.add(TimeDistributedDense(out_size))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['fmeasure'], sample_weight_mode="temporal")



In [148]:
batch_size = 128
epochs = 5

results = model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=epochs, validation_split=0.2, verbose=1)
probs = model.predict_proba(X_test, batch_size=batch_size)    

Train on 5306 samples, validate on 1327 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [83]:
X_test.shape, probs.shape, maxlen

((1659, 93), (1659, 93, 14), 93)

In [101]:
print(len(X_test[0]))
X_test[0]

93


array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   1,   7, 116, 181,
       182,   7,  10,   4, 183,  30,  96,  20, 129, 184, 185, 186, 187,
        12,  13], dtype=int32)

In [93]:
np.asarray(X_test_orig[0])

array([  1,   7, 116, 181, 182,   7,  10,   4, 183,  30,  96,  20, 129,
       184, 185, 186, 187,  12,  13])

In [None]:
print(probs[0].shape)
probs[0][-14:,:]

In [151]:
def collapse_results(ys, probs):
    ixs = np.argmax(probs, axis=2)
    ys_by_tag = defaultdict(list)
    for i in range(len(ys)):
        row_ixs = ixs[i,:]
        y_seq = ys[i]
        pred_ys = [ix2tag[j] for j in row_ixs[-(len(y_seq)):]]   
        # skip the start and end label
        pred_ys = pred_ys[1:len(pred_ys)-1]
        for pred_tag in pred_ys:
            ys_bytag[pred_tag].append(1)
            # for all other tags, a 0
            for tag in(vtags - set([EMPTY_TAG, pred_tag])):
                ys_bytag[tag].append(0)
    return ys_bytag

test_pred_ys_by_tag = collapse_results(y_test_orig, probs)

In [156]:
total = 0
for row in y_test_orig:
    total += len(row)
total

30948

In [155]:
len(test_pred_ys_by_tag["5"]), len(test_ys_by_tag["5"])

(164796, 130533)

In [137]:
ixs = np.argmax(probs, axis=2)
i = 12
row_ixs = ixs[i,:]
y_seq = y_test_orig[i]
y_lbls = [ix2tag[j] for j in np.argmax(y_seq,axis=1)]
pred_ys = [ix2tag[j] for j in row_ixs[-(len(y_seq)):]]
len(y_seq),y_lbls, pred_ys

(10,
 ['Empty',
  'Empty',
  'Empty',
  'Empty',
  'Empty',
  'Empty',
  'Empty',
  'Empty',
  'Empty',
  'Empty'],
 ['Empty',
  'Empty',
  'Empty',
  'Empty',
  'Empty',
  'Empty',
  'Empty',
  'Empty',
  'Empty',
  'Empty'])

### TODO
* Look into masking
* Try Bi-Directional LSTM (see example below)
* Use early stopping criteria

In [None]:
""" Bi Directional LSTM example from - https://github.com/fchollet/keras/issues/3086

from keras.models import Model
import numpy as np
from keras.layers import Masking, Activation, Input, LSTM, merge
a = np.array([[[.3,.1,.2,.2,.1,.1],[.2,.3,.3,.3,.3,.1],[0,0,0,0,0,0]]])

inputs = Input(shape=(3,6))
mask = Masking(mask_value=0.0)(inputs)
fw = LSTM(1,return_sequences=True)(mask)
bw = LSTM(1,return_sequences=True,go_backwards=True)(mask)
merged = merge([fw,bw],mode='sum')
model = Model(input=inputs,output=fw)
model2 = Model(input=inputs,output=bw)
model3 = Model(input=inputs,output=merged)
"""