This is based on this code: https://github.com/codekansas/keras-language-modeling/blob/master/keras_models.py

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
from collections import defaultdict
from keras.preprocessing import sequence
from keras.optimizers import SGD, RMSprop, Adagrad
from keras.utils import np_utils
import keras.layers.convolutional
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, TimeDistributedDense
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU
from Metrics import rpf1
from load_data import load_process_essays

from gensim.models import Word2Vec
from window_based_tagger_config import get_config
from IdGenerator import IdGenerator as idGen
from results_procesor import ResultsProcessor, __MICRO_F1__
from Rpfa import micro_rpfa

import Settings
import logging

import datetime

Using TensorFlow backend.


In [65]:
#Helper Functions
def collapse_results(seq_lens, preds, tag_lookup=None, all_tags=None, empty_tag=None):
    """Turn padded per-timestep class predictions into per-tag binary label lists.

    Args:
        seq_lens: number of real words in each sequence (start/end markers excluded).
        preds: 2-D numpy array of predicted class indices, one row per sequence.
            Each row is read from its right-hand end, so padding is expected on
            the left.
        tag_lookup: mapping of class index -> tag name. Defaults to the
            notebook-level `ix2tag`.
        all_tags: iterable of every tag name, including the empty tag. Defaults
            to the notebook-level `vtags`.
        empty_tag: name of the "no tag" class. Defaults to the notebook-level
            `EMPTY_TAG`.

    Returns:
        defaultdict(list) keyed by tag (the empty tag is never a key). For every
        real word, the predicted tag receives a 1 and every other non-empty tag
        receives a 0.

    Raises:
        AssertionError: if `seq_lens` and `preds` disagree on the number of rows.
    """
    # Fall back to the notebook globals so existing two-argument calls keep working.
    if tag_lookup is None:
        tag_lookup = ix2tag
    if all_tags is None:
        all_tags = vtags
    if empty_tag is None:
        empty_tag = EMPTY_TAG

    assert len(seq_lens) == preds.shape[0], "Axis 0 size does not align"

    # Loop-invariant: the tags that actually get a label list.
    real_tags = set(all_tags) - set([empty_tag])

    pred_ys_by_tag = defaultdict(list)
    for i, seq_len in enumerate(seq_lens):
        # +2 accounts for the start and end markers surrounding the real words
        len_of_sequence = seq_len + 2
        pred_ys = [tag_lookup[j] for j in preds[i, :][-len_of_sequence:]]
        # skip the start and end label
        for pred_tag in pred_ys[1:-1]:
            if pred_tag != empty_tag:
                pred_ys_by_tag[pred_tag].append(1)
            # for all other tags, a 0
            for tag in real_tags:
                if tag != pred_tag:
                    pred_ys_by_tag[tag].append(0)
    return pred_ys_by_tag

def split_dict(dct, train_split):
    """Split every list in `dct` at index `train_split`.

    Returns a pair of dicts with the same keys as `dct`: the first holds the
    leading `train_split` items of each list, the second holds the remainder.
    """
    head = {key: values[:train_split] for key, values in dct.items()}
    tail = {key: values[train_split:] for key, values in dct.items()}
    return head, tail

## Load and Pre-Process Essays

In [4]:
%%time
from BrattEssay import load_bratt_essays
from load_data import load_process_essays
from collections import defaultdict
from IterableFP import flatten
from Settings import Settings

# Build the path to the training essays from the project's Settings object.
# NOTE(review): the `from Settings import Settings` in this cell rebinds the name
# `Settings`, which the first import cell bound to the module via `import Settings`.
settings = Settings()
root_folder = settings.data_directory + "CoralBleaching/Thesis_Dataset/"
training_folder = root_folder + "Training" + "/"
# config is unpacked below as the keyword arguments of load_process_essays
config = get_config(training_folder)

# Each loaded essay is later read through `.sentences`, a list of (word, tags) pairs
tagged_essays = load_process_essays(**config)

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/
902 files found
902 essays processed
CPU times: user 52.7 s, sys: 619 ms, total: 53.3 s
Wall time: 55.4 s


In [5]:
# Record when this run started, then send INFO-level log records to the root logger
print("Started at: " + str(datetime.datetime.now()))
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
logger = logging.getLogger()

# Fraction of the sentences held out for testing (used when the train/test split is made)
TEST_SPLIT          = 0.2

Started at: 2017-03-08 19:56:28.021132


In [6]:
from numpy.random import shuffle

# Seed numpy's global RNG before shuffling so the essay order, and therefore the
# train/test split that is sliced from it further down, is the same on every
# Restart & Run All. Change SHUFFLE_SEED to draw a different split.
SHUFFLE_SEED = 42
np.random.seed(SHUFFLE_SEED)
shuffle(tagged_essays)  # shuffles the list in place

## Prepare Tags

In [30]:
# One pass over every (word, tags) pair: collect the vocabulary and count how
# often each tag is used.
tag_freq = defaultdict(int)
unique_words = set()
all_sentences = (sent for essay in tagged_essays for sent in essay.sentences)
for sentence in all_sentences:
    for word, tags in sentence:
        unique_words.add(word)
        for tag in tags:
            tag_freq[tag] += 1
len(unique_words)

1641

In [8]:
# Label used for words that carry none of the regular tags
EMPTY_TAG = "Empty"
# Regular tags are the ones whose first character is a digit
regular_tags = [t for t in tag_freq if t[0].isdigit()]
# Full label vocabulary: the regular tags plus the empty tag
vtags = set(regular_tags) | {EMPTY_TAG}
sorted(vtags)

['1',
 '11',
 '12',
 '13',
 '14',
 '2',
 '3',
 '4',
 '5',
 '50',
 '5b',
 '6',
 '7',
 'Empty']

# Transform Essays into Training Data (Word Ids)

In [9]:
from collections import defaultdict
generator = idGen(seed=1) # important as we zero pad sequences

# Class index -> tag name, in the iteration order of the vtags set
ix2tag = dict(enumerate(vtags))

def ids2tags(ids):
    """Look up the key for each id in `ids` using the notebook-level `generator`.

    NOTE(review): the generator is fed words in this notebook, so the keys are
    presumably words rather than tags, despite the function name -- confirm.
    """
    keys = []
    for j in ids:
        keys.append(generator.get_key(j))
    return keys

def lbls2tags(ixs):
    """Translate class indices into tag names via the notebook-level `ix2tag`."""
    tag_names = []
    for ix in ixs:
        tag_names.append(ix2tag[ix])
    return tag_names

# xs: one list of word ids per sentence (with START/END markers added)
# ys: one list of one-hot tag vectors per sentence, aligned with xs
xs = []
ys = []

START = "<start>"
END   = "<end>"

# Length of the longest row seen so far (START/END included)
maxlen = 0
# Per-tag 0/1 labels, one entry per real word (START/END are skipped)
ys_bytag = defaultdict(list)
# Number of real words in each sentence (START/END excluded)
seq_lens = []
# Loop-invariant: the tags that get a binary label for every real word
non_empty_tags = vtags - set([EMPTY_TAG])
for essay in tagged_essays:
    for sentence in essay.sentences:
        row = []
        y_seq = []
        for word, tags in [(START, set())] + sentence + [(END, set())]:
            # 0 is used to pad sequences; the generator was created with seed=1,
            # presumably so that no real word receives id 0 -- confirm
            word_id = generator.get_id(word)
            row.append(word_id)

            # keep only the tags that belong to the label vocabulary
            tags = vtags.intersection(tags)
            if word != START and word != END:
                for t in non_empty_tags:
                    ys_bytag[t].append(1 if t in tags else 0)

            # The sequence model predicts a single class per word, so keep only
            # the most frequent of several tags and fall back to the empty tag.
            if len(tags) > 1:
                most_common = max(tags, key=lambda t: tag_freq[t])
                tags = set([most_common])
            if len(tags) == 0:
                tags.add(EMPTY_TAG)

            # one-hot vector in vtags iteration order (the same order as ix2tag)
            y_seq.append([1 if t in tags else 0 for t in vtags])

        seq_lens.append(len(row)-2)
        ys.append(y_seq)
        xs.append(row)
        maxlen = max(len(row), maxlen)

max_features=len(generator.get_ids())+1 #Need plus one maybe due to masking of sequences

## Create Train - Test Split

In [10]:
# self-assignment: no effect other than requiring that maxlen is already defined
maxlen = maxlen
num_training = int((1.0 - TEST_SPLIT) * len(xs))

# Sentence-level split: the first num_training rows train, the rest test
X_train_orig, X_test_orig = xs[:num_training], xs[num_training:]
y_train_orig, y_test_orig = ys[:num_training], ys[num_training:]
seq_len_train = seq_lens[:num_training]
seq_len_test  = seq_lens[num_training:]

# The per-tag label lists hold one entry per word, so they are split at the
# number of words contained in the training sentences.
num_wds       = sum(seq_lens)
num_train_wds = sum(seq_len_train)
train_ys_bytag, test_ys_by_tag = split_dict(ys_bytag, num_train_wds)

### Validate Dictionary Splitting Code

In [11]:
code = '50'
# Values below should align
# a: expected (train, test, total) word counts; b: label-list lengths for `code`
a = (num_train_wds, num_wds - num_train_wds, num_wds)
b = tuple(len(labels[code]) for labels in (train_ys_bytag, test_ys_by_tag, ys_bytag))
print(*a)
print(*b)
assert a==b, "Splitting logic does not work correctly"

109660 27506 137166
109660 27506 137166


In [12]:
# Pad every sentence to the length of the longest one so the rows form a
# rectangular array. The padding value is 0 (Keras default); collapse_results
# reads predictions from the right-hand end of each row, i.e. it expects the
# padding on the left.
X_train = sequence.pad_sequences(X_train_orig, maxlen=maxlen) #30 seems good
X_test  = sequence.pad_sequences(X_test_orig,  maxlen=maxlen)

# The label sequences are padded the same way so they stay aligned with the inputs
y_train = sequence.pad_sequences(y_train_orig, maxlen=maxlen)
y_test  = sequence.pad_sequences(y_test_orig,  maxlen=maxlen)

print('X_train shape:', X_train.shape)
print('X_test shape: ',  X_test.shape)
print()
print('y_train shape:', y_train.shape)
print('y_test shape: ',  y_test.shape)

X_train shape: (6633, 93)
X_test shape:  (1659, 93)

y_train shape: (6633, 93, 14)
y_test shape:  (1659, 93, 14)


## Single Layer LSTM

In [13]:
from keras.layers import Bidirectional

# Model sizes; the output layer has one unit per tag in vtags (empty tag included)
embedding_size = 128
hidden_size    = 128
out_size = len(vtags)

model = Sequential()
# mask_zero=True: id 0 is treated as padding and masked in the layers that follow
model.add(Embedding(max_features, embedding_size, input_length=maxlen, mask_zero=True))
# return_sequences=True keeps one output per timestep, which per-word tagging needs
model.add(LSTM(hidden_size, return_sequences=True, consume_less="cpu", unroll=True))

# Per-timestep dense layer + softmax -> a distribution over the tags for every word
model.add(TimeDistributedDense(out_size))
model.add(Activation('softmax'))

# NOTE(review): sample_weight_mode="temporal" enables per-timestep sample weights,
# but no sample_weight is passed to fit() in this notebook -- confirm it is needed.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=["categorical_crossentropy"], sample_weight_mode="temporal")



In [79]:
batch_size = 128
epochs = 5 #20

# Train, holding out 20% of the training rows for validation
results = model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=epochs, validation_split=0.2, verbose=1)
# Predicted class index for every timestep of every test row
preds = model.predict_classes(X_test, batch_size=batch_size)   
# Strip padding and start/end markers, then convert to per-tag 0/1 lists
test_pred_ys_by_tag = collapse_results(seq_len_test, preds)
# Per-tag metrics from gold vs. predicted labels, then the micro-average across tags
class2metrics = ResultsProcessor.compute_metrics(test_ys_by_tag, test_pred_ys_by_tag)
micro_metrics = micro_rpfa(class2metrics.values())
micro_metrics

Train on 5306 samples, validate on 1327 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Recall: 0.3895, Precision: 0.6873, F1: 0.4972, Accuracy: 0.9846, Codes:  7194

In [15]:
code = "50"
# Predicted labels, words in the test sentences and gold labels must all line up
n_pred = len(test_pred_ys_by_tag[code])
n_words = sum(seq_len_test)
n_gold = len(test_ys_by_tag[code])
assert n_pred == n_words == n_gold
n_pred, n_words, n_gold

(26466, 26466, 26466)

## Bi-Directional LSTM

In [None]:
from keras.layers import Bidirectional

# Same sizes as the single-layer model; one output unit per tag in vtags
embedding_size = 128
hidden_size    = 128
out_size = len(vtags)

model = Sequential()
# mask_zero=True: id 0 is treated as padding and masked in the layers that follow
model.add(Embedding(max_features, embedding_size, input_length=maxlen, mask_zero=True))
# Forward and backward LSTMs over the sentence; merge_mode="sum" adds their outputs
model.add(Bidirectional(LSTM(hidden_size, return_sequences=True, consume_less="cpu", unroll=True), merge_mode="sum"))

# Per-timestep dense layer + softmax -> a distribution over the tags for every word
model.add(TimeDistributedDense(out_size))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['fmeasure'], sample_weight_mode="temporal")

In [None]:
batch_size = 128
epochs = 5 #20

# Train, holding out 20% of the training rows for validation
results = model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=epochs, validation_split=0.2, verbose=1)
# Predicted class index for every timestep of every test row
preds = model.predict_classes(X_test, batch_size=batch_size)   
# Strip padding and start/end markers, then convert to per-tag 0/1 lists
test_pred_ys_by_tag = collapse_results(seq_len_test, preds)
# Per-tag metrics from gold vs. predicted labels, then the micro-average across tags
class2metrics = ResultsProcessor.compute_metrics(test_ys_by_tag, test_pred_ys_by_tag)
micro_metrics = micro_rpfa(class2metrics.values())
micro_metrics

Bi-Directional LSTM, 128 hidden, 128 embedding, 20 epochs, batch size = 128 # 30 epochs worse

Recall: 0.8146, Precision: 0.8155, **F1: 0.8151**, Accuracy: 0.9930, Codes:  6549

## Try with Pre-Trained Embedding

## Load Glove 100 Dim Embeddings

In [15]:
import os

GLOVE_DIR = "/Users/simon.hughes/data/word_embeddings/glove.6B"
# word -> float32 vector of GloVe coefficients
embeddings_index = {}
# `with` closes the file even if parsing a line raises. The encoding is given
# explicitly so the read does not depend on the platform's default locale.
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        # each line is: <word> <coef_1> ... <coef_n>, whitespace separated
        values = line.split()
        word = values[0]
        #TODO - normalize to unit_vector here?
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [43]:
from numpy.linalg import norm

# Euclidean length of every pre-trained vector; the mean length is reused later
# to scale the random vectors created for words that GloVe does not cover.
v = list(embeddings_index.values())
norms = [norm(vec) for vec in v]
mean_vec_len = np.mean(norms)
mean_vec_len, np.min(norms), np.max(norms)

(3.9767718, 0.037173089, 12.311416)

In [42]:
# Corpus words that have no pre-trained vector, with their share of the vocabulary
missed = {wd for wd in unique_words if wd not in embeddings_index}
print(len(missed), len(unique_words), 100.0 * round(len(missed)/  len(unique_words),4), "%")

41 1641 2.5 %


In [58]:
# Filter down to just the items we want so matrix is not unnecessarily huge
small_emd_matrix = {}
new_ones = {}
shape = list(embeddings_index.values())[0].shape
for wd in unique_words:
    if wd in embeddings_index:
        coeff = embeddings_index[wd]
    else:
        v = np.random.random(shape)
        norm = np.linalg.norm(v)
        coeff = mean_vec_len*(v / norm)
        new_ones[wd] = coeff
    small_emd_matrix[wd] = coeff

In [62]:
# Sanity check: a randomly generated vector should have the same length as
# mean_vec_len (up to floating-point precision). Fails if new_ones is empty.
np.linalg.norm(list(new_ones.values())[0]), mean_vec_len

(3.9767718315124516, 3.9767718)

### Construct Embedding Matrix

In [61]:
# Build the weight matrix for the Embedding layer: row i holds the vector of the
# word whose generator id is i.
EMBEDDING_DIM = shape[0]
embedding_matrix = np.zeros((max_features, EMBEDDING_DIM))
for word in unique_words:
    i = generator.get_id(word)
    embedding_vector = small_emd_matrix.get(word)
    if embedding_vector is not None:
        # NOTE(review): small_emd_matrix is filled for every word in unique_words
        # in the previous cell, so this guard looks always true. Rows whose id is
        # never visited here (the padding row 0 and, presumably, the START/END
        # markers) stay all-zero.
        embedding_matrix[i] = embedding_vector
embedding_matrix.shape

(1644, 100)

In [70]:
# Spot check: the matrix row for "coral" equals its stored vector element-wise
# (the resulting set is {True} when every component matches).
set(embedding_matrix[generator.get_id("coral")] == small_emd_matrix["coral"])

{True}

## Train Bi-Directional LSTM With Glove Embeddings

In [63]:
# Embedding layer initialised from the GloVe-based matrix and fine-tuned during
# training (trainable=True). Its width is EMBEDDING_DIM, not embedding_size.
# NOTE(review): the trailing comment on mask_zero talks about initialising
# unfound words; mask_zero itself only controls masking of the padding id 0.
embedding_layer = Embedding(max_features,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=maxlen,
                            trainable=True,
                            mask_zero=True) # If false, initialize unfound words with all 0's
# NOTE(review): embedding_size is assigned but not used by this model
embedding_size = 128
hidden_size    = 128
out_size = len(vtags)

model = Sequential()
model.add(embedding_layer)
# Forward and backward LSTMs over the sentence; merge_mode="sum" adds their outputs
model.add(Bidirectional(LSTM(hidden_size, return_sequences=True, consume_less="cpu", unroll=True), merge_mode="sum"))

# Per-timestep dense layer + softmax -> a distribution over the tags for every word
model.add(TimeDistributedDense(out_size))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['fmeasure'], sample_weight_mode="temporal")



In [67]:
batch_size = 128
epochs = 15 #20

# Train, holding out 20% of the training rows for validation
results = model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=epochs, validation_split=0.2, verbose=1)
# Predicted class index for every timestep of every test row
preds = model.predict_classes(X_test, batch_size=batch_size)   
# Strip padding and start/end markers, then convert to per-tag 0/1 lists
test_pred_ys_by_tag = collapse_results(seq_len_test, preds)
# Per-tag metrics from gold vs. predicted labels, then the micro-average across tags
class2metrics = ResultsProcessor.compute_metrics(test_ys_by_tag, test_pred_ys_by_tag)
micro_metrics = micro_rpfa(class2metrics.values())
micro_metrics

Train on 5306 samples, validate on 1327 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


Recall: 0.8063, Precision: 0.8098, F1: 0.8080, Accuracy: 0.9929, Codes:  6604

**Pre-Trained Embeddings - Tuned:**

Bi-Directional LSTM, 128 hidden, 100-dim GloVe embedding (trainable), 15 epochs, batch size = 128

Recall: 0.8063, Precision: 0.8098, F1: **0.8080**, Accuracy: 0.9929, Codes:  6604

## Keep Embeddings Fixed

In [None]:
# Set trainable to false
# Same architecture as the fine-tuned GloVe model, but the embedding weights are
# frozen. The layer width is EMBEDDING_DIM, not embedding_size.
# NOTE(review): the trailing comment on mask_zero talks about initialising
# unfound words; mask_zero itself only controls masking of the padding id 0.
embedding_layer = Embedding(max_features,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=maxlen,
                            trainable=False,
                            mask_zero=True) # If false, initialize unfound words with all 0's
# NOTE(review): embedding_size is assigned but not used by this model
embedding_size = 128
hidden_size    = 128
out_size = len(vtags)

model = Sequential()
model.add(embedding_layer)
# Forward and backward LSTMs over the sentence; merge_mode="sum" adds their outputs
model.add(Bidirectional(LSTM(hidden_size, return_sequences=True, consume_less="cpu", unroll=True), merge_mode="sum"))

# Per-timestep dense layer + softmax -> a distribution over the tags for every word
model.add(TimeDistributedDense(out_size))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['fmeasure'], sample_weight_mode="temporal")



In [None]:
batch_size = 128
epochs = 5 #20

# Train, holding out 20% of the training rows for validation
results = model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=epochs, validation_split=0.2, verbose=1)
# Predicted class index for every timestep of every test row
preds = model.predict_classes(X_test, batch_size=batch_size)   
# Strip padding and start/end markers, then convert to per-tag 0/1 lists
test_pred_ys_by_tag = collapse_results(seq_len_test, preds)
# Per-tag metrics from gold vs. predicted labels, then the micro-average across tags
class2metrics = ResultsProcessor.compute_metrics(test_ys_by_tag, test_pred_ys_by_tag)
micro_metrics = micro_rpfa(class2metrics.values())
micro_metrics

Train on 5306 samples, validate on 1327 samples
Epoch 1/5


### TODO
* Switch to Theano backend - apparently much faster for RNNs (on GPU in particular)
* Look into masking
* Try Bi-Directional LSTM (see example below)
* Use early stopping criteria
* Embeddings:
 * Don't remove low frequency words
 * Normalize all vector lengths

In [43]:
""" Bi Directional LSTM example from - https://github.com/fchollet/keras/issues/3086

from keras.models import Model
import numpy as np
from keras.layers import Masking, Activation, Input, LSTM, merge
a = np.array([[[.3,.1,.2,.2,.1,.1],[.2,.3,.3,.3,.3,.1],[0,0,0,0,0,0]]])

inputs = Input(shape=(3,6))
mask = Masking(mask_value=0.0)(inputs)
fw = LSTM(1,return_sequences=True)(mask)
bw = LSTM(1,return_sequences=True,go_backwards=True)(mask)
merged = merge([fw,bw],mode='sum')
model = Model(input=inputs,output=fw)
model2 = Model(input=inputs,output=bw)
model3 = Model(input=inputs,output=merged)
"""
None