This is based on this code: https://github.com/codekansas/keras-language-modeling/blob/master/keras_models.py

In [1]:
#%load_ext autoreload
#%autoreload 2

## Note - To Get this working:

* Install CUDA and associated libraries, setup path
* Install bleeding edge theano (from src)
* Make sure the THEANO_FLAGS are set correctly via the environment var, or via the ~/.theanorc file
* Install and compile bleeding edge Keras (from src)
* `export KERAS_BACKEND=theano`
* `export KERAS_IMAGE_DIM_ORDERING='th'`
* `sh <project_root>/shell_scipts/setup_environment.sh` to install additional dependencies
* **DO NOT SET UNROLL=True** when creating RNNs - it causes a max recursion issue

## Trouble-Shooting

* You may need to clean the theano cache. To do so thoroughly, run this command from the shell:
 * `theano-cache purge`

In [2]:
import numpy as np
from collections import defaultdict
from keras.preprocessing import sequence
from keras.optimizers import SGD, RMSprop, Adagrad
from keras.utils import np_utils
import keras.layers.convolutional
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, TimeDistributedDense
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU
from Metrics import rpf1
from load_data import load_process_essays

#from gensim.models import Word2Vec
from window_based_tagger_config import get_config
from IdGenerator import IdGenerator as idGen
from results_procesor import ResultsProcessor, __MICRO_F1__
from Rpfa import micro_rpfa

import Settings
import logging

import datetime

Using Theano backend.
 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: GeForce GT 750M (CNMeM is disabled, cuDNN 5110)


In [3]:
#Helper Functions
def collapse_results(seq_lens, preds):
    """Turn per-sentence predicted label indices into per-tag binary lists.

    For every word of every sentence the predicted tag receives a 1 and every
    other non-empty tag receives a 0, so all returned lists grow in step.

    Relies on the notebook globals ix2tag, vtags and EMPTY_TAG.

    seq_lens: number of words per sentence (without the start / end markers).
    preds:    indexable as preds[i, :]; assumed to be a 2-D array of label
              indices, one row per sentence - TODO confirm against the caller.
    Returns a defaultdict(list) mapping tag -> list of 0/1 with the
    EMPTY_TAG entry removed.
    """
    assert len(seq_lens) == preds.shape[0], "Axis 1 size does not align"
    pred_ys_by_tag = defaultdict(list)
    for row_no, num_words in enumerate(seq_lens):
        # only the trailing (words + start + end) entries of the row are used
        window = preds[row_no, :][-(num_words + 2):]
        # map to tag names, then drop the start and end positions
        labels = [ix2tag[ix] for ix in window][1:-1]
        for predicted in labels:
            pred_ys_by_tag[predicted].append(1)
            # every remaining real tag is a negative for this word
            negatives = vtags - set([EMPTY_TAG, predicted])
            for other in negatives:
                pred_ys_by_tag[other].append(0)
        # the empty tag is never reported
        pred_ys_by_tag.pop(EMPTY_TAG, None)
    return pred_ys_by_tag

def split_dict(dct, train_split):
    """Split every list in dct at index train_split.

    Returns (head, tail): two dicts with the same keys as dct, holding
    lst[:train_split] and lst[train_split:] respectively.
    """
    head = {key: lst[:train_split] for key, lst in dct.items()}
    tail = {key: lst[train_split:] for key, lst in dct.items()}
    return head, tail

## Load and Pre-Process Essays

In [4]:
from Settings import Settings

# Create the project settings object that exposes the data directory.
settings = Settings()
# Bare expression: only displays the dataset folder path, nothing is assigned here.
settings.data_directory + "CoralBleaching/Thesis_Dataset/"

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/


'/Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/Thesis_Dataset/'

In [5]:
from BrattEssay import load_bratt_essays
from load_data import load_process_essays
from collections import defaultdict
from IterableFP import flatten
from Settings import Settings
import pickle

# Build the dataset paths used below; all of them live under one dataset folder.
settings = Settings()
data_dir = settings.data_directory
root_folder = data_dir + "CoralBleaching/Thesis_Dataset/"
training_folder = root_folder + "Training/"
# pickled copy of the training essays, stored directly in the dataset folder
training_pickled = root_folder + "training.pl"

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/


In [6]:
# Load the pickled training essays.
# The file is only read, so open it read-only ("rb"); "rb+" additionally
# demands write permission and fails on read-only files.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open(training_pickled, "rb") as f:
    tagged_essays = pickle.load(f)
# number of essays loaded
len(tagged_essays)

902

In [7]:
import datetime, logging
# Record when the run started, then configure root logging at INFO level.
started_at = datetime.datetime.now()
print("Started at: " + str(started_at))

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)
logger = logging.getLogger()

# Fraction of the sentences held out for testing.
TEST_SPLIT = 0.2

Started at: 2017-03-12 17:06:28.208447


In [8]:
from numpy.random import RandomState, shuffle

# Shuffle the essays in place; the train/test split made later is positional,
# so this ordering decides which sentences end up in the test set.
# A fixed seed makes that split (and the reported metrics) repeatable after a
# kernel restart. A private RandomState is used so numpy's global random state
# is left untouched.
SHUFFLE_SEED = 1
RandomState(SHUFFLE_SEED).shuffle(tagged_essays)

## Prepare Tags

In [9]:
# Count how often each tag occurs and collect the word vocabulary.
tag_freq = defaultdict(int)
unique_words = set()
tagged_words = (
    pair
    for essay in tagged_essays
    for sentence in essay.sentences
    for pair in sentence
)
for word, word_tags in tagged_words:
    unique_words.add(word)
    for tag in word_tags:
        tag_freq[tag] += 1

# Label used for words that carry none of the tags of interest.
EMPTY_TAG = "Empty"
# Only tags whose first character is a digit are kept.
regular_tags = [t for t in tag_freq.keys() if t[0].isdigit()]
vtags = set(regular_tags)
vtags.add(EMPTY_TAG)

# vocabulary size
len(unique_words)

1641

In [10]:
# Display the tag vocabulary in sorted order (vtags itself is an unordered set).
sorted(vtags)

['1',
 '11',
 '12',
 '13',
 '14',
 '2',
 '3',
 '4',
 '5',
 '50',
 '5b',
 '6',
 '7',
 'Empty']

# Transform Essays into Training Data (Word Ids)

In [11]:
from collections import defaultdict

# Maps words to integer ids and back.
# NOTE(review): seed=1 presumably makes ids start at 1 so that 0 stays free for
# padding - confirm against IdGenerator.
generator = idGen(seed=1) # important as we zero pad sequences

# Fix ONE tag ordering and use it for both the label-index lookup (ix2tag) and the
# one-hot columns built below. Iterating the raw set is self-consistent within a
# single process, but the iteration order of a set of strings can change between
# interpreter runs, which would make label indices differ from run to run.
tag_order = sorted(vtags)
ix2tag = dict(enumerate(tag_order))

def ids2tags(ids):
    """Map each id back to its key using the id generator."""
    return [generator.get_key(j) for j in ids]

def lbls2tags(ixs):
    """Map label indices to tag names via ix2tag."""
    return [ix2tag[ix] for ix in ixs]

xs = []   # one list of word ids per sentence (including START / END)
ys = []   # one list of one-hot label vectors per sentence

START = "<start>"
END   = "<end>"

# Tags that get a per-word binary label list (everything except the empty tag).
non_empty_tags = vtags - set([EMPTY_TAG])

# Length of the longest sequence seen, START / END markers included.
maxlen = 0
ys_bytag = defaultdict(list)   # tag -> flat list of 0/1, one entry per real word
seq_lens = []                  # number of real words per sentence
for essay in tagged_essays:
    for sentence in essay.sentences:
        row = []
        y_seq = []
        for word, tags in [(START, set())] + sentence + [(END, set())]:
            word_id = generator.get_id(word)
            row.append(word_id)

            # keep only the tags of interest
            tags = vtags.intersection(tags)
            # multi-label ground truth: recorded for real words only
            if word != START and word != END:
                for t in non_empty_tags:
                    ys_bytag[t].append(1 if t in tags else 0)

            # the network target is single-label: keep the most frequent tag
            # (sorted first so that frequency ties are broken deterministically)
            if len(tags) > 1:
                most_common = max(sorted(tags), key=lambda t: tag_freq[t])
                tags = set([most_common])
            if len(tags) == 0:
                tags.add(EMPTY_TAG)

            # one-hot vector in the same column order as ix2tag
            y_seq.append([1 if t in tags else 0 for t in tag_order])

        seq_lens.append(len(row)-2)
        ys.append(y_seq)
        xs.append(row)
        maxlen = max(len(row), maxlen)

max_features=len(generator.get_ids())+1 #Need plus one maybe due to masking of sequences

## Create Train - Test Split

In [14]:
# Positional split: the first (1 - TEST_SPLIT) fraction of sentences is the training set.
num_training = int((1.0 - TEST_SPLIT) * len(xs))

X_train_orig, X_test_orig = xs[:num_training], xs[num_training:]
y_train_orig, y_test_orig = ys[:num_training], ys[num_training:]

seq_len_train, seq_len_test = seq_lens[:num_training], seq_lens[num_training:]

# The per-tag label lists are flat (one entry per word across all sentences), so
# they are split at the number of training WORDS, not the number of sentences.
num_train_wds = sum(seq_len_train)
num_wds       = sum(seq_lens)
# That split is only valid if every per-tag list has exactly one label per word.
assert all(len(lbls) == num_wds for lbls in ys_bytag.values()), \
    "per-tag label lists are not aligned with the word counts"
train_ys_bytag, test_ys_by_tag = split_dict(ys_bytag, num_train_wds)

In [15]:
# Pad inputs and one-hot targets to a common length (maxlen) so they form arrays.
X_train, X_test = (
    sequence.pad_sequences(seqs, maxlen=maxlen)  #30 seems good
    for seqs in (X_train_orig, X_test_orig)
)
y_train, y_test = (
    sequence.pad_sequences(seqs, maxlen=maxlen)
    for seqs in (y_train_orig, y_test_orig)
)

# Report the resulting array shapes.
for label, arr in (('X_train shape:', X_train), ('X_test shape: ', X_test)):
    print(label, arr.shape)
print()
for label, arr in (('y_train shape:', y_train), ('y_test shape: ', y_test)):
    print(label, arr.shape)

X_train shape: (6633, 93)
X_test shape:  (1659, 93)

y_train shape: (6633, 93, 14)
y_test shape:  (1659, 93, 14)


## Single Layer RNN (GRU)

In [20]:
from keras.layers import Bidirectional

# Build a single-layer recurrent tagger: embedding -> GRU -> per-timestep softmax.
embedding_size = 128
hidden_size    = 128
# one output unit per tag (including the empty tag)
out_size = len(vtags)

model = Sequential()
# mask_zero=True: id 0 is the padding value produced by pad_sequences
model.add(Embedding(max_features, embedding_size, input_length=maxlen, mask_zero=True))

# Fast - consume_less="CPU - BUT DON'T UNROLL
# return_sequences=True so that every timestep (word) gets its own output
model.add(GRU(hidden_size, return_sequences=True, consume_less="cpu"))
#DON'T UNROLL - causes max recursion issue
#model.add(GRU(hidden_size, return_sequences=True, consume_less="cpu", unroll=True))
#model.add(GRU(hidden_size, return_sequences=True))

# project each timestep onto the tag set and normalise to a distribution
model.add(TimeDistributedDense(out_size))
model.add(Activation('softmax'))

# NOTE(review): sample_weight_mode="temporal" is set, but no sample weights are passed to fit() in this notebook
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=["categorical_crossentropy"], sample_weight_mode="temporal")



In [21]:
%%time
from datetime import datetime

logging.info("Starting")

batch_size = 128
epochs = 20 #20

results = model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=epochs, validation_split=0.2, verbose=1)
preds = model.predict_classes(X_test, batch_size=batch_size)   
test_pred_ys_by_tag = collapse_results(seq_len_test, preds)
class2metrics = ResultsProcessor.compute_metrics(test_ys_by_tag, test_pred_ys_by_tag)
micro_metrics = micro_rpfa(class2metrics.values())
micro_metrics
logging.info("Ended")

2017-03-12 17:17:47,442 : INFO : Starting


Train on 5306 samples, validate on 1327 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


2017-03-12 17:22:48,745 : INFO : Ended


CPU times: user 4min 47s, sys: 8.6 s, total: 4min 56s
Wall time: 5min 1s


In [24]:
# Display the micro-averaged metrics of the most recently evaluated model.
micro_metrics

Recall: 0.4592, Precision: 0.7257, F1: 0.5625, Accuracy: 0.9861, Codes:  6810

In [15]:
# Sanity check for one tag: predictions, gold labels and the total number of
# test words must all have the same length.
code = "50"
assert len(test_pred_ys_by_tag[code]) == sum(seq_len_test) == len(test_ys_by_tag[code])
# show the three lengths
len(test_pred_ys_by_tag[code]), sum(seq_len_test), len(test_ys_by_tag[code])

(26466, 26466, 26466)

## Bi-Directional RNN (GRU)

In [26]:
from keras.layers import Bidirectional

# Build a bi-directional recurrent tagger: embedding -> Bidirectional(GRU) -> per-timestep softmax.
embedding_size = 128
hidden_size    = 128
# one output unit per tag (including the empty tag)
out_size = len(vtags)

model = Sequential()
# mask_zero=True: id 0 is the padding value produced by pad_sequences
model.add(Embedding(max_features, embedding_size, input_length=maxlen, mask_zero=True))
#model.add(Bidirectional(LSTM(hidden_size, return_sequences=True, consume_less="cpu", unroll=True), merge_mode="sum"))
# merge_mode="sum": the forward and backward outputs are combined by summation
model.add(Bidirectional(GRU(hidden_size, return_sequences=True, consume_less="cpu"), merge_mode="sum"))

# project each timestep onto the tag set and normalise to a distribution
model.add(TimeDistributedDense(out_size))
model.add(Activation('softmax'))

# NOTE(review): sample_weight_mode="temporal" is set, but no sample weights are passed to fit() in this notebook
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['fmeasure'], sample_weight_mode="temporal")



In [29]:
%%time
# Train the bi-directional model, predict the test sentences and compute micro-averaged metrics.
# NOTE(review): the log message says "LSTM", but the model built above uses a GRU layer.
logging.info("Starting Training Bi-Directional LSTM")
batch_size = 128
epochs = 10 #20

results = model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=epochs, validation_split=0.2, verbose=1)
# predicted label index per timestep for each test sentence
preds = model.predict_classes(X_test, batch_size=batch_size)   
# convert to per-tag 0/1 lists so they can be compared with test_ys_by_tag
test_pred_ys_by_tag = collapse_results(seq_len_test, preds)
class2metrics = ResultsProcessor.compute_metrics(test_ys_by_tag, test_pred_ys_by_tag)
micro_metrics = micro_rpfa(class2metrics.values())
micro_metrics

2017-03-12 17:50:12,375 : INFO : Starting Training Bi-Directional LSTM


Train on 5306 samples, validate on 1327 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 4min 56s, sys: 8.15 s, total: 5min 5s
Wall time: 5min 5s


In [30]:
# Display the micro-averaged metrics of the most recently evaluated model.
micro_metrics

Recall: 0.8134, Precision: 0.8441, F1: 0.8284, Accuracy: 0.9935, Codes:  6810

Bi-Directional LSTM, 128 hidden, 128 embedding, 20 epochs, batch size = 128

Recall: 0.8146, Precision: 0.8155, **F1: 0.8151**, Accuracy: 0.9930, Codes:  6549

## Timings

In [31]:
# Show which python executable the shell resolves (records the environment used).
!which python

/Users/simon.hughes/anaconda3/envs/keras_and_theano_bleeding_edge/bin/python


## Try with Pre-Trained Embedding

## Load Glove 100 Dim Embeddings

In [13]:
# see /Users/simon.hughes/GitHub/NlpResearch/PythonNlpResearch/DeepLearning/WordVectors/pickle_glove_embedding.py
# for creating pre-filtered embeddings file
import pickle, os
# NOTE(review): despite the .txt extension this file is read as a pickle.
embeddings_file = "/Users/simon.hughes/data/word_embeddings/glove.6B/cb_dict_glove.6B.100d.txt"

# The file is only read, so open it read-only ("rb"); "rb+" additionally
# demands write permission and fails on read-only files.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open(embeddings_file, "rb") as f:
    cb_emb_index = pickle.load(f)

In [14]:
from numpy.linalg import norm

# Length (norm) of every pre-trained vector; the mean is reused later to scale
# randomly generated vectors.
norms = [norm(vec) for vec in cb_emb_index.values()]
mean_vec_len = np.mean(norms)
# mean / min / max vector length
np.mean(norms), np.min(norms), np.max(norms)

(5.3327198, 2.332164, 8.0000887)

In [15]:
# Words of our vocabulary that have no pre-trained vector.
missed = {wd for wd in unique_words if wd not in cb_emb_index}
# count missed, vocabulary size, and the missed share as a percentage
print(len(missed), len(unique_words), 100.0 * round(len(missed)/  len(unique_words),4), "%")

41 1641 2.5 %


In [16]:
# Filter down to just the items we want so matrix is not unnecessarily huge
small_emd_matrix = {}   # word -> vector for every word in our vocabulary
new_ones = {}           # subset: words that received a randomly generated vector
# all vectors are assumed to share the shape of the first one
shape = next(iter(cb_emb_index.values())).shape
for wd in unique_words:
    if wd in cb_emb_index:
        coeff = cb_emb_index[wd]
    else:
        # No pre-trained vector: draw a random one and rescale it to the mean
        # length of the pre-trained vectors.
        # Local names deliberately avoid `norm` and `v`: rebinding `norm` would
        # replace the function imported from numpy.linalg with a float.
        # NOTE(review): np.random.random is unseeded and yields only values in
        # [0, 1), so these vectors differ per run and are all-positive.
        rand_vec = np.random.random(shape)
        coeff = mean_vec_len*(rand_vec / np.linalg.norm(rand_vec))
        new_ones[wd] = coeff
    small_emd_matrix[wd] = coeff

In [17]:
# are random ones the correct size?
# Shows the norm of one randomly generated vector next to the mean norm of the
# pre-trained vectors; the two should match. Raises IndexError if new_ones is empty.
np.linalg.norm(list(new_ones.values())[0]), mean_vec_len

(5.3327198028564453, 5.3327198)

### Construct Embedding Matrix

In [18]:
# One row per word id; rows that never get a vector assigned below stay all-zero.
EMBEDDING_DIM = shape[0]
embedding_matrix = np.zeros((max_features, EMBEDDING_DIM))
for word in unique_words:
    row_ix = generator.get_id(word)
    vec = small_emd_matrix.get(word)
    if vec is None:
        # no vector for this word: leave its row as zeros
        continue
    embedding_matrix[row_ix] = vec
embedding_matrix.shape

(1644, 100)

In [19]:
# Spot check: the matrix row for "coral" must equal its source vector element-wise.
# The set collapses the element-wise comparison, so {True} means every element matches.
set(embedding_matrix[generator.get_id("coral")] == small_emd_matrix["coral"])

{True}

## Train Bi-Directional LSTM With Glove Embeddings

In [22]:
from keras.layers import Bidirectional

# Embedding layer initialised from the pre-trained matrix; trainable=True lets the
# vectors be tuned further during training.
embedding_layer = Embedding(max_features,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=maxlen,
                            trainable=True,
                            mask_zero=True) # id 0 is the padding value
embedding_size = 128   # not used by this model: the embedding width is EMBEDDING_DIM
hidden_size    = 128
# one output unit per tag (including the empty tag)
out_size = len(vtags)

model = Sequential()
model.add(embedding_layer)
# unroll is deliberately NOT set: this notebook's setup notes state that
# unroll=True causes a max recursion issue, and the earlier models avoid it too.
model.add(Bidirectional(LSTM(hidden_size, return_sequences=True, consume_less="cpu"), merge_mode="sum"))

# project each timestep onto the tag set and normalise to a distribution
model.add(TimeDistributedDense(out_size))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['fmeasure'], sample_weight_mode="temporal")



In [None]:
# Train the model with tunable pre-trained embeddings, predict the test sentences
# and compute micro-averaged metrics.
batch_size = 128
epochs = 15 #20

results = model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=epochs, validation_split=0.2, verbose=1)
# predicted label index per timestep for each test sentence
preds = model.predict_classes(X_test, batch_size=batch_size)   
# convert to per-tag 0/1 lists so they can be compared with test_ys_by_tag
test_pred_ys_by_tag = collapse_results(seq_len_test, preds)
class2metrics = ResultsProcessor.compute_metrics(test_ys_by_tag, test_pred_ys_by_tag)
micro_metrics = micro_rpfa(class2metrics.values())
micro_metrics

**Pre-Trained Embeddings - Tuned:**

Bi-Directional LSTM, 128 hidden, 128 embedding, 20 epochs, batch size = 128

Recall: 0.8063, Precision: 0.8098, F1: **0.8080**, Accuracy: 0.9929, Codes:  6604

## Keep Embeddings Fixed

In [71]:
# Set trainable to false
# Embedding layer initialised from the pre-trained matrix and kept fixed during training.
embedding_layer = Embedding(max_features,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=maxlen,
                            trainable=False,
                            mask_zero=True) # id 0 is the padding value
embedding_size = 128   # not used by this model: the embedding width is EMBEDDING_DIM
hidden_size    = 128
# one output unit per tag (including the empty tag)
out_size = len(vtags)

model = Sequential()
model.add(embedding_layer)
# unroll is deliberately NOT set: this notebook's setup notes state that
# unroll=True causes a max recursion issue, and the earlier models avoid it too.
model.add(Bidirectional(LSTM(hidden_size, return_sequences=True, consume_less="cpu"), merge_mode="sum"))

# project each timestep onto the tag set and normalise to a distribution
model.add(TimeDistributedDense(out_size))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['fmeasure'], sample_weight_mode="temporal")



In [73]:
# Train the model with fixed pre-trained embeddings, predict the test sentences
# and compute micro-averaged metrics.
batch_size = 128
epochs = 10 #20

results = model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=epochs, validation_split=0.2, verbose=1)
# predicted label index per timestep for each test sentence
preds = model.predict_classes(X_test, batch_size=batch_size)   
# convert to per-tag 0/1 lists so they can be compared with test_ys_by_tag
test_pred_ys_by_tag = collapse_results(seq_len_test, preds)
class2metrics = ResultsProcessor.compute_metrics(test_ys_by_tag, test_pred_ys_by_tag)
micro_metrics = micro_rpfa(class2metrics.values())
micro_metrics

Train on 5306 samples, validate on 1327 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


Recall: 0.7735, Precision: 0.8121, F1: 0.7923, Accuracy: 0.9925, Codes:  6604

### TODO
* Switch to Theano backend - much faster for RNNs apparently (on GPU in particular)
* Look into masking
* Try Bi-Directional LSTM (see example below)
* Use early stopping criteria
* Embeddings:
 * Don't remove low frequency words
 * Normalize all vector lengths

In [43]:
""" Bi Directional LSTM example from - https://github.com/fchollet/keras/issues/3086

from keras.models import Model
import numpy as np
from keras.layers import Masking, Activation, Input, LSTM, merge
a = np.array([[[.3,.1,.2,.2,.1,.1],[.2,.3,.3,.3,.3,.1],[0,0,0,0,0,0]]])

inputs = Input(shape=(3,6))
mask = Masking(mask_value=0.0)(inputs)
fw = LSTM(1,return_sequences=True)(mask)
bw = LSTM(1,return_sequences=True,go_backwards=True)(mask)
merged = merge([fw,bw],mode='sum')
model = Model(input=inputs,output=fw)
model2 = Model(input=inputs,output=bw)
model3 = Model(input=inputs,output=merged)
"""
None