This is based on this code: https://github.com/codekansas/keras-language-modeling/blob/master/keras_models.py
and also on this code: https://github.com/simonhughes22/PythonNlpResearch/blob/master/notebooks/SEARN/CB%20-%20Keras%20-%20Train%20Tagger%20and%20Save%20CV%20Predictions%20For%20Word%20Tags-NO%20EXPLICIT.ipynb

# Problem Statement
* The reason this notebook exists is that the tags used in the SEARN model, and thus also the CoRef model, do not seem to have been generated by the best model (they are well below the best reported results)
* I am attempting to rectify this here, and if it works, can potentially re-run SEARN experiments also

# TODO
* Change to use 128 hidden units not 256 (was actually optimal settings)
* Then Validate accuracy
* If that doesn't work, drop max len and ignore test data for the purposes of the CV run (on training data), then re-institute for training the test model

In [1]:
#%load_ext autoreload
#%autoreload 2

In [2]:
#Check mongo is running
def is_mongo_runnning():
    """Probe the default MongoDB server by running one query against it.

    Nothing is returned. The check is only useful through the exception that
    pymongo raises when the query cannot be served (presumably a server
    selection timeout after the 100 ms configured below -- confirm).
    """
    from pymongo import MongoClient

    mongo = MongoClient(serverSelectionTimeoutMS=100)
    collection = mongo.metrics_codes.get_collection("CB_TAGGING_TD_AVG_PERCEPTRON_MOST_COMMON_TAG")
    # Materialise the cursor so the query is really sent; the documents are discarded.
    list(collection.find({}))

In [3]:
is_mongo_runnning()

## Note - To Get this working:

* Install CUDA and associated libraries, setup path
* Install bleeding edge theano (from src)
* Make sure the THEANO_FLAGS are set correctly via the environment var, or via the ~/.theanorc file
* Install and compile bleeding edge Keras (from src)
* `export KERAS_BACKEND=theano`
* `export KERAS_IMAGE_DIM_ORDERING='th'`
* `sh <project_root>/shell_scipts/setup_environment.sh` to install additional dependencies
* **DO NOT SET UNROLL=True** when creating RNN's - causes max recursion issue

## Trouble-Shooting

* You may need to clean the theano cache. To do so thoroughly, run this command from the shell:
 * `theano-cache purge`

In [4]:
import numpy as np
import os
from collections import defaultdict
from joblib import Parallel, delayed
import dill

import keras
from keras.preprocessing import sequence
from keras.optimizers import SGD, RMSprop, Adagrad
from keras.utils import np_utils
from keras.layers import Bidirectional
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU
#from keras.layers import TimeDistributed
from keras.layers import TimeDistributedDense
from Metrics import rpf1
from load_data import load_process_essays
from wordtagginghelper import merge_dictionaries

#from gensim.models import Word2Vec
from window_based_tagger_config import get_config
from DirUtils import dir_exists
from IdGenerator import IdGenerator as idGen
from results_procesor import ResultsProcessor, __MICRO_F1__
from Rpfa import micro_rpfa
from collections import defaultdict

import Settings
import logging

import datetime

Using TensorFlow backend.
  from ._conv import register_converters as _register_converters


## Load and Pre-Process Essays

In [5]:
import pickle
from CrossValidation import cross_validation
from BrattEssay import load_bratt_essays
from load_data import load_process_essays
from collections import defaultdict
from IterableFP import flatten
from Settings import Settings
from Settings import Settings

# Number of cross-validation folds; passed to cv() when the folds are built
# and used as the loop bound wherever per-fold results are iterated.
CV_FOLDS = 5
# Fraction of the essays handed to train_dev_split() that ends up in the dev set.
DEV_SPLIT = 0.1

DATASET = "CoralBleaching"

# Settings() supplies the data directory; every path below is built from it
# by plain string concatenation, so each piece must carry its own "/".
settings = Settings()
root_folder = settings.data_directory + DATASET + "/Thesis_Dataset/"
training_folder = root_folder + "Training" + "/"
test_folder = root_folder + "Test" + "/"

# Pickled training essays (loaded with pickle.load in a later cell).
training_pickled = settings.data_directory + DATASET + "/Thesis_Dataset/training.pl"

# Folder that get_file_name() prefixes to every saved model file.
models_folder = root_folder + "Models/Bi-LSTM-4-Anaphora-Binary-Fixed/"

# `config` is copied into the parameters persisted with the CV results;
# `processor` is the handle used to persist and read back those results.
config = get_config(training_folder)
processor = ResultsProcessor(dbname="metrics_coref_rnn_fixed")

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/


## Check Folders are Valid and Exist

In [7]:
models_folder

'/Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/Thesis_Dataset/Models/Bi-LSTM-4-Anaphora-Binary-Fixed/'

In [9]:
def create_dir_if_missing(folder):
    """Make sure `folder` exists, creating it (with any missing parents) if needed.

    Prints a short progress message and asserts that the directory is present
    before returning.
    """
    already_there = dir_exists(folder)
    if not already_there:
        print("Dir missing, creating")
        os.makedirs(folder)
    # Verify the outcome rather than trusting makedirs silently.
    assert dir_exists(folder)
    print("Done")

In [10]:
create_dir_if_missing(models_folder)

Done


## Load Essays

In [11]:
# Load the pickled training essays. The file is only read, so open it in
# read-only binary mode ("rb"); the previous "rb+" also requested write access
# and would fail on a read-only file or directory.
# NOTE: pickle.load executes arbitrary code from the file -- only use it on
# files produced by this project.
with open(training_pickled, "rb") as f:
    tagged_essays = pickle.load(f)
len(tagged_essays)

902

In [12]:
# Build the loader configuration for the test folder and load the held-out
# test essays with it (the config dict is expanded into keyword arguments).
test_config = get_config(test_folder)
tagged_essays_test = load_process_essays(**test_config)

226 files found
226 essays processed


In [13]:
import datetime, logging
# Record when the run started, then configure root logging at INFO level with
# a "time : level : message" format.
print("Started at: " + str(datetime.datetime.now()))
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# Root logger (no name passed to getLogger).
logger = logging.getLogger()

Started at: 2018-09-01 11:32:25.471925


In [14]:
from numpy.random import shuffle
# Shuffle the training essays in place before any folds are built.
# NOTE(review): no random seed is set in the visible code, so the order (and
# therefore the fold membership) presumably changes from run to run -- confirm.
shuffle(tagged_essays)

## Prepare Tags

In [15]:
# Count how often each tag occurs and collect the vocabulary, using the
# training essays only (the equivalent pass over the test essays is disabled below).
tag_freq = defaultdict(int)
unique_words = set()
for essay in tagged_essays:
    for sentence in essay.sentences:
        # each sentence is a list of (word, set-of-tags) pairs
        for word, tags in sentence:
            unique_words.add(word)
            for tag in tags:
                tag_freq[tag] += 1

# for essay in tagged_essays_test:
#     for sentence in essay.sentences:
#         for word, tags in sentence:
#             unique_words.add(word)
#             for tag in tags:
#                 tag_freq[tag] += 1

# Label used for tokens that carry none of the modelled tags.
EMPTY_TAG = "Empty"
# Binary task: the only tag kept is "Anaphor" (if it was seen at all).
regular_tags = list((t for t in tag_freq.keys() if t == "Anaphor"))

# vtags = the full label set the model predicts over: the kept tags plus EMPTY_TAG.
vtags = set(regular_tags)
vtags.add(EMPTY_TAG)

# Fail early if the data contains no "Anaphor" tag.
assert "Anaphor" in regular_tags
len(unique_words)

1641

In [16]:
sorted(regular_tags)

['Anaphor']

In [18]:
sorted(vtags)

['Anaphor', 'Empty']

# Transform Essays into Training Data (Word Ids)

* Computes `xs`, `ys`, `ys_bytag` and `seq_lens`
* `ys_bytag` includes **all tags** and does **not** focus only on the most common tag
* `ys` only includes the most common tag (so we can use cross entropy)
* `seq_lens` is without the start and end tags included (so we have to map back and forth to maintain mappings)
* `ys_bytag` also excludes the START and END tokens

## Get Max Sequence Length, Generate All Ids

In [17]:
# Map class index -> tag. vtags is a set, so the indices follow the set's
# iteration order; get_training_data() builds its one-hot rows by iterating
# vtags as well, which is what keeps the two aligned.
# NOTE(review): that order is presumably not stable across interpreter
# sessions (string hashing) -- confirm before reusing saved models.
ix2tag = {}
for ix, t in enumerate(vtags):
    ix2tag[ix] = t

generator = idGen(seed=1)  # important as we zero pad sequences

# Register every training word with the id generator and track the longest
# sentence; the +2 leaves room for the start and end tokens added later.
maxlen = 0
for essay in tagged_essays:
    for sentence in essay.sentences:
        for word, tags in sentence:
            # NOTE: this rebinds the builtin name `id` at notebook scope.
            id = generator.get_id(word)  # starts at 0, but 0 used to pad sequences
        maxlen = max(maxlen, len(sentence) + 2)

def ids2tags(ids):
    """Translate a sequence of generator ids back into their keys, in order."""
    keys = []
    for item_id in ids:
        keys.append(generator.get_key(item_id))
    return keys

def lbls2tags(ixs):
    """Translate a sequence of class indices into tag names via `ix2tag`."""
    names = []
    for class_ix in ixs:
        names.append(ix2tag[class_ix])
    return names

maxlen

93

In [18]:
# Sentinel tokens that get_training_data() places before and after every sentence.
START = "<start>"
END = "<end>"

def get_training_data(tessays, max_seq_len=None):
    """Convert tagged essays into padded id / one-hot label arrays.

    Every sentence is wrapped in START / END tokens, each word is mapped to an
    id through the global `generator`, and each token receives exactly one
    label from `vtags` (EMPTY_TAG when no modelled tag applies, otherwise the
    tag with the highest `tag_freq`).

    Args:
        tessays: iterable of essays exposing `.sentences`, each sentence being
            a list of (word, set-of-tags) pairs.
        max_seq_len: length both outputs are padded to; falls back to the
            global `maxlen` when None.

    Returns:
        (xs, ys, ys_bytag, seq_lens) where `xs` / `ys` are the outputs of
        `sequence.pad_sequences`, `ys_bytag` maps each non-empty tag to a flat
        0/1 list over all real tokens (START / END excluded), and `seq_lens`
        holds each sentence's length without START / END.
    """
    target_len = maxlen if max_seq_len is None else max_seq_len
    # Tags that are scored individually: everything except the empty label.
    scored_tags = vtags - set([EMPTY_TAG])

    id_rows = []
    label_rows = []
    seq_lens = []
    ys_bytag = defaultdict(list)

    for essay in tessays:
        for sentence in essay.sentences:
            wrapped = [(START, set())] + sentence + [(END, set())]
            token_ids = []
            token_labels = []
            for word, raw_tags in wrapped:
                token_ids.append(generator.get_id(word))  # ids start above 0; 0 is the pad value

                # Causer:<tag> and Result:<tag> count as the bare tag.
                stripped = {t.replace("Causer:", "").replace("Result:", "") for t in raw_tags}
                # Keep only the tags the model knows about.
                kept = vtags.intersection(stripped)

                # Per-tag gold labels cover real words only, never START / END.
                if word not in (START, END):
                    for t in scored_tags:
                        ys_bytag[t].append(1 if t in kept else 0)

                # Collapse to a single label so cross entropy can be used.
                kept.discard("explicit")
                if len(kept) > 1:
                    kept = set([max(kept, key=lambda t: tag_freq[t])])
                if not kept:
                    kept.add(EMPTY_TAG)

                assert len(kept) == 1, "Wrong number of tags:" + str(kept)

                # One-hot over vtags, in the same iteration order used for ix2tag.
                token_labels.append([1 if t in kept else 0 for t in vtags])

            seq_lens.append(len(token_ids) - 2)
            label_rows.append(token_labels)
            id_rows.append(token_ids)

    xs = sequence.pad_sequences(id_rows, maxlen=target_len)
    ys = sequence.pad_sequences(label_rows, maxlen=target_len)
    assert xs.shape[0] == ys.shape[0], "Sequences should have the same number of rows"
    assert xs.shape[1] == ys.shape[1] == target_len, "Sequences should have the same lengths"
    return xs, ys, ys_bytag, seq_lens

## Create Train - Test Split

In [19]:
# ## Create Train - Test Split
# Helper Functions
def collapse_results(seq_lens, preds):
    """Flatten per-sentence class predictions into per-tag 0/1 lists.

    Args:
        seq_lens: sentence lengths without the start / end tokens, one per row
            of `preds`.
        preds: 2-D array of predicted class indices, one row per sentence.

    Returns:
        defaultdict mapping each non-empty tag to a flat list with one 0/1
        entry per real token; the EMPTY_TAG entry is never included.
    """
    assert len(seq_lens) == preds.shape[0], "Axis 1 size does not align"
    by_tag = defaultdict(list)
    for row, n_words in enumerate(seq_lens):
        padded_len = n_words + 2
        # Rows are left-padded, so the sentence occupies the tail of the row.
        labels = [ix2tag[j] for j in preds[row, :][-padded_len:]]
        # Drop the start and end positions.
        for predicted in labels[1:-1]:
            by_tag[predicted].append(1)
            # Every other non-empty tag gets a negative for this token.
            for other in (vtags - set([EMPTY_TAG, predicted])):
                by_tag[other].append(0)
    # The empty label is not scored.
    by_tag.pop(EMPTY_TAG, None)
    return by_tag

def train_dev_split(lst, dev_split):
    """Randomly split `lst` into a training part and a dev part.

    The input is copied before shuffling, so the caller's list keeps its
    order (previously it was shuffled in place as a side effect).

    Args:
        lst: sequence of items to split.
        dev_split: fraction (0..1) of the items that goes to the dev part.

    Returns:
        (train_items, dev_items) as two lists; the training part holds
        int((1 - dev_split) * len(lst)) items.
    """
    shuffled = list(lst)
    # random shuffle of the private copy only
    shuffle(shuffled)
    num_training = int((1.0 - dev_split) * len(shuffled))
    return shuffled[:num_training], shuffled[num_training:]

In [20]:
e = tagged_essays[0]
e.sentences[0]

[('not', set()),
 ('only', set()),
 ('is', set()),
 ('there', set()),
 ('many', set()),
 ('different', set()),
 ('corals', set()),
 ('living', set()),
 ('in', set()),
 ('the', set()),
 ('ocean', set()),
 (',', set()),
 ('but', set()),
 ('many', set()),
 ('different', set()),
 ('ways', set()),
 ('for', set()),
 ('the', set()),
 ('corals', {'50'}),
 ('to', {'50'}),
 ('become', {'50'}),
 ('bleached', {'50'}),
 ('.', set())]

In [21]:
%%time
# use this name for a different function later
from CrossValidation import cross_validation as cv

# Build the CV folds, then pre-compute the model inputs for every fold once so
# that all hyper-parameter settings train on identical data.
folds = cv(tagged_essays, CV_FOLDS)
# fold index -> (xs, ys, ys_bytag, seq_lens) as returned by get_training_data()
fold2training_data = {}
fold2dev_data = {}
fold2test_data = {}

# fold index -> the essay objects behind each of the splits above
fold2training_essays = {}
fold2dev_essays = {}
fold2test_essays = {}

for i, (essays_TD, essays_VD) in enumerate(folds):
    # further split into train and dev test
    essays_train, essays_dev = train_dev_split(essays_TD, DEV_SPLIT)
    fold2training_data[i] = get_training_data(essays_train)
    fold2dev_data[i]      = get_training_data(essays_dev)
    # Test Data
    fold2test_data[i]     = get_training_data(essays_VD)
    
    # also store essays
    fold2training_essays[i] = essays_train
    fold2dev_essays[i]      = essays_dev
    fold2test_essays[i]     = essays_VD

CPU times: user 3.67 s, sys: 99.5 ms, total: 3.77 s
Wall time: 3.79 s


In [22]:
# Make sure the generator is incremented on the test data too
# NOTE(review): the active call below runs over the *training* essays; the
# test-essay call is commented out, so at this point only training words (plus
# the start/end tokens) have ids. The results are discarded -- the call is made
# purely for its effect on `generator`.
_,_,_,_ = get_training_data(tagged_essays)
#_,_,_,_ = get_training_data(tagged_essays_test)

In [23]:
# with open(cv_folder + "td.dill", "wb") as f:
#     dill.dump(fold2training_data, f)

# with open(cv_folder + "td_essays.dill", "wb") as f:
#     dill.dump(fold2training_essays, f)

# with open(cv_folder + "devd.dill", "wb") as f:
#     dill.dump(fold2dev_data, f)

# with open(cv_folder + "devd_essays.dill", "wb") as f:
#     dill.dump(fold2dev_essays, f)
    
# with open(cv_folder + "vd.dill", "wb") as f:
#     dill.dump(fold2test_data, f)
    
# with open(cv_folder + "vd_essays.dill", "wb") as f:
#     dill.dump(fold2test_essays, f)
    
# with open(cv_folder + "generator.dill", "wb") as f:
#     dill.dump(generator, f)

# with open(cv_folder + "vtags.dill", "wb") as f:
#     dill.dump(vtags, f)

In [24]:
generator.get_id("coral"), generator.get_key(generator.get_id("coral"))

(26, 'coral')

## Load Glove 100 Dim Embeddings

In [25]:
# see /Users/simon.hughes/GitHub/NlpResearch/PythonNlpResearch/DeepLearning/WordVectors/pickle_glove_embedding.py
# for creating pre-filtered embeddings file
import pickle, os
from numpy.linalg import norm

# NOTE: despite the .txt extension this file is a pickle (a pre-filtered
# word -> vector index); it is read with pickle.load, not parsed as text.
embeddings_file = "/Users/simon.hughes/data/word_embeddings/glove.6B/cb_dict_glove.6B.100d.txt"
# read data file
# Opened read-only ("rb"): nothing is written, and the previous "rb+" needlessly
# required write permission on the file.
# NOTE: pickle.load can execute arbitrary code -- only load files created by
# this project's own embedding-pickling script.
with open(embeddings_file, "rb") as f:
    cb_emb_index = pickle.load(f)

In [26]:
# Report how much of the vocabulary has no pre-trained embedding:
# prints the miss count, the vocabulary size and the miss rate in percent.
missed = set()
for wd in unique_words:
    if wd not in cb_emb_index:
        missed.add(wd)
# The ratio is rounded to 4 decimals before scaling, i.e. the percentage has 2 decimals.
print(len(missed), len(unique_words), 100.0 * round(len(missed)/  len(unique_words),4), "%")

41 1641 2.5 %


### Construct Embedding Matrix

In [27]:
# Embedding width, read off the first vector stored in the embedding index.
EMBEDDING_DIM = list(cb_emb_index.values())[0].shape[0]

def get_embedding_matrix(words, idgenerator, max_features, init='uniform', unit_length=False):
    """Build the initial weight matrix for the embedding layer.

    Rows are indexed by the id that `idgenerator` assigns to each word. Words
    present in the global `cb_emb_index` get their pre-trained vector; all
    other rows keep the value chosen by `init`.

    Args:
        words: iterable of words to look up.
        idgenerator: object whose `get_id(word)` gives the row index.
        max_features: number of rows in the matrix.
        init: 'uniform' (values drawn from [-0.05, 0.05)) or 'zeros'.
            'normal' is recognised but not implemented.
        unit_length: when True, scale every row to unit L2 norm (all-zero
            rows are left as they are).

    Returns:
        Array of shape (max_features, embedding_dim).

    Raises:
        Exception: for init='normal' or an unrecognised `init` value.
    """
    dim = list(cb_emb_index.values())[0].shape[0]

    if init == 'normal':
        raise Exception("Need to compute the mean and sd")
    if init not in ('uniform', 'zeros'):
        raise Exception("Unknown init type")

    shape = (max_features, dim)
    if init == 'uniform':
        # NOTE: the max norms for these is quite low relative to the embeddings
        matrix = np.random.uniform(low=-0.05, high=0.05, size=shape)
    else:
        matrix = np.zeros(shape=shape, dtype=np.float32)

    for word in words:
        # get_id is called for every word, found or not, exactly as before.
        row = idgenerator.get_id(word)
        vector = cb_emb_index.get(word)
        if vector is None:
            # no pre-trained vector: the row keeps its initial value
            continue
        matrix[row] = vector

    if not unit_length:
        return matrix

    row_norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    # avoid dividing by zero for all-zero rows
    row_norms[row_norms == 0.0] = 1.0
    return matrix / row_norms

EMBEDDING_DIM

100

In [28]:
def score_predictions(model, xs, ys_by_tag, seq_len):
    """Predict classes for `xs` and score them against the per-tag gold labels.

    Uses the global `batch_size` for prediction.

    Returns:
        (micro_metrics, pred_ys_by_tag): the micro-averaged metrics object
        produced by `micro_rpfa`, and the per-tag 0/1 predictions produced by
        `collapse_results`.
    """
    predicted_classes = model.predict_classes(xs, batch_size=batch_size, verbose=0)
    predictions_by_tag = collapse_results(seq_len, predicted_classes)
    per_class_metrics = ResultsProcessor.compute_metrics(ys_by_tag, predictions_by_tag)
    return micro_rpfa(per_class_metrics.values()), predictions_by_tag

In [29]:
from keras.layers import Bidirectional

def get_ts():
    """Return the current time formatted as 'YYYY-MM-DD HH:MM:SS.ffffff'."""
    # Imported locally: the notebook-level `datetime` name cannot be relied on.
    import datetime as _datetime_module
    now = _datetime_module.datetime.now()
    return now.strftime('%Y-%m-%d %H:%M:%S.%f')

def get_file_ts():
    """Return the current time as 'YYYYMMDD_HHMMSS_ffffff' (safe for file names)."""
    # Imported locally: the notebook-level `datetime` name cannot be relied on.
    import datetime as _datetime_module
    now = _datetime_module.datetime.now()
    return now.strftime('%Y%m%d_%H%M%S_%f')

# Embedding width used when the embedding layer is NOT initialised from GloVe.
embedding_size = EMBEDDING_DIM
# Notebook-level default; the grid-search loop rebinds `hidden_size` later.
hidden_size    = 128
# One output unit per label in vtags.
out_size = len(vtags)
# Batch size for both model.fit and predict_classes in score_predictions().
batch_size = 128

get_ts(), get_file_ts()

('2018-09-01 11:32:54.882616', '20180901_113254_882699')

## Train Bi-Directional LSTM With Glove Embeddings

In [30]:
# Size of the embedding vocabulary: every id handed out so far plus a margin
# of 2 (id 0 is reserved for padding; the original author was unsure whether
# +1 would already be enough).
max_features=len(generator.get_ids())+2 #Need plus one maybe due to masking of sequences
max_features

1645

In [31]:
def get_file_signature(fold_ix, use_pretrained_embedding, bi_directional, num_rnns, merge_mode, hidden_size):
    """Encode the run settings as a '<name>-<value>_<name>-<value>...' string.

    `fold_ix` always comes first; the remaining settings follow in
    alphabetical order of their names. Settings whose value is None are left
    out, so passing fold_ix=None gives a fold-independent signature.
    """
    others = {
        "use_pretrained_embedding": use_pretrained_embedding,
        "bi_directional": bi_directional,
        "num_rnns": num_rnns,
        "merge_mode": merge_mode,
        "hidden_size": hidden_size,
    }
    ordered = [("fold_ix", fold_ix)]
    ordered.extend((name, others[name]) for name in sorted(others))
    pieces = [
        "{key}-{val}".format(key=name, val=str(value))
        for name, value in ordered
        if value is not None
    ]
    return "_".join(pieces)

def get_file_name(fold_ix, use_pretrained_embedding, bi_directional, num_rnns, merge_mode, hidden_size):
    """Return the .h5 path under `models_folder` for a model with these settings."""
    signature = get_file_signature(
        fold_ix, use_pretrained_embedding, bi_directional, num_rnns, merge_mode, hidden_size
    )
    return "".join([models_folder, signature]) + ".h5"

get_file_name(0, True, True, 2, "sum", hidden_size)

'/Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/Thesis_Dataset/Models/Bi-LSTM-4-Anaphora-Binary-Fixed/fold_ix-0_bi_directional-True_hidden_size-128_merge_mode-sum_num_rnns-2_use_pretrained_embedding-True.h5'

In [32]:
def evaluate_fold(fold_ix, use_pretrained_embedding, bi_directional, num_rnns, merge_mode, hidden_size):
    """Train a GRU tagger on one CV fold with early stopping and score it.

    The model is: embedding -> `num_rnns` (optionally bidirectional) GRU layers
    -> per-timestep dense layer -> softmax. Training data comes from the
    pre-computed `fold2training_data` / `fold2dev_data` / `fold2test_data`.

    Args:
        fold_ix: index of the fold to train and evaluate.
        use_pretrained_embedding: initialise the embedding layer from
            `get_embedding_matrix` when True, otherwise use Keras' default init.
        bi_directional: wrap each GRU in `Bidirectional` when True.
        num_rnns: number of stacked recurrent layers.
        merge_mode: passed to `Bidirectional`; unused when not bi-directional.
        hidden_size: number of units per GRU.

    Returns:
        (model, train_predictions_by_tag, test_predictions_by_tag,
        train_ys_by_tag, test_ys_by_tag), with the model carrying the weights
        of the epoch that scored best on the dev set.
    """
    # NOTE(review): nb_epoch, consume_less and TimeDistributedDense look like
    # Keras 1.x API -- confirm the installed Keras version before re-running.
    if use_pretrained_embedding:
        embedding_matrix = get_embedding_matrix(unique_words, generator, max_features, init='uniform',
                                                unit_length=False)
        embedding_layer = Embedding(max_features,
                                    EMBEDDING_DIM,
                                    weights=[embedding_matrix],
                                    input_length=maxlen,
                                    trainable=True,
                                    mask_zero=True)  # If false, initialize unfound words with all 0's
    else:
        embedding_layer = Embedding(max_features, embedding_size, input_length=maxlen, trainable=True, mask_zero=True)

    # Factory rather than a single layer object: each stacked layer must be a fresh instance.
    if bi_directional:
        rnn_layer_fact = lambda: Bidirectional(GRU(hidden_size, return_sequences=True, consume_less="cpu"),
                                               merge_mode=merge_mode)
    else:
        rnn_layer_fact = lambda: GRU(hidden_size, return_sequences=True, consume_less="cpu")

    model = Sequential()
    model.add(embedding_layer)
    for i in range(num_rnns):
        model.add(rnn_layer_fact())

    # One softmax over the labels at every timestep.
    model.add(TimeDistributedDense(out_size))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', sample_weight_mode="temporal")

    X_train, y_train, train_ys_by_tag, seq_len_train = fold2training_data[fold_ix]
    X_dev, y_dev, dev_ys_by_tag, seq_len_dev = fold2dev_data[fold_ix]
    X_test, y_test, test_ys_by_tag, seq_len_test = fold2test_data[fold_ix]

    # init loop vars
    # -1 sentinel: any real F1 (>= 0) counts as an improvement on the first epoch,
    # so best_weights is set as soon as one epoch has run.
    f1_scores = [-1]
    num_since_best_score = 0
    patience = 3
    best_weights = None

    # Train one epoch at a time (at most 30) so the dev F1 can be checked after each.
    for i in range(30):
        print("{ts}: Epoch={epoch}".format(ts=get_ts(), epoch=i))
        epochs = 1  # epochs per training instance
        results = model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=epochs, validation_split=0.0, verbose=0)
        micro_metrics, _ = score_predictions(model, X_dev, dev_ys_by_tag, seq_len_dev)

        f1_score = micro_metrics.f1_score
        best_f1_score = max(f1_scores)
        if f1_score <= best_f1_score:
            num_since_best_score += 1
        else:  # score improved
            num_since_best_score = 0
            best_weights = model.get_weights()

        f1_scores.append(f1_score)
        # Early stopping: give up after `patience` epochs without a new best dev F1.
        if num_since_best_score >= patience:
            break

    # load best weights
    model.set_weights(best_weights)
    train_micro_metrics, train_predictions_by_tag = score_predictions(model, X_train, train_ys_by_tag, seq_len_train)
    test_micro_metrics, test_predictions_by_tag = score_predictions(model, X_test, test_ys_by_tag, seq_len_test)
    return model, train_predictions_by_tag, test_predictions_by_tag, train_ys_by_tag, test_ys_by_tag

## Hyper Param Tuning

In [33]:
processor.dbname

'metrics_coref_rnn_fixed'

In [36]:
def cross_validation(use_pretrained_embedding, bi_directional, num_rnns, maerge_mode, hidden_size):
    """Run `evaluate_fold` on every CV fold, persist pooled results, return the models.

    For each of the `CV_FOLDS` folds the trained model is saved to the path
    given by `get_file_name`, and its gold labels / predictions are merged
    into run-wide dictionaries. The pooled training and validation results are
    then stored through `processor.persist_results`, and the validation
    micro F1 is read back and printed.

    Args:
        use_pretrained_embedding: forwarded to `evaluate_fold`.
        bi_directional: forwarded to `evaluate_fold`.
        num_rnns: forwarded to `evaluate_fold`.
        maerge_mode: merge mode for bidirectional layers. The name is a
            historical misspelling of `merge_mode`, kept so the signature
            stays unchanged.
        hidden_size: forwarded to `evaluate_fold`.

    Returns:
        dict mapping fold index -> trained model.
    """
    # Bind the argument to the name the body uses. Previously the body read
    # `merge_mode`, which is not a parameter, so it silently picked up the
    # notebook-global of that name instead of the value passed in.
    merge_mode = maerge_mode

    cv_wd_td_ys_by_tag, cv_wd_td_predictions_by_tag = defaultdict(list), defaultdict(list)
    cv_wd_vd_ys_by_tag, cv_wd_vd_predictions_by_tag = defaultdict(list), defaultdict(list)

    fold2model = {}
    for i in range(CV_FOLDS):
        result = evaluate_fold(i, use_pretrained_embedding, bi_directional, num_rnns, merge_mode, hidden_size)
        model, td_wd_predictions_by_code, vd_wd_predictions_by_code, wd_td_ys_bytag, wd_vd_ys_bytag = result

        # Pool this fold's gold labels and predictions into the run-wide dictionaries.
        merge_dictionaries(wd_td_ys_bytag, cv_wd_td_ys_by_tag)
        merge_dictionaries(wd_vd_ys_bytag, cv_wd_vd_ys_by_tag)
        merge_dictionaries(td_wd_predictions_by_code, cv_wd_td_predictions_by_tag)
        merge_dictionaries(vd_wd_predictions_by_code, cv_wd_vd_predictions_by_tag)

        fname = get_file_name(i, use_pretrained_embedding, bi_directional, num_rnns, merge_mode, hidden_size)
        model.save(fname)
        fold2model[i] = model

    SUFFIX = "_RNN_BINARY_HYPERPARAM_TUNING"
    CB_TAGGING_TD, CB_TAGGING_VD = "CB_TAGGING_TD" + SUFFIX, "CB_TAGGING_VD" + SUFFIX
    # Start from a copy of the loader config so the global `config` is not modified.
    parameters = dict(config)
    parameters["extractors"] = []
    parameters["min_feat_freq"] = 0

    parameters["use_pretrained_embedding"] = use_pretrained_embedding
    parameters["bi-directional"] = bi_directional
    parameters["hidden_size"] = hidden_size
    parameters["merge_mode"] = merge_mode
    parameters["num_rnns"] = num_rnns

    wd_algo = "RNN"
    wd_td_objectid = processor.persist_results(CB_TAGGING_TD, cv_wd_td_ys_by_tag, cv_wd_td_predictions_by_tag,
                                               parameters, wd_algo)
    wd_vd_objectid = processor.persist_results(CB_TAGGING_VD, cv_wd_vd_ys_by_tag, cv_wd_vd_predictions_by_tag,
                                               parameters, wd_algo)
    avg_f1 = float(processor.get_metric(CB_TAGGING_VD, wd_vd_objectid, __MICRO_F1__)["f1_score"])
    print("CV micro F1: {f1:.4f}".format(f1=avg_f1))
    return fold2model


In [37]:
i = 0

In [38]:
%%time
import warnings
# Silence all warnings for the duration of the grid search.
warnings.filterwarnings("ignore")

# Grid search: one full cross-validation run per parameter combination.
# The loop variables are notebook globals and keep their last values after the
# loop; later cells read them when building file names.
# `i` (initialised in the previous cell) only numbers the printed progress lines.
for use_pretrained_embedding in [True]: #, False]:
    for bi_directional in [True]:       #, False]:
        for num_rnns in [1, 2]:
            for merge_mode in ["sum"]:
                for hidden_size in [256]: # [64, 128, 256]:
                    i+=1
                    print("[{i}] Params {ts} - Embeddings={use_pretrained_embedding}, Bi-Direct={bi_directional} Num_Rnns={num_rnns} Hidden_Size={hidden_size}"\
                          .format(i=i, ts=get_ts(), use_pretrained_embedding=use_pretrained_embedding, bi_directional=bi_directional, num_rnns=num_rnns, hidden_size=hidden_size))
                    fold2model = cross_validation(use_pretrained_embedding, bi_directional, num_rnns, merge_mode, hidden_size)
                    

[1] Params 2018-09-01 12:17:56.671507 - Embeddings=True, Bi-Direct=True Num_Rnns=1 Hidden_Size=64
Instructions for updating:
keep_dims is deprecated, use keepdims instead


Instructions for updating:
keep_dims is deprecated, use keepdims instead


Instructions for updating:
keep_dims is deprecated, use keepdims instead


Instructions for updating:
keep_dims is deprecated, use keepdims instead


Instructions for updating:
keep_dims is deprecated, use keepdims instead


Instructions for updating:
keep_dims is deprecated, use keepdims instead


2018-09-01 12:17:57.632668: Epoch=0
2018-09-01 12:18:14.940454: Epoch=1
2018-09-01 12:18:27.517005: Epoch=2
2018-09-01 12:18:39.793032: Epoch=3
2018-09-01 12:18:59.465784: Epoch=0
2018-09-01 12:19:16.322303: Epoch=1
2018-09-01 12:19:29.228202: Epoch=2
2018-09-01 12:19:42.663677: Epoch=3
2018-09-01 12:20:03.340421: Epoch=0
2018-09-01 12:20:19.764903: Epoch=1
2018-09-01 12:20:31.981062: Epoch=2
2018-09-01 12:20:44.074663: Epoch=3
2018-09-01 12:21:04.624632: Epoch=0
2018-09-01 12:21:20.904372: Epoch=1
2018-09-01 12:21:33.790324: Epoch=2
2018-09-01 12:21:46.797867: Epoch=3
2018-09-01 12:22:09.039315: Epoch=0
2018-09-01 12:22:27.003270: Epoch=1
2018-09-01 12:22:41.022793: Epoch=2
2018-09-01 12:22:54.717539: Epoch=3
CV micro F1: 0.0000
[1] Params 2018-09-01 12:23:16.223296 - Embeddings=True, Bi-Direct=True Num_Rnns=1 Hidden_Size=64
[2] Params 2018-09-01 12:23:16.223406 - Embeddings=True, Bi-Direct=True Num_Rnns=1 Hidden_Size=128
2018-09-01 12:23:17.553502: Epoch=0
2018-09-01 12:23:49.070317:

KeyboardInterrupt: 

In [41]:
list(fold2model.keys())

[0, 1, 2, 3, 4]

In [42]:
# fold index -> predicted class indices for that fold's held-out essays,
# produced by the model trained on the other folds.
predicts_by_fold = {}
for fold_ix in range(CV_FOLDS):
    X_test,  y_test,  test_ys_bytag_con_sent,  seq_len_test  = fold2test_data[fold_ix]
    model = fold2model[fold_ix]
    # Despite the name, `probs` holds the output of predict_classes, not probabilities.
    probs = model.predict_classes(X_test)
    predicts_by_fold[fold_ix] = probs



In [43]:
# Folder that receives the pickled predictions. No cell visible in this
# notebook defines this name, so re-running from the top would raise a
# NameError here; the value below reproduces the path shown in this cell's output.
predictions_folder = root_folder + "Predictions/Bi-LSTM-4-Anaphora_Tags-Binary-Fixed/"
predictions_folder

'/Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/Thesis_Dataset/Predictions/Bi-LSTM-4-Anaphora_Tags-Binary-Fixed/'

In [44]:
def get_predictions_fname(fold, use_pretrained_embedding, bi_directional, num_rnns, merge_mode, hidden_size):
    """Return the .dill path under `predictions_folder` for one fold's predictions."""
    signature = get_file_signature(
        fold, use_pretrained_embedding, bi_directional, num_rnns, merge_mode, hidden_size
    )
    return "".join([predictions_folder, signature]) + ".dill"

# Write each fold's predictions to its own .dill file. The settings in the
# file name are the notebook globals left over from the grid-search loop,
# i.e. the last combination that was run.
for fold, preds in predicts_by_fold.items():
    fname = get_predictions_fname(fold, use_pretrained_embedding, bi_directional, num_rnns, merge_mode, hidden_size)
    with open(fname, "wb") as f:
        dill.dump(preds, f)

In [45]:
#Helper Functions
def predictions_to_tags(seq_lens, preds):
    """Turn padded class-index rows into one list of tag names per sentence.

    Args:
        seq_lens: sentence lengths without the start / end tokens, one per row
            of `preds`.
        preds: 2-D array of predicted class indices, one row per sentence.

    Returns:
        List with, for every sentence, the predicted tag name of each real
        token (the start and end positions are dropped).
    """
    assert len(seq_lens) == preds.shape[0], "Axis 1 size does not align"
    tagged_sentences = []
    for row, n_words in enumerate(seq_lens):
        padded_len = n_words + 2
        # Rows are left-padded, so the sentence sits at the end of the row.
        window = preds[row, :][-padded_len:]
        labels = [ix2tag[j] for j in window]
        # strip the start and end labels
        tagged_sentences.append(labels[1:-1])
    return tagged_sentences

## Assign Predicted Tags to Essay Objects

In [46]:
def assign_tags_to_essays(essays, preds, seq_len):
    """Attach predicted tags to each essay as `pred_tagged_sentences`.

    Predictions are consumed in essay order, sentence by sentence, so `preds`
    and `seq_len` must have been produced from these same essays in this order.
    Each essay is modified in place; nothing is returned.
    """
    tags_per_sentence = predictions_to_tags(seq_len, preds)
    cursor = 0
    for essay in essays:
        essay_tags = []
        for sent in essay.sentences:
            ptags = tags_per_sentence[cursor]
            cursor += 1
            assert len(ptags) == len(sent), "Sentence and tags don't align - ntags %i , len(sentence) %i" % ((len(ptags),len(sent)))
            essay_tags.append(ptags)
        assert len(essay_tags) == len(essay.sentences), "Lens differ"
        essay.pred_tagged_sentences = essay_tags
    # Every prediction row must have been consumed.
    assert cursor == len(tags_per_sentence), "Predictions don't align with sequence lens"

# Tag every training essay with the predictions from the fold in which it was
# held out, then collect all of them into one list.
all_ptagged_essays = []
for fold in fold2test_essays.keys():
    essays = fold2test_essays[fold]
    preds = predicts_by_fold[fold]
    _,_,_,seq_len = fold2test_data[fold]
    assign_tags_to_essays(essays, preds, seq_len)
    all_ptagged_essays.extend(essays)

# fold_ix=None leaves the fold out of the signature, giving one file for all folds.
fname = predictions_folder + "essays_train_" + get_file_signature(None, use_pretrained_embedding, bi_directional, num_rnns, merge_mode, hidden_size) + ".dill"
with open(fname, "wb") as f:
    dill.dump(all_ptagged_essays, f)

In [47]:
assert len(all_ptagged_essays) == len(tagged_essays)

In [48]:
# essay = all_ptagged_essays[0]
# essay.pred_tagged_sentences[0]

## Get Test Data Predictions

In [49]:
# seed to maxlen
# Extend the vocabulary, tag counts and id generator with the test essays, and
# grow the sequence length so the longest test sentence (plus start / end
# tokens) fits.
max_test_len = maxlen
for essay in tagged_essays_test:
    for sentence in essay.sentences:
        for word, tags in sentence:
            # Called only to register the word with the generator; the result
            # is no longer bound to `id`, which shadowed the builtin.
            generator.get_id(word)  # starts at 0, but 0 used to pad sequences
            unique_words.add(word)
            for tag in tags:
                tag_freq[tag] += 1
        max_test_len = max(max_test_len, len(sentence) + 2)

max_features_test = len(generator.get_ids()) + 2  # Need plus one maybe due to masking of sequences
# Dry run over the test essays. The second argument is the *sequence length*;
# it used to be max_features_test (the vocabulary size), which padded every
# sentence to the vocabulary size instead of max_test_len.
_, _, _, _ = get_training_data(tagged_essays_test, max_test_len)

maxlen, max_test_len, max_features, max_features_test

(93, 141, 1645, 1681)

In [50]:
# merge_mode is Bi-Directional only
def evaluate_test(num_rnns, merge_mode, hidden_size):
    """Train the final bidirectional GRU tagger on all training essays.

    The model is sized for the test data (`max_features_test` vocabulary rows,
    `max_test_len` timesteps) and always uses the pre-trained embeddings. The
    training essays are split into train / dev parts with `train_dev_split`,
    and training stops early once the dev micro F1 has not improved for
    3 consecutive epochs (30 epochs at most).

    Args:
        num_rnns: number of stacked bidirectional GRU layers.
        merge_mode: passed to `Bidirectional` to combine the two directions.
        hidden_size: number of units per GRU.

    Returns:
        The trained model, carrying the weights of the best dev-F1 epoch.
    """
    # NOTE(review): nb_epoch, consume_less and TimeDistributedDense look like
    # Keras 1.x API -- confirm the installed Keras version before re-running.
    embedding_matrix = get_embedding_matrix(unique_words, generator, max_features_test, init='uniform', unit_length=False)
    embedding_layer = Embedding(max_features_test,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=max_test_len,
                                trainable=True,
                                mask_zero=True)
    # Factory rather than a single layer object: each stacked layer must be a fresh instance.
    rnn_layer_fact = lambda: Bidirectional(GRU(hidden_size, return_sequences=True, consume_less="cpu"), merge_mode=merge_mode)

    model = Sequential()
    model.add(embedding_layer)
    for _ in range(num_rnns):
        model.add(rnn_layer_fact())

    # One softmax over the labels at every timestep.
    model.add(TimeDistributedDense(out_size))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', sample_weight_mode="temporal")

    essays_train, essays_dev = train_dev_split(tagged_essays, DEV_SPLIT)
    X_train, y_train, train_ys_bytag_con_sent, seq_len_train = get_training_data(essays_train, max_test_len)
    X_dev, y_dev, dev_ys_bytag_con_sent, seq_len_dev = get_training_data(essays_dev, max_test_len)

    # init loop vars
    # -1 sentinel: any real F1 (>= 0) counts as an improvement on the first epoch,
    # so best_weights is set as soon as one epoch has run.
    f1_scores = [-1]
    num_since_best_score = 0
    patience = 3
    best_weights = None

    for epoch in range(30):
        print("{ts}: Epoch={epoch}".format(ts=get_ts(), epoch=epoch))
        model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=1, validation_split=0.0, verbose=0)
        micro_metrics, _ = score_predictions(model, X_dev, dev_ys_bytag_con_sent, seq_len_dev)

        print(micro_metrics)
        print()

        f1_score = micro_metrics.f1_score
        if f1_score <= max(f1_scores):
            num_since_best_score += 1
        else:  # score improved
            num_since_best_score = 0
            best_weights = model.get_weights()

        f1_scores.append(f1_score)
        # Early stopping: give up after `patience` epochs without a new best dev F1.
        if num_since_best_score >= patience:
            break

    # Report the best score over *all* epochs. The old message used a maximum
    # taken before the last epoch's score was recorded, and labelled it with
    # `fold_ix`, a leftover notebook global unrelated to this test-model run.
    print("Test model - Best F1 Score={f1}".format(f1=max(f1_scores)))

    # load best weights
    model.set_weights(best_weights)
    return model

In [51]:
maxlen, max_test_len

(93, 141)

In [52]:
#test_model = evaluate_test(2, "sum", 256)
# Train the final model with whatever settings the grid-search loop left in
# these notebook globals (its last combination); they are printed first so the
# run records which settings were used.
print(num_rnns, merge_mode, hidden_size)
test_model = evaluate_test(num_rnns, merge_mode, hidden_size)

2 sum 256
2018-08-12 00:46:56.035541: Epoch=0
Recall: 0.0000, Precision: 0.0000, F1: 0.0000, Accuracy: 0.9975, Codes:    37

2018-08-12 00:52:43.990674: Epoch=1
Recall: 0.0000, Precision: 0.0000, F1: 0.0000, Accuracy: 0.9975, Codes:    37

2018-08-12 00:58:04.875319: Epoch=2
Recall: 0.0270, Precision: 1.0000, F1: 0.0526, Accuracy: 0.9976, Codes:    37

2018-08-12 01:03:25.930000: Epoch=3
Recall: 0.2973, Precision: 0.3929, F1: 0.3385, Accuracy: 0.9971, Codes:    37

2018-08-12 01:08:46.674905: Epoch=4
Recall: 0.1081, Precision: 0.8000, F1: 0.1905, Accuracy: 0.9977, Codes:    37

2018-08-12 01:14:07.278627: Epoch=5
Recall: 0.2162, Precision: 0.5000, F1: 0.3019, Accuracy: 0.9975, Codes:    37

2018-08-12 01:19:27.452569: Epoch=6
Recall: 0.2162, Precision: 0.5000, F1: 0.3019, Accuracy: 0.9975, Codes:    37

Fold[4] - Best F1 Score=0.3384615384615385


In [53]:
# Encode the held-out test essays at the test sequence length (the length the
# test model was built with) and predict a class index for every position.
X_test,  y_test,   test_ys_bytag_con_sent,   seq_len_test = get_training_data(tagged_essays_test, max_test_len)
test_preds = test_model.predict_classes(X_test)



In [54]:
test_preds.shape, len(test_ys_bytag_con_sent['4'])

((1918, 141), 0)

In [55]:
# Attach the predicted tags to the test essays (in place) and save them.
assign_tags_to_essays(tagged_essays_test, test_preds, seq_len_test)

# fold_ix=None: the test predictions are not tied to a CV fold.
fname = predictions_folder + "essays_test_" + get_file_signature(None, use_pretrained_embedding, bi_directional, num_rnns, merge_mode, hidden_size) + ".dill"
with open(fname, "wb") as f:
    dill.dump(tagged_essays_test, f)

In [56]:
e = tagged_essays_test[0]
list(zip(e.sentences[0],e.pred_tagged_sentences[0]))

[(('well', set()), 'Empty'),
 (('based', set()), 'Empty'),
 (('on', set()), 'Empty'),
 (('what', set()), 'Empty'),
 (('i', set()), 'Empty'),
 (('read', set()), 'Empty'),
 (('the', set()), 'Empty'),
 (('corals', {'50'}), 'Empty'),
 (('are', {'50'}), 'Empty'),
 (('INFREQUENT', {'50'}), 'Empty'),
 (('their', {'50'}), 'Empty'),
 (('colors', {'50'}), 'Empty'),
 ((',', set()), 'Empty'),
 (('coral', {'50'}), 'Empty'),
 (('bleaching', {'50'}), 'Empty'),
 (('are', set()), 'Empty'),
 (('a', set()), 'Empty'),
 (('serious', set()), 'Empty'),
 (('problem', set()), 'Empty'),
 (('with', set()), 'Empty'),
 (('a', set()), 'Empty'),
 (('serious', set()), 'Empty'),
 (('impact', set()), 'Empty'),
 (('on', set()), 'Empty'),
 (('the', set()), 'Empty'),
 (('worlds', set()), 'Empty'),
 (('coral', set()), 'Empty'),
 (('reefs', set()), 'Empty'),
 (('.', set()), 'Empty')]