This is based on this code: https://github.com/codekansas/keras-language-modeling/blob/master/keras_models.py

In [2]:
#%load_ext autoreload
#%autoreload 2

In [3]:
#Check mongo is running
import pymongo
client = pymongo.MongoClient()
db = client.metrics

## Note - To Get this working:

* Install CUDA and associated libraries, setup path
* Install bleeding edge theano (from src)
* Make sure the THEANO_FLAGS are set correctly via the environment var, or via the ~/.theanorc file
* Install and compile bleeding edge Keras (from src)
* `export KERAS_BACKEND=theano`
* `export KERAS_IMAGE_DIM_ORDERING='th'`
* `sh <project_root>/shell_scipts/setup_environment.sh` to install additional dependencies
* **DO NOT SET UNROLL=True** when creating RNN's - causes max recursion issue

## Trouble-Shooting

* You may need to clean the theano cache. To do so thoroughly, run this command from the shell:
 * `theano-cache purge`

In [4]:
import numpy as np
from collections import defaultdict
from joblib import Parallel, delayed
import dill

import keras
from keras.preprocessing import sequence
from keras.optimizers import SGD, RMSprop, Adagrad
from keras.utils import np_utils
from keras.layers import Bidirectional
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, TimeDistributedDense
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU
from Metrics import rpf1
from load_data import load_process_essays
from wordtagginghelper import merge_dictionaries

#from gensim.models import Word2Vec

Using TensorFlow backend.
  from ._conv import register_converters as _register_converters


In [5]:
from window_based_tagger_config import get_config
from IdGenerator import IdGenerator as idGen
from results_procesor import ResultsProcessor, __MICRO_F1__
from Rpfa import micro_rpfa
from collections import defaultdict

import Settings
import logging

import datetime

## Load and Pre-Process Essays

In [6]:
import pickle
from CrossValidation import cross_validation
from BrattEssay import load_bratt_essays
from load_data import load_process_essays
from collections import defaultdict
from IterableFP import flatten
from Settings import Settings
from Settings import Settings

CV_FOLDS = 5
DEV_SPLIT = 0.1

settings = Settings()
root_folder = settings.data_directory + "CoralBleaching/Thesis_Dataset/"
training_folder = root_folder + "Training" + "/"
test_folder = root_folder + "Test" + "/"
training_pickled = settings.data_directory + "CoralBleaching/Thesis_Dataset/training.pl"
models_folder = root_folder + "Models/Bi-LSTM_Stacked/"
cv_folder = root_folder + "CV_Data_Pickled/"

config = get_config(training_folder)
processor = ResultsProcessor(dbname="metrics_causal2")

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/


In [7]:
print(training_pickled)

/Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/Thesis_Dataset/training.pl


In [8]:
with open(training_pickled, "rb+") as f:
    tagged_essays = pickle.load(f)
len(tagged_essays)

902

In [9]:
test_config = get_config(test_folder)
tagged_essays_test = load_process_essays(**test_config)

226 files found
226 essays processed


In [10]:
import datetime, logging
print("Started at: " + str(datetime.datetime.now()))
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
logger = logging.getLogger()

Started at: 2019-04-17 17:13:08.914499


In [11]:
from numpy.random import shuffle
shuffle(tagged_essays)

## Prepare Tags

In [12]:
tag_freq = defaultdict(int)
unique_words = set()

for essay in tagged_essays:
    for sentence in essay.sentences:
        for word, tags in sentence:
            unique_words.add(word)
            for tag in tags:
                tag_freq[tag] += 1

# This caused some discrepancies with the other models. I believe this is here to prevent errors
# with some of the later code, but unfortunately it potentially breaks the micro-metrics
for essay in tagged_essays_test:
    for sentence in essay.sentences:
        for word, tags in sentence:
            unique_words.add(word)
            for tag in tags:
                tag_freq[tag] += 1

EMPTY_TAG = "Empty"
regular_tags = list((t for t in tag_freq.keys() if t[0].isdigit()))
cr_tags = list((t for t in tag_freq.keys() if ( "->" in t) and 
                not "Anaphor" in t and 
                not "other" in t and 
                not "rhetorical" in t and
                not "factor" in t and 
                1==1
               ))

vtags = set(regular_tags)
vtags.add(EMPTY_TAG)

cr_vtags = set(cr_tags)
cr_vtags.add(EMPTY_TAG)

len(unique_words), len(regular_tags), len(cr_tags)

(1677, 13, 86)

In [13]:
sorted(regular_tags)

['1', '11', '12', '13', '14', '2', '3', '4', '5', '50', '5b', '6', '7']

In [14]:
sorted(vtags)

['1',
 '11',
 '12',
 '13',
 '14',
 '2',
 '3',
 '4',
 '5',
 '50',
 '5b',
 '6',
 '7',
 'Empty']

In [15]:
print(len(cr_vtags))
sorted(cr_vtags)

87


['Causer:1->Result:11',
 'Causer:1->Result:13',
 'Causer:1->Result:14',
 'Causer:1->Result:2',
 'Causer:1->Result:3',
 'Causer:1->Result:4',
 'Causer:1->Result:5',
 'Causer:1->Result:50',
 'Causer:1->Result:6',
 'Causer:1->Result:7',
 'Causer:11->Result:11',
 'Causer:11->Result:12',
 'Causer:11->Result:13',
 'Causer:11->Result:14',
 'Causer:11->Result:3',
 'Causer:11->Result:4',
 'Causer:11->Result:50',
 'Causer:11->Result:6',
 'Causer:12->Result:11',
 'Causer:12->Result:13',
 'Causer:12->Result:14',
 'Causer:12->Result:50',
 'Causer:12->Result:5b',
 'Causer:12->Result:7',
 'Causer:13->Result:11',
 'Causer:13->Result:12',
 'Causer:13->Result:14',
 'Causer:13->Result:4',
 'Causer:13->Result:5',
 'Causer:13->Result:50',
 'Causer:13->Result:6',
 'Causer:13->Result:7',
 'Causer:14->Result:50',
 'Causer:14->Result:6',
 'Causer:14->Result:7',
 'Causer:2->Result:1',
 'Causer:2->Result:3',
 'Causer:2->Result:50',
 'Causer:2->Result:6',
 'Causer:3->Result:1',
 'Causer:3->Result:13',
 'Causer:3-

# Transform Essays into Training Data (Word Ids)

* Computes `xs`, `ys`, `ys_bytag` and `seq_lens`
* `ys_bytag` includes **all tags** and does **not** focus only on the most common tag
* `ys` only includes the most common tag (so we can use cross entropy)
* `seq_lens` is without the start and end tags included (so we have to map back and forth to maintain mappings)
* `ys_bytag` also excludes the START and END tokens

## Get Max Sequence Length, Generate All Ids

In [16]:
ix2tag = {}
for ix, t in enumerate(vtags):
    ix2tag[ix] = t

ix2crtag = {}
for ix, t in enumerate(cr_vtags):
    ix2crtag[ix] = t
    
generator = idGen(seed=1) # important as we zero pad sequences

maxlen = 0
for essay in tagged_essays:
    for sentence in essay.sentences:
        for word, tags in sentence:
            id = generator.get_id(word) #starts at 0, but 0 used to pad sequences
        maxlen = max(maxlen, len(sentence) + 2)
        
for essay in tagged_essays_test:
    for sentence in essay.sentences:
        for word, tags in sentence:
            id = generator.get_id(word) #starts at 0, but 0 used to pad sequences
        maxlen = max(maxlen, len(sentence) + 2)

def ids2tags(ids):
    return [generator.get_key(j) for j in ids]

def lbls2tags(ixs):
    return [ix2tag[ix] for ix in ixs]
        
maxlen

141

In [17]:
START = "<start>"
END   = "<end>"

def get_training_data(tessays):
    # outputs
    xs = []
    ys = []
    ys_bytag_concept_sent = defaultdict(list)
    ys_bytag_cr_sent = defaultdict(list)
    seq_lens = []

    # cut texts after this number of words (among top max_features most common words)
    for essay in tessays:
        for sentence in essay.sentences:
            row = []
            y_found = False
            y_seq = []
            unique_tags = set() # get all unique tags in sentence
            for word, tags in [(START, set())] + sentence + [(END, set())]:
                id = generator.get_id(word) #starts at 0, but 0 used to pad sequences
                row.append(id)
                                
                # Make sure to include Causer:<num> and Result:<num> tags, as we do for the parser model
                missing_tags = [t.replace("Causer:","").replace("Result:","") 
                                    for t in tags
                                ]
                # Filter to just concept codes that were missing
                missing_tags = set([t for t in missing_tags if t[0].isdigit() and "->" not in t])
                new_tags = tags.union(missing_tags)
                #if missing_tags:
                #    diff = missing_tags - tags
                #    if diff:
                #        print(diff, tags)

                tags = new_tags
                
                unique_tags.update(tags)
                               
                # remove unwanted tags, filter to concept tags
                concept_tags = vtags.intersection(tags)

                # encode ys with most common tag only
                if len(concept_tags) > 1:
                    most_common = max(concept_tags, key=lambda t: tag_freq[t])
                    concept_tags = set([most_common])
                if len(concept_tags) == 0:
                    concept_tags.add(EMPTY_TAG)

                one_hot = []
                for t in vtags:
                    if t in concept_tags:
                        one_hot.append(1)
                    else:
                        one_hot.append(0)
                y_seq.append(one_hot)
                #end for each word
            
            # sentence level tags
            for tag in vtags:
                if tag in unique_tags:
                    ys_bytag_concept_sent[tag].append(1)
                else:
                    ys_bytag_concept_sent[tag].append(0)
            
            for tag in cr_vtags:
                if tag in unique_tags:
                    ys_bytag_cr_sent[tag].append(1)
                else:
                    ys_bytag_cr_sent[tag].append(0)
                
            seq_lens.append(len(row)-2)
            ys.append(y_seq)
            xs.append(row)
    
    xs = sequence.pad_sequences(xs, maxlen=maxlen)
    ys = sequence.pad_sequences(ys, maxlen=maxlen)
    assert xs.shape[0] == ys.shape[0], "Sequences should have the same number of rows"
    assert xs.shape[1] == ys.shape[1] == maxlen, "Sequences should have the same lengths"
    return xs, ys, ys_bytag_concept_sent, ys_bytag_cr_sent, seq_lens

## Create Train - Test Split

In [18]:
#Helper Functions
def collapse_results(seq_lens, preds):
    assert len(seq_lens) == preds.shape[0], "Axis 1 size does not align"
    pred_ys_by_tag = defaultdict(list)
    for i in range(len(seq_lens)):
        row_ixs = preds[i,:]
        len_of_sequence = seq_lens[i] + 2
        # sequences are padded from the left, take the preds from the end of the seq
        pred_ys = [ix2tag[j] for j in row_ixs[-len_of_sequence:]]
        # skip the start and end label
        pred_ys = pred_ys[1:-1]
        for pred_tag in pred_ys:
            pred_ys_by_tag[pred_tag].append(1)
            # for all other tags, a 0
            for tag in(vtags - set([EMPTY_TAG, pred_tag])):
                pred_ys_by_tag[tag].append(0)
        if EMPTY_TAG in pred_ys_by_tag:
            del pred_ys_by_tag[EMPTY_TAG]
    return pred_ys_by_tag

In [19]:
#Helper Functions
def collapse_results_sentence_level(seq_lens, preds):
    assert len(seq_lens) == preds.shape[0], "Axis 1 size does not align"
    pred_ys_by_tag = defaultdict(list)
    for i in range(len(seq_lens)):
        row_ixs = preds[i,:]
        len_of_sequence = seq_lens[i] + 2
        # sequences are padded from the left, take the preds from the end of the seq
        pred_ys = [ix2tag[j] for j in row_ixs[-len_of_sequence:]]
        # skip the start and end label
        pred_ys = set(pred_ys[1:-1])
        for tag in vtags:
            if tag == EMPTY_TAG:
                continue
            if tag in pred_ys:
                pred_ys_by_tag[tag].append(1)
            else:
                pred_ys_by_tag[tag].append(0)
        if EMPTY_TAG in pred_ys_by_tag:
            del pred_ys_by_tag[EMPTY_TAG]
    return pred_ys_by_tag

In [20]:
def train_dev_split(lst, dev_split, randomize=True):
    # random shuffle
    if randomize:
        shuffle(lst)
    num_training = int((1.0 - dev_split) * len(lst))
    return lst[:num_training], lst[num_training:]

In [21]:
%%time
# use this name for a different function later
from CrossValidation import cross_validation as cv

folds = cv(tagged_essays, CV_FOLDS)
fold2training_data = {}
fold2dev_data = {}
fold2test_data = {}

for i, (essays_TD, essays_VD) in enumerate(folds):
    # further split into train and dev test
    essays_train, essays_dev = train_dev_split(essays_TD, DEV_SPLIT)
    fold2training_data[i] = get_training_data(essays_train)
    fold2dev_data[i]      = get_training_data(essays_dev)
    # Test Data
    fold2test_data[i]     = get_training_data(essays_VD)

CPU times: user 6.82 s, sys: 579 ms, total: 7.4 s
Wall time: 7.5 s


In [22]:
with open(cv_folder + "td.dill", "wb") as f:
    dill.dump(fold2training_data, f)

with open(cv_folder + "devd.dill", "wb") as f:
    dill.dump(fold2dev_data, f)

with open(cv_folder + "vd.dill", "wb") as f:
    dill.dump(fold2test_data, f)

In [23]:
# Make sure the generator is incremented on the test data too
_,_,_,_,_, = get_training_data(tagged_essays_test)

## Load Glove 100 Dim Embeddings

In [24]:
# see /Users/simon.hughes/GitHub/NlpResearch/PythonNlpResearch/DeepLearning/WordVectors/pickle_glove_embedding.py
# for creating pre-filtered embeddings file
import pickle, os
from numpy.linalg import norm

embeddings_file = "/Users/simon.hughes/data/word_embeddings/glove.6B/cb_dict_glove.6B.100d.txt"
# read data file
with open(embeddings_file, "rb+") as f:
    cb_emb_index = pickle.load(f)

In [25]:
missed = set()
for wd in unique_words:
    if wd not in cb_emb_index:
        missed.add(wd)
print(len(missed), len(unique_words), 100.0 * round(len(missed)/  len(unique_words),4), "%")

42 1677 2.5 %


### Construct Embedding Matrix

In [26]:
EMBEDDING_DIM = list(cb_emb_index.values())[0].shape[0]

def get_embedding_matrix(words, idgenerator, max_features, init='uniform', unit_length=False):
    embedding_dim = list(cb_emb_index.values())[0].shape[0]
    # initialize with a uniform distribution
    if init == 'uniform':
        # NOTE: the max norms for these is quite low relative to the embeddings
        embedding_matrix = np.random.uniform(low=-0.05, high=0.05,size=(max_features, embedding_dim))
    elif init =='zeros':
        embedding_matrix = np.zeros(shape=(max_features, embedding_dim), dtype=np.float32)
    elif init == 'normal':
        embedding_matrix = np.random.normal(mean, sd, size=(max_features, embedding_dim))
    else:
        raise Exception("Unknown init type")
    for word in words:
        i = idgenerator.get_id(word)
        embedding_vector = cb_emb_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector
    if unit_length:
        norms = np.linalg.norm(embedding_matrix, axis=1,keepdims=True)
        # remove 0 norms to prevent divide by zero
        norms[norms == 0.0] = 1.0
        embedding_matrix = embedding_matrix / norms
    return embedding_matrix

In [27]:
def score_predictions(model, xs, ys_by_tag, seq_len):
    preds = model.predict_classes(xs, batch_size=batch_size, verbose=0)   
    pred_ys_by_tag = collapse_results_sentence_level(seq_len, preds)
    class2metrics = ResultsProcessor.compute_metrics(ys_by_tag, pred_ys_by_tag)
    micro_metrics = micro_rpfa(class2metrics.values())
    return micro_metrics, pred_ys_by_tag

In [28]:
def pivot_predictions_to_dict(preds):
    pred_ys_by_tag = defaultdict(list)
    for i in range(preds.shape[0]):
        row = preds[i]
        for tag_ix, pred in enumerate(row):
            tag = ix2crtag[tag_ix]
            pred_ys_by_tag[tag].append(pred)
    return pred_ys_by_tag

In [29]:
def score_predictions_sent_level(model, xs, ys_by_tag, seq_len):
    preds = model.predict(xs, batch_size=batch_size, verbose=0)
    preds = np.where(preds >= 0.5, 1, 0)
    pred_ys_by_tag = pivot_predictions_to_dict(preds)
    class2metrics = ResultsProcessor.compute_metrics(ys_by_tag, pred_ys_by_tag)
    micro_metrics = micro_rpfa(class2metrics.values())
    return micro_metrics, pred_ys_by_tag

In [30]:
from keras.layers import Bidirectional
from datetime import datetime

def get_ts():
    return datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')

def get_file_ts():
    return datetime.now().strftime('%Y%m%d_%H%M%S_%f')

embedding_size = EMBEDDING_DIM
hidden_size    = 128
out_size = len(vtags)
batch_size = 128
merge_mode = "sum"

get_ts(), get_file_ts()

('2019-04-17 17:13:26.777555', '20190417_171326_777581')

## Train Bi-Directional LSTM With Glove Embeddings

In [31]:
max_features=len(generator.get_ids())+2 #Need plus one maybe due to masking of sequences

In [32]:
def get_file_name(fold_ix, use_pretrained_embedding, bi_directional, num_rnns, merge_mode, hidden_size):
    lcls = locals()
    s = ""
    for k, val in sorted(lcls.items(), key = lambda tpl: (0,tpl[0]) if tpl[0] == 'fold_ix' else (1,tpl[0])):
        s += "{key}-{val}_".format(key=k, val=str(val))
    return models_folder + s[:-1] + ".h5"

get_file_name(0, True, True, 2, merge_mode, hidden_size)

'/Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/Thesis_Dataset/Models/Bi-LSTM_Stacked/fold_ix-0_bi_directional-True_hidden_size-128_merge_mode-sum_num_rnns-2_use_pretrained_embedding-True.h5'

In [33]:
# merge_mode is Bi-Directional only
def evaluate_fold(fold_ix, use_pretrained_embedding, bi_directional, num_rnns, merge_mode, hidden_size):

    if use_pretrained_embedding:
        embedding_matrix = get_embedding_matrix(unique_words, generator, max_features, init='uniform', unit_length=False)
        embedding_layer = Embedding(max_features,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=maxlen,
                                trainable=True,
                                mask_zero=True) # If false, initialize unfound words with all 0's
    else:
        embedding_layer = Embedding(max_features, embedding_size, input_length=maxlen, trainable=True, mask_zero=True)

    if bi_directional:
        rnn_layer_fact = lambda : Bidirectional(GRU(hidden_size, return_sequences=True, consume_less="cpu"), merge_mode=merge_mode)
    else:
        rnn_layer_fact = lambda : GRU(hidden_size, return_sequences=True, consume_less="cpu")
        
    model = Sequential()
    model.add(embedding_layer)
    for i in range(num_rnns):
        model.add(rnn_layer_fact())

    model.add(TimeDistributedDense(out_size))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', sample_weight_mode="temporal")
    
    X_train, y_train, train_ys_bytag_con_sent, train_ys_by_tag_cr_sent, seq_len_train = fold2training_data[fold_ix]
    X_dev,   y_dev,   dev_ys_bytag_con_sent,   dev_ys_by_tag_cr_sent,   seq_len_dev   = fold2dev_data[fold_ix]
    X_test,  y_test,  test_ys_bytag_con_sent,  test_ys_by_tag_cr_sent,  seq_len_test  = fold2test_data[fold_ix]

    # init loop vars
    f1_scores = [-1]
    num_since_best_score = 0
    patience = 3
    best_weights = None

    for i in range(30):
    #for i in range(2):
        #print("{ts}: Epoch={epoch}".format(ts=get_ts(), epoch=i))
        epochs = 1 # epochs per training instance
        results = model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=epochs, validation_split=0.0, verbose=0)
        micro_metrics,_ = score_predictions(model, X_dev, dev_ys_bytag_con_sent, seq_len_dev)

        #print(micro_metrics)
        #print()

        f1_score = micro_metrics.f1_score
        best_f1_score = max(f1_scores)
        if f1_score <= best_f1_score:
            num_since_best_score += 1
        else: # score improved
            num_since_best_score = 0
            best_weights = model.get_weights()

        f1_scores.append(f1_score)
        if num_since_best_score >= patience:
            #print("Too long since an improvement, stopping")
            break
    
    print("Fold[{ix}] - Best F1 Score={f1}".format(ix=fold_ix, f1=best_f1_score))
    
    # load best weights
    model.set_weights(best_weights)
    return model

## Hyper Param Tuning

In [34]:
def cross_validation(use_pretrained_embedding, bi_directional, num_rnns, merge_mode, hidden_size):
    fold2model = {}
    for i in range(CV_FOLDS):
        model = evaluate_fold(i, use_pretrained_embedding, bi_directional, num_rnns, merge_mode, hidden_size)
        fname = get_file_name(i, use_pretrained_embedding, bi_directional, num_rnns, merge_mode, hidden_size)
        model.save(fname)    
        fold2model[i] = model
    return fold2model

#### Do a quick run to validate it is working

### Proper Run

In [35]:
import warnings
warnings.filterwarnings("ignore")

i = 0
for use_pretrained_embedding in [True]:
    for bi_directional in [True]:
        for num_rnns in [2]:
            for merge_mode in ["sum"]:
                for hidden_size in [256]:

                    i += 1
                    print("[{i}] Params {ts} - Embeddings={use_pretrained_embedding}, Bi-Direct={bi_directional} Num_Rnns={num_rnns} Hidden_Size={hidden_size}"\
                          .format(i=i, ts=get_ts(), use_pretrained_embedding=use_pretrained_embedding, bi_directional=bi_directional, num_rnns=num_rnns, hidden_size=hidden_size))
                    fold2model = cross_validation(use_pretrained_embedding, bi_directional, num_rnns, merge_mode, hidden_size)
                    #print("MicroF1={micro_f1}".format(micro_f1=micro_f1))
                    print(get_ts())

[1] Params 2019-04-17 17:13:26.987480 - Embeddings=True, Bi-Direct=True Num_Rnns=2 Hidden_Size=256
Instructions for updating:
keep_dims is deprecated, use keepdims instead


Instructions for updating:
keep_dims is deprecated, use keepdims instead


Instructions for updating:
keep_dims is deprecated, use keepdims instead


Instructions for updating:
keep_dims is deprecated, use keepdims instead


Instructions for updating:
keep_dims is deprecated, use keepdims instead


Instructions for updating:
keep_dims is deprecated, use keepdims instead


Fold[0] - Best F1 Score=0.8968609865470851
Fold[1] - Best F1 Score=0.9035262807717898
Fold[2] - Best F1 Score=0.9092188599577763
Fold[3] - Best F1 Score=0.8945783132530121
Fold[4] - Best F1 Score=0.8967874231032127
2019-04-17 20:03:35.225293


In [43]:
def load_model(fold_ix, use_pretrained_embedding, bi_directional, num_rnns, merge_mode, hidden_size):
    fname = get_file_name(fold_ix, use_pretrained_embedding, bi_directional, num_rnns, merge_mode, hidden_size)
    return keras.models.load_model(fname)

def load_models(use_pretrained_embedding, bi_directional, num_rnns, merge_mode, hidden_size):
    models_by_fold = {}
    for i in range(CV_FOLDS):
        model = load_model(i, use_pretrained_embedding, bi_directional, num_rnns, merge_mode, hidden_size)
        models_by_fold[i] = model
    return models_by_fold

In [37]:
predicts_by_fold = {}
for fold_ix in range(CV_FOLDS):
    X_test,  y_test,  test_ys_bytag_con_sent,  test_ys_by_tag_cr_sent,  seq_len_test  = fold2test_data[fold_ix]
    model = fold2model[fold_ix]
    probs = model.predict_proba(X_test)
    predicts_by_fold[fold_ix] = probs



In [38]:
len(predicts_by_fold)

5

## Get Test Data Predictions

In [39]:
# merge_mode is Bi-Directional only
def evaluate_test(num_rnns, merge_mode, hidden_size):
    embedding_matrix = get_embedding_matrix(unique_words, generator, max_features, init='uniform', unit_length=False)
    embedding_layer = Embedding(max_features,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=maxlen,
                            trainable=True,
                            mask_zero=True) # If false, initialize unfound words with all 0's
    rnn_layer_fact = lambda : Bidirectional(GRU(hidden_size, return_sequences=True, consume_less="cpu"), merge_mode=merge_mode)
        
    model = Sequential()
    model.add(embedding_layer)
    for i in range(num_rnns):
        model.add(rnn_layer_fact())

    model.add(TimeDistributedDense(out_size))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', sample_weight_mode="temporal")
    
    essays_train, essays_dev = train_dev_split(tagged_essays, DEV_SPLIT)
    X_train, y_train,  train_ys_bytag_con_sent,  train_ys_by_tag_cr_sent,  seq_len_train = get_training_data(essays_train)
    X_dev,   y_dev,    dev_ys_bytag_con_sent,    dev_ys_by_tag_cr_sent,    seq_len_dev   = get_training_data(essays_dev)

    # init loop vars
    f1_scores = [-1]
    num_since_best_score = 0
    patience = 3
    best_weights = None

    for i in range(30):
        print("{ts}: Epoch={epoch}".format(ts=get_ts(), epoch=i))
        results = model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=1, validation_split=0.0, verbose=0)
        micro_metrics,_ = score_predictions(model, X_dev, dev_ys_bytag_con_sent, seq_len_dev)

        print(micro_metrics)
        print()

        f1_score = micro_metrics.f1_score
        best_f1_score = max(f1_scores)
        if f1_score <= best_f1_score:
            num_since_best_score += 1
        else: # score improved
            num_since_best_score = 0
            best_weights = model.get_weights()

        f1_scores.append(f1_score)
        if num_since_best_score >= patience:
            #print("Too long since an improvement, stopping")
            break
    
    print("Fold[{ix}] - Best F1 Score={f1}".format(ix=fold_ix, f1=best_f1_score))
    
    # load best weights
    model.set_weights(best_weights)
    return model

In [40]:
test_model = evaluate_test(2, "sum", 256)

2019-04-17 20:05:17.365457: Epoch=0
Recall: 0.8711, Precision: 0.8562, F1: 0.8636, Accuracy: 0.9787, Codes:   923

2019-04-17 20:09:26.157137: Epoch=1
Recall: 0.9079, Precision: 0.8877, F1: 0.8977, Accuracy: 0.9840, Codes:   923

2019-04-17 20:13:35.303918: Epoch=2
Recall: 0.8992, Precision: 0.8954, F1: 0.8973, Accuracy: 0.9840, Codes:   923

2019-04-17 20:17:29.421119: Epoch=3
Recall: 0.8971, Precision: 0.8865, F1: 0.8918, Accuracy: 0.9831, Codes:   923

2019-04-17 20:21:23.326997: Epoch=4
Recall: 0.9068, Precision: 0.8961, F1: 0.9015, Accuracy: 0.9846, Codes:   923

2019-04-17 20:25:16.936736: Epoch=5
Recall: 0.8960, Precision: 0.9148, F1: 0.9053, Accuracy: 0.9855, Codes:   923

2019-04-17 20:29:05.306674: Epoch=6
Recall: 0.9328, Precision: 0.8706, F1: 0.9006, Accuracy: 0.9840, Codes:   923

2019-04-17 20:32:59.957264: Epoch=7
Recall: 0.9036, Precision: 0.9055, F1: 0.9046, Accuracy: 0.9852, Codes:   923

2019-04-17 20:36:58.162676: Epoch=8
Recall: 0.9177, Precision: 0.8906, F1: 0.903

In [41]:
X_test,  y_test,   test_ys_bytag_con_sent,   test_ys_by_tag_cr_sent,   seq_len_test = get_training_data(tagged_essays_test)
test_probs = test_model.predict_proba(X_test)



In [42]:
test_probs.shape, len(test_ys_by_tag_cr_sent['Causer:4->Result:3'])

((1918, 141, 14), 1918)

# Stacked Model

## Generate Train and Test Data For Each Fold

In [44]:
def get_stacked_feat_from_probs(probs, max_feats, min_feats, average_feats, binary_feats, combo_feats):
    xs = []
    for i in range(len(probs)):
        preds = probs[i,:]
        max_preds = np.max(preds, axis=0)
        min_preds = np.max(preds, axis=0)
        mean_preds = np.mean(preds, axis=0)
        
        predicted_ixs = set(np.argwhere(max_preds > 0.5).flatten())
        binary = [0] * len(max_preds)
        for ix in predicted_ixs:
            binary[ix] = 1
        
        x = []
        if binary_feats:
            x += binary
        if max_feats:
            x += max_preds.tolist()
        if min_feats:            
            x += min_preds.tolist()
        if average_feats:
            x += mean_preds.tolist()

        # combination tags
        if combo_feats:
            ixs = ix2tag.keys()
            for a in ixs:
                for b in ixs:
                    if b < a:
                        if a in predicted_ixs and b in predicted_ixs:
                            x.append(1)
                        else:
                            x.append(0)
        xs.append(x)
    return np.asarray(xs)

def get_stacked_feats_by_fold(fold_ix, predicts_by_fold, max_feats, min_feats, average_feats, binary_feats, combo_feats):
    probs = predicts_by_fold[fold_ix]
    xs = get_stacked_feat_from_probs(probs, max_feats, min_feats, average_feats, binary_feats, combo_feats)
    return xs

### Loop Thru Each Fold, Merge the Xs and Ys from the Other Folds as TD, and then Use Fold as VD

In [45]:
from wordtagginghelper import merge_dictionaries

def generate_stacked_features(max_feats, min_feats, average_feats, binary_feats, combo_feats):
    stacked_feats_by_code = {}
    for fold_ix in range(CV_FOLDS):
        stacked_feats_by_code[fold_ix] = get_stacked_feats_by_fold(fold_ix, predicts_by_fold, max_feats, min_feats, average_feats, binary_feats, combo_feats)
    
    td_xs_by_fold = {}
    vd_xs_by_fold = {}

    td_ys_by_fold = {}
    vd_ys_by_fold = {}
    for vd_ix in range(CV_FOLDS):
        td = []

        td_ys = defaultdict(list)
        vd_ys = defaultdict(list)
        for td_ix in range(CV_FOLDS):
            if td_ix == vd_ix:
                continue
            xs = stacked_feats_by_code[td_ix]
            td.append(xs)

            _, _, _, td_ys_by_tag_cr_sent, _ = fold2test_data[td_ix]
            merge_dictionaries(td_ys_by_tag_cr_sent, td_ys)

        vd_xs_by_fold[vd_ix] = stacked_feats_by_code[vd_ix]
        td_xs_by_fold[vd_ix] = np.vstack(td)

        del td_ys[EMPTY_TAG]
        td_ys_by_fold[vd_ix] = td_ys

        _, _, _, vd_ys_by_tag_cr_sent, _ = fold2test_data[vd_ix]
        # make a copy (so can delete EMPTY tag)
        merge_dictionaries(vd_ys_by_tag_cr_sent, vd_ys)
        
        del vd_ys[EMPTY_TAG]
        assert EMPTY_TAG not in td_ys
        assert EMPTY_TAG not in vd_ys
        
        vd_ys_by_fold[vd_ix] = vd_ys
    return td_xs_by_fold, td_ys_by_fold, vd_xs_by_fold, vd_ys_by_fold

# Train Stacked Classifier

In [47]:
from wordtagginghelper import train_classifier_per_code, test_classifier_per_code
from sklearn.linear_model import LogisticRegression

def train_stacked_classifier(dual, penalty, C, max_feats, min_feats, average_feats, binary_feats, combo_feats):
    # capture param values
    fn_args  = dict(locals())
    
    td_xs_by_fold, td_ys_by_fold, vd_xs_by_fold, vd_ys_by_fold = generate_stacked_features(max_feats, min_feats, average_feats, binary_feats, combo_feats)
    
    fn_create_sent_cls  = lambda : LogisticRegression(dual=dual, C=C, penalty=penalty)

    cv_wd_td_ys_by_tag, cv_wd_td_predictions_by_tag = defaultdict(list), defaultdict(list)
    cv_wd_vd_ys_by_tag, cv_wd_vd_predictions_by_tag = defaultdict(list), defaultdict(list)

    cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag = defaultdict(list), defaultdict(list)
    cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag = defaultdict(list), defaultdict(list)

    num_feats = []
    for i in range(CV_FOLDS):
        sent_td_xs = td_xs_by_fold[i]
        sent_vd_xs = vd_xs_by_fold[i]
        
        num_feats.append(sent_vd_xs.shape[1])

        sent_td_ys_bycode = td_ys_by_fold[i]
        sent_vd_ys_bycode = vd_ys_by_fold[i]

        tags = sent_td_ys_bycode.keys()

        tag2sent_classifier = train_classifier_per_code(sent_td_xs, sent_td_ys_bycode , fn_create_sent_cls, tags, verbose=False)
        td_sent_predictions_by_code \
            = test_classifier_per_code(sent_td_xs, tag2sent_classifier, tags )

        vd_sent_predictions_by_code \
            = test_classifier_per_code(sent_vd_xs, tag2sent_classifier, tags )

        merge_dictionaries(sent_td_ys_bycode, cv_sent_td_ys_by_tag)
        merge_dictionaries(sent_vd_ys_bycode, cv_sent_vd_ys_by_tag)
        merge_dictionaries(td_sent_predictions_by_code, cv_sent_td_predictions_by_tag)
        merge_dictionaries(vd_sent_predictions_by_code, cv_sent_vd_predictions_by_tag)

    sent_algo = str(fn_create_sent_cls())

    CB_SENT_TD, CB_SENT_VD = "CR_CB_STACKED_TD_CNT_FEATS", "CR_CB_STACKED_VD_CNT_FEATS"
    parameters = dict(config)
    parameters["extractors"] = []
    parameters["num_feats_MEAN"] = np.mean(num_feats)
    # merge in function args
    parameters.update(fn_args)

    sent_td_objectid = processor.persist_results(CB_SENT_TD, cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag, parameters, sent_algo)
    sent_vd_objectid = processor.persist_results(CB_SENT_VD, cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag, parameters, sent_algo)
    
    avg_f1 = float(processor.get_metric(CB_SENT_VD, sent_vd_objectid, __MICRO_F1__)["f1_score"])
    return avg_f1

## Determine Optimal Stacked Features

In [90]:
counter = 0
for max_feats in [True, False]:
    for min_feats in [True, False]:
        for average_feats in [True, False]:
            for binary_feats in [True,False]:
                for combo_feats in [True,False]:
                    if not any([max_feats, min_feats, average_feats, binary_feats, combo_feats]):
                        continue
                        
                    counter +=1
                    micro_f1 = train_stacked_classifier(dual=True, penalty='l2', C=1.0, \
                                                      max_feats=max_feats, min_feats=min_feats, average_feats=average_feats,\
                                                      binary_feats=binary_feats, combo_feats=combo_feats)
                    print("{ts} {counter} MICRO: F1: {f1:.6f} Max:{max} Min:{min} Avg:{average} Binary:{binary} Combo:{combo}".format(\
                        ts=get_ts(), counter=counter, f1=micro_f1, max=max_feats, min=min_feats, \
                        average=average_feats, binary=binary_feats, combo=combo_feats))

2017-12-03 21:52:10.308026 1 MICRO: F1: 0.692014 Max:True Min:True Avg:True Binary:True Combo:True
2017-12-03 21:52:28.890105 2 MICRO: F1: 0.687338 Max:True Min:True Avg:True Binary:True Combo:False
2017-12-03 21:52:48.725480 3 MICRO: F1: 0.690863 Max:True Min:True Avg:True Binary:False Combo:True
2017-12-03 21:53:02.934574 4 MICRO: F1: 0.675737 Max:True Min:True Avg:True Binary:False Combo:False
2017-12-03 21:53:21.054911 5 MICRO: F1: 0.691425 Max:True Min:True Avg:False Binary:True Combo:True
2017-12-03 21:53:34.447422 6 MICRO: F1: 0.684661 Max:True Min:True Avg:False Binary:True Combo:False
2017-12-03 21:53:49.842310 7 MICRO: F1: 0.690598 Max:True Min:True Avg:False Binary:False Combo:True
2017-12-03 21:54:00.833707 8 MICRO: F1: 0.674115 Max:True Min:True Avg:False Binary:False Combo:False
2017-12-03 21:54:17.367821 9 MICRO: F1: 0.691680 Max:True Min:False Avg:True Binary:True Combo:True
2017-12-03 21:54:29.332734 10 MICRO: F1: 0.684993 Max:True Min:False Avg:True Binary:True Combo:

In [92]:
# best 2017-12-03 21:58:04.438748 29 MICRO: F1: 0.693566 Max:False Min:False Avg:False Binary:True Combo:True

### Best Features are Binary and Combo

In [56]:
optimal_feats = 105
max_feats = 147

(max_feats-optimal_feats)/max_feats, (optimal_feats)/max_feats

(0.2857142857142857, 0.7142857142857143)

In [93]:
counter = 0
for dual in [True, False]:
    for penalty in ["l1", "l2"]:
        # dual only support l2
        if dual and penalty != "l2":
            continue
        for C in [0.1, 0.5, 1.0, 5.0, 10.0, 100.0]:
            counter += 1
            micro_f1 = train_stacked_classifier(dual=dual, penalty=penalty, C=C, \
                                                      max_feats=False, min_feats=False, average_feats=False,\
                                                      binary_feats=True, combo_feats=True)                
            print("%i MICRO: F1: %s dual: %s penalty: %s C:%s"
                   % (counter, str(round(micro_f1, 6)).rjust(8), str(dual), str(penalty), str(round(C, 3)).rjust(5)))

1 MICRO: F1: 0.621378 dual: True penalty: l2 C:  0.1
2 MICRO: F1:  0.68806 dual: True penalty: l2 C:  0.5
3 MICRO: F1: 0.693566 dual: True penalty: l2 C:  1.0
4 MICRO: F1: 0.690153 dual: True penalty: l2 C:  5.0
5 MICRO: F1: 0.688191 dual: True penalty: l2 C: 10.0
6 MICRO: F1: 0.647016 dual: True penalty: l2 C:100.0
7 MICRO: F1: 0.633465 dual: False penalty: l1 C:  0.1
8 MICRO: F1: 0.692996 dual: False penalty: l1 C:  0.5
9 MICRO: F1: 0.694611 dual: False penalty: l1 C:  1.0
10 MICRO: F1: 0.685934 dual: False penalty: l1 C:  5.0
11 MICRO: F1: 0.682689 dual: False penalty: l1 C: 10.0
12 MICRO: F1: 0.678819 dual: False penalty: l1 C:100.0
13 MICRO: F1: 0.621378 dual: False penalty: l2 C:  0.1
14 MICRO: F1:  0.68806 dual: False penalty: l2 C:  0.5
15 MICRO: F1: 0.693566 dual: False penalty: l2 C:  1.0
16 MICRO: F1: 0.690153 dual: False penalty: l2 C:  5.0
17 MICRO: F1:  0.68853 dual: False penalty: l2 C: 10.0
18 MICRO: F1: 0.681699 dual: False penalty: l2 C:100.0


In [94]:
# best - 9 MICRO: F1: 0.694611 dual: False penalty: l1 C:  1.0
# need to adjust code below

## Compute Test Metric Performance

In [95]:
td = []
td_ys = defaultdict(list)
for i in range(CV_FOLDS):
    tmp_xs = predicts_by_fold[i]
    td.append(tmp_xs)

    _, _, _, td_ys_by_tag_cr_sent, _ = fold2test_data[i]
    merge_dictionaries(td_ys_by_tag_cr_sent, td_ys)

xs = np.vstack(td)
# ensure the same number of rows
xs.shape, len(list(td_ys.values())[0])

((8292, 141, 14), 8292)

In [100]:
xs_feats_train = get_stacked_feat_from_probs(xs, max_feats=False, min_feats=False, average_feats=False, binary_feats=True, combo_feats=True)
xs_feats_train.shape

(8292, 105)

In [101]:
xs_feats_test = get_stacked_feat_from_probs(test_probs, max_feats=False, min_feats=False, average_feats=False, binary_feats=True, combo_feats=True)
xs_feats_test.shape

(1918, 105)

In [102]:
#optimal from earlier
fn_create_sent_cls = lambda: LogisticRegression(dual=False, C=1.0, penalty='l1')
tag2sent_classifier = train_classifier_per_code(xs_feats_train, td_ys , fn_create_sent_cls, cr_vtags, verbose=False)
train_sent_predictions_by_code \
    = test_classifier_per_code(xs_feats_train, tag2sent_classifier, cr_vtags )

test_sent_predictions_by_code \
    = test_classifier_per_code(xs_feats_test, tag2sent_classifier, cr_vtags )

In [103]:
CB_SENT_TD, CB_SENT_VD = "TEST_CR_CB_STACKED_TD", "TEST_CR_CB_STACKED_VD"
parameters = dict(config)
parameters["extractors"] = []
# merge in function args
sent_algo = "stacked"
parameters.update({'dual': True, 'C':0.5, 'penalty':'l2', 
                   'max_feats': True, 'min_feats': False, 'average_feats': True, 
                   'binary_feats': False, 'combo_feats': True})

sent_td_objectid = processor.persist_results(CB_SENT_TD, td_ys,                  train_sent_predictions_by_code, parameters, sent_algo)
sent_vd_objectid = processor.persist_results(CB_SENT_VD, test_ys_by_tag_cr_sent, test_sent_predictions_by_code,  parameters, sent_algo)

In [None]:
raise Exception("Stop here, don't run rest of notebook")

## Now Train an RNN on Sequential Predictions

In [113]:
len(cr_vtags)

83

In [540]:
def get_xs_for_fold(fold_ix):
    tmp_xs = predicts_by_fold[fold_ix]
    _,_,_,_, seq_len_test  = fold2test_data[fold_ix]

    xs = []
    for i in range(len(seq_len_test)):
        x = tmp_xs[i].copy()
        seq_len = seq_len_test[i]
        #mask zeros
        x[:-seq_len,:] = 0.0
        xs.append(x)
    return np.asarray(xs)

def get_seq_lens_for_fold(fold_ix):
    _,_,_,_, seq_len_test  = fold2test_data[fold_ix]
    return seq_len_test

def get_ys_for_fold(fold_ix):
    _, _, _, td_ys_by_tag_cr_sent, _ = fold2test_data[fold_ix]
    num_classes = len(td_ys_by_tag_cr_sent.keys())
    n_rows = len(td_ys_by_tag_cr_sent[list(td_ys_by_tag_cr_sent.keys())[0]])
    ys = []
    for i in range(n_rows):
        y = np.zeros((num_classes))
        for tag_ix in range(num_classes):            
            tag = ix2crtag[tag_ix]
            lbl = td_ys_by_tag_cr_sent[tag][i]
            y[tag_ix] = lbl
        ys.append(y)
    return np.asarray(ys)

def get_ys_for_fold(fold_ix):
    _, _, _, td_ys_by_tag_cr_sent, _ = fold2test_data[fold_ix]
    num_classes = len(td_ys_by_tag_cr_sent.keys())
    n_rows = len(td_ys_by_tag_cr_sent[list(td_ys_by_tag_cr_sent.keys())[0]])
    ys = []
    for i in range(n_rows):
        y = np.zeros((num_classes))
        for tag_ix in range(num_classes):            
            tag = ix2crtag[tag_ix]
            lbl = td_ys_by_tag_cr_sent[tag][i]
            y[tag_ix] = lbl
        ys.append(y)
    return np.asarray(ys)

def split_dict(ys_by_tag, split):
    n_rows = len(ys_by_tag[list(ys_by_tag.keys())[0]])
    num_training = int((1.0 - split) * n_rows)
    
    a, b = defaultdict(list), defaultdict(list)
    for tag, lst in ys_by_tag.items():
        a[tag], b[tag] = lst[:num_training], lst[num_training:]
    return a,b

In [541]:
for i in range(CV_FOLDS):
    xs = get_xs_for_fold(0)
    ys = get_ys_for_fold(0)
    assert ys.shape[0] == len(xs)
    # + 1 because of the empty tag
    assert ys.shape[1] == len(cr_tags) + 1, "%i,%i" % (ys.shape[1] , len(cr_tags))

In [542]:
def train_dev_split_array(a, dev_split):
    num_training = int((1.0 - dev_split) * a.shape[0])
    return a[:num_training], a[num_training:]

In [543]:
td_xs_by_fold, td_ys_by_fold, vd_xs_by_fold, vd_ys_by_fold = generate_stacked_features(False, False, False, True, True)

In [544]:
seq_td_xs_by_fold = {}
seq_dev_xs_by_fold = {}
seq_vd_xs_by_fold = {}

seq_td_ys_by_fold = {}
seq_dev_ys_by_fold = {}
seq_vd_ys_by_fold = {}

seq_len_td_by_fold = {}
seq_len_dev_by_fold = {}
seq_len_vd_by_fold = {}

td_ys_by_tag_fold = {}
dev_ys_by_tag_fold = {}
vd_ys_by_tag_fold = {}

for vd_ix in range(CV_FOLDS):
    
    td_xs = []
    td_ys = []
    vd_ys = []
    td_seq_lens = []
    for td_ix in range(CV_FOLDS):
        if td_ix == vd_ix:
            continue
        
        xs = get_xs_for_fold(td_ix)
        td_xs.append(xs)
        
        ys = get_ys_for_fold(td_ix)
        td_ys.append(ys)
        
        seq_lens = get_seq_lens_for_fold(td_ix)
        td_seq_lens.extend(seq_lens)
        #TODO - keep EMPTY?

    td_xs = np.vstack(td_xs)
    td_ys = np.vstack(td_ys)
    
    seq_td_xs_by_fold[vd_ix], seq_dev_xs_by_fold[vd_ix] = train_dev_split_array(td_xs, DEV_SPLIT)
    seq_vd_xs_by_fold[vd_ix] = get_xs_for_fold(vd_ix)
    
    seq_td_ys_by_fold[vd_ix], seq_dev_ys_by_fold[vd_ix] = train_dev_split_array(td_ys, DEV_SPLIT)
    seq_vd_ys_by_fold[vd_ix] = get_ys_for_fold(vd_ix)
    
    td_ys_by_tag_fold[vd_ix], dev_ys_by_tag_fold[vd_ix] = split_dict(td_ys_by_fold[vd_ix], DEV_SPLIT)
    vd_ys_by_tag_fold[vd_ix] = vd_ys_by_fold[vd_ix]
    
    seq_len_td_by_fold[vd_ix], seq_len_dev_by_fold[vd_ix] = train_dev_split(td_seq_lens, DEV_SPLIT, randomize=False)
    seq_len_vd_by_fold[vd_ix] = get_seq_lens_for_fold(vd_ix)

In [545]:
# merge_mode is Bi-Directional only
def evaluate_stacked_fold(fold_ix, bi_directional, num_rnns, merge_mode, hidden_size):

    num_outputs = len(cr_vtags)    
    input_shape=(maxlen, len(vtags))
    
    model = Sequential()
    for i in range(num_rnns):
        return_sequences = i < (num_rnns-1)
        if i == 0:
            if bi_directional:
                rnn_layer = Bidirectional(GRU(hidden_size, return_sequences=return_sequences, consume_less="cpu"), input_shape=input_shape, merge_mode=merge_mode)
            else:
                rnn_layer = GRU(hidden_size, input_shape=input_shape, return_sequences=return_sequences, consume_less="cpu")
        else: # no need for input_size            
            if bi_directional:
                rnn_layer = Bidirectional(GRU(hidden_size, return_sequences=return_sequences, consume_less="cpu"), merge_mode=merge_mode)
            else:
                rnn_layer = GRU(hidden_size, return_sequences=return_sequences, consume_less="cpu")
        model.add(rnn_layer)

    model.add(Dense(num_outputs, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam')
    
    # get training data, split into train and development sets
    X_train, y_train = seq_td_xs_by_fold[fold_ix],  seq_td_ys_by_fold[fold_ix]    
    X_dev, y_dev     = seq_dev_xs_by_fold[fold_ix], seq_dev_ys_by_fold[fold_ix] 
    X_test,  y_test  = seq_vd_xs_by_fold[fold_ix],  seq_vd_ys_by_fold[fold_ix]

    #model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=3, validation_split=0.0, verbose=0)
    #return model
    
    # init loop vars
    f1_scores = [-1]
    num_since_best_score = 0
    patience = 10
    best_weights = None

    for i in range(30):
        print("{ts}: Epoch={epoch}".format(ts=get_ts(), epoch=i))
        epochs = 1 # epochs per training instance
        results = model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=epochs, validation_split=0.0, verbose=0)
        micro_metrics,_ = score_predictions_sent_level(model, X_dev, dev_ys_by_tag_fold[fold_ix], seq_len_dev_by_fold[fold_ix])

        print(micro_metrics)
        print()

        f1_score = micro_metrics.f1_score
        best_f1_score = max(f1_scores)
        if f1_score <= best_f1_score:
            num_since_best_score += 1
        else: # score improved
            num_since_best_score = 0
            best_weights = model.get_weights()

        f1_scores.append(f1_score)
        if num_since_best_score >= patience:
            #print("Too long since an improvement, stopping")
            break
    
    print("Fold[{ix}] - Best F1 Score={f1}".format(ix=fold_ix, f1=best_f1_score))
    
    # load best weights
    model.set_weights(best_weights)
    return model

In [None]:
# works with a very small number of hidden nodes
mdl = evaluate_stacked_fold(0, bi_directional=True, num_rnns=2, merge_mode="sum", hidden_size=8)