In [1]:
# %load_ext autoreload
# %autoreload 2

In [1]:
import os
import sys
# Root folder of the "Causal Model" notebooks; its "src" sub-folder is put on
# sys.path, presumably so the project modules imported in the next cell resolve.
# NOTE(review): absolute, user-specific path - adjust when running elsewhere.
cm_folder = "/Users/simon.hughes/GitHub/NlpResearch/PythonNlpResearch/notebooks/Causal Model/"
src_path = os.path.join(cm_folder, "src")
# Guarded so that re-running this cell does not stack duplicate sys.path entries.
if src_path not in sys.path:
    sys.path.append(src_path)

In [2]:
from typing import Any

import dill
import numpy as np

from CrossValidation import cross_validation
from MIRA import MIRA
from Settings import Settings

from window_based_tagger_config import get_config
from crel_helper import get_cr_tags
from crel_processing import essay_to_crels_cv
from evaluation import evaluate_model_essay_level, get_micro_metrics, metrics_to_df
from feature_normalization import min_max_normalize_feats
from function_helpers import get_function_names
from results_procesor import ResultsProcessor
from train_parser import essay_to_crels, create_extractor_functions
from cost_functions import micro_f1_cost_plusepsilon
from train_reranker import train_model, train_instance, get_essays_for_data, evaluate_ranker
from searn_parser_breadth_first import SearnModelBreadthFirst
from causal_model_features import CausalModelType
from feature_extraction import get_features_from_probabilities
from results_procesor import ResultsProcessor
from filter_features import filter_feats

from wordtagginghelper import merge_dictionaries
from results_procesor import ResultsProcessor, __MICRO_F1__
from evaluation import add_cr_labels

from random import shuffle
from joblib import Parallel, delayed
from collections import defaultdict

In [3]:
def train_model_fold(xs_train, xs_test, name2essay, C, pa_type, max_update_items, set_cr_tags,
                     initial_weight, max_epochs, early_stop_iters):
    """Train a fresh MIRA re-ranker on a single train/test fold.

    A new ``MIRA`` model is built from ``C``, ``pa_type``, ``max_update_items``
    and ``initial_weight`` and handed to ``train_model`` together with the fold
    data. Training runs non-verbosely, with ``return_metrics=True`` and with
    early stopping switched off.

    Returns:
        Whatever ``train_model`` returns for these settings; the callers in
        this notebook unpack it as a 7-tuple whose first item is the best
        test F1.
    """
    fold_model = MIRA(
        C=C,
        pa_type=pa_type,
        max_update_items=max_update_items,
        initial_weight=initial_weight,
    )

    training_options = dict(
        xs_train=xs_train,
        xs_test=xs_test,
        name2essay=name2essay,
        max_epochs=max_epochs,
        early_stop_iters=early_stop_iters,
        set_cr_tags=set_cr_tags,
        train_instance_fn=train_instance,
        verbose=False,
        return_metrics=True,
        early_stopping=False,
    )
    return train_model(fold_model, **training_options)

def train_model_parallel(cv_folds, name2essay, C, pa_type, max_update_items, set_cr_tags,
                         initial_weight, max_epochs=5, early_stop_iters=5, n_jobs=None):
    """Train one MIRA re-ranker per CV fold in parallel and average the test F1.

    Args:
        cv_folds: iterable of ``(train, test)`` pairs; must support ``len()``
            when ``n_jobs`` is None.
        name2essay, C, pa_type, max_update_items, set_cr_tags, initial_weight,
        max_epochs, early_stop_iters: forwarded unchanged to ``train_model_fold``.
        n_jobs: number of joblib workers; ``None`` means one worker per fold.

    Returns:
        The mean of the first element (best test F1) of each fold's result,
        or ``None`` if the run was interrupted from the keyboard.

    Raises:
        ValueError: if ``n_jobs`` is None and ``cv_folds`` is empty (joblib
            would reject ``n_jobs=0`` anyway, with a less helpful message).
    """
    if n_jobs is None:
        n_jobs = len(cv_folds)
        if n_jobs == 0:
            raise ValueError("cv_folds is empty - nothing to train on")

    try:
        results = Parallel(n_jobs=n_jobs)(
            delayed(train_model_fold)(train, test, name2essay, C, pa_type, max_update_items, set_cr_tags,
                                      initial_weight, max_epochs, early_stop_iters)
            for (train, test) in cv_folds)

        # Only the first item of each fold's result tuple (best test F1) is
        # needed here, so index it rather than unpacking every field.
        return np.mean([fold_result[0] for fold_result in results])

    except KeyboardInterrupt:
        # Best-effort behaviour kept from the original: report and return None.
        print("Process stopped by user")
        return None

def train_model_parallel_logged(training_collection_name: str, results_processor: ResultsProcessor,
                                feat_extractors, params,
                                cv_folds, name2essay,
                                C: float, pa_type: str, max_update_items:int, set_cr_tags,
                                initial_weight: float,  max_epochs=5, early_stop_iters=5, n_jobs=None):
    """Cross-validate a MIRA re-ranker in parallel and persist the results.

    Each ``(train, test)`` fold is trained via ``train_model_fold``. The
    per-tag labels and predictions of all folds are merged, then stored through
    ``results_processor`` for both the training collection and the matching
    validation collection (``_TD`` replaced by ``_VD`` in the name).

    Args:
        training_collection_name: name of the training results collection.
        results_processor: object providing ``persist_results`` and ``get_metric``.
        feat_extractors: iterable recorded under ``"extractors"`` in the
            persisted parameters.
        params: extra parameters merged over the ones built here; may be None.
        cv_folds: iterable of ``(train, test)`` pairs; must support ``len()``
            when ``n_jobs`` is falsy.
        name2essay, C, pa_type, max_update_items, set_cr_tags, initial_weight,
        max_epochs, early_stop_iters: forwarded to ``train_model_fold``.
            NOTE(review): ``pa_type`` is annotated ``str`` but this notebook
            passes the int ``1`` - confirm the intended type.
        n_jobs: number of joblib workers; None or 0 means one worker per fold.

    Returns:
        The validation micro-F1 read back from ``results_processor`` as a
        float, or ``None`` if the run was interrupted from the keyboard.

    Raises:
        ValueError: if ``n_jobs`` is falsy and ``cv_folds`` is empty.
    """
    # `not n_jobs` already covers None, so the extra `== None` test was redundant.
    if not n_jobs:
        n_jobs = len(cv_folds)
        if n_jobs == 0:
            raise ValueError("cv_folds is empty - nothing to train on")

    try:
        results = Parallel(n_jobs=n_jobs)(
            delayed(train_model_fold)(train, test, name2essay, C, pa_type, max_update_items, set_cr_tags,
                                      initial_weight, max_epochs, early_stop_iters)
            for (train, test) in cv_folds)

        # Accumulators across folds: td = training data, vd = validation data.
        cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag = defaultdict(list), defaultdict(list)
        cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag = defaultdict(list), defaultdict(list)

        feats = []
        for fold_result in results:
            (_best_test_f1, _best_iterations, train_ys_bytag, train_pred_ys_bytag,
             test_ys_bytag, test_pred_ys_bytag, num_feats) = fold_result
            feats.append(num_feats)

            # presumably merges the fold's per-tag lists into the accumulator
            # passed second - confirm against wordtagginghelper.merge_dictionaries
            merge_dictionaries(train_ys_bytag, cv_sent_td_ys_by_tag)
            merge_dictionaries(test_ys_bytag, cv_sent_vd_ys_by_tag)

            merge_dictionaries(train_pred_ys_bytag, cv_sent_td_predictions_by_tag)
            merge_dictionaries(test_pred_ys_bytag, cv_sent_vd_predictions_by_tag)

        ALGO = "MIRA Re-Ranker"
        validation_collection = training_collection_name.replace("_TD", "_VD")

        extractors = list(feat_extractors)

        parameters = {
            "C":                    C,
            "pa_type":              pa_type,
            "loss_type":            "None - cost insens",
            "max_update_items":     max_update_items,
            "initial_weight":       initial_weight,

            "max_epochs":           max_epochs,
            "early_stopping_iters": early_stop_iters,

            "extractors":           extractors,

            # Add in number of features
            "num_feats_per_fold":   feats,
            "num_feats_MEAN":       np.mean(feats)
        }
        # Add in additional parameters not passed in; they override the
        # defaults above on key clashes. A None/empty `params` is tolerated.
        if params:
            parameters.update(params)

        # The training-side object id is not needed afterwards.
        results_processor.persist_results(training_collection_name,
                                          cv_sent_td_ys_by_tag,
                                          cv_sent_td_predictions_by_tag,
                                          parameters, ALGO)

        wd_vd_objectid = results_processor.persist_results(validation_collection,
                                                           cv_sent_vd_ys_by_tag,
                                                           cv_sent_vd_predictions_by_tag,
                                                           parameters, ALGO)

        avg_f1 = float(results_processor.get_metric(validation_collection, wd_vd_objectid, __MICRO_F1__)["f1_score"])
        return avg_f1

    except KeyboardInterrupt:
        # Best-effort behaviour kept from the original: report and return None.
        print("Process stopped by user")
        return None

In [5]:
# Data Set Partition
CV_FOLDS = 5
# NOTE(review): the feature-extraction cells pass the separate lower-case
# `min_feat_freq` instead - confirm whether this constant is still used.
MIN_FEAT_FREQ = 5

# Global settings
settings = Settings()
CAUSAL_MODEL_TYPE = CausalModelType.CORAL_BLEACHING
# Paths are built by plain concatenation, so each piece must end with "/".
root_folder = settings.data_directory + "CoralBleaching/Thesis_Dataset/"
training_folder = root_folder + "Training" + "/"
test_folder = root_folder + "Test" + "/"

# Folder holding the cached parser output (relative to the notebook's working directory).
crels_folder = "./crels/CB"
coref_root = root_folder + "CoReference/"
coref_output_folder = coref_root + "CRel/"

# Results collection for the training data; the logging helper derives the
# validation collection by replacing "_TD" with "_VD".
MONGO_COLLECTION = "CB_PA_RE-RANKER_FEATURE_SEL_TD"
# first and second runs were with initial_weight set to 1.0
# third run is with initial_weight set to 0.001

config = get_config(training_folder)
results_processor = ResultsProcessor(dbname="metrics_causal_model_reranker")

# NOTE: dill.load can execute arbitrary code while unpickling - only load
# files that come from a trusted source.
train_fname = coref_output_folder + "training_crel_anatagged_essays_most_recent_code.dill"
with open(train_fname, "rb") as f:
    pred_tagged_essays_train = dill.load(f)

test_fname = coref_output_folder + "test_crel_anatagged_essays_most_recent_code.dill"
with open(test_fname, "rb") as f:
    pred_tagged_essays_test = dill.load(f)

# Sanity check: number of training and test essays loaded.
print(len(pred_tagged_essays_train), len(pred_tagged_essays_test))

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/
902 226


In [6]:
# Causal-relation tags returned by get_cr_tags for the train and test essays.
cr_tags = get_cr_tags(
    train_tagged_essays=pred_tagged_essays_train,
    tag_essays_test=pred_tagged_essays_test,
)

# De-duplicated copy used by the later cells; peek at up to ten of the tags
# (set iteration order is arbitrary, so the sample may differ between runs).
set_cr_tags = set(cr_tags)
list(set_cr_tags)[:10]

['Causer:50->Result:7',
 'Causer:11->Result:11',
 'Causer:6->Result:50',
 'Causer:12->Result:50',
 'Causer:11->Result:3',
 'Causer:7->Result:14',
 'Causer:7->Result:50',
 'Causer:12->Result:13',
 'Causer:13->Result:12',
 'Causer:2->Result:1']

In [7]:
# Pool the training and test essays and index them by essay name.
all_essays = pred_tagged_essays_train + pred_tagged_essays_test
name2essay = {tagged_essay.name: tagged_essay for tagged_essay in all_essays}

# Causal relations per essay for the known tag set; there must be exactly one
# entry per essay.
name2crels = essay_to_crels(all_essays, set_cr_tags)
assert len(name2crels) == len(all_essays)

In [42]:
# Total number of essays (training + test combined).
len(all_essays)

1128

# Train Re-Ranker

## Extract Parses from Sentence Parser

In [8]:
# Settings for the parse extraction / feature preparation below.
# NOTE(review): the "best_" prefix suggests these were tuned in earlier
# experiments - confirm where the values come from.
best_top_n = 2          # embedded in the cached "xs_rerank_*.dill" file names
min_feat_freq = 1       # passed as min_feat_freq to get_features_from_probabilities
best_max_upd = 2        # passed as max_update_items during feature selection
best_max_parses = 300   # passed to get_features_from_probabilities
best_min_prob = 0.0  # min prob of 0 seems better

In [11]:
# Load the cached parser output for the training and test essays; both file
# names embed best_top_n.
# NOTE: dill.load can execute arbitrary code while unpickling - only open
# trusted files.
rr_fname = "xs_rerank_{}.dill".format(best_top_n)
with open(os.path.join(crels_folder, rr_fname), "rb") as rerank_file:
    xs_rerank = dill.load(rerank_file)

rr_fname = "xs_rerank_test{}.dill".format(best_top_n)
with open(os.path.join(crels_folder, rr_fname), "rb") as rerank_file:
    xs_test_rerank = dill.load(rerank_file)

# There must be one cached entry per essay in each data set.
assert len(xs_rerank) == len(pred_tagged_essays_train),     "Wrong number of train crels"
assert len(xs_test_rerank) == len(pred_tagged_essays_test), "Wrong number of test crels"
len(xs_rerank), len(xs_test_rerank)

(902, 226)

## Prepare Features

In [12]:
# Echo the settings that feed the feature extraction in the next cells.
best_max_parses, min_feat_freq, best_min_prob

(300, 1, 0.0)

In [13]:
%%time
# Build the re-ranker feature instances for the training essays from the
# cached parser output.
xs = get_features_from_probabilities(xs_rerank, name2crels, best_max_parses,
                                     causal_model_type=CAUSAL_MODEL_TYPE,
                                     min_feat_freq=min_feat_freq, min_prob=best_min_prob)

# Use the CV_FOLDS constant from the configuration cell instead of repeating
# the literal 5, so the fold count is controlled from a single place.
cv_folds_rerank = cross_validation(xs, CV_FOLDS)
# Min-max normalize every fold independently (presumably scaled from the
# fold's train split - confirm in feature_normalization).
cv_folds_mm = [min_max_normalize_feats(train, test) for (train, test) in cv_folds_rerank]

CPU times: user 1min 48s, sys: 2.39 s, total: 1min 50s
Wall time: 1min 50s


In [14]:
%%time
# Same feature extraction as for the training essays, applied to the cached
# parser output of the test essays.
xs_test = get_features_from_probabilities(
    xs_test_rerank,
    name2crels,
    best_max_parses,
    causal_model_type=CAUSAL_MODEL_TYPE,
    min_feat_freq=min_feat_freq,
    min_prob=best_min_prob,
)

CPU times: user 5.38 s, sys: 69.8 ms, total: 5.45 s
Wall time: 5.44 s


In [15]:
# Prepare the final train / test data sets.
# The training data is the concatenation of the test split of every CV fold
# built on the full training data set.
xs_train = [
    instance
    for _train_split, test_split in cv_folds_rerank
    for instance in test_split
]

# Normalize both data sets using the training data.
xs_train_mm, xs_test_mm = min_max_normalize_feats(xs_train, xs_test)

### Initial Parameters

In [31]:
# Hyper-parameters passed to the MIRA constructor in the cells below.
best_C = 0.0025       # value passed as MIRA's C
C = best_C            # original note: this needs to be a lot lower
pa_type = 1
# NOTE(review): loss_type is not referenced by any visible cell - confirm
# whether it is still needed.
loss_type= "ml"
max_update_items = 2  # same value as best_max_upd (2)
initial_weight = 0.01  # was 0.01

## Train on Held-out Fold Data (i.e. the CV Test Folds)

In [22]:
# Hold out 20% of the normalized training instances so the number of training
# epochs can be chosen on data the model was not trained on.
# A dedicated, seeded RandomState makes the split reproducible after a kernel
# restart without altering NumPy's global random state.
SPLIT_SEED = 42
num_train = int(0.8 * len(xs_train_mm))
tmp_train_copy = list(xs_train_mm)  # copy, so xs_train_mm keeps its order
np.random.RandomState(SPLIT_SEED).shuffle(tmp_train_copy)
tmp_train, tmp_test = tmp_train_copy[:num_train], tmp_train_copy[num_train:]

### Determine Number of Training Iterations

In [24]:
# Display the function object to confirm which train_instance is bound
# (it is imported from train_reranker).
train_instance

<function train_reranker.train_instance>

In [25]:
%%time
# Fit on the 80% split and score on the 20% hold-out, with early stopping, to
# find out how many training iterations to use.
mdl = MIRA(
    C=best_C,
    pa_type=pa_type,
    max_update_items=max_update_items,
    initial_weight=initial_weight,
)
best_mdl, test_acc_df_ml, best_iterations = train_model(
    mdl,
    xs_train=tmp_train,
    xs_test=tmp_test,
    name2essay=name2essay,
    set_cr_tags=set_cr_tags,
    max_epochs=20,
    early_stop_iters=5,
    train_instance_fn=train_instance,
    verbose=True,
    early_stopping=True,
)
print("Best iterations:", best_iterations)

Epoch: 0 Train Accuracy: 0.7428 Test Accuracy: 0.7391
Epoch: 1 Train Accuracy: 0.7466 Test Accuracy: 0.7296
Epoch: 2 Train Accuracy: 0.7492 Test Accuracy: 0.7254
Epoch: 3 Train Accuracy: 0.7513 Test Accuracy: 0.7252
Epoch: 4 Train Accuracy: 0.7510 Test Accuracy: 0.7260
Epoch: 5 Train Accuracy: 0.7521 Test Accuracy: 0.7300
Best Test Acc: 0.7391
Best iterations: 1
CPU times: user 2min 26s, sys: 740 ms, total: 2min 27s
Wall time: 2min 27s


In [32]:
# Echo the results collection and the chosen iteration count before logging.
# NOTE(review): the recorded output shows 'CB_PA_RE-RANKER_HYPER_PARAM_TD',
# while the configuration cell assigns 'CB_PA_RE-RANKER_FEATURE_SEL_TD' -
# re-run from a fresh kernel to confirm which collection is written to.
MONGO_COLLECTION, best_iterations

('CB_PA_RE-RANKER_HYPER_PARAM_TD', 1)

## Feature Selection

In [27]:
from filter_features import filter_feats

# Candidate feature-name prefixes for the greedy feature selection below;
# each entry is handed to filter_feats as part of a prefix list.
prefixes = [
    "Prob-",
#     "CREL_Pair-",   (disabled candidate)
    "Inv-",
    "num_crels",
    "Tally-",
    "CChain-",
    "CChainStats-",
    "Above-",
    "CREL_",
    "Propn_",
    "Diff_"
]
# xs_fltr_train, xs_fltr_test = filter_feats(xs_train_mm, xs_test_mm, prefixes)
# Each candidate must appear only once in the list.
assert len(prefixes) == len(set(prefixes)), "Duplicate prefixes found"

In [28]:
# State of the greedy forward selection; re-run this cell to restart the search.
best_f1 = -1                 # below any real F1, so the first round always improves
current_best = []            # prefixes selected so far, in selection order
remaining = list(prefixes)   # candidates not yet selected (copy of `prefixes`)

In [30]:
# Echo the settings the feature-selection run below will use and log.
# NOTE(review): the recorded collection name differs from the one assigned in
# the configuration cell - confirm after a fresh run.
MONGO_COLLECTION, best_iterations, initial_weight

('CB_PA_RE-RANKER_HYPER_PARAM_TD', 1, 0.01)

In [34]:
%%time
# Extra settings stored with every logged run.
params = dict(
    best_top_n=best_top_n,
    best_max_upd=best_max_upd,
    best_max_parses=best_max_parses,
    best_min_prob=best_min_prob,
    min_feat_freq=min_feat_freq,
    best_iterations=best_iterations,
)

print("Starting...")
# Greedy forward selection: each round tries every remaining prefix on top of
# the current best set, keeps the one with the highest cross-validated F1, and
# stops once no candidate beats the best F1 so far.
while remaining:
    f1_by_prefix = dict()
    for prefix in remaining:
        new_prefixes = current_best + [prefix]

        # Restrict every normalized CV fold to the candidate prefix set.
        cv_filtered = [
            tuple(filter_feats(fold_train, fold_test, new_prefixes))
            for fold_train, fold_test in cv_folds_mm
        ]

        f1_by_prefix[prefix] = train_model_parallel_logged(
            training_collection_name=MONGO_COLLECTION,
            results_processor=results_processor,
            feat_extractors=new_prefixes,
            params=params,
            cv_folds=cv_filtered,
            name2essay=name2essay,
            C=best_C,
            pa_type=1,
            max_update_items=best_max_upd,
            set_cr_tags=set_cr_tags,
            initial_weight=initial_weight,
            # use best iterations from above
            max_epochs=best_iterations,
            early_stop_iters=best_iterations,
        )
        print("\t{length} feats F1: {f1:.6f} Prefixes: {prefixes}".format(
            length=len(new_prefixes), f1=f1_by_prefix[prefix], prefixes=str(new_prefixes)))

    # Highest-scoring candidate of this round; ties go to the earliest one tried.
    best_prefix, new_best_f1 = max(f1_by_prefix.items(), key=lambda item: item[1])
    if not new_best_f1 > best_f1:
        print("No further improvement, stopping")
        break

    best_f1 = new_best_f1
    current_best.append(best_prefix)
    remaining.remove(best_prefix)
    print("{length} feats, new Best F1: {f1:.6f} Prefixes: {prefixes}".format(
        length=len(current_best), f1=best_f1, prefixes=str(current_best)))

# CS PA Algo:

# 1 feats, new Best F1: 0.7430 Prefixes: ['CREL_']
# 2 feats, new Best F1: 0.7438 Prefixes: ['CREL_', 'Prob-']
# 3 feats, new Best F1: 0.7461 Prefixes: ['CREL_', 'Prob-', 'CChainStats-']
# No further improvement, stopping

Starting...
	1 feats F1: 0.572189 Prefixes: ['Prob-']
	1 feats F1: 0.000000 Prefixes: ['Inv-']
	1 feats F1: 0.736270 Prefixes: ['num_crels']
	1 feats F1: 0.741586 Prefixes: ['Tally-']
	1 feats F1: 0.474108 Prefixes: ['CChain-']
	1 feats F1: 0.626460 Prefixes: ['CChainStats-']
	1 feats F1: 0.717570 Prefixes: ['Above-']
	1 feats F1: 0.741851 Prefixes: ['CREL_']
	1 feats F1: 0.633727 Prefixes: ['Propn_']
	1 feats F1: 0.607971 Prefixes: ['Diff_']
1 feats, new Best F1: 0.741851 Prefixes: ['CREL_']
	2 feats F1: 0.731977 Prefixes: ['CREL_', 'Prob-']
	2 feats F1: 0.741075 Prefixes: ['CREL_', 'Inv-']
	2 feats F1: 0.737476 Prefixes: ['CREL_', 'num_crels']
	2 feats F1: 0.739921 Prefixes: ['CREL_', 'Tally-']
	2 feats F1: 0.741657 Prefixes: ['CREL_', 'CChain-']
	2 feats F1: 0.739344 Prefixes: ['CREL_', 'CChainStats-']
	2 feats F1: 0.729196 Prefixes: ['CREL_', 'Above-']
	2 feats F1: 0.737441 Prefixes: ['CREL_', 'Propn_']
	2 feats F1: 0.736504 Prefixes: ['CREL_', 'Diff_']
No further improvement, stop

In [40]:
# Result of the feature selection: the chosen prefixes and their CV F1.
current_best, best_f1

(['CREL_'], 0.7418508804795804)

## Apply to Test Data

In [35]:
# Restrict the normalized train / test data to the prefixes chosen by the
# feature selection above.
xs_train_mm_fltr, xs_test_mm_fltr = filter_feats(xs_train_mm, xs_test_mm, current_best) 

In [36]:
# Hold out 20% of the filtered training instances so the number of training
# epochs can be re-tuned for the selected feature set.
# A dedicated, seeded RandomState makes the split reproducible after a kernel
# restart without altering NumPy's global random state.
SPLIT_SEED = 42
num_train = int(0.8 * len(xs_train_mm_fltr))
tmp_train_copy = list(xs_train_mm_fltr)  # copy, so xs_train_mm_fltr keeps its order
np.random.RandomState(SPLIT_SEED).shuffle(tmp_train_copy)
tmp_train, tmp_test = tmp_train_copy[:num_train], tmp_train_copy[num_train:]

## Determine the Optimal Number of Training Iterations

In [37]:
%%time
# Repeat the epoch search on the filtered features: fit on the 80% split and
# score on the 20% hold-out, with early stopping.
mdl = MIRA(
    C=best_C,
    pa_type=pa_type,
    max_update_items=max_update_items,
    initial_weight=initial_weight,
)
best_mdl, test_acc_df_ml, best_iterations = train_model(
    mdl,
    xs_train=tmp_train,
    xs_test=tmp_test,
    name2essay=name2essay,
    set_cr_tags=set_cr_tags,
    max_epochs=20,
    early_stop_iters=3,
    train_instance_fn=train_instance,
    verbose=True,
    early_stopping=True,
)

print("Best iterations:", best_iterations)

Epoch: 0 Train Accuracy: 0.7408 Test Accuracy: 0.7502
Epoch: 1 Train Accuracy: 0.7456 Test Accuracy: 0.7505
Epoch: 2 Train Accuracy: 0.7510 Test Accuracy: 0.7460
Epoch: 3 Train Accuracy: 0.7582 Test Accuracy: 0.7431
Epoch: 4 Train Accuracy: 0.7609 Test Accuracy: 0.7500
Best Test Acc: 0.7505
Best iterations: 2
CPU times: user 44.2 s, sys: 271 ms, total: 44.5 s
Wall time: 44.8 s


In [41]:
# Iteration count chosen by the search above; used as max_epochs for the final model.
best_iterations

2

## Train for `best_iterations` Epochs and Evaluate on the Test Dataset

In [39]:
# Final model: train on all filtered training instances for the chosen number
# of iterations (early stopping off) and evaluate on the held-out test set.
mdl = MIRA(
    C=best_C,
    pa_type=pa_type,
    max_update_items=max_update_items,
    initial_weight=initial_weight,
)

best_mdl, test_acc_df_ml, _ = train_model(
    mdl,
    xs_train=xs_train_mm_fltr,
    xs_test=xs_test_mm_fltr,
    name2essay=name2essay,
    set_cr_tags=set_cr_tags,
    max_epochs=best_iterations,
    early_stop_iters=best_iterations,
    train_instance_fn=train_instance,
    verbose=True,
    early_stopping=False,
)

# Result noted from an earlier run - Best Test Acc: 0.7516
best_iterations

Epoch: 0 Train Accuracy: 0.7430 Test Accuracy: 0.7358
Epoch: 1 Train Accuracy: 0.7476 Test Accuracy: 0.7327
Best Test Acc: 0.7358


2