In [1]:
# %load_ext autoreload
# %autoreload 2

In [1]:
import os
import sys
# NOTE(review): absolute, machine-specific path. Presumably the project modules imported in
# the next cell live under <cm_folder>/src - confirm, and consider making this configurable.
cm_folder = "/Users/simon.hughes/GitHub/NlpResearch/PythonNlpResearch/notebooks/Causal Model/"
src_path = os.path.join(cm_folder, "src")
# Append (not prepend) the src folder to the module search path.
sys.path.append(src_path)

In [2]:
from typing import Any

import dill
import numpy as np

from CrossValidation import cross_validation
from MIRA import MIRA
from Settings import Settings

from window_based_tagger_config import get_config
from crel_helper import get_cr_tags
from crel_processing import essay_to_crels_cv
from evaluation import evaluate_model_essay_level, get_micro_metrics, metrics_to_df
from feature_normalization import min_max_normalize_feats
from function_helpers import get_function_names
from results_procesor import ResultsProcessor
from train_parser import essay_to_crels, create_extractor_functions
from cost_functions import micro_f1_cost_plusepsilon
from train_reranker import train_model, train_instance, get_essays_for_data, evaluate_ranker
from searn_parser_breadth_first import SearnModelBreadthFirst
from causal_model_features import CausalModelType
from feature_extraction import get_features_from_probabilities
from results_procesor import ResultsProcessor
from filter_features import filter_feats

from wordtagginghelper import merge_dictionaries
from results_procesor import ResultsProcessor, __MICRO_F1__
from evaluation import add_cr_labels

from random import shuffle
from joblib import Parallel, delayed
from collections import defaultdict

In [3]:
def train_model_fold(xs_train, xs_test, name2essay, C, pa_type, max_update_items, set_cr_tags,
                     initial_weight, max_epochs, early_stop_iters):
    """Train a freshly constructed MIRA model on a single (train, test) split.

    A new ``MIRA`` instance is built from ``C``, ``pa_type``, ``max_update_items`` and
    ``initial_weight`` and handed to ``train_model`` together with the data and the
    epoch settings. Training is run non-verbosely, with ``early_stopping=False`` and
    ``return_metrics=True``.

    Returns:
        Whatever ``train_model`` returns in ``return_metrics=True`` mode. The callers in
        this notebook unpack it as a 7-tuple whose first element is the best test F1.
    """
    mira_settings = dict(
        C=C,
        pa_type=pa_type,
        max_update_items=max_update_items,
        initial_weight=initial_weight,
    )
    ranker = MIRA(**mira_settings)

    return train_model(
        ranker,
        xs_train=xs_train,
        xs_test=xs_test,
        name2essay=name2essay,
        max_epochs=max_epochs,
        early_stop_iters=early_stop_iters,
        set_cr_tags=set_cr_tags,
        train_instance_fn=train_instance,
        verbose=False,
        return_metrics=True,
        early_stopping=False,
    )

def train_model_parallel(cv_folds, name2essay, C, pa_type, max_update_items, set_cr_tags,
                         initial_weight, max_epochs=5, early_stop_iters=5, n_jobs=None):
    """Train one model per cross-validation fold in parallel and return the mean test F1.

    Args:
        cv_folds: iterable of ``(train, test)`` pairs; each pair is passed to
            ``train_model_fold``.
        name2essay, C, pa_type, max_update_items, set_cr_tags, initial_weight,
        max_epochs, early_stop_iters: forwarded unchanged to ``train_model_fold``.
        n_jobs: number of joblib workers; ``None`` means one worker per fold.

    Returns:
        ``np.mean`` of the per-fold best test F1 scores, or ``None`` if the run was
        interrupted with Ctrl-C (a message is printed in that case).
    """
    # Identity check: `== None` can be hijacked by objects that overload __eq__.
    if n_jobs is None:
        n_jobs = len(cv_folds)
    try:
        results = Parallel(n_jobs=n_jobs)(
            delayed(train_model_fold)(train, test, name2essay, C, pa_type, max_update_items,
                                      set_cr_tags, initial_weight, max_epochs, early_stop_iters)
            for (train, test) in cv_folds)

        # Each fold result is a tuple whose first element is the best test F1;
        # the remaining elements are not needed here.
        f1s = [fold_result[0] for fold_result in results]
        return np.mean(f1s)

    except KeyboardInterrupt:
        print("Process stopped by user")
        # Explicit so callers can see that an interrupted run yields no score.
        return None

def train_model_parallel_logged(training_collection_name: str, results_processor: ResultsProcessor,
                                feat_extractors, params,
                                cv_folds, name2essay,
                                C: float, pa_type: str, max_update_items:int, set_cr_tags, \
                                initial_weight: float,  max_epochs=5, early_stop_iters=5, n_jobs=None):
    """Train one model per fold in parallel, persist the pooled results, return micro F1.

    Each ``(train, test)`` pair in ``cv_folds`` is trained via ``train_model_fold``. The
    per-tag true labels and predictions of all folds are merged and written with
    ``results_processor.persist_results`` to two collections: ``training_collection_name``
    for the training-side results and the same name with ``"_TD"`` replaced by ``"_VD"``
    for the test-side results. The stored parameter document is built from the arguments
    and then updated with ``params``, so keys in ``params`` win on conflict.

    Note: ``pa_type`` is annotated ``str`` but the visible notebook cells pass integers.

    Returns:
        The ``"f1_score"`` of the ``__MICRO_F1__`` metric for the just-persisted
        validation results, as a float; ``None`` if interrupted with Ctrl-C.
    """
    # Any falsy n_jobs (None or 0) falls back to one worker per fold.
    if not n_jobs:
        n_jobs = len(cv_folds)

    try:
        fold_results = Parallel(n_jobs=n_jobs)(
            delayed(train_model_fold)(train, test, name2essay, C, pa_type, max_update_items,
                                      set_cr_tags, initial_weight, max_epochs, early_stop_iters)
            for (train, test) in cv_folds)

        # Pooled labels / predictions across folds, keyed by tag.
        td_ys_by_tag, td_preds_by_tag = defaultdict(list), defaultdict(list)
        vd_ys_by_tag, vd_preds_by_tag = defaultdict(list), defaultdict(list)

        f1s, feats = [], []
        for fold_result in fold_results:
            (best_test_f1, best_iterations,
             train_ys_bytag, train_pred_ys_bytag,
             test_ys_bytag, test_pred_ys_bytag,
             num_feats) = fold_result
            feats.append(num_feats)
            f1s.append(best_test_f1)

            merge_pairs = (
                (train_ys_bytag, td_ys_by_tag),
                (test_ys_bytag, vd_ys_by_tag),
                (train_pred_ys_bytag, td_preds_by_tag),
                (test_pred_ys_bytag, vd_preds_by_tag),
            )
            for fold_dict, pooled_dict in merge_pairs:
                merge_dictionaries(fold_dict, pooled_dict)

        algo_name = "MIRA Re-Ranker"
        validation_collection = training_collection_name.replace("_TD", "_VD")

        parameters = {
            "C": C,
            "pa_type": pa_type,
            "loss_type": "None - cost insens",
            "max_update_items": max_update_items,
            "initial_weight": initial_weight,
            "max_epochs": max_epochs,
            "early_stopping_iters": early_stop_iters,
            "extractors": list(feat_extractors),
            # number of features, per fold and averaged
            "num_feats_per_fold": feats,
            "num_feats_MEAN": np.mean(feats),
        }
        # Caller-supplied extras (override the defaults above on key clash).
        parameters.update(params)

        results_processor.persist_results(
            training_collection_name, td_ys_by_tag, td_preds_by_tag, parameters, algo_name)
        vd_objectid = results_processor.persist_results(
            validation_collection, vd_ys_by_tag, vd_preds_by_tag, parameters, algo_name)

        micro_metric = results_processor.get_metric(validation_collection, vd_objectid, __MICRO_F1__)
        return float(micro_metric["f1_score"])

    except KeyboardInterrupt:
        print("Process stopped by user")

In [4]:
# Data Set Partition
# NOTE(review): neither constant is referenced in the visible cells - later cells pass a
# literal 5 to cross_validation and use the lower-case `min_feat_freq` (= 1) instead.
CV_FOLDS = 5
MIN_FEAT_FREQ = 5

# Global settings
settings = Settings()
CAUSAL_MODEL_TYPE = CausalModelType.SKIN_CANCER
# Folder layout is built by plain string concatenation, so `settings.data_directory`
# must already end with a path separator.
root_folder = settings.data_directory + "SkinCancer/Thesis_Dataset/"
training_folder = root_folder + "Training" + "/"
test_folder = root_folder + "Test" + "/"

# Relative path: resolved against the kernel's current working directory.
crels_folder = "./crels/SC"
coref_root = root_folder + "CoReference/"
coref_output_folder = coref_root + "CRel/"

# NOTE(review): recorded outputs further down show
# 'SC_COST_INSENS_RE-RANKER_HYPER_PARAM_TD' for MONGO_COLLECTION, which does not match
# the value assigned here - the saved outputs appear to come from a different revision.
MONGO_COLLECTION = "SC_PA_RE-RANKER_HYPER_PARAM_TD"
MONGO_TEST_COLLECTION = "TEST_SC_PA_RE-RANKER_TD"

config = get_config(training_folder)
results_processor = ResultsProcessor(dbname="metrics_causal_model_reranker")

# SECURITY: dill.load (like pickle) can execute arbitrary code while deserializing;
# only load files from a trusted source.
train_fname = coref_output_folder + "training_crel_anatagged_essays_most_recent_code.dill"
with open(train_fname, "rb") as f:
    pred_tagged_essays_train = dill.load(f)

test_fname = coref_output_folder + "test_crel_anatagged_essays_most_recent_code.dill"
with open(test_fname, "rb") as f:
    pred_tagged_essays_test = dill.load(f)

# Number of training essays, number of test essays
print(len(pred_tagged_essays_train), len(pred_tagged_essays_test))

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/
870 218


In [5]:
# Tags returned by get_cr_tags for the training and test essays together.
cr_tags = get_cr_tags(train_tagged_essays=pred_tagged_essays_train, tag_essays_test=pred_tagged_essays_test)

# Set form; this is what essay_to_crels and the training functions below receive.
set_cr_tags = set(cr_tags)
# Peek at ten tags. Set iteration order is arbitrary, so the sample shown can differ
# from one kernel session to the next.
list(set_cr_tags)[0:10]

['Causer:12->Result:3',
 'Causer:50->Result:5',
 'Causer:2->Result:11',
 'Causer:6->Result:4',
 'Causer:11->Result:12',
 'Causer:11->Result:3',
 'Causer:1->Result:5',
 'Causer:1->Result:4',
 'Causer:11->Result:4',
 'Causer:4->Result:50']

In [6]:
# Train and test essays combined, so the lookups below cover both splits.
all_essays = pred_tagged_essays_train + pred_tagged_essays_test
# Lookup from essay name to essay object. If two essays share a name, the later one
# silently replaces the earlier one.
name2essay = {}
for essay in all_essays:
    name2essay[essay.name] = essay

name2crels = essay_to_crels(all_essays, set_cr_tags)
# Sanity check: one entry per essay.
assert len(name2crels) == len(all_essays)

# Train Re-Ranker

## Extract Parses from Sentence Parser

In [7]:
# Defaults used by the feature-extraction and logging cells below.
best_top_n = 2          # only recorded in the default `params` dict in the visible cells
min_feat_freq = 1       # passed as min_feat_freq to get_features_from_probabilities
best_max_upd = 1        # NOTE: re-assigned as a loop variable by the grid-search cell
best_max_parses = 300   # third positional argument of get_features_from_probabilities
best_min_prob = 0.0  # min prob of 0 seems better

In [8]:
def load_rerank(top_n):
    """Load the pre-computed re-ranker inputs for a given ``top_n`` from ``crels_folder``.

    Two dill files are read: ``xs_rerank_<top_n>.dill`` (training) and
    ``xs_rerank_test<top_n>.dill`` (test; note there is no underscore before the number).

    Returns:
        A ``(xs_rerank, xs_test_rerank)`` tuple of the two deserialized objects.
    """
    def _load(prefix):
        # File name is <prefix><top_n>.dill inside crels_folder.
        fname = prefix + str(top_n) + ".dill"
        with open(os.path.join(crels_folder, fname), "rb") as fin:
            return dill.load(fin)

    xs_rerank = _load("xs_rerank_")
    xs_test_rerank = _load("xs_rerank_test")
    return xs_rerank, xs_test_rerank

## Prepare Features

In [9]:
from filter_features import filter_feats

# Candidate feature-name prefixes. NOTE(review): this list is not referenced by any
# other visible cell; only `current_best` below is passed to filter_feats.
prefixes = [
    "Prob-",
#     "CREL_Pair-",
    "Inv-",
    "num_crels",
    "Tally-",
    "CChain-",
    "CChainStats-",
    "Above-",
    "CREL_",
    "Propn_",
    "Diff_"
]

# Results for SC
# Prefix subset handed to filter_feats and logged as "extractors".
current_best = ['CREL_', 'CChain-', 'Prob-']
# Used as max_epochs / early_stop_iters by trn_mdl; overwritten later by the
# "optimal number of training iterations" cell.
best_iterations = 1

# default params (not all of these are optimized)
# NOTE(review): in the visible cells this dict is not passed anywhere before `params`
# is re-assigned in the final test-logging cell.
params = {
    "best_top_n": best_top_n,
    "best_max_upd": best_max_upd,
    "best_max_parses": best_max_parses,
    "best_min_prob": best_min_prob,
    "min_feat_freq": min_feat_freq
}

In [10]:
# Display the active collection name and epoch count.
# NOTE(review): the recorded output names 'SC_COST_INSENS_RE-RANKER_HYPER_PARAM_TD',
# while the config cell assigns "SC_PA_RE-RANKER_HYPER_PARAM_TD" - re-run to refresh.
MONGO_COLLECTION, best_iterations

('SC_COST_INSENS_RE-RANKER_HYPER_PARAM_TD', 1)

### Load Param Hash (avoid re-running the same experiment multiple times)

In [11]:
# White list of parameter names that together identify one experiment.
vals = (
    "C",
    "best_max_parses",
    # "best_max_upd",
    "max_update_items",
    "best_min_prob",
    "best_top_n",
    "extractors",
    "initial_weight",
    "min_feat_freq",
    "pa_type",
)

def hash_params(params):
    """Build a canonical string key from the white-listed entries of ``params``.

    Only the keys named in ``vals`` are used; any other keys are ignored. The
    selected items are sorted by key, rendered with ``str`` and stripped of every
    space character (including spaces inside string values).

    Raises:
        KeyError: if ``params`` lacks any key listed in ``vals``.
    """
    whitelisted = {name: params[name] for name in vals}
    ordered_items = sorted(whitelisted.items())
    return str(ordered_items).replace(" ", "")

def load_param_hash(db, collection):
    """Return the set of parameter hashes for every document in ``db[collection]``.

    Runs an aggregation that projects each document's ``parameters`` field as
    ``params`` (plus ``_id``), prints the number of rows fetched, and hashes each
    row's ``params`` with ``hash_params``.
    """
    projection = {
        "params": "$parameters",
        # "asof": "$asof",
        "_id": 1,
    }
    pipeline = [{"$project": projection}]
    rows = list(db[collection].aggregate(pipeline))
    print("len(rows)", len(rows))
    return {hash_params(row["params"]) for row in rows}


In [12]:
import pymongo

# Local MongoDB connection; server selection gives up after 100 ms.
client = pymongo.MongoClient(serverSelectionTimeoutMS=100, host="127.0.0.1")
# Hashes of the parameter sets already stored in MONGO_COLLECTION (same database name
# as the one given to ResultsProcessor in the config cell).
param_hash = load_param_hash(client.metrics_causal_model_reranker, MONGO_COLLECTION)
len(param_hash), MONGO_COLLECTION

len(rows) 1


(1, 'SC_COST_INSENS_RE-RANKER_HYPER_PARAM_TD')

# Hyper Parameter Optimization

In [13]:
import datetime
# Display the current wall-clock time (presumably to mark when the search below started).
datetime.datetime.now()

datetime.datetime(2019, 6, 23, 15, 52, 18, 562401)

In [14]:
# Running best F1, updated by trn_mdl through a `global` statement; -1 guarantees the
# first score compared against it is reported as a new best.
best_f1 = -1
# best_f1 = 0.7457

In [15]:
# Helper to simplify the grid-search loop. It is not a true closure: the
# hyper-parameters are read from module-level globals set by the loop.
def trn_mdl(top_n, prms, cv_filtrd):
    """Train and log one hyper-parameter configuration and report its F1.

    Reads the module-level names ``best_C``, ``pa_type``, ``best_max_upd``,
    ``initial_weight``, ``best_iterations``, ``current_best``, ``name2essay``,
    ``set_cr_tags``, ``results_processor`` and ``MONGO_COLLECTION``, and updates the
    global ``best_f1`` when the new score is strictly better.

    Args:
        top_n: the top-n setting of the current data; used to pick the worker count
            and shown in the printed summary.
        prms: extra parameters merged into the logged parameter document.
        cv_filtrd: list of ``(train, test)`` folds to train on.

    Raises:
        KeyboardInterrupt: if training was interrupted. ``train_model_parallel_logged``
            swallows the interrupt and returns ``None``; re-raising it stops the
            surrounding search with a clear signal instead of the ``TypeError`` that
            comparing ``None`` with a float would produce.
    """
    # Only best_f1 is assigned here; the other settings are read-only globals.
    global best_f1

    # Uses too much RAM, drop to single threaded
    n_jobs = 1 if top_n > 3 else None

    f1 = train_model_parallel_logged(
        training_collection_name=MONGO_COLLECTION, results_processor=results_processor,
        feat_extractors=current_best, params=prms,

        cv_folds=cv_filtrd, name2essay=name2essay,
        C=best_C, pa_type=pa_type, max_update_items=best_max_upd, initial_weight=initial_weight,
        set_cr_tags=set_cr_tags,
        # use best iterations from above
        max_epochs=best_iterations, early_stop_iters=best_iterations,
        n_jobs=n_jobs
    )
    if f1 is None:
        raise KeyboardInterrupt("Process stopped by user")

    is_new_best = f1 > best_f1
    if is_new_best:
        best_f1 = f1
    # Both labels have the same width so the printed columns line up.
    label = "New Best F1" if is_new_best else "         F1"
    print("{label}: {f1:.6f}:\t{params}".format(
        label=label, f1=f1,
        params=str((top_n, best_C, pa_type, best_max_upd, initial_weight))))

In [16]:
# Refresh the set of already-run configurations so they can be skipped below.
param_hash = load_param_hash(client.metrics_causal_model_reranker, MONGO_COLLECTION)

# NOTE: the loop variables best_C, pa_type, best_max_upd and initial_weight are module
# globals that trn_mdl reads directly; do not rename them without updating trn_mdl.
for top_n in [1,2,3,5]: # [1,2,3,5] - for SC
    
    print("top_n", top_n)
    # Only the training part is needed here; the test part is discarded.
    xs_rr, _ = load_rerank(top_n)
    xs_temp = get_features_from_probabilities(xs_rr, name2crels, best_max_parses, 
                                     causal_model_type=CAUSAL_MODEL_TYPE,
                                     min_feat_freq=min_feat_freq, min_prob=best_min_prob)
    
    # NOTE(review): literal 5 rather than the CV_FOLDS constant (also 5) - presumably
    # the fold count; confirm against cross_validation's signature.
    cv_flds_rr = cross_validation(xs_temp, 5)
    # Normalization is applied per fold, on each (train, test) pair separately.
    cv_flds_mm = [min_max_normalize_feats(train, test) for (train, test) in cv_flds_rr]
    
    # Filter feats
    cv_filtered = []
    for tr, test in cv_flds_mm:
        x_tr,x_test = filter_feats(tr, test, current_best)
        cv_filtered.append((x_tr,x_test))
    
    # `skipped` tracks whether dots were just printed, so a newline can be emitted
    # before the next training summary line.
    skipped = False
    for best_C in [0.0005, 0.0025, 0.0100, 0.1]:
        for pa_type in [1,2]: # [0]
            for best_max_upd in [1]:
                for initial_weight in [0.01]:
                    # Same keys that hash_params white-lists; also logged via trn_mdl.
                    p = {'C': best_C,
                         'best_max_parses': best_max_parses,
                         'best_min_prob': best_min_prob,
                         'best_top_n': top_n,
                         'extractors': list(current_best),
                         'initial_weight': initial_weight,
                         'max_update_items': best_max_upd,
                         'min_feat_freq': min_feat_freq,
                         'pa_type': pa_type
                    }
                    hash_p = hash_params(p)
                    # Configuration already stored in MongoDB: print a dot and move on.
                    if hash_p in param_hash:
                        print(".", end = '')
                        skipped = True
                        continue
                    if skipped:
                        print()
                    trn_mdl(top_n, p, cv_filtered)
                    skipped = False

# Column order of the tuples printed by trn_mdl:
# top_n, best_C, pa_type, best_max_upd, initial_weight

len(rows) 1
top_n 1
.
New Best F1: 0.805234:	(1, 0.0005, 2, 1, 0.01)
         F1: 0.799393:	(1, 0.0025, 1, 1, 0.01)
         F1: 0.799945:	(1, 0.0025, 2, 1, 0.01)
         F1: 0.800220:	(1, 0.01, 1, 1, 0.01)
         F1: 0.799614:	(1, 0.01, 2, 1, 0.01)
         F1: 0.803513:	(1, 0.1, 1, 1, 0.01)
         F1: 0.794376:	(1, 0.1, 2, 1, 0.01)
top_n 2
New Best F1: 0.806197:	(2, 0.0005, 1, 1, 0.01)
         F1: 0.805234:	(2, 0.0005, 2, 1, 0.01)
         F1: 0.798897:	(2, 0.0025, 1, 1, 0.01)
         F1: 0.800495:	(2, 0.0025, 2, 1, 0.01)
         F1: 0.800716:	(2, 0.01, 1, 1, 0.01)
         F1: 0.801928:	(2, 0.01, 2, 1, 0.01)
         F1: 0.803738:	(2, 0.1, 1, 1, 0.01)
         F1: 0.798282:	(2, 0.1, 2, 1, 0.01)
top_n 3
         F1: 0.799260:	(3, 0.0005, 1, 1, 0.01)
         F1: 0.797267:	(3, 0.0005, 2, 1, 0.01)
         F1: 0.797052:	(3, 0.0025, 1, 1, 0.01)
         F1: 0.799076:	(3, 0.0025, 2, 1, 0.01)
         F1: 0.802286:	(3, 0.01, 1, 1, 0.01)
         F1: 0.804017:	(3, 0.01, 2, 1, 0.01)

## Apply to Test Data

In [17]:
# Display the feature-extraction settings that the test-data cells below will use.
min_feat_freq, best_min_prob

(1, 0.0)

In [19]:
# Best row from the hyper-parameter search (copied by hand from the results):
# f1_score	best_top_n	C	    best_max_parses	  max_update_items	initial_weight	pa_type
# 0.806197	2	        0.0005	300	              1	                0.01	1

# # current best settings
# C=best_C, pa_type=pa_type, loss_type=loss_type, max_update_items=best_max_upd, initial_weight=initial_weight,

# These assignments also reset the globals left over from the grid-search loop.
top_n = 2
best_C = 0.0005
best_max_upd = 1
# NOTE(review): the visible cells below pass `best_max_upd`, not `max_update_items`,
# so this variable appears unused.
max_update_items = 1
initial_weight = 0.01
pa_type = 1

In [20]:
# Load both the training and test re-ranker inputs for the chosen top_n.
xs_rr, xs_tst_rr = load_rerank(top_n)
xs_temp = get_features_from_probabilities(xs_rr, name2crels, best_max_parses, 
                                 causal_model_type=CAUSAL_MODEL_TYPE,
                                 min_feat_freq=min_feat_freq, min_prob=best_min_prob)

# Don't need CV here, but leave it in anyways
cv_flds_rr = cross_validation(xs_temp, 5)    
xs_test = get_features_from_probabilities(xs_tst_rr, name2crels, best_max_parses, 
                                      causal_model_type=CAUSAL_MODEL_TYPE,
                                      min_feat_freq=min_feat_freq, min_prob=best_min_prob)
# Rebuild the training set by concatenating the test part of every fold, so the
# instance order follows the fold order rather than the order in xs_temp.
# (presumably each instance lands in exactly one test part - confirm in cross_validation)
xs_train_tmp = []
for train, test in cv_flds_rr:
    xs_train_tmp.extend(test)

# Normalize train and test together, then keep only the `current_best` feature prefixes.
xs_train_mm, xs_test_mm = min_max_normalize_feats(xs_train_tmp, xs_test)
# Filter at the end
xs_train_mm_fltr, xs_test_mm_fltr = filter_feats(xs_train_mm, xs_test_mm, current_best) 

## Determine the Optimal Number of Training Iterations

In [21]:
# tuning dataset from training data
# 80/20 split of the filtered training instances, used only to pick the number of
# training epochs. The shuffle uses a private, seeded RandomState so the split (and
# therefore the chosen `best_iterations`) is reproducible after a kernel restart and
# does not disturb NumPy's global random state.
TUNING_SPLIT_SEED = 42
num_train = int(0.8 * len(xs_train_mm_fltr))
# Copy first so the shuffle does not reorder xs_train_mm_fltr itself.
tmp_train_copy = list(xs_train_mm_fltr)
np.random.RandomState(TUNING_SPLIT_SEED).shuffle(tmp_train_copy)
tmp_train, tmp_test = tmp_train_copy[:num_train], tmp_train_copy[num_train:]

In [22]:
%%time
# Fresh model with the selected hyper-parameters.
mdl = MIRA(C=best_C, pa_type=pa_type, 
                        max_update_items=best_max_upd, initial_weight=initial_weight)
# Determine number of training iterations
# Trains on the tuning split with early stopping enabled (up to 20 epochs,
# early_stop_iters=3). In this mode train_model's result is unpacked as a 3-tuple.
# NOTE: this overwrites the global `best_iterations`, which trn_mdl also reads.
best_mdl, test_acc_df_ml, best_iterations = train_model(mdl, xs_train=tmp_train, xs_test=tmp_test, name2essay=name2essay, set_cr_tags=set_cr_tags,
     max_epochs=20, early_stop_iters=3, train_instance_fn = train_instance,
                                                        verbose=True,  early_stopping=True)

Epoch: 0 Train Accuracy: 0.8044 Test Accuracy: 0.8109
Epoch: 1 Train Accuracy: 0.8039 Test Accuracy: 0.8061
Epoch: 2 Train Accuracy: 0.8021 Test Accuracy: 0.8035
Epoch: 3 Train Accuracy: 0.8005 Test Accuracy: 0.8029
Best Test Acc: 0.8109
CPU times: user 50.4 s, sys: 1.55 s, total: 52 s
Wall time: 52.5 s


In [23]:
# Number of epochs selected by the tuning run above.
best_iterations

1

## Train for `best_iterations` Epochs and Evaluate on the Test Dataset

In [24]:
# Re-create the model so no weights carry over from the tuning run.
mdl = MIRA(C=best_C, pa_type=pa_type, 
                        max_update_items=best_max_upd, initial_weight=initial_weight)

# Train on the full (filtered) training set and evaluate on the held-out test set.
# Early stopping is disabled and max_epochs is set to `best_iterations`.
best_mdl, test_acc_df_ml,_ = train_model(mdl,  
    xs_train=xs_train_mm_fltr, xs_test=xs_test_mm_fltr,
    name2essay=name2essay, set_cr_tags=set_cr_tags,
    max_epochs=best_iterations, early_stop_iters=best_iterations,
    train_instance_fn = train_instance, verbose=True, early_stopping=False)

# Display the epoch count that was used.
best_iterations

Epoch: 0 Train Accuracy: 0.8054 Test Accuracy: 0.8280
Best Test Acc: 0.8280


1

In [25]:
%%time
# A single "fold": train on the full training set, evaluate on the test set.
test_folds = [(xs_train_mm_fltr, xs_test_mm_fltr)]

# Extra parameters merged into the logged document; same keys as the grid-search cell
# uses. NOTE: this re-assigns the `params` dict defined earlier in the notebook.
params = {
    'C': best_C,
    'best_max_parses': best_max_parses,
    'best_min_prob': best_min_prob,
    'best_top_n': top_n,
    'extractors': list(current_best),
    'initial_weight': initial_weight,
    'max_update_items': best_max_upd,
    'min_feat_freq': min_feat_freq,
    'pa_type': pa_type
}

# Results are persisted under MONGO_TEST_COLLECTION and its "_VD" counterpart.
# n_jobs is omitted, so the function falls back to len(cv_folds) == 1 worker.
test_f1 = train_model_parallel_logged(
        training_collection_name=MONGO_TEST_COLLECTION, results_processor=results_processor,
        feat_extractors=current_best, params=params,

        cv_folds=test_folds, 
        
        name2essay=name2essay, 
        C=best_C, pa_type=pa_type, max_update_items=best_max_upd, initial_weight=initial_weight,
        set_cr_tags=set_cr_tags,
        # use best iterations from above
        max_epochs=best_iterations, early_stop_iters=best_iterations
    )
print(test_f1)

0.8279816513761469
CPU times: user 14.7 s, sys: 35.5 ms, total: 14.8 s
Wall time: 14.8 s


In [26]:
# Sizes of the loaded training re-rank input and of the filtered test feature set.
len(xs_rr), len(xs_test_mm_fltr)

(870, 218)

In [27]:
# sorted(best_mdl.weights.items(), key = lambda tpl: (tpl[0]))[0:50]

In [28]:
# Ten (feature, weight) pairs of the final model with the largest absolute weight.
sorted(best_mdl.weights.items(), key=lambda feat_weight: -abs(feat_weight[1]))[:10]

[('Prob-prod-prob', 0.03604832328561456),
 ('Prob-med-prob', 0.034467183620218236),
 ('Prob-25%-prob', 0.03436321262488109),
 ('Prob-max-prob', 0.034303192860812015),
 ('Prob-95%-prob', 0.03422051311351167),
 ('Prob-90%-prob', 0.03413902866855831),
 ('Prob-75%-prob', 0.03397033596081636),
 ('Prob-avg-prob', 0.03377268476817905),
 ('Prob-geo-mean', 0.03372539541185161),
 ('Prob-10%-prob', 0.03312162777453066)]