In [1]:
# %load_ext autoreload
# %autoreload 2

In [2]:
import os
import sys
# NOTE(review): machine-specific absolute path; adjust before running on another machine.
cm_folder = "/Users/simon.hughes/GitHub/NlpResearch/PythonNlpResearch/notebooks/Causal Model/"
src_path = os.path.join(cm_folder, "src")
# Guard the append so re-running this cell does not pile up duplicate sys.path entries.
if src_path not in sys.path:
    sys.path.append(src_path)

In [3]:
from typing import Any

import dill
import numpy as np

from CrossValidation import cross_validation
from MIRA import CostSensitiveMIRA
from Settings import Settings

from window_based_tagger_config import get_config
from crel_helper import get_cr_tags
from crel_processing import essay_to_crels_cv
from evaluation import evaluate_model_essay_level, get_micro_metrics, metrics_to_df
from feature_normalization import min_max_normalize_feats
from function_helpers import get_function_names
from results_procesor import ResultsProcessor
from train_parser import essay_to_crels, create_extractor_functions
from cost_functions import micro_f1_cost_plusepsilon
from train_reranker import train_model_parallel_logged, train_model_parallel,\
    train_model, train_cost_sensitive_instance
from searn_parser_breadth_first import SearnModelBreadthFirst
from causal_model_features import CausalModelType
from feature_extraction import get_features_from_probabilities
from results_procesor import ResultsProcessor
from filter_features import filter_feats

In [4]:
# Data Set Partition
CV_FOLDS = 5
MIN_FEAT_FREQ = 5


def _read_dill(path):
    """Return the object that ``dill.load`` deserializes from the file at ``path``."""
    with open(path, "rb") as handle:
        return dill.load(handle)


# Global settings
settings = Settings()
CAUSAL_MODEL_TYPE = CausalModelType.CORAL_BLEACHING

# Folder layout: everything except crels_folder hangs off settings.data_directory
root_folder = settings.data_directory + "CoralBleaching/Thesis_Dataset/"
training_folder = f"{root_folder}Training/"
test_folder = f"{root_folder}Test/"
coref_root = f"{root_folder}CoReference/"
coref_output_folder = f"{coref_root}CRel/"
crels_folder = "./crels/CB"

MONGO_COLLECTION = "CB_RE-RANKER_HYPER_PARAM_TD"
# first and second were with initial_weight set to 1.0
# third is with it set to 0.001

config = get_config(training_folder)
results_processor = ResultsProcessor(dbname="metrics_causal_model_reranker")

# Tagged essays for the training and test partitions
train_fname = f"{coref_output_folder}training_crel_anatagged_essays_most_recent_code.dill"
pred_tagged_essays_train = _read_dill(train_fname)

test_fname = f"{coref_output_folder}test_crel_anatagged_essays_most_recent_code.dill"
pred_tagged_essays_test = _read_dill(test_fname)

print(len(pred_tagged_essays_train), len(pred_tagged_essays_test))

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/
902 226


In [5]:
# Tags returned by get_cr_tags for the train and test essays, plus a de-duplicated set of them.
cr_tags = get_cr_tags(
    train_tagged_essays=pred_tagged_essays_train,
    tag_essays_test=pred_tagged_essays_test,
)
set_cr_tags = set(cr_tags)

# Peek at ten of the unique tags (set order is arbitrary).
list(set_cr_tags)[:10]

['Causer:13->Result:14',
 'Causer:5->Result:14',
 'Causer:7->Result:4',
 'Causer:7->Result:13',
 'Causer:7->Result:14',
 'Causer:1->Result:5',
 'Causer:5->Result:11',
 'Causer:3->Result:14',
 'Causer:2->Result:50',
 'Causer:50->Result:7']

In [6]:
# Index every essay (train followed by test) by its name.
all_essays = pred_tagged_essays_train + pred_tagged_essays_test
name2essay = {essay.name: essay for essay in all_essays}

# essay_to_crels must yield exactly one entry per essay.
name2crels = essay_to_crels(all_essays, set_cr_tags)
assert len(name2crels) == len(all_essays)

# Train Re-Ranker

## Extract Parses from Sentence Parser

In [7]:
# Values used by the feature-extraction and training cells further down.
best_top_n, best_max_upd = 2, 2
best_max_parses = 300
min_feat_freq = 1
# a minimum probability of 0 seems to work better
best_min_prob = 0.0

In [8]:
def load_rerank(top_n):
    """Load the two dill files saved for a given ``top_n``.

    Reads ``xs_rerank_<top_n>.dill`` and then ``xs_rerank_test<top_n>.dill``
    (the test file name has no underscore before the number) from
    ``crels_folder``.

    Returns:
        ``(xs_rerank, xs_test_rerank)``: the objects deserialized from the
        first and second file respectively.
    """
    loaded = []
    for stem in ("xs_rerank_", "xs_rerank_test"):
        dill_path = os.path.join(crels_folder, stem + str(top_n) + ".dill")
        with open(dill_path, "rb") as handle:
            loaded.append(dill.load(handle))
    xs_rerank, xs_test_rerank = loaded
    return xs_rerank, xs_test_rerank

## Prepare Features

In [9]:
best_max_parses, min_feat_freq, best_min_prob

(300, 1, 0.0)

In [10]:
# Feature-name prefixes; "CREL_Pair-" is currently commented out.
prefixes = [
    "Prob-",
    # "CREL_Pair-",
    "Inv-", "num_crels", "Tally-",
    "CChain-", "CChainStats-",
    "Above-", "CREL_", "Propn_", "Diff_",
]

# Results from run 3
current_best = ["CREL_", "Prob-", "CChainStats-"]
best_iterations = 3

# default params (not all of these are optimized)
params = dict(
    best_top_n=best_top_n,
    # best_max_upd=best_max_upd,
    best_max_parses=best_max_parses,
    best_min_prob=best_min_prob,
    min_feat_freq=min_feat_freq,
)

### Load Param Hash (avoid re-running same experiment multiple times)

In [11]:
# Whitelist of parameter names that make up an experiment's identity.
vals = (
    "C",
    "best_max_parses",
    # "best_max_upd",
    "max_update_items",
    "best_min_prob",
    "best_top_n",
    "extractors",
    "initial_weight",
    "loss_type",
    "min_feat_freq",
    "pa_type",
)


def hash_params(params):
    """Build a canonical string key from the whitelisted entries of ``params``.

    Only the keys listed in ``vals`` are used; any other keys are ignored.
    The (key, value) pairs are sorted by key, rendered with ``str`` and
    stripped of every space character.

    Raises:
        KeyError: if ``params`` lacks one of the keys in ``vals``.
    """
    whitelisted = sorted((name, params[name]) for name in vals)
    return str(whitelisted).replace(" ", "")

def load_param_hash(db, collection):
    """Collect the parameter hashes of every document in ``db[collection]``.

    Runs an aggregation that projects each document's ``parameters`` field
    (as ``params``) together with ``_id``, prints the number of rows fetched,
    and hashes each row's parameters with ``hash_params``.

    Returns:
        A set of hash strings; rows with identical whitelisted parameters
        collapse into a single entry.
    """
    # The "asof" field is intentionally not projected.
    pipeline = [{"$project": {"params": "$parameters", "_id": 1}}]
    rows = list(db[collection].aggregate(pipeline))
    print("len(rows)", len(rows))
    return {hash_params(row["params"]) for row in rows}

In [12]:
import pymongo

# Connect with MongoClient's default arguments and hash the experiments already stored in the collection.
client = pymongo.MongoClient()
metrics_db = client["metrics_causal_model_reranker"]
param_hash = load_param_hash(metrics_db, MONGO_COLLECTION)
(len(param_hash), MONGO_COLLECTION)

len(rows) 564


(426, 'CB_RE-RANKER_HYPER_PARAM_TD')

In [13]:
# %%time

# topn2cvfolds_mm = dict()
# # store raw feats j.i.c (expensive to compute)
# topn2feats = dict()

# for top_n in [1,2,3,5,10]:
    
#     topn2rerank[top_n], topn2rerank_test[top_n] = load_rerank(top_n)
    
#     xs_rr = topn2rerank[top_n]
#     xs_temp = get_features_from_probabilities(xs_rr, name2crels, best_max_parses, 
#                                      causal_model_type=CAUSAL_MODEL_TYPE,
#                                      min_feat_freq=min_feat_freq, min_prob=best_min_prob)
    
#     cv_flds_rr = cross_validation(xs_temp, 5)
#     topn2feats[top_n] = cv_flds_rr
    
#     cv_flds_mm = [min_max_normalize_feats(train, test) for (train, test) in cv_flds_rr]
    
#     cv_filtered = []
#     for tr, test in cv_flds_mm:
#         x_tr,x_test = filter_feats(tr, test, current_best)
#         cv_filtered.append((x_tr,x_test))
    
#     topn2cvfolds_mm[top_n] = cv_filtered
    
# #     xs_tst_rr = topn2rerank_test[top_n]
# #     xs_tst = get_features_from_probabilities(xs_tst_rr, name2crels, best_max_parses, 
# #                                           causal_model_type=CAUSAL_MODEL_TYPE,
# #                                           min_feat_freq=min_feat_freq, min_prob=best_min_prob)
# #     # Store feats for later, just in case
# #     topn2feats[top_n] = (xs_temp, xs_tst)
    
# #     # build up training data for the test dataset using the CV held out folds from the training data
# #     # but before doing feat normalization
# #     xs_train_tmp = []
# #     for train, test in cv_flds_rr:
# #         xs_train_tmp.extend(test)

# #     xs_tr_mm, xs_tst_mm = min_max_normalize_feats(xs_train_tmp, xs_tst)
# #     topn2cvfolds_mm_test[top_n] = [(xs_tr_mm, xs_tst_mm)]

### Pickle dictionaries above - TOO BIG to store in GH (several gig)

### Initial Parameters

In [14]:
# Initial parameter values.
best_C = C = 0.0025   # earlier note: this needs to be a lot lower
pa_type, loss_type = 1, "ml"
max_update_items = 2  # earlier note: best_max_upd - 2
initial_weight = 0.01

In [15]:
MONGO_COLLECTION, best_iterations, initial_weight

('CB_RE-RANKER_HYPER_PARAM_TD', 3, 0.01)

# Hyper Parameter Optimization

In [16]:
import datetime
datetime.datetime.now()

datetime.datetime(2019, 6, 16, 23, 10, 8, 648614)

In [17]:
best_f1 = -1
# best_f1 = 0.7457

In [18]:
# Form a closure to simplify loop
def trn_mdl(top_n, prms, cv_filtrd):
    """Train and log one configuration, tracking the best score seen so far.

    Calls ``train_model_parallel_logged`` on ``cv_filtrd`` using the
    module-level ``best_C``, ``pa_type``, ``loss_type``, ``best_max_upd`` and
    ``initial_weight`` values, then prints a progress line with the score of
    THIS run. The module-level ``best_f1`` is updated when the run beats it.

    Args:
        top_n: only used in the printed progress line.
        prms: parameter dict forwarded to the trainer as ``params``.
        cv_filtrd: cross-validation folds forwarded as ``cv_folds``.

    Returns:
        The value returned by ``train_model_parallel_logged`` for this run.
    """
    # Only best_f1 is rebound here; the other settings are read-only globals.
    global best_f1

    f1 = train_model_parallel_logged(
        training_collection_name=MONGO_COLLECTION, results_processor=results_processor,
        feat_extractors=current_best, params=prms,

        cv_folds=cv_filtrd, name2essay=name2essay,
        C=best_C, pa_type=pa_type, loss_type=loss_type, max_update_items=best_max_upd, initial_weight=initial_weight,
        set_cr_tags=set_cr_tags,
        # use best iterations from above
        max_epochs=best_iterations, early_stop_iters=best_iterations
    )

    settings_str = str((top_n, best_C, pa_type, best_max_upd, initial_weight, loss_type))
    if f1 > best_f1:
        best_f1 = f1
        label = "New Best F1"
    else:
        label = "         F1"
    # Report this run's own score; the non-improving branch used to print best_f1 instead.
    print("{label}: {f1:.6f}:\t{params}".format(label=label, f1=f1, params=settings_str))
    return f1

In [19]:
# Tally how many points of this grid are already in param_hash and how many are not.
hash_count = 0
not_hashed = 0
for top_n in [1, 2, 3, 5]:  # [1,2,3,5] - for SC
    for best_C in [0.0005, 0.0025, 0.0100, 0.1]:
        for pa_type in [1, 2]:  # [0]
            for best_max_upd in [1, 2, 5]:
                for initial_weight in [0.01, 0.1, 1.0]:
                    for loss_type in ["pb", "ml"]:
                        p = dict(
                            C=best_C,
                            best_max_parses=best_max_parses,
                            best_min_prob=best_min_prob,
                            best_top_n=top_n,
                            extractors=list(current_best),
                            initial_weight=initial_weight,
                            loss_type=loss_type,
                            max_update_items=best_max_upd,
                            min_feat_freq=min_feat_freq,
                            pa_type=pa_type,
                        )
                        seen = hash_params(p) in param_hash
                        hash_count += seen
                        not_hashed += not seen

hash_count, not_hashed, hash_count + not_hashed

(203, 373, 576)

In [None]:
# Grid search: for each top_n, build normalized CV folds once, then train every
# configuration whose parameter hash is not already in param_hash.
for top_n in [3,2,1,5]: # [1,2,3,5] - for SC

    print("top_n", top_n)
    xs_rr, _ = load_rerank(top_n)
    xs_temp = get_features_from_probabilities(xs_rr, name2crels, best_max_parses,
                                     causal_model_type=CAUSAL_MODEL_TYPE,
                                     min_feat_freq=min_feat_freq, min_prob=best_min_prob)

    # Use the shared fold-count constant instead of a literal 5.
    cv_flds_rr = cross_validation(xs_temp, CV_FOLDS)
    cv_flds_mm = [min_max_normalize_feats(train, test) for (train, test) in cv_flds_rr]

    # The loop variables below are deliberately module-level: trn_mdl reads them as globals.
    for best_C in [0.0005, 0.0025, 0.0100, 0.1]:
        for pa_type in [1,2]: # [0]
            for best_max_upd in [1,2]:
                for initial_weight in [0.01, 0.1, 1.0]:
                    for loss_type in ["pb", "ml"]:

                        p = {'C': best_C,
                             'best_max_parses': best_max_parses,
                             'best_min_prob': best_min_prob,
                             'best_top_n': top_n,
                             'extractors': list(current_best),
                             'initial_weight': initial_weight,
                             'loss_type': loss_type,
                             'max_update_items': best_max_upd,
                             'min_feat_freq': min_feat_freq,
                             'pa_type': pa_type
                        }
                        hash_p = hash_params(p)
                        if hash_p in param_hash:
                            # already run in an earlier session (or earlier in this one)
                            print(".", end = '')
                            continue
                        print()
                        trn_mdl(top_n, p, cv_flds_mm)
                        # Remember the finished configuration so that re-running this cell
                        # after an interruption skips it without reloading param_hash.
                        param_hash.add(hash_p)

#                       (best_C, pa_type, best_max_upd, initial_weight)
# New Best F1: 0.742813:	(1, 0.0005, 2, 1, 0, 'ml')
# New Best F1: 0.743098:	(1, 0.0025, 1, 1, 0, 'ml')

top_n 3


### TODO 
- ~~re-run test on loss_type~~
- ~~Fix CREL_Pair filtering~~
- Try different top_n

In [37]:
# current best settings
best_C = 0.01
pa_type = 1
best_max_upd = 1
initial_weight = 0.01
loss_type = "ml"

## Apply to Test Data

### TODO (Test Data)
- Get optimal top_n and settings
- Extract feats
- Filter feats

In [39]:
# Pass the train/test features through filter_feats together with the current_best prefixes.
# NOTE(review): xs_train_mm and xs_test_mm are not assigned in any cell visible in this
# notebook, so this cell fails on a fresh kernel (Restart & Run All) - TODO restore the
# cell that builds them.
xs_train_mm_fltr, xs_test_mm_fltr = filter_feats(xs_train_mm, xs_test_mm, current_best)

In [40]:
# tuning dataset from training data: shuffled 80/20 split of the filtered training set
TUNING_SPLIT_SEED = 42
TUNING_TRAIN_FRACTION = 0.8

num_train = int(TUNING_TRAIN_FRACTION * len(xs_train_mm_fltr))
tmp_train_copy = list(xs_train_mm_fltr)  # shuffle a copy, leave the original order intact
# A dedicated seeded generator keeps the split (and the best_iterations derived from it)
# reproducible across kernel restarts without touching numpy's global random state.
np.random.RandomState(TUNING_SPLIT_SEED).shuffle(tmp_train_copy)
tmp_train, tmp_test = tmp_train_copy[:num_train], tmp_train_copy[num_train:]

## Determine the Optimal Number of Training Iterations

In [41]:
%%time
# Fresh model configured with the current best settings
mdl = CostSensitiveMIRA(
    C=best_C,
    pa_type=pa_type,
    loss_type=loss_type,
    max_update_items=best_max_upd,
    initial_weight=initial_weight,
)

# Determine number of training iterations on the tuning split
# (at most 20 epochs, early stopping enabled with early_stop_iters=3)
best_mdl, test_acc_df_ml, best_iterations = train_model(
    mdl,
    xs_train=tmp_train,
    xs_test=tmp_test,
    name2essay=name2essay,
    set_cr_tags=set_cr_tags,
    max_epochs=20,
    early_stop_iters=3,
    train_instance_fn=train_cost_sensitive_instance,
    verbose=True,
    early_stopping=True,
)

Epoch: 0 Train Accuracy: 0.7509 Test Accuracy: 0.7473
Epoch: 1 Train Accuracy: 0.7556 Test Accuracy: 0.7461
Epoch: 2 Train Accuracy: 0.7561 Test Accuracy: 0.7434
Epoch: 3 Train Accuracy: 0.7584 Test Accuracy: 0.7404
Best Test Acc: 0.7473
CPU times: user 41.6 s, sys: 290 ms, total: 41.9 s
Wall time: 42.5 s


In [42]:
best_iterations

1

## Run for X Iterations on the Test Dataset

In [43]:
# Fresh model configured with the current best settings
mdl = CostSensitiveMIRA(
    C=best_C,
    pa_type=pa_type,
    loss_type=loss_type,
    max_update_items=best_max_upd,
    initial_weight=initial_weight,
)

# Train on the full filtered training set for exactly best_iterations epochs
# (early stopping disabled) and evaluate on the filtered test set.
best_mdl, test_acc_df_ml, _ = train_model(
    mdl,
    xs_train=xs_train_mm_fltr,
    xs_test=xs_test_mm_fltr,
    name2essay=name2essay,
    set_cr_tags=set_cr_tags,
    max_epochs=best_iterations,
    early_stop_iters=best_iterations,
    train_instance_fn=train_cost_sensitive_instance,
    verbose=True,
    early_stopping=False,
)

# Best Test Acc: 0.7516
best_iterations

Epoch: 0 Train Accuracy: 0.7497 Test Accuracy: 0.7420
Best Test Acc: 0.7420


1

In [49]:
MONGO_TEST_COLLECTION = "TEST_CB_RE-RANKER_TD"

In [52]:
current_best

['CREL_', 'Prob-', 'CChainStats-']

In [50]:
# TODO - needs to include the top_n

# test_folds = [(xs_train_mm_fltr, xs_test_mm_fltr)]

# test_f1 = train_model_parallel_logged(
#         training_collection_name=MONGO_TEST_COLLECTION, results_processor=results_processor,
#         feat_extractors=current_best, params=params,

#         cv_folds=test_folds, 
        
#         name2essay=name2essay, 
#         C=best_C, pa_type=pa_type, loss_type=loss_type, max_update_items=best_max_upd, initial_weight=initial_weight,
#         set_cr_tags=set_cr_tags,
#         # use best iterations from above
#         max_epochs=best_iterations, early_stop_iters=best_iterations
#     )
# test_f1

0.7427536231884059

In [55]:
# Up to 500 (name, weight) pairs from the trained model, largest absolute weight first.
sorted(best_mdl.weights.items(), key=lambda name_weight: -abs(name_weight[1]))[:500]

[('Prob-prod-prob', 0.16598788326875036),
 ('CREL_Causer:1->Result:3-MAX(prob)', 0.13547793953948611),
 ('CREL_7:50', 0.13465626583889415),
 ('CREL_Causer:7->Result:50-MAX(prob)', 0.13001903006914273),
 ('CREL_Causer:1->Result:50-MIN(prob)', 0.12887278483874806),
 ('CREL_Causer:1->Result:3-MIN(prob)', 0.12554598936126396),
 ('Prob-min-prob', 0.12510261716299706),
 ('CREL_1:50', 0.11999999999999998),
 ('CREL_Causer:4->Result:14-pred-count=2', 0.11999999999999998),
 ('CREL_Causer:4->Result:5-pred-count=2', 0.11999999999999998),
 ('CREL_4:5', 0.11999999999999998),
 ('CREL_3:50', 0.11845998933164874),
 ('CREL_Causer:1->Result:50-MAX(prob)', 0.11528812307471469),
 ('Prob-5%-prob', 0.11420280368585191),
 ('CREL_4:14', 0.10999999999999999),
 ('CREL_6:7', 0.10999999999999999),
 ('CREL_Pair-Causer:1->Result:50|Causer:5->Result:50', 0.10999999999999999),
 ('CREL_Pair-Causer:6->Result:14|Causer:7->Result:50', 0.10999999999999999),
 ('CREL_Pair-Causer:1->Result:2|Causer:1->Result:50', 0.1099999999

## TODO

### Ideas
- ~~num-crels - add back in the logic to threshold these? But only if needed to improve results here (seemed to help essay parser)~~
- ~~Re-run with more realistic initial hyper params~~
- ~~Add in logic to store results to mongo~~
- ~~Switch back to using an initial_weight of 1~~
- The approach seems very sensitive to the initial configuration of the algorithm. However, it also seems correlated with the training-data performance on the first epoch. Idea: run the algorithm multiple times, take the model with the best training performance, and use that as the selected model to train further.
- We need to add hyper parameter tuning
- Do we want to just remove the BEAM search from this? It makes the explanation a lot more complex. But then again, it's the only way we can really add more crels that the model wouldn't otherwise parse
- Do we use the BEAM search with some de-duping? Although we already de-dupe to some extent anyways

### Needed to Finish
- Record run on test data - needs optimal hyper parameters first