In [2]:
# %load_ext autoreload
# %autoreload 2

In [1]:
import os
import sys
# Location of the Causal Model notebook folder. Defaults to the original author's
# checkout; set the CAUSAL_MODEL_DIR environment variable to point at a different
# checkout without editing this cell.
cm_folder = os.environ.get(
    "CAUSAL_MODEL_DIR",
    "/Users/simon.hughes/GitHub/NlpResearch/PythonNlpResearch/notebooks/Causal Model/",
)
src_path = os.path.join(cm_folder, "src")
# Guard the append so re-running this cell does not stack duplicate sys.path entries.
if src_path not in sys.path:
    sys.path.append(src_path)

In [28]:
from typing import Any

import dill
import numpy as np

from CrossValidation import cross_validation
from MIRA import CostSensitiveMIRA
from Settings import Settings

from window_based_tagger_config import get_config
from crel_helper import get_cr_tags
from crel_processing import essay_to_crels_cv
from evaluation import evaluate_model_essay_level, get_micro_metrics, metrics_to_df
from feature_normalization import min_max_normalize_feats
from function_helpers import get_function_names
from results_procesor import ResultsProcessor
from train_parser import essay_to_crels, create_extractor_functions
from cost_functions import micro_f1_cost_plusepsilon
from train_reranker import train_model_parallel_logged, train_model_parallel, train_model, train_cost_sensitive_instance
from searn_parser_breadth_first import SearnModelBreadthFirst
from causal_model_features import CausalModelType
from feature_extraction import get_features_from_probabilities
from results_procesor import ResultsProcessor

In [29]:
# Data Set Partition
# NOTE(review): CV_FOLDS and MIN_FEAT_FREQ are not referenced by the cells visible
# below (the CV split passes a literal 5 and feature extraction uses the lowercase
# min_feat_freq) - confirm whether they are still needed.
CV_FOLDS = 5
MIN_FEAT_FREQ = 5

# Global settings
settings = Settings()
CAUSAL_MODEL_TYPE = CausalModelType.CORAL_BLEACHING
# All dataset paths below are built by plain string concatenation, so they rely on
# settings.data_directory ending with a path separator.
root_folder = settings.data_directory + "CoralBleaching/Thesis_Dataset/"
training_folder = root_folder + "Training" + "/"
test_folder = root_folder + "Test" + "/"

# Folder (relative to the notebook's working directory) holding the re-ranker dill files
crels_folder = "./crels/CB"
coref_root = root_folder + "CoReference/"
coref_output_folder = coref_root + "CRel/"

# NOTE(review): MONGO_COLLECTION, config and results_processor are not used by any
# cell visible in this notebook - confirm they are still required.
MONGO_COLLECTION = "CB_RE_RANKER_FEATURE_SEL_TD"

config = get_config(training_folder)
results_processor = ResultsProcessor(dbname="metrics_causal_model")

# Load the tagged essays for the training partition.
# dill.load can execute arbitrary code from the file, so only load files you produced.
train_fname = coref_output_folder + "training_crel_anatagged_essays_most_recent_code.dill"
with open(train_fname, "rb") as f:
    pred_tagged_essays_train = dill.load(f)

# Load the tagged essays for the test partition.
test_fname = coref_output_folder + "test_crel_anatagged_essays_most_recent_code.dill"
with open(test_fname, "rb") as f:
    pred_tagged_essays_test = dill.load(f)

# Sanity check: number of essays in each partition
print(len(pred_tagged_essays_train), len(pred_tagged_essays_test))

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/
902 226


In [8]:
# Gather the causal-relation tags from both the training and the test essays.
cr_tags = get_cr_tags(
    train_tagged_essays=pred_tagged_essays_train,
    tag_essays_test=pred_tagged_essays_test,
)

# De-duplicate, then peek at a few of them (set iteration order is arbitrary).
set_cr_tags = set(cr_tags)
list(set_cr_tags)[:10]

['Causer:1->Result:2',
 'Causer:3->Result:14',
 'Causer:11->Result:3',
 'Causer:1->Result:6',
 'Causer:12->Result:5b',
 'Causer:4->Result:14',
 'Causer:5->Result:14',
 'Causer:3->Result:4',
 'Causer:1->Result:5',
 'Causer:12->Result:13']

In [9]:
# Pool both partitions so that any essay can be looked up by its name.
all_essays = pred_tagged_essays_train + pred_tagged_essays_test
name2essay = {tagged_essay.name: tagged_essay for tagged_essay in all_essays}

# Build the name -> crels lookup; one entry is expected per essay.
name2crels = essay_to_crels(all_essays, set_cr_tags)
assert len(name2crels) == len(all_essays)

# Train Re-Ranker

## Extract Parses from Sentence Parser

In [84]:
# Settings used to pick the cached parser output and to build re-ranker features.
best_top_n = 2
min_feat_freq = 1
best_max_upd = 2
best_max_parses = 300
best_min_prob = 0.0  # min prob of 0 seems better

In [85]:
# Load the pickled re-ranker input for the training essays.
rr_fname = "xs_rerank_{}.dill".format(best_top_n)
with open(os.path.join(crels_folder, rr_fname), "rb") as fin:
    xs_rerank = dill.load(fin)

# Same for the test essays (this file name has no underscore before the number).
rr_fname = "xs_rerank_test{}.dill".format(best_top_n)
with open(os.path.join(crels_folder, rr_fname), "rb") as fin:
    xs_test_rerank = dill.load(fin)

# One entry is expected per essay in each partition.
n_train, n_test = len(xs_rerank), len(xs_test_rerank)
assert n_train == len(pred_tagged_essays_train), "Wrong number of train crels"
assert n_test == len(pred_tagged_essays_test), "Wrong number of test crels"
n_train, n_test

(902, 226)

## Prepare Features

In [86]:
%%time
xs = get_features_from_probabilities(xs_rerank, name2crels, best_max_parses, 
                                     causal_model_type=CAUSAL_MODEL_TYPE,
                                     min_feat_freq=min_feat_freq, min_prob=best_min_prob)

cv_folds_rerank = cross_validation(xs, 5)
cv_folds_mm = [min_max_normalize_feats(train, test) for (train, test) in cv_folds_rerank]

CPU times: user 6min 38s, sys: 1min 39s, total: 8min 18s
Wall time: 9min 3s


In [87]:
%%time
xs_test = get_features_from_probabilities(xs_test_rerank, name2crels, best_max_parses, 
                                          causal_model_type=CAUSAL_MODEL_TYPE,
                                          min_feat_freq=min_feat_freq, min_prob=best_min_prob)

CPU times: user 21.7 s, sys: 1.96 s, total: 23.7 s
Wall time: 23.9 s


In [88]:
# Prepare the train/test datasets for the final evaluation.
# The training data is the concatenation of the held-out (test) part of every CV fold
# taken over the full training dataset.
xs_train = [instance for _, held_out in cv_folds_rerank for instance in held_out]

# Normalize both using training data
xs_train_mm, xs_test_mm = min_max_normalize_feats(xs_train, xs_test)

### Initial Parameters

In [91]:
# Hyper-parameters shared by the model-fitting cells below.
best_C = 0.0025                  # 0.0025
C = best_C                       # This needs to be a lot lower
pa_type, loss_type = 1, "ml"
max_update_items = 2             # best_max_upd - 2
initial_weight = 1.0             # was 0.01

In [None]:
%%time
f1 = train_model_parallel(
    cv_folds=cv_folds_mm, name2essay=name2essay, C=best_C, pa_type=1, loss_type="ml", max_update_items=best_max_upd, 
    set_cr_tags=set_cr_tags, initial_weight=initial_weight)
print(f1)  # 0.7421167703055035

## Train on Training Data, Evaluate on Test Data

### Tuning Dataset

In [31]:
# Hold out 20% of the training instances as a tuning set.
# A dedicated, seeded RNG makes the split (and anything tuned on it) reproducible
# across kernel restarts; the previous unseeded np.random.shuffle gave a new split each run.
TUNING_SPLIT_SEED = 42
num_train = int(0.8 * len(xs_train_mm))
tmp_train_copy = list(xs_train_mm)  # copy so the shuffle does not reorder xs_train_mm
np.random.RandomState(TUNING_SPLIT_SEED).shuffle(tmp_train_copy)
tmp_train, tmp_test = tmp_train_copy[:num_train], tmp_train_copy[num_train:]

### Determine Number of Training Iterations

In [32]:
# %%time
# # use training data to determine number of iterations
# mdl = CostSensitiveMIRA(C=best_C, pa_type=pa_type, loss_type=loss_type, 
#                         max_update_items=max_update_items, initial_weight=initial_weight)
# # Determine number of training iterations
# best_mdl, test_acc_df_ml, best_iterations = train_model(mdl, xs_train=tmp_train, xs_test=tmp_test, name2essay=name2essay, set_cr_tags=set_cr_tags,
#      max_epochs=20, early_stop_iters=5, 
#      train_instance_fn = train_cost_sensitive_instance, verbose=True, early_stopping=True)

In [34]:
# best_iterations, best_C, initial_weight

In [35]:
# %%time
# # Test on test data
# mdl = CostSensitiveMIRA(C=best_C, pa_type=pa_type, loss_type=loss_type, 
#                         max_update_items=max_update_items, initial_weight=initial_weight)

# best_mdl, test_acc_df_ml,_ = train_model(mdl,  xs_train=xs_train_mm, xs_test=xs_test_mm,
#                                        name2essay=name2essay, set_cr_tags=set_cr_tags,
#      max_epochs=best_iterations, early_stop_iters=best_iterations, train_instance_fn = train_cost_sensitive_instance, verbose=True)
# # Best Test Acc: 0.7406

# Feature Selection

In [36]:
from filter_features import filter_feats

# Candidate feature groups for selection, each identified by a feature-name prefix.
# NOTE(review): "CREL_" is a leading substring of "CREL_Pair-"; if filter_feats matches
# by prefix, selecting "CREL_" also keeps the "CREL_Pair-" features, so these two groups
# are not independent - confirm this is intended.
prefixes = [
    "Prob-",
    "CREL_Pair-",
    "Inv-",
    "num_crels",
    "Tally-",
    "CChain-",
    "CChainStats-",
    "Above-",
    "CREL_",
    "Propn_",
    "Diff_"
]
# xs_fltr_train, xs_fltr_test = filter_feats(xs_train_mm, xs_test_mm, prefixes)
# Guard against listing the same prefix twice (exact duplicates only).
assert len(prefixes) == len(set(prefixes)), "Duplicate prefixes found"

In [37]:
# Starting state for the selection loop in the next cell: nothing chosen yet, every
# prefix still a candidate, and a score that any real result will beat.
remaining = prefixes[:]
current_best = []
best_f1 = -1

In [38]:
%%time
while True:
    if len(remaining) == 0:
        break
    
    f1_by_prefix = dict()
    for prefix in remaining:
        new_prefixes = current_best + [prefix]
        
        cv_filtered = []
        for tr, test in cv_folds_mm:
            x_tr,x_test = filter_feats(tr, test, new_prefixes)
            cv_filtered.append((x_tr,x_test))
        
        f1_by_prefix[prefix] = train_model_parallel(cv_folds=cv_filtered, name2essay=name2essay, C=best_C, 
                                  pa_type=1, loss_type="ml", max_update_items=best_max_upd, 
                                  set_cr_tags=set_cr_tags, initial_weight=initial_weight)
    
    best_prefix, new_best_f1 = sorted(f1_by_prefix.items(), key = lambda tpl: -tpl[1])[0]
    if new_best_f1 > best_f1:
        best_f1 = new_best_f1
        current_best.append(best_prefix)
        remaining.remove(best_prefix)
        print("{length} feats, new Best F1: {f1:.4f} Prefixes: {prefixes}".format(
            length=len(current_best), f1=best_f1, prefixes=str(current_best)))
    else:
        print("No further improvement, stopping")
        break
        
# 1 feats, new Best F1: 0.7389 Prefixes: ['CREL_']
# 2 feats, new Best F1: 0.7426 Prefixes: ['CREL_', 'Above-']

# with num_crels fixed:
# 1 feats, new Best F1: 0.7481 Prefixes: ['CREL_']
# 2 feats, new Best F1: 0.7482 Prefixes: ['CREL_', 'num_crels']

1 feats, new Best F1: 0.7430 Prefixes: ['CREL_']
2 feats, new Best F1: 0.7431 Prefixes: ['CREL_', 'Prob-']
3 feats, new Best F1: 0.7432 Prefixes: ['CREL_', 'Prob-', 'Inv-']
No further improvement, stopping


In [39]:
# Show the prefixes chosen by the selection loop and the best score it reached.
current_best, best_f1

(['CREL_', 'Prob-', 'Inv-'], 0.7432279026710764)

In [40]:
# # run it against the full set of features for comparison
# cv_filtered = []
# for tr, test in cv_folds_mm:
#     x_tr,x_test = filter_feats(tr, test, prefixes)
#     cv_filtered.append((x_tr,x_test))

# f1 = train_model_parallel(cv_folds=cv_filtered, name2essay=name2essay, C=best_C, 
#                           pa_type=1, loss_type="ml", max_update_items=best_max_upd, 
#                           set_cr_tags=set_cr_tags, initial_weight=initial_weight)
# f1

## Apply to Test Data

In [41]:
# Keep only the selected feature groups in the normalised train and test sets.
xs_train_mm_fltr, xs_test_mm_fltr = filter_feats(
    xs_train_mm,
    xs_test_mm,
    current_best,
)

In [42]:
# Hold out 20% of the filtered training instances as a tuning set.
# A dedicated, seeded RNG makes the split (and the iteration count tuned on it)
# reproducible across kernel restarts; the previous unseeded np.random.shuffle
# gave a new split each run.
TUNING_SPLIT_SEED = 42
num_train = int(0.8 * len(xs_train_mm_fltr))
tmp_train_copy = list(xs_train_mm_fltr)  # copy so the shuffle does not reorder the source list
np.random.RandomState(TUNING_SPLIT_SEED).shuffle(tmp_train_copy)
tmp_train, tmp_test = tmp_train_copy[:num_train], tmp_train_copy[num_train:]

## Determine the Optimal Number of Training Iterations

In [60]:
# initial_weight = 0

In [61]:
%%time
mdl = CostSensitiveMIRA(C=best_C, pa_type=pa_type, loss_type=loss_type, 
                        max_update_items=max_update_items, initial_weight=initial_weight)
# Determine number of training iterations
best_mdl, test_acc_df_ml, best_iterations = train_model(mdl, xs_train=tmp_train, xs_test=tmp_test, name2essay=name2essay, set_cr_tags=set_cr_tags,
     max_epochs=20, early_stop_iters=3, train_instance_fn = train_cost_sensitive_instance,
                                                        verbose=True,  early_stopping=True)

Epoch: 0 Train Accuracy: 0.7567 Test Accuracy: 0.7183
Epoch: 1 Train Accuracy: 0.7572 Test Accuracy: 0.7187
Epoch: 2 Train Accuracy: 0.7578 Test Accuracy: 0.7181
Epoch: 3 Train Accuracy: 0.7587 Test Accuracy: 0.7181
Epoch: 4 Train Accuracy: 0.7589 Test Accuracy: 0.7181
Best Test Acc: 0.7187
CPU times: user 1min, sys: 390 ms, total: 1min
Wall time: 1min 1s


In [62]:
# Show the iteration count returned by train_model on the tuning split.
best_iterations

2

In [63]:
# Fit on the filtered training set for the tuned number of epochs (early stopping
# disabled) and evaluate against the filtered test set.
mdl = CostSensitiveMIRA(
    C=best_C,
    pa_type=pa_type,
    loss_type=loss_type,
    max_update_items=max_update_items,
    initial_weight=initial_weight,
)

best_mdl, test_acc_df_ml, _ = train_model(
    mdl,
    xs_train=xs_train_mm_fltr,
    xs_test=xs_test_mm_fltr,
    name2essay=name2essay,
    set_cr_tags=set_cr_tags,
    max_epochs=best_iterations,
    early_stop_iters=best_iterations,
    train_instance_fn=train_cost_sensitive_instance,
    verbose=True,
    early_stopping=False,
)

# Best Test Acc: 0.7516
best_iterations

Epoch: 0 Train Accuracy: 0.7450 Test Accuracy: 0.7370
Epoch: 1 Train Accuracy: 0.7498 Test Accuracy: 0.7378
Best Test Acc: 0.7378


2

In [68]:
# NOTE(review): the recorded output below is 0 although the "Initial Parameters" cell
# sets initial_weight = 1.0 - the value appears to have been changed out of band
# (a commented-out `initial_weight = 0` cell sits further up); confirm which value
# the preceding runs actually used.
initial_weight

0

In [69]:
# Re-train with initial_weight=0 for a fixed 5 epochs. C is still best_C here, so
# despite the original "try a smaller C" note this cell does not vary C.
mdl = CostSensitiveMIRA(C=best_C, pa_type=pa_type, loss_type=loss_type, 
                        max_update_items=max_update_items, initial_weight=0)

best_mdl, test_acc_df_ml,_ = train_model(mdl,  
    xs_train=xs_train_mm_fltr, xs_test=xs_test_mm_fltr,
    name2essay=name2essay, set_cr_tags=set_cr_tags,
    max_epochs=5, early_stop_iters=5,
    train_instance_fn = train_cost_sensitive_instance, verbose=True, early_stopping=False)

# Accuracies noted from earlier runs (the latest recorded output may differ):
# Best Test Acc: 0.7516

# Best Test Acc: 0.7408

Epoch: 0 Train Accuracy: 0.7470 Test Accuracy: 0.7412
Epoch: 1 Train Accuracy: 0.7522 Test Accuracy: 0.7415
Epoch: 2 Train Accuracy: 0.7541 Test Accuracy: 0.7403
Epoch: 3 Train Accuracy: 0.7547 Test Accuracy: 0.7426
Epoch: 4 Train Accuracy: 0.7559 Test Accuracy: 0.7426
Best Test Acc: 0.7426


In [65]:
# try a larger initial weight (1), still on the filtered feature set
mdl = CostSensitiveMIRA(
    C=best_C,
    pa_type=pa_type,
    loss_type=loss_type,
    max_update_items=max_update_items,
    initial_weight=1,
)

best_mdl, test_acc_df_ml, _ = train_model(
    mdl,
    xs_train=xs_train_mm_fltr,
    xs_test=xs_test_mm_fltr,
    name2essay=name2essay,
    set_cr_tags=set_cr_tags,
    max_epochs=best_iterations,
    early_stop_iters=5,
    train_instance_fn=train_cost_sensitive_instance,
    verbose=True,
    early_stopping=False,
)

# Best Test Acc: 0.7516

# Best Test Acc: 0.7408

Epoch: 0 Train Accuracy: 0.7417 Test Accuracy: 0.7407
Epoch: 1 Train Accuracy: 0.7417 Test Accuracy: 0.7407
Best Test Acc: 0.7407


In [67]:
# initial weight of 1, all feats (unfiltered normalised train/test sets)
mdl = CostSensitiveMIRA(
    C=best_C,
    pa_type=pa_type,
    loss_type=loss_type,
    max_update_items=max_update_items,
    initial_weight=1,
)

best_mdl, test_acc_df_ml, _ = train_model(
    mdl,
    xs_train=xs_train_mm,
    xs_test=xs_test_mm,
    name2essay=name2essay,
    set_cr_tags=set_cr_tags,
    max_epochs=best_iterations,
    early_stop_iters=5,
    train_instance_fn=train_cost_sensitive_instance,
    verbose=True,
    early_stopping=False,
)

# Best Test Acc: 0.7516

# Best Test Acc: 0.7408

Epoch: 0 Train Accuracy: 0.7405 Test Accuracy: 0.7409
Epoch: 1 Train Accuracy: 0.7394 Test Accuracy: 0.7399
Best Test Acc: 0.7409


In [73]:
# Show the second value returned by the most recent train_model call.
# NOTE(review): the recorded value (0.7426...) matches the "Best Test Acc" printed by the
# initial_weight=0 run rather than by the cell directly above - the cells were
# presumably executed out of order; confirm which model this refers to.
test_acc_df_ml

(0.7426470588235294, 0)

In [71]:
# The 50 features with the largest absolute weight in the most recently trained model.
sorted(best_mdl.weights.items(), key=lambda name_weight: abs(name_weight[1]), reverse=True)[:50]

[('CREL_Causer:7->Result:50-MAX(prob)', 0.243912351364519),
 ('CREL_Pair-Causer:14->Result:50|Causer:6->Result:14', 0.23750000000000016),
 ('CREL_7:50', 0.22750000000000015),
 ('Inv-not_inverted', 0.22342509836073043),
 ('Inv-inverted', -0.22342509836073043),
 ('Prob-prod-prob', 0.20948981176216386),
 ('CREL_Causer:7->Result:50-MIN(prob)', 0.20565634400312116),
 ('CREL_Pair-Causer:6->Result:14|Causer:7->Result:50', 0.20500000000000013),
 ('CREL_Pair-Causer:1->Result:50|Causer:5->Result:50', 0.20250000000000012),
 ('CREL_3:50', 0.19800873456074375),
 ('CREL_Causer:1->Result:50-MIN(prob)', 0.19223590489813785),
 ('CREL_Causer:1->Result:50-MAX(prob)', 0.18130067370288777),
 ('CREL_Causer:1->Result:3-MAX(prob)', 0.18020632085293448),
 ('CREL_6:7', 0.1775000000000001),
 ('CREL_Causer:3->Result:4-MAX(prob)', 0.17621265697741062),
 ('CREL_Causer:6->Result:7-MAX(prob)', 0.1707054788910274),
 ('CREL_Causer:1->Result:3-MIN(prob)', 0.16853399258635784),
 ('CREL_1:50', 0.16798942509661055),
 ('CRE

## TODO
- ~~num-crels - add back in the logic to threshold these? But only if needed to improve results here (seemed to help essay parser)~~
- ~~Re-run with more realistic initial hyper params~~
- ~~Add in logic to store results to mongo~~
- ~~Switch back to using an initial_weight of 1~~
- We need to add hyperparameter tuning.
- Do we want to just remove the BEAM search from this? It makes the explanation a lot more complex. But then again, it's the only way we can really add more crels that the model wouldn't otherwise parse.
- Do we use the BEAM search with some de-duping? Although we already de-dupe to some extent anyway.