In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
!pwd

/Users/simon.hughes/GitHub/NlpResearch/PythonNlpResearch/notebooks/Causal Model/BEAM Parser


In [3]:
import os
import sys
# Locate the project folders and make the project's source importable.
# NOTE(review): cm_folder is a machine-specific absolute path; the commented-out
# line below suggests it was once derived from the working directory — confirm
# before running this notebook on another machine.
# cwd = os.getcwd()
cm_folder = "/Users/simon.hughes/GitHub/NlpResearch/PythonNlpResearch/notebooks/Causal Model/"
# Folder from which the cached xs_rerank_*.dill files are loaded later on.
models_folder = os.path.join(cm_folder, "BEAM Parser/models/")
# Presumably where the project modules imported in the next cell live.
src_path = os.path.join(cm_folder, "src")
sys.path.append(src_path)

In [4]:
from typing import Any

import dill
from sklearn.linear_model import LogisticRegression
import numpy as np

from CrossValidation import cross_validation
from MIRA import CostSensitiveMIRA
from Settings import Settings
from crel_helper import get_cr_tags
from crel_processing import essay_to_crels_cv
from evaluation import evaluate_model_essay_level, get_micro_metrics, metrics_to_df
from feature_extraction import get_features_from_probabilities
from feature_normalization import min_max_normalize_feats
from function_helpers import get_function_names
from results_procesor import ResultsProcessor
from train_parser import essay_to_crels, create_extractor_functions
from cost_functions import micro_f1_cost_plusepsilon
from train_reranker import train_model_parallel, train_model, train_cost_sensitive_instance
from window_based_tagger_config import get_config
from feature_extraction import get_features_essay_level
from causal_model_features import CausalModelType

In [5]:
# --- Data set partitioning ---------------------------------------------------
CV_FOLDS = 5
MIN_FEAT_FREQ = 5

# --- Global settings and dataset locations -----------------------------------
settings = Settings()
CAUSAL_MODEL_TYPE = CausalModelType.CORAL_BLEACHING

root_folder = settings.data_directory + "CoralBleaching/Thesis_Dataset/"
training_folder = root_folder + "Training/"
test_folder = root_folder + "Test/"

coref_root = root_folder + "CoReference/"
coref_output_folder = coref_root + "CRel/"

config = get_config(training_folder)

# --- Load the tagged essays ---------------------------------------------------
# Both files are dill pickles; unpickling executes code, so only load files
# that were produced locally by this project.
train_fname = coref_output_folder + "training_crel_anatagged_essays_most_recent_code.dill"
test_fname = coref_output_folder + "test_crel_anatagged_essays_most_recent_code.dill"

with open(train_fname, "rb") as f:
    pred_tagged_essays_train = dill.load(f)
with open(test_fname, "rb") as f:
    pred_tagged_essays_test = dill.load(f)

# Sanity check: number of training and test essays.
print(len(pred_tagged_essays_train), len(pred_tagged_essays_test))

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/
902 226


In [6]:
# Gather the causal-relation tags from the train and test essays
# (labels look like 'Causer:3->Result:5' — see the displayed sample).
cr_tags = get_cr_tags(train_tagged_essays=pred_tagged_essays_train, tag_essays_test=pred_tagged_essays_test)

# The set form is what the later training helpers receive as `set_cr_tags`.
set_cr_tags = set(cr_tags)
# Peek at ten tags; set iteration order is arbitrary, so the sample may differ between runs.
list(set_cr_tags)[0:10]

['Causer:3->Result:5',
 'Causer:13->Result:5',
 'Causer:2->Result:7',
 'Causer:1->Result:2',
 'Causer:7->Result:5',
 'Causer:4->Result:11',
 'Causer:11->Result:14',
 'Causer:12->Result:50',
 'Causer:4->Result:5b',
 'Causer:1->Result:3']

In [7]:
from searn_essay_parser_breadth_first import SearnModelEssayParserBreadthFirst

In [8]:
# A single (train, test) pair made of the full training and test essay sets.
test_folds = [(pred_tagged_essays_train, pred_tagged_essays_test)]  # type: List[Tuple[Any,Any]]
# Presumably a CV_FOLDS-way split of the training essays into (train, test) pairs.
# NOTE(review): neither variable is referenced again in this notebook (later cells
# build their folds from `xs`), and List/Tuple are not imported — confirm this cell is still needed.
cv_folds = cross_validation(pred_tagged_essays_train, CV_FOLDS)  # type: List[Tuple[Any,Any]]

In [9]:
len(pred_tagged_essays_train)

902

### Get the Expected Crels Per Essay

In [10]:
# Index every essay (train + test) by its name for later look-ups.
all_essays = pred_tagged_essays_train + pred_tagged_essays_test
name2essay = {essay.name: essay for essay in all_essays}

# Expected causal relations keyed by essay name; there must be one entry per essay.
name2crels = essay_to_crels(all_essays, set_cr_tags)
assert len(all_essays) == len(name2crels)

## Train Re-Ranker

In [11]:
# Presumably the beam width of the parser run; it is embedded in the names of the
# cached xs_rerank_*.dill files loaded below.
BEAM_SIZE = 100

In [12]:
import dill

# Load the cached re-ranker inputs for the training essays at this beam size.
rr_fname = "xs_rerank_{}.dill".format(BEAM_SIZE)
with open(os.path.join(models_folder, rr_fname), mode="rb") as f:
    xs_rr = dill.load(f)
# Display how many entries were loaded.
len(xs_rr)

902

In [13]:
assert len(xs_rr) == len(pred_tagged_essays_train)

In [14]:
len(xs_rr), len(pred_tagged_essays_train)

(902, 902)

In [15]:
CAUSAL_MODEL_TYPE == CausalModelType.CORAL_BLEACHING

True

In [16]:
%%time
# Build the essay-level re-ranker features from the cached training parses.
# NOTE(review): min_feat_freq is 1 here rather than MIN_FEAT_FREQ — confirm that is intentional.
xs = get_features_essay_level(xs_rr, name2crels, causal_model_type=CAUSAL_MODEL_TYPE, min_feat_freq=1)

# Use the notebook-wide fold count instead of a second hard-coded 5, so changing
# CV_FOLDS in the settings cell cannot leave these folds out of sync.
cv_folds_rerank = cross_validation(xs, CV_FOLDS)
# Min-max normalise every (train, test) fold pair independently.
cv_folds_mm = [min_max_normalize_feats(train, test) for (train, test) in cv_folds_rerank]

CPU times: user 27.3 s, sys: 342 ms, total: 27.6 s
Wall time: 27.9 s


In [17]:
# Re-ranker hyper-parameters, passed to the training calls below as `C` and `max_update_items`.
best_C, best_max_upd = 0.01, 1

In [19]:
%%time
# Cross-validated score of the re-ranker on the normalised folds. The same call
# is repeated in the following cells to gauge run-to-run variation.
f1 = train_model_parallel(
    cv_folds=cv_folds_mm,
    name2essay=name2essay,
    C=best_C,
    pa_type=1,
    loss_type="ml",
    max_update_items=best_max_upd,
    set_cr_tags=set_cr_tags,
)
print(f1)  # value noted from a different run: 0.733426894314066

0.7316674196020332
CPU times: user 24.1 s, sys: 1.86 s, total: 26 s
Wall time: 2min 36s


In [20]:
%%time
# Repeat of the cross-validated re-ranker run with identical arguments.
f1 = train_model_parallel(
    cv_folds=cv_folds_mm,
    name2essay=name2essay,
    C=best_C,
    pa_type=1,
    loss_type="ml",
    max_update_items=best_max_upd,
    set_cr_tags=set_cr_tags,
)
print(f1)  # value noted from a different run: 0.7336580926726814

0.7316674196020332
CPU times: user 21.4 s, sys: 1.48 s, total: 22.9 s
Wall time: 2min 42s


In [21]:
%%time
# Another repeat of the cross-validated re-ranker run with identical arguments.
f1 = train_model_parallel(
    cv_folds=cv_folds_mm,
    name2essay=name2essay,
    C=best_C,
    pa_type=1,
    loss_type="ml",
    max_update_items=best_max_upd,
    set_cr_tags=set_cr_tags,
)
print(f1)  # value noted from a different run: 0.7338333044191401

0.7317485002668207
CPU times: user 22.7 s, sys: 1.61 s, total: 24.3 s
Wall time: 2min 46s


In [22]:
BEAM_SIZE

100

## Train on Training Data, Evaluate on Test Data

In [23]:
# Load the cached re-ranker inputs for the test essays at this beam size.
rr_fname = "xs_rerank_test_" + str(BEAM_SIZE) + ".dill"
# Open read-only ("rb"): the file is only deserialised here. The previous "rb+"
# mode needlessly required write permission and differed from the training-set load.
with open(os.path.join(models_folder, rr_fname), "rb") as f:
    xs_test_rerank = dill.load(f)
# Display how many entries were loaded.
len(xs_test_rerank)

226

In [25]:
# Same essay-level feature extraction (and arguments) as used for the training parses,
# applied to the cached test parses.
xs_test = get_features_essay_level(xs_test_rerank, name2crels, causal_model_type=CAUSAL_MODEL_TYPE, min_feat_freq=1)

In [27]:
# The training set is the concatenation, in fold order, of the test folds
# produced by cross-validating the full training dataset.
xs_train = [x for _, test_fold in cv_folds_rerank for x in test_fold]

In [30]:
# Normalize both using training data
# (same helper as applied per CV fold earlier; it returns a (train, test) pair).
xs_train_mm, xs_test_mm = min_max_normalize_feats(xs_train,xs_test)

In [31]:
# Hold out 20% of the normalised training instances; the next cell uses this
# split to decide how many training iterations to run.
# Shuffle with a fixed seed so the split — and therefore `best_iterations` —
# is reproducible after a kernel restart (the global RNG was previously unseeded).
HOLDOUT_SEED = 42
num_train = int(0.8 * len(xs_train_mm))
tmp_train_copy = list(xs_train_mm)  # shallow copy so xs_train_mm keeps its order
np.random.RandomState(HOLDOUT_SEED).shuffle(tmp_train_copy)
tmp_train, tmp_test = tmp_train_copy[:num_train], tmp_train_copy[num_train:]

In [32]:
%%time
# Hyper-parameters for the MIRA re-ranker; later training cells reuse these names.
C, pa_type, loss_type = best_C, 1, "ml"
max_update_items = best_max_upd

mdl = CostSensitiveMIRA(
    C=C,
    pa_type=pa_type,
    loss_type=loss_type,
    max_update_items=max_update_items,
    initial_weight=0.01,
)

# Determine the number of training iterations: train on the 80% split and
# monitor the held-out 20% (at most 20 epochs, early_stop_iters=5).
best_mdl, test_acc_df_ml, best_iterations = train_model(
    mdl,
    xs_train=tmp_train,
    xs_test=tmp_test,
    name2essay=name2essay,
    set_cr_tags=set_cr_tags,
    max_epochs=20,
    early_stop_iters=5,
    train_instance_fn=train_cost_sensitive_instance,
    verbose=True,
)

Epoch: 0 Train Accuracy: 0.7414 Test Accuracy: 0.7435
Epoch: 1 Train Accuracy: 0.7395 Test Accuracy: 0.7433
Epoch: 2 Train Accuracy: 0.7406 Test Accuracy: 0.7451
Epoch: 3 Train Accuracy: 0.7424 Test Accuracy: 0.7439
Epoch: 4 Train Accuracy: 0.7418 Test Accuracy: 0.7428
Epoch: 5 Train Accuracy: 0.7438 Test Accuracy: 0.7439
Epoch: 6 Train Accuracy: 0.7440 Test Accuracy: 0.7439
Epoch: 7 Train Accuracy: 0.7440 Test Accuracy: 0.7439
Best Test Acc: 0.7451
CPU times: user 50.8 s, sys: 257 ms, total: 51.1 s
Wall time: 51.2 s


In [33]:
best_iterations

3

In [34]:
# Fresh model, trained on the full (normalised) training set for the number of
# iterations found above, with accuracy reported on the test set.
mdl = CostSensitiveMIRA(
    C=C,
    pa_type=pa_type,
    loss_type=loss_type,
    max_update_items=max_update_items,
    initial_weight=0.01,
)

# early_stop_iters equals max_epochs, presumably so early stopping never triggers.
best_mdl, test_acc_df_ml, _ = train_model(
    mdl,
    xs_train=xs_train_mm,
    xs_test=xs_test_mm,
    name2essay=name2essay,
    set_cr_tags=set_cr_tags,
    max_epochs=best_iterations,
    early_stop_iters=best_iterations,
    train_instance_fn=train_cost_sensitive_instance,
    verbose=True,
)

Epoch: 0 Train Accuracy: 0.7425 Test Accuracy: 0.7306
Epoch: 1 Train Accuracy: 0.7441 Test Accuracy: 0.7322
Epoch: 2 Train Accuracy: 0.7437 Test Accuracy: 0.7301
Best Test Acc: 0.7322


In [35]:
from filter_features import filter_feats

# Feature-name prefixes, one per feature family. The greedy search in the next
# cell adds these families one at a time via filter_feats.
prefixes = [
    "Prob-",
    "CREL_Pair-",
    "Inv-",
    "num_crels",
    "Tally-",
    "CChain-",
    "CChainStats-",
    "Above-",
    # NOTE(review): "CREL_" is itself a prefix of "CREL_Pair-" above; if filter_feats
    # matches with startswith, this entry also selects the CREL_Pair- features — confirm.
    "CREL_",
    "Propn_",
    "Diff_"
]
# Disabled one-off filter of the train/test features with the full prefix list:
# xs_fltr_train, xs_fltr_test = filter_feats(xs_train_mm, xs_test_mm, prefixes)
# Guard against listing the same prefix twice.
assert len(prefixes) == len(set(prefixes)), "Duplicate prefixes found"

In [36]:
# Greedy forward selection over feature families. Each round tries adding every
# remaining prefix to the current best set, keeps the one with the highest CV F1,
# and stops once no addition strictly improves on the best F1 seen so far
# (or no prefixes remain).
best_f1 = -1
current_best = []
remaining = list(prefixes)

while remaining:
    f1_by_prefix = dict()
    for prefix in remaining:
        new_prefixes = current_best + [prefix]

        # Restrict every CV fold to the candidate feature families.
        cv_filtered = [
            tuple(filter_feats(fold_train, fold_test, new_prefixes))
            for fold_train, fold_test in cv_folds_mm
        ]

        f1_by_prefix[prefix] = train_model_parallel(
            cv_folds=cv_filtered,
            name2essay=name2essay,
            C=best_C,
            pa_type=1,
            loss_type="ml",
            max_update_items=best_max_upd,
            set_cr_tags=set_cr_tags,
        )

    # Highest-scoring candidate this round; ties go to the earliest prefix tried.
    best_prefix, new_best_f1 = max(f1_by_prefix.items(), key=lambda tpl: tpl[1])
    if not new_best_f1 > best_f1:
        print("No further improvement, stopping")
        break

    best_f1 = new_best_f1
    current_best.append(best_prefix)
    remaining.remove(best_prefix)
    print("{length} feats, new Best F1: {f1:.4f} Prefixes: {prefixes}".format(
        length=len(current_best), f1=best_f1, prefixes=str(current_best)))

1 feats, new Best F1: 0.7356 Prefixes: ['Above-']
2 feats, new Best F1: 0.7387 Prefixes: ['Above-', 'num_crels']
3 feats, new Best F1: 0.7397 Prefixes: ['Above-', 'num_crels', 'Diff_']
4 feats, new Best F1: 0.7402 Prefixes: ['Above-', 'num_crels', 'Diff_', 'CChainStats-']
5 feats, new Best F1: 0.7402 Prefixes: ['Above-', 'num_crels', 'Diff_', 'CChainStats-', 'Inv-']
No further improvement, stopping


In [37]:
%%time
# Baseline for comparison: score the CV folds with every family in `prefixes`
# (pass `current_best` instead to score only the greedily selected families).
cv_filtered = [
    tuple(filter_feats(fold_train, fold_test, prefixes))
    for fold_train, fold_test in cv_folds_mm
]

f1 = train_model_parallel(
    cv_folds=cv_filtered,
    name2essay=name2essay,
    C=best_C,
    pa_type=1,
    loss_type="ml",
    max_update_items=best_max_upd,
    set_cr_tags=set_cr_tags,
)
print(f1)

0.7399266076852923
CPU times: user 14.1 s, sys: 959 ms, total: 15 s
Wall time: 41.6 s


## Apply to Test Data

In [38]:
current_best

['Above-', 'num_crels', 'Diff_', 'CChainStats-', 'Inv-']

In [39]:
# Restrict the normalised train/test features to the families chosen by the greedy search.
xs_train_mm_fltr, xs_test_mm_fltr = filter_feats(xs_train_mm, xs_test_mm, current_best) 

In [40]:
# Use training data to determine the best number of training iterations:
# hold out 20% of the filtered training instances for the next cell.
# Shuffle with a fixed seed so the split — and therefore `best_iterations` —
# is reproducible after a kernel restart (the global RNG was previously unseeded).
HOLDOUT_SEED = 42
num_train = int(0.8 * len(xs_train_mm_fltr))
tmp_train_copy = list(xs_train_mm_fltr)  # shallow copy so xs_train_mm_fltr keeps its order
np.random.RandomState(HOLDOUT_SEED).shuffle(tmp_train_copy)
tmp_train, tmp_test = tmp_train_copy[:num_train], tmp_train_copy[num_train:]

In [41]:
%%time
# Hyper-parameters for the MIRA re-ranker (same values as in the unfiltered run).
C, pa_type, loss_type = best_C, 1, "ml"
max_update_items = best_max_upd

mdl = CostSensitiveMIRA(
    C=C,
    pa_type=pa_type,
    loss_type=loss_type,
    max_update_items=max_update_items,
    initial_weight=0.01,
)

# Determine the number of training iterations on the filtered features: train on
# the 80% split and monitor the held-out 20% (at most 20 epochs, early_stop_iters=3).
best_mdl, test_acc_df_ml, best_iterations = train_model(
    mdl,
    xs_train=tmp_train,
    xs_test=tmp_test,
    name2essay=name2essay,
    set_cr_tags=set_cr_tags,
    max_epochs=20,
    early_stop_iters=3,
    train_instance_fn=train_cost_sensitive_instance,
    verbose=True,
)

Epoch: 0 Train Accuracy: 0.7363 Test Accuracy: 0.7305
Epoch: 1 Train Accuracy: 0.7394 Test Accuracy: 0.7300
Epoch: 2 Train Accuracy: 0.7397 Test Accuracy: 0.7268
Epoch: 3 Train Accuracy: 0.7404 Test Accuracy: 0.7291
Best Test Acc: 0.7305
CPU times: user 3.95 s, sys: 31.9 ms, total: 3.99 s
Wall time: 3.98 s


In [42]:
best_iterations 

1

In [43]:
# Fresh model on the filtered features, trained on the full training set for the
# number of iterations found above, with accuracy reported on the test set.
mdl = CostSensitiveMIRA(
    C=C,
    pa_type=pa_type,
    loss_type=loss_type,
    max_update_items=max_update_items,
    initial_weight=0.01,
)

# early_stop_iters equals max_epochs, presumably so early stopping never triggers.
best_mdl, test_acc_df_ml, _ = train_model(
    mdl,
    xs_train=xs_train_mm_fltr,
    xs_test=xs_test_mm_fltr,
    name2essay=name2essay,
    set_cr_tags=set_cr_tags,
    max_epochs=best_iterations,
    early_stop_iters=best_iterations,
    train_instance_fn=train_cost_sensitive_instance,
    verbose=True,
)

Epoch: 0 Train Accuracy: 0.7329 Test Accuracy: 0.7322
Best Test Acc: 0.7322


In [44]:
# Same filtered-feature setup, but with a fixed budget of 30 epochs instead of
# `best_iterations`, to see how test accuracy evolves with longer training.
# NOTE(review): the log reports "Best Test Acc" on xs_test, which here is the real
# test set — confirm that `best_mdl` from this cell is not used for model selection.
mdl = CostSensitiveMIRA(
    C=C,
    pa_type=pa_type,
    loss_type=loss_type,
    max_update_items=max_update_items,
    initial_weight=0.01,
)

best_mdl, test_acc_df_ml, _ = train_model(
    mdl,
    xs_train=xs_train_mm_fltr,
    xs_test=xs_test_mm_fltr,
    name2essay=name2essay,
    set_cr_tags=set_cr_tags,
    max_epochs=30,
    early_stop_iters=30,
    train_instance_fn=train_cost_sensitive_instance,
    verbose=True,
)

Epoch: 0 Train Accuracy: 0.7328 Test Accuracy: 0.7382
Epoch: 1 Train Accuracy: 0.7371 Test Accuracy: 0.7356
Epoch: 2 Train Accuracy: 0.7379 Test Accuracy: 0.7349
Epoch: 3 Train Accuracy: 0.7382 Test Accuracy: 0.7327
Epoch: 4 Train Accuracy: 0.7382 Test Accuracy: 0.7298
Epoch: 5 Train Accuracy: 0.7385 Test Accuracy: 0.7292
Epoch: 6 Train Accuracy: 0.7398 Test Accuracy: 0.7285
Epoch: 7 Train Accuracy: 0.7401 Test Accuracy: 0.7279
Epoch: 8 Train Accuracy: 0.7407 Test Accuracy: 0.7273
Epoch: 9 Train Accuracy: 0.7405 Test Accuracy: 0.7273
Epoch: 10 Train Accuracy: 0.7403 Test Accuracy: 0.7273
Epoch: 11 Train Accuracy: 0.7401 Test Accuracy: 0.7266
Epoch: 12 Train Accuracy: 0.7403 Test Accuracy: 0.7266
Epoch: 13 Train Accuracy: 0.7398 Test Accuracy: 0.7277
Epoch: 14 Train Accuracy: 0.7397 Test Accuracy: 0.7277
Epoch: 15 Train Accuracy: 0.7399 Test Accuracy: 0.7277
Epoch: 16 Train Accuracy: 0.7401 Test Accuracy: 0.7277
Epoch: 17 Train Accuracy: 0.7401 Test Accuracy: 0.7277
Epoch: 18 Train Accu

In [45]:
# Same 30-epoch run without feature filtering: every normalised feature is used.
mdl = CostSensitiveMIRA(
    C=C,
    pa_type=pa_type,
    loss_type=loss_type,
    max_update_items=max_update_items,
    initial_weight=0.01,
)

best_mdl, test_acc_df_ml, _ = train_model(
    mdl,
    xs_train=xs_train_mm,
    xs_test=xs_test_mm,
    name2essay=name2essay,
    set_cr_tags=set_cr_tags,
    max_epochs=30,
    early_stop_iters=30,
    train_instance_fn=train_cost_sensitive_instance,
    verbose=True,
)

Epoch: 0 Train Accuracy: 0.7417 Test Accuracy: 0.7386
Epoch: 1 Train Accuracy: 0.7423 Test Accuracy: 0.7326
Epoch: 2 Train Accuracy: 0.7433 Test Accuracy: 0.7306
Epoch: 3 Train Accuracy: 0.7437 Test Accuracy: 0.7277
Epoch: 4 Train Accuracy: 0.7451 Test Accuracy: 0.7255
Epoch: 5 Train Accuracy: 0.7456 Test Accuracy: 0.7255
Epoch: 6 Train Accuracy: 0.7464 Test Accuracy: 0.7262
Epoch: 7 Train Accuracy: 0.7467 Test Accuracy: 0.7262
Epoch: 8 Train Accuracy: 0.7473 Test Accuracy: 0.7262
Epoch: 9 Train Accuracy: 0.7477 Test Accuracy: 0.7250
Epoch: 10 Train Accuracy: 0.7481 Test Accuracy: 0.7262
Epoch: 11 Train Accuracy: 0.7483 Test Accuracy: 0.7266
Epoch: 12 Train Accuracy: 0.7485 Test Accuracy: 0.7262
Epoch: 13 Train Accuracy: 0.7485 Test Accuracy: 0.7268
Epoch: 14 Train Accuracy: 0.7489 Test Accuracy: 0.7268
Epoch: 15 Train Accuracy: 0.7490 Test Accuracy: 0.7262
Epoch: 16 Train Accuracy: 0.7497 Test Accuracy: 0.7262
Epoch: 17 Train Accuracy: 0.7501 Test Accuracy: 0.7262
Epoch: 18 Train Accu

In [46]:
# Twenty largest feature weights of the most recently trained model, highest first.
sorted(best_mdl.weights.items(), key=lambda tpl: tpl[1], reverse=True)[:20]

[('num_crels=0', 0.3285077040959117),
 ('Inv-not_inverted', 0.29336942671062827),
 ('Above-%-0.9', 0.2924967375197842),
 ('Above-All-Above-0.7', 0.24747217742060995),
 ('Above-%-0.95', 0.240299419164543),
 ('Above-All-Above-0.5', 0.2274721774206099),
 ('CREL_Causer:7->Result:50-MIN(prob)', 0.2216978786685111),
 ('CREL_7:50', 0.20997943357559656),
 ('CREL_Causer:7->Result:50-MAX(prob)', 0.20557895248912006),
 ('Above-All-Above-0.9', 0.2016405596489272),
 ('Prob-min-prob', 0.18950247771053025),
 ('Prob-5%-prob', 0.18797979400407147),
 ('Above-%-0.8', 0.18602666699265427),
 ('Above-All-Above-0.8', 0.18517063817849072),
 ('Prob-10%-prob', 0.18363771392475062),
 ('Above-All-Above-0.3', 0.1823015392421192),
 ('Prob-prod-prob', 0.17677218277061454),
 ('Prob-geo-mean', 0.16311881496828176),
 ('Above-All-Above-0.2', 0.15716326185683582),
 ('CREL_Causer:6->Result:7-MAX(prob)', 0.15418552921399484)]

# Notes on Remaining Code Changes
- The Beam search approach outputs a list of Dict[str, List[float]], instead of just one Dict[str, List[float]]
- However, we don't need to sample from the crels, we will just use the already generated parses, after de-duping
- ParserInputs needs modifying so that it takes a list of crel2probs instead of one dict for all parses
- Need to figure out what the optimal parse is based on amount of overlap with the actual crels, minus the false positives

# TODO 
- include the cum prob from the parse action result as a feature? - or simply compute the geometric mean of the probs?
- To speed up MIRA, de-dupe the generated parses prior to feature extraction. Where there are dupes, take the one with the highest cum prob
