In [1]:
# %load_ext autoreload
# %autoreload 2

In [4]:
import os
import sys
cm_folder = "/Users/simon.hughes/GitHub/NlpResearch/PythonNlpResearch/notebooks/Causal Model/"
models_folder = os.path.join(cm_folder, "BEAM Parser - Essay Level/models/")
src_path = os.path.join(cm_folder, "src")
sys.path.append(src_path)

In [6]:
from typing import Any

import dill
import numpy as np

from CrossValidation import cross_validation
from MIRA import CostSensitiveMIRA
from Settings import Settings

from window_based_tagger_config import get_config
from crel_helper import get_cr_tags
from crel_processing import essay_to_crels_cv
from evaluation import evaluate_model_essay_level, get_micro_metrics, metrics_to_df
from feature_normalization import min_max_normalize_feats
from function_helpers import get_function_names
from results_procesor import ResultsProcessor
from train_parser import essay_to_crels, create_extractor_functions
from cost_functions import micro_f1_cost_plusepsilon
from train_reranker import train_model_parallel_logged, train_model_parallel, train_model, train_cost_sensitive_instance
from searn_parser_breadth_first import SearnModelBreadthFirst
from causal_model_features import CausalModelType
from feature_extraction import get_features_essay_level
from results_procesor import ResultsProcessor

In [23]:
# Data Set Partition
CV_FOLDS = 5
MIN_FEAT_FREQ = 5

# Global settings
settings = Settings()
CAUSAL_MODEL_TYPE = CausalModelType.CORAL_BLEACHING
root_folder = settings.data_directory + "CoralBleaching/Thesis_Dataset/"
training_folder = root_folder + "Training" + "/"
test_folder = root_folder + "Test" + "/"

crels_folder = "./crels/CB"
coref_root = root_folder + "CoReference/"
coref_output_folder = coref_root + "CRel/"

MONGO_COLLECTION = "CB_ESSAY_PARSER_RE-RANKER_FEATURE_SEL_TD"
# first and second were with initial_weight set to 1.0
# thrid is with set to 0.001

config = get_config(training_folder)
results_processor = ResultsProcessor(dbname="metrics_causal_model")

train_fname = coref_output_folder + "training_crel_anatagged_essays_most_recent_code.dill"
with open(train_fname, "rb") as f:
    pred_tagged_essays_train = dill.load(f)

test_fname = coref_output_folder + "test_crel_anatagged_essays_most_recent_code.dill"
with open(test_fname, "rb") as f:
    pred_tagged_essays_test = dill.load(f)

print(len(pred_tagged_essays_train), len(pred_tagged_essays_test))

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/
902 226


In [8]:
cr_tags = get_cr_tags(train_tagged_essays=pred_tagged_essays_train, tag_essays_test=pred_tagged_essays_test)

set_cr_tags = set(cr_tags)
list(set_cr_tags)[0:10]

['Causer:5b->Result:7',
 'Causer:7->Result:50',
 'Causer:2->Result:4',
 'Causer:13->Result:7',
 'Causer:12->Result:7',
 'Causer:4->Result:11',
 'Causer:13->Result:11',
 'Causer:3->Result:7',
 'Causer:4->Result:7',
 'Causer:14->Result:7']

In [9]:
all_essays = pred_tagged_essays_train + pred_tagged_essays_test
name2essay = {}
for essay in all_essays:
    name2essay[essay.name] = essay

name2crels = essay_to_crels(all_essays, set_cr_tags)
assert len(name2crels) == len(all_essays)

# Train Re-Ranker

## Extract Parses from Essay Parser

In [10]:
best_top_n = 2
min_feat_freq = 1
best_max_upd = 2 
best_max_parses = 300
best_min_prob = 0.0  # min prob of 0 seems better

In [11]:
BEAM_SIZE = 100
best_C, best_max_upd = 0.01, 1

In [12]:
models_folder

'/Users/simon.hughes/GitHub/NlpResearch/PythonNlpResearch/notebooks/Causal Model/BEAM Parser - Essay Level/models/'

In [13]:
import dill

rr_fname = "xs_rerank_" + str(BEAM_SIZE) + ".dill"
with open(os.path.join(models_folder, rr_fname), "rb") as f:
    xs_rerank = dill.load(f)
    
rr_fname = "xs_rerank_test_" + str(BEAM_SIZE) + ".dill"
with open(os.path.join(models_folder, rr_fname), "rb") as f:
    xs_test_rerank = dill.load(f)

len(xs_rerank), len(xs_test_rerank)

(902, 226)

## Prepare Features

In [14]:
%%time
xs = get_features_essay_level(xs_rerank, name2crels, causal_model_type=CAUSAL_MODEL_TYPE, min_feat_freq=1)

cv_folds_rerank = cross_validation(xs, 5)
cv_folds_mm = [min_max_normalize_feats(train, test) for (train, test) in cv_folds_rerank]

CPU times: user 39.6 s, sys: 550 ms, total: 40.1 s
Wall time: 41 s


In [15]:
%%time
xs_test = get_features_essay_level(xs_test_rerank, name2crels, causal_model_type=CAUSAL_MODEL_TYPE, min_feat_freq=1)

CPU times: user 2.8 s, sys: 422 ms, total: 3.23 s
Wall time: 3.34 s


In [16]:
# Prepare test dataset 
  # training data comes from the test fold predictions from CV on the full training dataset
xs_train = []
for train, test in cv_folds_rerank:
    xs_train.extend(test)

# Normalize both using training data
xs_train_mm, xs_test_mm = min_max_normalize_feats(xs_train,xs_test)

### Initial Parameters

In [17]:
best_C = 0.0025       # 0.0025
C = best_C            # This needs to be a lot lower
pa_type = 1
loss_type= "ml"
max_update_items = 2  # best_max_upd - 2
initial_weight = 0.01  # was 0.01

In [13]:
# %%time
# f1 = train_model_parallel(
#     cv_folds=cv_folds_mm, name2essay=name2essay, C=best_C, pa_type=1, loss_type="ml", max_update_items=best_max_upd, 
#     set_cr_tags=set_cr_tags, initial_weight=initial_weight)
# print(f1)  # 0.7421167703055035

## Train on Test Data

### Tuning Dataset

In [18]:
num_train = int(0.8 * len(xs_train_mm))
tmp_train_copy = list(xs_train_mm)
np.random.shuffle(tmp_train_copy)
tmp_train, tmp_test = tmp_train_copy[:num_train], tmp_train_copy[num_train:]

### Determine Number of Training Iterations

In [19]:
%%time
# use training data to determine number of iterations
mdl = CostSensitiveMIRA(C=best_C, pa_type=pa_type, loss_type=loss_type, 
                        max_update_items=max_update_items, initial_weight=initial_weight)
# Determine number of training iterations
best_mdl, test_acc_df_ml, best_iterations = train_model(mdl, xs_train=tmp_train, xs_test=tmp_test, name2essay=name2essay, set_cr_tags=set_cr_tags,
     max_epochs=20, early_stop_iters=5, 
     train_instance_fn = train_cost_sensitive_instance, verbose=True, early_stopping=True)
print("Best iterations:", best_iterations)

Epoch: 0 Train Accuracy: 0.7349 Test Accuracy: 0.7484
Epoch: 1 Train Accuracy: 0.7391 Test Accuracy: 0.7482
Epoch: 2 Train Accuracy: 0.7403 Test Accuracy: 0.7490
Epoch: 3 Train Accuracy: 0.7409 Test Accuracy: 0.7518
Epoch: 4 Train Accuracy: 0.7420 Test Accuracy: 0.7510
Epoch: 5 Train Accuracy: 0.7428 Test Accuracy: 0.7510
Epoch: 6 Train Accuracy: 0.7429 Test Accuracy: 0.7537
Epoch: 7 Train Accuracy: 0.7436 Test Accuracy: 0.7550
Epoch: 8 Train Accuracy: 0.7445 Test Accuracy: 0.7537
Epoch: 9 Train Accuracy: 0.7445 Test Accuracy: 0.7537
Epoch: 10 Train Accuracy: 0.7447 Test Accuracy: 0.7510
Epoch: 11 Train Accuracy: 0.7444 Test Accuracy: 0.7510
Epoch: 12 Train Accuracy: 0.7443 Test Accuracy: 0.7510
Best Test Acc: 0.7550
Best iterations: 8
CPU times: user 2min 14s, sys: 1.13 s, total: 2min 15s
Wall time: 2min 17s


In [16]:
# %%time
# # Test on test data
# mdl = CostSensitiveMIRA(C=best_C, pa_type=pa_type, loss_type=loss_type, 
#                         max_update_items=max_update_items, initial_weight=initial_weight)

# best_mdl, test_acc_df_ml,_ = train_model(mdl,  xs_train=xs_train_mm, xs_test=xs_test_mm,
#                                        name2essay=name2essay, set_cr_tags=set_cr_tags,
#      max_epochs=best_iterations, early_stop_iters=best_iterations, train_instance_fn = train_cost_sensitive_instance, verbose=True)
# # Best Test Acc: 0.7406

# Feature Selection

In [20]:
from filter_features import filter_feats

prefixes = [
    "Prob-",
    "CREL_Pair-",
    "Inv-",
    "num_crels",
    "Tally-",
    "CChain-",
    "CChainStats-",
    "Above-",
    "CREL_",
    "Propn_",
    "Diff_"
]
# xs_fltr_train, xs_fltr_test = filter_feats(xs_train_mm, xs_test_mm, prefixes)
assert len(prefixes) == len(set(prefixes)), "Duplicate prefixes found"

In [21]:
best_f1 = -1
current_best = []
remaining = list(prefixes)

In [24]:
MONGO_COLLECTION, best_iterations, initial_weight

('CB_ESSAY_PARSER_RE-RANKER_FEATURE_SEL_TD', 8, 0.01)

In [25]:
%%time
params = {
    "best_top_n": best_top_n,
    "best_max_upd": best_max_upd,
    "best_max_parses": best_max_parses,
    "best_min_prob": best_min_prob,
    "min_feat_freq": min_feat_freq
}

print("Starting...")
while True:
    if len(remaining) == 0:
        break
    
    f1_by_prefix = dict()
    for prefix in remaining:
        new_prefixes = current_best + [prefix]
        
        cv_filtered = []
        for tr, test in cv_folds_mm:
            x_tr,x_test = filter_feats(tr, test, new_prefixes)
            cv_filtered.append((x_tr,x_test))
        
        f1_by_prefix[prefix] = train_model_parallel(
            #training_collection_name=MONGO_COLLECTION, results_processor=results_processor,
            #feat_extractors=new_prefixes, params=params,            
            cv_folds=cv_filtered, name2essay=name2essay, C=best_C, 
            pa_type=1, loss_type="ml", max_update_items=best_max_upd, 
            set_cr_tags=set_cr_tags, initial_weight=initial_weight,
            # use best iterations from above
            max_epochs=best_iterations, early_stop_iters=best_iterations,
            n_jobs=1
        )
        print("\t{length} feats F1: {f1:.4f} Prefixes: {prefixes}".format(
            length=len(new_prefixes), f1=f1_by_prefix[prefix], prefixes=str(new_prefixes)))
    
    best_prefix, new_best_f1 = sorted(f1_by_prefix.items(), key = lambda tpl: -tpl[1])[0]
    if new_best_f1 > best_f1:
        best_f1 = new_best_f1
        current_best.append(best_prefix)
        remaining.remove(best_prefix)
        print("{length} feats, new Best F1: {f1:.4f} Prefixes: {prefixes}".format(
            length=len(current_best), f1=best_f1, prefixes=str(current_best)))
    else:
        print("No further improvement, stopping")
        break
        
# 1 feats, new Best F1: 0.7389 Prefixes: ['CREL_']
# 2 feats, new Best F1: 0.7426 Prefixes: ['CREL_', 'Above-']

# with num_crels fixed:
# 1 feats, new Best F1: 0.7481 Prefixes: ['CREL_']
# 2 feats, new Best F1: 0.7482 Prefixes: ['CREL_', 'num_crels']

Starting...
	1 feats F1: 0.7122 Prefixes: ['Prob-']
	1 feats F1: 0.7255 Prefixes: ['CREL_Pair-']
	1 feats F1: 0.7413 Prefixes: ['Inv-']
	1 feats F1: 0.6812 Prefixes: ['num_crels']
	1 feats F1: 0.6820 Prefixes: ['Tally-']
	1 feats F1: 0.7257 Prefixes: ['CChain-']
	1 feats F1: 0.7074 Prefixes: ['CChainStats-']
	1 feats F1: 0.7334 Prefixes: ['Above-']
	1 feats F1: 0.7317 Prefixes: ['CREL_']
	1 feats F1: 0.7103 Prefixes: ['Propn_']
	1 feats F1: 0.6871 Prefixes: ['Diff_']
1 feats, new Best F1: 0.7413 Prefixes: ['Inv-']
	2 feats F1: 0.7228 Prefixes: ['Inv-', 'Prob-']
	2 feats F1: 0.7205 Prefixes: ['Inv-', 'CREL_Pair-']
	2 feats F1: 0.6821 Prefixes: ['Inv-', 'num_crels']
	2 feats F1: 0.6816 Prefixes: ['Inv-', 'Tally-']
	2 feats F1: 0.7239 Prefixes: ['Inv-', 'CChain-']
	2 feats F1: 0.7139 Prefixes: ['Inv-', 'CChainStats-']
	2 feats F1: 0.7344 Prefixes: ['Inv-', 'Above-']
	2 feats F1: 0.7300 Prefixes: ['Inv-', 'CREL_']
	2 feats F1: 0.7102 Prefixes: ['Inv-', 'Propn_']
	2 feats F1: 0.7206 Prefixe

In [26]:
current_best, best_f1 

(['Inv-'], 0.7412673891359148)

In [27]:
# # run it against the full set of features for comparison
# cv_filtered = []
# for tr, test in cv_folds_mm:
#     x_tr,x_test = filter_feats(tr, test, prefixes)
#     cv_filtered.append((x_tr,x_test))

# f1 = train_model_parallel(cv_folds=cv_filtered, name2essay=name2essay, C=best_C, 
#                           pa_type=1, loss_type="ml", max_update_items=best_max_upd, 
#                           set_cr_tags=set_cr_tags, initial_weight=initial_weight)
# f1

## Apply to Test Data

In [28]:
xs_train_mm_fltr, xs_test_mm_fltr = filter_feats(xs_train_mm, xs_test_mm, current_best) 

In [29]:
num_train = int(0.8 * len(xs_train_mm_fltr))
tmp_train_copy = list(xs_train_mm_fltr)
np.random.shuffle(tmp_train_copy)
tmp_train, tmp_test = tmp_train_copy[:num_train], tmp_train_copy[num_train:]

## Determine the Optimal Number of Training Iterations

In [30]:
# initial_weight = 0

In [31]:
%%time
mdl = CostSensitiveMIRA(C=best_C, pa_type=pa_type, loss_type=loss_type, 
                        max_update_items=max_update_items, initial_weight=initial_weight)
# Determine number of training iterations
best_mdl, test_acc_df_ml, best_iterations = train_model(mdl, xs_train=tmp_train, xs_test=tmp_test, name2essay=name2essay, set_cr_tags=set_cr_tags,
     max_epochs=20, early_stop_iters=3, train_instance_fn = train_cost_sensitive_instance,
                                                        verbose=True,  early_stopping=True)

Epoch: 0 Train Accuracy: 0.7454 Test Accuracy: 0.7226
Epoch: 1 Train Accuracy: 0.7454 Test Accuracy: 0.7226
Epoch: 2 Train Accuracy: 0.7454 Test Accuracy: 0.7226
Epoch: 3 Train Accuracy: 0.7454 Test Accuracy: 0.7226
Best Test Acc: 0.7226
CPU times: user 872 ms, sys: 42.4 ms, total: 914 ms
Wall time: 906 ms


In [32]:
best_iterations

1

## Run for X Iterations on the Test Dataset

In [33]:
mdl = CostSensitiveMIRA(C=best_C, pa_type=pa_type, loss_type=loss_type, 
                        max_update_items=max_update_items, initial_weight=initial_weight)

best_mdl, test_acc_df_ml,_ = train_model(mdl,  
    xs_train=xs_train_mm_fltr, xs_test=xs_test_mm_fltr,
    name2essay=name2essay, set_cr_tags=set_cr_tags,
    max_epochs=best_iterations, early_stop_iters=best_iterations,
    train_instance_fn = train_cost_sensitive_instance, verbose=True, early_stopping=False)

# Best Test Acc: 0.7364
best_iterations

Epoch: 0 Train Accuracy: 0.7411 Test Accuracy: 0.7364
Best Test Acc: 0.7364


1

In [34]:
initial_weight

0.01

In [35]:
# try initial weight of 0
mdl = CostSensitiveMIRA(C=best_C, pa_type=pa_type, loss_type=loss_type, 
                        max_update_items=max_update_items, initial_weight=0)

best_mdl, test_acc_df_ml,_ = train_model(mdl,  
    xs_train=xs_train_mm_fltr, xs_test=xs_test_mm_fltr,
    name2essay=name2essay, set_cr_tags=set_cr_tags,
    max_epochs=best_iterations, early_stop_iters=best_iterations,
    train_instance_fn = train_cost_sensitive_instance, verbose=True, early_stopping=False)

# Best Test Acc: 0.7364

Epoch: 0 Train Accuracy: 0.7411 Test Accuracy: 0.7364
Best Test Acc: 0.7364


In [36]:
# try initial weight of 0, more training iterations
ITERATIONS = 20
mdl = CostSensitiveMIRA(C=best_C, pa_type=pa_type, loss_type=loss_type, 
                        max_update_items=max_update_items, initial_weight=0)

best_mdl, test_acc_df_ml,_ = train_model(mdl,  
    xs_train=xs_train_mm_fltr, xs_test=xs_test_mm_fltr,
    name2essay=name2essay, set_cr_tags=set_cr_tags,
    max_epochs=ITERATIONS, early_stop_iters=ITERATIONS,
    train_instance_fn = train_cost_sensitive_instance, verbose=True, early_stopping=False)

# Epoch: 19 Train Accuracy: 0.7411 Test Accuracy: 0.7364
# Best Test Acc: 0.7364

Epoch: 0 Train Accuracy: 0.7411 Test Accuracy: 0.7364
Epoch: 1 Train Accuracy: 0.7411 Test Accuracy: 0.7364
Epoch: 2 Train Accuracy: 0.7411 Test Accuracy: 0.7364
Epoch: 3 Train Accuracy: 0.7411 Test Accuracy: 0.7364
Epoch: 4 Train Accuracy: 0.7411 Test Accuracy: 0.7364
Epoch: 5 Train Accuracy: 0.7411 Test Accuracy: 0.7364
Epoch: 6 Train Accuracy: 0.7411 Test Accuracy: 0.7364
Epoch: 7 Train Accuracy: 0.7411 Test Accuracy: 0.7364
Epoch: 8 Train Accuracy: 0.7411 Test Accuracy: 0.7364
Epoch: 9 Train Accuracy: 0.7411 Test Accuracy: 0.7364
Epoch: 10 Train Accuracy: 0.7411 Test Accuracy: 0.7364
Epoch: 11 Train Accuracy: 0.7411 Test Accuracy: 0.7364
Epoch: 12 Train Accuracy: 0.7411 Test Accuracy: 0.7364
Epoch: 13 Train Accuracy: 0.7411 Test Accuracy: 0.7364
Epoch: 14 Train Accuracy: 0.7411 Test Accuracy: 0.7364
Epoch: 15 Train Accuracy: 0.7411 Test Accuracy: 0.7364
Epoch: 16 Train Accuracy: 0.7411 Test Accuracy: 0.7364
Epoch: 17 Train Accuracy: 0.7411 Test Accuracy: 0.7364
Epoch: 18 Train Accu

In [37]:
# try an initial weight of 1
mdl = CostSensitiveMIRA(C=best_C, pa_type=pa_type, loss_type=loss_type, 
                        max_update_items=max_update_items, initial_weight=1)

best_mdl, test_acc_df_ml,_ = train_model(mdl,  
    xs_train=xs_train_mm_fltr, xs_test=xs_test_mm_fltr,
    name2essay=name2essay, set_cr_tags=set_cr_tags,
    max_epochs=best_iterations, early_stop_iters=5,
    train_instance_fn = train_cost_sensitive_instance, verbose=True, early_stopping=False)

# Epoch: 0 Train Accuracy: 0.7303 Test Accuracy: 0.7308
# Best Test Acc: 0.7308

Epoch: 0 Train Accuracy: 0.7303 Test Accuracy: 0.7308
Best Test Acc: 0.7308


In [38]:
# initial weight of 1, all feats
mdl = CostSensitiveMIRA(C=best_C, pa_type=pa_type, loss_type=loss_type, 
                        max_update_items=max_update_items, initial_weight=1)

best_mdl, test_acc_df_ml,_ = train_model(mdl,  
    xs_train=xs_train_mm, xs_test=xs_test_mm,
    name2essay=name2essay, set_cr_tags=set_cr_tags,
    max_epochs=best_iterations, early_stop_iters=5,
    train_instance_fn = train_cost_sensitive_instance, verbose=True, early_stopping=False)

# Epoch: 0 Train Accuracy: 0.6864 Test Accuracy: 0.6814
# Best Test Acc: 0.6814

Epoch: 0 Train Accuracy: 0.6864 Test Accuracy: 0.6814
Best Test Acc: 0.6814


In [39]:
test_acc_df_ml

0.6814024390243902

In [40]:
sorted(best_mdl.weights.items(), key = lambda tpl: -abs(tpl[1]))[0:50]

[('Inv-not_inverted', 2.264999999999973),
 ('Above-All-Above-0.3', 2.0449999999999777),
 ('Above-All-Above-0.2', 2.0424999999999778),
 ('Above-All-Above-0.5', 1.8549999999999818),
 ('Prob-geo-mean', 1.7235842806917),
 ('Prob-min-prob', 1.6958928931766082),
 ('Prob-5%-prob', 1.6892733888996858),
 ('Prob-10%-prob', 1.6801976472883926),
 ('Prob-25%-prob', 1.5555806086790627),
 ('num_crels<=5', 1.517499999999989),
 ('Prob-prod-prob', 1.5041516926477634),
 ('Above-All-Above-0.7', 1.4599999999999902),
 ('num_crels<=6', 1.4574999999999902),
 ('num_crels<=4', 1.4349999999999907),
 ('num_crels<=7', 1.372499999999992),
 ('CChainStats-num_distinct_chains <=1', 1.3574999999999924),
 ('num_crels=0', 1.3574999999999924),
 ('CChainStats-num_distinct_chains <=2', 1.332499999999993),
 ('num_crels<=8', 1.3049999999999935),
 ('num_crels<=3', 1.3049999999999935),
 ('CChainStats-MaxChain_Len=3', 1.2949999999999937),
 ('Above-All-Above-0.8', 1.277499999999994),
 ('Above-%-0.5', 1.2748797279109814),
 ('Above

## TODO

### Ideas
- ~~num-crels - add back in the logic to threshold these? But only if needed to improve results here (seemed to help essay parser)~~
- ~~Re-run with more realistic initial hyper params~~
- ~~Add in logic to store results to mongo~~
- ~~Switch back to using an initial_weight of 1~~
- Approach seems very sensitive to the initial configuration of the algorithm. However, it also seems correlated to the training data performance on the first epoch. Run the algorithm multiple times, take the model with the best training performance and use that as the final selected model to train futher.
- We need to add hyper parameter tuning
- Do we want to just remove the BEAM search from this? It makes the explanation a lot more complex. But then again, it's the only way we can really add more crels that the model wouldn't otherwise parse
- Do we use the BEAM search with some de-duping? Although we already de-dupe to some extent anyways

### Needed to Finish
- Record run on test data - needs optimal hyper parameters first