In [1]:
# %load_ext autoreload
# %autoreload 2

In [13]:
import os
import sys
cm_folder = "/Users/simon.hughes/GitHub/NlpResearch/PythonNlpResearch/notebooks/Causal Model/"
src_path = os.path.join(cm_folder, "src")
sys.path.append(src_path)

In [14]:
from typing import Any

import dill
import numpy as np

from CrossValidation import cross_validation
from MIRA import CostSensitiveMIRA
from Settings import Settings

from window_based_tagger_config import get_config
from crel_helper import get_cr_tags
from crel_processing import essay_to_crels_cv
from evaluation import evaluate_model_essay_level, get_micro_metrics, metrics_to_df
from feature_normalization import min_max_normalize_feats
from function_helpers import get_function_names
from results_procesor import ResultsProcessor
from train_parser import essay_to_crels, create_extractor_functions
from cost_functions import micro_f1_cost_plusepsilon
from train_reranker import train_model_parallel_logged, train_model_parallel, train_model, train_cost_sensitive_instance
from searn_parser_breadth_first import SearnModelBreadthFirst
from causal_model_features import CausalModelType
from feature_extraction import get_features_from_probabilities
from results_procesor import ResultsProcessor

In [3]:
# Data Set Partition
CV_FOLDS = 5
MIN_FEAT_FREQ = 5

# Global settings
settings = Settings()
CAUSAL_MODEL_TYPE = CausalModelType.SKIN_CANCER
root_folder = settings.data_directory + "SkinCancer/Thesis_Dataset/"
training_folder = root_folder + "Training" + "/"
test_folder = root_folder + "Test" + "/"

crels_folder = "./crels/SC"
coref_root = root_folder + "CoReference/"
coref_output_folder = coref_root + "CRel/"

MONGO_COLLECTION = "SC_RE-RANKER_FEATURE_SEL_TD"

config = get_config(training_folder)
results_processor = ResultsProcessor(dbname="metrics_causal_model")

train_fname = coref_output_folder + "training_crel_anatagged_essays_most_recent_code.dill"
with open(train_fname, "rb") as f:
    pred_tagged_essays_train = dill.load(f)

test_fname = coref_output_folder + "test_crel_anatagged_essays_most_recent_code.dill"
with open(test_fname, "rb") as f:
    pred_tagged_essays_test = dill.load(f)

print(len(pred_tagged_essays_train), len(pred_tagged_essays_test))

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/
870 218


In [4]:
cr_tags = get_cr_tags(train_tagged_essays=pred_tagged_essays_train, tag_essays_test=pred_tagged_essays_test)

set_cr_tags = set(cr_tags)
print(len(cr_tags))
list(set_cr_tags)[0:10]

52


['Causer:11->Result:3',
 'Causer:12->Result:2',
 'Causer:3->Result:50',
 'Causer:50->Result:2',
 'Causer:4->Result:11',
 'Causer:5->Result:4',
 'Causer:5->Result:6',
 'Causer:5->Result:5',
 'Causer:12->Result:5',
 'Causer:12->Result:12']

In [5]:
all_essays = pred_tagged_essays_train + pred_tagged_essays_test
name2essay = {}
for essay in all_essays:
    name2essay[essay.name] = essay

name2crels = essay_to_crels(all_essays, set_cr_tags)
assert len(name2crels) == len(all_essays)

# Train Re-Ranker

## Extract Parses from Sentence Parser

In [6]:
best_top_n = 2
min_feat_freq = 1
best_max_upd = 2 
best_max_parses = 300
best_min_prob = 0.0  # min prob of 0 seems better

In [7]:
rr_fname = "xs_rerank_" + str(best_top_n) + ".dill"
with open(os.path.join(crels_folder, rr_fname), "rb") as f:
    xs_rerank = dill.load(f)

rr_fname = "xs_rerank_test" + str(best_top_n) + ".dill"
with open(os.path.join(crels_folder, rr_fname), "rb") as f:
    xs_test_rerank = dill.load(f)
    
assert len(xs_rerank) == len(pred_tagged_essays_train),     "Wrong number of train crels"
assert len(xs_test_rerank) == len(pred_tagged_essays_test), "Wrong number of test crels"
len(xs_rerank), len(xs_test_rerank)

(870, 218)

## Prepare Features

In [8]:
%%time
xs = get_features_from_probabilities(xs_rerank, name2crels, best_max_parses, 
                                     causal_model_type=CAUSAL_MODEL_TYPE,
                                     min_feat_freq=min_feat_freq, min_prob=best_min_prob)

cv_folds_rerank = cross_validation(xs, 5)
cv_folds_mm = [min_max_normalize_feats(train, test) for (train, test) in cv_folds_rerank]

CPU times: user 2min 12s, sys: 4.12 s, total: 2min 16s
Wall time: 2min 18s


In [9]:
%%time
xs_test = get_features_from_probabilities(xs_test_rerank, name2crels, best_max_parses, 
                                          causal_model_type=CAUSAL_MODEL_TYPE,
                                          min_feat_freq=min_feat_freq, min_prob=best_min_prob)

CPU times: user 13.4 s, sys: 237 ms, total: 13.6 s
Wall time: 13.7 s


In [10]:
# Prepare test dataset 
  # training data comes from the test fold predictions from CV on the full training dataset
xs_train = []
for train, test in cv_folds_rerank:
    xs_train.extend(test)

# Normalize both using training data
xs_train_mm, xs_test_mm = min_max_normalize_feats(xs_train,xs_test)

### Initial Parameters

In [11]:
best_C = 0.0025       # 0.0025
C = best_C            # This needs to be a lot lower
pa_type = 1
loss_type= "ml"
max_update_items = 2  # best_max_upd - 2
initial_weight = 0.01  # was 0.01

In [12]:
%%time
f1 = train_model_parallel(
    cv_folds=cv_folds_mm, name2essay=name2essay, C=best_C, pa_type=1, loss_type="ml", max_update_items=best_max_upd, 
    set_cr_tags=set_cr_tags, initial_weight=initial_weight)
print(f1)
# 0.80599

0.8059914772313782
CPU times: user 3min 49s, sys: 47.7 s, total: 4min 37s
Wall time: 8min 58s


In [17]:
%%time
# Test on test data
mdl = CostSensitiveMIRA(C=best_C, pa_type=pa_type, loss_type=loss_type, 
                        max_update_items=max_update_items, initial_weight=initial_weight)

best_mdl, test_acc_df_ml,_ = train_model(mdl,  xs_train=xs_train_mm, xs_test=xs_test_mm,
                                       name2essay=name2essay, set_cr_tags=set_cr_tags,
     max_epochs=best_iterations, early_stop_iters=best_iterations, train_instance_fn = train_cost_sensitive_instance, v
                                         erbose=True)


Epoch: 0 Train Accuracy: 0.8061 Test Accuracy: 0.8280
Epoch: 1 Train Accuracy: 0.8061 Test Accuracy: 0.8280
Epoch: 2 Train Accuracy: 0.8069 Test Accuracy: 0.8260
Epoch: 3 Train Accuracy: 0.8074 Test Accuracy: 0.8269
Best Test Acc: 0.8280
CPU times: user 2min 39s, sys: 1.09 s, total: 2min 40s
Wall time: 2min 42s


## Train on Test Data

### Tuning Dataset

In [15]:
num_train = int(0.8 * len(xs_train_mm))
tmp_train_copy = list(xs_train_mm)
np.random.shuffle(tmp_train_copy)
tmp_train, tmp_test = tmp_train_copy[:num_train], tmp_train_copy[num_train:]

### Determine Number of Training Iterations

In [16]:
%%time
# use training data to determine number of iterations
mdl = CostSensitiveMIRA(C=best_C, pa_type=pa_type, loss_type=loss_type, 
                        max_update_items=max_update_items, initial_weight=initial_weight)
# Determine number of training iterations
best_mdl, test_acc_df_ml, best_iterations = train_model(mdl, xs_train=tmp_train, xs_test=tmp_test, name2essay=name2essay, set_cr_tags=set_cr_tags,
     max_epochs=20, early_stop_iters=5, 
     train_instance_fn = train_cost_sensitive_instance, verbose=True, early_stopping=True)
print("Best iterations:", best_iterations)

Epoch: 0 Train Accuracy: 0.8042 Test Accuracy: 0.8149
Epoch: 1 Train Accuracy: 0.8050 Test Accuracy: 0.8155
Epoch: 2 Train Accuracy: 0.8050 Test Accuracy: 0.8161
Epoch: 3 Train Accuracy: 0.8051 Test Accuracy: 0.8167
Epoch: 4 Train Accuracy: 0.8058 Test Accuracy: 0.8158
Epoch: 5 Train Accuracy: 0.8061 Test Accuracy: 0.8149
Epoch: 6 Train Accuracy: 0.8064 Test Accuracy: 0.8140
Epoch: 7 Train Accuracy: 0.8063 Test Accuracy: 0.8140
Epoch: 8 Train Accuracy: 0.8063 Test Accuracy: 0.8146
Best Test Acc: 0.8167
Best iterations: 4
CPU times: user 4min 33s, sys: 3.16 s, total: 4min 37s
Wall time: 1h 32min 14s


# Feature Selection

In [17]:
from filter_features import filter_feats

prefixes = [
    "Prob-",
    "CREL_Pair-",
    "Inv-",
    "num_crels",
    "Tally-",
    "CChain-",
    "CChainStats-",
    "Above-",
    "CREL_",
    "Propn_",
    "Diff_"
]
# xs_fltr_train, xs_fltr_test = filter_feats(xs_train_mm, xs_test_mm, prefixes)
assert len(prefixes) == len(set(prefixes)), "Duplicate prefixes found"

In [18]:
best_f1 = -1
current_best = []
remaining = list(prefixes)

In [19]:
MONGO_COLLECTION, best_iterations, initial_weight

('CB_RE-RANKER_FEATURE_SEL_TD_3', 3, 0.01)

In [20]:
%%time
params = {
    "best_top_n": best_top_n,
    "best_max_upd": best_max_upd,
    "best_max_parses": best_max_parses,
    "best_min_prob": best_min_prob,
    "min_feat_freq": min_feat_freq
}

print("Starting...")
while True:
    if len(remaining) == 0:
        break
    
    f1_by_prefix = dict()
    for prefix in remaining:
        new_prefixes = current_best + [prefix]
        
        cv_filtered = []
        for tr, test in cv_folds_mm:
            x_tr,x_test = filter_feats(tr, test, new_prefixes)
            cv_filtered.append((x_tr,x_test))
        
        f1_by_prefix[prefix] = train_model_parallel_logged(
            training_collection_name=MONGO_COLLECTION, results_processor=results_processor,
            feat_extractors=new_prefixes, params=params,
            
            cv_folds=cv_filtered, name2essay=name2essay, C=best_C, 
            pa_type=1, loss_type="ml", max_update_items=best_max_upd, 
            set_cr_tags=set_cr_tags, initial_weight=initial_weight,
            # use best iterations from above
            max_epochs=best_iterations, early_stop_iters=best_iterations
        )
        print("\t{length} feats F1: {f1:.4f} Prefixes: {prefixes}".format(
            length=len(new_prefixes), f1=f1_by_prefix[prefix], prefixes=str(new_prefixes)))
    
    best_prefix, new_best_f1 = sorted(f1_by_prefix.items(), key = lambda tpl: -tpl[1])[0]
    if new_best_f1 > best_f1:
        best_f1 = new_best_f1
        current_best.append(best_prefix)
        remaining.remove(best_prefix)
        print("{length} feats, new Best F1: {f1:.4f} Prefixes: {prefixes}".format(
            length=len(current_best), f1=best_f1, prefixes=str(current_best)))
    else:
        print("No further improvement, stopping")
        break


1 feats, new Best F1: 0.7430 Prefixes: ['CREL_']
2 feats, new Best F1: 0.7438 Prefixes: ['CREL_', 'Prob-']
3 feats, new Best F1: 0.7461 Prefixes: ['CREL_', 'Prob-', 'CChainStats-']
No further improvement, stopping
CPU times: user 40min 53s, sys: 4min 32s, total: 45min 26s
Wall time: 1h 13min 49s


In [21]:
current_best, best_f1 

(['CREL_', 'Prob-', 'CChainStats-'], 0.7460682777138474)

In [22]:
# # run it against the full set of features for comparison
# cv_filtered = []
# for tr, test in cv_folds_mm:
#     x_tr,x_test = filter_feats(tr, test, prefixes)
#     cv_filtered.append((x_tr,x_test))

# f1 = train_model_parallel(cv_folds=cv_filtered, name2essay=name2essay, C=best_C, 
#                           pa_type=1, loss_type="ml", max_update_items=best_max_upd, 
#                           set_cr_tags=set_cr_tags, initial_weight=initial_weight)
# f1 

## Apply to Test Data

In [23]:
xs_train_mm_fltr, xs_test_mm_fltr = filter_feats(xs_train_mm, xs_test_mm, current_best) 

In [24]:
num_train = int(0.8 * len(xs_train_mm_fltr))
tmp_train_copy = list(xs_train_mm_fltr)
np.random.shuffle(tmp_train_copy)
tmp_train, tmp_test = tmp_train_copy[:num_train], tmp_train_copy[num_train:]

## Determine the Optimal Number of Training Iterations

In [25]:
# initial_weight = 0

In [26]:
%%time
mdl = CostSensitiveMIRA(C=best_C, pa_type=pa_type, loss_type=loss_type, 
                        max_update_items=max_update_items, initial_weight=initial_weight)
# Determine number of training iterations
best_mdl, test_acc_df_ml, best_iterations = train_model(mdl, xs_train=tmp_train, xs_test=tmp_test, name2essay=name2essay, set_cr_tags=set_cr_tags,
     max_epochs=20, early_stop_iters=3, train_instance_fn = train_cost_sensitive_instance,
                                                        verbose=True,  early_stopping=True)

Epoch: 0 Train Accuracy: 0.7441 Test Accuracy: 0.7533
Epoch: 1 Train Accuracy: 0.7480 Test Accuracy: 0.7565
Epoch: 2 Train Accuracy: 0.7496 Test Accuracy: 0.7597
Epoch: 3 Train Accuracy: 0.7510 Test Accuracy: 0.7590
Epoch: 4 Train Accuracy: 0.7525 Test Accuracy: 0.7592
Epoch: 5 Train Accuracy: 0.7528 Test Accuracy: 0.7588
Best Test Acc: 0.7597
CPU times: user 1min 5s, sys: 86.5 ms, total: 1min 5s
Wall time: 1min 5s


In [27]:
best_iterations

3

## Run for X Iterations on the Test Dataset

In [28]:
mdl = CostSensitiveMIRA(C=best_C, pa_type=pa_type, loss_type=loss_type, 
                        max_update_items=max_update_items, initial_weight=initial_weight)

best_mdl, test_acc_df_ml,_ = train_model(mdl,  
    xs_train=xs_train_mm_fltr, xs_test=xs_test_mm_fltr,
    name2essay=name2essay, set_cr_tags=set_cr_tags,
    max_epochs=best_iterations, early_stop_iters=best_iterations,
    train_instance_fn = train_cost_sensitive_instance, verbose=True, early_stopping=False)

# Best Test Acc: 0.7516
best_iterations

Epoch: 0 Train Accuracy: 0.7464 Test Accuracy: 0.7406
Epoch: 1 Train Accuracy: 0.7505 Test Accuracy: 0.7418
Epoch: 2 Train Accuracy: 0.7531 Test Accuracy: 0.7420
Best Test Acc: 0.7420


3

In [29]:
initial_weight

0.01

In [30]:
# try initial weight of 0
mdl = CostSensitiveMIRA(C=best_C, pa_type=pa_type, loss_type=loss_type, 
                        max_update_items=max_update_items, initial_weight=0)

best_mdl, test_acc_df_ml,_ = train_model(mdl,  
    xs_train=xs_train_mm_fltr, xs_test=xs_test_mm_fltr,
    name2essay=name2essay, set_cr_tags=set_cr_tags,
    max_epochs=best_iterations, early_stop_iters=best_iterations,
    train_instance_fn = train_cost_sensitive_instance, verbose=True, early_stopping=False)

# Best Test Acc: 0.7516

# Best Test Acc: 0.7408

Epoch: 0 Train Accuracy: 0.7471 Test Accuracy: 0.7431
Epoch: 1 Train Accuracy: 0.7515 Test Accuracy: 0.7438
Epoch: 2 Train Accuracy: 0.7525 Test Accuracy: 0.7429
Best Test Acc: 0.7438


In [31]:
# try initial weight of 0, more training iterations
ITERATIONS = 20
mdl = CostSensitiveMIRA(C=best_C, pa_type=pa_type, loss_type=loss_type, 
                        max_update_items=max_update_items, initial_weight=0)

best_mdl, test_acc_df_ml,_ = train_model(mdl,  
    xs_train=xs_train_mm_fltr, xs_test=xs_test_mm_fltr,
    name2essay=name2essay, set_cr_tags=set_cr_tags,
    max_epochs=ITERATIONS, early_stop_iters=ITERATIONS,
    train_instance_fn = train_cost_sensitive_instance, verbose=True, early_stopping=False)

# Best Test Acc: 0.7516

# Best Test Acc: 0.7408

Epoch: 0 Train Accuracy: 0.7499 Test Accuracy: 0.7409
Epoch: 1 Train Accuracy: 0.7526 Test Accuracy: 0.7427
Epoch: 2 Train Accuracy: 0.7530 Test Accuracy: 0.7434
Epoch: 3 Train Accuracy: 0.7543 Test Accuracy: 0.7422
Epoch: 4 Train Accuracy: 0.7550 Test Accuracy: 0.7399
Epoch: 5 Train Accuracy: 0.7556 Test Accuracy: 0.7399
Epoch: 6 Train Accuracy: 0.7563 Test Accuracy: 0.7406
Epoch: 7 Train Accuracy: 0.7572 Test Accuracy: 0.7413
Epoch: 8 Train Accuracy: 0.7570 Test Accuracy: 0.7426
Epoch: 9 Train Accuracy: 0.7574 Test Accuracy: 0.7426
Epoch: 10 Train Accuracy: 0.7577 Test Accuracy: 0.7433
Epoch: 11 Train Accuracy: 0.7580 Test Accuracy: 0.7429
Epoch: 12 Train Accuracy: 0.7588 Test Accuracy: 0.7429
Epoch: 13 Train Accuracy: 0.7586 Test Accuracy: 0.7429
Epoch: 14 Train Accuracy: 0.7586 Test Accuracy: 0.7429
Epoch: 15 Train Accuracy: 0.7587 Test Accuracy: 0.7435
Epoch: 16 Train Accuracy: 0.7591 Test Accuracy: 0.7435
Epoch: 17 Train Accuracy: 0.7603 Test Accuracy: 0.7449
Epoch: 18 Train Accu

In [32]:
# try an initial weight of 1
mdl = CostSensitiveMIRA(C=best_C, pa_type=pa_type, loss_type=loss_type, 
                        max_update_items=max_update_items, initial_weight=1)

best_mdl, test_acc_df_ml,_ = train_model(mdl,  
    xs_train=xs_train_mm_fltr, xs_test=xs_test_mm_fltr,
    name2essay=name2essay, set_cr_tags=set_cr_tags,
    max_epochs=best_iterations, early_stop_iters=5,
    train_instance_fn = train_cost_sensitive_instance, verbose=True, early_stopping=False)

# Best Test Acc: 0.7516

# Best Test Acc: 0.7408

Epoch: 0 Train Accuracy: 0.7417 Test Accuracy: 0.7407
Epoch: 1 Train Accuracy: 0.7414 Test Accuracy: 0.7407
Epoch: 2 Train Accuracy: 0.7414 Test Accuracy: 0.7414
Best Test Acc: 0.7414


In [33]:
# initial weight of 1, all feats
mdl = CostSensitiveMIRA(C=best_C, pa_type=pa_type, loss_type=loss_type, 
                        max_update_items=max_update_items, initial_weight=1)

best_mdl, test_acc_df_ml,_ = train_model(mdl,  
    xs_train=xs_train_mm, xs_test=xs_test_mm,
    name2essay=name2essay, set_cr_tags=set_cr_tags,
    max_epochs=best_iterations, early_stop_iters=5,
    train_instance_fn = train_cost_sensitive_instance, verbose=True, early_stopping=False)

# Best Test Acc: 0.7516

# Best Test Acc: 0.7408

Epoch: 0 Train Accuracy: 0.7403 Test Accuracy: 0.7409
Epoch: 1 Train Accuracy: 0.7394 Test Accuracy: 0.7399
Epoch: 2 Train Accuracy: 0.7382 Test Accuracy: 0.7384
Best Test Acc: 0.7409


In [34]:
test_acc_df_ml

0.7383512544802866

In [35]:
sorted(best_mdl.weights.items(), key = lambda tpl: -abs(tpl[1]))[0:50]

[('num_crels<=4', 2.172499999999975),
 ('num_crels<=3', 2.1374999999999758),
 ('num_crels<=5', 1.9999999999999787),
 ('num_crels<=2', 1.844999999999982),
 ('num_crels<=6', 1.7274999999999845),
 ('CChainStats-num_distinct_chains <=1', 1.7099999999999849),
 ('Inv-not_inverted', 1.6224999999999867),
 ('CChainStats-MaxChain_Len=0', 1.590070667321122),
 ('CChainStats-num_distinct_chains=0', 1.590070667321122),
 ('CChainStats-num_distinct_chains <=0', 1.590070667321122),
 ('num_crels<=7', 1.5524999999999882),
 ('num_crels<=1', 1.5050706673211238),
 ('CChainStats-num_distinct_chains <=2', 1.47249999999999),
 ('num_crels=0', 1.4049999999999914),
 ('num_crels=2', 1.3399293326788582),
 ('Prob-prod-prob', 1.2996970063299125),
 ('num_crels=3', 1.2924999999999938),
 ('Above-All-Above-0.7', 1.279999999999994),
 ('CChainStats-num_distinct_chains <=3', 1.2674999999999943),
 ('Diff_adjacent_codes', 1.2378750000000023),
 ('num_crels<=8', 1.1949999999999958),
 ('Above-All-Above-0.9', 1.1275706673211319),

## TODO

### Ideas
- ~~num-crels - add back in the logic to threshold these? But only if needed to improve results here (seemed to help essay parser)~~
- ~~Re-run with more realistic initial hyper params~~
- ~~Add in logic to store results to mongo~~
- ~~Switch back to using an initial_weight of 1~~
- Approach seems very sensitive to the initial configuration of the algorithm. However, it also seems correlated to the training data performance on the first epoch. Run the algorithm multiple times, take the model with the best training performance and use that as the final selected model to train futher.
- We need to add hyper parameter tuning
- Do we want to just remove the BEAM search from this? It makes the explanation a lot more complex. But then again, it's the only way we can really add more crels that the model wouldn't otherwise parse
- Do we use the BEAM search with some de-duping? Although we already de-dupe to some extent anyways

### Needed to Finish
- Record run on test data - needs optimal hyper parameters first