In [1]:
# %load_ext autoreload
# %autoreload 2

In [2]:
import os
import sys
cm_folder = "/Users/simon.hughes/GitHub/NlpResearch/PythonNlpResearch/notebooks/Causal Model/"
src_path = os.path.join(cm_folder, "src")
sys.path.append(src_path)

In [3]:
from typing import Any

import dill
import numpy as np

from CrossValidation import cross_validation
from MIRA import CostSensitiveMIRA
from Settings import Settings

from window_based_tagger_config import get_config
from crel_helper import get_cr_tags
from crel_processing import essay_to_crels_cv
from evaluation import evaluate_model_essay_level, get_micro_metrics, metrics_to_df
from feature_normalization import min_max_normalize_feats
from function_helpers import get_function_names
from results_procesor import ResultsProcessor
from train_parser import essay_to_crels, create_extractor_functions
from cost_functions import micro_f1_cost_plusepsilon
from train_reranker import train_model_parallel_logged, train_model_parallel, train_model, train_cost_sensitive_instance
from searn_parser_breadth_first import SearnModelBreadthFirst
from causal_model_features import CausalModelType
from feature_extraction import get_features_from_probabilities
from results_procesor import ResultsProcessor

In [4]:
# Data Set Partition
CV_FOLDS = 5
MIN_FEAT_FREQ = 5

# Global settings
settings = Settings()
CAUSAL_MODEL_TYPE = CausalModelType.SKIN_CANCER
root_folder = settings.data_directory + "SkinCancer/Thesis_Dataset/"
training_folder = root_folder + "Training" + "/"
test_folder = root_folder + "Test" + "/"

crels_folder = "./crels/SC"
coref_root = root_folder + "CoReference/"
coref_output_folder = coref_root + "CRel/"

MONGO_COLLECTION = "SC_RE-RANKER_FEATURE_SEL_TD"

config = get_config(training_folder)
results_processor = ResultsProcessor(dbname="metrics_causal_model_reranker")

train_fname = coref_output_folder + "training_crel_anatagged_essays_most_recent_code.dill"
with open(train_fname, "rb") as f:
    pred_tagged_essays_train = dill.load(f)

test_fname = coref_output_folder + "test_crel_anatagged_essays_most_recent_code.dill"
with open(test_fname, "rb") as f:
    pred_tagged_essays_test = dill.load(f)

print(len(pred_tagged_essays_train), len(pred_tagged_essays_test))

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/
870 218


In [5]:
cr_tags = get_cr_tags(train_tagged_essays=pred_tagged_essays_train, tag_essays_test=pred_tagged_essays_test)

set_cr_tags = set(cr_tags)
print(len(cr_tags))
list(set_cr_tags)[0:10]

52


['Causer:11->Result:12',
 'Causer:5->Result:50',
 'Causer:1->Result:5',
 'Causer:3->Result:2',
 'Causer:4->Result:50',
 'Causer:2->Result:50',
 'Causer:50->Result:5',
 'Causer:6->Result:3',
 'Causer:1->Result:2',
 'Causer:5->Result:6']

In [6]:
all_essays = pred_tagged_essays_train + pred_tagged_essays_test
name2essay = {}
for essay in all_essays:
    name2essay[essay.name] = essay

name2crels = essay_to_crels(all_essays, set_cr_tags)
assert len(name2crels) == len(all_essays)

# Train Re-Ranker

## Extract Parses from Sentence Parser

In [7]:
best_top_n = 2
min_feat_freq = 1
best_max_upd = 2 
best_max_parses = 300
best_min_prob = 0.0  # min prob of 0 seems better

In [8]:
rr_fname = "xs_rerank_" + str(best_top_n) + ".dill"
with open(os.path.join(crels_folder, rr_fname), "rb") as f:
    xs_rerank = dill.load(f)

rr_fname = "xs_rerank_test" + str(best_top_n) + ".dill"
with open(os.path.join(crels_folder, rr_fname), "rb") as f:
    xs_test_rerank = dill.load(f)
    
assert len(xs_rerank) == len(pred_tagged_essays_train),     "Wrong number of train crels"
assert len(xs_test_rerank) == len(pred_tagged_essays_test), "Wrong number of test crels"
len(xs_rerank), len(xs_test_rerank)

(870, 218)

## Prepare Features

In [9]:
%%time
xs = get_features_from_probabilities(xs_rerank, name2crels, best_max_parses, 
                                     causal_model_type=CAUSAL_MODEL_TYPE,
                                     min_feat_freq=min_feat_freq, min_prob=best_min_prob)

cv_folds_rerank = cross_validation(xs, 5)
cv_folds_mm = [min_max_normalize_feats(train, test) for (train, test) in cv_folds_rerank]

CPU times: user 1min 59s, sys: 2.55 s, total: 2min 1s
Wall time: 2min 1s


In [10]:
%%time
xs_test = get_features_from_probabilities(xs_test_rerank, name2crels, best_max_parses, 
                                          causal_model_type=CAUSAL_MODEL_TYPE,
                                          min_feat_freq=min_feat_freq, min_prob=best_min_prob)

CPU times: user 9.76 s, sys: 109 ms, total: 9.87 s
Wall time: 9.85 s


In [11]:
# Prepare test dataset 
  # training data comes from the test fold predictions from CV on the full training dataset
xs_train = []
for train, test in cv_folds_rerank:
    xs_train.extend(test)

# Normalize both using training data
xs_train_mm, xs_test_mm = min_max_normalize_feats(xs_train,xs_test)

### Initial Parameters

In [12]:
best_C = 0.0025       # 0.0025
C = best_C            # This needs to be a lot lower
pa_type = 1
loss_type= "ml"
max_update_items = 2  # best_max_upd - 2
initial_weight = 0.01  # was 0.01

In [13]:
%%time
f1 = train_model_parallel(
    cv_folds=cv_folds_mm, name2essay=name2essay, C=best_C, pa_type=1, loss_type="ml", max_update_items=best_max_upd, 
    set_cr_tags=set_cr_tags, initial_weight=initial_weight)
print(f1)
# 0.80599

0.8057593767132453
CPU times: user 3min 22s, sys: 46.4 s, total: 4min 8s
Wall time: 7min 44s


In [17]:
# %%time
# # Test on test data
# mdl = CostSensitiveMIRA(C=best_C, pa_type=pa_type, loss_type=loss_type, 
#                         max_update_items=max_update_items, initial_weight=initial_weight)

# best_mdl, test_acc_df_ml,_ = train_model(mdl,  xs_train=xs_train_mm, xs_test=xs_test_mm,
#                                        name2essay=name2essay, set_cr_tags=set_cr_tags,
#      max_epochs=best_iterations, early_stop_iters=best_iterations, train_instance_fn = train_cost_sensitive_instance,
#                                          verbose=True)


## Train on Test Data

### Tuning Dataset

In [18]:
num_train = int(0.8 * len(xs_train_mm))
tmp_train_copy = list(xs_train_mm)
np.random.shuffle(tmp_train_copy)
tmp_train, tmp_test = tmp_train_copy[:num_train], tmp_train_copy[num_train:]

### Determine Number of Training Iterations

In [19]:
%%time
# use training data to determine number of iterations
mdl = CostSensitiveMIRA(C=best_C, pa_type=pa_type, loss_type=loss_type, 
                        max_update_items=max_update_items, initial_weight=initial_weight)
# Determine number of training iterations
best_mdl, test_acc_df_ml, best_iterations = train_model(mdl, xs_train=tmp_train, xs_test=tmp_test, name2essay=name2essay, set_cr_tags=set_cr_tags,
     max_epochs=20, early_stop_iters=5, 
     train_instance_fn = train_cost_sensitive_instance, verbose=True, early_stopping=True)
print("Best iterations:", best_iterations)

Epoch: 0 Train Accuracy: 0.8058 Test Accuracy: 0.8084
Epoch: 1 Train Accuracy: 0.8060 Test Accuracy: 0.8076
Epoch: 2 Train Accuracy: 0.8066 Test Accuracy: 0.8076
Epoch: 3 Train Accuracy: 0.8070 Test Accuracy: 0.8076
Epoch: 4 Train Accuracy: 0.8074 Test Accuracy: 0.8081
Epoch: 5 Train Accuracy: 0.8074 Test Accuracy: 0.8078
Best Test Acc: 0.8084
Best iterations: 1
CPU times: user 2min 42s, sys: 2.66 s, total: 2min 45s
Wall time: 2min 46s


# Feature Selection

In [20]:
from filter_features import filter_feats

prefixes = [
    "Prob-",
#     "CREL_Pair-",
    "Inv-",
    "num_crels",
    "Tally-",
    "CChain-",
    "CChainStats-",
    "Above-",
    "CREL_",
    "Propn_",
    "Diff_"
]
# xs_fltr_train, xs_fltr_test = filter_feats(xs_train_mm, xs_test_mm, prefixes)
assert len(prefixes) == len(set(prefixes)), "Duplicate prefixes found"

In [21]:
best_f1 = -1
current_best = []
remaining = list(prefixes)

In [22]:
MONGO_COLLECTION, best_iterations, initial_weight

('SC_RE-RANKER_FEATURE_SEL_TD', 1, 0.01)

In [23]:
%%time
params = {
    "best_top_n": best_top_n,
    "best_max_upd": best_max_upd,
    "best_max_parses": best_max_parses,
    "best_min_prob": best_min_prob,
    "min_feat_freq": min_feat_freq
}

print("Starting...")
while True:
    if len(remaining) == 0:
        break
    
    f1_by_prefix = dict()
    for prefix in remaining:
        new_prefixes = current_best + [prefix]
        
        cv_filtered = []
        for tr, test in cv_folds_mm:
            x_tr,x_test = filter_feats(tr, test, new_prefixes)
            cv_filtered.append((x_tr,x_test))
        
        f1_by_prefix[prefix] = train_model_parallel_logged(
            training_collection_name=MONGO_COLLECTION, results_processor=results_processor,
            feat_extractors=new_prefixes, params=params,
            
            cv_folds=cv_filtered, name2essay=name2essay, C=best_C, 
            pa_type=1, loss_type="ml", max_update_items=best_max_upd, 
            set_cr_tags=set_cr_tags, initial_weight=initial_weight,
            # use best iterations from above
            max_epochs=best_iterations, early_stop_iters=best_iterations
        )
        print("\t{length} feats F1: {f1:.4f} Prefixes: {prefixes}".format(
            length=len(new_prefixes), f1=f1_by_prefix[prefix], prefixes=str(new_prefixes)))
    
    best_prefix, new_best_f1 = sorted(f1_by_prefix.items(), key = lambda tpl: -tpl[1])[0]
    if new_best_f1 > best_f1:
        best_f1 = new_best_f1
        current_best.append(best_prefix)
        remaining.remove(best_prefix)
        print("{length} feats, new Best F1: {f1:.4f} Prefixes: {prefixes}".format(
            length=len(current_best), f1=best_f1, prefixes=str(current_best)))
    else:
        print("No further improvement, stopping")
        break


Starting...
	1 feats F1: 0.7965 Prefixes: ['Prob-']
	1 feats F1: 0.0066 Prefixes: ['Inv-']
	1 feats F1: 0.8062 Prefixes: ['num_crels']
	1 feats F1: 0.8062 Prefixes: ['Tally-']
	1 feats F1: 0.6383 Prefixes: ['CChain-']
	1 feats F1: 0.7488 Prefixes: ['CChainStats-']
	1 feats F1: 0.8037 Prefixes: ['Above-']
	1 feats F1: 0.8064 Prefixes: ['CREL_']
	1 feats F1: 0.6637 Prefixes: ['Propn_']
	1 feats F1: 0.6454 Prefixes: ['Diff_']
1 feats, new Best F1: 0.8064 Prefixes: ['CREL_']
	2 feats F1: 0.8065 Prefixes: ['CREL_', 'Prob-']
	2 feats F1: 0.8061 Prefixes: ['CREL_', 'Inv-']
	2 feats F1: 0.8062 Prefixes: ['CREL_', 'num_crels']
	2 feats F1: 0.8062 Prefixes: ['CREL_', 'Tally-']
	2 feats F1: 0.8066 Prefixes: ['CREL_', 'CChain-']
	2 feats F1: 0.8059 Prefixes: ['CREL_', 'CChainStats-']
	2 feats F1: 0.8060 Prefixes: ['CREL_', 'Above-']
	2 feats F1: 0.8064 Prefixes: ['CREL_', 'Propn_']
	2 feats F1: 0.8063 Prefixes: ['CREL_', 'Diff_']
2 feats, new Best F1: 0.8066 Prefixes: ['CREL_', 'CChain-']
	3 feats

In [25]:
current_best, best_f1 

(['CREL_', 'CChain-', 'Prob-'], 0.8066838744735769)

In [None]:
# # run it against the full set of features for comparison
# cv_filtered = []
# for tr, test in cv_folds_mm:
#     x_tr,x_test = filter_feats(tr, test, prefixes)
#     cv_filtered.append((x_tr,x_test))

# f1 = train_model_parallel(cv_folds=cv_filtered, name2essay=name2essay, C=best_C, 
#                           pa_type=1, loss_type="ml", max_update_items=best_max_upd, 
#                           set_cr_tags=set_cr_tags, initial_weight=initial_weight)
# f1 

## Apply to Test Data

In [26]:
xs_train_mm_fltr, xs_test_mm_fltr = filter_feats(xs_train_mm, xs_test_mm, current_best) 

In [27]:
num_train = int(0.8 * len(xs_train_mm_fltr))
tmp_train_copy = list(xs_train_mm_fltr)
np.random.shuffle(tmp_train_copy)
tmp_train, tmp_test = tmp_train_copy[:num_train], tmp_train_copy[num_train:]

## Determine the Optimal Number of Training Iterations

In [28]:
# initial_weight = 0

In [29]:
%%time
mdl = CostSensitiveMIRA(C=best_C, pa_type=pa_type, loss_type=loss_type, 
                        max_update_items=max_update_items, initial_weight=initial_weight)
# Determine number of training iterations
best_mdl, test_acc_df_ml, best_iterations = train_model(mdl, xs_train=tmp_train, xs_test=tmp_test, name2essay=name2essay, set_cr_tags=set_cr_tags,
     max_epochs=20, early_stop_iters=3, train_instance_fn = train_cost_sensitive_instance,
                                                        verbose=True,  early_stopping=True)

Epoch: 0 Train Accuracy: 0.8095 Test Accuracy: 0.7965
Epoch: 1 Train Accuracy: 0.8095 Test Accuracy: 0.7962
Epoch: 2 Train Accuracy: 0.8108 Test Accuracy: 0.7962
Epoch: 3 Train Accuracy: 0.8108 Test Accuracy: 0.7949
Best Test Acc: 0.7965
CPU times: user 48.3 s, sys: 290 ms, total: 48.6 s
Wall time: 49 s


In [30]:
best_iterations

1

## Run for X Iterations on the Test Dataset

In [31]:
mdl = CostSensitiveMIRA(C=best_C, pa_type=pa_type, loss_type=loss_type, 
                        max_update_items=max_update_items, initial_weight=initial_weight)

best_mdl, test_acc_df_ml,_ = train_model(mdl,  
    xs_train=xs_train_mm_fltr, xs_test=xs_test_mm_fltr,
    name2essay=name2essay, set_cr_tags=set_cr_tags,
    max_epochs=best_iterations, early_stop_iters=best_iterations,
    train_instance_fn = train_cost_sensitive_instance, verbose=True, early_stopping=False)

best_iterations

Epoch: 0 Train Accuracy: 0.8067 Test Accuracy: 0.8285
Best Test Acc: 0.8285


1

In [32]:
initial_weight

0.01

In [33]:
# try initial weight of 0
mdl = CostSensitiveMIRA(C=best_C, pa_type=pa_type, loss_type=loss_type, 
                        max_update_items=max_update_items, initial_weight=0)

best_mdl, test_acc_df_ml,_ = train_model(mdl,  
    xs_train=xs_train_mm_fltr, xs_test=xs_test_mm_fltr,
    name2essay=name2essay, set_cr_tags=set_cr_tags,
    max_epochs=best_iterations, early_stop_iters=best_iterations,
    train_instance_fn = train_cost_sensitive_instance, verbose=True, early_stopping=False)

Epoch: 0 Train Accuracy: 0.8075 Test Accuracy: 0.8286
Best Test Acc: 0.8286


In [34]:
# try initial weight of 0, more training iterations
ITERATIONS = 20
mdl = CostSensitiveMIRA(C=best_C, pa_type=pa_type, loss_type=loss_type, 
                        max_update_items=max_update_items, initial_weight=0)

best_mdl, test_acc_df_ml,_ = train_model(mdl,  
    xs_train=xs_train_mm_fltr, xs_test=xs_test_mm_fltr,
    name2essay=name2essay, set_cr_tags=set_cr_tags,
    max_epochs=ITERATIONS, early_stop_iters=ITERATIONS,
    train_instance_fn = train_cost_sensitive_instance, verbose=True, early_stopping=False)

Epoch: 0 Train Accuracy: 0.8079 Test Accuracy: 0.8277
Epoch: 1 Train Accuracy: 0.8084 Test Accuracy: 0.8283
Epoch: 2 Train Accuracy: 0.8083 Test Accuracy: 0.8283
Epoch: 3 Train Accuracy: 0.8084 Test Accuracy: 0.8290
Epoch: 4 Train Accuracy: 0.8085 Test Accuracy: 0.8290
Epoch: 5 Train Accuracy: 0.8085 Test Accuracy: 0.8290
Epoch: 6 Train Accuracy: 0.8087 Test Accuracy: 0.8283
Epoch: 7 Train Accuracy: 0.8086 Test Accuracy: 0.8283
Epoch: 8 Train Accuracy: 0.8088 Test Accuracy: 0.8281
Epoch: 9 Train Accuracy: 0.8094 Test Accuracy: 0.8275
Epoch: 10 Train Accuracy: 0.8097 Test Accuracy: 0.8275
Epoch: 11 Train Accuracy: 0.8095 Test Accuracy: 0.8275
Epoch: 12 Train Accuracy: 0.8094 Test Accuracy: 0.8275
Epoch: 13 Train Accuracy: 0.8095 Test Accuracy: 0.8275
Epoch: 14 Train Accuracy: 0.8095 Test Accuracy: 0.8275
Epoch: 15 Train Accuracy: 0.8093 Test Accuracy: 0.8275
Epoch: 16 Train Accuracy: 0.8093 Test Accuracy: 0.8275
Epoch: 17 Train Accuracy: 0.8097 Test Accuracy: 0.8275
Epoch: 18 Train Accu

In [35]:
# try an initial weight of 1
mdl = CostSensitiveMIRA(C=best_C, pa_type=pa_type, loss_type=loss_type, 
                        max_update_items=max_update_items, initial_weight=1)

best_mdl, test_acc_df_ml,_ = train_model(mdl,  
    xs_train=xs_train_mm_fltr, xs_test=xs_test_mm_fltr,
    name2essay=name2essay, set_cr_tags=set_cr_tags,
    max_epochs=best_iterations, early_stop_iters=5,
    train_instance_fn = train_cost_sensitive_instance, verbose=True, early_stopping=False)

Epoch: 0 Train Accuracy: 0.8062 Test Accuracy: 0.8280
Best Test Acc: 0.8280


In [36]:
# initial weight of 1, all feats
mdl = CostSensitiveMIRA(C=best_C, pa_type=pa_type, loss_type=loss_type, 
                        max_update_items=max_update_items, initial_weight=1)

best_mdl, test_acc_df_ml,_ = train_model(mdl,  
    xs_train=xs_train_mm, xs_test=xs_test_mm,
    name2essay=name2essay, set_cr_tags=set_cr_tags,
    max_epochs=best_iterations, early_stop_iters=5,
    train_instance_fn = train_cost_sensitive_instance, verbose=True, early_stopping=False)

Epoch: 0 Train Accuracy: 0.8057 Test Accuracy: 0.8271
Best Test Acc: 0.8271


In [37]:
test_acc_df_ml

0.8271108558299828

In [38]:
sorted(best_mdl.weights.items(), key = lambda tpl: -abs(tpl[1]))[0:50]

[('num_crels<=5', 1.2549999999999946),
 ('CChainStats-num_distinct_chains <=1', 1.2499999999999947),
 ('num_crels<=6', 1.2024999999999957),
 ('num_crels<=2', 1.1949999999999958),
 ('num_crels<=3', 1.1949999999999958),
 ('num_crels<=4', 1.192499999999996),
 ('CChainStats-num_distinct_chains <=2', 1.192499999999996),
 ('CChainStats-MaxChain_Len=0', 1.1599999999999966),
 ('CChainStats-num_distinct_chains=0', 1.1599999999999966),
 ('CChainStats-num_distinct_chains <=0', 1.1599999999999966),
 ('Above-All-Above-0.7', 1.1349999999999971),
 ('num_crels<=7', 1.1324999999999972),
 ('Above-All-Above-0.8', 1.115134960819355),
 ('num_crels=2', 1.1073650391806402),
 ('Prob-prod-prob', 1.094044242703092),
 ('CChainStats-num_distinct_chains=1', 1.089999999999998),
 ('num_crels<=1', 1.0876349608193556),
 ('CChainStats-num_distinct_chains <=3', 1.0874999999999981),
 ('Above-All-Above-0.9', 1.0851349608193557),
 ('num_crels=0', 1.0724999999999985),
 ('num_crels=5', 1.0624999999999987),
 ('num_crels<=8', 

## TODO

### Ideas
- ~~num-crels - add back in the logic to threshold these? But only if needed to improve results here (seemed to help essay parser)~~
- ~~Re-run with more realistic initial hyper params~~
- ~~Add in logic to store results to mongo~~
- ~~Switch back to using an initial_weight of 1~~
- Approach seems very sensitive to the initial configuration of the algorithm. However, it also seems correlated to the training data performance on the first epoch. Run the algorithm multiple times, take the model with the best training performance and use that as the final selected model to train futher.
- We need to add hyper parameter tuning
- Do we want to just remove the BEAM search from this? It makes the explanation a lot more complex. But then again, it's the only way we can really add more crels that the model wouldn't otherwise parse
- Do we use the BEAM search with some de-duping? Although we already de-dupe to some extent anyways

### Needed to Finish
- Record run on test data - needs optimal hyper parameters first