In [1]:
!pwd

/Users/simon.hughes/GitHub/NlpResearch/PythonNlpResearch/notebooks/Causal Model


In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import os
import sys
cwd = os.getcwd()
src_path = os.path.join(cwd, "src")
sys.path.append(src_path)

In [4]:
from typing import Any

import dill
from sklearn.linear_model import LogisticRegression
import numpy as np

from CrossValidation import cross_validation
from MIRA import CostSensitiveMIRA
from Settings import Settings

from crel_helper import get_cr_tags
from crel_processing import essay_to_crels_cv
from evaluation import evaluate_model_essay_level, get_micro_metrics, metrics_to_df
from feature_extraction import get_features_from_probabilities
from feature_normalization import min_max_normalize_feats
from function_helpers import get_function_names
from results_procesor import ResultsProcessor
from train_parser import essay_to_crels, create_extractor_functions
from cost_functions import micro_f1_cost_plusepsilon
from train_reranker import train_model_parallel, train_model, train_cost_sensitive_instance
from window_based_tagger_config import get_config

In [5]:
# Data Set Partition
CV_FOLDS = 5
MIN_FEAT_FREQ = 5

# Global settings
settings = Settings()
root_folder = settings.data_directory + "CoralBleaching/Thesis_Dataset/"
training_folder = root_folder + "Training" + "/"
test_folder = root_folder + "Test" + "/"

coref_root = root_folder + "CoReference/"
coref_output_folder = coref_root + "CRel/"

config = get_config(training_folder)

train_fname = coref_output_folder + "training_crel_anatagged_essays_most_recent_code.dill"
with open(train_fname, "rb") as f:
    pred_tagged_essays_train = dill.load(f)

test_fname = coref_output_folder + "test_crel_anatagged_essays_most_recent_code.dill"
with open(test_fname, "rb") as f:
    pred_tagged_essays_test = dill.load(f)

print(len(pred_tagged_essays_train), len(pred_tagged_essays_test))

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/
902 226


In [6]:
cr_tags = get_cr_tags(train_tagged_essays=pred_tagged_essays_train, tag_essays_test=pred_tagged_essays_test)

set_cr_tags = set(cr_tags)
list(set_cr_tags)[0:10]

['Causer:6->Result:5b',
 'Causer:13->Result:5',
 'Causer:13->Result:7',
 'Causer:50->Result:1',
 'Causer:5b->Result:7',
 'Causer:1->Result:3',
 'Causer:7->Result:1',
 'Causer:5->Result:4',
 'Causer:6->Result:7',
 'Causer:11->Result:6']

In [7]:
base_extractors, all_extractor_fns, all_cost_functions = create_extractor_functions()

all_extractor_fn_names = get_function_names(all_extractor_fns)
base_extractor_fn_names = get_function_names(base_extractors)
all_cost_fn_names = get_function_names(all_cost_functions)

ngrams = 1
stemmed = True
cost_function_name = micro_f1_cost_plusepsilon.__name__
dual = True
fit_intercept = True
beta = 0.5
max_epochs = 2
C = 0.5
penalty = "l2"

In [8]:
from searn_essay_parser_breadth_first import SearnModelEssayParserBreadthFirst

In [9]:
test_folds = [(pred_tagged_essays_train, pred_tagged_essays_test)]  # type: List[Tuple[Any,Any]]
cv_folds = cross_validation(pred_tagged_essays_train, CV_FOLDS)  # type: List[Tuple[Any,Any]]

In [98]:
len(pred_tagged_essays_train)

902

In [10]:
BASE_LEARNER_FACT = lambda: LogisticRegression(dual=dual, C=C, penalty=penalty, fit_intercept=fit_intercept)
best_extractor_names = ['single_words', 'between_word_features', 'label_set',
                        'three_words', 'third_order', 'unigrams']  # type: List[str]


In [11]:
%%time
result_test_essay_level = evaluate_model_essay_level(
    folds=cv_folds,
    extractor_fn_names_lst=best_extractor_names,
    all_extractor_fns=all_extractor_fns,
    ngrams=ngrams,
    beta=beta,
    stemmed=stemmed,
    max_epochs=max_epochs,
    min_feat_freq=MIN_FEAT_FREQ, 
    cr_tags=set_cr_tags,
    base_learner_fact=BASE_LEARNER_FACT, 
    down_sample_rate=1.0, model = SearnModelEssayParserBreadthFirst)

CPU times: user 6min 4s, sys: 5.86 s, total: 6min 10s
Wall time: 6min 12s


## TODO - Generate the Top Parses

## Training Accuracy

In [13]:
models, cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag, cv_td_preds_by_sent, cv_sent_vd_ys_by_tag = result_test_essay_level

mean_metrics = ResultsProcessor.compute_mean_metrics(cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag)
get_micro_metrics(metrics_to_df(mean_metrics))

Unnamed: 0,accuracy,f1_score,recall,precision
95,0.986072,0.783054,0.753011,0.815594


## CV Accuracy

In [14]:
models, cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag, cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag = result_test_essay_level

mean_metrics = ResultsProcessor.compute_mean_metrics(cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag)
get_micro_metrics(metrics_to_df(mean_metrics))

Unnamed: 0,accuracy,f1_score,recall,precision
95,0.982883,0.735056,0.711314,0.760437


## Test Accuracy

In [15]:
%%time
result_final_test = evaluate_model_essay_level(
    folds=test_folds,
    extractor_fn_names_lst=best_extractor_names,
    all_extractor_fns=all_extractor_fns,
    ngrams=ngrams,
    beta=beta,
    stemmed=stemmed,
    max_epochs=max_epochs,
    min_feat_freq=MIN_FEAT_FREQ,
    cr_tags=set_cr_tags,
    base_learner_fact=BASE_LEARNER_FACT, 
    down_sample_rate=1.0, model = SearnModelEssayParserBreadthFirst)

models_test, cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag, cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag = result_final_test

mean_metrics = ResultsProcessor.compute_mean_metrics(cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag)
test_metrics_df = get_micro_metrics(metrics_to_df(mean_metrics))

CPU times: user 1min 29s, sys: 1.27 s, total: 1min 31s
Wall time: 1min 31s


In [16]:
test_metrics_df

Unnamed: 0,accuracy,f1_score,recall,precision
95,0.985413,0.7307,0.744059,0.717813


In [17]:
final_test_model = models_test

### Get the Expected Crels Per Essay

In [104]:
all_essays = pred_tagged_essays_train + pred_tagged_essays_test
name2essay = {}
for essay in all_essays:
    name2essay[essay.name] = essay

name2crels = essay_to_crels(all_essays, set_cr_tags)
assert len(name2crels) == len(all_essays)

### Init Parameter Settings

In [105]:
# initial settings for other params
# min prob of 0 seems better
# best_max_upd - how many of the top ranked items cause an update?
# best_top_n = BEAM size for BEAM parser
# C - refers to MIRA C param


# BEST
# best_top_n, best_C, best_max_upd, best_max_parses, best_min_prob = (2, 0.0025, 2, 300, 0.0)  

# Sensible Defaults - So we can 
best_top_n, best_C, best_max_upd, best_max_parses, best_min_prob = (3, 0.01, 1, 250, 0.0)  

## Train Re-Ranker

In [106]:
from collections import defaultdict
from parse_generator import collapse_sent_parse

# ESSAY Parser
# For the essay level parser, each pred_parse is a separate complete parse tree, and should be treated as such.
def get_essays2crels_essay_level(essays, sr_model: SearnModelEssayParserBreadthFirst, top_n, search_mode_max_prob=False):
    trainessay2probs = defaultdict(list)
    for eix, essay in enumerate(essays):
        pred_parse_actions = sr_model.generate_all_potential_parses_for_essay(
                tagged_essay=essay, top_n=top_n,
                search_mode_max_prob=search_mode_max_prob)

        for pp in pred_parse_actions:
            cr2p = collapse_sent_parse([pp])
            trainessay2probs[essay.name].append(dict(cr2p))

        if len(trainessay2probs[essay.name]) == 0:
            trainessay2probs[essay.name] = [dict()]

    # returns a dictionary to a list of dictionaries, instead of a list of probabilties. Each dictionary is then a list of probs
    # conceptually this returns a dictionary of filename to a list of parses, as we don't then generate those later from random smapling
    return trainessay2probs

# apply get_essays2crels.... to each held out fold, and combine into same data structure (dictionary keyed on essay name)
def essay_to_crels_cv_essay_level(cv_folds, models, top_n, search_mode_max_prob=False):
    essay2crelprobs = defaultdict(list)
    assert len(cv_folds) == len(models)
    for (train, test), mdl in zip(cv_folds, models):
        test2probs = get_essays2crels_essay_level(test, mdl, top_n, search_mode_max_prob)
        for k,v in test2probs.items():
            assert k not in essay2crelprobs
            essay2crelprobs[k] = v
    return essay2crelprobs


In [107]:
BEAM_SIZE = 30

In [66]:
%%time
xs_rr_tmp = essay_to_crels_cv_essay_level(cv_folds, models, top_n=BEAM_SIZE, search_mode_max_prob=False)
# 4 mins for BEAM size of 10 - mean of 2 different parses
# 15 mins for BEAM size of 30 - mean of 4 different parses

CPU times: user 14min 40s, sys: 3.05 s, total: 14min 43s
Wall time: 14min 46s


In [108]:
assert len(xs_rr_tmp) == len(pred_tagged_essays_train)

In [109]:
len(xs_rr_tmp), len(xs), len(pred_tagged_essays_train)

(902, 902, 902)

In [69]:
# Compute average number of actual parses in each sentence
def to_parse(dct):
    return tuple(sorted(dct.keys()))

lens = []
for k, dcts in xs_rr_tmp.items():
    unique_parses = set()
    for dct in dcts:
        p = to_parse(dct)
        unique_parses.add(p)
    lens.append(len(unique_parses))
    
np.mean(lens), np.median(lens), np.percentile(lens, 75)

(4.034368070953437, 4.0, 5.0)

In [42]:
import joblib
# These are essays with their predicted CRel probs
# joblib.dump(xs_rerank, "crel_probs/CB/xs_rerank_top_n_" + str(best_top_n) + ".jlib")

['crel_probs/CB/xs_rerank_top_n_3.jlib']

In [110]:
best_max_parses

250

In [112]:
from feature_extraction import get_features_essay_level

In [113]:
xs = get_features_essay_level(\
                xs_rr_tmp, name2crels, min_feat_freq=1)

cv_folds_rerank = cross_validation(xs, 5)
cv_folds_mm = [min_max_normalize_feats(train, test) for (train, test) in cv_folds_rerank]

TypeError: get_features_essay_level() got an unexpected keyword argument 'min_prob'

In [44]:
%%time
f1 = train_model_parallel(cv_folds=cv_folds_mm, name2essay=name2essay, C=best_C, pa_type=1, loss_type="ml", max_update_items=best_max_upd, set_cr_tags=set_cr_tags)
print(f1)  # 0.7421167703055035

0.7332355312641453
CPU times: user 2min 15s, sys: 23.5 s, total: 2min 38s
Wall time: 11min 19s


## Train on Test Data

In [16]:
xs_test_rerank = essay_to_crels_cv(test_folds, final_test_model, top_n=best_top_n, search_mode_max_prob=False)
xs_test = get_features_from_probabilities(xs_test_rerank, name2crels, best_max_parses, min_feat_freq=1,
                                          min_prob=best_min_prob)

In [17]:
# training data comes from the test fold predictions from CV on the full training dataset
xs_train = []
for train, test in cv_folds_rerank:
    xs_train.extend(test)

In [18]:
# Normalize both using training data
xs_train_mm, xs_test_mm = min_max_normalize_feats(xs_train,xs_test)

In [19]:
num_train = int(0.8 * len(xs_train_mm))
tmp_train_copy = list(xs_train_mm)
np.random.shuffle(tmp_train_copy)
tmp_train, tmp_test = tmp_train_copy[:num_train], tmp_train_copy[num_train:]

In [20]:
%%time
C = best_C
pa_type = 1
loss_type= "ml"
max_update_items = best_max_upd

mdl = CostSensitiveMIRA(C=C, pa_type=pa_type, loss_type=loss_type, 
                        max_update_items=max_update_items, initial_weight=0.01)
# Determine number of training iterations
best_mdl, test_acc_df_ml, best_iterations = train_model(mdl, xs_train=tmp_train, xs_test=tmp_test, name2essay=name2essay, set_cr_tags=set_cr_tags,
     max_epochs=20, early_stop_iters=5, train_instance_fn = train_cost_sensitive_instance, verbose=True)

Epoch: 0 Train Accuracy: 0.7429 Test Accuracy: 0.7461
Epoch: 1 Train Accuracy: 0.7456 Test Accuracy: 0.7349
Epoch: 2 Train Accuracy: 0.7466 Test Accuracy: 0.7379
Epoch: 3 Train Accuracy: 0.7470 Test Accuracy: 0.7367
Epoch: 4 Train Accuracy: 0.7471 Test Accuracy: 0.7374
Epoch: 5 Train Accuracy: 0.7491 Test Accuracy: 0.7381
Best Test Acc: 0.7461
CPU times: user 2min 2s, sys: 665 ms, total: 2min 3s
Wall time: 2min 4s


In [21]:
best_iterations

1

In [22]:
mdl = CostSensitiveMIRA(C=C, pa_type=pa_type, loss_type=loss_type, 
                        max_update_items=max_update_items, initial_weight=0.01)

best_mdl, test_acc_df_ml,_ = train_model(mdl,  xs_train=xs_train_mm, xs_test=xs_test_mm,
                                       name2essay=name2essay, set_cr_tags=set_cr_tags,
     max_epochs=best_iterations, early_stop_iters=best_iterations, train_instance_fn = train_cost_sensitive_instance, verbose=True)

Epoch: 0 Train Accuracy: 0.7436 Test Accuracy: 0.7548
Best Test Acc: 0.7548


In [46]:
from filter_features import filter_feats

prefixes = [
    "Prob-",
    "CREL_Pair-",
    "Inv-",
    "num_crels",
    "Tally-",
    "CChain-",
    "CChainStats-",
    "Above-",
    "CREL_"
]
# xs_fltr_train, xs_fltr_test = filter_feats(xs_train_mm, xs_test_mm, prefixes)
assert len(prefixes) == len(set(prefixes)), "Duplicate prefixes found"

In [None]:
best_f1 = -1
current_best = []
remaining = list(prefixes)

while True:
    if len(remaining) == 0:
        break
    
    f1_by_prefix = dict()
    for prefix in remaining:
        new_prefixes = current_best + [prefix]
        
        cv_filtered = []
        for tr, test in cv_folds_mm:
            x_tr,x_test = filter_feats(tr, test, new_prefixes)
            cv_filtered.append((x_tr,x_test))
        
        f1_by_prefix[prefix] = train_model_parallel(cv_folds=cv_filtered, name2essay=name2essay, C=best_C, 
                                  pa_type=1, loss_type="ml", max_update_items=best_max_upd, 
                                  set_cr_tags=set_cr_tags)
    
    best_prefix, new_best_f1 = sorted(f1_by_prefix.items(), key = lambda tpl: -tpl[1])[0]
    if new_best_f1 > best_f1:
        best_f1 = new_best_f1
        current_best.append(best_prefix)
        remaining.remove(best_prefix)
        print("{length} feats, new Best F1: {f1:.4f} Prefixes: {prefixes}".format(
            length=len(current_best), f1=best_f1, prefixes=str(current_best)))
    else:
        print("No further improvement, stopping")
        break

1 feats, new Best F1: 0.7359 Prefixes: ['Above-']
2 feats, new Best F1: 0.7363 Prefixes: ['Above-', 'Inv-']
3 feats, new Best F1: 0.7372 Prefixes: ['Above-', 'Inv-', 'Prob-']
4 feats, new Best F1: 0.7375 Prefixes: ['Above-', 'Inv-', 'Prob-', 'CREL_Pair-']


In [33]:
cv_filtered = []
for tr, test in cv_folds_mm:
    x_tr,x_test = filter_feats(tr, test, prefixes)
    cv_filtered.append((x_tr,x_test))

f1 = train_model_parallel(cv_folds=cv_filtered, name2essay=name2essay, C=best_C, 
                          pa_type=1, loss_type="ml", max_update_items=best_max_upd, 
                          set_cr_tags=set_cr_tags)
f1

0.7393575145678756

## Apply to Test Data

In [34]:
current_best

['CREL_', 'Above-', 'num_crels', 'CChain-', 'Inv-']

In [35]:
xs_train_mm_fltr, xs_test_mm_fltr = filter_feats(xs_train_mm, xs_test_mm, current_best) 

In [36]:
num_train = int(0.8 * len(xs_train_mm_fltr))
tmp_train_copy = list(xs_train_mm_fltr)
np.random.shuffle(tmp_train_copy)
tmp_train, tmp_test = tmp_train_copy[:num_train], tmp_train_copy[num_train:]

In [40]:
%%time
C = best_C
pa_type = 1
loss_type= "ml"
max_update_items = best_max_upd

mdl = CostSensitiveMIRA(C=C, pa_type=pa_type, loss_type=loss_type, 
                        max_update_items=max_update_items, initial_weight=0.01)
# Determine number of training iterations
best_mdl, test_acc_df_ml, best_iterations = train_model(mdl, xs_train=tmp_train, xs_test=tmp_test, name2essay=name2essay, set_cr_tags=set_cr_tags,
     max_epochs=20, early_stop_iters=3, train_instance_fn = train_cost_sensitive_instance, verbose=True)

Epoch: 0 Train Accuracy: 0.7455 Test Accuracy: 0.7365
Epoch: 1 Train Accuracy: 0.7476 Test Accuracy: 0.7368
Epoch: 2 Train Accuracy: 0.7501 Test Accuracy: 0.7351
Epoch: 3 Train Accuracy: 0.7502 Test Accuracy: 0.7360
Epoch: 4 Train Accuracy: 0.7506 Test Accuracy: 0.7382
Epoch: 5 Train Accuracy: 0.7510 Test Accuracy: 0.7389
Epoch: 6 Train Accuracy: 0.7518 Test Accuracy: 0.7384
Epoch: 7 Train Accuracy: 0.7533 Test Accuracy: 0.7379
Epoch: 8 Train Accuracy: 0.7537 Test Accuracy: 0.7366
Best Test Acc: 0.7389
CPU times: user 2min 9s, sys: 866 ms, total: 2min 10s
Wall time: 2min 11s


In [41]:
best_iterations 

6

In [42]:
mdl = CostSensitiveMIRA(C=C, pa_type=pa_type, loss_type=loss_type, 
                        max_update_items=max_update_items, initial_weight=0.01)

best_mdl, test_acc_df_ml,_ = train_model(mdl,  
    xs_train=xs_train_mm_fltr, xs_test=xs_test_mm_fltr,
    name2essay=name2essay, set_cr_tags=set_cr_tags,
    max_epochs=best_iterations, early_stop_iters=best_iterations, 
    train_instance_fn = train_cost_sensitive_instance, verbose=True)

Epoch: 0 Train Accuracy: 0.7432 Test Accuracy: 0.7541
Epoch: 1 Train Accuracy: 0.7435 Test Accuracy: 0.7530
Epoch: 2 Train Accuracy: 0.7458 Test Accuracy: 0.7537
Epoch: 3 Train Accuracy: 0.7474 Test Accuracy: 0.7514
Epoch: 4 Train Accuracy: 0.7467 Test Accuracy: 0.7484
Epoch: 5 Train Accuracy: 0.7480 Test Accuracy: 0.7472
Best Test Acc: 0.7541


In [48]:
sorted(best_mdl.weights.items(), key = lambda tpl: -tpl[1])[0:20]

[('Above-All-Above-0.7', 0.20500000000000013),
 ('num_crels>1', 0.1830896507948776),
 ('Above-All-Above-0.5', 0.1780896507948776),
 ('Above-%-0.8', 0.1559733968260104),
 ('Inv-not_inverted', 0.14250000000000007),
 ('Above-All-Above-0.8', 0.13500000000000006),
 ('Above-%-0.9', 0.12656764285775632),
 ('Above-%-0.7', 0.1263989920641052),
 ('Above-0.8', 0.10645440564387565),
 ('num_crels=2', 0.09808965079487753),
 ('num_crels>2', 0.09500000000000004),
 ('Above-%-0.95', 0.09467577777839115),
 ('Above-0.9', 0.09423218342165329),
 ('CREL_Causer:7->Result:50-MAX(prob)', 0.09251722315791491),
 ('CREL_7:50', 0.09250000000000004),
 ('Above-0.7', 0.09230896507948781),
 ('CREL_Causer:7->Result:50-MIN(prob)', 0.08759919589953284),
 ('CREL_6:7', 0.08250000000000003),
 ('CREL_Causer:1->Result:50-MIN(prob)', 0.08044351245388602),
 ('CREL_Causer:6->Result:7-pred-count=2', 0.08000000000000003)]

# Notes on Remaining Code Changes
- The Beam search approach outputs a list of Dict[str, List[float]], instead of just one Dict[str, List[float]]
- However, we don't need to sample from the crels, we will just use the already generated parses, after de-duping
- ParserInputs needs modifying so that it takes a list of crel2probs instead of on dict for all parses
- Need to figure out what the optimal parse is based on amount of overlap with the actual crels, minus the false positives

# TODO 
- include the cum prob from the parse action result as a feature? - or simply compute the geometric mean of the probs?
- To speed up MIRA, de-dupe the generated parses prior to feature extraction. Where there are dupes, take the one with the highest cum prob
