In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
!pwd

/Users/simon.hughes/GitHub/NlpResearch/PythonNlpResearch/notebooks/Causal Model


In [28]:
import os
import sys
# cwd = os.getcwd()
cm_folder = "/Users/simon.hughes/GitHub/NlpResearch/PythonNlpResearch/notebooks/Causal Model/"
models_folder = os.path.join(cm_folder, "BEAM Parser/models/")
src_path = os.path.join(cm_folder, "src")
sys.path.append(src_path)

'/Users/simon.hughes/GitHub/NlpResearch/PythonNlpResearch/notebooks/Causal Model/BEAM Parser/models/'

In [4]:
from typing import Any

import dill
from sklearn.linear_model import LogisticRegression
import numpy as np

from CrossValidation import cross_validation
from MIRA import CostSensitiveMIRA
from Settings import Settings
from crel_helper import get_cr_tags
from crel_processing import essay_to_crels_cv
from evaluation import evaluate_model_essay_level, get_micro_metrics, metrics_to_df
from feature_extraction import get_features_from_probabilities
from feature_normalization import min_max_normalize_feats
from function_helpers import get_function_names
from results_procesor import ResultsProcessor
from train_parser import essay_to_crels, create_extractor_functions
from cost_functions import micro_f1_cost_plusepsilon
from train_reranker import train_model_parallel, train_model, train_cost_sensitive_instance
from window_based_tagger_config import get_config

In [5]:
# Data Set Partition
CV_FOLDS = 5
MIN_FEAT_FREQ = 5

# Global settings
settings = Settings()
root_folder = settings.data_directory + "CoralBleaching/Thesis_Dataset/"
training_folder = root_folder + "Training" + "/"
test_folder = root_folder + "Test" + "/"

coref_root = root_folder + "CoReference/"
coref_output_folder = coref_root + "CRel/"

config = get_config(training_folder)

train_fname = coref_output_folder + "training_crel_anatagged_essays_most_recent_code.dill"
with open(train_fname, "rb") as f:
    pred_tagged_essays_train = dill.load(f)

test_fname = coref_output_folder + "test_crel_anatagged_essays_most_recent_code.dill"
with open(test_fname, "rb") as f:
    pred_tagged_essays_test = dill.load(f)

print(len(pred_tagged_essays_train), len(pred_tagged_essays_test))

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/
902 226


In [6]:
cr_tags = get_cr_tags(train_tagged_essays=pred_tagged_essays_train, tag_essays_test=pred_tagged_essays_test)

set_cr_tags = set(cr_tags)
list(set_cr_tags)[0:10]

['Causer:14->Result:6',
 'Causer:11->Result:50',
 'Causer:11->Result:4',
 'Causer:4->Result:14',
 'Causer:5b->Result:14',
 'Causer:11->Result:11',
 'Causer:2->Result:7',
 'Causer:4->Result:50',
 'Causer:11->Result:14',
 'Causer:7->Result:50']

In [7]:
base_extractors, all_extractor_fns, all_cost_functions = create_extractor_functions()

all_extractor_fn_names = get_function_names(all_extractor_fns)
base_extractor_fn_names = get_function_names(base_extractors)
all_cost_fn_names = get_function_names(all_cost_functions)

ngrams = 1
stemmed = True
cost_function_name = micro_f1_cost_plusepsilon.__name__
dual = True
fit_intercept = True
beta = 0.5
max_epochs = 2
C = 0.5
penalty = "l2"

In [8]:
from searn_essay_parser_breadth_first import SearnModelEssayParserBreadthFirst

In [9]:
test_folds = [(pred_tagged_essays_train, pred_tagged_essays_test)]  # type: List[Tuple[Any,Any]]
cv_folds = cross_validation(pred_tagged_essays_train, CV_FOLDS)  # type: List[Tuple[Any,Any]]

In [10]:
len(pred_tagged_essays_train)

902

In [11]:
BASE_LEARNER_FACT = lambda: LogisticRegression(dual=dual, C=C, penalty=penalty, fit_intercept=fit_intercept)
best_extractor_names = ['single_words', 'between_word_features', 'label_set',
                        'three_words', 'third_order', 'unigrams']  # type: List[str]


In [14]:
%%time
result_test_essay_level = evaluate_model_essay_level(
    folds=cv_folds,
    extractor_fn_names_lst=best_extractor_names,
    all_extractor_fns=all_extractor_fns,
    ngrams=ngrams,
    beta=beta,
    stemmed=stemmed,
    max_epochs=max_epochs,
    min_feat_freq=MIN_FEAT_FREQ, 
    cr_tags=set_cr_tags,
    base_learner_fact=BASE_LEARNER_FACT, 
    down_sample_rate=1.0, model = SearnModelEssayParserBreadthFirst)

CPU times: user 5min 52s, sys: 4.61 s, total: 5min 57s
Wall time: 5min 57s


## Training Accuracy

In [15]:
models, cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag, cv_td_preds_by_sent, cv_sent_vd_ys_by_tag = result_test_essay_level

mean_metrics = ResultsProcessor.compute_mean_metrics(cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag)
get_micro_metrics(metrics_to_df(mean_metrics))

Unnamed: 0,accuracy,f1_score,recall,precision
95,0.986087,0.782476,0.749635,0.818327


## CV Accuracy

In [20]:
models, cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag, cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag = result_test_essay_level

mean_metrics = ResultsProcessor.compute_mean_metrics(cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag)
get_micro_metrics(metrics_to_df(mean_metrics))

Unnamed: 0,accuracy,f1_score,recall,precision
95,0.982907,0.736129,0.714234,0.75941


### Get the Expected Crels Per Essay

In [21]:
all_essays = pred_tagged_essays_train + pred_tagged_essays_test
name2essay = {}
for essay in all_essays:
    name2essay[essay.name] = essay

name2crels = essay_to_crels(all_essays, set_cr_tags)
assert len(name2crels) == len(all_essays)

### Init Parameter Settings

## Train Re-Ranker

In [22]:
from collections import defaultdict
from parse_generator import collapse_sent_parse

# ESSAY Parser
# For the essay level parser, each pred_parse is a separate complete parse tree, and should be treated as such.
def get_essays2crels_essay_level(essays, sr_model: SearnModelEssayParserBreadthFirst, top_n, search_mode_max_prob=False):
    trainessay2probs = defaultdict(list)
    for eix, essay in enumerate(essays):
        pred_parse_actions = sr_model.generate_all_potential_parses_for_essay(
                tagged_essay=essay, top_n=top_n,
                search_mode_max_prob=search_mode_max_prob)

        for pp in pred_parse_actions:
            cr2p = collapse_sent_parse([pp])
            trainessay2probs[essay.name].append(dict(cr2p))

        if len(trainessay2probs[essay.name]) == 0:
            trainessay2probs[essay.name] = [dict()]

    # returns a dictionary to a list of dictionaries, instead of a list of probabilties. Each dictionary is then a list of probs
    # conceptually this returns a dictionary of filename to a list of parses, as we don't then generate those later from random smapling
    return trainessay2probs

# apply get_essays2crels.... to each held out fold, and combine into same data structure (dictionary keyed on essay name)
def essay_to_crels_cv_essay_level(cv_folds, models, top_n, search_mode_max_prob=False):
    essay2crelprobs = defaultdict(list)
    assert len(cv_folds) == len(models)
    for (train, test), mdl in zip(cv_folds, models):
        test2probs = get_essays2crels_essay_level(test, mdl, top_n, search_mode_max_prob)
        for k,v in test2probs.items():
            assert k not in essay2crelprobs
            essay2crelprobs[k] = v
    return essay2crelprobs


In [None]:
assert len(xs_rr_tmp) == len(pred_tagged_essays_train)
len(xs_rr_tmp), len(xs), len(pred_tagged_essays_train)

In [125]:
BEAM_SIZE = 100

In [126]:
%%time
xs_rr_tmp = essay_to_crels_cv_essay_level(cv_folds, models, top_n=BEAM_SIZE, search_mode_max_prob=False)
# 4 mins for BEAM size of 10 - mean of 2 different parses
# 15 mins for BEAM size of 30 - mean of 4 different parses

CPU times: user 49min 42s, sys: 7.46 s, total: 49min 50s
Wall time: 49min 56s


## De-Dupe the Parses

In [127]:
from feature_extraction import dict2parse

def de_dupe_parses(xs_rr):
    xs_rr_new = dict()
    for ename, parses in xs_rr.items():   
        parse2meanprob = defaultdict(float)
        parse2bestparse = dict()
        # get all identical parses, and store only the most probable one    
        for p in parses:
            parse_tuple = dict2parse(p)
            p_probs = []
            for crels, list_probs in p.items():
                p_probs.extend(list_probs)
            # use the mean here, so as not to penalize longer parses where the same crel
            # has been parsed more than once
            mean_prob = np.mean(p_probs) if len(p_probs) > 0 else 0

            # Make sure all parses have one parse stored
            if parse_tuple not in parse2bestparse:            
                parse2bestparse[parse_tuple] = p
                parse2meanprob[parse_tuple]  = mean_prob
            # if parse stored, replace with more probable one
            elif mean_prob > parse2meanprob[parse_tuple]:
                parse2meanprob[parse_tuple] = mean_prob
                parse2bestparse[parse_tuple] = p

        xs_rr_new[ename] = list(parse2bestparse.values())
    return xs_rr_new

In [128]:
xs_rr = de_dupe_parses(xs_rr_tmp)
assert len(xs_rr) == len(xs_rr_tmp)
len(xs_rr)

902

In [129]:
lens = []
for (_, rr) in xs_rr.items():
    if len(rr) == 0:
        print(rr, ename)
    print(ename, len(rr))
    lens.append(len(rr))
np.mean(lens)

EBA1415_AEKD_4_CB_ES-05572.ann 2
EBA1415_AEKD_4_CB_ES-05572.ann 19
EBA1415_AEKD_4_CB_ES-05572.ann 4
EBA1415_AEKD_4_CB_ES-05572.ann 16
EBA1415_AEKD_4_CB_ES-05572.ann 5
EBA1415_AEKD_4_CB_ES-05572.ann 2
EBA1415_AEKD_4_CB_ES-05572.ann 1
EBA1415_AEKD_4_CB_ES-05572.ann 11
EBA1415_AEKD_4_CB_ES-05572.ann 4
EBA1415_AEKD_4_CB_ES-05572.ann 2
EBA1415_AEKD_4_CB_ES-05572.ann 4
EBA1415_AEKD_4_CB_ES-05572.ann 4
EBA1415_AEKD_4_CB_ES-05572.ann 6
EBA1415_AEKD_4_CB_ES-05572.ann 2
EBA1415_AEKD_4_CB_ES-05572.ann 18
EBA1415_AEKD_4_CB_ES-05572.ann 1
EBA1415_AEKD_4_CB_ES-05572.ann 1
EBA1415_AEKD_4_CB_ES-05572.ann 16
EBA1415_AEKD_4_CB_ES-05572.ann 14
EBA1415_AEKD_4_CB_ES-05572.ann 8
EBA1415_AEKD_4_CB_ES-05572.ann 8
EBA1415_AEKD_4_CB_ES-05572.ann 4
EBA1415_AEKD_4_CB_ES-05572.ann 4
EBA1415_AEKD_4_CB_ES-05572.ann 16
EBA1415_AEKD_4_CB_ES-05572.ann 8
EBA1415_AEKD_4_CB_ES-05572.ann 17
EBA1415_AEKD_4_CB_ES-05572.ann 2
EBA1415_AEKD_4_CB_ES-05572.ann 4
EBA1415_AEKD_4_CB_ES-05572.ann 4
EBA1415_AEKD_4_CB_ES-05572.ann 2
EB

7.873614190687362

In [130]:
import dill

rr_fname = "xs_rerank_" + str(BEAM_SIZE) + ".dill"
with open(os.path.join(models_folder, rr_fname), "wb+") as f:
    dill.dump(xs_rr, f)

In [131]:
! ls -alh '/Users/simon.hughes/GitHub/NlpResearch/PythonNlpResearch/notebooks/Causal Model/BEAM Parser/models/' | grep .dill

-rw-r--r--  1 simon.hughes  1702704586   611K Jun  7 21:21 xs_rerank_10.dill
-rw-r--r--  1 simon.hughes  1702704586   1.8M Jun  7 22:11 xs_rerank_100.dill


# TODO 
- include the cum prob from the parse action result as a feature? - or simply compute the geometric mean of the probs?
- To speed up MIRA, de-dupe the generated parses prior to feature extraction. Where there are dupes, take the one with the highest cum prob
