In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import datetime
import logging
from collections import defaultdict

import dill
import numpy as np
import pymongo
import pandas as pd
from sklearn.linear_model import LogisticRegression
from typing import Any

from CrossValidation import cross_validation
from Settings import Settings
from cost_functions import *
from crel_helper import get_cr_tags
from function_helpers import get_function_names, get_functions_by_name
from results_procesor import ResultsProcessor, __MICRO_F1__
from searn_parser import SearnModelTemplateFeatures
from template_feature_extractor import *
from window_based_tagger_config import get_config
from wordtagginghelper import merge_dictionaries
from searn_parser_breadth_first import ParseActionResult, SearnModelBreadthFirst
from MIRA import MIRA, CostSensitiveMIRA
from joblib import Parallel, delayed

In [3]:
# Data Set Partition
CV_FOLDS = 5
MIN_FEAT_FREQ = 5

# Global settings
settings = Settings()
root_folder = settings.data_directory + "CoralBleaching/Thesis_Dataset/"
training_folder = root_folder + "Training" + "/"
test_folder = root_folder + "Test" + "/"

coref_root = root_folder + "CoReference/"
coref_output_folder = coref_root + "CRel/"

config = get_config(training_folder)

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/


In [4]:
train_fname = coref_output_folder + "training_crel_anatagged_essays_most_recent_code.dill"
with open(train_fname, "rb") as f:
    pred_tagged_essays_train = dill.load(f)

test_fname = coref_output_folder + "test_crel_anatagged_essays_most_recent_code.dill"
with open(test_fname, "rb") as f:
    pred_tagged_essays_test = dill.load(f)

len(pred_tagged_essays_train),len(pred_tagged_essays_test)

(902, 226)

In [5]:
EMPTY = "Empty"
from BrattEssay import ANAPHORA

def to_is_valid_crel(tags):
    filtered = set()
    for t in tags:
        t_lower = t.lower()
        if "rhetorical" in t_lower or "change" in t_lower or "other" in t_lower:
            continue
        if "->" in t and ANAPHORA not in t:
            filtered.add(t)
    return filtered

def get_crel_tags_by_sent(essays_a):
    crels_by_sent = []
    for ea in essays_a:
        for asent in ea.sentences:
            all_atags = set()
            for awd, atags in asent:
                all_atags.update(to_is_valid_crel(atags))
            crels_by_sent.append(all_atags)
    return crels_by_sent

In [6]:
cr_tags = get_cr_tags(train_tagged_essays=pred_tagged_essays_train, tag_essays_test=pred_tagged_essays_test)
cr_tags[0:10]

['Causer:5->Result:50',
 'Causer:7->Result:50',
 'Causer:3->Result:4',
 'Causer:11->Result:50',
 'Causer:1->Result:50',
 'Causer:13->Result:50',
 'Causer:6->Result:50',
 'Causer:3->Result:5',
 'Causer:4->Result:14',
 'Causer:3->Result:1']

In [7]:
set_cr_tags = set(cr_tags)

In [8]:
def evaluate_model_essay_level(
        folds: List[Tuple[Any, Any]],
        extractor_fn_names_lst: List[str],
        cost_function_name: str,
        beta: float,
        ngrams: int,
        stemmed: bool,
        max_epochs: int,
        down_sample_rate=1.0) -> float:

    if down_sample_rate < 1.0:
        new_folds = []  # type: List[Tuple[Any, Any]]
        for i, (essays_TD, essays_VD) in enumerate(folds):
            essays_TD = essays_TD[:int(down_sample_rate * len(essays_TD))]
            essays_VD = essays_VD[:int(down_sample_rate * len(essays_VD))]
            new_folds.append((essays_TD, essays_VD))
        folds = new_folds  # type: List[Tuple[Any, Any]]

    serial_results = [
        train_sr_parser(essays_TD, essays_VD, extractor_fn_names_lst, cost_function_name, ngrams, stemmed, beta, max_epochs)
        for essays_TD, essays_VD in folds
    ]

    cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag = defaultdict(list), defaultdict(list)
    cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag = defaultdict(list), defaultdict(list)

    # record the number of features in each fold
    number_of_feats = []

    # Parallel is almost 5X faster!!!
    parser_models = []
    for (model, num_feats,
         sent_td_ys_bycode, sent_vd_ys_bycode,
         sent_td_pred_ys_bycode, sent_vd_pred_ys_bycode) in serial_results:
        number_of_feats.append(num_feats)

        parser_models.append(model)
        merge_dictionaries(sent_td_ys_bycode, cv_sent_td_ys_by_tag)
        merge_dictionaries(sent_vd_ys_bycode, cv_sent_vd_ys_by_tag)
        merge_dictionaries(sent_td_pred_ys_bycode, cv_sent_td_predictions_by_tag)
        merge_dictionaries(sent_vd_pred_ys_bycode, cv_sent_vd_predictions_by_tag)

    # print(processor.results_to_string(sent_td_objectid, CB_SENT_TD, sent_vd_objectid, CB_SENT_VD, "SENTENCE"))
    return parser_models, cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag, cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag

In [9]:
def add_labels(observed_tags, ys_bytag_sent):
    global set_cr_tags
    for tag in set_cr_tags:
        if tag in observed_tags:
            ys_bytag_sent[tag].append(1)
        else:
            ys_bytag_sent[tag].append(0)
            
def get_label_data_essay_level(tagged_essays):
    global set_cr_tags
    # outputs
    ys_bytag_essay = defaultdict(list)

    for essay in tagged_essays:
        unique_cr_tags = set()
        for sentence in essay.sentences:
            for word, tags in sentence:
                unique_cr_tags.update(set_cr_tags.intersection(tags))
        add_labels(unique_cr_tags, ys_bytag_essay)
    return dict(ys_bytag_essay) # convert to dict so no issue when iterating over if additional keys are present

def essay_to_crels(tagged_essays):
    global set_cr_tags
    # outputs
    name2crels = defaultdict(set)
    for essay in tagged_essays:
        unique_cr_tags = set()
        for sentence in essay.sentences:
            for word, tags in sentence:
                unique_cr_tags.update(set_cr_tags.intersection(tags))
        name2crels[essay.name] = unique_cr_tags
    return dict(name2crels)

In [10]:
def metrics_to_df(metrics):
    import Rpfa

    rows = []
    for k,val in metrics.items():
        if type(val) == Rpfa.rpfa:
            d = dict(val.__dict__) # convert obj to dict
        elif type(val) == dict:
            d = dict(val)
        else:
            d = dict()
        d["code"] = k
        rows.append(d)
    return pd.DataFrame(rows)

def get_micro_metrics(df):
    return df[df.code == "MICRO_F1"][["accuracy", "f1_score", "recall", "precision"]]

def predict_essay_level(parser, essays):
    pred_ys_by_sent = defaultdict(list)
    for essay_ix, essay in enumerate(essays):
        unq_pre_relations = set()
        for sent_ix, taggged_sentence in enumerate(essay.sentences):
            predicted_tags = essay.pred_tagged_sentences[sent_ix]
            pred_relations = parser.predict_sentence(taggged_sentence, predicted_tags)
            unq_pre_relations.update(pred_relations)
        # Store predictions for evaluation
        add_labels(unq_pre_relations, pred_ys_by_sent)
    return pred_ys_by_sent

In [11]:
LINE_WIDTH = 80

# other settings
DOWN_SAMPLE_RATE = 1.0  # For faster smoke testing the algorithm
BASE_LEARNER_FACT = None
COLLECTION_PREFIX = "CR_CB_SHIFT_REDUCE_PARSER_TEMPLATED_MOST_RECENT_CODE"

# some of the other extractors aren't functional if the system isn't able to do a basic parse
# so the base extractors are the MVP for getting to a basic parser, then additional 'meta' parse
# features from all_extractors can be included
base_extractors = [
    single_words,
    word_pairs,
    three_words,
    between_word_features
]

all_extractor_fns = base_extractors + [
    word_distance,
    valency,
    unigrams,
    third_order,
    label_set,
    size_features
]

all_cost_functions = [
    micro_f1_cost,
    micro_f1_cost_squared,
    micro_f1_cost_plusone,
    micro_f1_cost_plusepsilon,
    binary_cost,
    inverse_micro_f1_cost,
    uniform_cost
]

all_extractor_fn_names = get_function_names(all_extractor_fns)
base_extractor_fn_names = get_function_names(base_extractors)
all_cost_fn_names = get_function_names(all_cost_functions)

ngrams = 1
stemmed = True
cost_function_name = micro_f1_cost_plusepsilon.__name__
dual = True
fit_intercept = True
beta = 0.5
max_epochs = 2
C = 0.5
penalty = "l2"

In [12]:
# Note these also differ for SC dataset
BASE_LEARNER_FACT = lambda : LogisticRegression(dual=dual, C=C, penalty=penalty, fit_intercept=fit_intercept)
best_extractor_names = ['single_words', 'between_word_features', 'label_set',
                                    'three_words', 'third_order', 'unigrams'] # type: List[str]

In [13]:
def train_sr_parser(essays_TD, essays_VD, extractor_names, cost_function_name, ngrams, stemmed, beta, max_epochs):
    extractors = get_functions_by_name(extractor_names, all_extractor_fns)
    # get single cost function
    cost_fn = get_functions_by_name([cost_function_name], all_cost_functions)[0]
    assert cost_fn is not None, "Cost function look up failed"
    # Ensure all extractors located
    assert len(extractors) == len(extractor_names), "number of extractor functions does not match the number of names"

    template_feature_extractor = NonLocalTemplateFeatureExtractor(extractors=extractors)
    if stemmed:
        ngram_extractor = NgramExtractorStemmed(max_ngram_len=ngrams)
    else:
        ngram_extractor = NgramExtractor(max_ngram_len=ngrams)
    parse_model = SearnModelBreadthFirst(feature_extractor=template_feature_extractor,
                                             cost_function=cost_fn,
                                             min_feature_freq=MIN_FEAT_FREQ,
                                             ngram_extractor=ngram_extractor, cr_tags=cr_tags,
                                             base_learner_fact=BASE_LEARNER_FACT,
                                             beta=beta,
                                             # log_fn=lambda s: print(s))
                                             log_fn=lambda s: None)

    parse_model.train(essays_TD, max_epochs=max_epochs)

    num_feats = template_feature_extractor.num_features()

    sent_td_ys_bycode = get_label_data_essay_level(essays_TD)
    sent_vd_ys_bycode = get_label_data_essay_level(essays_VD)

    sent_td_pred_ys_bycode = predict_essay_level(parse_model, essays_TD)
    sent_vd_pred_ys_bycode = predict_essay_level(parse_model, essays_VD)

    return parse_model, num_feats, sent_td_ys_bycode, sent_vd_ys_bycode, sent_td_pred_ys_bycode, sent_vd_pred_ys_bycode


In [14]:
test_folds     = [(pred_tagged_essays_train, pred_tagged_essays_test)]  # type: List[Tuple[Any,Any]]

In [15]:
cv_folds = cross_validation(pred_tagged_essays_train, CV_FOLDS)  # type: List[Tuple[Any,Any]]

## Essay Level Results

In [16]:
result_test_essay_level = evaluate_model_essay_level(
    folds=cv_folds,
    extractor_fn_names_lst=best_extractor_names,
    cost_function_name=cost_function_name,
    ngrams=ngrams,
    beta=beta,
    stemmed=stemmed,
    down_sample_rate=DOWN_SAMPLE_RATE,
    max_epochs=max_epochs)

## Train

In [17]:
models, cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag, cv_td_preds_by_sent, \
    cv_sent_vd_ys_by_tag = result_test_essay_level
    
mean_metrics = ResultsProcessor.compute_mean_metrics(cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag)
get_micro_metrics(metrics_to_df(mean_metrics))

Unnamed: 0,accuracy,f1_score,recall,precision
95,0.985703,0.780099,0.759672,0.801656


## Test

In [18]:
models, cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag, \
    cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag = result_test_essay_level
    
mean_metrics = ResultsProcessor.compute_mean_metrics(cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag)
get_micro_metrics(metrics_to_df(mean_metrics))

Unnamed: 0,accuracy,f1_score,recall,precision
95,0.982846,0.739837,0.730657,0.749251


# Train Re-Ranker Model

In [19]:
from itertools import combinations

def get_possible_crels(predicted_tags):
    if len(predicted_tags) < 2:
        return set()
    predicted_tags = sorted(predicted_tags)
    pred_crels = set()
    for a,b in combinations(predicted_tags, 2):
        pred_crels.add("Causer:{a}->Result:{b}".format(a=a, b=b))
        pred_crels.add("Causer:{b}->Result:{a}".format(a=a, b=b))
    return pred_crels

def to_canonical_parse(crels):
    return tuple(sorted(crels))

def get_crels(parse):
    crels = set()
    p = parse
    while p:
        if p.relations:
            crels.update(p.relations)
        p = p.parent_action
    return crels

In [20]:
from searn_parser_breadth_first import geo_mean

def collapse_sent_parse(pred_parses):
    crel2prob = defaultdict(list)
    for pact in pred_parses:
        act_seq = pact.get_action_sequence()
        for act in act_seq:
            if not act.relations:
                continue

            assert act.lr_action_prob >= 0
            prob = geo_mean([act.action_prob * act.lr_action_prob])
            for r in act.relations:
                crel2prob[r].append(prob)
    return crel2prob

def merge_crel_probs(a, b):    
    for k,v in b.items():
        a[k].extend(v)
    return a

def get_max_probs(crel2probs):
    crel2max_prob = dict()
    for crel, probs in crel2probs.items():
        crel2max_prob[crel] = max(probs)
    return crel2max_prob

In [21]:
from itertools import combinations

def get_all_combos(items):
    # enforces a consistent ordering for the resulting tuples
    items = sorted(items) 
    cbos = [()] # seed with the empty combo
    for i in range(1, len(items)+1):
        cbos.extend(combinations(items,i))
    return cbos

cbos = get_all_combos([3,2,1])
print(len(cbos)) # 2**len(items)-1
if len(cbos) < 1000:
    for cbo in sorted(cbos, key = lambda l: (len(l), l)):
        print(cbo)

8
()
(1,)
(2,)
(3,)
(1, 2)
(1, 3)
(2, 3)
(1, 2, 3)


## Generate Parses

In [22]:
def to_parse(lst):
    return tuple(sorted(lst))

def sample_top_parses(crel2maxprobs, top_n):

    max_parses = 2**len(crel2maxprobs) # maximum parse combinations
    assert max_parses > top_n, (max_parses, top_n) # otherwise brute force it

    top_parses = set([()]) # always seed with the empty parse
    probs = []
    while len(top_parses) < top_n:
        new_parse = []
        for crel, prob in crel2maxprobs.items():
            rand_val = np.random.random() # random number >= 0 and < 1
            if rand_val < prob:
                new_parse.append(crel)
        # make hashable and enforce consistent order
        top_parses.add(to_parse(new_parse))
    
    return list(top_parses)

def get_top_parses(crel2maxprobs, threshold=0.5):
    top_parse = [crel for crel, prob in crel2maxprobs.items() if prob >= threshold]
    if top_parse:
        return [()] + [to_parse(top_parse)]
    else:
        return [()]
    
def get_top_n_parses(crel2maxprobs, top_n):
    top_parses = [()]
    by_prob = sorted(crel2maxprobs.keys(), key = lambda k: -crel2maxprobs[k])
    for i in range(1, min(top_n, len(crel2maxprobs))+1):
        parse = by_prob[:i]
        top_parses.append(to_parse(parse))
    return top_parses

def get_top_n_parses2(crel2maxprobs, top_n):
    top_parses = [()]
    by_prob = sorted(crel2maxprobs.keys(), key = lambda k: -crel2maxprobs[k])
    num_predicted = len([crel for crel in by_prob if crel2maxprobs[crel] >= 0.5])
    for i in range(num_predicted-1, len(by_prob)+1):
        parse = by_prob[:i]
        top_parses.append(to_parse(parse))
        if len(top_parses) > top_n:
            break
    return top_parses

crel_probs = {
    "1->2":   0.8,
    "2->3":   0.01,
    "5->8":   0.25,
    "10->12": 0.75,
    "12->50": 0.99,
    "3->4":   0.50,
}

# important - should see a lot more of the more probable codes
# sample_top_parses(crel_probs, 8)
get_top_n_parses2(crel_probs, 1)

[(), ('1->2', '10->12', '12->50')]

## Parser Feature Extraction

In [66]:
from NgramGenerator import compute_ngrams

def to_short_tag(tag):
    return tag.replace("Causer:","").replace("Result:", "")

def build_chains_inner(tree, l, visited, depth=0):
    chains = []
    if l not in tree:
        return chains
    for r in tree[l]:
        if r in visited:
            continue
        visited.add(r) # needed to prevent cycles, which cause infinite recursion
        extensions = build_chains_inner(tree, r, visited, depth+1)
        visited.remove(r)
        for ch in extensions:
            chains.append([r] + ch)
        if not extensions:
            chains.append([r])
    return chains

def build_chains(tree):    
    lhs_items = set(tree.keys())
    rhs_items = set()
    for l,rhs in tree.items():        
        rhs_items.update(rhs)
    
    chains = []
    # starting positions of each chain are those appearing on the lhs but not the rhs
    start_codes = lhs_items - rhs_items    
    for l in start_codes:
        rhs = tree[l]
        for r in rhs:
            for ch in build_chains_inner(tree, r, {l,r}, 0):
                chains.append([l,r] + ch)
    return chains

def extend_chains(chains):
    ext_chains = set()
    for tokens in chains:
        ext_chains.add(",".join(tokens))
        ngrams = compute_ngrams(tokens,max_len=None, min_len=3)
        for t in ngrams:
            ext_chains.add(",".join(t))
    return ext_chains

def extract_features_from_parse(parse, crel2probs):
    
    feats = defaultdict(float)
    tree = defaultdict(set) # maps causers to effects for building chains
    max_probs = []    
    code_tally = defaultdict(float)
    ce_tally = defaultdict(float)
    
    pairs = set()
    inverted_count = 0
    for crel in parse:
        probs = crel2probs[crel]
        max_p = max(probs)
        max_probs.append(max_p)
        feats["{crel}-MAX(prob)".format(crel=crel)] = max_p
        feats["{crel}-MIN(prob)".format(crel=crel)] = min(probs)
        feats["{crel}-pred-count".format(crel=crel)] = len(probs)
        feats["{crel}-pred-count={count}".format(crel=crel, count=len(probs))] = 1
        
        # with type
        l,r = crel.split("->")
        ce_tally[l] +=1
        ce_tally[r] +=1
        
        # without type
        l_short, r_short = to_short_tag(l), to_short_tag(r)
        code_tally[l_short] +=1
        code_tally[r_short] +=1
        # ordering of the codes, ignoring the causal direction
        feats[l_short + ":" + r_short] = 1
        
        # build tree structure so we can retrieve the chains
        tree[l_short].add(r_short)
        
        # track whether the rule exists in the opposite direction
        pairs.add((l_short,r_short))
        if (r_short,l_short) in pairs:
            inverted_count += 1
            
    if inverted_count:
        feats["inverted"] = 1
        feats["num_inverted"] = inverted_count
    else:
        feats["not_inverted"] = 1
    
    # counts
    feats.update(ce_tally)
    feats.update(code_tally)
    
    if len(code_tally) > 0:
        max_valency = max(code_tally.values())
        feats["Max_Valency"] = max_valency
        for i in range(1,4):
            feats["Max_Valency<={i} = {truth_val}".format(i=i, truth_val = max_valency <= i)]

    if len(ce_tally) > 0:
        max_ce_valency = max(ce_tally.values())
        feats["Max_CE_Valency"] = max_ce_valency
        for i in range(1,4):
            feats["Max_CE_Valency<={i} = {truth_val}".format(i=i, truth_val = max_ce_valency <= i)]

    diffs = []
    num_b4 = 0
    num_after = 0
    num_same = 0
    for l,r in pairs:
        lnum = float(l.replace("b",""))
        rnum = float(r.replace("b",""))
        diffs.append(abs(lnum - rnum))
        if rnum > lnum:
            num_after +=1
        elif lnum > rnum:
            num_b4 += 1
        else:
            num_same += 1
        feats["Pair:" + ",".join((l,r))] = 1
        feats["Unique_Pair:" + ",".join(sorted((l,r)))] = 1

    if num_b4 > num_after:
        feats["More_B4"] = 1
    elif num_after > num_b4:
        feats["More_After"] = 1

    if num_same > 0:
        feats["SameToSame"] = 1
        feats["NumSame=" + str(num_same)] = 1

    if len(diffs) > 0:
        feats["avg-diff"] = np.mean(diffs)
        feats["med-diff"] = np.median(diffs)
        feats["min-diff"] = np.min(diffs)
        feats["max-diff"] = np.max(diffs)
    
    num_crels = len(parse)
    feats["num_crels"] = num_crels
    feats["num_crels="+str(len(parse))] = 1 # includes a tag for the empty parse
    for i in range(1,11):
        if num_crels <= i:
            feats["num_crels<={i}".format(i=i)] = 1
        else:
            feats["num_crels>{i}".format(i=i)] = 1
        
    # combination of crels
    # need to sort so that order of a and b is consistent across parses
    cbo_pairs = combinations(sorted(parse), r=2)
    for a, b in cbo_pairs:
        feats["{a}|{b}".format(a=a, b=b)] = 1
        
    #chains
    causer_chains = extend_chains(build_chains(tree))
    max_ch_len = 0
    for ch in causer_chains:
        feats["CChain:" + ch] = 1
        max_ch_len = max(max_ch_len, len(ch.split(",")))
    
    if max_ch_len > 0:
        feats["Max_Chain_Len"] = max_ch_len
    feats["Max_Chain_Len=" + str(max_ch_len)] = 1
    
    if max_probs: # might be an empty parse
        for cutoff in [0.2, 0.3, 0.5, 0.7, 0.8, 0.9, 0.95]:
            above =  len([p for p in max_probs if p >=cutoff])
            feats["Above-{cutoff}".format(cutoff=cutoff)] = above
            feats["%-Above-{cutoff}".format(cutoff=cutoff)] = above/len(max_probs)
            if above == len(max_probs):
                feats["All-Above-{cutoff}".format(cutoff=cutoff)] = 1
        
        feats["avg-prob"] = np.mean(max_probs)
        feats["med-prob"] = np.median(max_probs)
        feats["prod-prob"]= np.product(max_probs)
        feats["min-prob"] = np.min(max_probs)
        feats["max-prob"] = np.max(max_probs)
        for p in [5, 10, 25, 75, 90, 95]:
            feats["{p}%-prob".format(p=p)] = np.percentile(max_probs, p)
        # geometric mean
        feats["geo-mean"] = np.prod(max_probs)**(1/len(max_probs))
    return feats

In [24]:
all_essays = pred_tagged_essays_train + pred_tagged_essays_test
name2essay = {}
for essay in all_essays:
    name2essay[essay.name] = essay
    
name2crels = essay_to_crels(all_essays)
assert len(name2crels) == len(all_essays)

In [25]:
def compute_costs(parser_input):
    opt_parse = parser_input.opt_parse
    other_parses = parser_input.other_parses

    other_costs = []
    op = set(opt_parse)
    for p in other_parses:
        p = set(p)
        fp = p - op
        fn = op - p
        cost = len(fp) + len(fn)
        other_costs.append(cost)
    return other_costs

def copy_dflt_dict(d):
    copy = defaultdict(d.default_factory)
    copy.update(d)
    return copy

class ParserInputs(object):
    def __init__(self, essay_name, opt_parse, all_parses, crel2probs, compute_feats=True):
        self.essay_name = essay_name
        self.opt_parse = opt_parse
        self.crel2probs = crel2probs
        
        if compute_feats:
            self.opt_features = extract_features_from_parse(opt_parse, crel2probs)
            
            other_parses = []
            other_feats_array = []
            all_feats_array = []
            for p in all_parses:
                feats = extract_features_from_parse(p, crel2probs)
                all_feats_array.append(feats)
                if p != opt_parse:
                    other_parses.append(p)
                    other_feats_array.append(feats)

            self.all_feats_array = all_feats_array
            self.other_parses = other_parses
            self.other_features_array = other_feats_array
            self.other_costs_array = compute_costs(self)
                    
        self.all_parses = all_parses
        
    def clone_without_feats(self):
        c = ParserInputs(essay_name=self.essay_name, opt_parse=self.opt_parse, 
                         all_parses=self.all_parses, crel2probs=self.crel2probs, compute_feats=False)
        
        c.other_parses = self.other_parses
        c.other_costs_array = self.other_costs_array
        return c

    def clone(self):
        c = ParserInputs(essay_name=self.essay_name, opt_parse=self.opt_parse, 
                         all_parses=self.all_parses, crel2probs=self.crel2probs, compute_feats=False)
        
        c.all_feats_array = [copy_dflt_dict(f) for f in self.all_feats_array]
        c.opt_features = copy_dflt_dict(self.opt_features)
        c.other_parses = self.other_parses
        c.other_features_array = [copy_dflt_dict(f) for f in self.other_features_array]
        c.other_costs_array = self.other_costs_array
        return c

def to_freq_feats(feats, freq_feats):
    new_feats = defaultdict(float)
    for f, v in feats.items():
        if f in freq_feats:
            new_feats[f] = v
    return new_feats

def filter_by_min_freq(xs, feat_freq, min_freq):
    if min_freq <= 1:
        return xs
    freq_feats = set((f for f, cnt in feat_freq.items() if cnt >= min_freq))
    for parser_input in xs:
        parser_input.opt_features = to_freq_feats(parser_input.opt_features, freq_feats)
        parser_input.other_features_array = [to_freq_feats(x, freq_feats)
                                             for x in parser_input.other_features_array]
    return xs

def accumulate_feat_vals(xs_train):
    def merge_feats(feats):
        for ft,val in feats.items():
            fts_vals[ft].append(val)
    
    fts_vals = defaultdict(list)
    cnt = 0
    for parser_input in xs_train:
        cnt+=1
        merge_feats(parser_input.opt_features)
        for x in parser_input.other_features_array:
            cnt+=1
            merge_feats(x)
    return fts_vals, cnt

def z_score_normalize_feats(xs_train, xs_test):
    fts_vals, cnt = accumulate_feat_vals(xs_train)
    
    fts_mean, fts_std = dict(), dict()
    for ft, vals in fts_vals.items():
        v_with_zeros = vals + ([0] * (cnt-len(vals)))
        std = np.std(v_with_zeros)
        if std == 0.0:
            fts_mean[ft] = 0
            fts_std[ft] = vals[0]
        else:
            fts_mean[ft] = np.mean(v_with_zeros)
            fts_std[ft] =  np.std(v_with_zeros)
    
    def to_z_score(fts):
        new_fts = defaultdict(fts.default_factory)
        for ft, val in fts.items():
            if ft in fts_mean:
                new_val = (val - fts_mean[ft])/fts_std[ft]
                if new_val:
                    new_fts[ft] = new_val
        return new_fts
    
    def z_score_normalize(parser_input):
        clone = parser_input.clone_without_feats()
        clone.opt_features = to_z_score(parser_input.opt_features)
        clone.all_feats_array = [to_z_score(x) for x in parser_input.all_feats_array]
        clone.other_features_array = [to_z_score(x) for x in parser_input.other_features_array]
        return clone
    
    new_xs_train = [z_score_normalize(x) for x in xs_train]
    new_xs_test  = [z_score_normalize(x) for x in xs_test]
    return new_xs_train, new_xs_test

def min_max_normalize_feats(xs_train, xs_test):
    fts_vals, cnt = accumulate_feat_vals(xs_train)
    
    fts_min, fts_range = dict(), dict()
    for ft, vals in fts_vals.items():
        v_with_zeros = vals + ([0] * (cnt-len(vals)))   
        min_val = np.min(v_with_zeros)
        range_val = np.max(v_with_zeros) - min_val
        fts_min[ft] = min_val
        fts_range[ft] = range_val
    
    def to_min_max_score(fts):
        new_fts = defaultdict(fts.default_factory)
        for ft, val in fts.items():
            if ft in fts_min and fts_range[ft] != 0:
                new_val = (val - fts_min[ft])/fts_range[ft]
                if new_val:
                    new_fts[ft] = new_val
        return new_fts
    
    def min_max_normalize(parser_input):
        clone = parser_input.clone_without_feats()
        clone.opt_features = to_min_max_score(parser_input.opt_features)
        clone.all_feats_array = [to_min_max_score(x) for x in parser_input.all_feats_array]
        clone.other_features_array = [to_min_max_score(x) for x in parser_input.other_features_array]
        return clone
    
    new_xs_train = [min_max_normalize(x) for x in xs_train]
    new_xs_test  = [min_max_normalize(x) for x in xs_test]
    return new_xs_train, new_xs_test

def get_crels_above(crel2maxprob, threshold):
    return [k for k, p in crel2maxprob.items() if p >= threshold]

def get_features_from_probabilities(essay2probs, top_n, min_feat_freq=1, min_prob=0.0):
    xs = []
    feat_freq = defaultdict(int)
    
    for ename, crel2probs in essay2probs.items():

        act_crels = name2crels[ename]
        crel2maxprob = get_max_probs(crel2probs)        
        crel2probs = dict(crel2probs)
        
        keys = list(crel2probs.keys())
        n_parses = 2 ** len(keys)
        
        increment = 0.05
        threshold = min_prob - increment
        while n_parses > 2 * top_n and threshold < 1.0:
            threshold += increment
            keys = get_crels_above(crel2maxprob, threshold)
            n_parses = 2 ** len(keys)

        if n_parses >  2 * top_n:
            print("n_parses={n_parses} still exceeded max={max_p} at p={p:.4f}".format(
                p=threshold, n_parses=n_parses, max_p=top_n))
            parses = get_top_parses(crel2maxprob)
        else:
            parses = get_all_combos(keys)

        # constrain optimal parse to only those crels that are predicted
        opt_parse = tuple(sorted(act_crels.intersection(crel2probs.keys())))
        x = ParserInputs(essay_name=ename, opt_parse=opt_parse, all_parses=parses, crel2probs=crel2probs)
        xs.append(x)

        # Get unique features for essay
        all_feats = set()
        for fts in x.all_feats_array:
            all_feats.update(fts.keys())

        for ft in all_feats:
            feat_freq[ft] += 1

    assert len(xs) == len(essay2probs), "Parses for all essays should be generated"
    return filter_by_min_freq(xs, feat_freq, min_feat_freq)

In [26]:
def add_cr_labels(observed_tags, ys_bytag_sent):
    global set_cr_tags
    for tag in set_cr_tags:
        if tag in observed_tags:
            ys_bytag_sent[tag].append(1)
        else:
            ys_bytag_sent[tag].append(0)
            
def evaluate_ranker(model, xs, essay2crels, ys_bytag):
    clone = model.clone()
    if hasattr(model, "average_weights"):
        clone.average_weights()

    pred_ys_bytag = defaultdict(list)
    ename2inps = dict()
    for parser_input in xs:
        ename2inps[parser_input.essay_name] = parser_input
    
    for ename, act_crels in essay2crels.items():        
        if ename not in ename2inps:
            # no predicted crels for this essay
            highest_ranked = set()
        else:
            parser_input = ename2inps[ename]
            ixs = clone.rank(parser_input.all_feats_array)
            highest_ranked = parser_input.all_parses[ixs[0]] # type: Tuple[str]        
            
        add_cr_labels(set(highest_ranked), pred_ys_bytag)

    mean_metrics = ResultsProcessor.compute_mean_metrics(ys_bytag, pred_ys_bytag)
    df = get_micro_metrics(metrics_to_df(mean_metrics))
    return df

def get_ranked_predictions(model, xs):
    clone = model.clone()
    if hasattr(model, "average_weights"):
        clone.average_weights()
        
    preds_by_essay = dict()
    for parser_input in xs:
        ixs = clone.rank(parser_input.all_feats_array)
        preds_by_essay[parser_input.essay_name] = (parser_input, ixs)        
    return preds_by_essay

In [27]:
from numpy.random import shuffle

def train_instance(parser_input, model):
    model.train(best_feats=parser_input.opt_features, other_feats_array=parser_input.other_features_array)

def train_cost_sensitive_instance(parser_input, model):
    model.train(best_feats=parser_input.opt_features, 
                other_feats_array=parser_input.other_features_array, other_costs_array=parser_input.other_costs_array)
    
def get_essays_for_data(xs):
    return [name2essay[x.essay_name] for x in xs]
    
def train_model(model, xs_train, xs_test, max_epochs=30, early_stop_iters=8, train_instance_fn=train_instance, verbose=True):
    test_accs = [-1]
    best_model = None
    best_test_accuracy = None
    num_declining_acc = 0

    train_essays = get_essays_for_data(xs_train)
    test_essays  = get_essays_for_data(xs_test)

    ys_by_tag_train = get_label_data_essay_level(train_essays)
    ys_by_tag_test  = get_label_data_essay_level(test_essays)

    essay2crels_train = essay_to_crels(train_essays)
    essay2crels_test  = essay_to_crels(test_essays)
    
    xs_train_copy = list(xs_train)    
    for i in range(max_epochs):
        shuffle(xs_train_copy)
        for parser_input in xs_train_copy:
            if len(parser_input.other_parses) > 0:
                train_instance_fn(parser_input, model)

        train_accuracy_df = evaluate_ranker(model, xs_train, essay2crels_train, ys_by_tag_train)
        test_accuracy_df  = evaluate_ranker(model, xs_test,  essay2crels_test,  ys_by_tag_test)
        train_accuracy = train_accuracy_df.iloc[0].to_dict()["f1_score"]
        test_accuracy  = test_accuracy_df.iloc[0].to_dict()["f1_score"]
        if verbose:
            print("Epoch: {epoch} Train Accuracy: {train_acc:.4f} Test Accuracy: {test_acc:.4f}".format(
            epoch=i,  train_acc=train_accuracy, test_acc=test_accuracy))
        if test_accuracy > max(test_accs):
            best_model = model.clone()
            best_test_accuracy = test_accuracy_df
            num_declining_acc = 0
        else:
            num_declining_acc += 1
            if num_declining_acc >= early_stop_iters:
                break
        test_accs.append(test_accuracy)
    if verbose:
        print("Best Test Acc: {acc:.4f}".format(acc=max(test_accs)))
    return best_model, best_test_accuracy

In [28]:
def get_essays2crels(essays, sr_model, top_n, search_mode_max_prob=False):
    trainessay2probs = defaultdict(list)
    for eix, essay in enumerate(essays):
        crel2probs = defaultdict(list)        
        for sent_ix, taggged_sentence in enumerate(essay.sentences):
            predicted_tags = essay.pred_tagged_sentences[sent_ix]
            unq_ptags = set([t for t in predicted_tags if t != EMPTY])            
            if len(unq_ptags) >= 2:
                pred_parses = sr_model.generate_all_potential_parses_for_sentence(
                    tagged_sentence=taggged_sentence, predicted_tags=predicted_tags, top_n=top_n, search_mode_max_prob=search_mode_max_prob)
                cr2p = collapse_sent_parse(pred_parses)
                merge_crel_probs(crel2probs, cr2p)
    
        if len(crel2probs) > 0:
            trainessay2probs[essay.name] = dict(crel2probs)
        else:
            trainessay2probs[essay.name] = dict()
    return trainessay2probs

In [29]:
def essay_to_crels_cv(cv_folds, models, top_n, search_mode_max_prob=False):
    essay2crelprobs = defaultdict(list)
    assert len(cv_folds) == len(models)
    for (train, test), mdl in zip(cv_folds, models):
        test2probs = get_essays2crels(test, mdl, top_n, search_mode_max_prob)
        for k,v in test2probs.items():
            assert k not in essay2crelprobs
            essay2crelprobs[k] = v
    return essay2crelprobs

In [30]:
def shuffle_split_dict(dct, train_pct):
    items = list(dct.items())
    np.random.shuffle(items)
    num_train = int(len(items) * train_pct)
    train_items, test_items = items[:num_train], items[num_train:]
    return dict(train_items), dict(test_items)

In [63]:
def train_model_fold(xs_train, xs_test, C, pa_type, loss_type, max_update_items, return_model=False):
    
    mdl = CostSensitiveMIRA(C=C, pa_type=pa_type, loss_type=loss_type, max_update_items=max_update_items, initial_weight=1)
    best_mdl, test_acc_df_ml = train_model(mdl, xs_train=xs_train, xs_test=xs_test, 
        max_epochs=20, early_stop_iters=5, train_instance_fn = train_cost_sensitive_instance, verbose=False)
    f1 = test_acc_df_ml["f1_score"].values[0]
    if return_model:
        return f1, best_mdl
    return f1

def train_model_parallel(cv_folds, C, pa_type, loss_type, max_update_items):
    try:
        f1s = Parallel(n_jobs=len(cv_folds))(delayed(train_model_fold)(train,test, C, pa_type, loss_type, max_update_items) 
                                         for (train,test) in cv_folds)
        return np.mean(f1s)
    except KeyboardInterrupt:
        print("Process stopped by user")
        
def train_model_serial(cv_folds, C, pa_type, loss_type, max_update_items):
    try:
        f1s = [train_model_fold(train,test, C, pa_type, loss_type, max_update_items) 
                for (train,test) in cv_folds]
        return np.mean(f1s)
    except KeyboardInterrupt:
        print("Process stopped by user")
        
def train_model_return_model_parallel(cv_folds, C, pa_type, loss_type, max_update_items):
    try:
        res = Parallel(n_jobs=len(cv_folds))(delayed(train_model_fold)(
            train,test, C, pa_type, loss_type, max_update_items, True) 
                                         for (train,test) in cv_folds)
        return res
    except KeyboardInterrupt:
        print("Process stopped by user")

In [32]:
# need to re-implement as assumes a parser input obj with other parses and we need for all parses
def get_costs(opt_parse, other_parses):
    other_costs = []
    op = set(opt_parse)
    for p in other_parses:
        p = set(p)
        fp = p - op
        fn = op - p
        cost = len(fp) + len(fn)
        other_costs.append(cost)
    return other_costs

## Test with Optimal Parameters and MM Nornalization

In [52]:
# initial settings for other params
best_top_n, best_C, best_max_upd, best_max_parses, best_min_prob = (2, 0.0025, 2, 300, 0.6)

In [53]:
%%time
xs_rerank = essay_to_crels_cv(cv_folds, models, top_n=best_top_n, search_mode_max_prob=False)
xs = get_features_from_probabilities(xs_rerank, best_max_parses, min_feat_freq=1, min_prob=best_min_prob)

cv_folds_rerank = cross_validation(xs, 5)
cv_folds_mm = [min_max_normalize_feats(train,test) for (train,test) in cv_folds_rerank]
# Parallelizing this takes longer as it uses a lot of RAM
#     cf_folds_mm = Parallel(n_jobs=len(cv_folds_rerank))(delayed(min_max_normalize_feats)(train,test) for (train,test) in cv_folds_rerank)

CPU times: user 1min 48s, sys: 11.7 s, total: 1min 59s
Wall time: 2min 2s


In [54]:
len(xs_rerank), len(xs), len(cv_folds_mm)

(902, 902, 5)

In [67]:
%%time
f1 = train_model_parallel(cv_folds=cv_folds_mm, C=best_C, pa_type=1, loss_type="ml", max_update_items=best_max_upd)
print(f1) # 0.74159216523265383

0.7414964790306262
CPU times: user 1min 54s, sys: 23.6 s, total: 2min 18s
Wall time: 30min 37s


In [64]:
%%time
# Parallel is faster but not 5X
f1 = train_model_serial(cv_folds=cv_folds_mm, C=best_C, pa_type=1, loss_type="ml", max_update_items=best_max_upd)
print(f1) # 0.74159216523265383

0.7414964790306262
CPU times: user 10min 4s, sys: 3.72 s, total: 10min 8s
Wall time: 10min 11s


In [68]:
# results = train_model_return_model_parallel(cv_folds=cv_folds_mm, C=best_C, pa_type=1, loss_type="ml", max_update_items=best_max_upd)

train, test = cv_folds_mm[0]
f1, mdl = train_model_fold(train, test, C=best_C, pa_type=1, loss_type="ml", max_update_items=best_max_upd, return_model=True)


In [75]:
preds = get_ranked_predictions(mdl, test)

In [76]:
ename2costs = dict()
for ename, (pi, ixs) in preds.items():
    all_costs = get_costs(pi.opt_parse, pi.all_parses)
    assert len(all_costs) == len(ixs)
    assert ename not in ename2costs, "Essay already processed: " + ename
    ename2costs[ename] = all_costs[ixs[0]]

In [77]:
train_essays = get_essays_for_data(train)
essay2crels_train = essay_to_crels(train_essays)

In [78]:
# Compute essay prior probability of crel from training data
tally_train_crel = defaultdict(int)
for ename, crels in essay2crels_train.items():
    for crel in crels:
        tally_train_crel[crel] += 1

ptrain_crels = defaultdict(float)
for crel, cnt in tally_train_crel.items():
    ptrain_crels[crel] = cnt / len(train)
        
# ptrain_crels

In [79]:
for ename, cost in sorted(ename2costs.items(), key = lambda tpl: -tpl[1])[0:10]:
    if cost == 0:
        continue
    (pi, ixs) = preds[ename]
    op = set(pi.opt_parse)
    highest_ranked = pi.all_parses[ixs[0]]
    p = set(highest_ranked)
    fp = p - op
    fn = op - p
    tp = op.intersection(p)
    print(cost, ename)
    print(pi.opt_parse)
    crel2probs = pi.crel2probs
    
    print(len(fp), "FP")
    for crel in fp:
        print("\t", crel.ljust(20), round(ptrain_crels[crel],5), "\t", sorted(crel2probs[crel]))
    
    print(len(fn), "FN:")
    for crel in fn:
        print("\t", crel.ljust(20), round(ptrain_crels[crel],5), "\t", sorted(crel2probs[crel]))
        
    print(len(tp), "TP:")
    for crel in tp:
        print("\t", crel.ljust(20), round(ptrain_crels[crel],5), "\t", sorted(crel2probs[crel]))
        
    print("*" * 120)

7 EBA1415_KNKC_3_CB_ES-05599.ann
('Causer:3->Result:4', 'Causer:6->Result:7')
7 FP
	 Causer:1->Result:2   0.1068 	 [0.751152411174496, 0.751152411174496]
	 Causer:11->Result:12 0.09154 	 [0.29981777519812597, 0.29981777519812597]
	 Causer:11->Result:13 0.09015 	 [0.637589255042635, 0.637589255042635, 0.7598836528356077, 0.7598836528356077]
	 Causer:11->Result:3  0.00555 	 [0.49973330151496564, 0.49973330151496564]
	 Causer:5->Result:5b  0.03606 	 [0.6512346037728104, 0.6512346037728104]
	 Causer:1->Result:7   0.00416 	 [0.6623580380462014, 0.6623580380462014]
	 Causer:1->Result:3   0.19556 	 [0.6700779538942299, 0.6700779538942299]
0 FN:
2 TP:
	 Causer:3->Result:4   0.14147 	 [0.7774053713797013, 0.7774053713797013]
	 Causer:6->Result:7   0.14286 	 [0.9848136734459447, 0.9848136734459447]
************************************************************************************************************************
6 EBA1415_BGJD_2_CB_ES-05751.ann
('Causer:3->Result:4', 'Causer:4->Result:14', 

In [80]:
sorted(mdl.weights.items(), key = lambda tpl: -tpl[1])[0:100]

[('num_crels<=4', 1.9249999999999803),
 ('num_crels<=3', 1.792499999999983),
 ('num_crels<=2', 1.7899999999999832),
 ('num_crels<=5', 1.7574999999999839),
 ('Max_Chain_Len=0', 1.7199999999999847),
 ('num_crels<=6', 1.5324999999999886),
 ('not_inverted', 1.517499999999989),
 ('num_crels<=1', 1.3999999999999915),
 ('num_crels=2', 1.3899999999999917),
 ('num_crels<=7', 1.374999999999992),
 ('All-Above-0.7', 1.3149999999999933),
 ('prod-prob', 1.2942094262027763),
 ('num_crels=0', 1.284999999999994),
 ('All-Above-0.8', 1.279999999999994),
 ('All-Above-0.9', 1.2449999999999948),
 ('num_crels=4', 1.1324999999999972),
 ('All-Above-0.95', 1.1274999999999973),
 ('num_crels=1', 1.1149999999999975),
 ('num_crels<=8', 1.1124999999999976),
 ('All-Above-0.5', 1.1124999999999976),
 ('%-Above-0.95', 1.0973005952381105),
 ('%-Above-0.9', 1.0612619047619152),
 ('min-prob', 1.0287657386023636),
 ('Causer:12->Result:13|Causer:3->Result:4', 1.0174999999999996),
 ('CChain:1,3,4,5,7', 1.0149999999999997),
 (

## TODO
- Feats
 - Valency - Num Causers and Effects for each code, and then highest count for each
 - Prior Prob per crel from training data - then usual prob feats from that array
 - Longest causal chain