In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import datetime
import logging
from collections import defaultdict

import dill
import numpy as np
import pymongo
import pandas as pd
from sklearn.linear_model import LogisticRegression
from typing import Any

from CrossValidation import cross_validation
from Settings import Settings
from cost_functions import *
from crel_helper import get_cr_tags
from function_helpers import get_function_names, get_functions_by_name
from results_procesor import ResultsProcessor, __MICRO_F1__
from searn_parser import SearnModelTemplateFeatures
from template_feature_extractor import *
from window_based_tagger_config import get_config
from wordtagginghelper import merge_dictionaries
from searn_parser_breadth_first import ParseActionResult, SearnModelBreadthFirst
from MIRA import MIRA, CostSensitiveMIRA
from joblib import Parallel, delayed

In [3]:
# Data Set Partition
CV_FOLDS = 5
MIN_FEAT_FREQ = 5

# Global settings
settings = Settings()
root_folder = settings.data_directory + "CoralBleaching/Thesis_Dataset/"
training_folder = root_folder + "Training" + "/"
test_folder = root_folder + "Test" + "/"

coref_root = root_folder + "CoReference/"
coref_output_folder = coref_root + "CRel/"

config = get_config(training_folder)

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/


In [4]:
train_fname = coref_output_folder + "training_crel_anatagged_essays_most_recent_code.dill"
with open(train_fname, "rb") as f:
    pred_tagged_essays_train = dill.load(f)

test_fname = coref_output_folder + "test_crel_anatagged_essays_most_recent_code.dill"
with open(test_fname, "rb") as f:
    pred_tagged_essays_test = dill.load(f)

len(pred_tagged_essays_train),len(pred_tagged_essays_test)

(902, 226)

In [5]:
EMPTY = "Empty"
from BrattEssay import ANAPHORA

def to_is_valid_crel(tags):
    filtered = set()
    for t in tags:
        t_lower = t.lower()
        if "rhetorical" in t_lower or "change" in t_lower or "other" in t_lower:
            continue
        if "->" in t and ANAPHORA not in t:
            filtered.add(t)
    return filtered

def get_crel_tags_by_sent(essays_a):
    crels_by_sent = []
    for ea in essays_a:
        for asent in ea.sentences:
            all_atags = set()
            for awd, atags in asent:
                all_atags.update(to_is_valid_crel(atags))
            crels_by_sent.append(all_atags)
    return crels_by_sent

In [6]:
cr_tags = get_cr_tags(train_tagged_essays=pred_tagged_essays_train, tag_essays_test=pred_tagged_essays_test)
cr_tags[0:10]

['Causer:5->Result:50',
 'Causer:7->Result:50',
 'Causer:3->Result:4',
 'Causer:13->Result:50',
 'Causer:1->Result:50',
 'Causer:11->Result:50',
 'Causer:6->Result:50',
 'Causer:3->Result:5',
 'Causer:4->Result:14',
 'Causer:3->Result:1']

In [7]:
set_cr_tags = set(cr_tags)

In [8]:
def evaluate_model_essay_level(
        folds: List[Tuple[Any, Any]],
        extractor_fn_names_lst: List[str],
        cost_function_name: str,
        beta: float,
        ngrams: int,
        stemmed: bool,
        max_epochs: int,
        down_sample_rate=1.0) -> float:

    if down_sample_rate < 1.0:
        new_folds = []  # type: List[Tuple[Any, Any]]
        for i, (essays_TD, essays_VD) in enumerate(folds):
            essays_TD = essays_TD[:int(down_sample_rate * len(essays_TD))]
            essays_VD = essays_VD[:int(down_sample_rate * len(essays_VD))]
            new_folds.append((essays_TD, essays_VD))
        folds = new_folds  # type: List[Tuple[Any, Any]]

    serial_results = [
        train_sr_parser(essays_TD, essays_VD, extractor_fn_names_lst, cost_function_name, ngrams, stemmed, beta, max_epochs)
        for essays_TD, essays_VD in folds
    ]

    cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag = defaultdict(list), defaultdict(list)
    cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag = defaultdict(list), defaultdict(list)

    # record the number of features in each fold
    number_of_feats = []

    # Parallel is almost 5X faster!!!
    parser_models = []
    for (model, num_feats,
         sent_td_ys_bycode, sent_vd_ys_bycode,
         sent_td_pred_ys_bycode, sent_vd_pred_ys_bycode) in serial_results:
        number_of_feats.append(num_feats)

        parser_models.append(model)
        merge_dictionaries(sent_td_ys_bycode, cv_sent_td_ys_by_tag)
        merge_dictionaries(sent_vd_ys_bycode, cv_sent_vd_ys_by_tag)
        merge_dictionaries(sent_td_pred_ys_bycode, cv_sent_td_predictions_by_tag)
        merge_dictionaries(sent_vd_pred_ys_bycode, cv_sent_vd_predictions_by_tag)

    # print(processor.results_to_string(sent_td_objectid, CB_SENT_TD, sent_vd_objectid, CB_SENT_VD, "SENTENCE"))
    return parser_models, cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag, cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag

In [9]:
def add_labels(observed_tags, ys_bytag_sent):
    global set_cr_tags
    for tag in set_cr_tags:
        if tag in observed_tags:
            ys_bytag_sent[tag].append(1)
        else:
            ys_bytag_sent[tag].append(0)
            
def get_label_data_essay_level(tagged_essays):
    global set_cr_tags
    # outputs
    ys_bytag_essay = defaultdict(list)

    for essay in tagged_essays:
        unique_cr_tags = set()
        for sentence in essay.sentences:
            for word, tags in sentence:
                unique_cr_tags.update(set_cr_tags.intersection(tags))
        add_labels(unique_cr_tags, ys_bytag_essay)
    return dict(ys_bytag_essay) # convert to dict so no issue when iterating over if additional keys are present

def essay_to_crels(tagged_essays):
    global set_cr_tags
    # outputs
    name2crels = defaultdict(set)
    for essay in tagged_essays:
        unique_cr_tags = set()
        for sentence in essay.sentences:
            for word, tags in sentence:
                unique_cr_tags.update(set_cr_tags.intersection(tags))
        name2crels[essay.name] = unique_cr_tags
    return dict(name2crels)

In [10]:
def metrics_to_df(metrics):
    import Rpfa

    rows = []
    for k,val in metrics.items():
        if type(val) == Rpfa.rpfa:
            d = dict(val.__dict__) # convert obj to dict
        elif type(val) == dict:
            d = dict(val)
        else:
            d = dict()
        d["code"] = k
        rows.append(d)
    return pd.DataFrame(rows)

def get_micro_metrics(df):
    return df[df.code == "MICRO_F1"][["accuracy", "f1_score", "recall", "precision"]]

def predict_essay_level(parser, essays):
    pred_ys_by_sent = defaultdict(list)
    for essay_ix, essay in enumerate(essays):
        unq_pre_relations = set()
        for sent_ix, taggged_sentence in enumerate(essay.sentences):
            predicted_tags = essay.pred_tagged_sentences[sent_ix]
            pred_relations = parser.predict_sentence(taggged_sentence, predicted_tags)
            unq_pre_relations.update(pred_relations)
        # Store predictions for evaluation
        add_labels(unq_pre_relations, pred_ys_by_sent)
    return pred_ys_by_sent

In [11]:
LINE_WIDTH = 80

# other settings
DOWN_SAMPLE_RATE = 1.0  # For faster smoke testing the algorithm
BASE_LEARNER_FACT = None
COLLECTION_PREFIX = "CR_CB_SHIFT_REDUCE_PARSER_TEMPLATED_MOST_RECENT_CODE"

# some of the other extractors aren't functional if the system isn't able to do a basic parse
# so the base extractors are the MVP for getting to a basic parser, then additional 'meta' parse
# features from all_extractors can be included
base_extractors = [
    single_words,
    word_pairs,
    three_words,
    between_word_features
]

all_extractor_fns = base_extractors + [
    word_distance,
    valency,
    unigrams,
    third_order,
    label_set,
    size_features
]

all_cost_functions = [
    micro_f1_cost,
    micro_f1_cost_squared,
    micro_f1_cost_plusone,
    micro_f1_cost_plusepsilon,
    binary_cost,
    inverse_micro_f1_cost,
    uniform_cost
]

all_extractor_fn_names = get_function_names(all_extractor_fns)
base_extractor_fn_names = get_function_names(base_extractors)
all_cost_fn_names = get_function_names(all_cost_functions)

ngrams = 1
stemmed = True
cost_function_name = micro_f1_cost_plusepsilon.__name__
dual = True
fit_intercept = True
beta = 0.5
max_epochs = 2
C = 0.5
penalty = "l2"

In [12]:
# Note these also differ for SC dataset
BASE_LEARNER_FACT = lambda : LogisticRegression(dual=dual, C=C, penalty=penalty, fit_intercept=fit_intercept)
best_extractor_names = ['single_words', 'between_word_features', 'label_set',
                                    'three_words', 'third_order', 'unigrams'] # type: List[str]

In [13]:
def train_sr_parser(essays_TD, essays_VD, extractor_names, cost_function_name, ngrams, stemmed, beta, max_epochs):
    extractors = get_functions_by_name(extractor_names, all_extractor_fns)
    # get single cost function
    cost_fn = get_functions_by_name([cost_function_name], all_cost_functions)[0]
    assert cost_fn is not None, "Cost function look up failed"
    # Ensure all extractors located
    assert len(extractors) == len(extractor_names), "number of extractor functions does not match the number of names"

    template_feature_extractor = NonLocalTemplateFeatureExtractor(extractors=extractors)
    if stemmed:
        ngram_extractor = NgramExtractorStemmed(max_ngram_len=ngrams)
    else:
        ngram_extractor = NgramExtractor(max_ngram_len=ngrams)
    parse_model = SearnModelBreadthFirst(feature_extractor=template_feature_extractor,
                                             cost_function=cost_fn,
                                             min_feature_freq=MIN_FEAT_FREQ,
                                             ngram_extractor=ngram_extractor, cr_tags=cr_tags,
                                             base_learner_fact=BASE_LEARNER_FACT,
                                             beta=beta,
                                             # log_fn=lambda s: print(s))
                                             log_fn=lambda s: None)

    parse_model.train(essays_TD, max_epochs=max_epochs)

    num_feats = template_feature_extractor.num_features()

    sent_td_ys_bycode = get_label_data_essay_level(essays_TD)
    sent_vd_ys_bycode = get_label_data_essay_level(essays_VD)

    sent_td_pred_ys_bycode = predict_essay_level(parse_model, essays_TD)
    sent_vd_pred_ys_bycode = predict_essay_level(parse_model, essays_VD)

    return parse_model, num_feats, sent_td_ys_bycode, sent_vd_ys_bycode, sent_td_pred_ys_bycode, sent_vd_pred_ys_bycode


In [14]:
test_folds     = [(pred_tagged_essays_train, pred_tagged_essays_test)]  # type: List[Tuple[Any,Any]]

In [15]:
cv_folds = cross_validation(pred_tagged_essays_train, CV_FOLDS)  # type: List[Tuple[Any,Any]]

## Essay Level Results

In [16]:
result_test_essay_level = evaluate_model_essay_level(
    folds=cv_folds,
    extractor_fn_names_lst=best_extractor_names,
    cost_function_name=cost_function_name,
    ngrams=ngrams,
    beta=beta,
    stemmed=stemmed,
    down_sample_rate=DOWN_SAMPLE_RATE,
    max_epochs=max_epochs)

## Train

In [17]:
models, cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag, cv_td_preds_by_sent, \
    cv_sent_vd_ys_by_tag = result_test_essay_level
    
mean_metrics = ResultsProcessor.compute_mean_metrics(cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag)
get_micro_metrics(metrics_to_df(mean_metrics))

Unnamed: 0,accuracy,f1_score,recall,precision
95,0.985716,0.780533,0.760949,0.801153


## Test (CV On Training Data)

In [18]:
models, cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag, \
    cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag = result_test_essay_level
    
mean_metrics = ResultsProcessor.compute_mean_metrics(cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag)
get_micro_metrics(metrics_to_df(mean_metrics))

Unnamed: 0,accuracy,f1_score,recall,precision
95,0.982968,0.742447,0.735401,0.749628


# Train Re-Ranker Model

In [19]:
from itertools import combinations

def get_possible_crels(predicted_tags):
    if len(predicted_tags) < 2:
        return set()
    predicted_tags = sorted(predicted_tags)
    pred_crels = set()
    for a,b in combinations(predicted_tags, 2):
        pred_crels.add("Causer:{a}->Result:{b}".format(a=a, b=b))
        pred_crels.add("Causer:{b}->Result:{a}".format(a=a, b=b))
    return pred_crels

def to_canonical_parse(crels):
    return tuple(sorted(crels))

def get_crels(parse):
    crels = set()
    p = parse
    while p:
        if p.relations:
            crels.update(p.relations)
        p = p.parent_action
    return crels

In [20]:
from searn_parser_breadth_first import geo_mean

def collapse_sent_parse(pred_parses):
    crel2prob = defaultdict(list)
    for pact in pred_parses:
        act_seq = pact.get_action_sequence()
        for act in act_seq:
            if not act.relations:
                continue

            assert act.lr_action_prob >= 0
            prob = geo_mean([act.action_prob * act.lr_action_prob])
            for r in act.relations:
                crel2prob[r].append(prob)
    return crel2prob

def merge_crel_probs(a, b):    
    for k,v in b.items():
        a[k].extend(v)
    return a

def get_max_probs(crel2probs):
    crel2max_prob = dict()
    for crel, probs in crel2probs.items():
        crel2max_prob[crel] = max(probs)
    return crel2max_prob

In [21]:
from itertools import combinations

def get_all_combos(items):
    # enforces a consistent ordering for the resulting tuples
    items = sorted(items) 
    cbos = [()] # seed with the empty combo
    for i in range(1, len(items)+1):
        cbos.extend(combinations(items,i))
    return cbos

cbos = get_all_combos([3,2,1])
print(len(cbos)) # 2**len(items)-1
if len(cbos) < 1000:
    for cbo in sorted(cbos, key = lambda l: (len(l), l)):
        print(cbo)

8
()
(1,)
(2,)
(3,)
(1, 2)
(1, 3)
(2, 3)
(1, 2, 3)


## Generate Parses

In [22]:
def to_parse(lst):
    return tuple(sorted(lst))

def sample_top_parses(crel2maxprobs, top_n):

    max_parses = 2**len(crel2maxprobs) # maximum parse combinations
    assert max_parses > top_n, (max_parses, top_n) # otherwise brute force it

    top_parses = set([()]) # always seed with the empty parse
    probs = []
    while len(top_parses) < top_n:
        new_parse = []
        for crel, prob in crel2maxprobs.items():
            rand_val = np.random.random() # random number >= 0 and < 1
            if rand_val < prob:
                new_parse.append(crel)
        # make hashable and enforce consistent order
        top_parses.add(to_parse(new_parse))
    
    return list(top_parses)

def get_top_parses(crel2maxprobs, threshold=0.5):
    top_parse = [crel for crel, prob in crel2maxprobs.items() if prob >= threshold]
    if top_parse:
        return [()] + [to_parse(top_parse)]
    else:
        return [()]

def get_top_n_parses(crel2maxprobs, top_n):
    top_parses = [()]
    by_prob = sorted(crel2maxprobs.keys(), key = lambda k: -crel2maxprobs[k])
    for i in range(1, min(top_n, len(crel2maxprobs))+1):
        parse = by_prob[:i]
        top_parses.append(to_parse(parse))
    return top_parses

def get_top_n_parses2(crel2maxprobs, top_n):
    top_parses = [()]
    by_prob = sorted(crel2maxprobs.keys(), key = lambda k: -crel2maxprobs[k])
    num_predicted = len([crel for crel in by_prob if crel2maxprobs[crel] >= 0.5])
    for i in range(num_predicted-1, len(by_prob)+1):
        parse = by_prob[:i]
        top_parses.append(to_parse(parse))
        if len(top_parses) > top_n:
            break
    return top_parses

crel_probs = {
    "1->2":   0.8,
    "2->3":   0.01,
    "5->8":   0.25,
    "10->12": 0.75,
    "12->50": 0.99,
    "3->4":   0.50,
}

# important - should see a lot more of the more probable codes
# sample_top_parses(crel_probs, 8)
get_top_n_parses2(crel_probs, 1)

[(), ('1->2', '10->12', '12->50')]

## Parser Feature Extraction

In [23]:
def generate_parses_from_sent_parses(sent_parses):
    if len(sent_parses) == 0:
        return []
    
    parses = []
    for parse, max_prob in sent_parses[0]:
        subsequent = generate_parses_from_sent_parses(sent_parses[1:])
        if not subsequent:
            parses.append([(parse, max_prob)])
        else:
            for items in generate_parses_from_sent_parses(sent_parses[1:]):
                parses.append([(parse, max_prob)] + items) 
    return parses
    
def flatten_sent_parse(sp):
    crels = set()
    probs = []
    for sent_parse, max_prob in sp:
        crels.update(sent_parse)
        probs.append(max_prob)
    return (to_canonical_parse(crels), geo_mean(probs))

# for a in generate_parses_from_sent_parses(x):
# #     print(a)
#     print(flatten_sent_parse(a))
#     print()

In [24]:
from NgramGenerator import compute_ngrams

def to_short_tag(tag):
    return tag.replace("Causer:","").replace("Result:", "")

def build_chains_inner(tree, l, visited, depth=0):
    chains = []
    if l not in tree:
        return chains
    for r in tree[l]:
        if r in visited:
            continue
        visited.add(r) # needed to prevent cycles, which cause infinite recursion
        extensions = build_chains_inner(tree, r, visited, depth+1)
        visited.remove(r)
        for ch in extensions:
            chains.append([r] + ch)
        if not extensions:
            chains.append([r])
    return chains

def build_chains(tree):    
    lhs_items = set(tree.keys())
    rhs_items = set()
    for l,rhs in tree.items():        
        rhs_items.update(rhs)
    
    chains = []
    # starting positions of each chain are those appearing on the lhs but not the rhs
    start_codes = lhs_items - rhs_items    
    for l in start_codes:
        rhs = tree[l]
        for r in rhs:
            for ch in build_chains_inner(tree, r, {l,r}, 0):
                chains.append([l,r] + ch)
    return chains

def extend_chains(chains):
    ext_chains = set()
    for tokens in chains:
        ext_chains.add(",".join(tokens))
        ngrams = compute_ngrams(tokens,max_len=None, min_len=3)
        for t in ngrams:
            ext_chains.add(",".join(t))
    return ext_chains

def extract_features_from_parse(parse, parse2probs):
    
    feats = defaultdict(float)
    tree = defaultdict(set) # maps causers to effects for building chains
    max_probs = []    
    code_tally = defaultdict(float)
    
    pairs = set()
    inverted_count = 0
    probs = parse2probs[parse]
    
    for crel in parse:
        # with type
        l,r = crel.split("->")
        code_tally[l] +=1
        code_tally[r] +=1
        
        # without type
        l_short, r_short = to_short_tag(l), to_short_tag(r)
        code_tally[l_short] +=1
        code_tally[r_short] +=1
        # ordering of the codes, ignoring the causal direction
        feats[l_short + ":" + r_short] = 1
        
        # build tree structure so we can retrieve the chains
        tree[l_short].add(r_short)
        
        # track whether the rule exists in the opposite direction
        pairs.add((l_short,r_short))
        if (r_short,l_short) in pairs:
            inverted_count += 1
            
    if inverted_count:
        feats["inverted"] = 1
        feats["num_inverted"] = inverted_count
    else:
        feats["not_inverted"] = 1
    
    # counts
    feats.update(code_tally)
    num_crels = len(parse)
    feats["num_crels"] = num_crels
    feats["num_crels="+str(len(parse))] = 1 # includes a tag for the empty parse
    for i in range(1,11):
        if num_crels <= i:
            feats["num_crels<={i}".format(i=i)] = 1
        else:
            feats["num_crels>{i}".format(i=i)] = 1
        
    # combination of crels
    # need to sort so that order of a and b is consistent across parses
    pairs = combinations(sorted(parse), r=2)
    for a, b in pairs:
        feats["{a}|{b}".format(a=a, b=b)] = 1
        
    #chains
    causer_chains = extend_chains(build_chains(tree))
    for ch in causer_chains:
        feats["CChain:" + ch] = 1
    
    if len(probs) > 0: # might be an empty parse
        for cutoff in [0.25, 0.5, 0.75, 0.9]:
            above =  len([p for p in probs if p >=cutoff])
            feats["Above-{cutoff}".format(cutoff=cutoff)] = above
            feats["%-Above-{cutoff}".format(cutoff=cutoff)] = above/len(probs)
            if above == len(max_probs):
                feats["All-Above-{cutoff}".format(cutoff=cutoff)] = 1
        
        feats["avg-prob"] = np.mean(probs)
        feats["med-prob"] = np.median(probs)
        feats["prod-prob"]= np.product(probs)
        feats["min-prob"] = np.min(probs)
        feats["max-prob"] = np.max(probs)
        # geometric mean
        feats["geo-mean"] = np.prod(probs)**(1/len(probs))
    return feats

In [25]:
all_essays = pred_tagged_essays_train + pred_tagged_essays_test
name2essay = {}
for essay in all_essays:
    name2essay[essay.name] = essay
    
name2crels = essay_to_crels(all_essays)

assert len(name2crels) == len(all_essays)

In [26]:
def compute_costs(parser_input):
    opt_parse = parser_input.opt_parse
    other_parses = parser_input.other_parses

    other_costs = []
    op = set(opt_parse)
    for p in other_parses:
        p = set(p)
        fp = p - op
        fn = op - p
        cost = len(fp) + len(fn)
        other_costs.append(cost)
    return other_costs

def copy_dflt_dict(d):
    copy = defaultdict(d.default_factory)
    copy.update(d)
    return copy

class ParserInputs(object):
    def __init__(self, essay_name, opt_parse, all_parses, parse2probs, compute_feats=True):
        self.essay_name = essay_name
        self.opt_parse = opt_parse
        self.parse2probs = parse2probs
        
        if compute_feats:
            self.opt_features = extract_features_from_parse(opt_parse, parse2probs)
            
            other_parses = []
            other_feats_array = []
            all_feats_array = []
            for p in all_parses:
                feats = extract_features_from_parse(p, parse2probs)
                all_feats_array.append(feats)
                if p != opt_parse:
                    other_parses.append(p)
                    other_feats_array.append(feats)

            self.all_feats_array = all_feats_array
            self.other_parses = other_parses
            self.other_features_array = other_feats_array
            self.other_costs_array = compute_costs(self)
                    
        self.all_parses = all_parses
        
    def clone_without_feats(self):
        c = ParserInputs(essay_name=self.essay_name, opt_parse=self.opt_parse, 
                         all_parses=self.all_parses, parse2probs=self.parse2probs, compute_feats=False)
        
        c.other_parses = self.other_parses
        c.other_costs_array = self.other_costs_array
        return c

    def clone(self):
        c = ParserInputs(essay_name=self.essay_name, opt_parse=self.opt_parse, 
                         all_parses=self.all_parses, parse2probs=self.parse2probs, compute_feats=False)
        
        c.all_feats_array = [copy_dflt_dict(f) for f in self.all_feats_array]
        c.opt_features = copy_dflt_dict(self.opt_features)
        c.other_parses = self.other_parses
        c.other_features_array = [copy_dflt_dict(f) for f in self.other_features_array]
        c.other_costs_array = self.other_costs_array
        return c

def to_freq_feats(feats, freq_feats):
    new_feats = defaultdict(float)
    for f, v in feats.items():
        if f in freq_feats:
            new_feats[f] = v
    return new_feats

def filter_by_min_freq(xs, feat_freq, min_freq):
    if min_freq <= 1:
        return xs
    freq_feats = set((f for f, cnt in feat_freq.items() if cnt >= min_freq))
    for parser_input in xs:
        parser_input.opt_features = to_freq_feats(parser_input.opt_features, freq_feats)
        parser_input.other_features_array = [to_freq_feats(x, freq_feats)
                                             for x in parser_input.other_features_array]
    return xs

def accumulate_feat_vals(xs_train):
    def merge_feats(feats):
        for ft,val in feats.items():
            fts_vals[ft].append(val)
    
    fts_vals = defaultdict(list)
    cnt = 0
    for parser_input in xs_train:
        cnt+=1
        merge_feats(parser_input.opt_features)
        for x in parser_input.other_features_array:
            cnt+=1
            merge_feats(x)
    return fts_vals, cnt

def z_score_normalize_feats(xs_train, xs_test):
    fts_vals, cnt = accumulate_feat_vals(xs_train)
    
    fts_mean, fts_std = dict(), dict()
    for ft, vals in fts_vals.items():
        v_with_zeros = vals + ([0] * (cnt-len(vals)))
        std = np.std(v_with_zeros)
        if std == 0.0:
            fts_mean[ft] = 0
            fts_std[ft] = vals[0]
        else:
            fts_mean[ft] = np.mean(v_with_zeros)
            fts_std[ft] =  np.std(v_with_zeros)
    
    def to_z_score(fts):
        new_fts = defaultdict(fts.default_factory)
        for ft, val in fts.items():
            if ft in fts_mean:
                new_val = (val - fts_mean[ft])/fts_std[ft]
                if new_val:
                    new_fts[ft] = new_val
        return new_fts
    
    def z_score_normalize(parser_input):
        clone = parser_input.clone_without_feats()
        clone.opt_features = to_z_score(parser_input.opt_features)
        clone.all_feats_array = [to_z_score(x) for x in parser_input.all_feats_array]
        clone.other_features_array = [to_z_score(x) for x in parser_input.other_features_array]
        return clone
    
    new_xs_train = [z_score_normalize(x) for x in xs_train]
    new_xs_test  = [z_score_normalize(x) for x in xs_test]
    return new_xs_train, new_xs_test

def min_max_normalize_feats(xs_train, xs_test):
    fts_vals, cnt = accumulate_feat_vals(xs_train)
    
    fts_min, fts_range = dict(), dict()
    for ft, vals in fts_vals.items():
        v_with_zeros = vals + ([0] * (cnt-len(vals)))   
        min_val = np.min(v_with_zeros)
        range_val = np.max(v_with_zeros) - min_val
        fts_min[ft] = min_val
        fts_range[ft] = range_val
    
    def to_min_max_score(fts):
        new_fts = defaultdict(fts.default_factory)
        for ft, val in fts.items():
            if ft in fts_min and fts_range[ft] != 0:
                new_val = (val - fts_min[ft])/fts_range[ft]
                if new_val:
                    new_fts[ft] = new_val
        return new_fts
    
    def min_max_normalize(parser_input):
        clone = parser_input.clone_without_feats()
        clone.opt_features = to_min_max_score(parser_input.opt_features)
        clone.all_feats_array = [to_min_max_score(x) for x in parser_input.all_feats_array]
        clone.other_features_array = [to_min_max_score(x) for x in parser_input.other_features_array]
        return clone
    
    new_xs_train = [min_max_normalize(x) for x in xs_train]
    new_xs_test  = [min_max_normalize(x) for x in xs_test]
    return new_xs_train, new_xs_test

def get_features_from_probabilities(essay2sentparses, min_feat_freq=1):
    xs = []
    feat_freq = defaultdict(int)
    
    for ename, sentparses in essay2sentparses.items():
        act_crels = name2crels[ename]
            
        parses_wth_probs = generate_parses_from_sent_parses(sentparses)
        parse2probs = defaultdict(list)
        for pp in parses_wth_probs:
            parse, gm_prob = flatten_sent_parse(pp)
            parse = to_canonical_parse(parse)
            parse2probs[parse].append(gm_prob)

        # constrain optimal parse to only those crels that are predicted
        opt_parse = to_canonical_parse(act_crels.intersection(parse2probs.keys()))
        x = ParserInputs(essay_name=ename, opt_parse=opt_parse, all_parses=list(parse2probs.keys()), parse2probs=parse2probs)
        xs.append(x)

        # Get unique features for essay
        all_feats = set()
        for fts in x.all_feats_array:
            all_feats.update(fts.keys())

        for ft in all_feats:
            feat_freq[ft] += 1

    assert len(xs) == len(essay2sentparses), "Parses for all essays should be generated"
    return filter_by_min_freq(xs, feat_freq, min_feat_freq)

In [27]:
def add_cr_labels(observed_tags, ys_bytag_sent):
    global set_cr_tags
    for tag in set_cr_tags:
        if tag in observed_tags:
            ys_bytag_sent[tag].append(1)
        else:
            ys_bytag_sent[tag].append(0)
            
def evaluate_ranker(model, xs, essay2crels, ys_bytag):
    clone = model.clone()
    if hasattr(model, "average_weights"):
        clone.average_weights()
    rank_acc = []
    pred_ys_bytag = defaultdict(list)
    ename2inps = dict()
    for parser_input in xs:
        ename2inps[parser_input.essay_name] = parser_input
    
    for ename, act_crels in essay2crels.items():        
        if ename not in ename2inps:
            # no predicted crels for this essay
            highest_ranked = set()
        else:
            parser_input = ename2inps[ename]
            ixs = clone.rank(parser_input.all_feats_array)
            highest_ranked = parser_input.all_parses[ixs[0]] # type: Tuple[str]        
            rank_acc.append(1 if highest_ranked == parser_input.opt_parse else 0)
            
        add_cr_labels(set(highest_ranked), pred_ys_bytag)

    mean_metrics = ResultsProcessor.compute_mean_metrics(ys_bytag, pred_ys_bytag)
    df = get_micro_metrics(metrics_to_df(mean_metrics))
    df["rank_acc"] = np.mean(rank_acc)
    return df

In [28]:
from numpy.random import shuffle

def train_instance(parser_input, model):
    model.train(best_feats=parser_input.opt_features, other_feats_array=parser_input.other_features_array)

def train_cost_sensitive_instance(parser_input, model):
    model.train(best_feats=parser_input.opt_features, 
                other_feats_array=parser_input.other_features_array, other_costs_array=parser_input.other_costs_array)
    
def get_essays_for_data(xs):
    return [name2essay[x.essay_name] for x in xs]
    
def train_model(model, xs_train, xs_test, max_epochs=30, early_stop_iters=8, train_instance_fn=train_instance, verbose=True):
    test_accs = [-1]
    best_model = None
    best_test_accuracy = None
    num_declining_acc = 0

    train_essays = get_essays_for_data(xs_train)
    test_essays  = get_essays_for_data(xs_test)

    ys_by_tag_train = get_label_data_essay_level(train_essays)
    ys_by_tag_test  = get_label_data_essay_level(test_essays)

    essay2crels_train = essay_to_crels(train_essays)
    essay2crels_test  = essay_to_crels(test_essays)
    
    xs_train_copy = list(xs_train)    
    for i in range(max_epochs):
        shuffle(xs_train_copy)
        for parser_input in xs_train_copy:
            if len(parser_input.other_parses) > 0:
                train_instance_fn(parser_input, model)

        train_accuracy_df = evaluate_ranker(model, xs_train, essay2crels_train, ys_by_tag_train)
        test_accuracy_df  = evaluate_ranker(model, xs_test,  essay2crels_test,  ys_by_tag_test)
        train_accuracy = train_accuracy_df.iloc[0].to_dict()["f1_score"]
        test_accuracy  = test_accuracy_df.iloc[0].to_dict()["f1_score"]
        if verbose:
            print("Epoch: {epoch} Train Accuracy: {train_acc:.4f} Test Accuracy: {test_acc:.4f}".format(
            epoch=i,  train_acc=train_accuracy, test_acc=test_accuracy))
        if test_accuracy > max(test_accs):
            best_model = model.clone()
            best_test_accuracy = test_accuracy_df
            num_declining_acc = 0
        else:
            num_declining_acc += 1
            if num_declining_acc >= early_stop_iters:
                break
        test_accs.append(test_accuracy)
    if verbose:
        print("Best Test Acc: {acc:.4f}".format(acc=max(test_accs)))
    return best_model, best_test_accuracy

In [29]:
def get_top_sent_parses(pred_parses):
    max_parse_probs = defaultdict(float)
    for pp in pred_parses:
        crels = get_crels(pp)
        parse = to_canonical_parse(crels)
        max_parse_probs[parse] = max(max_parse_probs[parse], pp.cum_prob)

    return sorted(max_parse_probs.items(), key = lambda tpl: -tpl[1])


In [65]:
def get_essays2sentparses(essays, sr_model, beam_size, top_n_parses, search_mode_max_prob=False):
    trainessay2parses = defaultdict(list)
    for eix, essay in enumerate(essays):
        crel2probs = defaultdict(list)
        for sent_ix, taggged_sentence in enumerate(essay.sentences):
            predicted_tags = essay.pred_tagged_sentences[sent_ix]
            unq_ptags = set([t for t in predicted_tags if t != EMPTY])            
            if len(unq_ptags) >= 2:
                pred_parses = sr_model.generate_all_potential_parses_for_sentence(
                    tagged_sentence=taggged_sentence, predicted_tags=predicted_tags, top_n=beam_size, search_mode_max_prob=search_mode_max_prob)
                top_parses = get_top_sent_parses(pred_parses)[:top_n_parses]
                if len(top_parses) > 0:
                    trainessay2parses[essay.name].append(top_parses)
        if essay.name not in trainessay2parses:
            trainessay2parses[essay.name].append( [((),1.0)])
    return trainessay2parses

In [66]:
def essay_to_sentparses_cv(cv_folds, models, beam_size, top_n_parses, search_mode_max_prob=False):
    essay2sentparses = defaultdict(list)
    assert len(cv_folds) == len(models)
    for (train, test), mdl in zip(cv_folds, models):
        test2sp = get_essays2sentparses(test, mdl, beam_size, top_n_parses, search_mode_max_prob)
        for k,v in test2sp.items():
            assert k not in essay2sentparses
            essay2sentparses[k] = v
    return essay2sentparses

In [67]:
def shuffle_split_dict(dct, train_pct):
    items = list(dct.items())
    np.random.shuffle(items)
    num_train = int(len(items) * train_pct)
    train_items, test_items = items[:num_train], items[num_train:]
    return dict(train_items), dict(test_items)

In [68]:
def train_model_fold(xs_train, xs_test, C, pa_type, loss_type, max_update_items):
    mdl = CostSensitiveMIRA(C=C, pa_type=pa_type, loss_type=loss_type, max_update_items=max_update_items, initial_weight=1)
    best_mdl, test_acc_df_ml = train_model(mdl, xs_train=xs_train, xs_test=xs_test, 
        max_epochs=20, early_stop_iters=5, train_instance_fn = train_cost_sensitive_instance, verbose=False)
    f1 = test_acc_df_ml["f1_score"].values[0]
    return f1

def train_model_parallel(cv_folds, C, pa_type, loss_type, max_update_items):
    try:
        f1s = Parallel(n_jobs=len(cv_folds))(delayed(train_model_fold)(train,test, C, pa_type, loss_type, max_update_items) 
                                         for (train,test) in cv_folds)
        return np.mean(f1s)
    except KeyboardInterrupt:
        print("Process stopped by user")

In [70]:
# %%time
# TOP_N = 10 # 10 is better
xs_rerank = essay_to_sentparses_cv(cv_folds, models, beam_size=2, top_n_parses=2)
assert len(xs_rerank) == len(pred_tagged_essays_train)

In [71]:
for td, vd in cv_folds:
    print(len(td), len(vd))

721 181
721 181
722 180
722 180
722 180


In [72]:
len(xs_rerank), len(pred_tagged_essays_train), type(xs_rerank)

(902, 902, collections.defaultdict)

In [73]:
# initial settings for other params
best_C, best_max_upd = (0.01, 1)

In [74]:
xs = get_features_from_probabilities(xs_rerank, min_feat_freq=1)

In [75]:
type(xs),len(xs)

(list, 902)

In [76]:
cv_folds_rerank = cross_validation(xs, 5)
cv_folds_mm = [min_max_normalize_feats(train,test) for (train,test) in cv_folds_rerank]

In [77]:
f1 = train_model_parallel(cv_folds=cv_folds_mm, C=best_C, pa_type=1, loss_type="ml", max_update_items=best_max_upd)
f1

0.7392884637765149

## Optimize for Beam Size and Top N Parses

In [None]:
best_beam_size = -1
best_top_n_parses = -1
best_f1 = -1

for top_nparses in [1,2,3,5]:
    for beam_size in [1,2,3,5,10]:
        if top_nparses > beam_size:
            continue
    
        print("Beam: {beam_size} TopNP: {top_nparses}".format(beam_size=beam_size, top_nparses=top_nparses))
        xs_rerank = essay_to_sentparses_cv(cv_folds, models, beam_size=beam_size, top_n_parses=top_nparses)
        xs = get_features_from_probabilities(xs_rerank, min_feat_freq=1)
        cv_folds_rerank = cross_validation(xs, 5)
        cv_folds_mm = [min_max_normalize_feats(train,test) for (train,test) in cv_folds_rerank]
        f1 = train_model_parallel(cv_folds=cv_folds_mm, C=best_C, pa_type=1, loss_type="ml", max_update_items=best_max_upd)
        
        print("F1: {f1:.4f}".format(f1=f1))
        if f1 > best_f1:
            best_f1 = f1
            print("*" * 80)

Beam: 1 TopNP: 1
F1: 0.7423
********************************************************************************
Beam: 2 TopNP: 1
F1: 0.7422
Beam: 3 TopNP: 1
F1: 0.7422
Beam: 5 TopNP: 1
F1: 0.7423
Beam: 10 TopNP: 1
F1: 0.7426
********************************************************************************
Beam: 2 TopNP: 2


## Test with Optimal Parameters and MM Nornalization

In [47]:
best_f1 = -1
for pa_type in [0, 1, 2]:
    for loss_type in ["ml", "pb"]:
        for C in [0.001, 0.01, 0.1,1]:
            f1 = train_model_parallel(cv_folds=cv_folds_mm, C=C, pa_type=pa_type, loss_type=loss_type, max_update_items=best_max_upd)
            print("F1={f1:.4f} \t C={C:.4f} \t PA={pa_type} \t Loss Type={loss_type}".format(C=C, f1=f1, pa_type=pa_type, loss_type=loss_type))
            if f1 > best_f1:
                best_f1 = f1
                print("*" * 80)

F1=0.7256 	 C=0.0010 	 PA=0 	 Loss Type=ml
********************************************************************************
F1=0.7256 	 C=0.0100 	 PA=0 	 Loss Type=ml
F1=0.7256 	 C=0.1000 	 PA=0 	 Loss Type=ml
F1=0.7256 	 C=1.0000 	 PA=0 	 Loss Type=ml
F1=0.7256 	 C=0.0010 	 PA=0 	 Loss Type=pb
F1=0.7256 	 C=0.0100 	 PA=0 	 Loss Type=pb
F1=0.7256 	 C=0.1000 	 PA=0 	 Loss Type=pb
F1=0.7256 	 C=1.0000 	 PA=0 	 Loss Type=pb
F1=0.7434 	 C=0.0010 	 PA=1 	 Loss Type=ml
********************************************************************************
F1=0.7418 	 C=0.0100 	 PA=1 	 Loss Type=ml
F1=0.7331 	 C=0.1000 	 PA=1 	 Loss Type=ml
F1=0.7303 	 C=1.0000 	 PA=1 	 Loss Type=ml
F1=0.7434 	 C=0.0010 	 PA=1 	 Loss Type=pb
F1=0.7418 	 C=0.0100 	 PA=1 	 Loss Type=pb
F1=0.7315 	 C=0.1000 	 PA=1 	 Loss Type=pb
F1=0.7287 	 C=1.0000 	 PA=1 	 Loss Type=pb
F1=0.7383 	 C=0.0010 	 PA=2 	 Loss Type=ml
F1=0.7157 	 C=0.0100 	 PA=2 	 Loss Type=ml
F1=0.7374 	 C=0.1000 	 PA=2 	 Loss Type=ml
F1=0.7321 	 C=1.0000 

## Optimize for Beam Size

In [None]:
%%time

max_f1 = -1
topn2metrics = defaultdict(list)
best_top_n = -1

for top_n in [2,3,5,10,20,50,100]:    
    print("top_n: {top_n}".format(top_n=top_n))
    
    xs_rerank = essay_to_crels_cv(cv_folds, models, top_n=top_n)
    xs = get_features_from_probabilities(xs_rerank, best_max_parses, min_feat_freq=1, min_prob=best_min_prob)
    
    cv_folds_rerank = cross_validation(xs, 3)
    
    f1 = train_model_parallel(cv_folds=cv_folds_rerank, C=best_C, pa_type=1, loss_type="ml", 
                              max_update_items=best_max_upd)             
    topn2metrics[top_n].append(f1)
    print("F1: {f1:.4f}".format(f1=f1))
    if f1 > max_f1:
        print("*" * 80)
        max_f1 = f1
        best_top_n = top_n
        print("New Max F1: {f1:.4f} \tTop N: {top_n}".format(f1=max_f1, top_n=top_n))
    print()

## Optimize for C and Max Upd

In [64]:
%%time
max_f1 = -1

xs_rerank = essay_to_crels_cv(cv_folds, models, top_n=best_top_n)
xs = get_features_from_probabilities(xs_rerank, best_max_parses, min_feat_freq=1, min_prob=best_min_prob)
    
cv_folds_rerank = cross_validation(xs, 3)

CPU times: user 43.8 s, sys: 314 ms, total: 44.1 s
Wall time: 44.1 s


In [65]:
best_C = -1
best_max_upd = -1
c2metrics = defaultdict(list)

for C in [0.0025, 0.005, 0.01, 0.025, 0.05, 0.1][::-1]:
    for max_upd in [1, 2, 3, 5, 10]:
        print("C: {c} Max_Upd:{max_upd}".format(c=C, max_upd=max_upd))

        f1 = train_model_parallel(cv_folds=cv_folds_rerank, C=C, pa_type=1, loss_type="ml", 
                              max_update_items=max_upd)

        c2metrics[(C,max_upd)].append(f1)
        print("F1: {f1:.4f}".format(f1=f1))
        if f1 > max_f1:
            print("*" * 80)
            max_f1 = f1
            best_C=C
            best_max_upd = max_upd
            print("New Max F1: {f1:.4f} \tC: {C} \tMax_Upd: {max_upd}".format(f1=max_f1, C=C, max_upd=max_upd))

C: 0.1 Max_Upd:1
F1: 0.7335
********************************************************************************
New Max F1: 0.7335 	C: 0.1 	Max_Upd: 1
C: 0.1 Max_Upd:2
F1: 0.7315
C: 0.1 Max_Upd:3
F1: 0.7323
C: 0.1 Max_Upd:5
F1: 0.7347
********************************************************************************
New Max F1: 0.7347 	C: 0.1 	Max_Upd: 5
C: 0.1 Max_Upd:10
F1: 0.7331
C: 0.05 Max_Upd:1
F1: 0.7373
********************************************************************************
New Max F1: 0.7373 	C: 0.05 	Max_Upd: 1
C: 0.05 Max_Upd:2
F1: 0.7359
C: 0.05 Max_Upd:3
F1: 0.7379
********************************************************************************
New Max F1: 0.7379 	C: 0.05 	Max_Upd: 3
C: 0.05 Max_Upd:5
F1: 0.7363
C: 0.05 Max_Upd:10
F1: 0.7335
C: 0.025 Max_Upd:1
F1: 0.7411
********************************************************************************
New Max F1: 0.7411 	C: 0.025 	Max_Upd: 1
C: 0.025 Max_Upd:2
F1: 0.7415
*************************************************

In [66]:
best_top_n, best_C, best_max_upd

(2, 0.01, 1)

## Optimize for MAX Parses

In [79]:
%%time

max_f1 = -1
topn2metrics2 = defaultdict(list)
best_max_parses = 300

xs_rerank = essay_to_crels_cv(cv_folds, models, top_n=best_top_n)

for max_parses in [50, 75, 100, 150, 200, 300, 500]:
    
    print("max_parses: {max_parses}".format(max_parses=max_parses))
    xs = get_features_from_probabilities(xs_rerank, max_parses, min_feat_freq=1, min_prob=best_min_prob)    
    cv_folds_rerank = cross_validation(xs, 3)
    
    f1 = train_model_parallel(cv_folds=cv_folds_rerank, C=best_C, pa_type=1, loss_type="ml", 
                              max_update_items=best_max_upd)             
    print("F1: {f1:.4f}".format(f1=f1))
    topn2metrics2[max_parses].append(f1)
    if f1 > max_f1:
        print("*" * 80)
        max_f1 = f1
        best_max_parses = max_parses
        print("New Max F1: {f1:.4f} \tMax Parses: {max_parses}".format(f1=max_f1, max_parses=max_parses))

max_parses: 50
F1: 0.7195
********************************************************************************
New Max F1: 0.7195 	Max Parses: 50
max_parses: 75
F1: 0.7332
********************************************************************************
New Max F1: 0.7332 	Max Parses: 75
max_parses: 100
F1: 0.7332
max_parses: 150
F1: 0.7411
********************************************************************************
New Max F1: 0.7411 	Max Parses: 150
max_parses: 200
F1: 0.7411
max_parses: 300
F1: 0.7417
********************************************************************************
New Max F1: 0.7417 	Max Parses: 300
max_parses: 500
F1: 0.7417
CPU times: user 3min 4s, sys: 5.61 s, total: 3min 10s
Wall time: 30min 27s


In [80]:
best_max_parses

300

## Optimize Probability Threshold

In [81]:
%%time

max_f1 = -1
topprob2metrics = defaultdict(list)

max_parses = best_max_parses
best_min_prob = -1

xs_rerank = essay_to_crels_cv(cv_folds, models, top_n=best_top_n)

for min_prob in [0.0, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6]:
    
    print("min_prob: {min_prob}".format(min_prob=min_prob))
    
    xs = get_features_from_probabilities(xs_rerank, best_max_parses, min_feat_freq=1, min_prob=min_prob)    
    cv_folds_rerank = cross_validation(xs, 3)
        
    f1 = train_model_parallel(cv_folds=cv_folds_rerank, C=best_C, pa_type=1, loss_type="ml", max_update_items=best_max_upd)             
    topprob2metrics[min_prob].append(f1)
    if f1 > max_f1:
        print("*" * 80)
        max_f1 = f1
        best_min_prob = min_prob
        print("New Max F1: {f1:.4f} \tMin P: {min_prob}".format(f1=max_f1, min_prob=min_prob))

min_prob: 0.0
********************************************************************************
New Max F1: 0.7402 	Min P: 0.0
min_prob: 0.05
min_prob: 0.1
min_prob: 0.2
min_prob: 0.3
min_prob: 0.4
min_prob: 0.5
min_prob: 0.6
********************************************************************************
New Max F1: 0.7405 	Min P: 0.6
CPU times: user 4min 47s, sys: 8.75 s, total: 4min 56s
Wall time: 31min 26s


In [82]:
best_top_n, best_C, best_max_upd, best_max_parses, best_min_prob

(2, 0.01, 1, 300, 0.6)

## TODO
- Try Sampling from the Predicted Parses