In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import datetime
import logging
from collections import defaultdict

import dill
import numpy as np
import pymongo
import pandas as pd
from sklearn.linear_model import LogisticRegression
from typing import Any

from CrossValidation import cross_validation
from Settings import Settings
from cost_functions import *
from crel_helper import get_cr_tags
from function_helpers import get_function_names, get_functions_by_name
from results_procesor import ResultsProcessor, __MICRO_F1__
from searn_parser import SearnModelTemplateFeatures
from template_feature_extractor import *
from window_based_tagger_config import get_config
from wordtagginghelper import merge_dictionaries
from searn_parser_breadth_first import ParseActionResult, SearnModelBreadthFirst

In [3]:
# Data Set Partition
CV_FOLDS = 5
MIN_FEAT_FREQ = 5

# Global settings
settings = Settings()
root_folder = settings.data_directory + "CoralBleaching/Thesis_Dataset/"
training_folder = root_folder + "Training" + "/"
test_folder = root_folder + "Test" + "/"

coref_root = root_folder + "CoReference/"
coref_output_folder = coref_root + "CRel/"

config = get_config(training_folder)

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/


In [4]:
train_fname = coref_output_folder + "training_crel_anatagged_essays_most_recent_code.dill"
with open(train_fname, "rb") as f:
    pred_tagged_essays_train = dill.load(f)

test_fname = coref_output_folder + "test_crel_anatagged_essays_most_recent_code.dill"
with open(test_fname, "rb") as f:
    pred_tagged_essays_test = dill.load(f)

len(pred_tagged_essays_train),len(pred_tagged_essays_test)

(902, 226)

In [5]:
EMPTY = "Empty"
from BrattEssay import ANAPHORA

def to_is_valid_crel(tags):
    filtered = set()
    for t in tags:
        t_lower = t.lower()
        if "rhetorical" in t_lower or "change" in t_lower or "other" in t_lower:
            continue
        if "->" in t and ANAPHORA not in t:
            filtered.add(t)
    return filtered

def get_crel_tags_by_sent(essays_a):
    crels_by_sent = []
    for ea in essays_a:
        for asent in ea.sentences:
            all_atags = set()
            for awd, atags in asent:
                all_atags.update(to_is_valid_crel(atags))
            crels_by_sent.append(all_atags)
    return crels_by_sent

In [6]:
cr_tags = get_cr_tags(train_tagged_essays=pred_tagged_essays_train, tag_essays_test=pred_tagged_essays_test)
cr_tags[0:10]

['Causer:5->Result:50',
 'Causer:7->Result:50',
 'Causer:3->Result:4',
 'Causer:13->Result:50',
 'Causer:1->Result:50',
 'Causer:11->Result:50',
 'Causer:6->Result:50',
 'Causer:3->Result:5',
 'Causer:4->Result:14',
 'Causer:3->Result:1']

In [7]:
set_cr_tags = set(cr_tags)

In [8]:
def evaluate_model_essay_level(
        folds: List[Tuple[Any, Any]],
        extractor_fn_names_lst: List[str],
        cost_function_name: str,
        beta: float,
        ngrams: int,
        stemmed: bool,
        max_epochs: int,
        down_sample_rate=1.0) -> float:

    if down_sample_rate < 1.0:
        new_folds = []  # type: List[Tuple[Any, Any]]
        for i, (essays_TD, essays_VD) in enumerate(folds):
            essays_TD = essays_TD[:int(down_sample_rate * len(essays_TD))]
            essays_VD = essays_VD[:int(down_sample_rate * len(essays_VD))]
            new_folds.append((essays_TD, essays_VD))
        folds = new_folds  # type: List[Tuple[Any, Any]]

    serial_results = [
        train_sr_parser(essays_TD, essays_VD, extractor_fn_names_lst, cost_function_name, ngrams, stemmed, beta, max_epochs)
        for essays_TD, essays_VD in folds
    ]

    cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag = defaultdict(list), defaultdict(list)
    cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag = defaultdict(list), defaultdict(list)

    # record the number of features in each fold
    number_of_feats = []

    # Parallel is almost 5X faster!!!
    parser_models = []
    for (model, num_feats,
         sent_td_ys_bycode, sent_vd_ys_bycode,
         sent_td_pred_ys_bycode, sent_vd_pred_ys_bycode) in serial_results:
        number_of_feats.append(num_feats)

        parser_models.append(model)
        merge_dictionaries(sent_td_ys_bycode, cv_sent_td_ys_by_tag)
        merge_dictionaries(sent_vd_ys_bycode, cv_sent_vd_ys_by_tag)
        merge_dictionaries(sent_td_pred_ys_bycode, cv_sent_td_predictions_by_tag)
        merge_dictionaries(sent_vd_pred_ys_bycode, cv_sent_vd_predictions_by_tag)

    # print(processor.results_to_string(sent_td_objectid, CB_SENT_TD, sent_vd_objectid, CB_SENT_VD, "SENTENCE"))
    return parser_models, cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag, cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag

In [9]:
def add_labels(observed_tags, ys_bytag_sent):
    global set_cr_tags
    for tag in set_cr_tags:
        if tag in observed_tags:
            ys_bytag_sent[tag].append(1)
        else:
            ys_bytag_sent[tag].append(0)
            
def get_label_data_essay_level(tagged_essays):
    global set_cr_tags
    # outputs
    ys_bytag_essay = defaultdict(list)

    for essay in tagged_essays:
        unique_cr_tags = set()
        for sentence in essay.sentences:
            for word, tags in sentence:
                unique_cr_tags.update(set_cr_tags.intersection(tags))
        add_labels(unique_cr_tags, ys_bytag_essay)
    return dict(ys_bytag_essay) # convert to dict so no issue when iterating over if additional keys are present

def essay_to_crels(tagged_essays):
    global set_cr_tags
    # outputs
    name2crels = defaultdict(set)
    for essay in tagged_essays:
        unique_cr_tags = set()
        for sentence in essay.sentences:
            for word, tags in sentence:
                unique_cr_tags.update(set_cr_tags.intersection(tags))
        name2crels[essay.name] = unique_cr_tags
    return dict(name2crels)

In [10]:
def metrics_to_df(metrics):
    import Rpfa

    rows = []
    for k,val in metrics.items():
        if type(val) == Rpfa.rpfa:
            d = dict(val.__dict__) # convert obj to dict
        elif type(val) == dict:
            d = dict(val)
        else:
            d = dict()
        d["code"] = k
        rows.append(d)
    return pd.DataFrame(rows)

def get_micro_metrics(df):
    return df[df.code == "MICRO_F1"][["accuracy", "f1_score", "recall", "precision"]]

def predict_essay_level(parser, essays):
    pred_ys_by_sent = defaultdict(list)
    for essay_ix, essay in enumerate(essays):
        unq_pre_relations = set()
        for sent_ix, taggged_sentence in enumerate(essay.sentences):
            predicted_tags = essay.pred_tagged_sentences[sent_ix]
            pred_relations = parser.predict_sentence(taggged_sentence, predicted_tags)
            unq_pre_relations.update(pred_relations)
        # Store predictions for evaluation
        add_labels(unq_pre_relations, pred_ys_by_sent)
    return pred_ys_by_sent

In [11]:
LINE_WIDTH = 80

# other settings
DOWN_SAMPLE_RATE = 1.0  # For faster smoke testing the algorithm
BASE_LEARNER_FACT = None
COLLECTION_PREFIX = "CR_CB_SHIFT_REDUCE_PARSER_TEMPLATED_MOST_RECENT_CODE"

# some of the other extractors aren't functional if the system isn't able to do a basic parse
# so the base extractors are the MVP for getting to a basic parser, then additional 'meta' parse
# features from all_extractors can be included
base_extractors = [
    single_words,
    word_pairs,
    three_words,
    between_word_features
]

all_extractor_fns = base_extractors + [
    word_distance,
    valency,
    unigrams,
    third_order,
    label_set,
    size_features
]

all_cost_functions = [
    micro_f1_cost,
    micro_f1_cost_squared,
    micro_f1_cost_plusone,
    micro_f1_cost_plusepsilon,
    binary_cost,
    inverse_micro_f1_cost,
    uniform_cost
]

all_extractor_fn_names = get_function_names(all_extractor_fns)
base_extractor_fn_names = get_function_names(base_extractors)
all_cost_fn_names = get_function_names(all_cost_functions)

ngrams = 1
stemmed = True
cost_function_name = micro_f1_cost_plusepsilon.__name__
dual = True
fit_intercept = True
beta = 0.5
max_epochs = 2
C = 0.5
penalty = "l2"

In [12]:
# Note these also differ for SC dataset
BASE_LEARNER_FACT = lambda : LogisticRegression(dual=dual, C=C, penalty=penalty, fit_intercept=fit_intercept)
best_extractor_names = ['single_words', 'between_word_features', 'label_set',
                                    'three_words', 'third_order', 'unigrams'] # type: List[str]

In [13]:
def train_sr_parser(essays_TD, essays_VD, extractor_names, cost_function_name, ngrams, stemmed, beta, max_epochs):
    extractors = get_functions_by_name(extractor_names, all_extractor_fns)
    # get single cost function
    cost_fn = get_functions_by_name([cost_function_name], all_cost_functions)[0]
    assert cost_fn is not None, "Cost function look up failed"
    # Ensure all extractors located
    assert len(extractors) == len(extractor_names), "number of extractor functions does not match the number of names"

    template_feature_extractor = NonLocalTemplateFeatureExtractor(extractors=extractors)
    if stemmed:
        ngram_extractor = NgramExtractorStemmed(max_ngram_len=ngrams)
    else:
        ngram_extractor = NgramExtractor(max_ngram_len=ngrams)
    parse_model = SearnModelBreadthFirst(feature_extractor=template_feature_extractor,
                                             cost_function=cost_fn,
                                             min_feature_freq=MIN_FEAT_FREQ,
                                             ngram_extractor=ngram_extractor, cr_tags=cr_tags,
                                             base_learner_fact=BASE_LEARNER_FACT,
                                             beta=beta,
                                             # log_fn=lambda s: print(s))
                                             log_fn=lambda s: None)

    parse_model.train(essays_TD, max_epochs=max_epochs)

    num_feats = template_feature_extractor.num_features()

    sent_td_ys_bycode = get_label_data_essay_level(essays_TD)
    sent_vd_ys_bycode = get_label_data_essay_level(essays_VD)

    sent_td_pred_ys_bycode = predict_essay_level(parse_model, essays_TD)
    sent_vd_pred_ys_bycode = predict_essay_level(parse_model, essays_VD)

    return parse_model, num_feats, sent_td_ys_bycode, sent_vd_ys_bycode, sent_td_pred_ys_bycode, sent_vd_pred_ys_bycode


In [14]:
test_folds     = [(pred_tagged_essays_train, pred_tagged_essays_test)]  # type: List[Tuple[Any,Any]]

In [15]:
cv_folds = cross_validation(pred_tagged_essays_train, CV_FOLDS)  # type: List[Tuple[Any,Any]]

## Essay Level Results

In [16]:
result_test_essay_level = evaluate_model_essay_level(
    folds=cv_folds,
    extractor_fn_names_lst=best_extractor_names,
    cost_function_name=cost_function_name,
    ngrams=ngrams,
    beta=beta,
    stemmed=stemmed,
    down_sample_rate=DOWN_SAMPLE_RATE,
    max_epochs=max_epochs)

## Train

In [17]:
models, cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag, cv_td_preds_by_sent, \
    cv_sent_vd_ys_by_tag = result_test_essay_level
    
mean_metrics = ResultsProcessor.compute_mean_metrics(cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag)
get_micro_metrics(metrics_to_df(mean_metrics))

Unnamed: 0,accuracy,f1_score,recall,precision
95,0.985749,0.781099,0.761679,0.801536


## Test

In [18]:
models, cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag, \
    cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag = result_test_essay_level
    
mean_metrics = ResultsProcessor.compute_mean_metrics(cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag)
get_micro_metrics(metrics_to_df(mean_metrics))

Unnamed: 0,accuracy,f1_score,recall,precision
95,0.982725,0.739338,0.733942,0.744815


# Train Re-Ranker Model

In [19]:
from itertools import combinations

def get_possible_crels(predicted_tags):
    if len(predicted_tags) < 2:
        return set()
    predicted_tags = sorted(predicted_tags)
    pred_crels = set()
    for a,b in combinations(predicted_tags, 2):
        pred_crels.add("Causer:{a}->Result:{b}".format(a=a, b=b))
        pred_crels.add("Causer:{b}->Result:{a}".format(a=a, b=b))
    return pred_crels

def to_canonical_parse(crels):
    return tuple(sorted(crels))

def get_crels(parse):
    crels = set()
    p = parse
    while p:
        if p.relations:
            crels.update(p.relations)
        p = p.parent_action
    return crels

In [20]:
from searn_parser_breadth_first import geo_mean

def collapse_sent_parse(pred_parses):
    crel2prob = defaultdict(list)
    for pact in pred_parses:
        act_seq = pact.get_action_sequence()
        for act in act_seq:
            if not act.relations:
                continue

            assert act.lr_action_prob >= 0
            prob = geo_mean([act.action_prob * act.lr_action_prob])
            for r in act.relations:
                crel2prob[r].append(prob)
    return crel2prob

def merge_crel_probs(a, b):    
    for k,v in b.items():
        a[k].extend(v)
    return a

def get_max_probs(crel2probs):
    crel2max_prob = dict()
    for crel, probs in crel2probs.items():
        crel2max_prob[crel] = max(probs)
    return crel2max_prob

In [21]:
from itertools import combinations

def get_all_combos(items):
    # enforces a consistent ordering for the resulting tuples
    items = sorted(items) 
    cbos = [()] # seed with the empty combo
    for i in range(1, len(items)+1):
        cbos.extend(combinations(items,i))
    return cbos

cbos = get_all_combos([3,2,1])
print(len(cbos)) # 2**len(items)-1
if len(cbos) < 1000:
    for cbo in sorted(cbos, key = lambda l: (len(l), l)):
        print(cbo)

8
()
(1,)
(2,)
(3,)
(1, 2)
(1, 3)
(2, 3)
(1, 2, 3)


## Generate Parses

In [22]:
def to_parse(lst):
    return tuple(sorted(lst))

def sample_top_parses(crel2maxprobs, top_n):

    max_parses = 2**len(crel2maxprobs) # maximum parse combinations
    assert max_parses > top_n, (max_parses, top_n) # otherwise brute force it

    top_parses = set([()]) # always seed with the empty parse
    probs = []
    while len(top_parses) < top_n:
        new_parse = []
        for crel, prob in crel2maxprobs.items():
            rand_val = np.random.random() # random number >= 0 and < 1
            if rand_val < prob:
                new_parse.append(crel)
        # make hashable and enforce consistent order
        top_parses.add(to_parse(new_parse))
    
    return list(top_parses)

def get_top_parses(crel2maxprobs, threshold=0.5):
    top_parse = [crel for crel, prob in crel2maxprobs.items() if prob >= threshold]
    if top_parse:
        return [to_parse(top_parse)]
    else:
        return [()]
    
def get_top_n_parses(crel2maxprobs, top_n):
    top_parses = [()]
    by_prob = sorted(crel2maxprobs.keys(), key = lambda k: -crel2maxprobs[k])
    for i in range(1, min(top_n, len(crel2maxprobs))+1):
        parse = by_prob[:i]
        top_parses.append(to_parse(parse))
    return top_parses

def get_top_n_parses2(crel2maxprobs, top_n):
    top_parses = [()]
    by_prob = sorted(crel2maxprobs.keys(), key = lambda k: -crel2maxprobs[k])
    num_predicted = len([crel for crel in by_prob if crel2maxprobs[crel] >= 0.5])
    for i in range(num_predicted-1, len(by_prob)+1):
        parse = by_prob[:i]
        top_parses.append(to_parse(parse))
        if len(top_parses) > top_n:
            break
    return top_parses

crel_probs = {
    "1->2":   0.8,
    "2->3":   0.01,
    "5->8":   0.25,
    "10->12": 0.75,
    "12->50": 0.99,
    "3->4":   0.50,
}

# important - should see a lot more of the more probable codes
# sample_top_parses(crel_probs, 8)
get_top_n_parses2(crel_probs, 1)

[(), ('1->2', '10->12', '12->50')]

## Parser Feature Extraction

In [23]:
from NgramGenerator import compute_ngrams

def to_short_tag(tag):
    return tag.replace("Causer:","").replace("Result:", "")

def build_chains_inner(tree, l, visited, depth=0):
    chains = []
    if l not in tree:
        return chains
    for r in tree[l]:
        if r in visited:
            continue
        visited.add(r) # needed to prevent cycles, which cause infinite recursion
        extensions = build_chains_inner(tree, r, visited, depth+1)
        visited.remove(r)
        for ch in extensions:
            chains.append([r] + ch)
        if not extensions:
            chains.append([r])
    return chains

def build_chains(tree):    
    lhs_items = set(tree.keys())
    rhs_items = set()
    for l,rhs in tree.items():        
        rhs_items.update(rhs)
    
    chains = []
    # starting positions of each chain are those appearing on the lhs but not the rhs
    start_codes = lhs_items - rhs_items    
    for l in start_codes:
        rhs = tree[l]
        for r in rhs:
            for ch in build_chains_inner(tree, r, {l,r}, 0):
                chains.append([l,r] + ch)
    return chains

def extend_chains(chains):
    ext_chains = set()
    for tokens in chains:
        ext_chains.add(",".join(tokens))
        ngrams = compute_ngrams(tokens,max_len=None, min_len=3)
        for t in ngrams:
            ext_chains.add(",".join(t))
    return ext_chains

def extract_features_from_parse(parse, crel2probs):
    
    feats = defaultdict(float)
    tree = defaultdict(set) # maps causers to effects for building chains
    max_probs = []    
    code_tally = defaultdict(float)
    
    pairs = set()
    inverted_count = 0
    for crel in parse:
        probs = crel2probs[crel]
        max_p = max(probs)
        max_probs.append(max_p)
        feats["{crel}-MAX(prob)".format(crel=crel)] = max_p
        feats["{crel}-MIN(prob)".format(crel=crel)] = min(probs)
        feats["{crel}-pred-count".format(crel=crel)] = len(probs)
        feats["{crel}-pred-count={count}".format(crel=crel, count=len(probs))] = 1
        
        # with type
        l,r = crel.split("->")
        code_tally[l] +=1
        code_tally[r] +=1
        
        # without type
        l_short, r_short = to_short_tag(l), to_short_tag(r)
        code_tally[l_short] +=1
        code_tally[r_short] +=1
        # ordering of the codes, ignoring the causal direction
        feats[l_short + ":" + r_short] = 1
        
        # build tree structure so we can retrieve the chains
        tree[l_short].add(r_short)
        
        # track whether the rule exists in the opposite direction
        pairs.add((l_short,r_short))
        if (r_short,l_short) in pairs:
            inverted_count += 1
            
    if inverted_count:
        feats["inverted"] = 1
        feats["num_inverted"] = inverted_count
    else:
        feats["not_inverted"] = 1
    
    # counts
    feats.update(code_tally)
    num_crels = len(parse)
    feats["num_crels"] = num_crels
    feats["num_crels="+str(len(parse))] = 1 # includes a tag for the empty parse
    for i in range(1,11):
        if num_crels <= i:
            feats["num_crels<={i}".format(i=i)] = 1
        else:
            feats["num_crels>{i}".format(i=i)] = 1
        
    # combination of crels
    # need to sort so that order of a and b is consistent across parses
    pairs = combinations(sorted(parse), r=2)
    for a, b in pairs:
        feats["{a}|{b}".format(a=a, b=b)] = 1
        
    #chains
    causer_chains = extend_chains(build_chains(tree))
    for ch in causer_chains:
        feats["CChain:" + ch] = 1
    
    if max_probs: # might be an empty parse
        for cutoff in [0.2, 0.3, 0.5, 0.7, 0.8, 0.9, 0.95]:
            above =  len([p for p in max_probs if p >=cutoff])
            feats["Above-{cutoff}".format(cutoff=cutoff)] = above
            feats["%-Above-{cutoff}".format(cutoff=cutoff)] = above/len(max_probs)
            if above == len(max_probs):
                feats["All-Above-{cutoff}".format(cutoff=cutoff)] = 1
        
        feats["avg-prob"] = np.mean(max_probs)
        feats["med-prob"] = np.median(max_probs)
        feats["prod-prob"]= np.product(max_probs)
        feats["min-prob"] = np.min(max_probs)
        feats["max-prob"] = np.max(max_probs)
        for p in [5, 10, 25, 75, 90, 95]:
            feats["{p}%-prob".format(p=p)] = np.percentile(max_probs, p)
        # geometric mean
        feats["geo-mean"] = np.prod(max_probs)**(1/len(max_probs))
    return feats

In [51]:
all_essays = pred_tagged_essays_train + pred_tagged_essays_test
name2essay = {}
for essay in all_essays:
    name2essay[essay.name] = essay
    
name2crels = essay_to_crels(all_essays)

assert len(name2crels) == len(all_essays)

In [52]:
def compute_costs(parser_input):
    opt_parse = parser_input.opt_parse
    other_parses = parser_input.other_parses

    other_costs = []
    op = set(opt_parse)
    for p in other_parses:
        p = set(p)
        fp = p - op
        fn = op - p
        cost = len(fp) + len(fn)
        other_costs.append(cost)
    return other_costs

def copy_dflt_dict(d):
    copy = defaultdict(d.default_factory)
    copy.update(d)
    return copy

class ParserInputs(object):
    def __init__(self, essay_name, opt_parse, all_parses, crel2probs, compute_feats=True):
        self.essay_name = essay_name
        self.opt_parse = opt_parse
        self.crel2probs = crel2probs
        
        if compute_feats:
            self.opt_features = extract_features_from_parse(opt_parse, crel2probs)
            
            other_parses = []
            other_feats_array = []
            all_feats_array = []
            for p in all_parses:
                feats = extract_features_from_parse(p, crel2probs)
                all_feats_array.append(feats)
                if p != opt_parse:
                    other_parses.append(p)
                    other_feats_array.append(feats)

            self.all_feats_array = all_feats_array
            self.other_parses = other_parses
            self.other_features_array = other_feats_array
            self.other_costs_array = compute_costs(self)
                    
        self.all_parses = all_parses
        
    def clone_without_feats(self):
        c = ParserInputs(essay_name=self.essay_name, opt_parse=self.opt_parse, 
                         all_parses=self.all_parses, crel2probs=self.crel2probs, compute_feats=False)
        
        c.other_parses = self.other_parses
        c.other_costs_array = self.other_costs_array
        return c

    def clone(self):
        c = ParserInputs(essay_name=self.essay_name, opt_parse=self.opt_parse, 
                         all_parses=self.all_parses, crel2probs=self.crel2probs, compute_feats=False)
        
        c.all_feats_array = [copy_dflt_dict(f) for f in self.all_feats_array]
        c.opt_features = copy_dflt_dict(self.opt_features)
        c.other_parses = self.other_parses
        c.other_features_array = [copy_dflt_dict(f) for f in self.other_features_array]
        c.other_costs_array = self.other_costs_array
        return c

def to_freq_feats(feats, freq_feats):
    new_feats = defaultdict(float)
    for f, v in feats.items():
        if f in freq_feats:
            new_feats[f] = v
    return new_feats

def filter_by_min_freq(xs, feat_freq, min_freq):
    if min_freq <= 1:
        return xs
    freq_feats = set((f for f, cnt in feat_freq.items() if cnt >= min_freq))
    for parser_input in xs:
        parser_input.opt_features = to_freq_feats(parser_input.opt_features, freq_feats)
        parser_input.other_features_array = [to_freq_feats(x, freq_feats)
                                             for x in parser_input.other_features_array]
    return xs

def accumulate_feat_vals(xs_train):
    def merge_feats(feats):
        for ft,val in feats.items():
            fts_vals[ft].append(val)
    
    fts_vals = defaultdict(list)
    cnt = 0
    for parser_input in xs_train:
        cnt+=1
        merge_feats(parser_input.opt_features)
        for x in parser_input.other_features_array:
            cnt+=1
            merge_feats(x)
    return fts_vals, cnt

def z_score_normalize_feats(xs_train, xs_test):
    fts_vals, cnt = accumulate_feat_vals(xs_train)
    
    fts_mean, fts_std = dict(), dict()
    for ft, vals in fts_vals.items():
        v_with_zeros = vals + ([0] * (cnt-len(vals)))
        std = np.std(v_with_zeros)
        if std == 0.0:
            fts_mean[ft] = 0
            fts_std[ft] = vals[0]
        else:
            fts_mean[ft] = np.mean(v_with_zeros)
            fts_std[ft] =  np.std(v_with_zeros)
    
    def to_z_score(fts):
        new_fts = defaultdict(fts.default_factory)
        for ft, val in fts.items():
            if ft in fts_mean:
                new_val = (val - fts_mean[ft])/fts_std[ft]
                if new_val:
                    new_fts[ft] = new_val
        return new_fts
    
    def z_score_normalize(parser_input):
        clone = parser_input.clone_without_feats()
        clone.opt_features = to_z_score(parser_input.opt_features)
        clone.all_feats_array = [to_z_score(x) for x in parser_input.all_feats_array]
        clone.other_features_array = [to_z_score(x) for x in parser_input.other_features_array]
        return clone
    
    new_xs_train = [z_score_normalize(x) for x in xs_train]
    new_xs_test  = [z_score_normalize(x) for x in xs_test]
    return new_xs_train, new_xs_test

def min_max_normalize_feats(xs_train, xs_test):
    fts_vals, cnt = accumulate_feat_vals(xs_train)
    
    fts_min, fts_range = dict(), dict()
    for ft, vals in fts_vals.items():
        v_with_zeros = vals + ([0] * (cnt-len(vals)))   
        min_val = np.min(v_with_zeros)
        range_val = np.max(v_with_zeros) - min_val
        fts_min[ft] = min_val
        fts_range[ft] = range_val
    
    def to_min_max_score(fts):
        new_fts = defaultdict(fts.default_factory)
        for ft, val in fts.items():
            if ft in fts_min:
                new_val = (val - fts_min[ft])/fts_range[ft]
                if new_val:
                    new_fts[ft] = new_val
        return new_fts
    
    def min_max_normalize(parser_input):
        clone = parser_input.clone_without_feats()
        clone.opt_features = to_min_max_score(parser_input.opt_features)
        clone.all_feats_array = [to_min_max_score(x) for x in parser_input.all_feats_array]
        clone.other_features_array = [to_min_max_score(x) for x in parser_input.other_features_array]
        return clone
    
    new_xs_train = [min_max_normalize(x) for x in xs_train]
    new_xs_test  = [min_max_normalize(x) for x in xs_test]
    return new_xs_train, new_xs_test

def get_crels_above(crel2maxprob, threshold):
    return [k for k, p in crel2maxprob.items() if p >= threshold]

def get_features_from_probabilities(essay2probs, top_n, min_feat_freq=1):
    xs = []
    feat_freq = defaultdict(int)
    
    for ename, crel2probs in essay2probs.items():

        act_crels = name2crels[ename]
        crel2maxprob = get_max_probs(crel2probs)        
        crel2probs = dict(crel2probs)

        num_crels = len(crel2probs)
        max_parses = 2 ** num_crels
        if max_parses > 2 * top_n:
            keys = list(crel2probs.keys())
            n_parses = 2 ** len(keys)
            threshold = 0.1
            while n_parses > max_parses:
                keys = get_crels_above(crel2maxprob, threshold)
                n_parses = 2 ** len(keys)
                threshold += 0.1

            parses = get_all_combos(keys)
            #parses = sample_top_parses(crel2maxprob, top_n)
            #parses.extend(get_top_parses(crel2maxprob))  # just get the predicted parses (probability >= 0.5)
            #parses = get_top_parses(crel2maxprob)  # just get the predicted parses (probability >= 0.5)
            #parses = get_top_n_parses2(crel2maxprob, top_n=2)
        else:
            # brute force it
            parses = get_all_combos(crel2probs.keys())

        # constrain optimal parse to only those crels that are predicted
        opt_parse = tuple(sorted(act_crels.intersection(crel2probs.keys())))
        x = ParserInputs(essay_name=ename, opt_parse=opt_parse, all_parses=parses, crel2probs=crel2probs)
        xs.append(x)

        # Get unique features for essay
        all_feats = set()
        for fts in x.all_feats_array:
            all_feats.update(fts.keys())

        for ft in all_feats:
            feat_freq[ft] += 1

    assert len(xs) == len(essay2probs), "Parses for all essays should be generated"
    return filter_by_min_freq(xs, feat_freq, min_feat_freq)

In [53]:
def add_cr_labels(observed_tags, ys_bytag_sent):
    global set_cr_tags
    for tag in set_cr_tags:
        if tag in observed_tags:
            ys_bytag_sent[tag].append(1)
        else:
            ys_bytag_sent[tag].append(0)
            
def evaluate_ranker(model, xs, essay2crels, ys_bytag):
    clone = model.clone()
    if hasattr(model, "average_weights"):
        clone.average_weights()
    rank_acc = []
    pred_ys_bytag = defaultdict(list)
    ename2inps = dict()
    for parser_input in xs:
        ename2inps[parser_input.essay_name] = parser_input
    
    for ename, act_crels in essay2crels.items():        
        if ename not in ename2inps:
            # no predicted crels for this essay
            highest_ranked = set()
        else:
            parser_input = ename2inps[ename]
            ixs = clone.rank(parser_input.all_feats_array)
            highest_ranked = parser_input.all_parses[ixs[0]] # type: Tuple[str]        
            rank_acc.append(1 if highest_ranked == parser_input.opt_parse else 0)
            
        add_cr_labels(set(highest_ranked), pred_ys_bytag)

    mean_metrics = ResultsProcessor.compute_mean_metrics(ys_bytag, pred_ys_bytag)
    df = get_micro_metrics(metrics_to_df(mean_metrics))
    df["rank_acc"] = np.mean(rank_acc)
    return df

In [54]:
from numpy.random import shuffle

def train_instance(parser_input, model):
    model.train(best_feats=parser_input.opt_features, other_feats_array=parser_input.other_features_array)

def train_cost_sensitive_instance(parser_input, model):
    model.train(best_feats=parser_input.opt_features, 
                other_feats_array=parser_input.other_features_array, other_costs_array=parser_input.other_costs_array)
    
def get_essays_for_data(xs):
    return [name2essay[x.essay_name] for x in xs]
    
def train_model(model, xs_train, xs_test, max_epochs=30, early_stop_iters=8, train_instance_fn=train_instance):
    test_accs = [-1]
    best_model = None
    best_test_accuracy = None
    num_declining_acc = 0

    train_essays = get_essays_for_data(xs_train)
    test_essays  = get_essays_for_data(xs_test)

    ys_by_tag_train = get_label_data_essay_level(train_essays)
    ys_by_tag_test  = get_label_data_essay_level(test_essays)

    essay2crels_train = essay_to_crels(train_essays)
    essay2crels_test  = essay_to_crels(test_essays)
    
    xs_train_copy = list(xs_train)    
    for i in range(max_epochs):
        shuffle(xs_train_copy)
        for parser_input in xs_train_copy:
            if len(parser_input.other_parses) > 0:
                train_instance_fn(parser_input, model)

        train_accuracy_df = evaluate_ranker(model, xs_train, essay2crels_train, ys_by_tag_train)
        test_accuracy_df  = evaluate_ranker(model, xs_test,  essay2crels_test,  ys_by_tag_test)
        train_accuracy = train_accuracy_df.iloc[0].to_dict()["f1_score"]
        test_accuracy  = test_accuracy_df.iloc[0].to_dict()["f1_score"]
        print("Epoch: {epoch} Train Accuracy: {train_acc:.4f} Test Accuracy: {test_acc:.4f}".format(
            epoch=i,  train_acc=train_accuracy, test_acc=test_accuracy))
        if test_accuracy > max(test_accs):
            best_model = model.clone()
            best_test_accuracy = test_accuracy_df
            num_declining_acc = 0
        else:
            num_declining_acc += 1
            if num_declining_acc >= early_stop_iters:
                break
        test_accs.append(test_accuracy)
    print("Best Test Acc: {acc:.4f}".format(acc=max(test_accs)))
    return best_model, best_test_accuracy

In [55]:
def merge_essay2crelprobs(a,b):
    for ename, dct in b.items():
        a[ename]

In [56]:
def get_essays2crels(essays, sr_model, top_n):
    trainessay2probs = defaultdict(list)
    for eix, essay in enumerate(essays):
        crel2probs = defaultdict(list)        
        for sent_ix, taggged_sentence in enumerate(essay.sentences):
            predicted_tags = essay.pred_tagged_sentences[sent_ix]
            unq_ptags = set([t for t in predicted_tags if t != EMPTY])            
            if len(unq_ptags) >= 2:
                pred_parses = sr_model.generate_all_potential_parses_for_sentence(
                    tagged_sentence=taggged_sentence, predicted_tags=predicted_tags, top_n=top_n)
                cr2p = collapse_sent_parse(pred_parses)
                merge_crel_probs(crel2probs, cr2p)
    
        if len(crel2probs) > 0:
            trainessay2probs[essay.name] = dict(crel2probs)
        else:
            trainessay2probs[essay.name] = dict()
    return trainessay2probs

In [57]:
def essay_to_crels_cv(cv_volds, models, top_n):
    essay2crelprobs = defaultdict(list)
    assert len(cv_folds) == len(models)
    for (train, test), mdl in zip(cv_folds, models):
        test2probs = get_essays2crels(test, mdl, top_n)
        for k,v in test2probs.items():
            assert k not in essay2crelprobs
            essay2crelprobs[k] = v
    return essay2crelprobs

In [58]:
def shuffle_split_dict(dct, train_pct):
    items = list(dct.items())
    np.random.shuffle(items)
    num_train = int(len(items) * train_pct)
    train_items, test_items = items[:num_train], items[num_train:]
    return dict(train_items), dict(test_items)

In [42]:
%%time
TOP_N = 10 # 10 is better

xs_rerank = essay_to_crels_cv(cv_folds, models, top_n=TOP_N)

CPU times: user 19 s, sys: 111 ms, total: 19.1 s
Wall time: 19.2 s


In [43]:
train2predcrels, test2predcrels = shuffle_split_dict(xs_rerank, 0.8)
len(train2predcrels), len(test2predcrels)

(721, 181)

In [261]:
# lens = []
# for ename, crel2probs in train2predcrels.items():
#     lens.append(len(crel2probs))
# min(lens), max(lens), np.mean(lens), np.median(lens), len([l for l in lens if l > 0])

In [47]:
len(name2crels)

91

In [None]:
%%time
MAX_CRELS = 500
xs_train = get_features_from_probabilities(train2predcrels, MAX_CRELS, min_feat_freq=1)
xs_test  = get_features_from_probabilities(test2predcrels,  MAX_CRELS, min_feat_freq=1)

In [None]:
len(xs_train), len(xs_test)

In [None]:
from MIRA import MIRA, CostSensitiveMIRA

# new record accuracy - ML, C=0.05, PA = I, with min max feats
model_ml = CostSensitiveMIRA(C=0.05, pa_type=1, loss_type="ml", max_update_items=1, initial_weight=1)
best_model_ml, test_acc_df_ml = train_model(model_ml, xs_train=xs_train, xs_test=xs_test,         
        max_epochs=50, early_stop_iters=10, train_instance_fn = train_cost_sensitive_instance)

In [240]:
%%time
xs_train_mm_norm, xs_test_mm_norm = min_max_normalize_feats(xs_train, xs_test)

CPU times: user 20.9 s, sys: 1.42 s, total: 22.3 s
Wall time: 22.5 s


In [245]:
from MIRA import MIRA, CostSensitiveMIRA

# new record accuracy - ML, C=0.05, PA = I, with min max feats
model_ml = CostSensitiveMIRA(C=0.05, pa_type=1, loss_type="ml", max_update_items=1, initial_weight=1)
best_model_ml, test_acc_df_ml = train_model(model_ml, xs_train=xs_train_mm_norm, xs_test=xs_test_mm_norm, 
        essay2crels_train=essay2crels_train, essay2crels_test=essay2crels_train, 
        max_epochs=50, early_stop_iters=10, train_instance_fn = train_cost_sensitive_instance)

Exception processing tag: Causer:4->Result:11


Exception: Both list must be the same size, actual - 902 expected - 226

In [242]:
%%time
sr_model = models[0]
MAX_CRELS = 500

max_f1 = -1
topn2metrics = defaultdict(list)
for top_n in [2,3,5,10,20,50,100]:
    print("top_n: {top_n}".format(top_n=top_n))
    train2predcrels = get_essays2crels(pred_tagged_essays_train, sr_model=sr_model, top_n=top_n)
    test2predcrels  = get_essays2crels(pred_tagged_essays_test,  sr_model=sr_model, top_n=top_n)
    
    xs_train = get_features_from_probabilities(train2predcrels, MAX_CRELS, min_feat_freq=1)
    xs_test  = get_features_from_probabilities(test2predcrels,  MAX_CRELS, min_feat_freq=1)
    xs_train_mm_norm, xs_test_mm_norm = min_max_normalize_feats(xs_train, xs_test)
    
    model_ml = CostSensitiveMIRA(C=0.05, pa_type=1, loss_type="ml", max_update_items=1, initial_weight=1)
    best_model_ml, test_acc_df_ml = train_model(model_ml, xs_train=xs_train_mm_norm, xs_test=xs_test_mm_norm, 
        essay2crels_train=essay2crels_train, essay2crels_test=essay2crels_test, 
        max_epochs=25, early_stop_iters=10, train_instance_fn = train_cost_sensitive_instance)
    
    topn2metrics[top_n].append(test_acc_df_ml)
    f1 = test_acc_df_ml["f1_score"].values[0]
    if f1 > max_f1:
        print("*" * 80)
        max_f1 = f1
        print("New Max F1: {f1:.4f} \tTop N: {top_n}".format(f1=max_f1, top_n=top_n))

top_n: 2
Epoch: 0 Train Accuracy: 0.7756 Test Accuracy: 0.7441
Epoch: 1 Train Accuracy: 0.7776 Test Accuracy: 0.7525
Epoch: 2 Train Accuracy: 0.7793 Test Accuracy: 0.7567
Epoch: 3 Train Accuracy: 0.7804 Test Accuracy: 0.7565
Epoch: 4 Train Accuracy: 0.7808 Test Accuracy: 0.7526
Epoch: 5 Train Accuracy: 0.7825 Test Accuracy: 0.7505
Epoch: 6 Train Accuracy: 0.7827 Test Accuracy: 0.7484
Epoch: 7 Train Accuracy: 0.7837 Test Accuracy: 0.7493
Epoch: 8 Train Accuracy: 0.7854 Test Accuracy: 0.7471
Epoch: 9 Train Accuracy: 0.7855 Test Accuracy: 0.7467
Epoch: 10 Train Accuracy: 0.7858 Test Accuracy: 0.7419
Epoch: 11 Train Accuracy: 0.7881 Test Accuracy: 0.7426
Epoch: 12 Train Accuracy: 0.7894 Test Accuracy: 0.7407
Best Test Acc: 0.7567
********************************************************************************
New Max F1: 0.7567 	Top N: 2
top_n: 3
Epoch: 0 Train Accuracy: 0.7686 Test Accuracy: 0.7403
Epoch: 1 Train Accuracy: 0.7726 Test Accuracy: 0.7498
Epoch: 2 Train Accuracy: 0.7743 Test 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Epoch: 7 Train Accuracy: 0.7820 Test Accuracy: 0.7484


KeyboardInterrupt: 

## TODO
- Try Sampling from the Predicted Parses