In [1]:
import datetime
import logging
from collections import defaultdict

import dill
import numpy as np
import pymongo
import pandas as pd
from sklearn.linear_model import LogisticRegression
from typing import Any

from CrossValidation import cross_validation
from Settings import Settings
from cost_functions import *
from crel_helper import get_cr_tags
from function_helpers import get_function_names, get_functions_by_name
from results_procesor import ResultsProcessor, __MICRO_F1__
from searn_parser import SearnModelTemplateFeatures
from template_feature_extractor import *
from window_based_tagger_config import get_config
from wordtagginghelper import merge_dictionaries

In [2]:
# --- Data set partitioning / feature pruning ---
CV_FOLDS = 5       # not referenced by the cells visible in this notebook
MIN_FEAT_FREQ = 5  # forwarded to the parser as `min_feature_freq`

# --- Folder layout, all derived from the project-wide Settings object ---
settings = Settings()
root_folder = "{}CoralBleaching/Thesis_Dataset/".format(settings.data_directory)
training_folder = "{}Training/".format(root_folder)
test_folder = "{}Test/".format(root_folder)

# Essays with co-reference information are stored beneath the dataset root
coref_root = "{}CoReference/".format(root_folder)
coref_output_folder = "{}CRel/".format(coref_root)

config = get_config(training_folder)

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/


In [3]:
def _load_dill(path):
    """Deserialize and return the single object stored in the dill file at `path`.

    NOTE: dill, like pickle, can execute arbitrary code while loading; only open trusted files.
    """
    with open(path, "rb") as fin:
        return dill.load(fin)


train_fname = coref_output_folder + "training_crel_anatagged_essays_most_recent_code.dill"
test_fname = coref_output_folder + "test_crel_anatagged_essays_most_recent_code.dill"

pred_tagged_essays_train = _load_dill(train_fname)
pred_tagged_essays_test = _load_dill(test_fname)

# Cell output: number of essays in each partition
len(pred_tagged_essays_train), len(pred_tagged_essays_test)

(902, 226)

In [4]:
EMPTY = "Empty"
from BrattEssay import ANAPHORA

def to_is_valid_crel(tags):
    """Return the set of tags in `tags` that count as valid causal relations.

    A tag is kept when it contains "->" and does not contain the ANAPHORA marker.
    Tags whose lower-cased text contains "rhetorical", "change" or "other" are
    dropped before that check.
    """
    excluded_terms = ("rhetorical", "change", "other")
    valid = set()
    for tag in tags:
        lowered = tag.lower()
        if any(term in lowered for term in excluded_terms):
            continue
        if "->" not in tag:
            continue
        if ANAPHORA in tag:
            continue
        valid.add(tag)
    return valid

def get_crel_tags_by_sent(essays_a):
    """Gather the valid causal-relation tags of every sentence.

    Walks the essays in order and, within each essay, its `sentences` in order.
    Each sentence is an iterable of (word, tags) pairs. Returns a flat list with
    one set per sentence: the union of `to_is_valid_crel(tags)` over its words.
    """
    per_sentence = []
    for essay in essays_a:
        for sentence in essay.sentences:
            sentence_crels = set().union(
                *(to_is_valid_crel(word_tags) for _word, word_tags in sentence)
            )
            per_sentence.append(sentence_crels)
    return per_sentence

In [6]:
# Causal-relation tags gathered from both partitions by crel_helper.get_cr_tags.
# `cr_tags` is read as a global by model_train_predict (passed to the parser as `cr_tags=`).
cr_tags = get_cr_tags(train_tagged_essays=pred_tagged_essays_train, tag_essays_test=pred_tagged_essays_test)
# Display the first ten tags as a sanity check
cr_tags[0:10]

['Causer:5->Result:50',
 'Causer:7->Result:50',
 'Causer:3->Result:4',
 'Causer:1->Result:50',
 'Causer:11->Result:50',
 'Causer:13->Result:50',
 'Causer:6->Result:50',
 'Causer:3->Result:5',
 'Causer:4->Result:14',
 'Causer:3->Result:1']

In [7]:
def evaluate_model(
        collection_prefix: str,
        folds: List[Tuple[Any, Any]],
        extractor_fn_names_lst: List[str],
        cost_function_name: str,
        beta: float,
        ngrams: int,
        stemmed: bool,
        max_epochs: int,
        down_sample_rate=1.0) -> Tuple[dict, dict, list, dict, dict, list]:
    """Train and predict on every fold (serially) and pool the per-fold outputs.

    Args:
        collection_prefix: Not used by this function; kept so existing callers keep working.
        folds: (training essays, validation essays) pairs.
        extractor_fn_names_lst: Feature extractor names, forwarded to `model_train_predict`.
        cost_function_name: Cost function name, forwarded to `model_train_predict`.
        beta, ngrams, stemmed, max_epochs: Forwarded unchanged to `model_train_predict`.
        down_sample_rate: When < 1.0, only the first `int(rate * len(partition))` essays
            of each partition are kept (values >= 1.0 leave the folds untouched).

    Returns:
        A 6-tuple, in this order:
        (td labels by tag, td predictions by tag, td predictions by sentence per fold,
         vd labels by tag, vd predictions by tag, vd predictions by sentence per fold).
        The four "by tag" items are `defaultdict(list)` objects filled via
        `merge_dictionaries`; the two "by sentence" items are lists with one entry per fold.
    """
    if down_sample_rate < 1.0:
        # Truncate both partitions of every fold to the requested leading fraction
        folds = [
            (essays_TD[:int(down_sample_rate * len(essays_TD))],
             essays_VD[:int(down_sample_rate * len(essays_VD))])
            for essays_TD, essays_VD in folds
        ]

    # Folds are processed one after another; nothing here runs in parallel.
    serial_results = [
        model_train_predict(essays_TD, essays_VD, extractor_fn_names_lst, cost_function_name, ngrams, stemmed, beta, max_epochs)
        for essays_TD, essays_VD in folds
    ]

    cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag = defaultdict(list), defaultdict(list)
    cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag = defaultdict(list), defaultdict(list)

    cv_td_preds_by_sent = []
    cv_vd_preds_by_sent = []
    # The per-fold feature count (first element) is not needed by callers, so it is ignored.
    for (_num_feats,
         sent_td_ys_bycode, sent_vd_ys_bycode,
         sent_td_pred_ys_bycode, sent_vd_pred_ys_bycode,
         td_preds_by_sent, vd_preds_by_sent) in serial_results:
        # presumably merge_dictionaries(src, dest) accumulates src into dest -- confirm in wordtagginghelper
        merge_dictionaries(sent_td_ys_bycode, cv_sent_td_ys_by_tag)
        merge_dictionaries(sent_vd_ys_bycode, cv_sent_vd_ys_by_tag)
        merge_dictionaries(sent_td_pred_ys_bycode, cv_sent_td_predictions_by_tag)
        merge_dictionaries(sent_vd_pred_ys_bycode, cv_sent_vd_predictions_by_tag)

        cv_td_preds_by_sent.append(td_preds_by_sent)
        cv_vd_preds_by_sent.append(vd_preds_by_sent)

    return (cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag, cv_td_preds_by_sent,
            cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag, cv_vd_preds_by_sent)

In [8]:
def predict_by_sent(tagged_essays, model):
    """Run `model.predict_sentence` on every sentence and return the results as a flat list.

    Essays are visited in order, and sentences in order within each essay. Each sentence
    is paired with the entry at the same index in `essay.pred_tagged_sentences`.
    """
    relations_per_sentence = []
    for essay in tagged_essays:
        for sent_ix, tagged_sentence in enumerate(essay.sentences):
            relations_per_sentence.append(
                model.predict_sentence(tagged_sentence, essay.pred_tagged_sentences[sent_ix])
            )
    return relations_per_sentence

def model_train_predict(essays_TD, essays_VD, extractor_names, cost_function_name, ngrams, stemmed, beta, max_epochs):
    """Train one SearnModelTemplateFeatures parser on `essays_TD` and score both partitions.

    Reads these notebook globals at call time: `all_extractor_fns`, `all_cost_functions`,
    `MIN_FEAT_FREQ`, `cr_tags` and `BASE_LEARNER_FACT`.

    Returns a 7-tuple:
        (number of features,
         td label data, vd label data,
         td predictions, vd predictions,
         td predictions by sentence, vd predictions by sentence)
    """
    # Resolve the extractor functions and the single cost function by name
    extractors = get_functions_by_name(extractor_names, all_extractor_fns)
    cost_fn = get_functions_by_name([cost_function_name], all_cost_functions)[0]
    assert cost_fn is not None, "Cost function look up failed"
    # Every requested extractor name must have resolved to a function
    assert len(extractors) == len(extractor_names), "number of extractor functions does not match the number of names"

    feature_extractor = NonLocalTemplateFeatureExtractor(extractors=extractors)
    ngram_cls = NgramExtractorStemmed if stemmed else NgramExtractor
    ngram_extractor = ngram_cls(max_ngram_len=ngrams)

    model = SearnModelTemplateFeatures(
        feature_extractor=feature_extractor,
        cost_function=cost_fn,
        min_feature_freq=MIN_FEAT_FREQ,
        ngram_extractor=ngram_extractor,
        cr_tags=cr_tags,
        base_learner_fact=BASE_LEARNER_FACT,
        beta=beta,
        log_fn=lambda s: None,  # discard the model's log messages
    )
    model.train(essays_TD, max_epochs=max_epochs)

    num_feats = feature_extractor.num_features()

    td_labels = model.get_label_data(essays_TD)
    vd_labels = model.get_label_data(essays_VD)

    td_predictions = model.predict(essays_TD)
    vd_predictions = model.predict(essays_VD)

    td_by_sentence = predict_by_sent(essays_TD, model)
    vd_by_sentence = predict_by_sent(essays_VD, model)

    return (num_feats, td_labels, vd_labels, td_predictions, vd_predictions,
            td_by_sentence, vd_by_sentence)

In [20]:
def metrics_to_df(metrics):
    """Convert a {code: metrics} mapping into a DataFrame with one row per code.

    A value that is exactly an `Rpfa.rpfa` contributes its instance attributes as columns,
    and a value that is exactly a `dict` contributes its items. Any other value contributes
    nothing. Every row also gets a "code" column holding the mapping key.
    """
    import Rpfa

    def _as_row(code, val):
        # Exact type checks on purpose: subclasses fall through to the empty row
        if type(val) == Rpfa.rpfa:
            row = dict(val.__dict__)
        elif type(val) == dict:
            row = dict(val)
        else:
            row = dict()
        row["code"] = code
        return row

    return pd.DataFrame([_as_row(code, val) for code, val in metrics.items()])

def get_micro_metrics(df):
    """Return the accuracy, f1_score, recall and precision columns of the "MICRO_F1" rows."""
    # NOTE(review): results_procesor exports __MICRO_F1__; presumably the same label -- confirm
    metric_columns = ["accuracy", "f1_score", "recall", "precision"]
    is_micro_row = df["code"] == "MICRO_F1"
    return df.loc[is_micro_row, metric_columns]

In [9]:
LINE_WIDTH = 80

# other settings
DOWN_SAMPLE_RATE = 1.0  # For faster smoke testing the algorithm
# Placeholder: replaced by a LogisticRegression factory in a later cell, before any model is built
BASE_LEARNER_FACT = None
# Passed to evaluate_model as `collection_prefix`
COLLECTION_PREFIX = "CR_CB_SHIFT_REDUCE_PARSER_TEMPLATED_MOST_RECENT_CODE"

# some of the other extractors aren't functional if the system isn't able to do a basic parse
# so the base extractors are the MVP for getting to a basic parser, then additional 'meta' parse
# features from all_extractors can be included
base_extractors = [
    single_words,
    word_pairs,
    three_words,
    between_word_features
]

# Full registry of feature extractors; model_train_predict looks extractors up by name in this list
all_extractor_fns = base_extractors + [
    word_distance,
    valency,
    unigrams,
    third_order,
    label_set,
    size_features
]

# Registry of cost functions; model_train_predict looks the chosen cost function up by name here
all_cost_functions = [
    micro_f1_cost,
    micro_f1_cost_squared,
    micro_f1_cost_plusone,
    micro_f1_cost_plusepsilon,
    binary_cost,
    inverse_micro_f1_cost,
    uniform_cost
]

# Name lists derived from the registries above
all_extractor_fn_names = get_function_names(all_extractor_fns)
base_extractor_fn_names = get_function_names(base_extractors)
all_cost_fn_names = get_function_names(all_cost_functions)

### Note that these are different for Skin Cancer dataset

In [10]:
# Parser / feature hyper-parameters (passed to evaluate_model)
ngrams = 1          # used as max_ngram_len of the n-gram extractor
stemmed = True      # True selects NgramExtractorStemmed, False selects NgramExtractor
cost_function_name = micro_f1_cost_plusepsilon.__name__  # looked up by name in all_cost_functions
# Base learner (LogisticRegression) hyper-parameters, read by BASE_LEARNER_FACT
dual = True
fit_intercept = True
beta = 0.5          # forwarded to SearnModelTemplateFeatures as `beta`
max_epochs = 2      # forwarded to the parser's train() call
C = 0.5
penalty = "l2"

In [11]:
# Note these also differ for SC dataset
# Factory that builds a fresh base learner. The solver is pinned to liblinear because the dual
# formulation (dual=True) is only implemented for the l2 penalty with that solver; since
# scikit-learn 0.22 the default solver is lbfgs, which rejects dual=True with a ValueError.
BASE_LEARNER_FACT = lambda : LogisticRegression(dual=dual, C=C, penalty=penalty, fit_intercept=fit_intercept,
                                                solver="liblinear")
# Extractor names used for the final model; each must resolve in all_extractor_fns
best_extractor_names = ['single_words', 'between_word_features', 'label_set',
                                    'three_words', 'third_order', 'unigrams'] # type: List[str]

# Train for Test Set Eval

In [14]:
# A single "fold": train on the full training set, evaluate on the held-out test set
test_folds     = [(pred_tagged_essays_train, pred_tagged_essays_test)]  # type: List[Tuple[Any,Any]]

In [53]:
%%time
# Train on the training essays and predict on both partitions.
# result_test is the 6-tuple returned by evaluate_model; it is unpacked in the cells below.
result_test = evaluate_model(
    collection_prefix=COLLECTION_PREFIX,
    folds=test_folds,
    extractor_fn_names_lst=best_extractor_names,
    cost_function_name=cost_function_name,
    ngrams=ngrams,
    beta=beta,
    stemmed=stemmed,
    down_sample_rate=DOWN_SAMPLE_RATE,
    max_epochs=max_epochs)

CPU times: user 29.2 s, sys: 242 ms, total: 29.5 s
Wall time: 29.4 s


### Test Metrics (All Codes Inc. Ana)

#### Train

In [54]:
# Unpack the evaluate_model result: three training (td) items followed by three validation (vd) items
(
    cv_sent_td_ys_by_tag,
    cv_sent_td_predictions_by_tag,
    cv_td_preds_by_sent,
    cv_sent_vd_ys_by_tag,
    cv_sent_vd_predictions_by_tag,
    cv_vd_preds_by_sent,
) = result_test

# Micro-averaged metrics on the TRAINING partition
mean_metrics = ResultsProcessor.compute_mean_metrics(
    cv_sent_td_ys_by_tag,
    cv_sent_td_predictions_by_tag,
)
get_micro_metrics(metrics_to_df(mean_metrics))

Unnamed: 0,accuracy,f1_score,recall,precision
95,0.997865,0.741951,0.713494,0.772773


#### Test

In [55]:
# Unpack the evaluate_model result: three training (td) items followed by three validation (vd) items
(
    cv_sent_td_ys_by_tag,
    cv_sent_td_predictions_by_tag,
    cv_td_preds_by_sent,
    cv_sent_vd_ys_by_tag,
    cv_sent_vd_predictions_by_tag,
    cv_vd_preds_by_sent,
) = result_test

# Micro-averaged metrics on the held-out (vd) partition, which here is the test set
mean_metrics = ResultsProcessor.compute_mean_metrics(
    cv_sent_vd_ys_by_tag,
    cv_sent_vd_predictions_by_tag,
)
get_micro_metrics(metrics_to_df(mean_metrics))

Unnamed: 0,accuracy,f1_score,recall,precision
95,0.99776,0.701754,0.727848,0.677467


In [56]:
# Only one fold was evaluated, so entry 0 holds the per-sentence predictions for the test essays
pred_crels_by_sent = cv_vd_preds_by_sent[0]
# Actual causal relations per sentence, after the to_is_valid_crel filter
crels_by_sent = get_crel_tags_by_sent(pred_tagged_essays_test)
# Both lists must have one entry per test sentence
len(pred_crels_by_sent), len(crels_by_sent)

(1918, 1918)

In [82]:
# Compare actual vs predicted relations sentence by sentence.
# *_count variables total individual relations; *_ixs lists hold sentence indices.
fp_count = fn_count = tp_count = 0
lbl_count = pred_count = 0
misses = 0  # sentences with at least one false positive or false negative
fp_ixs, fn_ixs, tp_ixs = [], [], []

for i, (act, pred) in enumerate(zip(crels_by_sent, pred_crels_by_sent)):
    true_pos = act.intersection(pred)
    false_pos = pred - act
    false_neg = act - pred

    tp_count += len(true_pos)
    fp_count += len(false_pos)
    fn_count += len(false_neg)
    lbl_count += len(act)
    pred_count += len(pred)

    # A sentence counts as a full hit when it has labels and every one was predicted
    if len(act) > 0 and len(true_pos) == len(act):
        tp_ixs.append(i)
    if false_neg:
        fn_ixs.append(i)
    if false_pos:
        fp_ixs.append(i)
    if false_pos or false_neg:
        misses += 1

print(fp_count, fn_count, tp_count, lbl_count, "\t", len(fp_ixs), len(fn_ixs), "\t", misses)

228 172 460 632 	 181 145 	 282


In [60]:
# Micro recall / precision from the relation-level counts of the error tally cell.
# Raises ZeroDivisionError if there are no labels or no predictions.
# NOTE: `pred_count` is rebound to a list by a later cell, so run this cell before that one.
rec = tp_count/lbl_count
prec = tp_count/pred_count

# Cell output: (F1, recall, precision)
2*rec*prec/(rec+prec), rec, prec

(0.696969696969697, 0.7278481012658228, 0.6686046511627907)

In [90]:
def get_act_cc_tags(sent_ix, essay):
    """Return the tags of sentence `sent_ix` that start with a digit and contain no "->".

    `essay.sentences[sent_ix]` is iterated as (word, tags) pairs. Empty tag strings are
    skipped instead of raising IndexError.
    """
    return {
        tag
        for _word, tags in essay.sentences[sent_ix]
        for tag in tags
        # tag[:1] is "" for an empty tag, and "".isdigit() is False
        if tag[:1].isdigit() and "->" not in tag
    }

def get_all_cc_tags(sent_ix, essay):
    """Return every tag of sentence `sent_ix` that does not contain "->".

    `essay.sentences[sent_ix]` is iterated as (word, tags) pairs.
    """
    return {
        tag
        for _word, word_tags in essay.sentences[sent_ix]
        for tag in word_tags
        if "->" not in tag
    }

In [147]:
# Relative frequency of each causal relation in the training data.
# A relation is counted once per sentence it occurs in, because each sentence contributes a set.
tally_crels = defaultdict(int)
for crels in get_crel_tags_by_sent(pred_tagged_essays_train):
    for cr in crels:
        tally_crels[cr] += 1
total = sum(tally_crels.values())

# defaultdict(float): relations never seen in training look up as probability 0.0
p_crels = defaultdict(float)
p_crels.update((cr, cnt / total) for cr, cnt in tally_crels.items())
    
def to_short_tag(tag):
    """Remove every "Causer:" and then every "Result:" substring from `tag`."""
    for prefix in ("Causer:", "Result:"):
        tag = tag.replace(prefix, "")
    return tag
    
# sorted(p_crels.items(), key = lambda tpl: -tpl[-1])
def print_crels_probs(crels, output_probs=True):
    """Format `crels` as a comma-separated string (despite the name, nothing is printed).

    Each relation is shortened with `to_short_tag`. When `output_probs` is true it is
    followed by " - " and its value in the global `p_crels`, to 3 decimal places.
    """
    def _render(crel):
        short = to_short_tag(crel)
        if output_probs:
            return "{crel} - {prob:.3f}".format(crel=short, prob=p_crels[crel])
        return short

    return ", ".join(_render(crel) for crel in crels)

def print_sentence_tags(words, tags):
    """Render a sentence with each word prefixed by its digit-initial codes.

    Args:
        words: The sentence's words.
        tags: One entry per word, either a single tag string or an iterable of tag strings.

    Returns:
        The words joined by single spaces. Every word is preceded by "<code>:" for each of
        its tags that, after `to_short_tag`, starts with a digit and contains no "->".
        Codes are emitted in sorted order so the output is identical from run to run.

    Raises:
        AssertionError: if `words` and `tags` differ in length.
    """
    assert len(words) == len(tags)
    rendered = []
    for wd, t in zip(words, tags):
        if isinstance(t, str):
            # A bare string is one tag, not an iterable of characters
            t = {t}
        short_tags = {to_short_tag(tt) for tt in t}
        # sorted(): iteration order of a set of strings is not stable across interpreter runs.
        # tag[:1] keeps an empty tag from raising IndexError.
        codes = [tag for tag in sorted(short_tags) if "->" not in tag and tag[:1].isdigit()]
        rendered.append("".join(code + ":" for code in codes) + wd)
    return " ".join(rendered).strip()

In [156]:
# Print a detailed report for the first 20 test sentences that contain a false negative
ixs = set(fn_ixs[:20])
essays = pred_tagged_essays_test
assert len(crels_by_sent) == len(pred_crels_by_sent), (len(crels_by_sent), len(pred_crels_by_sent))

# `prev_codes` is appended to at the end of the loop but was never created anywhere in this
# notebook, so the cell raised NameError on a fresh kernel. Create it here.
prev_codes = []

# sent_ix is the flat sentence index across all essays (it matches crels_by_sent);
# six is the sentence index within the current essay.
sent_ix = -1
for e in essays:
    for six, sent in enumerate(e.sentences):
        sent_ix += 1
        if sent_ix not in ixs:
            continue

        act, pred = crels_by_sent[sent_ix], pred_crels_by_sent[sent_ix]

        pred_codes = {c for c in e.pred_tagged_sentences[six] if c != EMPTY}
        act_codes = get_act_cc_tags(six, e)
        all_act_codes = get_all_cc_tags(six, e)

        false_neg = act - pred
        false_pos = pred - act
        true_pos = act.intersection(pred)
        words, cc_tags = zip(*sent)
        print(sent_ix, e.name, six)
        print("ACT:  ", print_sentence_tags(words, cc_tags))
        print("PRED: ", print_sentence_tags(words, e.pred_tagged_sentences[six]))
        print("ACT CREL:", print_crels_probs(act, False))
        print("TP      :", print_crels_probs(true_pos))
        print("FP      :", print_crels_probs(false_pos))
        print("FN      :", print_crels_probs(false_neg))
        print("ACT  CC :", act_codes)
        print("PRED CC :", pred_codes)
        print("ALL ACT :", all_act_codes)
        # Neighbouring sentences, in case one side of the relation sits outside this sentence
        if six > 0:
            print("PREV ACT:", get_all_cc_tags(six-1, e))
        if six < len(e.sentences) - 1:
            print("NEXT ACT:", get_all_cc_tags(six+1, e))
        print()

        prev_codes.append(all_act_codes)

13 EBA1415_BGJD_1_CB_ES-05725.ann 3
ACT:   corals depend on clear , shallow , tropical waters , coral tissues , 7:algae 7:called 7:zooxanthellae 5:need 5:light 5:for 5:the 5:process 5:of 5:photosynthesis .
PRED:  corals depend on clear , shallow , tropical waters , coral tissues , algae called zooxanthellae need light for the process of photosynthesis .
ACT CREL: 5->7
TP      : 
FP      : 
FN      : 5->7 - 0.004
ACT  CC : set()
PRED CC : set()
ALL ACT : {'explicit', 'Result', 'Causer', 'Result:7', 'Causer:5'}
PREV ACT: set()
NEXT ACT: {'Result:5b', 'explicit', 'Result', 'Causer', 'Causer:4'}

14 EBA1415_BGJD_1_CB_ES-05725.ann 4
ACT:   corals 4:need 4:chemicals ( 4:co2 ) that provide 5b:coral 5b:with 5b:the 5b:energy 5b:it 5b:needs 5b:to 5b:survive .
PRED:  corals need chemicals ( co2 ) that provide coral with the energy it needs to survive .
ACT CREL: 4->5b
TP      : 
FP      : 
FN      : 4->5b - 0.006
ACT  CC : set()
PRED CC : set()
ALL ACT : {'Result:5b', 'explicit', 'Result', 'Cause

## Notes / Observations
- Knowing the prior probability of each predicted Crel should definitely be useful to the model. Some predicted Crels are never observed in the training data, and a few are in the wrong direction (the predicted direction is either never observed or is less likely than the reverse).
- Similarly, often when the CC tagger predicts codes the causal model connects them together incorrectly. Awareness of the confidence of those predictions is probably useful.
- Sometimes words like "because" appear in the sentence but not between the two codes, so the model doesn't pick up the causality.
- As observed before, occasionally one of the codes in the causal relation appears in the previous or subsequent sentence, but this doesn't appear to be that common. A lot of the errors are either the model not predicting a CC code, or predicting codes that aren't there.
- Make sure there is a feature (add one if it does not already exist) that encodes the order of the codes in the sentence, since that order often, but not always, indicates the direction of the causality.

## Count the Max Number of Predicted Tags Per Essay

In [165]:
essays = pred_tagged_essays_train

# One entry per essay: the number of distinct actual / predicted concept codes.
# NOTE: `pred_count` held an integer in the error tally cell; it is rebound to a list here.
act_count, pred_count = [], []
for e in essays:
    unique_act, unique_pred = set(), set()
    for six, _sent in enumerate(e.sentences):
        unique_pred.update(c for c in e.pred_tagged_sentences[six] if c != EMPTY)
        unique_act.update(get_act_cc_tags(six, e))
    act_count.append(len(unique_act))
    pred_count.append(len(unique_pred))

In [174]:
# Actual codes per essay: (mean over all essays, mean over essays with at least one code, max)
np.mean(act_count), np.mean([c for c in act_count if c]), np.max(act_count)

(4.128603104212861, 4.251141552511416, 11)

In [176]:
# Predicted codes per essay: (mean over all essays, mean over essays with at least one code, max)
np.mean(pred_count), np.mean([c for c in pred_count if c]), np.max(pred_count)

(4.452328159645233, 4.568828213879408, 12)