## Goals
 - Take the merged predictions and evaluate the prediction accuracy using the 2 different approaches
 1. Look at the anaphora tags and then cross-reference co-reference labels
 2. Use the co-reference chains directly

In [1]:
import dill
from FindFiles import find_files
from Settings import Settings
from CoRefHelper import EMPTY
from collections import defaultdict
from BrattEssay import ANAPHORA

# Which dataset and partition to evaluate. DATASET selects the data folder below;
# PARTITION is used in the next cell to filter the merged-prediction files.
DATASET = "CoralBleaching" # CoralBleaching | SkinCancer
PARTITION = "Training" # Training | Test

# NOTE(review): constructing Settings() appears to print the configured
# directories (see the cell output) - confirm against the Settings class.
settings = Settings()
# Resulting layout: <data_directory><DATASET>/Thesis_Dataset/CoReference/
root_folder = settings.data_directory + DATASET + "/Thesis_Dataset/"
merged_predictions_folder = root_folder + "CoReference/"

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/


In [2]:
# Locate the merged-prediction file for the chosen partition and load it.
essay_files = find_files(merged_predictions_folder)
if PARTITION == "Training":
    # keep only paths that contain the (case-sensitive) substring "train"
    essay_files = [e for e in essay_files if "train" in e]
# NOTE(review): no filter is applied when PARTITION == "Test", so for that
# partition the assert below only passes if the folder holds exactly one file.
assert len(essay_files) == 1
# NOTE(review): dill.load can execute arbitrary code while unpickling - only
# load files from a trusted source.
with open(essay_files[0], "rb") as f:
    essays = dill.load(f)
# notebook display: number of essays loaded
len(essays)

902

In [3]:
essay_files

['/Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/Thesis_Dataset/CoReference/training_processed.dill']

### Validate the Lengths

In [162]:
def validate_essays(essays):
    """ Sanity-checks that every per-word annotation layer of every essay is
        aligned with the essay's sentences.

        For each sentence index, the sentence and the matching entries of
        ana_tagged_sentences, pred_corefids, pred_ner_tags_sentences,
        pred_pos_tags_sentences and pred_tagged_sentences must all have the
        same, non-zero length. Raises AssertionError otherwise; on a general
        length mismatch the error carries the six lengths, the essay name and
        the sentence index.
    """
    layer_names = (
        "ana_tagged_sentences",
        "pred_corefids",
        "pred_ner_tags_sentences",
        "pred_pos_tags_sentences",
        "pred_tagged_sentences",
    )
    for essay in essays:
        for sent_ix, sentence in enumerate(essay.sentences):
            # fetch the matching row from every layer (same order as the report tuple)
            ana, coref, ner, pos, ptags = [
                getattr(essay, layer)[sent_ix] for layer in layer_names
            ]
            lengths = (len(sentence), len(ana), len(coref), len(ner), len(pos), len(ptags))

            # words vs. coref ids is checked on its own first (no message)
            assert lengths[0] == lengths[2]

            # every layer must agree; report all lengths plus location on failure
            assert len(set(lengths)) == 1, lengths + (essay.name, sent_ix)

            # empty sentences are not allowed
            assert lengths[0] > 0
            
validate_essays(essays)

In [84]:
def tally_essay_attributes(essays, attribute_name="pred_pos_tags_sentences"):
    """ Counts how often each tag occurs in a nested, per-word attribute of the essays.

        essays:         iterable of essay objects
        attribute_name: str - name of an essay attribute holding a nested list
                        (one inner list per sentence, one item per word)

        Each item is either a single tag (str) or a collection of tags
        (set, frozenset, list or tuple); every tag in a collection is counted once.

        Returns a defaultdict(int) mapping tag -> number of occurrences.
        Raises TypeError (a subclass of Exception) for any other item type.
    """
    tally = defaultdict(int)
    for e in essays:
        for lst in getattr(e, attribute_name):
            for item in lst:
                # isinstance (rather than type() ==) also accepts subclasses
                if isinstance(item, str):
                    tally[item] += 1
                elif isinstance(item, (set, frozenset, list, tuple)):
                    for i in item:
                        tally[i] += 1
                else:
                    raise TypeError(
                        "Unexpected item type {0} in attribute '{1}'".format(
                            type(item).__name__, attribute_name))
    return tally

In [85]:
ner_tally = tally_essay_attributes(essays, attribute_name="pred_ner_tags_sentences")
pos_tally = tally_essay_attributes(essays, attribute_name="pred_pos_tags_sentences")

In [86]:
# sorted(pos_tally.items())

## Look at the Anaphor Tags

In [5]:
from results_procesor import is_a_regular_code

# Tally the tags found on the words of every essay, split into three groups:
#   reg_tally - tags for which is_a_regular_code(t) is true
#   cr_tally  - anaphora tags containing "->"
#   cc_tally  - remaining anaphora tags of the form "Anaphor:[...]"
cc_tally = defaultdict(int)
cr_tally = defaultdict(int)
reg_tally = defaultdict(int)
for e in essays:
    for sent in e.sentences:
        # each sentence item is a (word, tags) pair
        for wd, tags in sent:
            for t in tags:
                if is_a_regular_code(t):
                    reg_tally[t] += 1
                # anaphora tags, ignoring any that mention "other"
                if ANAPHORA in t and "other" not in t:
                    if "->" in t:
                        cr_tally[t] += 1
                    elif "Anaphor:[" in t:
                        cc_tally[t] += 1

# Sorting is lexicographic on the tag string (hence 'Anaphor:[11]' precedes 'Anaphor:[1]').
reg_tags = sorted(reg_tally.keys())
all_ana_tags = sorted(cc_tally.keys())
# notebook display
all_ana_tags

['Anaphor:[11]',
 'Anaphor:[12]',
 'Anaphor:[13]',
 'Anaphor:[14]',
 'Anaphor:[1]',
 'Anaphor:[2]',
 'Anaphor:[3]',
 'Anaphor:[4]',
 'Anaphor:[50]',
 'Anaphor:[5]',
 'Anaphor:[5b]',
 'Anaphor:[6]',
 'Anaphor:[7]']

In [6]:
len(all_ana_tags), len(reg_tags)

(13, 13)

In [7]:
def build_chain(e):
    """ Builds, for a single essay object, a Dict[str, List[Tuple[int,int]]]
        mapping each coref id (essay scope) to the (sent_ix, wd_ix) position of
        every word carrying that id, in sentence/word order.
    """
    positions_by_id = defaultdict(list)
    for sent_ix, sentence in enumerate(e.sentences):
        sent_coref_ids = e.pred_corefids[sent_ix]
        for wd_ix in range(len(sentence)):
            position = (sent_ix, wd_ix)
            # a word may belong to several chains - Set[str]
            for coref_id in sent_coref_ids[wd_ix]:
                positions_by_id[coref_id].append(position)
    return positions_by_id

In [129]:
def build_segmented_chain(e):
    """ Builds, for a single essay object, a Dict[str, List[List[Tuple[int,int]]]]
        mapping each coref id (essay scope) to a nested list of (sent_ix, wd_ix) pairs.
        Each inner list is one distinct coreference seq/phrase: a run of positions
        in the same sentence whose word indexes are consecutive.
    """
    segmented_by_id = dict()
    for coref_id, positions in build_chain(e).items():
        phrases = [[positions[0]]]
        segmented_by_id[coref_id] = phrases
        # walk consecutive position pairs; a gap or a sentence change starts a new phrase
        for previous, current in zip(positions, positions[1:]):
            prev_sent_ix, prev_wd_ix = previous
            cur_sent_ix, cur_wd_ix = current
            same_sentence = cur_sent_ix == prev_sent_ix
            adjacent = (cur_wd_ix - prev_wd_ix) <= 1
            if not (same_sentence and adjacent):
                phrases.append([])
            phrases[-1].append(current)
    return segmented_by_id

# Spot check: print the segmented chains of one essay (index 2),
# one coref id per line followed by one indented line per phrase.
corefid_2_segmented_chain = build_segmented_chain(essays[2])
for cref, seg_chain in sorted(corefid_2_segmented_chain.items()):
    print(cref)
    for lst in seg_chain:
        print("\t", str(lst))

1
	 [(0, 23), (0, 24), (0, 25)]
2
	 [(7, 0), (7, 1)]
	 [(7, 8)]
3
	 [(2, 0), (2, 1)]
	 [(3, 2)]
	 [(3, 9)]


In [149]:
from processessays import Essay

def get_ana_tagged_essays(essays, format_ana_tags=True, filter_to_predicted_tags=True, look_back_only=True,
                         max_cref_phrase_len=None
                         ):
    """
    Build new Essay objects whose pred_tagged_sentences hold, per word, a Set[str]
    containing the word's own predicted concept code plus codes copied from the
    other members of its predicted co-reference chain(s).

    The input essays are not modified; each new Essay is created from the original's
    name and sentences.

    essays:                   List[Essay] objects - merged tagged essays
    format_ana_tags:          bool - Add copied codes as "<ANAPHORA>:[xyz]" (True) or as
                                     the plain concept codes (False)
    filter_to_predicted_tags: bool - Only copy codes onto words whose predicted
                                     anaphora tag equals ANAPHORA
    look_back_only:           bool - Only look to coreferences occurring earlier in the essay
    max_cref_phrase_len:      Union(int,None) - if specified, co-reference phrases longer
                                     than this are ignored (None or 0 disables the filter)

    Returns List[Essay], one per input essay, in the same order.
    Raises AssertionError if an output essay's tag sets do not line up with its sentences.
    """
    ana_tagged_essays = []
    for e in essays:

        ana_tagged_e = Essay(e.name, e.sentences)
        ana_tagged_e.pred_tagged_sentences = []
        ana_tagged_essays.append(ana_tagged_e)

        # map coref ids to phrases, each phrase a list of (sent_ix, wd_ix) tuples
        corefid_2_chain = build_segmented_chain(e)

        # now look for ana tags that are also corefs, and cross reference
        for sent_ix, sent in enumerate(e.sentences):
            ana_tagged_sent = []
            ana_tagged_e.pred_tagged_sentences.append(ana_tagged_sent)

            ana_tags = e.ana_tagged_sentences[sent_ix]
            coref_ids = e.pred_corefids[sent_ix]
            ptags = e.pred_tagged_sentences[sent_ix]

            for wd_ix in range(len(sent)):
                is_ana_tag = ana_tags[wd_ix] == ANAPHORA
                wd_coref_ids = coref_ids[wd_ix]  # Set[str]
                pred_cc_tag = ptags[wd_ix]

                wd_ptags = set()
                # add predicted concept code tag (filtered out by evaluation code, which filters to specific tags)
                if pred_cc_tag != EMPTY:
                    wd_ptags.add(pred_cc_tag)

                # appended up front; the same set is filled in below
                ana_tagged_sent.append(wd_ptags)

                # word is not part of any co-reference chain
                if not wd_coref_ids:
                    continue
                # when filtering, only augment words predicted to be anaphors
                if filter_to_predicted_tags and not is_ana_tag:
                    continue

                # Get codes for corresponding co-ref chain entries
                for cr_id in wd_coref_ids:
                    for cref_phrase in corefid_2_chain[cr_id]:
                        # optional phrase-length filter
                        if max_cref_phrase_len and len(cref_phrase) > max_cref_phrase_len:
                            continue
                        for ch_sent_ix, ch_wd_ix in cref_phrase:
                            # if it's the current word, skip
                            if ch_sent_ix == sent_ix and ch_wd_ix == wd_ix:
                                continue
                            # only look at chain positions before the current word:
                            # tuple comparison orders by sentence first, then by word
                            if look_back_only and (ch_sent_ix, ch_wd_ix) > (sent_ix, wd_ix):
                                continue

                            chain_ptag = e.pred_tagged_sentences[ch_sent_ix][ch_wd_ix]
                            if chain_ptag == EMPTY:
                                continue
                            if format_ana_tags:
                                code = "{anaphora}:[{code}]".format(
                                    anaphora=ANAPHORA, code=chain_ptag)
                            else:
                                code = chain_ptag
                            wd_ptags.add(code)
    # validation check
    #   check essay and sent lengths align
    for e in ana_tagged_essays:
        assert len(e.sentences) == len(e.pred_tagged_sentences)
        for ix in range(len(e.sentences)):
            assert len(e.sentences[ix]) == len(e.pred_tagged_sentences[ix])
    return ana_tagged_essays

In [139]:
from results_procesor import ResultsProcessor

# Modified version of the corresponding ResultsProcessor function so that it works with a
# Set[str] of predicted tags per word as well as with scalar strings
def get_wd_level_preds(essays, expected_tags):
    """ Collects word-level predicted labels for each expected tag.

        essays:        iterable of essay objects; pred_tagged_sentences[sent_ix]
                       holds one entry per word, either a single tag (str) or a
                       collection of tags (set, frozenset, list or tuple)
        expected_tags: iterable of tags to produce labels for

        Returns a defaultdict(list) mapping each expected tag to a list with one
        label per word, in essay/sentence/word order. Each label is whatever
        ResultsProcessor's private __get_label_ helper returns for
        (expected tag, set of predicted tags).
        Raises TypeError (a subclass of Exception) for an unsupported entry type.
    """
    expected_tags = set(expected_tags)
    # private (name-mangled) helper on ResultsProcessor; looked up once, outside the loops
    get_label = ResultsProcessor._ResultsProcessor__get_label_
    ysbycode = defaultdict(list)
    for e in essays:
        for sentix in range(len(e.sentences)):
            for tags in e.pred_tagged_sentences[sentix]:
                # normalise the word's prediction(s) to a set of tags
                if isinstance(tags, str):
                    ptag_set = {tags}
                elif isinstance(tags, (set, frozenset, list, tuple)):
                    ptag_set = set(tags)
                else:
                    raise TypeError(
                        "Unrecognized tag type: {0}".format(type(tags).__name__))
                for exp_tag in expected_tags:
                    ysbycode[exp_tag].append(get_label(exp_tag, ptag_set))
    return ysbycode

In [133]:
from results_procesor import metrics_to_df

def get_df(mean_metrics):
    """ Converts the metrics to a DataFrame restricted to the reporting columns,
        sorted by code, with every row whose code contains "MEAN" removed.
    """
    report_columns = ["code", "recall", "precision", "f1_score", "data_points"]
    frame = metrics_to_df(mean_metrics)[report_columns].sort_values("code")
    is_mean_row = frame["code"].str.contains("MEAN")
    return frame[~is_mean_row]

In [143]:
def get_metrics(essays, expected_tags, micro_only=False):
    """ Computes word-level metrics for the expected tags and returns them as a DataFrame.

        essays:        essays holding both the labels and the predicted tags
        expected_tags: tags to evaluate
        micro_only:    bool - if True, return only the "MICRO_F1" row
    """
    actual_by_code = ResultsProcessor.get_wd_level_lbs(essays, expected_tags=expected_tags)
    predicted_by_code = get_wd_level_preds(essays, expected_tags=expected_tags)
    metrics_frame = get_df(
        ResultsProcessor.compute_mean_metrics(actual_by_code, predicted_by_code))
    if not micro_only:
        return metrics_frame
    return metrics_frame[metrics_frame.code == "MICRO_F1"]

## Map CoRefs To Tags

In [144]:
# Approach 1 variants: filter_to_predicted_tags keeps its default (True), so codes are
# only copied onto words predicted as anaphors.
#   backwd - antecedents earlier in the essay only; both - earlier and later
ana_backwd_tagged_essays = get_ana_tagged_essays(essays, look_back_only=True)
ana_both_tagged_essays = get_ana_tagged_essays(essays, look_back_only=False)
# map new tags to existing labels, not anaphora labels
collapsed_ana_tagged_essays = get_ana_tagged_essays(essays, format_ana_tags=False)

In [178]:
# don't filter to anaphora tags
# Approach 2 variants: use the co-reference chains directly, for every word that is in
# a chain (no anaphor-tag filter). The first keeps the Anaphor-formatted tags, the
# second copies the plain concept codes.
cref_tagged_essays           = get_ana_tagged_essays(essays,
                                                     filter_to_predicted_tags=False, 
                                                     look_back_only=True) # worse if both directions
collapsed_cref_tagged_essays = get_ana_tagged_essays(essays, 
                                                     format_ana_tags=False,
                                                     filter_to_predicted_tags=False, 
                                                     look_back_only=True) # worse if both directions

# 1. Accuracy using Anaphora Predictions

## Compute Accuracy on Anaphora Tags Only - Word Level

#### Look Backwards Only

In [152]:
# Test
get_metrics(ana_backwd_tagged_essays , all_ana_tags,  micro_only=True)

Unnamed: 0,code,recall,precision,f1_score,data_points
17,MICRO_F1,0.020349,0.241379,0.037534,1783158.0


In [137]:
# look backwards only (assume the antecedent occurs before the anaphor)
get_metrics(ana_backwd_tagged_essays, all_ana_tags,  micro_only=True)

Unnamed: 0,code,recall,precision,f1_score,data_points
17,MICRO_F1,0.020349,0.241379,0.037534,1783158.0


#### Look Forwards and Backwards

In [138]:
# look forward and backward (lower as expected - note that this only lowers precision;
#   recall is unchanged, so every correctly resolved reference lies before its anaphor)
get_metrics(ana_both_tagged_essays, all_ana_tags, micro_only=True)

Unnamed: 0,code,recall,precision,f1_score,data_points
17,MICRO_F1,0.020349,0.175,0.036458,1783158.0


## Accuracy with No Anaphora Tagging (i.e. Baseline Regular CC Tagging)

In [91]:
get_metrics(essays, expected_tags=reg_tags, micro_only=True)

Unnamed: 0,code,recall,precision,f1_score,data_points
17,MICRO_F1,0.820049,0.846703,0.833163,1783158.0


## Accuracy with Anaphora Tagging (Regular Concept Codes) - Maps Anaphora[xyz] to Normal

In [111]:
get_metrics(collapsed_ana_tagged_essays, expected_tags=reg_tags, micro_only=True)

Unnamed: 0,code,recall,precision,f1_score,data_points
17,MICRO_F1,0.820049,0.84602,0.832832,1783158.0


#### For CB Training Data, it Mildly Hurts the F1 Score (very slightly)

# 2. Accuracy Using Cref Predictions Only

## Compute Accuracy on Anaphora Tags only

In [179]:
get_metrics(cref_tagged_essays, expected_tags=all_ana_tags)

Unnamed: 0,code,recall,precision,f1_score,data_points
11,Anaphor:[11],0.666667,0.024096,0.046512,137166.0
12,Anaphor:[12],0.0,0.0,0.0,137166.0
8,Anaphor:[13],0.064516,0.012903,0.021505,137166.0
9,Anaphor:[14],0.178571,0.022222,0.039526,137166.0
0,Anaphor:[1],0.161765,0.017107,0.030942,137166.0
6,Anaphor:[2],0.0,0.0,0.0,137166.0
2,Anaphor:[3],0.025641,0.002564,0.004662,137166.0
7,Anaphor:[4],0.0,0.0,0.0,137166.0
3,Anaphor:[50],0.204545,0.003123,0.006152,137166.0
4,Anaphor:[5],0.0,0.0,0.0,137166.0


## Accuracy with No Tagging

In [180]:
get_metrics(essays, expected_tags=reg_tags, micro_only=True)

Unnamed: 0,code,recall,precision,f1_score,data_points
17,MICRO_F1,0.820049,0.846703,0.833163,1783158.0


## Accuracy with Anaphora Tagging (Regular Concept Codes) - Maps Anaphora[xyz] to Normal

In [181]:
get_metrics(collapsed_cref_tagged_essays, expected_tags=reg_tags, micro_only=True)

Unnamed: 0,code,recall,precision,f1_score,data_points
17,MICRO_F1,0.826212,0.775586,0.800099,1783158.0


#### Quite a Negative Impact on F1 Score overall (CB Train - drops from .83 to 0.80)

# Add Additional Filters to Improve Accuracy

## Try Length Filter

In [168]:
# Baseline
get_metrics(ana_backwd_tagged_essays, expected_tags=all_ana_tags, micro_only=True)

Unnamed: 0,code,recall,precision,f1_score,data_points
17,MICRO_F1,0.020349,0.241379,0.037534,1783158.0


In [171]:
# precision is higher, but overall F1 is lower as recall dropped a lot
get_metrics(get_ana_tagged_essays(essays, max_cref_phrase_len=1), expected_tags=all_ana_tags, micro_only=True)

Unnamed: 0,code,recall,precision,f1_score,data_points
17,MICRO_F1,0.002907,0.333333,0.005764,1783158.0


In [182]:
# baseline
get_metrics(cref_tagged_essays, expected_tags=all_ana_tags, micro_only=True)

Unnamed: 0,code,recall,precision,f1_score,data_points
17,MICRO_F1,0.107558,0.006188,0.011703,1783158.0


In [186]:
#  Filter to single word references - better than with no length filter 
#  Note - this is still backwards looking
cref_tagged_essays_maxph_1 = get_ana_tagged_essays(essays, filter_to_predicted_tags=False, max_cref_phrase_len=1)
get_metrics(cref_tagged_essays_maxph_1, expected_tags=all_ana_tags, micro_only=True)

Unnamed: 0,code,recall,precision,f1_score,data_points
17,MICRO_F1,0.026163,0.015789,0.019694,1783158.0


In [None]:
# Length filter = 2 - worse than for length = 1
cref_tagged_essays_maxph_2 = get_ana_tagged_essays(essays, filter_to_predicted_tags=False, max_cref_phrase_len=2)
get_metrics(cref_tagged_essays_maxph_2, expected_tags=all_ana_tags, micro_only=True)

### TODO
- Try different filters on the cref chain mappings
- For the anaphora tag tagging approach, don't limit to looking backwards only
- Limit to NER tags, POS tag types, etc