### Changes For SC
- Change the parameters for anaphora resolution
- Use Stanford parser
- Update the files used for the tagged essays (different RNN params)

In [1]:
import dill
import pandas as pd

from Settings import Settings
from collections import defaultdict
from BrattEssay import ANAPHORA
from window_based_tagger_config import get_config
from results_procesor import ResultsProcessor, __MICRO_F1__

In [2]:
# Load the shared code from the results folder
import sys
sys.path.append("/Users/simon.hughes/GitHub/NlpResearch/PythonNlpResearch/notebooks/Co-Reference Resolution/Results")

from results_common import get_essays, validate_essays, tally_essay_attributes


In [3]:
# Dataset selection and folder layout for this run.
settings = Settings()

DATASET = "SkinCancer"

# Every input lives under <data_directory><DATASET>/Thesis_Dataset/
root_folder = "".join([settings.data_directory, DATASET, "/Thesis_Dataset/"])
training_folder = root_folder + "Training/"
test_folder = root_folder + "Test/"
rnn_predictions_folder = root_folder + "Predictions/Bi-LSTM-4-SEARN/"

# Co-reference predictions: one folder per resolver
stanford_coref_predictions_folder = root_folder + "CoReference/"
berkeley_coref_predictions_folder = stanford_coref_predictions_folder + "Berkeley/"

config = get_config(training_folder)

# A second config pointed at the test folder, so that metrics can be
# computed over the test CR labels.
test_config = get_config(test_folder)

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/


# Set Optimal Parameters (from Hyper Parameter Tuning)

In [8]:
# Anaphora-resolution settings for this run (per the section heading, taken from hyper-parameter tuning).
# Berkeley best for CB
# The Stanford predictions folder is the one selected here.
coref_predictions_folder = stanford_coref_predictions_folder

# Forwarded to get_coref_processed_essays in the later cells.
filter_to_predicted_tags = True

nearest_ref_only = True
# Keys into dict_pos_filter / dict_pos_ch_filter (built in a later cell); the "None" key maps to None there.
pos_ana_key =     "None"
pos_ch_key  =     "None"
# Phrase-length limits forwarded to get_coref_processed_essays.
max_ana_phrase_len = None
max_cref_phrase_len = 10

# Display the selected folder.
coref_predictions_folder

'/Users/simon.hughes/Google Drive/Phd/Data/SkinCancer/Thesis_Dataset/CoReference/'

In [9]:
# folder for the output (input to the CRel parser model)
# coref_root is the "CoReference/" sub-folder of root_folder; the essays serialised at the
# end of the notebook are written under <coref_root>CRel.
coref_root = root_folder + "CoReference/"
output_folder = coref_root + "CRel"
# Display the resolved output path.
output_folder

'/Users/simon.hughes/Google Drive/Phd/Data/SkinCancer/Thesis_Dataset/CoReference/CRel'

In [10]:
# Load the co-reference essays for both partitions from the selected predictions folder,
# using the get_essays helper imported from results_common.
print("Co-Ref folder:", coref_predictions_folder)

coref_train_essays = get_essays(coref_predictions_folder, "Training")
coref_test_essays = get_essays(coref_predictions_folder, "Test")

# Number of essays loaded per partition.
len(coref_train_essays), len(coref_test_essays)

Co-Ref folder: /Users/simon.hughes/Google Drive/Phd/Data/SkinCancer/Thesis_Dataset/CoReference/
Found file /Users/simon.hughes/Google Drive/Phd/Data/SkinCancer/Thesis_Dataset/CoReference/training_processed.dill
Found file /Users/simon.hughes/Google Drive/Phd/Data/SkinCancer/Thesis_Dataset/CoReference/test_processed.dill


(870, 218)

In [12]:
rnn_predictions_folder

'/Users/simon.hughes/Google Drive/Phd/Data/SkinCancer/Thesis_Dataset/Predictions/Bi-LSTM-4-SEARN/'

In [11]:
def _load_dill(path):
    """Deserialise and return the object stored in the dill file at `path`.

    NOTE(review): dill, like pickle, can execute arbitrary code while loading.
    Only point this at files produced by this project.
    """
    with open(path, "rb") as f:
        return dill.load(f)


# The train and test predictions come from the same RNN configuration, so the file
# names differ only in the partition token. Keeping the parameter suffix in one place
# means a change of RNN params cannot be applied to one partition and not the other.
_RNN_PARAMS_SUFFIX = "_bi_directional-True_hidden_size-256_merge_mode-sum_num_rnns-2_use_pretrained_embedding-True.dill"

train_fname = rnn_predictions_folder + "essays_train" + _RNN_PARAMS_SUFFIX
pred_tagged_essays_train = _load_dill(train_fname)

test_fname = rnn_predictions_folder + "essays_test" + _RNN_PARAMS_SUFFIX
pred_tagged_essays_test = _load_dill(test_fname)

# Number of essays loaded per partition.
len(pred_tagged_essays_train), len(pred_tagged_essays_test)

(870, 218)

In [13]:
from results_procesor import is_a_regular_code

# Count every tag of interest across the RNN-tagged train and test essays:
#   reg_tally      - tags accepted by is_a_regular_code
#   crel_tally     - relation tags ("->") that do not mention "ana"
#   crel_ana_tally - relation tags ("->") that contain ANAPHORA
reg_tally = defaultdict(int)
crel_tally = defaultdict(int)
crel_ana_tally = defaultdict(int)

for essay in pred_tagged_essays_train + pred_tagged_essays_test:
    for sentence in essay.sentences:
        for _word, word_tags in sentence:
            for tag in word_tags:
                tag_lc = tag.lower()
                # rhetorical / change / other tags are excluded from all three tallies
                if any(marker in tag_lc for marker in ("rhet", "change", "other")):
                    continue
                if is_a_regular_code(tag):
                    reg_tally[tag] += 1
                if "->" not in tag:
                    continue
                if "ana" not in tag_lc:
                    crel_tally[tag] += 1
                if ANAPHORA in tag:
                    crel_ana_tally[tag] += 1


reg_tags = sorted(reg_tally)
crel_tags = sorted(crel_tally)

# Tags compared when validating the two essay sources against each other
cc_crel_tags_filter = set(reg_tags + crel_tags)

In [17]:
def names_the_same(essay_sets):
    """Assert that every essay collection in `essay_sets` covers the same essay names.

    Each collection is reduced to the set of its essays' `name` values. The size of
    each set is printed, and every set is compared against every other one.

    Raises:
        AssertionError: "lens don't match" when two sets differ in size, or
            "don't match" when they have the same size but different members.
    """
    name_sets = [{essay.name for essay in collection} for collection in essay_sets]
    for current in name_sets:
        print(len(current))
        for other in name_sets:
            assert len(current) == len(other), "lens don't match"
            assert current == other, "don't match"

In [18]:
def essays_2_hash_map(essays):
    """Return a dict mapping each essay's `name` to the essay itself.

    When several essays share a name, the last one in iteration order is kept.
    """
    return {essay.name: essay for essay in essays}

In [19]:
# checks the number of words and sentences are the same for 2 sets of essays
def validate_tagged_essays(essays_a, essays_b, tags_filter):
    """Assert that two essay collections line up essay-by-essay and word-by-word.

    Essays are paired by `name`. For every pair, the number of sentences, the length
    of each sentence, each word, and the tags that fall inside `tags_filter` must agree.

    Args:
        essays_a: collection of essays; each exposes `name` and `sentences`, where a
            sentence is a sequence of (word, tags) pairs.
        essays_b: second collection; must be a different object from `essays_a`.
        tags_filter: set of tags to compare. Tags outside it are ignored, because the
            untagged essays contain additional anaphora tags.

    Returns:
        None. Prints a progress line and "Validation Passed" on success.

    Raises:
        AssertionError: on the first discrepancy found.
    """
    # Identity rather than equality: the aim is to catch a collection being validated
    # against itself, and `!=` would depend on whatever __eq__ the essays define.
    assert essays_a is not essays_b, "essays_a and essays_b are the same object"
    print("Validating", len(essays_a), "essays")
    assert len(essays_a) == len(essays_b), "Lens don't match"

    a_hmap = essays_2_hash_map(essays_a)
    b_hmap = essays_2_hash_map(essays_b)

    # same essays?
    assert a_hmap.keys() == b_hmap.keys(), "Essay names don't match"
    assert len(a_hmap) > 1

    for key, a_essay in a_hmap.items():
        b_essay = b_hmap[key]
        # assert NOT the same obj ref
        assert a_essay is not b_essay, \
            "Essay {essay} is the same object in both collections".format(essay=key)
        assert len(a_essay.sentences) == len(b_essay.sentences), \
            "Sentence counts don't match: {a} - {b}, Essay: {essay}".format(
                a=len(a_essay.sentences), b=len(b_essay.sentences), essay=key)
        assert len(a_essay.sentences) > 0, "Essay {essay} has no sentences".format(essay=key)

        for i, (a_sent, b_sent) in enumerate(zip(a_essay.sentences, b_essay.sentences)):
            # The diagnostic goes in the assertion message; a print placed after the
            # assert could never run.
            assert len(a_sent) == len(b_sent), \
                "Sentence lengths don't match: {a} - {b}, Essay: {essay} Sent Ix: {i}".format(
                    a=len(a_sent), b=len(b_sent), essay=key, i=i)

            for (a_wd, a_tags), (b_wd, b_tags) in zip(a_sent, b_sent):
                assert a_wd == b_wd, \
                    "Words don't match: '{a}' - '{b}', Esssay: {essay} Sent Ix: {i}".format(
                        a=a_wd, b=b_wd, essay=key, i=i)

                # SH - Make conditional, as untagged essays contain new anaphora tags
                filtered_a_tags = tags_filter.intersection(a_tags)
                filtered_b_tags = tags_filter.intersection(b_tags)

                assert filtered_a_tags == filtered_b_tags, \
                    "Tags don't match: '{a}' - '{b}', Esssay: {essay} Sent Ix: {i}".format(
                        a=str(filtered_a_tags), b=str(filtered_b_tags), essay=key, i=i)

    print("Validation Passed")
    return None

## Get Non Anaphora Labels for Comparison (Should Match Across Essays)

In [20]:
names_the_same([coref_train_essays, pred_tagged_essays_train])

870
870


In [21]:
names_the_same([coref_test_essays, pred_tagged_essays_test])

218
218


In [22]:
validate_tagged_essays(essays_a=coref_train_essays, essays_b=pred_tagged_essays_train,
                       tags_filter=cc_crel_tags_filter)

Validating 870 essays
Validation Passed


In [23]:
validate_tagged_essays(essays_a=coref_test_essays, essays_b=pred_tagged_essays_test,
                       tags_filter=cc_crel_tags_filter)

Validating 218 essays
Validation Passed


## Combine Predictions from Crel Essays with CoRef Data from CoRef Essays

In [24]:
# Take the coref essays (used for predictions), and copy over the prediction tags from the
# crel essays. We do this as we also need the Anaphora labels from the CoRef data
def combine_essays(crel_essays, coref_essays):
    """Merge the two essay sources into one new list of essays, paired by name.

    For each CRel essay, the co-reference essay with the same name is cloned. The
    clone is given the co-reference essay's anaphora / coref-id / NER / POS data,
    and its `pred_tagged_sentences` is taken from the CRel essay.

    Returns:
        list of the combined essays, in the iteration order of the CRel essays.

    Raises:
        KeyError: when a CRel essay name is missing from the co-reference essays.
    """
    # attributes carried over from the co-reference essay
    coref_attrs = (
        "ana_tagged_sentences",
        "pred_corefids",
        "pred_ner_tags_sentences",
        "pred_pos_tags_sentences",
    )

    crel_by_name = essays_2_hash_map(crel_essays)
    coref_by_name = essays_2_hash_map(coref_essays)

    combined_essays = []
    for name, crel_essay in crel_by_name.items():
        coref_essay = coref_by_name[name]
        # start from the coref essay so its anaphora labels are present
        combined = coref_essay.clone()
        for attr in coref_attrs:
            setattr(combined, attr, getattr(coref_essay, attr))

        # the predicted tags come from the crel essay instead
        combined.pred_tagged_sentences = crel_essay.pred_tagged_sentences
        combined_essays.append(combined)
    return combined_essays

In [25]:
merged_train_essays = combine_essays(crel_essays=pred_tagged_essays_train, coref_essays=coref_train_essays)
merged_test_essays  = combine_essays(crel_essays=pred_tagged_essays_test,  coref_essays=coref_test_essays)
len(merged_train_essays), len(merged_test_essays)

(870, 218)

## Get Anaphora Tags

In [26]:
# Tally the plain "Anaphor:[code]" tags over all merged essays. Relation tags ("->")
# and tags mentioning "other" or "rhetorical" are left out.
all_merged_essays = merged_train_essays + merged_test_essays
ana_tally = defaultdict(int)
for essay in all_merged_essays:
    for sentence in essay.sentences:
        for _word, word_tags in sentence:
            for tag in word_tags:
                if ANAPHORA not in tag or "Anaphor:[" not in tag:
                    continue
                if "other" in tag or "rhetorical" in tag or "->" in tag:
                    continue
                ana_tally[tag] += 1

all_ana_tags = sorted(ana_tally)
assert len(all_ana_tags) == len(reg_tags), "Number of anaphora tags doesn't match the number of regular tags"
all_ana_tags

['Anaphor:[11]',
 'Anaphor:[12]',
 'Anaphor:[1]',
 'Anaphor:[2]',
 'Anaphor:[3]',
 'Anaphor:[4]',
 'Anaphor:[50]',
 'Anaphor:[5]',
 'Anaphor:[6]']

In [27]:
# Re-count the relation tags that involve an anaphor, this time over the merged essays.
# Tags mentioning rhetorical / change / other are skipped.
crel_ana_tally = defaultdict(int)

for essay in all_merged_essays:
    for sentence in essay.sentences:
        for _word, word_tags in sentence:
            for tag in word_tags:
                tag_lc = tag.lower()
                if any(marker in tag_lc for marker in ("rhetorical", "change", "other")):
                    continue
                if "->" not in tag or ANAPHORA not in tag:
                    continue
                crel_ana_tally[tag] += 1
crel_ana_tally

defaultdict(int,
            {'Causer:1->Result:Anaphor': 394,
             'Causer:1->Result:Anaphor[2]': 6,
             'Causer:1->Result:Anaphor[3]': 18,
             'Causer:1->Result:Anaphor[50]': 370,
             'Causer:11->Result:Anaphor': 61,
             'Causer:11->Result:Anaphor[12]': 3,
             'Causer:11->Result:Anaphor[3]': 5,
             'Causer:11->Result:Anaphor[50]': 53,
             'Causer:12->Result:Anaphor': 44,
             'Causer:12->Result:Anaphor[2]': 3,
             'Causer:12->Result:Anaphor[3]': 15,
             'Causer:12->Result:Anaphor[50]': 26,
             'Causer:2->Result:Anaphor': 602,
             'Causer:2->Result:Anaphor[1]': 8,
             'Causer:2->Result:Anaphor[3]': 29,
             'Causer:2->Result:Anaphor[4]': 36,
             'Causer:2->Result:Anaphor[50]': 483,
             'Causer:2->Result:Anaphor[5]': 43,
             'Causer:3->Result:Anaphor': 321,
             'Causer:3->Result:Anaphor[4]': 21,
             'Causer:3->R

In [28]:
pos_tally = tally_essay_attributes(all_merged_essays, attribute_name="pred_pos_tags_sentences")

In [29]:
# Group the POS tags seen in the essays into the families used by the POS filters.
# for meaning of pen treebank tags - https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
pos_nouns = {pos for pos in pos_tally.keys() if pos.strip().startswith("NN")}
pos_verbs = {pos for pos in pos_tally.keys() if pos.strip().startswith("VB")}
pos_pronouns = {"PRP", "PRP$", "WP", "WP$"}
pos_determiners = {"DT", "WDT", "PDT"}  # the, a, which, that, etc
pos_pron_dt = pos_pronouns.union(pos_determiners)
pos_nouns, pos_verbs, pos_pronouns, pos_determiners, pos_pron_dt

({'NN', 'NNP', 'NNPS', 'NNS'},
 {'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'},
 {'PRP', 'PRP$', 'WP', 'WP$'},
 {'DT', 'PDT', 'WDT'},
 {'DT', 'PDT', 'PRP', 'PRP$', 'WDT', 'WP', 'WP$'})

In [30]:
# Lookup tables from a short key to a set of POS tags; the "None" key maps to None.
# dict_pos_filter is indexed with pos_ana_key and dict_pos_ch_filter with pos_ch_key
# in the evaluation cell below.
dict_pos_filter = {
            "None": None,
            "PRN": pos_pronouns,
            "DT": pos_determiners,
            "PRN+DT": pos_pron_dt
}

dict_pos_ch_filter = {
    "None": None,
    "NN": pos_nouns,
    "VB": pos_verbs,
    "NN+VB": pos_nouns | pos_verbs
}

## How Well Would Anaphora Resolution Work with the CRel Predictions?

In [32]:
from process_essays_coref import get_coref_processed_essays
from metrics import get_metrics_raw

# Evaluation pass: run the co-reference processing over the merged training essays and
# report the micro-averaged metrics against the anaphora tags collected above.
format_ana_tags = True # use this as true to eval performance, but then change to False for the actual exercise
# NOTE(review): this repeats the value already set in the parameters cell.
filter_to_predicted_tags = True # filter based on the anaphora predictions from the other RNN

# Resolve the POS filter keys chosen in the parameters cell to their tag sets (or None).
pos_ana_filter = dict_pos_filter[pos_ana_key]
pos_ch_filter  = dict_pos_ch_filter[pos_ch_key]
    
processed_train_essays_ana = get_coref_processed_essays(
                            essays=merged_train_essays, 
                            format_ana_tags=format_ana_tags, 
                            ner_ch_filter=None, look_back_only=True,
                            filter_to_predicted_tags=filter_to_predicted_tags, 
                            max_ana_phrase_len=max_ana_phrase_len, max_cref_phrase_len=max_cref_phrase_len, 
                            pos_ana_filter=pos_ana_filter, pos_ch_filter=pos_ch_filter, 
                            nearest_ref_only=nearest_ref_only)

# NOTE(review): __MICRO_F1__ is imported from results_procesor at the top of the notebook, but the
# literal key "MICRO_F1" is used here - confirm the two agree.
metrics = get_metrics_raw(processed_train_essays_ana, expected_tags=all_ana_tags,  micro_only=True)    
pd.DataFrame([metrics["MICRO_F1"]])

Unnamed: 0,accuracy,data_points,f1_score,num_codes,precision,recall
0,0.999652,1309239.0,0.123077,473.0,0.680851,0.067653


## Get the Final Set of Essays
- Add in new predicted Anaphora tags as additional regular codes

In [50]:
format_ana_tags = False # Set to false so Anaphora codes are merged in with the regular codes

# One shared argument set for both partitions, so the train and test essays are always
# processed with identical settings (the two calls previously repeated every argument).
coref_processing_kwargs = dict(
    format_ana_tags=format_ana_tags,
    ner_ch_filter=None,
    look_back_only=True,
    filter_to_predicted_tags=filter_to_predicted_tags,
    max_ana_phrase_len=max_ana_phrase_len,
    max_cref_phrase_len=max_cref_phrase_len,
    pos_ana_filter=pos_ana_filter,
    pos_ch_filter=pos_ch_filter,
    nearest_ref_only=nearest_ref_only,
)

processed_train_essays_full = get_coref_processed_essays(
    essays=merged_train_essays, **coref_processing_kwargs)

processed_test_essays_full = get_coref_processed_essays(
    essays=merged_test_essays, **coref_processing_kwargs)

## Validate there are Differences in the New Essays

In [72]:
# Walk the merged and the processed training essays in parallel and print every word
# whose predicted tag set changed, together with that word's anaphora tag.
EMPTY = 'Empty'
for merged_essay, processed_essay in zip(merged_train_essays, processed_train_essays_full):
    assert len(merged_essay.sentences) == len(processed_essay.sentences)
    assert merged_essay.name == processed_essay.name

    assert len(merged_essay.pred_tagged_sentences) == len(processed_essay.pred_tagged_sentences)

    sentence_pairs = zip(merged_essay.pred_tagged_sentences, processed_essay.pred_tagged_sentences)
    for sent_ix, (atag_sent, btag_sent) in enumerate(sentence_pairs):
        for word_ix, (atag, btags) in enumerate(zip(atag_sent, btag_sent)):
            # the merged essays hold a single tag per word; EMPTY stands for "no tag"
            atags = {atag}
            atags.discard(EMPTY)
            if atags != btags:
                print(atags, btags, processed_essay.ana_tagged_sentences[sent_ix][word_ix])

set() {'50'} Anaphor
set() {'1'} Anaphor
set() {'1'} Anaphor
set() {'7'} Anaphor
set() {'14'} Anaphor
set() {'13'} Anaphor
set() {'13'} Anaphor
set() {'50'} Anaphor
set() {'50'} Anaphor
set() {'1'} Anaphor
set() {'3'} Anaphor
set() {'3'} Anaphor
set() {'14'} Anaphor
set() {'50'} Anaphor
set() {'1'} Anaphor
set() {'13'} Anaphor
set() {'6'} Anaphor
set() {'14'} Anaphor
set() {'7'} Anaphor
set() {'50'} Anaphor
set() {'11'} Anaphor
set() {'14'} Anaphor
set() {'1'} Anaphor
set() {'13'} Anaphor
set() {'1'} Anaphor


## Add in Additional Crel Codes (from Anaphora Codes mapped to Regular Crel Codes)

In [146]:
from results_procesor import is_a_regular_code

def get_anaphora_crel_codes(tags):
    """Turn resolved-anaphor relation tags into plain causal-relation codes.

    A tag qualifies when it contains "Anaphor[" and does not mention rhetorical,
    other or change. The "Anaphor[" marker and every "]" are stripped from it. If
    the result no longer contains ANAPHORA, it is split on "->" and both sides
    (minus the "Causer:" / "Result:" labels) must pass is_a_regular_code.

    Args:
        tags: iterable of tag strings.

    Returns:
        set of the rewritten relation codes.

    Raises:
        AssertionError: when either side of a rewritten relation is not a regular code.
    """
    marker = "Anaphor["
    excluded_words = ("rhetorical", "other", "change")

    resolved_codes = set()
    for tag in tags:
        if marker not in tag:
            continue  # no anaphor with an identified code
        tag_lc = tag.lower()
        if any(word in tag_lc for word in excluded_words):
            continue
        plain = tag.replace(marker, "").replace("]", "")
        if ANAPHORA in plain:
            continue  # one side is still an unresolved anaphor
        causer, result = plain.split("->")
        causer_code = causer.replace("Causer:", "")
        result_code = result.replace("Result:", "")
        assert is_a_regular_code(causer_code), causer_code
        assert is_a_regular_code(result_code), result_code
        resolved_codes.add(plain)
    return resolved_codes

add_crel_codes = get_anaphora_crel_codes(crel_ana_tally.keys())
# did we add any new unique tags?
add_crel_codes - set(crel_tags)

{'Causer:11->Result:1',
 'Causer:14->Result:11',
 'Causer:2->Result:4',
 'Causer:2->Result:7',
 'Causer:7->Result:6'}

In [147]:
def add_crel_ana_codes_as_regular_relations(essays):
    """Return clones of `essays` whose word tags also carry the anaphora-derived relation codes.

    Each essay is cloned, and the coref / prediction attributes are copied across by
    reference. `sentences` is rebuilt so that every word's tag set is the original
    tags plus whatever get_anaphora_crel_codes returns for them. The input essays'
    `sentences` are not modified.

    Returns:
        list of the new essays, in input order.
    """
    carried_over = (
        "ana_tagged_sentences",
        "pred_corefids",
        "pred_ner_tags_sentences",
        "pred_pos_tags_sentences",
        "pred_tagged_sentences",
    )

    augmented_essays = []
    for essay in essays:
        augmented = essay.clone()
        augmented_essays.append(augmented)

        for attr in carried_over:
            setattr(augmented, attr, getattr(essay, attr))

        augmented.sentences = []
        for sentence in essay.sentences:
            tagged_words = [
                (word, set(word_tags) | get_anaphora_crel_codes(word_tags))
                for word, word_tags in sentence
            ]
            assert len(tagged_words) == len(sentence)
            augmented.sentences.append(tagged_words)
        assert len(augmented.sentences) == len(essay.sentences)
    return augmented_essays

In [149]:
processed_train_essays_full_with_codes = add_crel_ana_codes_as_regular_relations(processed_train_essays_full)
processed_test_essays_full_with_codes  = add_crel_ana_codes_as_regular_relations(processed_test_essays_full)

len(processed_train_essays_full_with_codes), len(processed_test_essays_full_with_codes)

(902, 226)

In [151]:
# make sure the new essays are different - this should blow up (it does)
# validate_tagged_essays(processed_train_essays_full, processed_train_essays_full_with_codes, set(crel_tags))
# validate_tagged_essays(processed_test_essays_full, processed_test_essays_full_with_codes, set(crel_tags))

In [152]:
rnn_predictions_folder

'/Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/Thesis_Dataset/Predictions/Bi-LSTM-4-SEARN/'

In [153]:
coref_predictions_folder

'/Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/Thesis_Dataset/CoReference/Berkeley/'

## Serialize the Results

In [162]:
import pathlib

# Make sure the output folder exists before serialising; creating missing parents and
# tolerating an existing folder keeps the cell safe to re-run.
p = pathlib.Path(output_folder)
p.mkdir(parents=True, exist_ok=True)

In [173]:
import dill
# Serialise the training essays (with the added anaphora-derived relation codes) into the output folder.
fname = output_folder + "/training_crel_anatagged_essays.dill"
with open(fname, "wb+") as f:
    dill.dump(processed_train_essays_full_with_codes, f)

In [174]:
import dill
# Serialise the test essays (with the added anaphora-derived relation codes) into the output folder.
fname = output_folder + "/test_crel_anatagged_essays.dill"
with open(fname, "wb+") as f:
    dill.dump(processed_test_essays_full_with_codes, f)