In [2]:
import dill
import pandas as pd

from Settings import Settings
from collections import defaultdict
from BrattEssay import ANAPHORA

import sys
sys.path.append("/Users/simon.hughes/GitHub/NlpResearch/PythonNlpResearch/notebooks/Co-Reference Resolution/Results")

from results_common import get_essays, validate_essays, tally_essay_attributes
from process_essays_coref import get_coref_processed_essays, processed_essays_predict_most_recent_tag
from metrics import get_metrics_raw

# progress bar widget
from ipywidgets import IntProgress
from IPython.display import display

# Name of the dataset sub-folder under the data directory.
DATASET = "CoralBleaching" # CoralBleaching | SkinCancer

settings = Settings()
root_folder = settings.data_directory + DATASET + "/Thesis_Dataset/"
# One predictions folder per co-reference system.
stanford_coref_predictions_folder = root_folder + "CoReference/"
berkeley_coref_predictions_folder = root_folder + "CoReference/Berkeley/"
# Which algorithm?
coref_predictions_folder = berkeley_coref_predictions_folder
# Report the folder that is actually used for loading (previously this always
# printed the Stanford folder, even when the Berkeley predictions were selected).
print("CoRef Data: ", coref_predictions_folder)

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/
CoRef Data:  /Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/Thesis_Dataset/CoReference/


In [3]:
# Load the "Training" essays from the selected co-reference predictions folder.
training_essays = get_essays(coref_predictions_folder, "Training")

Found file /Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/Thesis_Dataset/CoReference/Berkeley/training_processed.dill


In [4]:
# Load the "Test" essays from the same co-reference predictions folder.
test_essays = get_essays(coref_predictions_folder, "Test")

Found file /Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/Thesis_Dataset/CoReference/Berkeley/test_processed.dill


In [5]:
# Concatenate both splits (training essays first, then test essays).
all_essays = training_essays + test_essays

## Look at the Anaphor Tags

In [6]:
from results_procesor import is_a_regular_code

# Tag frequency tables, keyed by the tag string:
#   reg_tally - tags accepted by is_a_regular_code
#   cr_tally  - anaphora tags containing "->"
#   cc_tally  - anaphora tags of the form "Anaphor:[<code>]"
cc_tally = defaultdict(int)
cr_tally = defaultdict(int)
reg_tally = defaultdict(int)

for essay in all_essays:
    for sentence in essay.sentences:
        for _word, word_tags in sentence:
            for tag in word_tags:
                if is_a_regular_code(tag):
                    reg_tally[tag] += 1

                # Only anaphora tags that are not "other" are tallied below
                if ANAPHORA not in tag or "other" in tag:
                    continue

                if "->" in tag:
                    cr_tally[tag] += 1
                elif "Anaphor:[" in tag:
                    cc_tally[tag] += 1

reg_tags = sorted(reg_tally)
all_ana_tags = sorted(cc_tally)
# Sanity check: as many distinct regular codes as distinct anaphora code tags
assert len(reg_tags) == len(all_ana_tags)
all_ana_tags

['Anaphor:[11]',
 'Anaphor:[12]',
 'Anaphor:[13]',
 'Anaphor:[14]',
 'Anaphor:[1]',
 'Anaphor:[2]',
 'Anaphor:[3]',
 'Anaphor:[4]',
 'Anaphor:[50]',
 'Anaphor:[5]',
 'Anaphor:[5b]',
 'Anaphor:[6]',
 'Anaphor:[7]']

In [7]:
def blank_if_none(val):
    """
    Return "-" in place of a missing value, otherwise return `val` unchanged.

    A value counts as missing when it is None, is falsy (e.g. "", 0, an empty
    container), or when its string form equals "none" ignoring case.
    """
    if val is None:
        return "-"
    if not val:
        return "-"
    if str(val).lower() == "none":
        return "-"
    return val

def replace_if_blank(val, replace):
    """
    Return `replace` when `val` is the empty string or the "-" placeholder,
    otherwise return `val` unchanged.
    """
    return replace if (val == "" or val == "-") else val

# Grid Search With Anaphora Prediction Filters

In [12]:
from process_essays_coref import *
def processed_essays_predict_most_recent_tag(essays, format_ana_tags=True):

    """
    Uses the most recently predicted concept code as the predicted tag

    Each word gets a set of predicted tags:
        - its own predicted concept code, when that is not EMPTY
        - otherwise, if the word is tagged as ANAPHORA, the code returned by
          find_previous_predicted_tag for the current position in the essay
          (EMPTY when that lookup returns None)
        - otherwise an empty set

            essays:                   List[Essay] objects - merged tagged essays
            format_ana_tags:          bool - when True the anaphora code is wrapped
                                      as "<ANAPHORA>:[<code>]"; when False the bare
                                      code is added

    Returns a list with one new Essay per input essay, whose
    pred_tagged_sentences holds a set of tags per word, per sentence.
    Note: fix_coref_ids is called on every input essay first.
    """

    ana_tagged_essays = []
    for e in essays:

        fix_coref_ids(e)

        # following are flattened so they span sentences
        seq_pred_tags  = [] # all predicted tags
        seq_is_ana_tag = [] # is ana tag
        seq_ix = -1         # index of the current word within the whole essay

        ana_tagged_e = Essay(e.name, e.sentences)
        ana_tagged_e.pred_tagged_sentences = []
        ana_tagged_e.pred_pos_tags_sentences = list(e.pred_pos_tags_sentences)
        # NER tags are copied from the NER attribute (this was previously a
        # copy of the POS tags due to a copy-paste slip)
        ana_tagged_e.pred_ner_tags_sentences = list(e.pred_ner_tags_sentences)
        ana_tagged_e.ana_tagged_sentences    = list(e.ana_tagged_sentences)
        ana_tagged_e.pred_corefids           = list(e.pred_corefids)
        ana_tagged_essays.append(ana_tagged_e)

        for sent_ix, sent in enumerate(e.sentences):
            ana_tagged_sent = []
            ana_tagged_e.pred_tagged_sentences.append(ana_tagged_sent)

            # SENTENCE LEVEL TAGS / PREDICTIONS
            ana_tags = e.ana_tagged_sentences[sent_ix]
            ptags = e.pred_tagged_sentences[sent_ix]

            for wd_ix in range(len(sent)):
                seq_ix += 1

                pred_cc_tag = ptags[wd_ix]  # predicted cc tag
                seq_pred_tags.append(pred_cc_tag)

                is_ana_tag = ana_tags[wd_ix] == ANAPHORA
                seq_is_ana_tag.append(is_ana_tag)

                # note we are changing this to a set rather than a single string
                wd_ptags = set()
                # every word gets an entry, even if it ends up with no tags,
                # so that sentence lengths stay aligned
                ana_tagged_sent.append(wd_ptags)

                # add predicted concept code tag (filtered out by evaluation code, which filters to specific tags)
                if pred_cc_tag != EMPTY:
                    wd_ptags.add(pred_cc_tag)
                # elif here because we don't want to assign additional cc tags if there are already ones
                elif is_ana_tag:
                    code = find_previous_predicted_tag(seq_ix, seq_pred_tags, seq_is_ana_tag)
                    if code is None:
                        code = EMPTY
                    if format_ana_tags:
                        code = "{anaphora}:[{code}]".format(anaphora=ANAPHORA, code=code)
                    wd_ptags.add(code)

    # validation check
    #   check essay and sent lengths align
    for e in ana_tagged_essays:
        assert len(e.sentences) == len(e.pred_tagged_sentences), \
            "Essay %s: sentence count mismatch" % e.name
        for ix in range(len(e.sentences)):
            assert len(e.sentences[ix]) == len(e.pred_tagged_sentences[ix]), \
                "Essay %s: word count mismatch in sentence %d" % (e.name, ix)

    return ana_tagged_essays

In [13]:
def grid_search(essays, expected_tags, format_ana_tags=True):
    """
    Predict anaphora tags for `essays` and return the "MICRO_F1" metrics entry
    as a single-row DataFrame.

        essays:           essays passed to processed_essays_predict_most_recent_tag
        expected_tags:    tags passed to get_metrics_raw as expected_tags
        format_ana_tags:  forwarded to processed_essays_predict_most_recent_tag
    """
    predicted_essays = processed_essays_predict_most_recent_tag(
        essays=essays, format_ana_tags=format_ana_tags)

    all_metrics = get_metrics_raw(predicted_essays, expected_tags=expected_tags, micro_only=True)
    micro_f1_row = all_metrics["MICRO_F1"]

    return pd.DataFrame([micro_f1_row])

### Training

In [14]:
# Evaluate the most-recent-tag heuristic on the training essays.
df_train = grid_search(essays=training_essays, expected_tags=all_ana_tags)
df_train # f1_score / precision / recall: 0.262, 0.287, 0.241

Unnamed: 0,accuracy,data_points,f1_score,num_codes,precision,recall
0,0.999738,1783158.0,0.262243,344.0,0.287197,0.241279


### Test

In [15]:
# Evaluate the same heuristic on the test essays.
df_test = grid_search(essays=test_essays, expected_tags=all_ana_tags)
df_test

Unnamed: 0,accuracy,data_points,f1_score,num_codes,precision,recall
0,0.999885,399087.0,0.323529,39.0,0.37931,0.282051


In [22]:
def get_predicted_tags_by_word(essays_a):
    """
    Collect the predicted tags starting with "Ana", grouped by sentence.

    Sentences are numbered consecutively from 0 across all essays, in order.
    Returns a defaultdict(list) mapping that running sentence number to the
    matching tags found in the sentence; sentences with no matching tags
    get no entry.
    """
    tags_by_sentence = defaultdict(list)
    sentence_no = 0
    for essay in essays_a:
        for sentence in essay.pred_tagged_sentences:
            matching = [
                tag
                for word_tags in sentence
                for tag in word_tags
                if tag.startswith("Ana")
            ]
            # only touch the dict when there is something to store, so that
            # no empty entries are created
            if matching:
                tags_by_sentence[sentence_no].extend(matching)
            sentence_no += 1
    return tags_by_sentence

In [25]:
# Re-run the prediction on the test essays and list the predicted tags that
# start with "Ana", keyed by running sentence number.
pred_test_essays = processed_essays_predict_most_recent_tag(essays=test_essays, format_ana_tags=True)
tally = get_predicted_tags_by_word(pred_test_essays)
tally

defaultdict(list,
            {28: ['Anaphor:[50]'],
             41: ['Anaphor:[50]'],
             241: ['Anaphor:[50]'],
             410: ['Anaphor:[1]'],
             455: ['Anaphor:[4]'],
             545: ['Anaphor:[13]'],
             615: ['Anaphor:[13]'],
             657: ['Anaphor:[50]'],
             695: ['Anaphor:[3]'],
             808: ['Anaphor:[2]'],
             886: ['Anaphor:[50]'],
             950: ['Anaphor:[50]'],
             967: ['Anaphor:[6]'],
             1009: ['Anaphor:[50]'],
             1045: ['Anaphor:[1]'],
             1071: ['Anaphor:[1]', 'Anaphor:[1]'],
             1110: ['Anaphor:[14]'],
             1111: ['Anaphor:[11]'],
             1112: ['Anaphor:[13]'],
             1121: ['Anaphor:[1]'],
             1123: ['Anaphor:[4]'],
             1154: ['Anaphor:[5]'],
             1213: ['Anaphor:[4]'],
             1229: ['Anaphor:[1]'],
             1551: ['Anaphor:[1]'],
             1596: ['Anaphor:[11]'],
             1694: ['Anaphor:[1]'

In [27]:
# Total number of tags held in the tally, summed over all sentences
count = sum(len(sentence_tags) for sentence_tags in tally.values())
count

29