## Goals
 - Take the merged predictions and evaluate the prediction accuracy using two different approaches:
 1. Look at the anaphora tags and then cross-reference co-reference labels
 2. Use the co-reference chains directly

In [1]:
import dill
import pandas as pd

from Settings import Settings
from collections import defaultdict
from BrattEssay import ANAPHORA

from results_procesor import ResultsProcessor
from results_common import get_essays, validate_essays, tally_essay_attributes
from process_essays_coref import get_coref_processed_essays

# progress bar widget
from ipywidgets import IntProgress
from IPython.display import display

# Name of the dataset sub-folder under the data directory.
DATASET = "SkinCancer" # CoralBleaching | SkinCancer

# Paths are built by plain string concatenation, so settings.data_directory
# must already end with a path separator (the printed "Data Dir" does).
settings = Settings()
root_folder = settings.data_directory + DATASET + "/Thesis_Dataset/"
stanford_coref_predictions_folder = root_folder + "CoReference/"
berkeley_coref_predictions_folder = root_folder + "CoReference/Berkeley/"
# Which algorithm? Every later cell reads essays from this folder only.
coref_predictions_folder = berkeley_coref_predictions_folder
print("CoRef Data: ", coref_predictions_folder)

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/
CoRef Data:  /Users/simon.hughes/Google Drive/Phd/Data/SkinCancer/Thesis_Dataset/CoReference/Berkeley/


In [2]:
# def get_wd_level_preds(essays, expected_tags):
#     expected_tags = set(expected_tags)
#     ysbycode = defaultdict(list)
#     for e in essays:
#         for sentix in range(len(e.sentences)):
#             p_ccodes = e.pred_tagged_sentences[sentix]
#             for wordix in range(len(p_ccodes)):
#                 tags = p_ccodes[wordix]
#                 if type(tags) == str:
#                     ptag_set = {tags}
#                 elif type(tags) in (set,list):
#                     ptag_set = set(tags)   
#                 else:
#                     raise Exception("Unrecognized tag type")
#                 for exp_tag in expected_tags:
#                     ysbycode[exp_tag].append(ResultsProcessor._ResultsProcessor__get_label_(exp_tag, ptag_set))
#     return ysbycode

# def get_metrics_raw(essays, expected_tags):
#     act_ys_bycode  = ResultsProcessor.get_wd_level_lbs(essays,  expected_tags=expected_tags)
#     pred_ys_bycode = get_wd_level_preds(essays, expected_tags=expected_tags)
#     mean_metrics = ResultsProcessor.compute_mean_metrics(act_ys_bycode, pred_ys_bycode)
#     return mean_metrics

In [3]:
# Load the "Training" partition from the selected co-reference predictions folder.
training_essays = get_essays(coref_predictions_folder, "Training")

Found file /Users/simon.hughes/Google Drive/Phd/Data/SkinCancer/Thesis_Dataset/CoReference/Berkeley/training_processed.dill


In [4]:
# Load the "Test" partition from the same co-reference predictions folder.
test_essays = get_essays(coref_predictions_folder, "Test")

Found file /Users/simon.hughes/Google Drive/Phd/Data/SkinCancer/Thesis_Dataset/CoReference/Berkeley/test_processed.dill


In [5]:
# Train + test combined; used below only for the POS and tag tallies.
all_essays = training_essays + test_essays

### Validate the Lengths

In [6]:
# Sanity-check each partition before use (presumably that the per-sentence
# prediction lists line up with the sentences - confirm in results_common).
validate_essays(training_essays)
validate_essays(test_essays)

Essays validated
Essays validated


In [7]:
# ner_tally = tally_essay_attributes(all_essays, attribute_name="pred_ner_tags_sentences")
# Tally of the predicted POS tags over all essays; its keys are the POS tags
# used further down to build the noun / verb filter sets.
pos_tally = tally_essay_attributes(all_essays, attribute_name="pred_pos_tags_sentences")

## Look at the Anaphor Tags

In [8]:
from results_procesor import is_a_regular_code

# Per-tag occurrence counts over every word of every essay:
#   reg_tally - tags accepted by is_a_regular_code
#   cr_tally  - anaphora tags containing "->"
#   cc_tally  - "Anaphor:[...]" tags that are not rhetorical
# Anaphora tags containing "other" are ignored for cr_tally / cc_tally.
cc_tally = defaultdict(int)
cr_tally = defaultdict(int)
reg_tally = defaultdict(int)

_tag_stream = (
    tag
    for essay in all_essays
    for sentence in essay.sentences
    for _word, word_tags in sentence
    for tag in word_tags
)
for tag in _tag_stream:
    if is_a_regular_code(tag):
        reg_tally[tag] += 1
    # Only anaphora tags that are not "other" go any further.
    if ANAPHORA not in tag or "other" in tag:
        continue
    if "->" in tag:
        cr_tally[tag] += 1
    elif "Anaphor:[" in tag and "rhetorical" not in tag:
        cc_tally[tag] += 1

reg_tags = sorted(reg_tally)
all_ana_tags = sorted(cc_tally)
# Only the counts are compared here, not the individual tag names.
assert len(reg_tags) == len(all_ana_tags)
all_ana_tags

['Anaphor:[11]',
 'Anaphor:[12]',
 'Anaphor:[1]',
 'Anaphor:[2]',
 'Anaphor:[3]',
 'Anaphor:[4]',
 'Anaphor:[50]',
 'Anaphor:[5]',
 'Anaphor:[6]']

In [9]:
# Number of distinct regular tags vs. distinct anaphor tags (asserted equal above).
len(reg_tags), len(all_ana_tags)

(9, 9)

In [10]:
# Column names under which each result row records the co-reference settings
# it was produced with; they double as the keys for filtering result tables.
NEAREST_REF_ONLY = "Nearest reference"
MAX_ANA_PHRASE = "Max ana phrase"
MAX_CHAIN_PHRASE = "Max chain phrase"
POS_ANA_FLTR = "POS ana filter"
POS_CHAIN_FLTR = "Pos chain filter"

def blank_if_none(val):
    """Return "-" for blank-like values, otherwise return ``val`` unchanged.

    Blank-like means any falsy value (``None``, ``""``, ``0`` and also
    ``False``) or anything whose string form is "none" in any letter case.
    """
    if not val or str(val).lower() == "none":
        return "-"
    return val

def replace_if_blank(val, replace):
    """Return ``replace`` when ``val`` is "" or "-", otherwise return ``val``."""
    return replace if val == "" or val == "-" else val

def process_sort_results(df_results):
    """Keep the metric and settings columns and sort by f1_score, best first.

    Returns a new DataFrame containing, in order, f1_score, precision, recall
    and the five settings columns; all other columns are dropped.
    """
    metric_columns = ["f1_score", "precision", "recall"]
    settings_columns = [NEAREST_REF_ONLY, MAX_ANA_PHRASE, MAX_CHAIN_PHRASE,
                        POS_ANA_FLTR, POS_CHAIN_FLTR]
    selected = df_results[metric_columns + settings_columns]
    return selected.sort_values("f1_score", ascending=False)

def filter_test_results_to_best_training_results(df_train_raw, df_test_raw):
    """Return the test rows whose settings match the best training row.

    The best training row is the one with the highest f1_score. Test rows are
    kept only if all five settings columns equal that row's values. The number
    of matching rows is printed, and the matches are returned through
    process_sort_results.
    """
    # Highest-F1 training configuration.
    best_train_row = df_train_raw.sort_values("f1_score", ascending=False).iloc[0]

    matching = df_test_raw
    for setting in (NEAREST_REF_ONLY, MAX_ANA_PHRASE, MAX_CHAIN_PHRASE,
                    POS_ANA_FLTR, POS_CHAIN_FLTR):
        matching = matching[matching[setting] == best_train_row[setting]]

    print(len(matching))
    return process_sort_results(matching)

## Prepare POS Tag Filters

In [11]:
# Noun and verb tag sets come from the POS tags actually seen in the data;
# the pronoun and determiner sets are fixed Penn Treebank tags.
# Tag meanings: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
pos_nouns = {tag for tag in pos_tally.keys() if tag.strip().startswith("NN")}
pos_verbs = {tag for tag in pos_tally.keys() if tag.strip().startswith("VB")}
pos_pronouns = {"PRP", "PRP$", "WP", "WP$"}
pos_determiners = {"DT", "WDT", "PDT"}  # the, a, which, that, etc
pos_pron_dt = pos_pronouns.union(pos_determiners)
pos_nouns, pos_verbs, pos_pronouns, pos_determiners, pos_pron_dt

({'NN', 'NNP', 'NNS'},
 {'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'},
 {'PRP', 'PRP$', 'WP', 'WP$'},
 {'DT', 'PDT', 'WDT'},
 {'DT', 'PDT', 'PRP', 'PRP$', 'WDT', 'WP', 'WP$'})

In [12]:
# Label -> POS tag set passed as pos_ana_filter to get_coref_processed_essays.
# The label (not the set) is what gets recorded in the results tables; the
# "None" label maps to None and is rendered as "-" by blank_if_none.
dict_pos_filter = {
            "None": None,
            "PRN": pos_pronouns,
            "DT": pos_determiners,
            "PRN+DT": pos_pron_dt
}

# Label -> POS tag set passed as pos_ch_filter to get_coref_processed_essays.
dict_pos_ch_filter = {
    "None": None,
    "NN": pos_nouns,
    "VB": pos_verbs,
    "NN+VB": pos_nouns | pos_verbs
}

In [13]:
# Candidate values tried by grid_search for both max_ana_phrase_len and
# max_cref_phrase_len (None presumably means "no limit" - confirm in
# process_essays_coref).
phrase_len = [None,1,2,3,5,10,20]
# Candidate values tried by grid_search for nearest_ref_only.
nearest_ref_only_values = [True,False]

# Grid Search With Anaphora Prediction Filters

In [14]:
def grid_search(essays, format_ana_tags, filter_to_predicted_tags, expected_tags):
    """Evaluate every combination of the co-reference settings on ``essays``.

    The grid is the cross product of nearest_ref_only_values, dict_pos_filter,
    dict_pos_ch_filter and phrase_len (used twice: anaphor and chain phrase
    length). A notebook progress bar advances once per combination.

    Returns a DataFrame with one row per combination: the "MICRO_F1" metrics
    from get_metrics_raw plus the five settings columns.
    """
    # Enumerate the grid up front so its size is the progress-bar maximum.
    combos = [
        (nearest, ana_key, ana_pos, chain_key, chain_pos, ana_len, chain_len)
        for nearest in nearest_ref_only_values
        for ana_key, ana_pos in dict_pos_filter.items()
        for chain_key, chain_pos in dict_pos_ch_filter.items()
        for ana_len in phrase_len
        for chain_len in phrase_len
    ]

    progress = IntProgress(min=0, max=len(combos))
    display(progress)

    result_rows = []
    for nearest, ana_key, ana_pos, chain_key, chain_pos, ana_len, chain_len in combos:
        proc_essays = get_coref_processed_essays(
            essays=essays, format_ana_tags=format_ana_tags,
            ner_ch_filter=None, look_back_only=True,
            filter_to_predicted_tags=filter_to_predicted_tags,
            max_ana_phrase_len=ana_len, max_cref_phrase_len=chain_len,
            pos_ana_filter=ana_pos, pos_ch_filter=chain_pos,
            nearest_ref_only=nearest)

        micro_row = get_metrics_raw(proc_essays, expected_tags=expected_tags)["MICRO_F1"]
        # Record the settings by label (not by tag set) alongside the metrics.
        for column, setting in ((NEAREST_REF_ONLY, nearest),
                                (MAX_ANA_PHRASE, ana_len),
                                (MAX_CHAIN_PHRASE, chain_len),
                                (POS_ANA_FLTR, ana_key),
                                (POS_CHAIN_FLTR, chain_key)):
            micro_row[column] = blank_if_none(setting)
        result_rows.append(micro_row)
        progress.value += 1

    return pd.DataFrame(result_rows)

In [15]:
def get_all_metrics_with_optimal_settings(train_essays, test_essays, filter_to_predicted_tags,
                    nearest_ref_only, pos_ana_key, pos_ch_key, max_ana_phrase_len, max_cref_phrase_len):
    """Score one fixed co-reference configuration on train and test, three ways.

    Six evaluations are run through get_metrics with identical settings:

      1. anaphor tags (all_ana_tags), format_ana_tags=True   - train and test
      2. regular tags (reg_tags),     format_ana_tags=True   - train and test
      3. regular tags (reg_tags),     format_ana_tags=False  - train and test

    Args:
        train_essays, test_essays: essay collections to evaluate.
        filter_to_predicted_tags: forwarded to get_metrics; also selects the
            row-label prefix ("Ana" when truthy, otherwise "Cref").
        nearest_ref_only, max_ana_phrase_len, max_cref_phrase_len: forwarded
            to get_metrics unchanged.
        pos_ana_key, pos_ch_key: keys into dict_pos_filter / dict_pos_ch_filter.

    Returns:
        DataFrame with columns f1_score, precision, recall and "Data Set",
        one row per evaluation, ordered: <type> Train, <type> Test,
        CC Train Reg, CC Train <type>, CC Test Reg, CC Test <type>.
        Each per-run frame has a single row, so every row keeps index 0.
    """
    experiment_type = "Ana" if filter_to_predicted_tags else "Cref"

    def run(essays, expected_tags, format_ana_tags):
        # One evaluation; only the essays, the tag set and format_ana_tags
        # differ between the six runs.
        df = get_metrics(essays=essays,
                         expected_tags=expected_tags,
                         format_ana_tags=format_ana_tags,
                         filter_to_predicted_tags=filter_to_predicted_tags,
                         nearest_ref_only=nearest_ref_only, pos_ana_key=pos_ana_key, pos_ch_key=pos_ch_key,
                         max_ana_phrase_len=max_ana_phrase_len, max_cref_phrase_len=max_cref_phrase_len)
        return process_sort_results(df)

    # The runs execute in the same order as the original copy-pasted version.

    # Anaphora tags train and test
    df_train = run(train_essays, all_ana_tags, True)
    df_test = run(test_essays, all_ana_tags, True)

    # CC accuracy without anaphora resolution. Per the original author's note,
    # format_ana_tags=True makes these runs ignore anaphora resolution.
    df_train_cc_reg = run(train_essays, reg_tags, True)
    df_test_cc_reg = run(test_essays, reg_tags, True)

    # CC accuracy with anaphora resolution
    df_train_cc_ana = run(train_essays, reg_tags, False)
    df_test_cc_ana = run(test_essays, reg_tags, False)

    # Labels are paired with their frames so the two cannot drift apart.
    # "CC Test " previously had a stray second space, which made the label
    # inconsistent with "CC Train ".
    labelled = [
        (experiment_type + " Train", df_train),
        (experiment_type + " Test", df_test),
        ("CC Train Reg", df_train_cc_reg),
        ("CC Train " + experiment_type, df_train_cc_ana),
        ("CC Test Reg", df_test_cc_reg),
        ("CC Test " + experiment_type, df_test_cc_ana),
    ]

    df_concat = pd.concat([frame for _label, frame in labelled])
    # Explicit copy so that adding a column below does not write to a slice
    # (avoids pandas' SettingWithCopyWarning).
    df_concat = df_concat[["f1_score", "precision", "recall"]].copy()
    df_concat["Data Set"] = [label for label, _frame in labelled]
    return df_concat
    
    

In [16]:
def get_predictions(essays, format_ana_tags, filter_to_predicted_tags, expected_tags,
                    nearest_ref_only, pos_ana_key, pos_ch_key, max_ana_phrase_len, max_cref_phrase_len):
    """Run co-reference processing with the given settings and return labels.

    ``pos_ana_key`` and ``pos_ch_key`` are keys into dict_pos_filter and
    dict_pos_ch_filter; an unknown key raises KeyError.

    Returns:
        (actual, predicted): word-level labels per expected tag, from
        ResultsProcessor.get_wd_level_lbs and get_wd_level_preds respectively.
    """
    coref_settings = dict(
        essays=essays, format_ana_tags=format_ana_tags,
        ner_ch_filter=None, look_back_only=True,
        filter_to_predicted_tags=filter_to_predicted_tags,
        max_ana_phrase_len=max_ana_phrase_len, max_cref_phrase_len=max_cref_phrase_len,
        # Translate the filter labels into the POS tag sets they stand for.
        pos_ana_filter=dict_pos_filter[pos_ana_key],
        pos_ch_filter=dict_pos_ch_filter[pos_ch_key],
        nearest_ref_only=nearest_ref_only,
    )
    processed = get_coref_processed_essays(**coref_settings)

    actual = ResultsProcessor.get_wd_level_lbs(processed, expected_tags=expected_tags)
    predicted = get_wd_level_preds(processed, expected_tags=expected_tags)
    return actual, predicted

In [17]:
def get_metrics(essays, format_ana_tags, filter_to_predicted_tags, expected_tags,
                    nearest_ref_only, pos_ana_key, pos_ch_key, max_ana_phrase_len, max_cref_phrase_len):
    """Evaluate a single co-reference configuration on ``essays``.

    ``pos_ana_key`` and ``pos_ch_key`` are keys into dict_pos_filter and
    dict_pos_ch_filter; an unknown key raises KeyError.

    Returns:
        A one-row DataFrame holding the "MICRO_F1" metrics from
        get_metrics_raw plus the five settings columns.
    """
    coref_settings = dict(
        essays=essays, format_ana_tags=format_ana_tags,
        ner_ch_filter=None, look_back_only=True,
        filter_to_predicted_tags=filter_to_predicted_tags,
        max_ana_phrase_len=max_ana_phrase_len, max_cref_phrase_len=max_cref_phrase_len,
        # Translate the filter labels into the POS tag sets they stand for.
        pos_ana_filter=dict_pos_filter[pos_ana_key],
        pos_ch_filter=dict_pos_ch_filter[pos_ch_key],
        nearest_ref_only=nearest_ref_only,
    )
    processed = get_coref_processed_essays(**coref_settings)

    micro_row = get_metrics_raw(processed, expected_tags=expected_tags)["MICRO_F1"]
    # Record the settings by label (not by tag set) alongside the metrics.
    for column, setting in ((NEAREST_REF_ONLY, nearest_ref_only),
                            (MAX_ANA_PHRASE, max_ana_phrase_len),
                            (MAX_CHAIN_PHRASE, max_cref_phrase_len),
                            (POS_ANA_FLTR, pos_ana_key),
                            (POS_CHAIN_FLTR, pos_ch_key)):
        micro_row[column] = blank_if_none(setting)
    return pd.DataFrame([micro_row])

In [18]:
def get_wd_level_preds(essays, expected_tags):
    """Collect word-level predicted labels for each expected tag.

    For every word of every sentence in ``essays``, the predicted tags are
    read from ``essay.pred_tagged_sentences`` and one label per expected tag
    is appended, so each list in the result has one entry per word.

    Args:
        essays: iterable of essays exposing ``sentences`` and
            ``pred_tagged_sentences`` (indexed by the same sentence position).
        expected_tags: iterable of tags to produce labels for; duplicates are
            collapsed.

    Returns:
        defaultdict(list) mapping each expected tag to its list of labels.

    Raises:
        Exception: if a word's predicted tags are neither a string nor a
            set / frozenset / list / tuple.
    """
    expected_tags = set(expected_tags)
    # Loop-invariant lookup of ResultsProcessor's name-mangled private helper.
    get_label = ResultsProcessor._ResultsProcessor__get_label_
    ysbycode = defaultdict(list)
    for e in essays:
        for sentix in range(len(e.sentences)):
            for tags in e.pred_tagged_sentences[sentix]:
                # isinstance (rather than an exact type comparison) also
                # accepts subclasses, and tuples / frozensets of tags.
                if isinstance(tags, str):
                    ptag_set = {tags}
                elif isinstance(tags, (set, frozenset, list, tuple)):
                    ptag_set = set(tags)
                else:
                    raise Exception("Unrecognized tag type")
                for exp_tag in expected_tags:
                    ysbycode[exp_tag].append(get_label(exp_tag, ptag_set))
    return ysbycode

def get_metrics_raw(essays, expected_tags):
    """Compute mean metrics for ``essays`` over ``expected_tags``.

    Actual labels come from ResultsProcessor.get_wd_level_lbs, predicted
    labels from get_wd_level_preds; both are handed to
    ResultsProcessor.compute_mean_metrics, whose result is returned as is.
    """
    return ResultsProcessor.compute_mean_metrics(
        ResultsProcessor.get_wd_level_lbs(essays, expected_tags=expected_tags),
        get_wd_level_preds(essays, expected_tags=expected_tags),
    )

## Evaluate Optimal Parameters

In [19]:
# Hard-coded settings evaluated below (presumably the best row from an earlier
# grid_search run - this notebook does not call grid_search itself).
# 'None' and 'NN+VB' are keys into dict_pos_filter / dict_pos_ch_filter.
nearest_ref_only, pos_ana_key, pos_ch_key, max_ana_phrase_len, max_cref_phrase_len = (False, 'None', 'NN+VB', 5, 2)

In [20]:
# True selects the "Ana" row labels in the summary table; this variable is
# also reused by the get_predictions cells further down.
filter_to_predicted_tags=True

# Six-row train/test summary for the fixed settings chosen above.
df_all_ana = get_all_metrics_with_optimal_settings(train_essays=training_essays, test_essays=test_essays,
        filter_to_predicted_tags=filter_to_predicted_tags, nearest_ref_only=nearest_ref_only, 
        pos_ana_key=pos_ana_key, pos_ch_key=pos_ch_key, 
        max_ana_phrase_len=max_ana_phrase_len, max_cref_phrase_len=max_cref_phrase_len)

In [21]:
"""
"Ana Train", "Ana Test" - Train/Test with Anaphor predicted tags

"CC Train Reg",  "CC Test Reg" - Concept codes withour anaphora tagging
"CC Train Ana", "CC Test Ana"  - Concept codes with anaphora tagging
"""
# The string above is only a legend for the "Data Set" labels; the cell's
# output is the table below.
df_all_ana[["f1_score", "recall", "precision", "Data Set"]]

Unnamed: 0,f1_score,recall,precision,Data Set
0,0.035785,0.019027,0.3,Ana Train
0,0.129032,0.074766,0.470588,Ana Test
0,0.822463,0.805007,0.840692,CC Train Reg
0,0.82204,0.805007,0.83981,CC Train Ana
0,0.840573,0.830139,0.851273,CC Test Reg
0,0.839524,0.830139,0.849124,CC Test Ana


In [22]:
# Word-level actual / predicted labels for the anaphor tags on the training
# essays, using the fixed settings and filter_to_predicted_tags set earlier.
expected_tags = all_ana_tags
format_ana_tags=True
    
tr_ysbycode, tr_predsbycode = get_predictions(essays=training_essays, expected_tags=expected_tags, 
                    format_ana_tags=format_ana_tags,
                    filter_to_predicted_tags=filter_to_predicted_tags, 
                    nearest_ref_only=nearest_ref_only, pos_ana_key=pos_ana_key, pos_ch_key=pos_ch_key, 
                               max_ana_phrase_len=max_ana_phrase_len, max_cref_phrase_len=max_cref_phrase_len)

In [23]:
# Same as the training cell, but for the test essays.
expected_tags = all_ana_tags
format_ana_tags=True
    
test_ysbycode, test_predsbycode = get_predictions(essays=test_essays, expected_tags=expected_tags, 
                    format_ana_tags=format_ana_tags,
                    filter_to_predicted_tags=filter_to_predicted_tags, 
                    nearest_ref_only=nearest_ref_only, pos_ana_key=pos_ana_key, pos_ch_key=pos_ch_key, 
                               max_ana_phrase_len=max_ana_phrase_len, max_cref_phrase_len=max_cref_phrase_len)

In [24]:
# Recompute the training micro-F1 from the raw labels; it should match the
# "Ana Train" row of the summary table.
mean_metrics = ResultsProcessor.compute_mean_metrics(tr_ysbycode, tr_predsbycode)
mean_metrics["MICRO_F1"]

{'recall': 0.019027484143763214,
 'precision': 0.3,
 'f1_score': 0.03578528827037773,
 'accuracy': 0.9996295557953896,
 'num_codes': 473.0,
 'data_points': 1309239.0}

In [25]:
# Recompute the test micro-F1 from the raw labels; it should match the
# "Ana Test" row of the summary table. Note this overwrites mean_metrics.
mean_metrics = ResultsProcessor.compute_mean_metrics(test_ysbycode, test_predsbycode)
mean_metrics["MICRO_F1"]

{'recall': 0.07476635514018691,
 'precision': 0.47058823529411764,
 'f1_score': 0.1290322580645161,
 'accuracy': 0.9996610360996554,
 'num_codes': 107.0,
 'data_points': 318618.0}

In [26]:
# Persist the training and test predictions with their actual labels.
# NOTE(review): the names hard-code "SC" and "BERKELEY", while DATASET and
# coref_predictions_folder are configurable in the first cell - changing either
# there would still write under these names. Confirm the naming convention
# (the "_TD"/"_VD" suffixes presumably mean training / validation data).
ResultsProcessor.persist_predictions("COREF_SC_GRID_BERKELEY_TD", tr_predsbycode,   tr_ysbycode)
ResultsProcessor.persist_predictions("COREF_SC_GRID_BERKELEY_VD", test_predsbycode, test_ysbycode)