## Goals
 - Take the merged predictions and evaluate prediction accuracy using two different approaches:
 1. Look at the anaphora tags and then cross-reference co-reference labels
 2. Use the co-reference chains directly

In [1]:
import dill
import pandas as pd

from Settings import Settings
from collections import defaultdict
from BrattEssay import ANAPHORA

from results_common import get_essays, validate_essays, tally_essay_attributes
from process_essays_coref import get_coref_processed_essays
from metrics import get_metrics_raw

# progress bar widget
from ipywidgets import IntProgress
from IPython.display import display

# Dataset whose predictions are evaluated in this notebook; switch by editing the string below.
DATASET = "SkinCancer" # CoralBleaching | SkinCancer

# Paths are built by plain string concatenation, so settings.data_directory
# has to end with a path separator for root_folder to be valid.
settings = Settings()
root_folder = settings.data_directory + DATASET + "/Thesis_Dataset/"
# NOTE(review): despite the "stanford_" prefix, this variable points at the Berkeley co-reference folder.
stanford_coref_predictions_folder = root_folder + "CoReference/Berkeley/"
print("CoRef Data: ", stanford_coref_predictions_folder)

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/
CoRef Data:  /Users/simon.hughes/Google Drive/Phd/Data/SkinCancer/Thesis_Dataset/CoReference/Berkeley/


In [2]:
# Load the "Training" partition from the co-reference predictions folder.
training_essays = get_essays(stanford_coref_predictions_folder, "Training")

Found file /Users/simon.hughes/Google Drive/Phd/Data/SkinCancer/Thesis_Dataset/CoReference/Berkeley/training_processed.dill


In [3]:
# Load the "Test" partition from the same co-reference predictions folder.
test_essays = get_essays(stanford_coref_predictions_folder, "Test")

Found file /Users/simon.hughes/Google Drive/Phd/Data/SkinCancer/Thesis_Dataset/CoReference/Berkeley/test_processed.dill


In [4]:
# Training followed by test essays; used below for the corpus-wide tag tallies.
all_essays = training_essays + test_essays

In [5]:
# Sanity check: the combined count should equal training + test.
len(all_essays), len(training_essays), len(test_essays)

(1088, 870, 218)

### Validate the Lengths

In [6]:
# Validate each partition separately; validate_essays prints a confirmation line per call.
# NOTE(review): the exact checks performed live in results_common.validate_essays.
validate_essays(training_essays)
validate_essays(test_essays)

Essays validated
Essays validated


In [7]:
# NER tally is currently disabled (presumably because the grid search below passes ner_ch_filter=None).
# ner_tally = tally_essay_attributes(all_essays, attribute_name="pred_ner_tags_sentences")
# Tally of predicted POS tags over all essays; its keys are used later to derive the noun/verb tag sets.
pos_tally = tally_essay_attributes(all_essays, attribute_name="pred_pos_tags_sentences")

## Look at the Anaphor Tags

In [8]:
from results_procesor import is_a_regular_code

# Three per-tag counters filled in a single pass over every word's tags:
#   reg_tally - tags accepted by is_a_regular_code
#   cr_tally  - anaphora tags containing "->"
#   cc_tally  - remaining anaphora tags containing "Anaphor:["
cc_tally, cr_tally, reg_tally = defaultdict(int), defaultdict(int), defaultdict(int)

for e in all_essays:
    for sent in e.sentences:
        for wd, tags in sent:
            for t in tags:
                if is_a_regular_code(t):
                    reg_tally[t] += 1

                # Only anaphora tags that are not marked "other" reach the two anaphora tallies
                if ANAPHORA not in t or "other" in t:
                    continue
                if "->" in t:
                    cr_tally[t] += 1
                elif "Anaphor:[" in t:
                    cc_tally[t] += 1

# Iterating a dict yields its keys, so sorted(d) is the sorted list of tag names
reg_tags = sorted(reg_tally)
all_ana_tags = sorted(cc_tally)
# The two lists are not guaranteed to have the same length, so this check stays disabled
#assert len(reg_tags) == len(all_ana_tags)
print(len(all_ana_tags), len(reg_tags))
all_ana_tags

10 9


['Anaphor:[11]',
 'Anaphor:[12]',
 'Anaphor:[1]',
 'Anaphor:[2]',
 'Anaphor:[3]',
 'Anaphor:[4]',
 'Anaphor:[50]',
 'Anaphor:[5]',
 'Anaphor:[6]',
 'Anaphor:[rhetorical]']

## Prepare POS Tag Filters

In [9]:
# Penn Treebank tag groups; tag meanings are listed at
# https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
# Noun and verb groups are derived from the tags actually observed in pos_tally.
pos_nouns = {tag for tag in pos_tally.keys() if tag.strip().startswith("NN")}
pos_verbs = {tag for tag in pos_tally.keys() if tag.strip().startswith("VB")}
# Pronoun and determiner groups are fixed lists.
pos_pronouns = {"PRP", "PRP$", "WP", "WP$"}
pos_determiners = {"DT", "WDT", "PDT"}  # the, a, which, that, etc
pos_pron_dt = pos_pronouns.union(pos_determiners)
pos_nouns, pos_verbs, pos_pronouns, pos_determiners, pos_pron_dt

({'NN', 'NNP', 'NNS'},
 {'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'},
 {'PRP', 'PRP$', 'WP', 'WP$'},
 {'DT', 'PDT', 'WDT'},
 {'DT', 'PDT', 'PRP', 'PRP$', 'WDT', 'WP', 'WP$'})

In [10]:
# Label -> POS tag set handed to get_coref_processed_essays as `pos_filter` during the grid search.
# The "None" entry carries no tag set (presumably meaning "no filtering" - confirm in process_essays_coref).
# Insertion order determines the order in which the grid search visits the options.
dict_pos_filter = {
            "None": None,
            "PRN": pos_pronouns,
            "DT": pos_determiners,
            "PRN+DT": pos_pron_dt
}

# Label -> POS tag set handed to get_coref_processed_essays as `pos_ch_filter` during the grid search.
dict_pos_ch_filter = {
    "None": None,
    "NN": pos_nouns,
    "VB": pos_verbs
}

In [11]:
def blank_if_none(val):
    """Return "-" for missing values so they read cleanly in result tables.

    A value counts as missing when it is None, an empty string or container,
    or anything whose string form is "none" (case-insensitive, e.g. the
    "None" filter label). Numbers and booleans are always returned unchanged,
    so legitimate values such as 0 or False are never replaced by "-".

    :param val: the value to normalise for display
    :return: "-" when the value is missing, otherwise ``val`` itself
    """
    if val is None:
        return "-"
    # Falsy numbers (0, 0.0, False) are real data, not blanks
    if isinstance(val, (int, float)):
        return val
    if not val or str(val).lower() == "none":
        return "-"
    return val

def process_sort_results(df_results):
    """Return the display columns of a grid-search result frame, best F1 first.

    Keeps the three metric columns followed by the four hyper-parameter
    columns and orders the rows by descending "f1_score". A new frame is
    returned; ``df_results`` itself is left as it was.
    """
    display_columns = ["f1_score", "precision", "recall"]
    display_columns += [LOOK_BACK, MAX_PHRASE, POS_FLTR, POS_CHAIN_FLTR]
    ranked = df_results[display_columns].sort_values(by="f1_score", ascending=False)
    return ranked

In [12]:
# Values tried for max_cref_phrase_len: None first, then 1 through 10
phrase_len = [None] + [length for length in range(1, 11)]
# Both settings of the look_back_only flag are tried
look_back_vals = [True, False]

In [13]:
# Column names for the hyper-parameter columns of the grid-search result frames.
# grid_search writes them, process_sort_results and
# filter_test_results_to_best_training_results read them.
LOOK_BACK = "Look back"
MAX_PHRASE = "Max phrase"
POS_FLTR = "POS filter"
POS_CHAIN_FLTR = "Pos chain filter"

In [23]:
def filter_test_results_to_best_training_results(df_train_raw, df_test_raw):
    """Select the test rows that use the best-scoring training configuration.

    The training row with the highest "f1_score" defines the configuration
    (look back, max phrase, POS filter, POS chain filter). Test rows matching
    all four settings are kept, their count is printed, and they are returned
    via process_sort_results.

    :param df_train_raw: grid-search results computed on the training essays
    :param df_test_raw: grid-search results computed on the test essays
    :return: the matching test rows, formatted and sorted for display
    """
    # Highest training F1 ends up in the first row after a descending sort
    best_train = df_train_raw.sort_values("f1_score", ascending=False).iloc[0]

    # Build one boolean mask that requires every hyper-parameter column to match
    matches = df_test_raw[LOOK_BACK] == best_train[LOOK_BACK]
    for column in (MAX_PHRASE, POS_FLTR, POS_CHAIN_FLTR):
        matches = matches & (df_test_raw[column] == best_train[column])

    matching_rows = df_test_raw[matches]
    print(len(matching_rows))
    return process_sort_results(matching_rows)

# Grid Search With Anaphora Prediction Filters

### Training

In [14]:
def grid_search(essays, format_ana_tags, filter_to_predicted_tags, expected_tags):
    """Score every hyper-parameter combination on ``essays``.

    The grid is the cross product of look_back_vals, dict_pos_filter,
    dict_pos_ch_filter and phrase_len (module-level settings). For each
    combination the essays are run through get_coref_processed_essays and
    scored with get_metrics_raw; the "MICRO_F1" entry of the metrics is
    extended with the hyper-parameter values and collected as one result row.
    A progress bar widget is displayed and advanced once per combination.

    :param essays: essays to process and score
    :param format_ana_tags: forwarded to get_coref_processed_essays
    :param filter_to_predicted_tags: forwarded to get_coref_processed_essays
    :param expected_tags: tags passed to get_metrics_raw as ``expected_tags``
    :return: DataFrame with one row per hyper-parameter combination
    """
    # Enumerate the grid up front, in the same nesting order as four nested loops:
    # look back -> POS filter -> POS chain filter -> max phrase length
    combinations = [
        (look_back_only, pos_key, pos_filter, pos_ch_key, pos_ch_filter, max_cref_phrase_len)
        for look_back_only in look_back_vals
        for pos_key, pos_filter in dict_pos_filter.items()
        for pos_ch_key, pos_ch_filter in dict_pos_ch_filter.items()
        for max_cref_phrase_len in phrase_len
    ]

    # Progress bar sized to the number of grid points
    progress = IntProgress(min=0, max=len(combinations))
    display(progress)

    result_rows = []
    for look_back_only, pos_key, pos_filter, pos_ch_key, pos_ch_filter, max_cref_phrase_len in combinations:
        proc_essays = get_coref_processed_essays(
            essays=essays,
            format_ana_tags=format_ana_tags,
            filter_to_predicted_tags=filter_to_predicted_tags,
            look_back_only=look_back_only,
            max_cref_phrase_len=max_cref_phrase_len,
            ner_ch_filter=None,
            pos_filter=pos_filter,
            pos_ch_filter=pos_ch_filter,
        )
        metrics = get_metrics_raw(proc_essays, expected_tags=expected_tags, micro_only=True)

        # Annotate the micro-F1 record with the settings that produced it
        result = metrics["MICRO_F1"]
        result[LOOK_BACK] = look_back_only
        result[MAX_PHRASE] = blank_if_none(max_cref_phrase_len)
        result[POS_FLTR] = blank_if_none(pos_key)
        result[POS_CHAIN_FLTR] = blank_if_none(pos_ch_key)
        result_rows.append(result)

        progress.value += 1

    return pd.DataFrame(result_rows)

In [15]:
# Anaphora-prediction filter switched on for this section
filter_to_predicted_tags = True
# Format tags with Anaphora[xyz]
format_ana_tags = True

# Full grid search on the training essays
df_train_raw = grid_search(
    essays=training_essays,
    format_ana_tags=format_ana_tags,
    filter_to_predicted_tags=filter_to_predicted_tags,
    expected_tags=all_ana_tags,
)

In [16]:
# Five best training configurations, ranked by F1 score.
process_sort_results(df_train_raw).head()

Unnamed: 0,f1_score,precision,recall,Look back,Max phrase,POS filter,Pos chain filter
11,0.116854,0.125604,0.109244,True,-,-,NN
0,0.113369,0.115468,0.111345,True,-,-,-
8,0.111258,0.150538,0.088235,True,8,-,-
19,0.110953,0.171806,0.081933,True,8,-,NN
20,0.110193,0.16,0.084034,True,9,-,NN


### Test

In [17]:
# Anaphora-prediction filter switched on, same settings as the training run
filter_to_predicted_tags = True
# Format tags with Anaphora[xyz]
format_ana_tags = True

# Full grid search on the test essays
df_test_raw = grid_search(
    essays=test_essays,
    format_ana_tags=format_ana_tags,
    filter_to_predicted_tags=filter_to_predicted_tags,
    expected_tags=all_ana_tags,
)

In [18]:
# Five best test configurations, ranked by F1 score (for comparison only; selection is done on training).
process_sort_results(df_test_raw).head()

Unnamed: 0,f1_score,precision,recall,Look back,Max phrase,POS filter,Pos chain filter
0,0.167702,0.127962,0.243243,True,-,-,-
99,0.165414,0.141935,0.198198,True,-,PRN+DT,-
110,0.163934,0.150376,0.18018,True,-,PRN+DT,NN
11,0.163823,0.131868,0.216216,True,-,-,NN
231,0.154303,0.115044,0.234234,False,-,PRN+DT,-


In [24]:
# Test-set score of the configuration that achieved the best F1 on the training essays.
filter_test_results_to_best_training_results(df_train_raw, df_test_raw)

1


Unnamed: 0,f1_score,precision,recall,Look back,Max phrase,POS filter,Pos chain filter
11,0.163823,0.131868,0.216216,True,-,-,NN


## Grid Search without Anaphora Predictions

### Training

In [19]:
# Anaphora-prediction filter switched off for this section
filter_to_predicted_tags = False
# Format tags with Anaphora[xyz]
format_ana_tags = True

# Full grid search on the training essays
df_train_raw_cref = grid_search(
    essays=training_essays,
    format_ana_tags=format_ana_tags,
    filter_to_predicted_tags=filter_to_predicted_tags,
    expected_tags=all_ana_tags,
)

In [20]:
# Five best training configurations without the anaphora-prediction filter, ranked by F1 score.
process_sort_results(df_train_raw_cref).head()

Unnamed: 0,f1_score,precision,recall,Look back,Max phrase,POS filter,Pos chain filter
46,0.03381,0.024482,0.054622,True,2,PRN,NN
45,0.032258,0.035714,0.029412,True,1,PRN,NN
47,0.030409,0.02107,0.054622,True,3,PRN,NN
177,0.029851,0.028355,0.031513,False,1,PRN,NN
178,0.029182,0.019404,0.058824,False,2,PRN,NN


### Test

In [21]:
# Anaphora-prediction filter switched off, same settings as the training run
filter_to_predicted_tags = False
# Format tags with Anaphora[xyz]
format_ana_tags = True

# Full grid search on the test essays
df_test_raw_cref = grid_search(
    essays=test_essays,
    format_ana_tags=format_ana_tags,
    filter_to_predicted_tags=filter_to_predicted_tags,
    expected_tags=all_ana_tags,
)

In [22]:
# Five best test configurations without the anaphora-prediction filter, ranked by F1 score.
process_sort_results(df_test_raw_cref).head()

Unnamed: 0,f1_score,precision,recall,Look back,Max phrase,POS filter,Pos chain filter
46,0.042683,0.032258,0.063063,True,2,PRN,NN
45,0.040201,0.045455,0.036036,True,1,PRN,NN
178,0.039702,0.027397,0.072072,False,2,PRN,NN
47,0.038567,0.027778,0.063063,True,3,PRN,NN
177,0.034483,0.033058,0.036036,False,1,PRN,NN


In [25]:
# Test-set score of the configuration that achieved the best training F1 without the anaphora-prediction filter.
filter_test_results_to_best_training_results(df_train_raw_cref, df_test_raw_cref)

1


Unnamed: 0,f1_score,precision,recall,Look back,Max phrase,POS filter,Pos chain filter
46,0.042683,0.032258,0.063063,True,2,PRN,NN
