In [99]:
import dill
import pandas as pd

from Settings import Settings
from collections import defaultdict
from BrattEssay import ANAPHORA

from results_common import get_essays, validate_essays
from process_essays_coref import get_coref_processed_essays
from metrics import get_metrics_raw

# Configure

In [106]:
# set optimal parameters
filter_to_predicted_tags=True

nearest_ref_only = True
pos_ana_key =     "None"
pos_ch_key  =     "None"
max_ana_phrase_len = None
max_cref_phrase_len = None

DATASET = "CoralBleaching" # CoralBleaching | SkinCancer

In [100]:
settings = Settings()
root_folder = settings.data_directory + DATASET + "/Thesis_Dataset/"
stanford_coref_predictions_folder = root_folder + "CoReference/"
berkeley_coref_predictions_folder = root_folder + "CoReference/Berkeley/"

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/


## Coref Folder?

In [101]:
# Which algorithm?
coref_predictions_folder = berkeley_coref_predictions_folder
print("CoRef Data: ", stanford_coref_predictions_folder)

CoRef Data:  /Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/Thesis_Dataset/CoReference/


## Load Essays

In [2]:
training_essays = get_essays(coref_predictions_folder, "Training")

Found file /Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/Thesis_Dataset/CoReference/Berkeley/training_processed.dill


In [3]:
test_essays = get_essays(coref_predictions_folder, "Test")

Found file /Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/Thesis_Dataset/CoReference/Berkeley/test_processed.dill


In [4]:
all_essays = training_essays + test_essays

In [5]:
# ner_tally = tally_essay_attributes(all_essays, attribute_name="pred_ner_tags_sentences")
pos_tally = tally_essay_attributes(all_essays, attribute_name="pred_pos_tags_sentences")

## Look at the Anaphor Tags

In [6]:
from results_procesor import is_a_regular_code

cc_tally = defaultdict(int)
cr_tally = defaultdict(int)
reg_tally = defaultdict(int)
for e in all_essays:
    for sent in e.sentences:
        for wd, tags in sent:
            for t in tags:
                if is_a_regular_code(t):
                    reg_tally[t] += 1
                if ANAPHORA in t and "other" not in t:
                    if "->" in t:
                        cr_tally[t] += 1
                    elif "Anaphor:[" in t:
                        cc_tally[t] += 1

reg_tags = sorted(reg_tally.keys())
all_ana_tags = sorted(cc_tally.keys())
assert len(reg_tags) == len(all_ana_tags)
all_ana_tags

['Anaphor:[11]',
 'Anaphor:[12]',
 'Anaphor:[13]',
 'Anaphor:[14]',
 'Anaphor:[1]',
 'Anaphor:[2]',
 'Anaphor:[3]',
 'Anaphor:[4]',
 'Anaphor:[50]',
 'Anaphor:[5]',
 'Anaphor:[5b]',
 'Anaphor:[6]',
 'Anaphor:[7]']

In [98]:
NEAREST_REF_ONLY = "Nearest reference"
MAX_ANA_PHRASE = "Max ana phrase"
MAX_CHAIN_PHRASE = "Max chain phrase"
POS_ANA_FLTR = "POS ana filter"
POS_CHAIN_FLTR = "Pos chain filter"

def blank_if_none(val):
    return "-" if (val is None or not val or str(val).lower() == "none") else val

def process_sort_results(df_results):
    df_disp = df_results[["f1_score","precision","recall", 
                          NEAREST_REF_ONLY, MAX_ANA_PHRASE, MAX_CHAIN_PHRASE, POS_ANA_FLTR, POS_CHAIN_FLTR]]
    return df_disp.sort_values("f1_score", ascending=False)

## Prepare POS Tag Filters

In [8]:
pos_nouns = set([pos for pos in pos_tally.keys() if pos.strip()[:2] == "NN"])
pos_verbs = set([pos for pos in pos_tally.keys() if pos.strip()[:2] == "VB"])
pos_pronouns = {"PRP","PRP$", "WP", "WP$"}
pos_determiners = {"DT","WDT","PDT"} # the, a, which, that, etc
pos_pron_dt = pos_pronouns | pos_determiners
# for meaning of pen treebank tags - https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
pos_nouns, pos_verbs, pos_pronouns, pos_determiners, pos_pron_dt

({'NN', 'NNP', 'NNPS', 'NNS'},
 {'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'},
 {'PRP', 'PRP$', 'WP', 'WP$'},
 {'DT', 'PDT', 'WDT'},
 {'DT', 'PDT', 'PRP', 'PRP$', 'WDT', 'WP', 'WP$'})

In [9]:
dict_pos_filter = {
            "None": None,
            "PRN": pos_pronouns,
            "DT": pos_determiners,
            "PRN+DT": pos_pron_dt
}

dict_pos_ch_filter = {
    "None": None,
    "NN": pos_nouns,
    "VB": pos_verbs,
    "NN+VB": pos_nouns | pos_verbs
}

In [27]:
def get_metrics(essays, format_ana_tags, filter_to_predicted_tags, expected_tags,
                    nearest_ref_only, pos_ana_key, pos_ch_key, max_ana_phrase_len, max_cref_phrase_len):
    
    pos_ana_filter = dict_pos_filter[pos_ana_key]
    pos_ch_filter  = dict_pos_ch_filter[pos_ch_key]
    
    proc_essays = get_coref_processed_essays(
                            essays=essays, format_ana_tags=format_ana_tags, 
                            ner_ch_filter=None, look_back_only=True,
                            filter_to_predicted_tags=filter_to_predicted_tags, 
                            max_ana_phrase_len=max_ana_phrase_len, max_cref_phrase_len=max_cref_phrase_len, 
                            pos_ana_filter=pos_ana_filter, pos_ch_filter=pos_ch_filter, 
                            nearest_ref_only=nearest_ref_only)
                        
    metrics = get_metrics_raw(proc_essays, expected_tags=expected_tags,  micro_only=True)
    row = dict(metrics["MICRO_F1"])
    row[NEAREST_REF_ONLY] = blank_if_none(nearest_ref_only)
    row[MAX_ANA_PHRASE]   = blank_if_none(max_ana_phrase_len)
    row[MAX_CHAIN_PHRASE] = blank_if_none(max_cref_phrase_len)
    row[POS_ANA_FLTR]     = blank_if_none(pos_ana_key)
    row[POS_CHAIN_FLTR]   = blank_if_none(pos_ch_key)
    df_results = pd.DataFrame([row])
    return df_results, metrics

In [16]:
filter_to_predicted_tags, nearest_ref_only, pos_ana_key, pos_ch_key, max_ana_phrase_len, max_cref_phrase_len

(True, True, 'None', 'None', None, None)

In [93]:
def aggregate_metrics(metrics_dict, codes):
    aggregate = {'data_points': 0,
                 'num_codes': 0,             
                 'fn': 0.0,
                 'fp': 0.0,
                 'tn': 0.0,
                 'tp': 0.0}
    for tag in sorted(codes):    
        m = metrics_dict[tag]
        for k in list(aggregate.keys()):
            aggregate[k] += m[k]
    return aggregate

def combine_metrics(metricsa, metricsb):
    aggregate = dict(metricsa)    
    for k in list(aggregate.keys()):
        aggregate[k] += metricsb[k]
    return aggregate

In [90]:
def calc_metrics_from_counts(aggregate):
    tp, tn, fp, fn = aggregate["tp"], aggregate["tn"],aggregate["fp"],aggregate["fn"]
    prec = tp / (tp + fp)
    rec =  tp / (tp + fn)
    f1 = 2 * prec * rec / (prec + rec)

    return {"f1": f1, "prec": prec, "rec": rec}

## Get Data From Mongo

In [44]:
import numpy as np
import pandas as pd
import pymongo

client = pymongo.MongoClient()
db = client.metrics_codes

In [45]:
from bson.son import SON # needed to ensure dictionary is ordered (python default is not)
import hashlib

def hash_feats(fts):
    vals = fts.values
    joined = "|".join(map(lambda s: str(s),vals)).encode('utf-8') 
    return hashlib.sha224(joined).hexdigest()

def get_df_sorted_by_f1score(collection, params=None, filter_cols=True):
    if not params:
        params = []
    if type(params) == str:
        params = params.split(",")
    
    project = {
            "weighted_f1_score":"$WEIGHTED_MEAN_CONCEPT_CODES.f1_score",
            "macro_f1_score":   "$MACRO_F1",
            "micro_f1_score":  "$MICRO_F1.f1_score",
            "micro_recall":    "$MICRO_F1.recall",
            "micro_precision": "$MICRO_F1.precision",
    
    # PARAMETERS            
            "window_size":    "$parameters.window_size",
            "feats":          "$parameters.extractors",
            "count": {        "$size" : "$parameters.extractors" },
            "asof" :          "$asof",
            "_id":1
    }
    
    # No count for HMM
    if "_hmm" in collection.lower():
        del project["count"]
    
    for param in params:
        project[param] = "$parameters." + param

    feats_pipeline = [{
        "$project": project
    },
    {
        "$match":{
            "micro_f1_score": { "$exists" : True }        
        }
    },
    {
        "$sort":{
            "micro_f1_score": -1
        }
    },
    ]
    
    rows = [row for row in db[collection].aggregate(feats_pipeline)]
    df = pd.DataFrame(rows).sort_values("micro_f1_score", ascending=False)
    if params:
        df["hs_params"] = df[params].apply(hash_feats, axis=1)
        
    if filter_cols:
        cols = ["micro_f1_score", "micro_recall" ,"micro_precision", "macro_f1_score" ] + params
        return df[cols]
    return df, 

In [82]:
def get_best_row(collection):
    df = get_df_sorted_by_f1score(collection)
    top_row = df.iloc[0]

    c = db[collection]
    rows = list(c.find())
    best_row = None
    for r in rows:
        micro_f1 = r["MICRO_F1"]
        if micro_f1["f1_score"] == top_row["micro_f1_score"]:
            assert best_row is None
            best_row = r
    return best_row

## Get Tp, Fp, Fn, Tn Counts for Best Anaphora Model

In [127]:
df_train_raw, df_train_metrics = get_metrics(essays=training_essays, 
        filter_to_predicted_tags=filter_to_predicted_tags, 
        format_ana_tags=True, expected_tags=all_ana_tags,
        nearest_ref_only=nearest_ref_only, 
        pos_ana_key=pos_ana_key, pos_ch_key=pos_ch_key, 
        max_ana_phrase_len=max_ana_phrase_len, max_cref_phrase_len=max_cref_phrase_len)
df_train = process_sort_results(df_train_raw)

In [121]:
aggregate_train_ana = aggregate_metrics(df_train_metrics, all_ana_tags)
aggregate_train_ana

{'data_points': 1783158,
 'fn': 333.0,
 'fp': 18.0,
 'num_codes': 344,
 'tn': 1782796.0,
 'tp': 11.0}

## Get Counts from Best CC Tagging Model

In [117]:
best_model = "RNN_MOST_COMMON_TAG_HYPER_PARAM_TUNING"

In [118]:
PREFIX = "CB" if DATASET == "CoralBleaching" else "SC"
collection_train = PREFIX + "_TAGGING_TD_" + best_model
collection_train

'CB_TAGGING_TD_RNN_MOST_COMMON_TAG_HYPER_PARAM_TUNING'

In [119]:
best_row = get_best_row(collection)
aggregate_train_codes = aggregate_metrics(best_row, reg_tags)
aggregate_train_codes

{'data_points': 1783158,
 'fn': 5915.0,
 'fp': 4707.0,
 'num_codes': 33259,
 'tn': 1745192.0,
 'tp': 27344.0}

In [122]:
aggregate_train_combined = combine_metrics(aggregate_train_codes, aggregate_train_ana)
aggregate_train_combined

{'data_points': 3566316,
 'fn': 6248.0,
 'fp': 4725.0,
 'num_codes': 33603,
 'tn': 3527988.0,
 'tp': 27355.0}

## Training Metrics

### Without Anaphora Resolution

In [125]:
calc_metrics_from_counts(aggregate_train_codes)

{'f1': 0.837360281733272,
 'prec': 0.8531403076347072,
 'rec': 0.8221534020866532}

### With Anaphora Resolution

In [126]:
calc_metrics_from_counts(aggregate_train_combined)

{'f1': 0.8329400301447863,
 'prec': 0.8527119700748129,
 'rec': 0.8140642204565068}