In [1]:
import dill
import pandas as pd

from Settings import Settings
from collections import defaultdict
from BrattEssay import ANAPHORA

from results_common import get_essays, validate_essays
from process_essays_coref import get_coref_processed_essays, processed_essays_predict_most_recent_tag
from metrics import get_metrics_raw
from results_common import tally_essay_attributes

# Configure
- Set values from the hyper parameter tuning results

In [2]:
# Dataset to analyse; it is used below to build the data folder path and the Mongo collection prefix.
DATASET = "CoralBleaching" # CoralBleaching | SkinCancer

In [3]:
# Build the dataset-specific folders underneath the configured data directory.
settings = Settings()
root_folder = settings.data_directory + DATASET + "/Thesis_Dataset/"
# Folders for the Stanford and Berkeley co-reference predictions; the Berkeley
# folder is nested inside the Stanford one.
stanford_coref_predictions_folder = root_folder + "CoReference/"
berkeley_coref_predictions_folder = root_folder + "CoReference/Berkeley/"

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/


In [4]:
# Which algorithm? Choose one of the two co-reference folders defined above;
# every essay loaded below comes from this folder.
coref_predictions_folder = berkeley_coref_predictions_folder
print("CoRef Data: ", coref_predictions_folder)

CoRef Data:  /Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/Thesis_Dataset/CoReference/Berkeley/


## Load Essays

In [5]:
# Load the "Training" partition from the selected co-reference folder.
training_essays = get_essays(coref_predictions_folder, "Training")

Found file /Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/Thesis_Dataset/CoReference/Berkeley/training_processed.dill


In [6]:
# Load the "Test" partition from the selected co-reference folder.
test_essays = get_essays(coref_predictions_folder, "Test")

Found file /Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/Thesis_Dataset/CoReference/Berkeley/test_processed.dill


In [7]:
# Both partitions together (assumes get_essays returns lists, so `+` concatenates — TODO confirm).
all_essays = training_essays + test_essays

## Look at the Anaphor Tags

In [8]:
from results_procesor import is_a_regular_code

# Count tag occurrences over every essay, split into three tallies.
cc_tally = defaultdict(int)   # anaphor tags containing "Anaphor:[" (no "->", not "rhetorical")
cr_tally = defaultdict(int)   # anaphor tags containing "->"
reg_tally = defaultdict(int)  # tags accepted by is_a_regular_code
for e in all_essays:
    for sent in e.sentences:
        # Each sentence item unpacks into a word and its collection of tags.
        for wd, tags in sent:
            for t in tags:
                if is_a_regular_code(t):
                    reg_tally[t] += 1
                # Only anaphora tags that do not mention "other" are tallied.
                if ANAPHORA in t and "other" not in t:
                    if "->" in t:
                        cr_tally[t] += 1
                    elif "Anaphor:[" in t and "rhetorical" not in t:
                        cc_tally[t] += 1

# Sorted tag vocabularies; both are reused by the metric cells further down.
reg_tags = sorted(reg_tally.keys())
all_ana_tags = sorted(cc_tally.keys())
# Sanity check: there must be exactly as many anaphor tags as regular codes.
assert len(reg_tags) == len(all_ana_tags)
all_ana_tags

['Anaphor:[11]',
 'Anaphor:[12]',
 'Anaphor:[13]',
 'Anaphor:[14]',
 'Anaphor:[1]',
 'Anaphor:[2]',
 'Anaphor:[3]',
 'Anaphor:[4]',
 'Anaphor:[50]',
 'Anaphor:[5]',
 'Anaphor:[5b]',
 'Anaphor:[6]',
 'Anaphor:[7]']

In [9]:
def blank_if_none(val):
    """Return "-" for missing-looking values, otherwise return ``val`` unchanged.

    A value counts as missing when it is None, is falsy (e.g. "", 0, an empty
    list), or its string form equals "none" ignoring case.
    """
    if val is None or not val:
        return "-"
    if str(val).lower() == "none":
        return "-"
    return val

In [10]:
def get_metrics(essays, format_ana_tags, expected_tags):
    """Score most-recent-tag anaphora predictions for ``essays``.

    The essays are first passed through
    ``processed_essays_predict_most_recent_tag`` and the result is scored with
    ``get_metrics_raw`` (``micro_only=True``) against ``expected_tags``.

    Returns whatever ``get_metrics_raw`` returns; this notebook indexes it by
    tag and then by count name (see ``aggregate_metrics``).
    """
    predicted_essays = processed_essays_predict_most_recent_tag(
        essays=essays,
        format_ana_tags=format_ana_tags,
    )
    return get_metrics_raw(predicted_essays, expected_tags=expected_tags, micro_only=True)

In [11]:
def aggregate_metrics(metrics_dict, codes):
    """Sum the raw per-code counts in ``metrics_dict`` over ``codes``.

    Args:
        metrics_dict: mapping code -> mapping that contains at least the keys
            data_points, num_codes, fn, fp, tn and tp.
        codes: the codes to include; each must be a key of ``metrics_dict``.

    Returns:
        dict with the six summed counts (all zero when ``codes`` is empty).
    """
    totals = dict(data_points=0, num_codes=0, fn=0.0, fp=0.0, tn=0.0, tp=0.0)
    count_names = tuple(totals)
    for code in sorted(codes):
        per_code = metrics_dict[code]
        for name in count_names:
            totals[name] += per_code[name]
    return totals

def combine_metrics(metricsa, metricsb):
    """Add two count dictionaries key by key.

    Every key of ``metricsa`` must also be present in ``metricsb``; keys that
    only exist in ``metricsb`` are ignored. Returns a new dict.
    """
    combined = {}
    for name, left_value in dict(metricsa).items():
        combined[name] = left_value + metricsb[name]
    return combined

In [12]:
def calc_metrics_from_counts(aggregate):
    """Compute precision, recall and F1 from raw confusion counts.

    Args:
        aggregate: mapping with at least the keys "tp", "fp" and "fn". Other
            keys (e.g. "tn", "data_points") are ignored.

    Returns:
        dict with the keys "f1", "prec" and "rec". Any ratio whose denominator
        is zero is reported as 0.0 rather than raising ZeroDivisionError, so
        count sets with no true positives (or no predictions at all) can still
        be tabulated.
    """
    tp, fp, fn = aggregate["tp"], aggregate["fp"], aggregate["fn"]

    def _ratio(numerator, denominator):
        # An undefined ratio (0 / 0) is treated as 0.0.
        return numerator / denominator if denominator else 0.0

    prec = _ratio(tp, tp + fp)
    rec = _ratio(tp, tp + fn)
    f1 = _ratio(2 * prec * rec, prec + rec)

    return {"f1": f1, "prec": prec, "rec": rec}

## Get Data From Mongo

In [13]:
import numpy as np
import pandas as pd
import pymongo

# Connect using PyMongo's default connection settings (no host/port is given)
# and select the "metrics_codes" database. The query helpers defined below read
# it through the notebook-level name `db`.
client = pymongo.MongoClient()
db = client.metrics_codes

In [14]:
from bson.son import SON # needed to ensure dictionary is ordered (python default is not)
import hashlib

def hash_feats(fts):
    """Return a SHA-224 hex digest identifying the values in ``fts``.

    ``fts`` must expose a ``.values`` iterable (in this notebook it is a row of
    parameter columns). The values are converted with ``str``, joined with "|",
    UTF-8 encoded and hashed.
    """
    text = "|".join(str(value) for value in fts.values)
    digest = hashlib.sha224(text.encode('utf-8'))
    return digest.hexdigest()

def get_df_sorted_by_f1score(collection, params=None, filter_cols=True, database=None):
    """Load one row per stored experiment run, best micro F1 first.

    Args:
        collection: name of the Mongo collection to query.
        params: extra entries of each document's ``parameters`` sub-document to
            expose as columns; either a list of names or a comma-separated
            string.
        filter_cols: when True (default) return only the micro/macro score
            columns followed by ``params``; when False return every projected
            column (plus ``hs_params`` when ``params`` is given).
        database: object indexed by collection name whose items provide
            ``aggregate(pipeline)``. Defaults to the notebook-level ``db``.

    Returns:
        pandas.DataFrame sorted by ``micro_f1_score`` in descending order. An
        empty frame is returned when no document has a ``MICRO_F1.f1_score``.
    """
    if not params:
        params = []
    elif isinstance(params, str):
        params = params.split(",")
    else:
        params = list(params)

    if database is None:
        database = db

    project = {
            "weighted_f1_score":"$WEIGHTED_MEAN_CONCEPT_CODES.f1_score",
            "macro_f1_score":   "$MACRO_F1",
            "micro_f1_score":  "$MICRO_F1.f1_score",
            "micro_recall":    "$MICRO_F1.recall",
            "micro_precision": "$MICRO_F1.precision",

    # PARAMETERS
            "window_size":    "$parameters.window_size",
            "feats":          "$parameters.extractors",
            "count": {        "$size" : "$parameters.extractors" },
            "asof" :          "$asof",
            "_id":1
    }

    # No count for HMM
    if "_hmm" in collection.lower():
        del project["count"]

    for param in params:
        project[param] = "$parameters." + param

    feats_pipeline = [
        {"$project": project},
        # Keep only runs that actually recorded a micro F1 score.
        {"$match": {"micro_f1_score": {"$exists": True}}},
        {"$sort": {"micro_f1_score": -1}},
    ]

    cols = ["micro_f1_score", "micro_recall" ,"micro_precision", "macro_f1_score" ] + params

    rows = list(database[collection].aggregate(feats_pipeline))
    df = pd.DataFrame(rows)
    if df.empty:
        # Nothing matched: return an empty frame instead of failing inside
        # sort_values on a column that does not exist.
        return pd.DataFrame(columns=cols) if filter_cols else df

    df = df.sort_values("micro_f1_score", ascending=False)
    if params:
        # One hash per row that identifies the combination of parameter values.
        df["hs_params"] = df[params].apply(hash_feats, axis=1)

    if filter_cols:
        return df[cols]
    # Return the frame itself; a stray trailing comma previously wrapped it in
    # a 1-tuple, so the two branches returned different types.
    return df

In [15]:
def get_best_row(collection):
    """Return the raw Mongo document of the run with the highest micro F1.

    The best score is taken from ``get_df_sorted_by_f1score``; the collection
    is then scanned for the document whose ``MICRO_F1.f1_score`` equals it.

    Args:
        collection: name of the Mongo collection to search.

    Returns:
        The matching document, or None when no document carries that score.

    Raises:
        AssertionError: if more than one document has the best score.
    """
    ranked = get_df_sorted_by_f1score(collection)
    best_f1 = ranked.iloc[0]["micro_f1_score"]

    best_row = None
    for doc in db[collection].find():
        micro_f1 = doc.get("MICRO_F1")
        # The ranking query only keeps documents that have MICRO_F1.f1_score,
        # so documents without it cannot be the best run; skip them instead of
        # failing with a KeyError.
        if not isinstance(micro_f1, dict) or "f1_score" not in micro_f1:
            continue
        if micro_f1["f1_score"] == best_f1:
            assert best_row is None, "more than one run shares the best micro F1 score"
            best_row = doc
    return best_row

In [16]:
def get_aggregate_counts_from_mongo(collection):
    """Sum the raw counts of the best run in ``collection`` over ``reg_tags``.

    The best run is looked up with ``get_best_row``; its per-tag counts are
    then summed by ``aggregate_metrics`` over the notebook-level ``reg_tags``.
    """
    return aggregate_metrics(get_best_row(collection), reg_tags)

In [17]:
from numpy import dtype

def round_data(df, places=3):
    """Return a copy of ``df`` whose float64 columns are fixed-point strings.

    Args:
        df: the frame to format; it is not modified.
        places: number of digits after the decimal point.

    Returns:
        A copy of ``df`` in which every float64 column holds strings formatted
        with ``places`` decimals; all other columns are unchanged.
    """
    template = "{0:." + str(places) + "f}"
    formatted = df.copy()
    for column, column_type in formatted.dtypes.items():
        if column_type == dtype('float64'):
            formatted[column] = df[column].apply(template.format)
    return formatted

## Get Counts from Best CC Tagging Model

## NOTE on Methodology
- To back into the metrics, we need to compute the raw fp, fn, tp, tn counts from the different experiment runs
- Then we take the run on the CC codes, and add those raw counts to the counts from the anaphora resolution. There is one minor issue - for one or two words these codes likely overlap, but that's very rare (and may not happen)
- Then we re-compute the micro metrics from these counts

In [18]:
def counts_to_metrics_df(counts):
    """Return a one-row DataFrame with the metrics derived from ``counts``.

    ``counts`` is passed to ``calc_metrics_from_counts``; the resulting dict
    becomes the single row of the frame.
    """
    metrics_row = calc_metrics_from_counts(counts)
    return pd.DataFrame([metrics_row])

## CV Metrics

#### Get CC Only Counts from Mongo

In [19]:
# Collection names are prefixed per dataset: "CB" for CoralBleaching, otherwise "SC".
PREFIX = "CB" if DATASET == "CoralBleaching" else "SC"
# Collection queried for the CV (validation) concept-code counts.
vd_collection = PREFIX + "_TAGGING_VD_RNN_MOST_COMMON_TAG_HYPER_PARAM_TUNING"

In [20]:
# Raw counts of the best run in the validation collection, summed over reg_tags.
vd_cc_counts = get_aggregate_counts_from_mongo(collection=vd_collection)
vd_cc_counts

{'data_points': 1783158,
 'fn': 5915.0,
 'fp': 4707.0,
 'num_codes': 33259,
 'tn': 1745192.0,
 'tp': 27344.0}

In [21]:
# Precision / recall / F1 for the concept codes alone (validation).
counts_to_metrics_df(vd_cc_counts)

Unnamed: 0,f1,prec,rec
0,0.83736,0.85314,0.822153


#### Get Ana Resolution Counts

In [22]:
# Score the most-recent-tag anaphora predictions on the training essays
# against the anaphor tags collected earlier.
df_vd_metrics_ana = get_metrics(essays=training_essays, 
        format_ana_tags=True, expected_tags=all_ana_tags)

# Sum the per-tag counts over all anaphor tags.
vd_ana_counts = aggregate_metrics(df_vd_metrics_ana, all_ana_tags)
vd_ana_counts

{'data_points': 1783158,
 'fn': 261.0,
 'fp': 206.0,
 'num_codes': 344,
 'tn': 1782608.0,
 'tp': 83.0}

In [23]:
# Baseline for "anaphora labels present, but no resolution algorithm": copy the
# algorithm's counts, then overwrite tp, tn and fp. The copy leaves
# vd_ana_counts itself untouched.
vd_ana_counts_no_algo = dict(vd_ana_counts)
vd_ana_counts_no_algo["tp"] = 0
vd_ana_counts_no_algo["tn"] = 0
vd_ana_counts_no_algo["fp"] = vd_ana_counts["num_codes"] # all incorrect
# NOTE(review): "fn" keeps the algorithm's value. If "no algorithm" means that
# no anaphora tag is ever predicted, fp = 0 and fn = num_codes would be the
# expected baseline — confirm the intended definition.

In [24]:
# Precision / recall / F1 for the anaphora tags alone (validation).
counts_to_metrics_df(vd_ana_counts)

Unnamed: 0,f1,prec,rec
0,0.262243,0.287197,0.241279


#### Combine Metrics

In [25]:
# Add the concept-code and anaphora counts key by key, then recompute the metrics.
# combine_metrics sums every key (including data_points and tn), but only
# tp, fp and fn enter the precision / recall / F1 calculation.
vd_counts_combined = combine_metrics(vd_cc_counts, vd_ana_counts)
counts_to_metrics_df(vd_counts_combined)

Unnamed: 0,f1,prec,rec
0,0.83184,0.848083,0.816207


In [26]:
# Same combination, but using the "no algorithm" anaphora baseline counts.
vd_counts_combined_no_algo = combine_metrics(vd_cc_counts, vd_ana_counts_no_algo)
counts_to_metrics_df(vd_counts_combined_no_algo)

Unnamed: 0,f1,prec,rec
0,0.829675,0.844081,0.815752


## Test Metrics

In [27]:
# Same dataset prefix as in the CV section (repeated so this section can be run on its own).
PREFIX = "CB" if DATASET == "CoralBleaching" else "SC"
# Collection queried for the test-set concept-code counts.
test_collection = "TEST_" + PREFIX + "_TAGGING_VD_RNN_MOST_COMMON_TAG"

In [28]:
# Raw counts of the best run in the test collection, summed over reg_tags.
test_cc_counts = get_aggregate_counts_from_mongo(collection=test_collection)
test_cc_counts

{'data_points': 399087,
 'fn': 1157.0,
 'fp': 953.0,
 'num_codes': 6792,
 'tn': 391342.0,
 'tp': 5635.0}

In [29]:
# Precision / recall / F1 for the concept codes alone (test).
counts_to_metrics_df(test_cc_counts)

Unnamed: 0,f1,prec,rec
0,0.842302,0.855343,0.829653


In [30]:
# Score the most-recent-tag anaphora predictions on the test essays against
# the anaphor tags collected earlier.
df_test_metrics_ana = get_metrics(essays=test_essays, 
        format_ana_tags=True, expected_tags=all_ana_tags)

# Sum the per-tag counts over all anaphor tags.
test_ana_counts = aggregate_metrics(df_test_metrics_ana, all_ana_tags)
test_ana_counts

{'data_points': 399087,
 'fn': 28.0,
 'fp': 18.0,
 'num_codes': 39,
 'tn': 399030.0,
 'tp': 11.0}

In [31]:
# Precision / recall / F1 for the anaphora tags alone (test).
counts_to_metrics_df(test_ana_counts)

Unnamed: 0,f1,prec,rec
0,0.323529,0.37931,0.282051


In [32]:
# Add the concept-code and anaphora counts key by key. combine_metrics sums
# every key, so data_points and tn are added together as well; only tp, fp and
# fn enter the precision / recall / F1 calculation.
test_counts_combined = combine_metrics(test_cc_counts, test_ana_counts)
test_counts_combined

{'data_points': 798174,
 'fn': 1185.0,
 'fp': 971.0,
 'num_codes': 6831,
 'tn': 790372.0,
 'tp': 5646.0}

In [33]:
# Precision / recall / F1 for concept codes plus anaphora tags (test).
counts_to_metrics_df(test_counts_combined)

Unnamed: 0,f1,prec,rec
0,0.839679,0.853257,0.826526


In [34]:
# Baseline for "anaphora labels present, but no resolution algorithm": copy the
# algorithm's counts, then overwrite tp, tn and fp. The copy leaves
# test_ana_counts itself untouched.
test_ana_counts_no_algo = dict(test_ana_counts)
test_ana_counts_no_algo["tp"] = 0
test_ana_counts_no_algo["tn"] = 0
test_ana_counts_no_algo["fp"] = test_ana_counts["num_codes"] # all incorrect
# NOTE(review): "fn" keeps the algorithm's value. If "no algorithm" means that
# no anaphora tag is ever predicted, fp = 0 and fn = num_codes would be the
# expected baseline — confirm the intended definition.

In [35]:
# Concept-code counts combined with the "no algorithm" anaphora baseline (test).
test_counts_combined_no_algo = combine_metrics(test_cc_counts, test_ana_counts_no_algo)
counts_to_metrics_df(test_counts_combined_no_algo)

Unnamed: 0,f1,prec,rec
0,0.838105,0.850309,0.826246


### Without Anaphora Resolution

In [36]:
# Validation summary: one row per scenario. The row order is
#   0: concept codes only, 1: codes + anaphora algorithm,
#   2: codes + anaphora labels without the algorithm, 3: anaphora tags only.
vd_cc = calc_metrics_from_counts(vd_cc_counts)
vd_both = calc_metrics_from_counts(vd_counts_combined)

df_vd = pd.DataFrame([vd_cc, vd_both, 
                      calc_metrics_from_counts(vd_counts_combined_no_algo),
                      calc_metrics_from_counts(vd_ana_counts)
                     ])
df_vd["Dataset"] = "Validation"
# The flag lists below follow the row order given above.
df_vd["With Ana Algo"] = [False, True, False, True]
df_vd["With Ana Labels"] = [False, True, True, True]
df_vd["Ana Only"]      = [False, False, False, True]

In [37]:
# Test summary: one row per scenario. The row order is
#   0: concept codes only, 1: codes + anaphora algorithm,
#   2: codes + anaphora labels without the algorithm, 3: anaphora tags only.
test_cc = calc_metrics_from_counts(test_cc_counts)
test_both = calc_metrics_from_counts(test_counts_combined)

df_test = pd.DataFrame([test_cc, test_both, 
                        calc_metrics_from_counts(test_counts_combined_no_algo),
                        calc_metrics_from_counts(test_ana_counts)
                       ])
df_test["Dataset"] = "Test"
# The flag lists below follow the row order given above.
df_test["With Ana Algo"] = [False, True, False, True]
df_test["With Ana Labels"] = [False, True, True, True]
df_test["Ana Only"] = [False, False, False, True]

# Results

In [38]:
# Stack the validation and test rows and fix the column order. The original
# row labels are kept, so the index repeats 0-3 once per dataset.
df = pd.concat([df_vd, df_test])[["Dataset", "With Ana Algo", "With Ana Labels", "Ana Only", "f1", "prec", "rec"]]
df

Unnamed: 0,Dataset,With Ana Algo,With Ana Labels,Ana Only,f1,prec,rec
0,Validation,False,False,False,0.83736,0.85314,0.822153
1,Validation,True,True,False,0.83184,0.848083,0.816207
2,Validation,False,True,False,0.829675,0.844081,0.815752
3,Validation,True,True,True,0.262243,0.287197,0.241279
0,Test,False,False,False,0.842302,0.855343,0.829653
1,Test,True,True,False,0.839679,0.853257,0.826526
2,Test,False,True,False,0.838105,0.850309,0.826246
3,Test,True,True,True,0.323529,0.37931,0.282051


## Round Data

In [40]:
# Show the float columns formatted to 3 decimal places; round_data works on a copy, so df is unchanged.
round_data(df, 3)

Unnamed: 0,Dataset,With Ana Algo,With Ana Labels,Ana Only,f1,prec,rec
0,Validation,False,False,False,0.837,0.853,0.822
1,Validation,True,True,False,0.832,0.848,0.816
2,Validation,False,True,False,0.83,0.844,0.816
3,Validation,True,True,True,0.262,0.287,0.241
0,Test,False,False,False,0.842,0.855,0.83
1,Test,True,True,False,0.84,0.853,0.827
2,Test,False,True,False,0.838,0.85,0.826
3,Test,True,True,True,0.324,0.379,0.282
